TermsWriter(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; noOutputs = NoOutputs.getSingleton(); // This Builder is just used transiently to fragment // terms into "good" blocks; we don't save the // resulting FST: blockBuilder = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, noOutputs, new FindBlocks(), false, PackedInts.COMPACT, true, 15); postingsWriter.setField(fieldInfo); }
TermsWriter(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; noOutputs = NoOutputs.getSingleton(); // This Builder is just used transiently to fragment // terms into "good" blocks; we don't save the // resulting FST: blockBuilder = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, noOutputs, new FindBlocks(), false, PackedInts.COMPACT, true, 15); this.longsSize = postingsWriter.setField(fieldInfo); }
/**
 * Restores this lookup from {@code input}.
 *
 * <p>The entry count is read first as a VLong, followed by the FST itself. Both
 * completion views are then rebuilt on top of that FST.
 *
 * @param input the source to read the serialized state from
 * @return always {@code true}
 * @throws IOException if reading from {@code input} fails
 */
@Override
public synchronized boolean load(DataInput input) throws IOException {
    count = input.readVLong();

    final FST<Object> automaton = new FST<>(input, NoOutputs.getSingleton());
    this.higherWeightsCompletion = new FSTCompletion(automaton);

    // The second view reuses the FST held by the first one.
    this.normalCompletion =
        new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);
    return true;
}
/**
 * Restores this lookup from a serialized FST read from {@code input}.
 *
 * <p>Both completion views are rebuilt on top of the loaded FST. The stream is
 * handed to {@code IOUtils.close} when loading finishes, whether or not it succeeded.
 *
 * @param input the stream to read the serialized FST from
 * @return always {@code true}
 * @throws IOException if reading from or closing {@code input} fails
 */
@Override
public synchronized boolean load(InputStream input) throws IOException {
    try {
        final FST<Object> automaton =
            new FST<Object>(new InputStreamDataInput(input), NoOutputs.getSingleton());
        this.higherWeightsCompletion = new FSTCompletion(automaton);

        // The second view reuses the FST held by the first one.
        this.normalCompletion =
            new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);
    } finally {
        IOUtils.close(input);
    }
    return true;
}
/**
 * Creates a decompounder from a serialized FST of surface forms and a set of
 * glue morphemes.
 *
 * <p>The stream is always closed before this constructor returns or throws.
 *
 * @param inputStream stream holding the serialized surface-form FST
 * @param glue glue morphemes to use; when {@code null} or empty, {@code morphemes}
 *     is used instead
 * @throws IOException if the FST cannot be read or the glue FST cannot be built
 */
public FstDecompounder(InputStream inputStream, String[] glue) throws IOException {
    // try-with-resources instead of try/finally: if loading the FST fails, that
    // failure stays the primary exception and a failure in close() is attached
    // as suppressed instead of replacing it.
    try (InputStream in = inputStream) {
        this.surfaceForms = new FST<>(new InputStreamDataInput(in), NoOutputs.getSingleton());
    }

    // set up glue morphemes
    this.glueMorphemes = createGlueMorphemes(glue != null && glue.length > 0 ? glue : morphemes);
}
/**
 * Builds an FST containing every glue morpheme in reversed character order.
 *
 * <p>The given array is left untouched: reversal and sorting happen on a private
 * copy, so the same array can be passed again without its content having been
 * flipped or reordered by an earlier call.
 *
 * @param glue the glue morphemes, in natural (left-to-right) character order
 * @return an FST over the reversed morphemes, as returned by {@code Builder.finish()}
 * @throws IOException if building the FST fails
 */
private FST<Object> createGlueMorphemes(String[] glue) throws IOException {
    // Never mutate the argument: it may be the caller's own array or a shared default.
    final String[] reversed = new String[glue.length];
    for (int i = 0; i < glue.length; i++) {
        reversed[i] = new StringBuilder(glue[i]).reverse().toString();
    }
    // Entries are sorted before being handed to the builder.
    Arrays.sort(reversed);

    final Builder<Object> builder = new Builder<>(INPUT_TYPE.BYTE4, NoOutputs.getSingleton());
    final Object nothing = NoOutputs.getSingleton().getNoOutput();
    final IntsRefBuilder intsBuilder = new IntsRefBuilder();
    for (String morpheme : reversed) {
        fromUTF16ToUTF32(morpheme, intsBuilder);
        builder.add(intsBuilder.get(), nothing);
    }
    return builder.finish();
}
/**
 * Restores this lookup from {@code input}.
 *
 * <p>The entry count is read first as a VLong, followed by the FST itself. Both
 * completion views are then rebuilt on top of that FST.
 *
 * @param input the source to read the serialized state from
 * @return always {@code true}
 * @throws IOException if reading from {@code input} fails
 */
@Override
public synchronized boolean load(DataInput input) throws IOException {
    count = input.readVLong();

    final FST<Object> automaton = new FST<Object>(input, NoOutputs.getSingleton());
    this.higherWeightsCompletion = new FSTCompletion(automaton);

    // The second view reuses the FST held by the first one.
    this.normalCompletion =
        new FSTCompletion(higherWeightsCompletion.getFST(), false, exactMatchFirst);
    return true;
}
/**
 * Compiles a word list into an FST and writes it to {@code outputStream}.
 *
 * <p>The input is read as UTF-8, one entry per line. Lines containing a {@code #}
 * anywhere are skipped. Each remaining line is split with {@code pattern}; only the
 * first field is used, trimmed and lower-cased with {@code Locale.ROOT}. Blank entries
 * are ignored. Every word is stored twice: as-is followed by {@code >}, and reversed
 * followed by {@code <}.
 *
 * <p>The input stream is closed by this method, also when reading fails. The
 * {@code OutputStreamDataOutput} wrapping {@code outputStream} is closed once the
 * FST has been saved.
 *
 * @param inputStream the input stream
 * @param outputStream the output stream
 * @throws IOException if compilation fails
 */
public void compile(InputStream inputStream, OutputStream outputStream) throws IOException {
    final HashSet<BytesRef> words = new HashSet<>();

    // try-with-resources so the reader (and the underlying stream) is released
    // even when reading or parsing throws.
    try (BufferedReader reader =
            new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
        String line;
        String last = null;
        final StringBuilder stringBuilder = new StringBuilder();
        while ((line = reader.readLine()) != null) {
            if (line.indexOf('#') >= 0) {
                continue;
            }
            // split() drops trailing empty strings, so a line made only of
            // separators yields an empty array; guard before indexing.
            final String[] fields = pattern.split(line);
            if (fields.length == 0) {
                continue;
            }
            // Locale.ROOT keeps the compiled dictionary independent of the
            // default locale of the machine that builds it.
            final String word = fields[0].trim().toLowerCase(java.util.Locale.ROOT);
            if (word.isEmpty() || word.equals(last)) {
                continue;
            }
            last = word;
            /*
             * Add the word to the hash set in left-to-right characters order and reversed
             * for easier matching later on.
             */
            stringBuilder.setLength(0);
            stringBuilder.append(word);
            final int len = stringBuilder.length();
            stringBuilder.append('>');
            words.add(new BytesRef(stringBuilder));
            stringBuilder.setLength(len);
            stringBuilder.reverse().append('<');
            words.add(new BytesRef(stringBuilder));
        }
    }

    // Entries are sorted before being handed to the builder.
    final BytesRef[] all = new BytesRef[words.size()];
    words.toArray(all);
    Arrays.sort(all, BytesRef::compareTo);

    final Object nothing = NoOutputs.getSingleton().getNoOutput();
    final Builder<Object> builder = new Builder<>(INPUT_TYPE.BYTE4, NoOutputs.getSingleton());
    final IntsRefBuilder intsRef = new IntsRefBuilder();
    for (BytesRef bytesRef : all) {
        intsRef.clear();
        intsRef.copyUTF8Bytes(bytesRef);
        builder.add(intsRef.get(), nothing);
    }
    final FST<Object> fst = builder.finish();

    try (final OutputStreamDataOutput out = new OutputStreamDataOutput(outputStream)) {
        fst.save(out);
    }
}