@Override protected List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> getFullPrefixPaths( List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> prefixPaths, Automaton lookupAutomaton, FST<PairOutputs.Pair<Long,BytesRef>> fst) throws IOException { // TODO: right now there's no penalty for fuzzy/edits, // ie a completion whose prefix matched exactly what the // user typed gets no boost over completions that // required an edit, which get no boost over completions // requiring two edits. I suspect a multiplicative // factor is appropriate (eg, say a fuzzy match must be at // least 2X better weight than the non-fuzzy match to // "compete") ... in which case I think the wFST needs // to be log weights or something ... Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton)); /* Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); w.write(levA.toDot()); w.close(); System.out.println("Wrote LevA to out.dot"); */ return FSTUtil.intersectPrefixPaths(levA, fst); }
private NormalizeCharMap(FST<CharsRef> map) { this.map = map; if (map != null) { try { // Pre-cache root arcs: final FST.Arc<CharsRef> scratchArc = new FST.Arc<>(); final FST.BytesReader fstReader = map.getBytesReader(); map.getFirstArc(scratchArc); if (FST.targetHasArcs(scratchArc)) { map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader); while(true) { assert scratchArc.label != FST.END_LABEL; cachedRootArcs.put(Character.valueOf((char) scratchArc.label), new FST.Arc<CharsRef>().copyFrom(scratchArc)); if (scratchArc.isLast()) { break; } map.readNextRealArc(scratchArc, fstReader); } } //System.out.println("cached " + cachedRootArcs.size() + " root arcs"); } catch (IOException ioe) { // Bogus FST IOExceptions!! (will never happen) throw new RuntimeException(ioe); } } }
/** Builds the NormalizeCharMap; call this once you * are done calling {@link #add}. */ public NormalizeCharMap build() { final FST<CharsRef> map; try { final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs); final IntsRefBuilder scratch = new IntsRefBuilder(); for(Map.Entry<String,String> ent : pendingPairs.entrySet()) { builder.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue())); } map = builder.finish(); pendingPairs.clear(); } catch (IOException ioe) { // Bogus FST IOExceptions!! (will never happen) throw new RuntimeException(ioe); } return new NormalizeCharMap(map); }
/** * Constructs a new Stemmer which will use the provided Dictionary to create its stems. * * @param dictionary Dictionary that will be used to create the stems */ public Stemmer(Dictionary dictionary) { this.dictionary = dictionary; this.affixReader = new ByteArrayDataInput(dictionary.affixData); for (int level = 0; level < 3; level++) { if (dictionary.prefixes != null) { prefixArcs[level] = new FST.Arc<>(); prefixReaders[level] = dictionary.prefixes.getBytesReader(); } if (dictionary.suffixes != null) { suffixArcs[level] = new FST.Arc<>(); suffixReaders[level] = dictionary.suffixes.getBytesReader(); } } formStep = dictionary.hasStemExceptions ? 2 : 1; }
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException { Map<String,String> mappings = new TreeMap<>(); for (int i = 0; i < num; i++) { String line = reader.readLine(); String parts[] = line.split("\\s+"); if (parts.length != 3) { throw new ParseException("invalid syntax: " + line, reader.getLineNumber()); } if (mappings.put(parts[1], parts[2]) != null) { throw new IllegalStateException("duplicate mapping specified for: " + parts[1]); } } Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (Map.Entry<String,String> entry : mappings.entrySet()) { Util.toUTF16(entry.getKey(), scratchInts); builder.add(scratchInts.get(), new CharsRef(entry.getValue())); } return builder.finish(); }
/** * Returns an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter} * @return an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter} * @throws IOException if an {@link IOException} occurs; */ public StemmerOverrideMap build() throws IOException { ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>( FST.INPUT_TYPE.BYTE4, outputs); final int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); IntsRefBuilder intsSpare = new IntsRefBuilder(); final int size = hash.size(); BytesRef spare = new BytesRef(); for (int i = 0; i < size; i++) { int id = sort[i]; BytesRef bytesRef = hash.get(id, spare); intsSpare.copyUTF8Bytes(bytesRef); builder.add(intsSpare.get(), new BytesRef(outputValues.get(id))); } return new StemmerOverrideMap(builder.finish(), ignoreCase); }
@Override protected List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> prefixPaths, Automaton lookupAutomaton, FST<PairOutputs.Pair<Long,BytesRef>> fst) throws IOException { // TODO: right now there's no penalty for fuzzy/edits, // ie a completion whose prefix matched exactly what the // user typed gets no boost over completions that // required an edit, which get no boost over completions // requiring two edits. I suspect a multiplicative // factor is appropriate (eg, say a fuzzy match must be at // least 2X better weight than the non-fuzzy match to // "compete") ... in which case I think the wFST needs // to be log weights or something ... Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton)); /* Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); w.write(levA.toDot()); w.close(); System.out.println("Wrote LevA to out.dot"); */ return FSTUtil.intersectPrefixPaths(levA, fst); }
/** * 增加update逻辑,此方法中所有赋值的属性皆为final改造,注意只能在此方法中使用,否则可能导致bug * * @param synonymMap */ @Override public void update(SynonymMap synonymMap) { this.synonyms = synonymMap; this.fst = synonyms.fst; if(this.fst == null) { throw new IllegalArgumentException("fst must be non-null"); } else { this.fstReader = this.fst.getBytesReader(); this.rollBufferSize = 1 + synonyms.maxHorizontalContext; this.futureInputs = new DynamicSynonymFilter.PendingInput[this.rollBufferSize]; this.futureOutputs = new DynamicSynonymFilter.PendingOutputs[this.rollBufferSize]; for(int pos = 0; pos < this.rollBufferSize; ++pos) { this.futureInputs[pos] = new DynamicSynonymFilter.PendingInput(); this.futureOutputs[pos] = new DynamicSynonymFilter.PendingOutputs(); } this.scratchArc = new FST.Arc(); } }
/** Lazily accumulate meta data, when we got a accepted term */ void loadMetaData() throws IOException { FST.Arc<FSTTermOutputs.TermData> last, next; last = stack[metaUpto].fstArc; while (metaUpto != level) { metaUpto++; next = stack[metaUpto].fstArc; next.output = fstOutputs.add(next.output, last.output); last = next; } if (last.isFinal()) { meta = fstOutputs.add(last.output, last.nextFinalOutput); } else { meta = last.output; } state.docFreq = meta.docFreq; state.totalTermFreq = meta.totalTermFreq; }
static<T> void walk(FST<T> fst) throws IOException { final ArrayList<FST.Arc<T>> queue = new ArrayList<>(); final BitSet seen = new BitSet(); final FST.BytesReader reader = fst.getBytesReader(); final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>()); queue.add(startArc); while (!queue.isEmpty()) { final FST.Arc<T> arc = queue.remove(0); final long node = arc.target; //System.out.println(arc); if (FST.targetHasArcs(arc) && !seen.get((int) node)) { seen.set((int) node); fst.readFirstRealTargetArc(node, arc, reader); while (true) { queue.add(new FST.Arc<T>().copyFrom(arc)); if (arc.isLast()) { break; } else { fst.readNextRealArc(arc, reader); } } } } }
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException { meta.writeVInt(field.number); meta.writeByte(FST); meta.writeLong(data.getFilePointer()); PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> builder = new Builder<>(INPUT_TYPE.BYTE1, outputs); IntsRefBuilder scratch = new IntsRefBuilder(); long ord = 0; for (BytesRef v : values) { builder.add(Util.toIntsRef(v, scratch), ord); ord++; } FST<Long> fst = builder.finish(); if (fst != null) { fst.save(data); } meta.writeVLong(ord); }
OrdsSegmentTermsEnumFrame pushFrame(FST.Arc<Output> arc, Output frameData, int length) throws IOException { scratchReader.reset(frameData.bytes.bytes, frameData.bytes.offset, frameData.bytes.length); final long code = scratchReader.readVLong(); final long fpSeek = code >>> OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS; // System.out.println(" fpSeek=" + fpSeek); final OrdsSegmentTermsEnumFrame f = getFrame(1+currentFrame.ord); f.hasTerms = (code & OrdsBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0; f.hasTermsOrig = f.hasTerms; f.isFloor = (code & OrdsBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0; // Must setFloorData before pushFrame in case pushFrame tries to rewind: if (f.isFloor) { f.termOrdOrig = frameData.startOrd; f.setFloorData(scratchReader, frameData.bytes); } pushFrame(arc, fpSeek, length, frameData.startOrd); return f; }
@Override protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths, Automaton lookupAutomaton, FST<Pair<Long,BytesRef>> fst) throws IOException { // TODO: right now there's no penalty for fuzzy/edits, // ie a completion whose prefix matched exactly what the // user typed gets no boost over completions that // required an edit, which get no boost over completions // requiring two edits. I suspect a multiplicative // factor is appropriate (eg, say a fuzzy match must be at // least 2X better weight than the non-fuzzy match to // "compete") ... in which case I think the wFST needs // to be log weights or something ... Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton)); /* Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8); w.write(levA.toDot()); w.close(); System.out.println("Wrote LevA to out.dot"); */ return FSTUtil.intersectPrefixPaths(levA, fst); }
@Override public boolean load(DataInput input) throws IOException { CodecUtil.checkHeader(input, CODEC_NAME, VERSION_START, VERSION_START); count = input.readVLong(); byte separatorOrig = input.readByte(); if (separatorOrig != separator) { throw new IllegalStateException("separator=" + separator + " is incorrect: original model was built with separator=" + separatorOrig); } int gramsOrig = input.readVInt(); if (gramsOrig != grams) { throw new IllegalStateException("grams=" + grams + " is incorrect: original model was built with grams=" + gramsOrig); } totTokens = input.readVLong(); fst = new FST<>(input, PositiveIntOutputs.getSingleton()); return true; }
private Long lookupPrefix(FST<Long> fst, FST.BytesReader bytesReader, BytesRef scratch, Arc<Long> arc) throws /*Bogus*/IOException { Long output = fst.outputs.getNoOutput(); fst.getFirstArc(arc); byte[] bytes = scratch.bytes; int pos = scratch.offset; int end = pos + scratch.length; while (pos < end) { if (fst.findTargetArc(bytes[pos++] & 0xff, arc, arc, bytesReader) == null) { return null; } else { output = fst.outputs.add(output, arc.output); } } return output; }
/** * Cache the root node's output arcs starting with completions with the * highest weights. */ @SuppressWarnings({"unchecked","rawtypes"}) private static Arc<Object>[] cacheRootArcs(FST<Object> automaton) { try { List<Arc<Object>> rootArcs = new ArrayList<>(); Arc<Object> arc = automaton.getFirstArc(new Arc<>()); FST.BytesReader fstReader = automaton.getBytesReader(); automaton.readFirstTargetArc(arc, arc, fstReader); while (true) { rootArcs.add(new Arc<>().copyFrom(arc)); if (arc.isLast()) break; automaton.readNextArc(arc, fstReader); } Collections.reverse(rootArcs); // we want highest weights first. return rootArcs.toArray(new Arc[rootArcs.size()]); } catch (IOException e) { throw new RuntimeException(e); } }
IDVersionSegmentTermsEnumFrame pushFrame(FST.Arc<Pair<BytesRef,Long>> arc, Pair<BytesRef,Long> frameData, int length) throws IOException { scratchReader.reset(frameData.output1.bytes, frameData.output1.offset, frameData.output1.length); final long code = scratchReader.readVLong(); final long fpSeek = code >>> VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS; final IDVersionSegmentTermsEnumFrame f = getFrame(1+currentFrame.ord); f.maxIDVersion = Long.MAX_VALUE - frameData.output2; f.hasTerms = (code & VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0; f.hasTermsOrig = f.hasTerms; f.isFloor = (code & VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0; if (f.isFloor) { f.setFloorData(scratchReader, frameData.output1); } pushFrame(arc, fpSeek, length); return f; }
private TokenInfoDictionary() throws IOException { super(); InputStream is = null; FST<Long> fst = null; boolean success = false; try { is = getResource(FST_FILENAME_SUFFIX); is = new BufferedInputStream(is); fst = new FST<>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton()); success = true; } finally { if (success) { IOUtils.close(is); } else { IOUtils.closeWhileHandlingException(is); } } // TODO: some way to configure? this.fst = new TokenInfoFST(fst, true); }
protected CcWordsFilter(TokenStream input, CcArgs args) { super(input); this.args = args; // this.fst = args.wordSet.fst; this.fstReader = args.wordSet.fst.getBytesReader(); this.fstWords = args.wordSet.words; this.fstFirstArc = new FST.Arc<>(); this.fst.getFirstArc(fstFirstArc); this.scratchWordBytesRef = new BytesRef(); this.scratchArc = new FST.Arc<>(); this.scratchArcOfSep = new FST.Arc<>(); this.scatchArcOfEnd = new FST.Arc<>(); // this.pendingOutputs = new LinkedList<>(); }
@Override public void reset() throws IOException { super.reset(); block.setLength(0); prevToken = null; readBufferIndex = BUFFER_SIZE; readBufferLen = 0; ch = 0; blkStart = 0; nextBlkStart = 0; if (synonymLoader != null && synonymLoader.isUpdate(lastModified)) { lastModified = synonymLoader.getLastModified(); final SynonymMap map = synonymLoader.getSynonymMap(); if (map != null) { synonymMap = map; fst = synonymMap.fst; if (fst == null) { throw new IllegalArgumentException("fst must be non-null"); } fstReader = fst.getBytesReader(); scratchArc = new FST.Arc<>(); clearAttributes(); } } }
/** * Cache the root node's output arcs starting with completions with the * highest weights. */ @SuppressWarnings({"unchecked","rawtypes"}) private static Arc<Object>[] cacheRootArcs(FST<Object> automaton) { try { List<Arc<Object>> rootArcs = new ArrayList<Arc<Object>>(); Arc<Object> arc = automaton.getFirstArc(new Arc<Object>()); FST.BytesReader fstReader = automaton.getBytesReader(); automaton.readFirstTargetArc(arc, arc, fstReader); while (true) { rootArcs.add(new Arc<Object>().copyFrom(arc)); if (arc.isLast()) break; automaton.readNextArc(arc, fstReader); } Collections.reverse(rootArcs); // we want highest weights first. return rootArcs.toArray(new Arc[rootArcs.size()]); } catch (IOException e) { throw new RuntimeException(e); } }
@Override public boolean load(DataInput input) throws IOException { CodecUtil.checkHeader(input, CODEC_NAME, VERSION_START, VERSION_START); count = input.readVLong(); byte separatorOrig = input.readByte(); if (separatorOrig != separator) { throw new IllegalStateException("separator=" + separator + " is incorrect: original model was built with separator=" + separatorOrig); } int gramsOrig = input.readVInt(); if (gramsOrig != grams) { throw new IllegalStateException("grams=" + grams + " is incorrect: original model was built with grams=" + gramsOrig); } totTokens = input.readVLong(); fst = new FST<Long>(input, PositiveIntOutputs.getSingleton()); return true; }
private TokenInfoDictionary() throws IOException { super(); IOException priorE = null; InputStream is = null; FST<Long> fst = null; try { is = getResource(FST_FILENAME_SUFFIX); is = new BufferedInputStream(is); fst = new FST<Long>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton(true)); } catch (IOException ioe) { priorE = ioe; } finally { IOUtils.closeWhileHandlingException(priorE, is); } // TODO: some way to configure? this.fst = new TokenInfoFST(fst, true); }
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException { meta.writeVInt(field.number); meta.writeByte(FST); meta.writeLong(data.getFilePointer()); PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> builder = new Builder<Long>(INPUT_TYPE.BYTE1, outputs); IntsRef scratch = new IntsRef(); long ord = 0; for (BytesRef v : values) { builder.add(Util.toIntsRef(v, scratch), ord); ord++; } FST<Long> fst = builder.finish(); if (fst != null) { fst.save(data); } meta.writeVLong(ord); }
/** Builds the NormalizeCharMap; call this once you * are done calling {@link #add}. */ public NormalizeCharMap build() { final FST<CharsRef> map; try { final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs); final IntsRef scratch = new IntsRef(); for(Map.Entry<String,String> ent : pendingPairs.entrySet()) { builder.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue())); } map = builder.finish(); pendingPairs.clear(); } catch (IOException ioe) { // Bogus FST IOExceptions!! (will never happen) throw new RuntimeException(ioe); } return new NormalizeCharMap(map); }
TermsWriter(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; noOutputs = NoOutputs.getSingleton(); // This Builder is just used transiently to fragment // terms into "good" blocks; we don't save the // resulting FST: blockBuilder = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, noOutputs, new FindBlocks(), false, PackedInts.COMPACT, true, 15); postingsWriter.setField(fieldInfo); }
/** * Returns an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter} * @return an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter} * @throws IOException if an {@link IOException} occurs; */ public StemmerOverrideMap build() throws IOException { ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<BytesRef>( FST.INPUT_TYPE.BYTE4, outputs); final int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); IntsRef intsSpare = new IntsRef(); final int size = hash.size(); for (int i = 0; i < size; i++) { int id = sort[i]; BytesRef bytesRef = hash.get(id, spare); UnicodeUtil.UTF8toUTF32(bytesRef, intsSpare); builder.add(intsSpare, new BytesRef(outputValues.get(id))); } return new StemmerOverrideMap(builder.finish(), ignoreCase); }
private NormalizeCharMap(FST<CharsRef> map) { this.map = map; if (map != null) { try { // Pre-cache root arcs: final FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>(); final FST.BytesReader fstReader = map.getBytesReader(); map.getFirstArc(scratchArc); if (FST.targetHasArcs(scratchArc)) { map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader); while(true) { assert scratchArc.label != FST.END_LABEL; cachedRootArcs.put(Character.valueOf((char) scratchArc.label), new FST.Arc<CharsRef>().copyFrom(scratchArc)); if (scratchArc.isLast()) { break; } map.readNextRealArc(scratchArc, fstReader); } } //System.out.println("cached " + cachedRootArcs.size() + " root arcs"); } catch (IOException ioe) { // Bogus FST IOExceptions!! (will never happen) throw new RuntimeException(ioe); } } }
static<T> void walk(FST<T> fst) throws IOException { final ArrayList<FST.Arc<T>> queue = new ArrayList<FST.Arc<T>>(); final BitSet seen = new BitSet(); final FST.BytesReader reader = fst.getBytesReader(); final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>()); queue.add(startArc); while (!queue.isEmpty()) { final FST.Arc<T> arc = queue.remove(0); final long node = arc.target; //System.out.println(arc); if (FST.targetHasArcs(arc) && !seen.get((int) node)) { seen.set((int) node); fst.readFirstRealTargetArc(node, arc, reader); while (true) { queue.add(new FST.Arc<T>().copyFrom(arc)); if (arc.isLast()) { break; } else { fst.readNextRealArc(arc, reader); } } } } }