@Override protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths, Automaton lookupAutomaton, FST<Pair<Long,BytesRef>> fst) throws IOException { // TODO: right now there's no penalty for fuzzy/edits, // ie a completion whose prefix matched exactly what the // user typed gets no boost over completions that // required an edit, which get no boost over completions // requiring two edits. I suspect a multiplicative // factor is appropriate (eg, say a fuzzy match must be at // least 2X better weight than the non-fuzzy match to // "compete") ... in which case I think the wFST needs // to be log weights or something ... Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton)); /* Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8); w.write(levA.toDot()); w.close(); System.out.println("Wrote LevA to out.dot"); */ return FSTUtil.intersectPrefixPaths(levA, fst); }
IDVersionSegmentTermsEnumFrame pushFrame(FST.Arc<Pair<BytesRef,Long>> arc, Pair<BytesRef,Long> frameData, int length) throws IOException { scratchReader.reset(frameData.output1.bytes, frameData.output1.offset, frameData.output1.length); final long code = scratchReader.readVLong(); final long fpSeek = code >>> VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS; final IDVersionSegmentTermsEnumFrame f = getFrame(1+currentFrame.ord); f.maxIDVersion = Long.MAX_VALUE - frameData.output2; f.hasTerms = (code & VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0; f.hasTermsOrig = f.hasTerms; f.isFloor = (code & VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0; if (f.isFloor) { f.setFloorData(scratchReader, frameData.output1); } pushFrame(arc, fpSeek, length); return f; }
@Override protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths, Automaton lookupAutomaton, FST<Pair<Long,BytesRef>> fst) throws IOException { // TODO: right now there's no penalty for fuzzy/edits, // ie a completion whose prefix matched exactly what the // user typed gets no boost over completions that // required an edit, which get no boost over completions // requiring two edits. I suspect a multiplicative // factor is appropriate (eg, say a fuzzy match must be at // least 2X better weight than the non-fuzzy match to // "compete") ... in which case I think the wFST needs // to be log weights or something ... Automaton levA = toLevenshteinAutomata(lookupAutomaton); /* Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); w.write(levA.toDot()); w.close(); System.out.println("Wrote LevA to out.dot"); */ return FSTUtil.intersectPrefixPaths(levA, fst); }
@Override protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths, Automaton lookupAutomaton, FST<Pair<Long,BytesRef>> fst) throws IOException { // TODO: right now there's no penalty for fuzzy/edits, // ie a completion whose prefix matched exactly what the // user typed gets no boost over completions that // required an edit, which get no boost over completions // requiring two edits. I suspect a multiplicative // factor is appropriate (eg, say a fuzzy match must be at // least 2X better weight than the non-fuzzy match to // "compete") ... in which case I think the wFST needs // to be log weights or something ... Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton)); /* Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); w.write(levA.toDot()); w.close(); System.out.println("Wrote LevA to out.dot"); */ return FSTUtil.intersectPrefixPaths(levA, fst); }
/** Returns all completion paths to initialize the search. */ protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths, Automaton lookupAutomaton, FST<Pair<Long,BytesRef>> fst) throws IOException { return prefixPaths; }
/** * Creates a new suggester. * * @param indexAnalyzer Analyzer that will be used for * analyzing suggestions while building the index. * @param queryAnalyzer Analyzer that will be used for * analyzing query text during lookup. * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP} * @param maxSurfaceFormsPerAnalyzedForm Maximum number of * surface forms to keep for a single analyzed form. * When there are too many surface forms we discard the * lowest weighted ones. * @param maxGraphExpansions Maximum number of graph paths * to expand from the analyzed form. Set this to -1 for * no limit. */ public XAnalyzingSuggester(Analyzer indexAnalyzer, Automaton queryPrefix, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, boolean preservePositionIncrements, FST<Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput, int sepLabel, int payloadSep, int endByte, int holeCharacter) { // SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput this.indexAnalyzer = indexAnalyzer; this.queryAnalyzer = queryAnalyzer; this.fst = fst; this.hasPayloads = hasPayloads; if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) { throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options); } this.exactFirst = (options & EXACT_FIRST) != 0; this.preserveSep = (options & PRESERVE_SEP) != 0; // FLORIAN EDIT: I added <code>queryPrefix</code> for context dependent suggestions this.queryPrefix = queryPrefix; // NOTE: this is just an implementation limitation; if // somehow this is a problem we could fix it by using // more than one byte to disambiguate ... but 256 seems // like it should be way more than enough. if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) { throw new IllegalArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and <= 256 (got: " + maxSurfaceFormsPerAnalyzedForm + ")"); } this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm; if (maxGraphExpansions < 1 && maxGraphExpansions != -1) { throw new IllegalArgumentException("maxGraphExpansions must be -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")"); } this.maxGraphExpansions = maxGraphExpansions; this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput; this.preservePositionIncrements = preservePositionIncrements; this.sepLabel = sepLabel; this.payloadSep = payloadSep; this.endByte = endByte; this.holeCharacter = holeCharacter; }
public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, boolean hasPayloads, int maxAnalyzedPathsForOneInput, FST<Pair<Long, BytesRef>> fst, int sepLabel, int payloadSep, int endByte, int holeCharacter) { this.preserveSep = preserveSep; this.preservePositionIncrements = preservePositionIncrements; this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm; this.maxGraphExpansions = maxGraphExpansions; this.hasPayloads = hasPayloads; this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput; this.fst = fst; this.sepLabel = sepLabel; this.payloadSep = payloadSep; this.endByte = endByte; this.holeCharacter = holeCharacter; }
/** Returns all prefix paths to initialize the search. */ protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths, Automaton lookupAutomaton, FST<Pair<Long,BytesRef>> fst) throws IOException { return prefixPaths; }
private FST.Arc<Pair<BytesRef,Long>> getArc(int ord) { if (ord >= arcs.length) { @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<Pair<BytesRef,Long>>[] next = new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(arcs, 0, next, 0, arcs.length); for(int arcOrd=arcs.length;arcOrd<next.length;arcOrd++) { next[arcOrd] = new FST.Arc<>(); } arcs = next; } return arcs[ord]; }
IDVersionSegmentTermsEnumFrame pushFrame(FST.Arc<Pair<BytesRef,Long>> arc, long fp, int length) throws IOException { final IDVersionSegmentTermsEnumFrame f = getFrame(1+currentFrame.ord); f.arc = arc; if (f.fpOrig == fp && f.nextEnt != -1) { //if (DEBUG) System.out.println(" push reused frame ord=" + f.ord + " fp=" + f.fp + " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + term.length + " vs prefix=" + f.prefix); if (f.prefix > targetBeforeCurrentLength) { f.rewind(); } else { // if (DEBUG) { // System.out.println(" skip rewind!"); // } } assert length == f.prefix; } else { f.nextEnt = -1; f.prefix = length; f.state.termBlockOrd = 0; f.fpOrig = f.fp = fp; f.lastSubFP = -1; // if (DEBUG) { // final int sav = term.length; // term.length = length; // System.out.println(" push new frame ord=" + f.ord + " fp=" + f.fp + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " pref=" + brToString(term)); // term.length = sav; // } } return f; }
VersionFieldReader(VersionBlockTreeTermsReader parent, FieldInfo fieldInfo, long numTerms, Pair<BytesRef,Long> rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, int longsSize, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm) throws IOException { assert numTerms > 0; this.fieldInfo = fieldInfo; //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); this.parent = parent; this.numTerms = numTerms; this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; this.docCount = docCount; this.indexStartFP = indexStartFP; this.rootCode = rootCode; this.longsSize = longsSize; this.minTerm = minTerm; this.maxTerm = maxTerm; // if (DEBUG) { // System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor); // } rootBlockFP = (new ByteArrayDataInput(rootCode.output1.bytes, rootCode.output1.offset, rootCode.output1.length)).readVLong() >>> VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS; if (indexIn != null) { final IndexInput clone = indexIn.clone(); //System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name); clone.seek(indexStartFP); index = new FST<>(clone, VersionBlockTreeTermsWriter.FST_OUTPUTS); /* if (false) { final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); Util.toDot(index, w, false, false); System.out.println("FST INDEX: SAVED to " + dotFileName); w.close(); } */ } else { index = null; } }
public FieldMetaData(FieldInfo fieldInfo, Pair<BytesRef,Long> rootCode, long numTerms, long indexStartFP, int longsSize, BytesRef minTerm, BytesRef maxTerm) { assert numTerms > 0; this.fieldInfo = fieldInfo; assert rootCode != null: "field=" + fieldInfo.name + " numTerms=" + numTerms; this.rootCode = rootCode; this.indexStartFP = indexStartFP; this.numTerms = numTerms; this.longsSize = longsSize; this.minTerm = minTerm; this.maxTerm = maxTerm; }
public PendingBlock(BytesRef prefix, long maxVersion, long fp, boolean hasTerms, boolean isFloor, int floorLeadByte, List<FST<Pair<BytesRef,Long>>> subIndices) { super(false); this.prefix = prefix; this.maxVersion = maxVersion; this.fp = fp; this.hasTerms = hasTerms; this.isFloor = isFloor; this.floorLeadByte = floorLeadByte; this.subIndices = subIndices; }
private void append(Builder<Pair<BytesRef,Long>> builder, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException { final BytesRefFSTEnum<Pair<BytesRef,Long>> subIndexEnum = new BytesRefFSTEnum<>(subIndex); BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> indexEnt; while((indexEnt = subIndexEnum.next()) != null) { //if (DEBUG) { // System.out.println(" add sub=" + indexEnt.input + " output=" + indexEnt.output); //} builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); } }
/** like testShortestPaths, but uses pairoutputs so we have both a weight and an output */ public void testShortestPathsWFST() throws Exception { PairOutputs<Long,Long> outputs = new PairOutputs<>( PositiveIntOutputs.getSingleton(), // weight PositiveIntOutputs.getSingleton() // output ); final Builder<Pair<Long,Long>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final IntsRefBuilder scratch = new IntsRefBuilder(); builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L)); builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L)); builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L)); final FST<Pair<Long,Long>> fst = builder.finish(); //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); //Util.toDot(fst, w, false, false); //w.close(); Util.TopResults<Pair<Long,Long>> res = Util.shortestPaths(fst, fst.getFirstArc(new FST.Arc<Pair<Long,Long>>()), outputs.getNoOutput(), minPairWeightComparator, 3, true); assertTrue(res.isComplete); assertEquals(3, res.topN.size()); assertEquals(Util.toIntsRef(new BytesRef("aac"), scratch), res.topN.get(0).input); assertEquals(7L, res.topN.get(0).output.output1.longValue()); // weight assertEquals(36L, res.topN.get(0).output.output2.longValue()); // output assertEquals(Util.toIntsRef(new BytesRef("ax"), scratch), res.topN.get(1).input); assertEquals(17L, res.topN.get(1).output.output1.longValue()); // weight assertEquals(85L, res.topN.get(1).output.output2.longValue()); // output assertEquals(Util.toIntsRef(new BytesRef("aab"), scratch), res.topN.get(2).input); assertEquals(22L, res.topN.get(2).output.output1.longValue()); // weight assertEquals(57L, res.topN.get(2).output.output2.longValue()); // output }
@Override public boolean load(InputStream input) throws IOException { DataInput dataIn = new InputStreamDataInput(input); try { this.fst = new FST<Pair<Long,BytesRef>>(dataIn, new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton())); maxAnalyzedPathsForOneInput = dataIn.readVInt(); } finally { IOUtils.close(input); } return true; }
/** like testShortestPaths, but uses pairoutputs so we have both a weight and an output */ public void testShortestPathsWFST() throws Exception { PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>( PositiveIntOutputs.getSingleton(true), // weight PositiveIntOutputs.getSingleton(true) // output ); final Builder<Pair<Long,Long>> builder = new Builder<Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, outputs); final IntsRef scratch = new IntsRef(); builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L)); builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L)); builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L)); final FST<Pair<Long,Long>> fst = builder.finish(); //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); //Util.toDot(fst, w, false, false); //w.close(); Util.MinResult<Pair<Long,Long>>[] r = Util.shortestPaths(fst, fst.getFirstArc(new FST.Arc<Pair<Long,Long>>()), outputs.getNoOutput(), minPairWeightComparator, 3, true); assertEquals(3, r.length); assertEquals(Util.toIntsRef(new BytesRef("aac"), scratch), r[0].input); assertEquals(7L, r[0].output.output1.longValue()); // weight assertEquals(36L, r[0].output.output2.longValue()); // output assertEquals(Util.toIntsRef(new BytesRef("ax"), scratch), r[1].input); assertEquals(17L, r[1].output.output1.longValue()); // weight assertEquals(85L, r[1].output.output2.longValue()); // output assertEquals(Util.toIntsRef(new BytesRef("aab"), scratch), r[2].input); assertEquals(22L, r[2].output.output1.longValue()); // weight assertEquals(57L, r[2].output.output2.longValue()); // output }
@Override public boolean load(DataInput input) throws IOException { count = input.readVLong(); this.fst = new FST<Pair<Long,BytesRef>>(input, new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton())); maxAnalyzedPathsForOneInput = input.readVInt(); hasPayloads = input.readByte() == 1; return true; }
@Override public boolean load(InputStream input) throws IOException { DataInput dataIn = new InputStreamDataInput(input); try { this.fst = new FST<Pair<Long,BytesRef>>(dataIn, new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton())); maxAnalyzedPathsForOneInput = dataIn.readVInt(); hasPayloads = dataIn.readByte() == 1; } finally { IOUtils.close(input); } return true; }
/** like testShortestPaths, but uses pairoutputs so we have both a weight and an output */ public void testShortestPathsWFST() throws Exception { PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>( PositiveIntOutputs.getSingleton(), // weight PositiveIntOutputs.getSingleton() // output ); final Builder<Pair<Long,Long>> builder = new Builder<Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, outputs); final IntsRef scratch = new IntsRef(); builder.add(Util.toIntsRef(new BytesRef("aab"), scratch), outputs.newPair(22L, 57L)); builder.add(Util.toIntsRef(new BytesRef("aac"), scratch), outputs.newPair(7L, 36L)); builder.add(Util.toIntsRef(new BytesRef("ax"), scratch), outputs.newPair(17L, 85L)); final FST<Pair<Long,Long>> fst = builder.finish(); //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); //Util.toDot(fst, w, false, false); //w.close(); Util.MinResult<Pair<Long,Long>>[] r = Util.shortestPaths(fst, fst.getFirstArc(new FST.Arc<Pair<Long,Long>>()), outputs.getNoOutput(), minPairWeightComparator, 3, true); assertEquals(3, r.length); assertEquals(Util.toIntsRef(new BytesRef("aac"), scratch), r[0].input); assertEquals(7L, r[0].output.output1.longValue()); // weight assertEquals(36L, r[0].output.output2.longValue()); // output assertEquals(Util.toIntsRef(new BytesRef("ax"), scratch), r[1].input); assertEquals(17L, r[1].output.output1.longValue()); // weight assertEquals(85L, r[1].output.output2.longValue()); // output assertEquals(Util.toIntsRef(new BytesRef("aab"), scratch), r[2].input); assertEquals(22L, r[2].output.output1.longValue()); // weight assertEquals(57L, r[2].output.output2.longValue()); // output }
/** * Creates a new suggester. * * @param indexAnalyzer Analyzer that will be used for * analyzing suggestions while building the index. * @param queryAnalyzer Analyzer that will be used for * analyzing query text during lookup. * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP} * @param maxSurfaceFormsPerAnalyzedForm Maximum number of * surface forms to keep for a single analyzed form. * When there are too many surface forms we discard the * lowest weighted ones. * @param maxGraphExpansions Maximum number of graph paths * to expand from the analyzed form. Set this to -1 for * no limit. */ public XAnalyzingSuggester(Analyzer indexAnalyzer, Automaton queryPrefix, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, boolean preservePositionIncrements, FST<Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput, int sepLabel, int payloadSep, int endByte, int holeCharacter) { // SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput this.indexAnalyzer = indexAnalyzer; this.queryAnalyzer = queryAnalyzer; this.fst = fst; this.hasPayloads = hasPayloads; if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) { throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options); } this.exactFirst = (options & EXACT_FIRST) != 0; this.preserveSep = (options & PRESERVE_SEP) != 0; // FLORIAN EDIT: I added <code>queryPrefix</code> for context dependent suggestions this.queryPrefix = queryPrefix; // NOTE: this is just an implementation limitation; if // somehow this is a problem we could fix it by using // more than one byte to disambiguate ... but 256 seems // like it should be way more than enough. if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) { throw new IllegalArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and <= 256 (got: " + maxSurfaceFormsPerAnalyzedForm + ")"); } this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm; if (maxGraphExpansions < 1 && maxGraphExpansions != -1) { throw new IllegalArgumentException("maxGraphExpansions must be -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")"); } this.maxGraphExpansions = maxGraphExpansions; this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput; this.preservePositionIncrements = preservePositionIncrements; this.sepLabel = sepLabel; this.payloadSep = payloadSep; this.endByte = endByte; this.holeCharacter = holeCharacter; }
@Override public int compare(Pair<Long,BytesRef> left, Pair<Long,BytesRef> right) { return left.output1.compareTo(right.output1); }
public FST<Pair<Long, BytesRef>> build() throws IOException { return builder.finish(); }
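The snippets above all operate on FSTs whose output type pairs a Long weight with a BytesRef payload. For reference, the following is a minimal sketch, not taken from the original sources, of how such an FST can be assembled with the same PairOutputs/Builder calls that appear in the snippets; the method name buildWeightedPayloadFST, the input strings, and the payload values are illustrative only.

import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

/** Minimal sketch (illustrative): builds a tiny FST<Pair<Long,BytesRef>> whose
 *  outputs pair a weight (output1) with a payload (output2), mirroring the
 *  output type loaded and searched by the suggester code above. */
static FST<Pair<Long,BytesRef>> buildWeightedPayloadFST() throws IOException {
  PairOutputs<Long,BytesRef> outputs = new PairOutputs<>(
      PositiveIntOutputs.getSingleton(),    // output1: weight
      ByteSequenceOutputs.getSingleton());  // output2: payload / surface form
  Builder<Pair<Long,BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
  IntsRefBuilder scratch = new IntsRefBuilder();
  // FST inputs must be added in sorted BytesRef order.
  builder.add(Util.toIntsRef(new BytesRef("apple"), scratch), outputs.newPair(10L, new BytesRef("payload-a")));
  builder.add(Util.toIntsRef(new BytesRef("apply"), scratch), outputs.newPair(3L, new BytesRef("payload-b")));
  return builder.finish();
}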