public void finishTerm(long defaultWeight) throws IOException { ArrayUtil.timSort(surfaceFormsAndPayload, 0, count); int deduplicator = 0; analyzed.append((byte) 0); analyzed.setLength(analyzed.length() + 1); analyzed.grow(analyzed.length()); for (int i = 0; i < count; i++) { analyzed.setByteAt(analyzed.length() - 1, (byte) deduplicator++); Util.toIntsRef(analyzed.get(), scratchInts); SurfaceFormAndPayload candiate = surfaceFormsAndPayload[i]; long cost = candiate.weight == -1 ? encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight)) : candiate.weight; builder.add(scratchInts.get(), outputs.newPair(cost, candiate.payload)); } seenSurfaceForms.clear(); count = 0; }
/** Builds the NormalizeCharMap; call this once you * are done calling {@link #add}. */ public NormalizeCharMap build() { final FST<CharsRef> map; try { final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs); final IntsRefBuilder scratch = new IntsRefBuilder(); for(Map.Entry<String,String> ent : pendingPairs.entrySet()) { builder.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue())); } map = builder.finish(); pendingPairs.clear(); } catch (IOException ioe) { // Bogus FST IOExceptions!! (will never happen) throw new RuntimeException(ioe); } return new NormalizeCharMap(map); }
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException { Map<String,String> mappings = new TreeMap<>(); for (int i = 0; i < num; i++) { String line = reader.readLine(); String parts[] = line.split("\\s+"); if (parts.length != 3) { throw new ParseException("invalid syntax: " + line, reader.getLineNumber()); } if (mappings.put(parts[1], parts[2]) != null) { throw new IllegalStateException("duplicate mapping specified for: " + parts[1]); } } Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (Map.Entry<String,String> entry : mappings.entrySet()) { Util.toUTF16(entry.getKey(), scratchInts); builder.add(scratchInts.get(), new CharsRef(entry.getValue())); } return builder.finish(); }
@Override public void finishTerm(BytesRef text, TermStats stats) throws IOException { // write term meta data into fst final BlockTermState state = postingsWriter.newTermState(); final FSTTermOutputs.TermData meta = new FSTTermOutputs.TermData(); meta.longs = new long[longsSize]; meta.bytes = null; meta.docFreq = state.docFreq = stats.docFreq; meta.totalTermFreq = state.totalTermFreq = stats.totalTermFreq; postingsWriter.finishTerm(state); postingsWriter.encodeTerm(meta.longs, metaWriter, fieldInfo, state, true); final int bytesSize = (int)metaWriter.getFilePointer(); if (bytesSize > 0) { meta.bytes = new byte[bytesSize]; metaWriter.writeTo(meta.bytes, 0); metaWriter.reset(); } builder.add(Util.toIntsRef(text, scratchTerm), meta); numTerms++; }
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException { meta.writeVInt(field.number); meta.writeByte(FST); meta.writeLong(data.getFilePointer()); PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> builder = new Builder<>(INPUT_TYPE.BYTE1, outputs); IntsRefBuilder scratch = new IntsRefBuilder(); long ord = 0; for (BytesRef v : values) { builder.add(Util.toIntsRef(v, scratch), ord); ord++; } FST<Long> fst = builder.finish(); if (fst != null) { fst.save(data); } meta.writeVLong(ord); }
@Override public void build(TermFreqIterator iterator) throws IOException { BytesRef scratch = new BytesRef(); TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator); IntsRef scratchInts = new IntsRef(); BytesRef previous = null; PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs); while ((scratch = iter.next()) != null) { long cost = iter.weight(); if (previous == null) { previous = new BytesRef(); } else if (scratch.equals(previous)) { continue; // for duplicate suggestions, the best weight is actually // added } Util.toIntsRef(scratch, scratchInts); builder.add(scratchInts, cost); previous.copyBytes(scratch); } fst = builder.finish(); }
/** Builds the NormalizeCharMap; call this once you * are done calling {@link #add}. */ public NormalizeCharMap build() { final FST<CharsRef> map; try { final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs); final IntsRef scratch = new IntsRef(); for(Map.Entry<String,String> ent : pendingPairs.entrySet()) { builder.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue())); } map = builder.finish(); pendingPairs.clear(); } catch (IOException ioe) { // Bogus FST IOExceptions!! (will never happen) throw new RuntimeException(ioe); } return new NormalizeCharMap(map); }
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException { meta.writeVInt(field.number); meta.writeByte(FST); meta.writeLong(data.getFilePointer()); PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); Builder<Long> builder = new Builder<Long>(INPUT_TYPE.BYTE1, outputs); IntsRef scratch = new IntsRef(); long ord = 0; for (BytesRef v : values) { builder.add(Util.toIntsRef(v, scratch), ord); ord++; } FST<Long> fst = builder.finish(); if (fst != null) { fst.save(data); } meta.writeVLong(ord); }
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException { meta.writeVInt(field.number); meta.writeByte(FST); meta.writeLong(data.getFilePointer()); PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> builder = new Builder<Long>(INPUT_TYPE.BYTE1, outputs); IntsRef scratch = new IntsRef(); long ord = 0; for (BytesRef v : values) { builder.add(Util.toIntsRef(v, scratch), ord); ord++; } FST<Long> fst = builder.finish(); if (fst != null) { fst.save(data); } meta.writeVLong(ord); }
@Override public void build(InputIterator iterator) throws IOException { if (iterator.hasPayloads()) { throw new IllegalArgumentException("this suggester doesn't support payloads"); } BytesRef scratch = new BytesRef(); InputIterator iter = new WFSTInputIterator(iterator); IntsRef scratchInts = new IntsRef(); BytesRef previous = null; PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs); while ((scratch = iter.next()) != null) { long cost = iter.weight(); if (previous == null) { previous = new BytesRef(); } else if (scratch.equals(previous)) { continue; // for duplicate suggestions, the best weight is actually // added } Util.toIntsRef(scratch, scratchInts); builder.add(scratchInts, cost); previous.copyBytes(scratch); } fst = builder.finish(); }
private void append(Builder<BytesRef> builder, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException { final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<>(subIndex); BytesRefFSTEnum.InputOutput<BytesRef> indexEnt; while((indexEnt = subIndexEnum.next()) != null) { //if (DEBUG) { // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output); //} builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); } }
@Override public void seekExact(long ord) throws IOException { // TODO: would be better to make this simpler and faster. // but we dont want to introduce a bug that corrupts our enum state! bytesReader.setPosition(0); fst.getFirstArc(firstArc); IntsRef output = Util.getByOutput(fst, ord, bytesReader, firstArc, scratchArc, scratchInts); BytesRefBuilder scratchBytes = new BytesRefBuilder(); scratchBytes.clear(); Util.toBytesRef(output, scratchBytes); // TODO: we could do this lazily, better to try to push into FSTEnum though? in.seekExact(scratchBytes.get()); }
private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException { IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs); IntsRefBuilder scratch = new IntsRefBuilder(); for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) { Util.toUTF32(entry.getKey(), scratch); List<Integer> entries = entry.getValue(); IntsRef output = new IntsRef(entries.size()); for (Integer c : entries) { output.ints[output.length++] = c; } builder.add(scratch.get(), output); } return builder.finish(); }
@Override public boolean incrementToken() throws IOException { clearAttributes(); if (finiteStrings == null) { Set<IntsRef> strings = toFiniteStrings.toFiniteStrings(input); if (strings.size() > MAX_PATHS) { throw new IllegalArgumentException("TokenStream expanded to " + strings.size() + " finite strings. Only <= " + MAX_PATHS + " finite strings are supported"); } posInc = strings.size(); finiteStrings = strings.iterator(); } if (finiteStrings.hasNext()) { posAttr.setPositionIncrement(posInc); /* * this posInc encodes the number of paths that this surface form * produced. Multi Fields have the same surface form and therefore sum up */ posInc = 0; Util.toBytesRef(finiteStrings.next(), bytesAtt.builder()); // now we have UTF-8 if (charTermAttribute != null) { charTermAttribute.setLength(0); charTermAttribute.append(bytesAtt.toUTF16()); } if (payload != null) { payloadAttr.setPayload(this.payload); } return true; } return false; }
@Override public void seekExact(long ord) throws IOException { // TODO: would be better to make this simpler and faster. // but we dont want to introduce a bug that corrupts our enum state! bytesReader.setPosition(0); fst.getFirstArc(firstArc); IntsRef output = Util.getByOutput(fst, ord, bytesReader, firstArc, scratchArc, scratchInts); // TODO: we could do this lazily, better to try to push into FSTEnum though? in.seekExact(Util.toBytesRef(output, new BytesRefBuilder())); }
/** Load frame for target arc(node) on fst, so that * arc.label >= label and !fsa.reject(arc.label) */ Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException { FST.Arc<FSTTermOutputs.TermData> arc = frame.fstArc; arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader); if (arc == null) { return null; } frame.fsaState = fsa.step(top.fsaState, arc.label); //if (TEST) System.out.println(" loadCeil frame="+frame); if (frame.fsaState == -1) { return loadNextFrame(top, frame); } return frame; }
@Override public void finishTerm(BytesRef text, TermStats stats) throws IOException { if (numTerms > 0 && numTerms % SKIP_INTERVAL == 0) { bufferSkip(); } // write term meta data into fst final long longs[] = new long[longsSize]; final long delta = stats.totalTermFreq - stats.docFreq; if (stats.totalTermFreq > 0) { if (delta == 0) { statsOut.writeVInt(stats.docFreq<<1|1); } else { statsOut.writeVInt(stats.docFreq<<1|0); statsOut.writeVLong(stats.totalTermFreq-stats.docFreq); } } else { statsOut.writeVInt(stats.docFreq); } BlockTermState state = postingsWriter.newTermState(); state.docFreq = stats.docFreq; state.totalTermFreq = stats.totalTermFreq; postingsWriter.finishTerm(state); postingsWriter.encodeTerm(longs, metaBytesOut, fieldInfo, state, true); for (int i = 0; i < longsSize; i++) { metaLongsOut.writeVLong(longs[i] - lastLongs[i]); lastLongs[i] = longs[i]; } metaLongsOut.writeVLong(metaBytesOut.getFilePointer() - lastMetaBytesFP); builder.add(Util.toIntsRef(text, scratchTerm), numTerms); numTerms++; lastMetaBytesFP = metaBytesOut.getFilePointer(); }
/** Load frame for target arc(node) on fst, so that * arc.label >= label and !fsa.reject(arc.label) */ Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException { FST.Arc<Long> arc = frame.arc; arc = Util.readCeilArc(label, fst, top.arc, arc, fstReader); if (arc == null) { return null; } frame.state = fsa.step(top.state, arc.label); //if (TEST) System.out.println(" loadCeil frame="+frame); if (frame.state == -1) { return loadNextFrame(top, frame); } return frame; }
private void append(Builder<Output> builder, FST<Output> subIndex, long termOrdOffset, IntsRefBuilder scratchIntsRef) throws IOException { final BytesRefFSTEnum<Output> subIndexEnum = new BytesRefFSTEnum<>(subIndex); BytesRefFSTEnum.InputOutput<Output> indexEnt; while ((indexEnt = subIndexEnum.next()) != null) { //if (DEBUG) { // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output); //} Output output = indexEnt.output; long blockTermCount = output.endOrd - output.startOrd + 1; Output newOutput = FST_OUTPUTS.newOutput(output.bytes, termOrdOffset+output.startOrd, output.endOrd-termOrdOffset); //System.out.println(" append sub=" + indexEnt.input + " output=" + indexEnt.output + " termOrdOffset=" + termOrdOffset + " blockTermCount=" + blockTermCount + " newOutput=" + newOutput + " endOrd=" + (termOrdOffset+Long.MAX_VALUE-output.endOrd)); builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), newOutput); } }
private void loadTermsIndex() throws IOException { if (fst == null) { IndexInput clone = in.clone(); clone.seek(indexStart); fst = new FST<>(clone, fstOutputs); clone.close(); /* final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); Util.toDot(fst, w, false, false); System.out.println("FST INDEX: SAVED to " + dotFileName); w.close(); */ if (indexDivisor > 1) { // subsample final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst); BytesRefFSTEnum.InputOutput<Long> result; int count = indexDivisor; while((result = fstEnum.next()) != null) { if (count == indexDivisor) { builder.add(Util.toIntsRef(result.input, scratchIntsRef), result.output); count = 0; } count++; } fst = builder.finish(); } } }
private void append(Builder<Pair<BytesRef,Long>> builder, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException { final BytesRefFSTEnum<Pair<BytesRef,Long>> subIndexEnum = new BytesRefFSTEnum<>(subIndex); BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> indexEnt; while((indexEnt = subIndexEnum.next()) != null) { //if (DEBUG) { // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output); //} builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); } }