/** Builds the NormalizeCharMap; call this once you * are done calling {@link #add}. */ public NormalizeCharMap build() { final FST<CharsRef> map; try { final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs); final IntsRefBuilder scratch = new IntsRefBuilder(); for(Map.Entry<String,String> ent : pendingPairs.entrySet()) { builder.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue())); } map = builder.finish(); pendingPairs.clear(); } catch (IOException ioe) { // Bogus FST IOExceptions!! (will never happen) throw new RuntimeException(ioe); } return new NormalizeCharMap(map); }
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException { Map<String,String> mappings = new TreeMap<>(); for (int i = 0; i < num; i++) { String line = reader.readLine(); String parts[] = line.split("\\s+"); if (parts.length != 3) { throw new ParseException("invalid syntax: " + line, reader.getLineNumber()); } if (mappings.put(parts[1], parts[2]) != null) { throw new IllegalStateException("duplicate mapping specified for: " + parts[1]); } } Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs); IntsRefBuilder scratchInts = new IntsRefBuilder(); for (Map.Entry<String,String> entry : mappings.entrySet()) { Util.toUTF16(entry.getKey(), scratchInts); builder.add(scratchInts.get(), new CharsRef(entry.getValue())); } return builder.finish(); }
/** * Returns an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter} * @return an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter} * @throws IOException if an {@link IOException} occurs; */ public StemmerOverrideMap build() throws IOException { ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>( FST.INPUT_TYPE.BYTE4, outputs); final int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator()); IntsRefBuilder intsSpare = new IntsRefBuilder(); final int size = hash.size(); BytesRef spare = new BytesRef(); for (int i = 0; i < size; i++) { int id = sort[i]; BytesRef bytesRef = hash.get(id, spare); intsSpare.copyUTF8Bytes(bytesRef); builder.add(intsSpare.get(), new BytesRef(outputValues.get(id))); } return new StemmerOverrideMap(builder.finish(), ignoreCase); }
/** Adds all leaving arcs, including 'finished' arc, if * the node is final, from this node into the queue. */ public void addStartPaths(FST.Arc<T> node, T startOutput, boolean allowEmptyString, IntsRefBuilder input) throws IOException { // De-dup NO_OUTPUT since it must be a singleton: if (startOutput.equals(fst.outputs.getNoOutput())) { startOutput = fst.outputs.getNoOutput(); } FSTPath<T> path = new FSTPath<>(startOutput, node, input); fst.readFirstTargetArc(node, path.arc, bytesReader); //System.out.println("add start paths"); // Bootstrap: find the min starting arc while (true) { if (allowEmptyString || path.arc.label != FST.END_LABEL) { addIfCompetitive(path); } if (path.arc.isLast()) { break; } fst.readNextArc(path.arc, bytesReader); } }
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException { meta.writeVInt(field.number); meta.writeByte(FST); meta.writeLong(data.getFilePointer()); PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> builder = new Builder<>(INPUT_TYPE.BYTE1, outputs); IntsRefBuilder scratch = new IntsRefBuilder(); long ord = 0; for (BytesRef v : values) { builder.add(Util.toIntsRef(v, scratch), ord); ord++; } FST<Long> fst = builder.finish(); if (fst != null) { fst.save(data); } meta.writeVLong(ord); }
/** * Builds the final automaton from a list of entries. */ private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException { // Build the automaton. final Outputs<Object> outputs = NoOutputs.getSingleton(); final Object empty = outputs.getNoOutput(); final Builder<Object> builder = new Builder<>( FST.INPUT_TYPE.BYTE1, 0, 0, true, true, shareMaxTailLength, outputs, false, PackedInts.DEFAULT, true, 15); BytesRefBuilder scratch = new BytesRefBuilder(); BytesRef entry; final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); int count = 0; BytesRefIterator iter = sorter.iterator(); while((entry = iter.next()) != null) { count++; if (scratch.get().compareTo(entry) != 0) { builder.add(Util.toIntsRef(entry, scratchIntsRef), empty); scratch.copyBytes(entry); } } return count == 0 ? null : builder.finish(); }
public void testDuplicateFSAString() throws Exception { String str = "foobar"; final Outputs<Object> outputs = NoOutputs.getSingleton(); final Builder<Object> b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); IntsRefBuilder ints = new IntsRefBuilder(); for(int i=0; i<10; i++) { b.add(Util.toIntsRef(new BytesRef(str), ints), outputs.getNoOutput()); } FST<Object> fst = b.finish(); // count the input paths int count = 0; final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<>(fst); while(fstEnum.next()!=null) { count++; } assertEquals(1, count); assertNotNull(Util.get(fst, new BytesRef(str))); assertNull(Util.get(fst, new BytesRef("foobaz"))); }
public void testInternalFinalState() throws Exception { final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); final boolean willRewrite = random().nextBoolean(); final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, willRewrite, PackedInts.DEFAULT, true, 15); builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRefBuilder()), outputs.getNoOutput()); builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRefBuilder()), outputs.getNoOutput()); final FST<Long> fst = builder.finish(); StringWriter w = new StringWriter(); //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); Util.toDot(fst, w, false, false); w.close(); //System.out.println(w.toString()); // check for accept state at label t assertTrue(w.toString().indexOf("[label=\"t\" style=\"bold\"") != -1); // check for accept state at label n assertTrue(w.toString().indexOf("[label=\"n\" style=\"bold\"") != -1); }
public void testLargeOutputsOnArrayArcs() throws Exception { final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); final Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final byte[] bytes = new byte[300]; final IntsRefBuilder input = new IntsRefBuilder(); input.append(0); final BytesRef output = new BytesRef(bytes); for(int arc=0;arc<6;arc++) { input.setIntAt(0, arc); output.bytes[0] = (byte) arc; builder.add(input.get(), BytesRef.deepCopyOf(output)); } final FST<BytesRef> fst = builder.finish(); for(int arc=0;arc<6;arc++) { input.setIntAt(0, arc); final BytesRef result = Util.get(fst, input.get()); assertNotNull(result); assertEquals(300, result.length); assertEquals(result.bytes[result.offset], arc); for(int byteIDX=1;byteIDX<result.length;byteIDX++) { assertEquals(0, result.bytes[result.offset+byteIDX]); } } }
/** * Splits the input sequence of characters into separate words if this sequence is * potentially a compound word. * * @param word The word to be split. * @return Returns <code>null</code> if this word is not recognized at all. Returns a * character sequence with '.'-delimited compound chunks (if ambiguous * interpretations are possible, they are separated by a ',' character). The * returned buffer will change with each call to <code>split</code> so copy the * content if needed. */ public CharSequence split(CharSequence word) { try { StringBuilder builder = new StringBuilder(); builder.append(word); builder.reverse(); for (int i = builder.length(); --i > 0; ) { builder.setCharAt(i, Character.toLowerCase(builder.charAt(i))); } IntsRefBuilder utf32Builder = new IntsRefBuilder(); IntsRef utf32 = fromUTF16ToUTF32(builder, utf32Builder).get(); builder.setLength(0); Deque<Chunk> chunks = new ArrayDeque<>(); matchWord(utf32, utf32.offset, builder, chunks); return builder.length() == 0 ? null : builder; } catch (IOException e) { throw new UncheckedIOException(e); } }
private void append(Builder<BytesRef> builder, FST<BytesRef> subIndex, IntsRefBuilder scratchIntsRef) throws IOException { final BytesRefFSTEnum<BytesRef> subIndexEnum = new BytesRefFSTEnum<>(subIndex); BytesRefFSTEnum.InputOutput<BytesRef> indexEnt; while((indexEnt = subIndexEnum.next()) != null) { //if (DEBUG) { // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output); //} builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); } }
private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException { IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs); IntsRefBuilder scratch = new IntsRefBuilder(); for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) { Util.toUTF32(entry.getKey(), scratch); List<Integer> entries = entry.getValue(); IntsRef output = new IntsRef(entries.size()); for (Integer c : entries) { output.ints[output.length++] = c; } builder.add(scratch.get(), output); } return builder.finish(); }
/** Starting from node, find the top N min cost * completions to a final node. */ public static <T> TopResults<T> shortestPaths(FST<T> fst, FST.Arc<T> fromNode, T startOutput, Comparator<T> comparator, int topN, boolean allowEmptyString) throws IOException { // All paths are kept, so we can pass topN for // maxQueueDepth and the pruning is admissible: TopNSearcher<T> searcher = new TopNSearcher<>(fst, topN, topN, comparator); // since this search is initialized with a single start node // it is okay to start with an empty input path here searcher.addStartPaths(fromNode, startOutput, allowEmptyString, new IntsRefBuilder()); return searcher.search(); }
/** Just maps each UTF16 unit (char) to the ints in an * IntsRef. */ public static IntsRef toUTF16(CharSequence s, IntsRefBuilder scratch) { final int charLimit = s.length(); scratch.setLength(charLimit); scratch.grow(charLimit); for (int idx = 0; idx < charLimit; idx++) { scratch.setIntAt(idx, (int) s.charAt(idx)); } return scratch.get(); }
/** Decodes the Unicode codepoints from the provided * CharSequence and places them in the provided scratch * IntsRef, which must not be null, returning it. */ public static IntsRef toUTF32(CharSequence s, IntsRefBuilder scratch) { int charIdx = 0; int intIdx = 0; final int charLimit = s.length(); while(charIdx < charLimit) { scratch.grow(intIdx+1); final int utf32 = Character.codePointAt(s, charIdx); scratch.setIntAt(intIdx, utf32); charIdx += Character.charCount(utf32); intIdx++; } scratch.setLength(intIdx); return scratch.get(); }
/** Decodes the Unicode codepoints from the provided * char[] and places them in the provided scratch * IntsRef, which must not be null, returning it. */ public static IntsRef toUTF32(char[] s, int offset, int length, IntsRefBuilder scratch) { int charIdx = offset; int intIdx = 0; final int charLimit = offset + length; while(charIdx < charLimit) { scratch.grow(intIdx+1); final int utf32 = Character.codePointAt(s, charIdx, charLimit); scratch.setIntAt(intIdx, utf32); charIdx += Character.charCount(utf32); intIdx++; } scratch.setLength(intIdx); return scratch.get(); }
/** Just takes unsigned byte values from the BytesRef and * converts into an IntsRef. */ public static IntsRef toIntsRef(BytesRef input, IntsRefBuilder scratch) { scratch.clear(); for(int i=0;i<input.length;i++) { scratch.append(input.bytes[i+input.offset] & 0xFF); } return scratch.get(); }
private void append(Builder<Output> builder, FST<Output> subIndex, long termOrdOffset, IntsRefBuilder scratchIntsRef) throws IOException { final BytesRefFSTEnum<Output> subIndexEnum = new BytesRefFSTEnum<>(subIndex); BytesRefFSTEnum.InputOutput<Output> indexEnt; while ((indexEnt = subIndexEnum.next()) != null) { //if (DEBUG) { // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output); //} Output output = indexEnt.output; long blockTermCount = output.endOrd - output.startOrd + 1; Output newOutput = FST_OUTPUTS.newOutput(output.bytes, termOrdOffset+output.startOrd, output.endOrd-termOrdOffset); //System.out.println(" append sub=" + indexEnt.input + " output=" + indexEnt.output + " termOrdOffset=" + termOrdOffset + " blockTermCount=" + blockTermCount + " newOutput=" + newOutput + " endOrd=" + (termOrdOffset+Long.MAX_VALUE-output.endOrd)); builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), newOutput); } }
private void loadTermsIndex() throws IOException { if (fst == null) { IndexInput clone = in.clone(); clone.seek(indexStart); fst = new FST<>(clone, fstOutputs); clone.close(); /* final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); Util.toDot(fst, w, false, false); System.out.println("FST INDEX: SAVED to " + dotFileName); w.close(); */ if (indexDivisor > 1) { // subsample final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); final Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst); BytesRefFSTEnum.InputOutput<Long> result; int count = indexDivisor; while((result = fstEnum.next()) != null) { if (count == indexDivisor) { builder.add(Util.toIntsRef(result.input, scratchIntsRef), result.output); count = 0; } count++; } fst = builder.finish(); } } }
/** Sole constructor. */ public Path(int state, FST.Arc<T> fstNode, T output, IntsRefBuilder input) { this.state = state; this.fstNode = fstNode; this.output = output; this.input = input; }
public void testListOfOutputs() throws Exception { PositiveIntOutputs _outputs = PositiveIntOutputs.getSingleton(); ListOfOutputs<Long> outputs = new ListOfOutputs<>(_outputs); final Builder<Object> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); final IntsRefBuilder scratch = new IntsRefBuilder(); // Add the same input more than once and the outputs // are merged: builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 1L); builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 3L); builder.add(Util.toIntsRef(new BytesRef("a"), scratch), 0L); builder.add(Util.toIntsRef(new BytesRef("b"), scratch), 17L); final FST<Object> fst = builder.finish(); Object output = Util.get(fst, new BytesRef("a")); assertNotNull(output); List<Long> outputList = outputs.asList(output); assertEquals(3, outputList.size()); assertEquals(1L, outputList.get(0).longValue()); assertEquals(3L, outputList.get(1).longValue()); assertEquals(0L, outputList.get(2).longValue()); output = Util.get(fst, new BytesRef("b")); assertNotNull(output); outputList = outputs.asList(output); assertEquals(1, outputList.size()); assertEquals(17L, outputList.get(0).longValue()); }
private void append(Builder<Pair<BytesRef,Long>> builder, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException { final BytesRefFSTEnum<Pair<BytesRef,Long>> subIndexEnum = new BytesRefFSTEnum<>(subIndex); BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> indexEnt; while((indexEnt = subIndexEnum.next()) != null) { //if (DEBUG) { // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output); //} builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); } }
/** * Returns the strings that can be produced from the given state, or * false if more than <code>limit</code> strings are found. * <code>limit</code><0 means "infinite". */ private static boolean getFiniteStrings(Automaton a, int s, HashSet<Integer> pathstates, HashSet<IntsRef> strings, IntsRefBuilder path, int limit) { pathstates.add(s); Transition t = new Transition(); int count = a.initTransition(s, t); for (int i=0;i<count;i++) { a.getNextTransition(t); if (pathstates.contains(t.dest)) { return false; } for (int n = t.min; n <= t.max; n++) { path.append(n); if (a.isAccept(t.dest)) { strings.add(path.toIntsRef()); if (limit >= 0 && strings.size() > limit) { return false; } } if (!getFiniteStrings(a, t.dest, pathstates, strings, path, limit)) { return false; } path.setLength(path.length() - 1); } } pathstates.remove(s); return true; }
static IntsRef toIntsRef(String s, int inputMode, IntsRefBuilder ir) { if (inputMode == 0) { // utf8 return toIntsRef(new BytesRef(s), ir); } else { // utf32 return toIntsRefUTF32(s, ir); } }
static IntsRef toIntsRefUTF32(String s, IntsRefBuilder ir) { final int charLength = s.length(); int charIdx = 0; int intIdx = 0; ir.clear(); while(charIdx < charLength) { ir.grow(intIdx+1); final int utf32 = s.codePointAt(charIdx); ir.append(utf32); charIdx += Character.charCount(utf32); intIdx++; } return ir.get(); }
static IntsRef toIntsRef(BytesRef br, IntsRefBuilder ir) { ir.grow(br.length); ir.clear(); for(int i=0;i<br.length;i++) { ir.append(br.bytes[br.offset+i]&0xFF); } return ir.get(); }
private T randomAcceptedWord(FST<T> fst, IntsRefBuilder in) throws IOException { FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>()); final List<FST.Arc<T>> arcs = new ArrayList<>(); in.clear(); final T NO_OUTPUT = fst.outputs.getNoOutput(); T output = NO_OUTPUT; final FST.BytesReader fstReader = fst.getBytesReader(); while(true) { // read all arcs: fst.readFirstTargetArc(arc, arc, fstReader); arcs.add(new FST.Arc<T>().copyFrom(arc)); while(!arc.isLast()) { fst.readNextArc(arc, fstReader); arcs.add(new FST.Arc<T>().copyFrom(arc)); } // pick one arc = arcs.get(random.nextInt(arcs.size())); arcs.clear(); // accumulate output output = fst.outputs.add(output, arc.output); // append label if (arc.label == FST.END_LABEL) { break; } in.append(arc.label); } return output; }