/** * Returns the value mapped to the given key or <code>null</code> if the key is not in the FST dictionary. */ public BytesRef get(char[] buffer, int bufferLen, Arc<BytesRef> scratchArc, BytesReader fstReader) throws IOException { BytesRef pendingOutput = fst.outputs.getNoOutput(); BytesRef matchOutput = null; int bufUpto = 0; fst.getFirstArc(scratchArc); while (bufUpto < bufferLen) { final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen); if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) { return null; } pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output); bufUpto += Character.charCount(codePoint); } if (scratchArc.isFinal()) { matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput); } return matchOutput; }
/** Looks up the output for this input, or null if the * input is not accepted. */ public static<T> T get(FST<T> fst, IntsRef input) throws IOException { // TODO: would be nice not to alloc this on every lookup final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>()); final BytesReader fstReader = fst.getBytesReader(); // Accumulate output as we go T output = fst.outputs.getNoOutput(); for(int i=0;i<input.length;i++) { if (fst.findTargetArc(input.ints[input.offset + i], arc, arc, fstReader) == null) { return null; } output = fst.outputs.add(output, arc.output); } if (arc.isFinal()) { return fst.outputs.add(output, arc.nextFinalOutput); } else { return null; } }
private Long lookupPrefix(FST<Long> fst, FST.BytesReader bytesReader, BytesRef scratch, Arc<Long> arc) throws /*Bogus*/IOException { Long output = fst.outputs.getNoOutput(); fst.getFirstArc(arc); byte[] bytes = scratch.bytes; int pos = scratch.offset; int end = pos + scratch.length; while (pos < end) { if (fst.findTargetArc(bytes[pos++] & 0xff, arc, arc, bytesReader) == null) { return null; } else { output = fst.outputs.add(output, arc.output); } } return output; }
/** * Consume a maximal glue morpheme, if any, and consume the next word. */ private void matchGlueMorpheme(IntsRef utf32, final int offset, StringBuilder builder, Deque<Chunk> chunks) throws IOException { FST.Arc<Object> arc = glueMorphemes.getFirstArc(new FST.Arc<>()); BytesReader br = glueMorphemes.getBytesReader(); for (int i = offset; i < utf32.length; i++) { int chr = utf32.ints[i]; arc = glueMorphemes.findTargetArc(chr, arc, arc, br); if (arc == null) { break; } if (arc.isFinal()) { chunks.addLast(new Chunk(offset, i + 1, ChunkType.GLUE_MORPHEME)); if (i + 1 < utf32.offset + utf32.length) { matchWord(utf32, i + 1, builder, chunks); } chunks.removeLast(); } } }
/** * Returns a {@link BytesReader} to pass to the {@link #get(char[], int, FST.Arc, FST.BytesReader)} method. */ public BytesReader getBytesReader() { if (fst == null) { return null; } else { return fst.getBytesReader(); } }
public void testIllegallyModifyRootArc() throws Exception { Set<BytesRef> terms = new HashSet<>(); for(int i=0;i<100;i++) { String prefix = Character.toString((char) ('a' + i)); terms.add(new BytesRef(prefix)); if (prefix.equals("m") == false) { for(int j=0;j<20;j++) { // Make a big enough FST that the root cache will be created: String suffix = TestUtil.randomRealisticUnicodeString(random(), 10, 20); terms.add(new BytesRef(prefix + suffix)); } } } List<BytesRef> termsList = new ArrayList<>(terms); Collections.sort(termsList); ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); Builder<BytesRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); IntsRefBuilder input = new IntsRefBuilder(); for(BytesRef term : termsList) { Util.toIntsRef(term, input); builder.add(input.get(), term); } FST<BytesRef> fst = builder.finish(); Arc<BytesRef> arc = new FST.Arc<>(); fst.getFirstArc(arc); FST.BytesReader reader = fst.getBytesReader(); arc = fst.findTargetArc((int) 'm', arc, arc, reader); assertNotNull(arc); assertEquals(new BytesRef("m"), arc.output); // NOTE: illegal: arc.output.length = 0; fst.getFirstArc(arc); try { arc = fst.findTargetArc((int) 'm', arc, arc, reader); } catch (AssertionError ae) { // expected } }
/** Reverse lookup (lookup by output instead of by input), * in the special case when your FSTs outputs are * strictly ascending. This locates the input/output * pair where the output is equal to the target, and will * return null if that output does not exist. * * <p>NOTE: this only works with {@code FST<Long>}, only * works when the outputs are ascending in order with * the inputs. * For example, simple ordinals (0, 1, * 2, ...), or file offets (when appending to a file) * fit this. */ public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException { final BytesReader in = fst.getBytesReader(); // TODO: would be nice not to alloc this on every lookup FST.Arc<Long> arc = fst.getFirstArc(new FST.Arc<Long>()); FST.Arc<Long> scratchArc = new FST.Arc<>(); final IntsRefBuilder result = new IntsRefBuilder(); return getByOutput(fst, targetOutput, in, arc, scratchArc, result); }
/** Reverse lookup (lookup by output instead of by input), * in the special case when your FSTs outputs are * strictly ascending. This locates the input/output * pair where the output is equal to the target, and will * return null if that output does not exist. * * <p>NOTE: this only works with {@code FST<Long>}, only * works when the outputs are ascending in order with * the inputs. * For example, simple ordinals (0, 1, * 2, ...), or file offets (when appending to a file) * fit this. */ public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException { final BytesReader in = fst.getBytesReader(); // TODO: would be nice not to alloc this on every lookup FST.Arc<Long> arc = fst.getFirstArc(new FST.Arc<Long>()); FST.Arc<Long> scratchArc = new FST.Arc<Long>(); final IntsRef result = new IntsRef(); return getByOutput(fst, targetOutput, in, arc, scratchArc, result); }
/** Reverse lookup (lookup by output instead of by input), * in the special case when your FSTs outputs are * strictly ascending. This locates the input/output * pair where the output is equal to the target, and will * return null if that output does not exist. * * <p>NOTE: this only works with {@code FST<Long>}, only * works when the outputs are ascending in order with * the inputs and only works when you shared * the outputs (pass doShare=true to {@link * PositiveIntOutputs#getSingleton}). * For example, simple ordinals (0, 1, * 2, ...), or file offets (when appending to a file) * fit this. */ public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException { final BytesReader in = fst.getBytesReader(); // TODO: would be nice not to alloc this on every lookup FST.Arc<Long> arc = fst.getFirstArc(new FST.Arc<Long>()); FST.Arc<Long> scratchArc = new FST.Arc<Long>(); final IntsRef result = new IntsRef(); return getByOutput(fst, targetOutput, in, arc, scratchArc, result); }