final Automaton toAutomaton(TokenStream ts, final TokenStreamToAutomaton ts2a) throws IOException { // Create corresponding automaton: labels are bytes // from each analyzed token, with byte 0 used as // separator between tokens: Automaton automaton = ts2a.toAutomaton(ts); automaton = replaceSep(automaton); automaton = convertAutomaton(automaton); // TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings // assert SpecialOperations.isFinite(automaton); // Get all paths from the automaton (there can be // more than one path, eg if the analyzer created a // graph using SynFilter or WDF): return automaton; }
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException { // Analyze surface form: TokenStream ts = indexAnalyzer.tokenStream("", new StringReader(surfaceForm.utf8ToString())); // Create corresponding automaton: labels are bytes // from each analyzed token, with byte 0 used as // separator between tokens: Automaton automaton = ts2a.toAutomaton(ts); ts.end(); ts.close(); replaceSep(automaton); assert SpecialOperations.isFinite(automaton); // Get all paths from the automaton (there can be // more than one path, eg if the analyzer created a // graph using SynFilter or WDF): // TODO: we could walk & add simultaneously, so we // don't have to alloc [possibly biggish] // intermediate HashSet in RAM: return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions); }
/**
 * Converts the given token stream into the set of its finite label
 * strings, capped at {@code maxGraphExpansions} paths.
 *
 * @return an unmodifiable set of deep-copied paths
 */
public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
  final TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
  Automaton automaton;
  // try-with-resources guarantees the stream is closed even if conversion throws.
  try (TokenStream ts = stream) {
    automaton = toAutomaton(ts, ts2a);
  }
  final LimitedFiniteStringsIterator iterator = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
  final Set<IntsRef> paths = new HashSet<>();
  // Deep-copy each path: the iterator reuses its IntsRef between calls.
  IntsRef path;
  while ((path = iterator.next()) != null) {
    paths.add(IntsRef.deepCopyOf(path));
  }
  return Collections.unmodifiableSet(paths);
}
/**
 * Returns true if the given character is reserved by the suggester's
 * internal encoding (unit separator, token hole marker, NUL, or the
 * context separator) and therefore must not appear in input.
 */
public static boolean isReservedChar(char c) {
  return c == '\u001F'
      || c == TokenStreamToAutomaton.HOLE
      || c == 0x0
      || c == ContextSuggestField.CONTEXT_SEPARATOR;
}
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException { // Analyze surface form: Automaton automaton = null; TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString()); try { // Create corresponding automaton: labels are bytes // from each analyzed token, with byte 0 used as // separator between tokens: automaton = ts2a.toAutomaton(ts); } finally { IOUtils.closeWhileHandlingException(ts); } automaton = replaceSep(automaton); automaton = convertAutomaton(automaton); // TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings // assert SpecialOperations.isFinite(automaton); // Get all paths from the automaton (there can be // more than one path, eg if the analyzer created a // graph using SynFilter or WDF): // TODO: we could walk & add simultaneously, so we // don't have to alloc [possibly biggish] // intermediate HashSet in RAM: return Operations.getFiniteStrings(automaton, maxGraphExpansions); }
TokenStreamToAutomaton getTokenStreamToAutomaton() { if (preserveSep) { return new EscapingTokenStreamToAutomaton(); } else { // When we're not preserving sep, we don't steal 0xff // byte, so we don't need to do any escaping: return new TokenStreamToAutomaton(); } }
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException { // Analyze surface form: Automaton automaton = null; TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString()); try { // Create corresponding automaton: labels are bytes // from each analyzed token, with byte 0 used as // separator between tokens: automaton = ts2a.toAutomaton(ts); } finally { IOUtils.closeWhileHandlingException(ts); } replaceSep(automaton); automaton = convertAutomaton(automaton); assert SpecialOperations.isFinite(automaton); // Get all paths from the automaton (there can be // more than one path, eg if the analyzer created a // graph using SynFilter or WDF): // TODO: we could walk & add simultaneously, so we // don't have to alloc [possibly biggish] // intermediate HashSet in RAM: return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions); }
/**
 * Returns the base converter, switched to Unicode-codepoint arcs when this
 * suggester was configured as unicodeAware.
 */
@Override
public TokenStreamToAutomaton getTokenStreamToAutomaton() {
  final TokenStreamToAutomaton converter = super.getTokenStreamToAutomaton();
  converter.setUnicodeArcs(unicodeAware);
  return converter;
}
/**
 * Analyzes the given surface form and returns its token automaton by
 * delegating to the TokenStream overload. The stream is closed in all
 * cases via try-with-resources.
 */
final Automaton toAutomaton(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
  try (TokenStream tokenStream = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {
    return toAutomaton(tokenStream, ts2a);
  }
}
/**
 * Returns the base converter, adjusted for whether arcs should represent
 * full Unicode code points (unicodeAware) rather than UTF-8 bytes.
 */
@Override
TokenStreamToAutomaton getTokenStreamToAutomaton() {
  final TokenStreamToAutomaton result = super.getTokenStreamToAutomaton();
  result.setUnicodeArcs(unicodeAware);
  return result;
}
/**
 * Returns a fresh converter honoring this suggester's position-increment
 * setting.
 */
TokenStreamToAutomaton getTokenStreamToAutomaton() {
  final TokenStreamToAutomaton converter = new TokenStreamToAutomaton();
  converter.setPreservePositionIncrements(preservePositionIncrements);
  return converter;
}