final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
    // TODO: is there a Reader from a CharSequence?
    // Turn tokenstream into automaton:
    Automaton automaton = null;
    try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
        automaton = getTokenStreamToAutomaton().toAutomaton(ts);
    }

    automaton = replaceSep(automaton);

    // TODO: we can optimize this somewhat by determinizing
    // while we convert

    // This automaton should not blow up during determinize:
    automaton = Operations.determinize(automaton, Integer.MAX_VALUE);
    return automaton;
}
private Automaton toAutomaton() {
    Automaton a = null;
    if (include != null) {
        a = include.toAutomaton();
    } else if (includeValues != null) {
        a = Automata.makeStringUnion(includeValues);
    } else {
        a = Automata.makeAnyString();
    }
    if (exclude != null) {
        a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    } else if (excludeValues != null) {
        a = Operations.minus(a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    }
    return a;
}
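A minimal standalone sketch of the include/exclude subtraction above. It uses Operations.union over Automata.makeString instead of Automata.makeStringUnion (which expects a sorted collection of BytesRef), so the illustrative values can be plain strings; class name and values are hypothetical.

import java.util.Arrays;

import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;

public class IncludeExcludeSketch {
    public static void main(String[] args) {
        // include: {apple, banana}, exclude: {banana} (illustrative values)
        Automaton include = Operations.union(Arrays.asList(
                Automata.makeString("apple"), Automata.makeString("banana")));
        Automaton exclude = Automata.makeString("banana");
        Automaton a = Operations.minus(include, exclude, Operations.DEFAULT_MAX_DETERMINIZED_STATES);

        CharacterRunAutomaton run = new CharacterRunAutomaton(a);
        System.out.println(run.run("apple"));  // true
        System.out.println(run.run("banana")); // false: removed by the exclude set
    }
}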
public void testFuzzyQueryType() throws Exception {
    String mapping = jsonBuilder().startObject().startObject("type1")
            .startObject("properties").startObject("completion")
            .field("type", "completion")
            .endObject().endObject()
            .endObject().endObject().string();

    DocumentMapper defaultMapper = createIndex("test").mapperService().documentMapperParser()
            .parse("type1", new CompressedXContent(mapping));
    FieldMapper fieldMapper = defaultMapper.mappers().getMapper("completion");
    CompletionFieldMapper completionFieldMapper = (CompletionFieldMapper) fieldMapper;
    Query prefixQuery = completionFieldMapper.fieldType().fuzzyQuery("co",
            Fuzziness.fromEdits(FuzzyCompletionQuery.DEFAULT_MAX_EDITS),
            FuzzyCompletionQuery.DEFAULT_NON_FUZZY_PREFIX,
            FuzzyCompletionQuery.DEFAULT_MIN_FUZZY_LENGTH,
            Operations.DEFAULT_MAX_DETERMINIZED_STATES,
            FuzzyCompletionQuery.DEFAULT_TRANSPOSITIONS,
            FuzzyCompletionQuery.DEFAULT_UNICODE_AWARE);
    assertThat(prefixQuery, instanceOf(FuzzyCompletionQuery.class));
}
/**
 * Create an automaton for a given context query. This automaton will be used
 * to find the matching paths with the FST.
 *
 * @param preserveSep set an additional char (<code>XAnalyzingSuggester.SEP_LABEL</code>) between each context query
 * @param queries     list of {@link ContextQuery} defining the lookup context
 *
 * @return Automaton matching the given Query
 */
public static Automaton toAutomaton(boolean preserveSep, Iterable<ContextQuery> queries) {
    Automaton a = Automata.makeEmptyString();

    Automaton gap = Automata.makeChar(ContextMapping.SEPARATOR);
    if (preserveSep) {
        // if separators are preserved the fst contains a SEP_LABEL
        // behind each gap. To have a matching automaton, we need to
        // include the SEP_LABEL in the query as well
        gap = Operations.concatenate(gap, Automata.makeChar(XAnalyzingSuggester.SEP_LABEL));
    }

    for (ContextQuery query : queries) {
        a = Operations.concatenate(Arrays.asList(query.toAutomaton(), gap, a));
    }

    // TODO: should we limit this? Do any of our ContextQuery impls really create exponential regexps?
    // GeoQuery looks safe (union of strings).
    return Operations.determinize(a, Integer.MAX_VALUE);
}
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
    // TODO: is there a Reader from a CharSequence?
    // Turn tokenstream into automaton:
    Automaton automaton = null;
    TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
    try {
        automaton = getTokenStreamToAutomaton().toAutomaton(ts);
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }

    automaton = replaceSep(automaton);

    // TODO: we can optimize this somewhat by determinizing
    // while we convert
    automaton = Operations.determinize(automaton, DEFAULT_MAX_DETERMINIZED_STATES);
    return automaton;
}
/** Returns true iff <code>o</code> is equal to this. */
@Override
public boolean equals(Object o) {
    if (!(o instanceof TermAutomatonQuery)) {
        return false;
    }
    TermAutomatonQuery other = (TermAutomatonQuery) o;
    if (det == null) {
        throw new IllegalStateException("please call finish first");
    }
    if (other.det == null) {
        throw new IllegalStateException("please call other.finish first");
    }
    // NOTE: not quite correct, because if terms were added in different
    // order in each query but the language is the same, we return false:
    return (this.getBoost() == other.getBoost())
            && this.termToID.equals(other.termToID)
            && Operations.sameLanguage(det, other.det);
}
@Override
public void setUp() throws Exception {
    super.setUp();
    Automaton single = new Automaton();
    int initial = single.createState();
    int accept = single.createState();
    single.setAccept(accept, true);

    // build an automaton matching this JVM's letter definition
    for (int i = 0; i <= 0x10FFFF; i++) {
        if (Character.isLetter(i)) {
            single.addTransition(initial, accept, i);
        }
    }
    Automaton repeat = Operations.repeat(single);
    jvmLetter = new CharacterRunAutomaton(repeat);
}
public void testCustomProvider() throws IOException {
    AutomatonProvider myProvider = new AutomatonProvider() {
        // automaton that matches quick, brown, or bob
        private Automaton quickBrownAutomaton = Operations.union(Arrays.asList(
                Automata.makeString("quick"),
                Automata.makeString("brown"),
                Automata.makeString("bob")));

        @Override
        public Automaton getAutomaton(String name) {
            if (name.equals("quickBrown")) {
                return quickBrownAutomaton;
            } else {
                return null;
            }
        }
    };
    RegexpQuery query = new RegexpQuery(newTerm("<quickBrown>"), RegExp.ALL,
            myProvider, DEFAULT_MAX_DETERMINIZED_STATES);
    assertEquals(1, searcher.search(query, 5).totalHits);
}
/**
 * Test some very simple automata.
 */
public void testAutomata() throws IOException {
    assertAutomatonHits(0, Automata.makeEmpty());
    assertAutomatonHits(0, Automata.makeEmptyString());
    assertAutomatonHits(2, Automata.makeAnyChar());
    assertAutomatonHits(3, Automata.makeAnyString());
    assertAutomatonHits(2, Automata.makeString("doc"));
    assertAutomatonHits(1, Automata.makeChar('a'));
    assertAutomatonHits(2, Automata.makeCharRange('a', 'b'));
    assertAutomatonHits(2, Automata.makeInterval(1233, 2346, 0));
    assertAutomatonHits(1, Automata.makeInterval(0, 2000, 0));
    assertAutomatonHits(2, Operations.union(Automata.makeChar('a'), Automata.makeChar('b')));
    assertAutomatonHits(0, Operations.intersection(Automata.makeChar('a'), Automata.makeChar('b')));
    assertAutomatonHits(1, Operations.minus(Automata.makeCharRange('a', 'b'),
            Automata.makeChar('a'), DEFAULT_MAX_DETERMINIZED_STATES));
}
/**
 * Build the {@link CharacterRunAutomaton} that represents the reindex-from-remote whitelist and make sure that it doesn't whitelist
 * the world.
 */
static CharacterRunAutomaton buildRemoteWhitelist(List<String> whitelist) {
    if (whitelist.isEmpty()) {
        return new CharacterRunAutomaton(Automata.makeEmpty());
    }
    Automaton automaton = Regex.simpleMatchToAutomaton(whitelist.toArray(Strings.EMPTY_ARRAY));
    automaton = MinimizationOperations.minimize(automaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    if (Operations.isTotal(automaton)) {
        throw new IllegalArgumentException("Refusing to start because whitelist " + whitelist + " accepts all addresses. "
                + "This would allow users to reindex-from-remote any URL they like effectively having Elasticsearch make HTTP GETs "
                + "for them.");
    }
    return new CharacterRunAutomaton(automaton);
}
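A minimal sketch of the Operations.isTotal guard above, built directly from Lucene primitives rather than Elasticsearch's Regex helper; the host string is illustrative.

import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;

public class TotalWhitelistSketch {
    public static void main(String[] args) {
        // a "*"-style whitelist: accepts any string at all
        Automaton acceptAll = Automata.makeAnyString();
        acceptAll = MinimizationOperations.minimize(acceptAll, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
        System.out.println(Operations.isTotal(acceptAll)); // true: would be rejected at startup

        // a real restriction, e.g. a single host (illustrative)
        Automaton oneHost = Automata.makeString("localhost:9200");
        System.out.println(Operations.isTotal(oneHost)); // false: acceptable whitelist
    }
}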
@Override
protected Automaton convertAutomaton(Automaton a) {
    if (unicodeAware) {
        // FLORIAN EDIT: get converted Automaton from superclass
        Automaton utf8automaton = new UTF32ToUTF8().convert(super.convertAutomaton(a));
        // This automaton should not blow up during determinize:
        utf8automaton = Operations.determinize(utf8automaton, Integer.MAX_VALUE);
        return utf8automaton;
    } else {
        return super.convertAutomaton(a);
    }
}
protected Automaton convertAutomaton(Automaton a) {
    if (queryPrefix != null) {
        a = Operations.concatenate(Arrays.asList(queryPrefix, a));
        // This automaton should not blow up during determinize:
        a = Operations.determinize(a, Integer.MAX_VALUE);
    }
    return a;
}
/** Return an {@link Automaton} that matches the given pattern. */
public static Automaton simpleMatchToAutomaton(String pattern) {
    List<Automaton> automata = new ArrayList<>();
    int previous = 0;
    for (int i = pattern.indexOf('*'); i != -1; i = pattern.indexOf('*', i + 1)) {
        automata.add(Automata.makeString(pattern.substring(previous, i)));
        automata.add(Automata.makeAnyString());
        previous = i + 1;
    }
    automata.add(Automata.makeString(pattern.substring(previous)));
    return Operations.concatenate(automata);
}
/**
 * Return an Automaton that matches the union of the provided patterns.
 */
public static Automaton simpleMatchToAutomaton(String... patterns) {
    if (patterns.length < 1) {
        throw new IllegalArgumentException("There must be at least one pattern, zero given");
    }
    List<Automaton> automata = new ArrayList<>();
    for (String pattern : patterns) {
        automata.add(simpleMatchToAutomaton(pattern));
    }
    return Operations.union(automata);
}
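A usage sketch for the two simpleMatchToAutomaton overloads above, assuming this main lives in the same class so the methods are in scope; the glob patterns are illustrative. The explicit determinize is likely redundant (CharacterRunAutomaton also determinizes on construction in the Lucene 5.x-era API used throughout this section) but makes the requirement visible.

import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;

public static void main(String[] args) {
    // union of two glob-style patterns, via the overloads above
    Automaton a = simpleMatchToAutomaton("foo*", "*.bar");
    a = Operations.determinize(a, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    CharacterRunAutomaton run = new CharacterRunAutomaton(a);

    System.out.println(run.run("foobar"));  // true: matches "foo*"
    System.out.println(run.run("baz.bar")); // true: matches "*.bar"
    System.out.println(run.run("barfoo"));  // false: matches neither pattern
}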
public void testRegexQueryType() throws Exception {
    String mapping = jsonBuilder().startObject().startObject("type1")
            .startObject("properties").startObject("completion")
            .field("type", "completion")
            .endObject().endObject()
            .endObject().endObject().string();

    DocumentMapper defaultMapper = createIndex("test").mapperService().documentMapperParser()
            .parse("type1", new CompressedXContent(mapping));
    FieldMapper fieldMapper = defaultMapper.mappers().getMapper("completion");
    CompletionFieldMapper completionFieldMapper = (CompletionFieldMapper) fieldMapper;
    Query prefixQuery = completionFieldMapper.fieldType()
            .regexpQuery(new BytesRef("co"), RegExp.ALL, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    assertThat(prefixQuery, instanceOf(RegexCompletionQuery.class));
}
@Override
public Automaton toAutomaton() {
    List<Automaton> automatons = new ArrayList<>();
    for (CharSequence value : values) {
        automatons.add(Automata.makeString(value.toString()));
    }
    return Operations.union(automatons);
}
@Override
public Automaton toAutomaton() {
    Automaton automaton;
    if (precisions == null || precisions.length == 0) {
        automaton = Automata.makeString(location);
    } else {
        automaton = Automata.makeString(location.substring(0, Math.max(1, Math.min(location.length(), precisions[0]))));
        for (int i = 1; i < precisions.length; i++) {
            final String cell = location.substring(0, Math.max(1, Math.min(location.length(), precisions[i])));
            automaton = Operations.union(automaton, Automata.makeString(cell));
        }
    }
    return automaton;
}
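A standalone sketch of what the precision loop above produces: a union of the geohash truncated to each configured precision, so a lookup matches any of those prefix cells. The geohash and precision values are illustrative.

import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;

public class GeohashPrecisionSketch {
    public static void main(String[] args) {
        String location = "u0vz12"; // a geohash cell (illustrative)
        int[] precisions = {2, 4};  // configured precisions (illustrative)

        // union of the location truncated to each precision, as above
        Automaton a = Automata.makeString(location.substring(0, precisions[0]));
        for (int i = 1; i < precisions.length; i++) {
            a = Operations.union(a, Automata.makeString(location.substring(0, precisions[i])));
        }

        CharacterRunAutomaton run = new CharacterRunAutomaton(a);
        System.out.println(run.run("u0"));   // true: 2-char cell
        System.out.println(run.run("u0vz")); // true: 4-char cell
        System.out.println(run.run("u0v"));  // false: not a configured precision
    }
}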
@Override
protected Automaton convertAutomaton(Automaton a) {
    if (unicodeAware) {
        Automaton utf8automaton = new UTF32ToUTF8().convert(a);
        utf8automaton = Operations.determinize(utf8automaton, DEFAULT_MAX_DETERMINIZED_STATES);
        return utf8automaton;
    } else {
        return a;
    }
}
final Set<IntsRef> toFiniteStrings(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
    // Analyze surface form:
    Automaton automaton = null;
    TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString());
    try {
        // Create corresponding automaton: labels are bytes
        // from each analyzed token, with byte 0 used as
        // separator between tokens:
        automaton = ts2a.toAutomaton(ts);
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }

    automaton = replaceSep(automaton);
    automaton = convertAutomaton(automaton);

    // TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings
    // assert SpecialOperations.isFinite(automaton);

    // Get all paths from the automaton (there can be
    // more than one path, eg if the analyzer created a
    // graph using SynFilter or WDF):

    // TODO: we could walk & add simultaneously, so we
    // don't have to alloc [possibly biggish]
    // intermediate HashSet in RAM:
    return Operations.getFiniteStrings(automaton, maxGraphExpansions);
}
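A minimal sketch of the path enumeration above using Operations.getFiniteStrings, the call the method ends with (newer Lucene versions replace it with FiniteStringsIterator). The synonym pair is illustrative; each IntsRef holds one accepted label sequence.

import java.util.Set;

import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;

public class FiniteStringsSketch {
    public static void main(String[] args) {
        // two paths, as an analyzer might emit for a synonym pair
        Automaton a = Operations.union(Automata.makeString("wifi"),
                Automata.makeString("wireless"));

        // -1 means no limit on the number of paths
        Set<IntsRef> paths = Operations.getFiniteStrings(a, -1);
        for (IntsRef path : paths) {
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < path.length; i++) {
                sb.appendCodePoint(path.ints[path.offset + i]);
            }
            System.out.println(sb); // prints "wifi" and "wireless"
        }
    }
}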
/**
 * Test that a nondeterministic automaton works correctly. (It will be
 * determinized.)
 */
public void testNFA() throws IOException {
    // accept this or three, the union is an NFA (two transitions for 't' from
    // initial state)
    Automaton nfa = Operations.union(Automata.makeString("this"), Automata.makeString("three"));
    assertAutomatonHits(2, nfa);
}
public void testEquals() {
    AutomatonQuery a1 = new AutomatonQuery(newTerm("foobar"), Automata.makeString("foobar"));
    // reference to a1
    AutomatonQuery a2 = a1;
    // same as a1 (accepts the same language, same term)
    AutomatonQuery a3 = new AutomatonQuery(newTerm("foobar"),
            Operations.concatenate(Automata.makeString("foo"), Automata.makeString("bar")));
    // different than a1 (same term, but different language)
    AutomatonQuery a4 = new AutomatonQuery(newTerm("foobar"), Automata.makeString("different"));
    // different than a1 (different term, same language)
    AutomatonQuery a5 = new AutomatonQuery(newTerm("blah"), Automata.makeString("foobar"));

    assertEquals(a1.hashCode(), a2.hashCode());
    assertEquals(a1, a2);

    assertEquals(a1.hashCode(), a3.hashCode());
    assertEquals(a1, a3);

    // different class
    AutomatonQuery w1 = new WildcardQuery(newTerm("foobar"));
    // different class
    AutomatonQuery w2 = new RegexpQuery(newTerm("foobar"));

    assertFalse(a1.equals(w1));
    assertFalse(a1.equals(w2));
    assertFalse(w1.equals(w2));

    assertFalse(a1.equals(a4));
    assertFalse(a1.equals(a5));
    assertFalse(a1.equals(null));
}
/**
 * Test that rewriting to a prefix query works as expected and preserves
 * MultiTermQuery semantics.
 */
public void testRewritePrefix() throws IOException {
    Automaton pfx = Automata.makeString("do");
    Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString());
    AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton);
    Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN);
    assertTrue(aq.getTermsEnum(terms) instanceof PrefixTermsEnum);
    assertEquals(3, automatonQueryNrHits(aq));
}
/** Test a configuration that behaves a lot like KeepWordFilter */
public void testKeep() throws Exception {
    CharacterRunAutomaton keepWords = new CharacterRunAutomaton(
            Operations.complement(
                    Operations.union(Arrays.asList(
                            Automata.makeString("foo"), Automata.makeString("bar"))),
                    DEFAULT_MAX_DETERMINIZED_STATES));
    Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords);
    assertAnalyzesTo(a, "quick foo brown bar bar fox foo",
            new String[] { "foo", "bar", "bar", "foo" },
            new int[] { 2, 2, 1, 2 });
}
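A sketch of the complement trick in the test above: MockAnalyzer's filter drops tokens its automaton accepts (stopword-style), so keeping a word set means passing the complement of that set's union. Class name and probe words are illustrative.

import java.util.Arrays;

import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;

public class ComplementSketch {
    public static void main(String[] args) {
        Automaton keep = Operations.union(Arrays.asList(
                Automata.makeString("foo"), Automata.makeString("bar")));
        // complement: accepts every string except "foo" and "bar"
        Automaton stop = Operations.complement(keep, Operations.DEFAULT_MAX_DETERMINIZED_STATES);

        CharacterRunAutomaton run = new CharacterRunAutomaton(stop);
        System.out.println(run.run("foo"));   // false: in the kept set, survives analysis
        System.out.println(run.run("quick")); // true: dropped like a stopword
    }
}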
public void testSynOverMultipleHoles() throws Exception {
    final TokenStream ts = new CannedTokenStream(
            new Token[] {
                token("a", 1, 1),
                token("x", 0, 3),
                token("b", 3, 1),
            });
    final Automaton a1 = join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
    final Automaton a2 = join(s2a("x"), SEP_A, s2a("b"));
    assertSameLanguage(Operations.union(a1, a2), ts);
}
private Automaton join(String... strings) {
    List<Automaton> as = new ArrayList<>();
    for (String s : strings) {
        as.add(s2a(s));
        as.add(SEP_A);
    }
    as.remove(as.size() - 1);
    return Operations.concatenate(as);
}
public void testOverlappedTokensSausage() throws Exception {
    // Two tokens on top of each other (sausage):
    final TokenStream ts = new CannedTokenStream(
            new Token[] {
                token("abc", 1, 1),
                token("xyz", 0, 1)
            });
    final Automaton a1 = s2a("abc");
    final Automaton a2 = s2a("xyz");
    assertSameLanguage(Operations.union(a1, a2), ts);
}
public void testOverlappedTokensLattice() throws Exception {
    final TokenStream ts = new CannedTokenStream(
            new Token[] {
                token("abc", 1, 1),
                token("xyz", 0, 2),
                token("def", 1, 1),
            });
    final Automaton a1 = s2a("xyz");
    final Automaton a2 = join("abc", "def");
    assertSameLanguage(Operations.union(a1, a2), ts);
}
public void testSynOverHole() throws Exception {
    final TokenStream ts = new CannedTokenStream(
            new Token[] {
                token("a", 1, 1),
                token("X", 0, 2),
                token("b", 2, 1),
            });
    final Automaton a1 = Operations.union(join(s2a("a"), SEP_A, HOLE_A), s2a("X"));
    final Automaton expected = Operations.concatenate(a1, join(SEP_A, s2a("b")));
    assertSameLanguage(expected, ts);
}
public void testSynOverHole2() throws Exception {
    final TokenStream ts = new CannedTokenStream(
            new Token[] {
                token("xyz", 1, 1),
                token("abc", 0, 3),
                token("def", 2, 1),
            });
    final Automaton expected = Operations.union(
            join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")), s2a("abc"));
    assertSameLanguage(expected, ts);
}
public void testOverlappedTokensLattice2() throws Exception {
    final TokenStream ts = new CannedTokenStream(
            new Token[] {
                token("abc", 1, 1),
                token("xyz", 0, 3),
                token("def", 1, 1),
                token("ghi", 1, 1),
            });
    final Automaton a1 = s2a("xyz");
    final Automaton a2 = join("abc", "def", "ghi");
    assertSameLanguage(Operations.union(a1, a2), ts);
}
public void testSynHangingOverEnd() throws Exception {
    final TokenStream ts = new CannedTokenStream(
            new Token[] {
                token("a", 1, 1),
                token("X", 0, 10),
            });
    assertSameLanguage(Operations.union(s2a("a"), s2a("X")), ts);
}
/** fragile assert: depends on our implementation, but cleanest way to check for now */
private boolean wasReversed(SolrQueryParser qp, String query) throws Exception {
    Query q = qp.parse(query);
    if (!(q instanceof AutomatonQuery)) {
        return false;
    }
    Automaton automaton = ((AutomatonQuery) q).getAutomaton();
    String prefix = Operations.getCommonPrefix(
            Operations.determinize(automaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES));
    return prefix.length() > 0 && prefix.charAt(0) == '\u0001';
}
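A minimal sketch of the Operations.getCommonPrefix call above: on a determinized prefix-style automaton ("do" followed by anything, as in the rewrite test earlier), it recovers exactly the literal prefix. Class name is illustrative.

import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;

public class CommonPrefixSketch {
    public static void main(String[] args) {
        // "do" followed by any string, like a rewritten prefix query
        Automaton a = Operations.concatenate(Automata.makeString("do"), Automata.makeAnyString());
        a = Operations.determinize(a, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
        System.out.println(Operations.getCommonPrefix(a)); // "do"
    }
}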
/**
 * Make matches on objects also match dots in field names.
 * For instance, if the original simple regex is `foo`, this will translate
 * it into `foo` OR `foo.*`.
 */
private static Automaton makeMatchDotsInFieldNames(Automaton automaton) {
    return Operations.union(
            automaton,
            Operations.concatenate(Arrays.asList(automaton, Automata.makeChar('.'), Automata.makeAnyString())));
}
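A usage sketch for the helper above, assuming this main sits next to the private method so it is in scope; the field names are illustrative. An automaton for "foo" now also accepts any dotted sub-field of foo.

import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;

public static void main(String[] args) {
    Automaton foo = Automata.makeString("foo");
    Automaton withDots = makeMatchDotsInFieldNames(foo);

    CharacterRunAutomaton run = new CharacterRunAutomaton(withDots);
    System.out.println(run.run("foo"));     // true: the object itself
    System.out.println(run.run("foo.bar")); // true: a sub-field of the object
    System.out.println(run.run("foobar"));  // false: neither foo nor a sub-field
}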