private Automaton toAutomaton() {
  Automaton a = null;
  if (include != null) {
    a = include.toAutomaton();
  } else if (includeValues != null) {
    a = Automata.makeStringUnion(includeValues);
  } else {
    a = Automata.makeAnyString();
  }
  if (exclude != null) {
    a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  } else if (excludeValues != null) {
    a = Operations.minus(a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  }
  return a;
}
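// A minimal sketch (with hypothetical literal values) of what the include/exclude
// composition above computes: the included strings minus the excluded ones.
// Automata.makeStringUnion requires its input collection to be sorted.
private static Automaton includeMinusExcludeSketch() {
  Automaton inc = Automata.makeStringUnion(Arrays.asList(new BytesRef("apple"), new BytesRef("banana")));
  Automaton exc = Automata.makeStringUnion(Arrays.asList(new BytesRef("banana")));
  // the result accepts "apple" but not "banana"
  return Operations.minus(inc, exc, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}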
/**
 * Create an automaton for a given context query. This automaton will be used
 * to find the matching paths with the FST.
 *
 * @param preserveSep set an additional char (<code>XAnalyzingSuggester.SEP_LABEL</code>) between each context query
 * @param queries list of {@link ContextQuery} defining the lookup context
 *
 * @return Automaton matching the given query
 */
public static Automaton toAutomaton(boolean preserveSep, Iterable<ContextQuery> queries) {
  Automaton a = Automata.makeEmptyString();
  Automaton gap = Automata.makeChar(ContextMapping.SEPARATOR);
  if (preserveSep) {
    // if separators are preserved the fst contains a SEP_LABEL
    // behind each gap. To have a matching automaton, we need to
    // include the SEP_LABEL in the query as well
    gap = Operations.concatenate(gap, Automata.makeChar(XAnalyzingSuggester.SEP_LABEL));
  }
  for (ContextQuery query : queries) {
    a = Operations.concatenate(Arrays.asList(query.toAutomaton(), gap, a));
  }
  // TODO: should we limit this? Do any of our ContextQuery impls really create
  // exponential regexps? GeoQuery looks safe (union of strings).
  return Operations.determinize(a, Integer.MAX_VALUE);
}
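// A minimal standalone sketch (the "ctx1"/"ctx2" literals are hypothetical) of the
// gap-separated concatenation performed above, with plain string automata standing
// in for real ContextQuery instances. Note that the loop prepends, so the query
// iterated last ends up at the front of the resulting automaton.
private static Automaton contextConcatenationSketch() {
  Automaton gap = Automata.makeChar(ContextMapping.SEPARATOR);
  Automaton a = Automata.makeEmptyString();
  a = Operations.concatenate(Arrays.asList(Automata.makeString("ctx1"), gap, a)); // first query
  a = Operations.concatenate(Arrays.asList(Automata.makeString("ctx2"), gap, a)); // second query
  // accepts: "ctx2" SEPARATOR "ctx1" SEPARATOR
  return Operations.determinize(a, Integer.MAX_VALUE);
}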
public void testBoost() throws Exception {
  CharacterRunAutomaton stopSet = new CharacterRunAutomaton(Automata.makeString("on"));
  Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
  PrecedenceQueryParser qp = new PrecedenceQueryParser();
  qp.setAnalyzer(oneStopAnalyzer);

  Query q = qp.parse("on^1.0", "field");
  assertNotNull(q);
  q = qp.parse("\"hello\"^2.0", "field");
  assertNotNull(q);
  assertEquals(q.getBoost(), (float) 2.0, (float) 0.5);
  q = qp.parse("hello^2.0", "field");
  assertNotNull(q);
  assertEquals(q.getBoost(), (float) 2.0, (float) 0.5);
  q = qp.parse("\"on\"^1.0", "field");
  assertNotNull(q);

  q = getParser(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET))
      .parse("the^3", "field");
  assertNotNull(q);
}
public void testBoost() throws Exception {
  CharacterRunAutomaton stopSet = new CharacterRunAutomaton(Automata.makeString("on"));
  Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopSet);
  StandardQueryParser qp = new StandardQueryParser();
  qp.setAnalyzer(oneStopAnalyzer);

  Query q = qp.parse("on^1.0", "field");
  assertNotNull(q);
  q = qp.parse("\"hello\"^2.0", "field");
  assertNotNull(q);
  assertEquals(q.getBoost(), (float) 2.0, (float) 0.5);
  q = qp.parse("hello^2.0", "field");
  assertNotNull(q);
  assertEquals(q.getBoost(), (float) 2.0, (float) 0.5);
  q = qp.parse("\"on\"^1.0", "field");
  assertNotNull(q);

  StandardQueryParser qp2 = new StandardQueryParser();
  qp2.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET));
  q = qp2.parse("the^3", "field");
  // "the" is a stop word so the result is an empty query:
  assertNotNull(q);
  assertEquals("", q.toString());
  assertEquals(1.0f, q.getBoost(), 0.01f);
}
public void testBoost() throws Exception {
  CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("on"));
  Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
  CommonQueryParserConfiguration qp = getParserConfig(oneStopAnalyzer);

  Query q = getQuery("on^1.0", qp);
  assertNotNull(q);
  q = getQuery("\"hello\"^2.0", qp);
  assertNotNull(q);
  assertEquals(q.getBoost(), (float) 2.0, (float) 0.5);
  q = getQuery("hello^2.0", qp);
  assertNotNull(q);
  assertEquals(q.getBoost(), (float) 2.0, (float) 0.5);
  q = getQuery("\"on\"^1.0", qp);
  assertNotNull(q);

  Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
  CommonQueryParserConfiguration qp2 = getParserConfig(a2);
  q = getQuery("the^3", qp2);
  // "the" is a stop word so the result is an empty query:
  assertNotNull(q);
  assertEquals("", q.toString());
  assertEquals(1.0f, q.getBoost(), 0.01f);
}
public void testCustomProvider() throws IOException {
  AutomatonProvider myProvider = new AutomatonProvider() {
    // automaton that matches quick, brown, or bob
    private Automaton quickBrownAutomaton = Operations.union(Arrays.asList(
        Automata.makeString("quick"),
        Automata.makeString("brown"),
        Automata.makeString("bob")));

    @Override
    public Automaton getAutomaton(String name) {
      if (name.equals("quickBrown")) return quickBrownAutomaton;
      else return null;
    }
  };
  RegexpQuery query = new RegexpQuery(newTerm("<quickBrown>"), RegExp.ALL, myProvider,
      DEFAULT_MAX_DETERMINIZED_STATES);
  assertEquals(1, searcher.search(query, 5).totalHits);
}
/**
 * Test some very simple automata.
 */
public void testAutomata() throws IOException {
  assertAutomatonHits(0, Automata.makeEmpty());
  assertAutomatonHits(0, Automata.makeEmptyString());
  assertAutomatonHits(2, Automata.makeAnyChar());
  assertAutomatonHits(3, Automata.makeAnyString());
  assertAutomatonHits(2, Automata.makeString("doc"));
  assertAutomatonHits(1, Automata.makeChar('a'));
  assertAutomatonHits(2, Automata.makeCharRange('a', 'b'));
  assertAutomatonHits(2, Automata.makeInterval(1233, 2346, 0));
  assertAutomatonHits(1, Automata.makeInterval(0, 2000, 0));
  assertAutomatonHits(2, Operations.union(Automata.makeChar('a'), Automata.makeChar('b')));
  assertAutomatonHits(0, Operations.intersection(Automata.makeChar('a'), Automata.makeChar('b')));
  assertAutomatonHits(1, Operations.minus(Automata.makeCharRange('a', 'b'), Automata.makeChar('a'),
      DEFAULT_MAX_DETERMINIZED_STATES));
}
@Test
public void testBoost() throws Exception {
  CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("on"));
  Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
  QueryParser qp = getParserConfig(oneStopAnalyzer);

  Query q = getQuery("on^1.0", qp);
  Assert.assertNotNull(q);
  q = getQuery("\"hello\"^2.0", qp);
  Assert.assertNotNull(q);
  Assert.assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);
  q = getQuery("hello^2.0", qp);
  Assert.assertNotNull(q);
  Assert.assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);
  q = getQuery("\"on\"^1.0", qp);
  Assert.assertNotNull(q);

  Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
  QueryParser qp2 = getParserConfig(a2);
  q = getQuery("the^3", qp2);
  // "the" is a stop word so the result is an empty query:
  Assert.assertNotNull(q);
  assertMatchNoDocsQuery(q);
  Assert.assertFalse(q instanceof BoostQuery);
}
public void testBoost() throws Exception {
  CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("on"));
  Analyzer oneStopAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
  CommonQueryParserConfiguration qp = getParserConfig(oneStopAnalyzer);

  Query q = getQuery("on^1.0", qp);
  assertNotNull(q);
  q = getQuery("\"hello\"^2.0", qp);
  assertNotNull(q);
  assertEquals(getBoost(q), (float) 2.0, (float) 0.5);
  q = getQuery("hello^2.0", qp);
  assertNotNull(q);
  assertEquals(((BoostQuery) q).getBoost(), (float) 2.0, (float) 0.5);
  q = getQuery("\"on\"^1.0", qp);
  assertNotNull(q);

  Analyzer a2 = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
  CommonQueryParserConfiguration qp2 = getParserConfig(a2);
  q = getQuery("the^3", qp2);
  // "the" is a stop word so the result is an empty query:
  assertNotNull(q);
  assertEmpty(q);
  assertEquals(1.0f, getBoost(q), 0.01f);
}
/**
 * Build the {@link CharacterRunAutomaton} that represents the reindex-from-remote whitelist
 * and make sure that it doesn't whitelist the world.
 */
static CharacterRunAutomaton buildRemoteWhitelist(List<String> whitelist) {
  if (whitelist.isEmpty()) {
    return new CharacterRunAutomaton(Automata.makeEmpty());
  }
  Automaton automaton = Regex.simpleMatchToAutomaton(whitelist.toArray(Strings.EMPTY_ARRAY));
  automaton = MinimizationOperations.minimize(automaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  if (Operations.isTotal(automaton)) {
    throw new IllegalArgumentException("Refusing to start because whitelist " + whitelist + " accepts all addresses. "
        + "This would allow users to reindex-from-remote any URL they like effectively having Elasticsearch make HTTP GETs "
        + "for them.");
  }
  return new CharacterRunAutomaton(automaton);
}
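// A usage sketch under the semantics above (the host strings are hypothetical):
// a concrete whitelist admits only the listed addresses, while a pattern such as
// "*" would make the automaton total and trigger the IllegalArgumentException.
static void remoteWhitelistSketch() {
  CharacterRunAutomaton allowed = buildRemoteWhitelist(Arrays.asList("localhost:9200"));
  assert allowed.run("localhost:9200");
  assert allowed.run("evil.example.com:9200") == false;
}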
/** Return an {@link Automaton} that matches the given pattern. */
public static Automaton simpleMatchToAutomaton(String pattern) {
  List<Automaton> automata = new ArrayList<>();
  int previous = 0;
  for (int i = pattern.indexOf('*'); i != -1; i = pattern.indexOf('*', i + 1)) {
    automata.add(Automata.makeString(pattern.substring(previous, i)));
    automata.add(Automata.makeAnyString());
    previous = i + 1;
  }
  automata.add(Automata.makeString(pattern.substring(previous)));
  return Operations.concatenate(automata);
}
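// Worked example (a sketch): "fo*ar" expands to makeString("fo"), makeAnyString(),
// and makeString("ar") concatenated in order, so the automaton accepts "foar" and
// "foobar" but rejects "fobaz".
static void simpleMatchSketch() {
  CharacterRunAutomaton run = new CharacterRunAutomaton(simpleMatchToAutomaton("fo*ar"));
  assert run.run("foar");
  assert run.run("foobar");
  assert run.run("fobaz") == false;
}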
@Override
public Automaton toAutomaton() {
  List<Automaton> automatons = new ArrayList<>();
  for (CharSequence value : values) {
    automatons.add(Automata.makeString(value.toString()));
  }
  return Operations.union(automatons);
}
@Override
public Automaton toAutomaton() {
  Automaton automaton;
  if (precisions == null || precisions.length == 0) {
    automaton = Automata.makeString(location);
  } else {
    automaton = Automata.makeString(location.substring(0, Math.max(1, Math.min(location.length(), precisions[0]))));
    for (int i = 1; i < precisions.length; i++) {
      final String cell = location.substring(0, Math.max(1, Math.min(location.length(), precisions[i])));
      automaton = Operations.union(automaton, Automata.makeString(cell));
    }
  }
  return automaton;
}
@BeforeClass
public static void beforeClass() throws Exception {
  Random random = random();
  directory = newDirectory();
  stopword = "" + randomChar();
  CharacterRunAutomaton stopset = new CharacterRunAutomaton(Automata.makeString(stopword));
  analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false, stopset);
  RandomIndexWriter iw = new RandomIndexWriter(random, directory, analyzer);
  Document doc = new Document();
  Field id = new StringField("id", "", Field.Store.NO);
  Field field = new TextField("field", "", Field.Store.NO);
  doc.add(id);
  doc.add(field);

  // index some docs
  int numDocs = atLeast(1000);
  for (int i = 0; i < numDocs; i++) {
    id.setStringValue(Integer.toString(i));
    field.setStringValue(randomFieldContents());
    iw.addDocument(doc);
  }

  // delete some docs
  int numDeletes = numDocs / 20;
  for (int i = 0; i < numDeletes; i++) {
    Term toDelete = new Term("id", Integer.toString(random.nextInt(numDocs)));
    if (random.nextBoolean()) {
      iw.deleteDocuments(toDelete);
    } else {
      iw.deleteDocuments(new TermQuery(toDelete));
    }
  }

  reader = iw.getReader();
  s1 = newSearcher(reader);
  s2 = newSearcher(reader);
  iw.close();
}
/**
 * Test that a nondeterministic automaton works correctly (it will be
 * determinized).
 */
public void testNFA() throws IOException {
  // accept "this" or "three": the union is an NFA (two transitions for 't' from
  // the initial state)
  Automaton nfa = Operations.union(Automata.makeString("this"), Automata.makeString("three"));
  assertAutomatonHits(2, nfa);
}
public void testEquals() {
  AutomatonQuery a1 = new AutomatonQuery(newTerm("foobar"), Automata.makeString("foobar"));
  // reference to a1
  AutomatonQuery a2 = a1;
  // same as a1 (accepts the same language, same term)
  AutomatonQuery a3 = new AutomatonQuery(newTerm("foobar"),
      Operations.concatenate(Automata.makeString("foo"), Automata.makeString("bar")));
  // different than a1 (same term, but different language)
  AutomatonQuery a4 = new AutomatonQuery(newTerm("foobar"), Automata.makeString("different"));
  // different than a1 (different term, same language)
  AutomatonQuery a5 = new AutomatonQuery(newTerm("blah"), Automata.makeString("foobar"));

  assertEquals(a1.hashCode(), a2.hashCode());
  assertEquals(a1, a2);
  assertEquals(a1.hashCode(), a3.hashCode());
  assertEquals(a1, a3);

  // different class
  AutomatonQuery w1 = new WildcardQuery(newTerm("foobar"));
  // different class
  AutomatonQuery w2 = new RegexpQuery(newTerm("foobar"));
  assertFalse(a1.equals(w1));
  assertFalse(a1.equals(w2));
  assertFalse(w1.equals(w2));
  assertFalse(a1.equals(a4));
  assertFalse(a1.equals(a5));
  assertFalse(a1.equals(null));
}
/**
 * Test that rewriting to a single term works as expected and preserves
 * MultiTermQuery semantics.
 */
public void testRewriteSingleTerm() throws IOException {
  AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), Automata.makeString("piece"));
  Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN);
  assertTrue(aq.getTermsEnum(terms) instanceof SingleTermsEnum);
  assertEquals(1, automatonQueryNrHits(aq));
}
/**
 * Test that rewriting to a prefix query works as expected and preserves
 * MultiTermQuery semantics.
 */
public void testRewritePrefix() throws IOException {
  Automaton pfx = Automata.makeString("do");
  Automaton prefixAutomaton = Operations.concatenate(pfx, Automata.makeAnyString());
  AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton);
  Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN);
  assertTrue(aq.getTermsEnum(terms) instanceof PrefixTermsEnum);
  assertEquals(3, automatonQueryNrHits(aq));
}
/**
 * Test handling of the empty language.
 */
public void testEmptyOptimization() throws IOException {
  AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), Automata.makeEmpty());
  // not yet available: assertTrue(aq.getEnum(searcher.getIndexReader())
  // instanceof EmptyTermEnum);
  Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN);
  assertSame(TermsEnum.EMPTY, aq.getTermsEnum(terms));
  assertEquals(0, automatonQueryNrHits(aq));
}
public void testHugeAutomaton() {
  List<BytesRef> terms = new ArrayList<>();
  while (terms.size() < 10000) {
    terms.add(new BytesRef(TestUtil.randomUnicodeString(random())));
  }
  Collections.sort(terms);
  new AutomatonQuery(new Term("foo", "bar"), Automata.makeStringUnion(terms), Integer.MAX_VALUE);
}
/** Test a configuration that behaves a lot like KeepWordFilter */
public void testKeep() throws Exception {
  CharacterRunAutomaton keepWords = new CharacterRunAutomaton(
      Operations.complement(
          Operations.union(
              Arrays.asList(Automata.makeString("foo"), Automata.makeString("bar"))),
          DEFAULT_MAX_DETERMINIZED_STATES));
  Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords);
  assertAnalyzesTo(a, "quick foo brown bar bar fox foo",
      new String[] { "foo", "bar", "bar", "foo" },
      new int[] { 2, 2, 1, 2 });
}
@Override @SuppressWarnings("unchecked") Filter makeFilter(String fname, Iterator<BytesRef> it) { Automaton union = Automata.makeStringUnion(IteratorUtils.toList(it)); return new MultiTermQueryWrapperFilter<AutomatonQuery>(new AutomatonQuery(new Term(fname), union)) { }; }
public static void main(String[] args) throws IOException {
  Directory dir = new RAMDirectory();
  Analyzer analyzer = new StandardAnalyzer();
  IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_10_4, analyzer);
  iwc.setOpenMode(OpenMode.CREATE);
  IndexWriter writer = new IndexWriter(dir, iwc);

  Document doc = new Document();
  doc.add(new TextField("text", "muffin", Field.Store.YES));
  writer.addDocument(doc);

  doc = new Document();
  doc.add(new TextField("text", "zmuffin", Field.Store.YES));
  writer.addDocument(doc);

  doc = new Document();
  doc.add(new TextField("text", "mufffin", Field.Store.YES));
  writer.addDocument(doc);
  writer.close();

  IndexReader reader = DirectoryReader.open(dir);
  IndexSearcher searcher = new IndexSearcher(reader);
  MultiTermQuery query = new AutomatonQuery(new Term("text"), Automata.makeAnyString());
  query.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
  System.out.println("query: " + query);
  TopDocs results = searcher.search(query, null, 100);
  ScoreDoc[] scoreDocs = results.scoreDocs;
  for (int i = 0; i < scoreDocs.length; ++i) {
    System.out.println(searcher.explain(query, scoreDocs[i].doc));
  }
}
/** Make matches on objects also match dots in field names.
 * For instance, if the original simple regex is `foo`, this will translate
 * it into `foo` OR `foo.*`. */
private static Automaton makeMatchDotsInFieldNames(Automaton automaton) {
  return Operations.union(
      automaton,
      Operations.concatenate(Arrays.asList(automaton, Automata.makeChar('.'), Automata.makeAnyString())));
}
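// Illustrative sketch: for the simple pattern "foo", the union above matches both
// "foo" itself and any dotted sub-field such as "foo.bar", but not "foobar".
private static void matchDotsSketch() {
  CharacterRunAutomaton run = new CharacterRunAutomaton(makeMatchDotsInFieldNames(Automata.makeString("foo")));
  assert run.run("foo");
  assert run.run("foo.bar");
  assert run.run("foobar") == false;
}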
public void testMaxSizeHighlightTruncates() throws Exception {
  TestHighlightRunner helper = new TestHighlightRunner() {
    @Override
    public void run() throws Exception {
      String goodWord = "goodtoken";
      CharacterRunAutomaton stopWords = new CharacterRunAutomaton(Automata.makeString("stoppedtoken"));
      // we disable MockTokenizer checks because we will forcefully limit the
      // tokenstream and call end() before incrementToken() returns false.
      final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, stopWords);
      analyzer.setEnableChecks(false);
      TermQuery query = new TermQuery(new Term("data", goodWord));

      String match;
      StringBuilder sb = new StringBuilder();
      sb.append(goodWord);
      for (int i = 0; i < 10000; i++) {
        sb.append(" ");
        // only one stopword
        sb.append("stoppedtoken");
      }
      SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
      Highlighter hg = getHighlighter(query, "data", fm); // new Highlighter(fm, new QueryTermScorer(query));
      hg.setTextFragmenter(new NullFragmenter());
      hg.setMaxDocCharsToAnalyze(100);
      match = hg.getBestFragment(analyzer, "data", sb.toString());
      assertTrue("Matched text should be no more than 100 chars in length ",
          match.length() < hg.getMaxDocCharsToAnalyze());

      // add another tokenized word to the overall length - but set way beyond
      // the length of text under consideration (after a large slug of stop
      // words + whitespace)
      sb.append(" ");
      sb.append(goodWord);
      match = hg.getBestFragment(analyzer, "data", sb.toString());
      assertTrue("Matched text should be no more than 100 chars in length ",
          match.length() < hg.getMaxDocCharsToAnalyze());
    }
  };
  helper.start();
}
private Automaton s2a(String s) {
  return Automata.makeString(s);
}
@Override
Filter makeFilter(String fname, BytesRef[] byteRefs) {
  Automaton union = Automata.makeStringUnion(Arrays.asList(byteRefs));
  return new MultiTermQueryWrapperFilter<AutomatonQuery>(new AutomatonQuery(new Term(fname), union)) {
  };
}
@Override @SuppressWarnings("unchecked") Query makeQuery(String fname, Iterator<BytesRef> it) { Automaton union = Automata.makeStringUnion(IteratorUtils.toList(it)); return new AutomatonQuery(new Term(fname), union); }