final Automaton toLookupAutomaton(final CharSequence key) throws IOException { // TODO: is there a Reader from a CharSequence? // Turn tokenstream into automaton: TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString())); Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts); ts.end(); ts.close(); // TODO: we could use the end offset to "guess" // whether the final token was a partial token; this // would only be a heuristic ... but maybe an OK one. // This way we could eg differentiate "net" from "net ", // which we can't today... replaceSep(automaton); // TODO: we can optimize this somewhat by determinizing // while we convert BasicOperations.determinize(automaton); return automaton; }
@Override public void setUp() throws Exception { super.setUp(); // build an automaton matching this jvm's letter definition State initial = new State(); State accept = new State(); accept.setAccept(true); for (int i = 0; i <= 0x10FFFF; i++) { if (Character.isLetter(i)) { initial.addTransition(new Transition(i, i, accept)); } } Automaton single = new Automaton(initial); single.reduce(); Automaton repeat = BasicOperations.repeat(single); jvmLetter = new CharacterRunAutomaton(repeat); }
@Override
public boolean equals(Object obj) {
  if (this == obj) {
    return true;
  }
  // Superclass state, then exact runtime class, must both match.
  if (!super.equals(obj) || getClass() != obj.getClass()) {
    return false;
  }
  final AutomatonQuery that = (AutomatonQuery) obj;
  // Automata are compared by the language they accept, not identity.
  final boolean sameAutomaton = (automaton == null)
      ? that.automaton == null
      : BasicOperations.sameLanguage(automaton, that.automaton);
  if (!sameAutomaton) {
    return false;
  }
  return (term == null) ? that.term == null : term.equals(that.term);
}
/** initialize levenshtein DFAs up to maxDistance, if possible */
private List<CompiledAutomaton> initAutomata(int maxDistance) {
  // Automata are cached on the attribute; index i holds the DFA for
  // edit distance i, so we only build the distances not yet present.
  final List<CompiledAutomaton> runAutomata = dfaAtt.automata();
  //System.out.println("cached automata size: " + runAutomata.size());
  if (runAutomata.size() <= maxDistance &&
      maxDistance <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
    // Builder operates on the suffix after the constant prefix.
    LevenshteinAutomata builder = new LevenshteinAutomata(
        UnicodeUtil.newString(termText, realPrefixLength, termText.length - realPrefixLength),
        transpositions);
    for (int i = runAutomata.size(); i <= maxDistance; i++) {
      Automaton a = builder.toAutomaton(i);
      //System.out.println("compute automaton n=" + i);
      // constant prefix
      if (realPrefixLength > 0) {
        // Prepend the unedited prefix so matches must share it exactly.
        Automaton prefix = BasicAutomata.makeString(
            UnicodeUtil.newString(termText, 0, realPrefixLength));
        a = BasicOperations.concatenate(prefix, a);
      }
      runAutomata.add(new CompiledAutomaton(a, true, false));
    }
  }
  return runAutomata;
}
public void testCustomProvider() throws IOException { AutomatonProvider myProvider = new AutomatonProvider() { // automaton that matches quick or brown private Automaton quickBrownAutomaton = BasicOperations.union(Arrays .asList(BasicAutomata.makeString("quick"), BasicAutomata.makeString("brown"), BasicAutomata.makeString("bob"))); @Override public Automaton getAutomaton(String name) { if (name.equals("quickBrown")) return quickBrownAutomaton; else return null; } }; RegexpQuery query = new RegexpQuery(newTerm("<quickBrown>"), RegExp.ALL, myProvider); assertEquals(1, searcher.search(query, 5).totalHits); }
/**
 * Test some very simple automata.
 */
public void testBasicAutomata() throws IOException {
  // Primitive automata against the test index; expected hit counts
  // depend on the documents indexed in setUp.
  assertAutomatonHits(0, BasicAutomata.makeEmpty());
  assertAutomatonHits(0, BasicAutomata.makeEmptyString());
  assertAutomatonHits(2, BasicAutomata.makeAnyChar());
  assertAutomatonHits(3, BasicAutomata.makeAnyString());
  assertAutomatonHits(2, BasicAutomata.makeString("doc"));
  assertAutomatonHits(1, BasicAutomata.makeChar('a'));
  assertAutomatonHits(2, BasicAutomata.makeCharRange('a', 'b'));
  // Numeric-interval automata.
  assertAutomatonHits(2, BasicAutomata.makeInterval(1233, 2346, 0));
  assertAutomatonHits(1, BasicAutomata.makeInterval(0, 2000, 0));
  // Boolean combinations of single-char automata.
  assertAutomatonHits(2, BasicOperations.union(BasicAutomata.makeChar('a'),
      BasicAutomata.makeChar('b')));
  assertAutomatonHits(0, BasicOperations.intersection(BasicAutomata
      .makeChar('a'), BasicAutomata.makeChar('b')));
  assertAutomatonHits(1, BasicOperations.minus(BasicAutomata.makeCharRange('a', 'b'),
      BasicAutomata.makeChar('a')));
}
public void testOverlappedTokensLattice() throws Exception { final TokenStream ts = new CannedTokenStream( new Token[] { token("abc", 1, 1), token("xyz", 0, 2), token("def", 1, 1), }); final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts); final Automaton a1 = BasicAutomata.makeString("xyz"); final Automaton a2 = join("abc", "def"); final Automaton expected = BasicOperations.union(a1, a2); //toDot(actual); assertTrue(BasicOperations.sameLanguage(expected, actual)); }
public void testSynOverHole() throws Exception { final TokenStream ts = new CannedTokenStream( new Token[] { token("a", 1, 1), token("X", 0, 2), token("b", 2, 1), }); final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts); final Automaton a1 = BasicOperations.union( join(s2a("a"), SEP_A, HOLE_A), BasicAutomata.makeString("X")); final Automaton expected = BasicOperations.concatenate(a1, join(SEP_A, s2a("b"))); //toDot(actual); assertTrue(BasicOperations.sameLanguage(expected, actual)); }
public void testOverlappedTokensLattice2() throws Exception { final TokenStream ts = new CannedTokenStream( new Token[] { token("abc", 1, 1), token("xyz", 0, 3), token("def", 1, 1), token("ghi", 1, 1), }); final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts); final Automaton a1 = BasicAutomata.makeString("xyz"); final Automaton a2 = join("abc", "def", "ghi"); final Automaton expected = BasicOperations.union(a1, a2); //toDot(actual); assertTrue(BasicOperations.sameLanguage(expected, actual)); }
/**
 * Analyzes {@code key} with the query analyzer and converts the resulting
 * token stream into an automaton (separators replaced, then determinized).
 * The stream is always closed, even when analysis throws.
 */
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
  // TODO: is there a Reader from a CharSequence?
  // Turn tokenstream into automaton:
  Automaton automaton = null;
  TokenStream ts = queryAnalyzer.tokenStream("", key.toString());
  try {
    automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
  } finally {
    // Close without masking an in-flight exception from toAutomaton().
    IOUtils.closeWhileHandlingException(ts);
  }

  // TODO: we could use the end offset to "guess"
  // whether the final token was a partial token; this
  // would only be a heuristic ... but maybe an OK one.
  // This way we could eg differentiate "net" from "net ",
  // which we can't today...

  replaceSep(automaton);

  // TODO: we can optimize this somewhat by determinizing
  // while we convert
  BasicOperations.determinize(automaton);
  return automaton;
}
/** * Test that a nondeterministic automaton works correctly. (It should will be * determinized) */ public void testNFA() throws IOException { // accept this or three, the union is an NFA (two transitions for 't' from // initial state) Automaton nfa = BasicOperations.union(BasicAutomata.makeString("this"), BasicAutomata.makeString("three")); assertAutomatonHits(2, nfa); }
public void testEquals() { AutomatonQuery a1 = new AutomatonQuery(newTerm("foobar"), BasicAutomata .makeString("foobar")); // reference to a1 AutomatonQuery a2 = a1; // same as a1 (accepts the same language, same term) AutomatonQuery a3 = new AutomatonQuery(newTerm("foobar"), BasicOperations .concatenate(BasicAutomata.makeString("foo"), BasicAutomata .makeString("bar"))); // different than a1 (same term, but different language) AutomatonQuery a4 = new AutomatonQuery(newTerm("foobar"), BasicAutomata .makeString("different")); // different than a1 (different term, same language) AutomatonQuery a5 = new AutomatonQuery(newTerm("blah"), BasicAutomata .makeString("foobar")); assertEquals(a1, a2); assertEquals(a1, a3); // different class AutomatonQuery w1 = new WildcardQuery(newTerm("foobar")); // different class AutomatonQuery w2 = new RegexpQuery(newTerm("foobar")); assertFalse(a1.equals(w1)); assertFalse(a1.equals(w2)); assertFalse(w1.equals(w2)); assertFalse(a1.equals(a4)); assertFalse(a1.equals(a5)); assertFalse(a1.equals(null)); }
/** * Test that rewriting to a prefix query works as expected, preserves * MultiTermQuery semantics. */ public void testRewritePrefix() throws IOException { Automaton pfx = BasicAutomata.makeString("do"); pfx.expandSingleton(); // expand singleton representation for testing Automaton prefixAutomaton = BasicOperations.concatenate(pfx, BasicAutomata .makeAnyString()); AutomatonQuery aq = new AutomatonQuery(newTerm("bogus"), prefixAutomaton); Terms terms = MultiFields.getTerms(searcher.getIndexReader(), FN); assertTrue(aq.getTermsEnum(terms) instanceof PrefixTermsEnum); assertEquals(3, automatonQueryNrHits(aq)); }
/** Test a configuration that behaves a lot like KeepWordFilter */
public void testKeep() throws Exception {
  // Run automaton accepting everything EXCEPT "foo" and "bar"; used as a
  // stop set below, so only foo/bar survive analysis.
  // Consistency: use BasicOperations.union like the rest of this file
  // (previously called the Automaton.union convenience wrapper).
  CharacterRunAutomaton keepWords = new CharacterRunAutomaton(
      BasicOperations.complement(
          BasicOperations.union(
              Arrays.asList(BasicAutomata.makeString("foo"),
                  BasicAutomata.makeString("bar")))));
  Analyzer a = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, keepWords, true);
  assertAnalyzesTo(a, "quick foo brown bar bar fox foo",
      new String[] { "foo", "bar", "bar", "foo" },
      new int[] { 2, 2, 1, 2 });
}
public void testSingleToken() throws Exception {
  // A lone token becomes the automaton of its own text.
  final TokenStream ts =
      new CannedTokenStream(new Token[] { token("abc", 1, 1) });
  final Automaton actual = new TokenStreamToAutomaton().toAutomaton(ts);
  assertTrue(BasicOperations.sameLanguage(BasicAutomata.makeString("abc"), actual));
}
public void testMultipleHoles() throws Exception {
  // posInc=3 on "b": two positions were skipped between "a" and "b",
  // each modeled as SEP + HOLE in the automaton.
  final TokenStream ts = new CannedTokenStream(new Token[] {
      token("a", 1, 1),
      token("b", 3, 1),
  });
  final Automaton actual = new TokenStreamToAutomaton().toAutomaton(ts);
  final Automaton expected =
      join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
  assertTrue(BasicOperations.sameLanguage(expected, actual));
}
public void testSynOverMultipleHoles() throws Exception {
  final TokenStream ts = new CannedTokenStream(new Token[] {
      token("a", 1, 1),
      token("x", 0, 3), // overlaps "a" plus the two holes before "b"
      token("b", 3, 1),
  });
  final Automaton actual = new TokenStreamToAutomaton().toAutomaton(ts);
  // Either walk a + two holes, or take the synonym x, then b.
  final Automaton viaHoles =
      join(s2a("a"), SEP_A, HOLE_A, SEP_A, HOLE_A, SEP_A, s2a("b"));
  final Automaton viaSynonym = join(s2a("x"), SEP_A, s2a("b"));
  final Automaton expected = BasicOperations.union(viaHoles, viaSynonym);
  assertTrue(BasicOperations.sameLanguage(expected, actual));
}
/** Concatenates the given strings' automata, separated by SEP_A. */
private Automaton join(String ... strings) {
  final List<Automaton> pieces = new ArrayList<Automaton>();
  for (final String s : strings) {
    pieces.add(BasicAutomata.makeString(s));
    pieces.add(SEP_A);
  }
  // Every element appended a trailing separator; drop the last one.
  pieces.remove(pieces.size() - 1);
  return BasicOperations.concatenate(pieces);
}
public void testTwoTokens() throws Exception { final TokenStream ts = new CannedTokenStream( new Token[] { token("abc", 1, 1), token("def", 1, 1), }); final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts); final Automaton expected = join("abc", "def"); //toDot(actual); assertTrue(BasicOperations.sameLanguage(expected, actual)); }
public void testHole() throws Exception { final TokenStream ts = new CannedTokenStream( new Token[] { token("abc", 1, 1), token("def", 2, 1), }); final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts); final Automaton expected = join(s2a("abc"), SEP_A, HOLE_A, SEP_A, s2a("def")); //toDot(actual); assertTrue(BasicOperations.sameLanguage(expected, actual)); }
public void testOverlappedTokensSausage() throws Exception { // Two tokens on top of each other (sausage): final TokenStream ts = new CannedTokenStream( new Token[] { token("abc", 1, 1), token("xyz", 0, 1) }); final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts); final Automaton a1 = BasicAutomata.makeString("abc"); final Automaton a2 = BasicAutomata.makeString("xyz"); final Automaton expected = BasicOperations.union(a1, a2); assertTrue(BasicOperations.sameLanguage(expected, actual)); }
public void testSynOverHole2() throws Exception {
  final TokenStream ts = new CannedTokenStream(new Token[] {
      token("xyz", 1, 1),
      token("abc", 0, 3), // overlaps xyz, the hole, and def
      token("def", 2, 1),
  });
  final Automaton actual = new TokenStreamToAutomaton().toAutomaton(ts);
  // Either xyz + hole + def, or the single synonym abc covering all three.
  final Automaton expected = BasicOperations.union(
      join(s2a("xyz"), SEP_A, HOLE_A, SEP_A, s2a("def")),
      BasicAutomata.makeString("abc"));
  assertTrue(BasicOperations.sameLanguage(expected, actual));
}
public void testStartsWithHole() throws Exception { final TokenStream ts = new CannedTokenStream( new Token[] { token("abc", 2, 1), }); final Automaton actual = (new TokenStreamToAutomaton()).toAutomaton(ts); final Automaton expected = join(HOLE_A, SEP_A, s2a("abc")); //toDot(actual); assertTrue(BasicOperations.sameLanguage(expected, actual)); }
public void testSynHangingOverEnd() throws Exception {
  // "X" declares posLen=10 but the stream ends after one position; it
  // must still be accepted as a plain alternative to "a".
  final TokenStream ts = new CannedTokenStream(new Token[] {
      token("a", 1, 1),
      token("X", 0, 10),
  });
  final Automaton actual = new TokenStreamToAutomaton().toAutomaton(ts);
  final Automaton expected = BasicOperations.union(
      BasicAutomata.makeString("a"),
      BasicAutomata.makeString("X"));
  assertTrue(BasicOperations.sameLanguage(expected, actual));
}
@Override
protected Automaton convertAutomaton(Automaton a) {
  // Byte-level lookups need the UTF-32 automaton re-expressed over UTF-8.
  if (!unicodeAware) {
    return a;
  }
  final Automaton utf8 = new UTF32ToUTF8().convert(a);
  BasicOperations.determinize(utf8);
  return utf8;
}