static Function<Query, Result> phraseQuery() { return query -> { Term[] terms = ((PhraseQuery) query).getTerms(); if (terms.length == 0) { return new Result(true, Collections.emptySet()); } // the longest term is likely to be the rarest, // so from a performance perspective it makes sense to extract that Term longestTerm = terms[0]; for (Term term : terms) { if (longestTerm.bytes().length < term.bytes().length) { longestTerm = term; } } return new Result(false, Collections.singleton(longestTerm)); }; }
/**
 * A quoted query string with a phrase slop must parse to a DisjunctionMaxQuery
 * holding one PhraseQuery with both terms and the configured slop.
 */
public void testToQueryPhraseQuery() throws IOException {
    assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
    Query parsed = queryStringQuery("\"term1 term2\"")
            .defaultField(STRING_FIELD_NAME)
            .phraseSlop(3)
            .toQuery(createShardContext());
    assertThat(parsed, instanceOf(DisjunctionMaxQuery.class));
    DisjunctionMaxQuery dismax = (DisjunctionMaxQuery) parsed;
    assertThat(dismax.getDisjuncts().size(), equalTo(1));
    Query disjunct = dismax.getDisjuncts().get(0);
    assertThat(disjunct, instanceOf(PhraseQuery.class));
    PhraseQuery phrase = (PhraseQuery) disjunct;
    assertThat(phrase.getTerms().length, equalTo(2));
    assertThat(phrase.getTerms()[0], equalTo(new Term(STRING_FIELD_NAME, "term1")));
    assertThat(phrase.getTerms()[1], equalTo(new Term(STRING_FIELD_NAME, "term2")));
    assertThat(phrase.getSlop(), equalTo(3));
}
/**
 * A phrase with inline slop ("...~2") on a boosted field must produce a BoostQuery
 * wrapping a PhraseQuery, carrying both the field boost and the parsed slop.
 */
public void testToQueryPhraseQueryBoostAndSlop() throws IOException {
    assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
    QueryStringQueryBuilder builder =
            new QueryStringQueryBuilder("\"test phrase\"~2").field(STRING_FIELD_NAME, 5f);
    Query parsed = builder.toQuery(createShardContext());
    assertThat(parsed, instanceOf(DisjunctionMaxQuery.class));
    DisjunctionMaxQuery dismax = (DisjunctionMaxQuery) parsed;
    assertThat(dismax.getDisjuncts().size(), equalTo(1));
    Query disjunct = dismax.getDisjuncts().get(0);
    assertThat(disjunct, instanceOf(BoostQuery.class));
    BoostQuery boosted = (BoostQuery) disjunct;
    assertThat(boosted.getBoost(), equalTo(5f));
    assertThat(boosted.getQuery(), instanceOf(PhraseQuery.class));
    PhraseQuery phrase = (PhraseQuery) boosted.getQuery();
    assertThat(phrase.getSlop(), Matchers.equalTo(2));
    assertThat(phrase.getTerms().length, equalTo(2));
}
/**
 * Applies the slop carried by a {@link SlopQueryNode} to the query already built
 * for its child (stored under the query-tree-builder tag) and returns that query.
 */
@Override
public Query build(QueryNode queryNode) throws QueryNodeException {
    SlopQueryNode slopNode = (SlopQueryNode) queryNode;
    Query childQuery = (Query) slopNode.getChild().getTag(QueryTreeBuilder.QUERY_TREE_BUILDER_TAGID);
    if (childQuery instanceof PhraseQuery) {
        ((PhraseQuery) childQuery).setSlop(slopNode.getValue());
    } else {
        // NOTE(review): assumes any non-PhraseQuery child here is a MultiPhraseQuery;
        // anything else would throw ClassCastException — confirm against the builder wiring.
        ((MultiPhraseQuery) childQuery).setSlop(slopNode.getValue());
    }
    return childQuery;
}
/**
 * Returns {@code q} with the given slop applied when it is a phrase-style query.
 * A PhraseQuery is rebuilt (preserving terms, positions and boost); a
 * MultiPhraseQuery has its slop set in place; any other query is returned unchanged.
 */
private Query applySlop(Query q, int slop) {
    if (q instanceof MultiPhraseQuery) {
        // MultiPhraseQuery exposes a mutable slop, so no rebuild is needed.
        ((MultiPhraseQuery) q).setSlop(slop);
        return q;
    }
    if (!(q instanceof PhraseQuery)) {
        return q; // nothing to do for non-phrase queries
    }
    PhraseQuery original = (PhraseQuery) q;
    PhraseQuery.Builder rebuilt = new PhraseQuery.Builder();
    rebuilt.setSlop(slop);
    final Term[] terms = original.getTerms();
    final int[] positions = original.getPositions();
    for (int i = 0; i < terms.length; i++) {
        rebuilt.add(terms[i], positions[i]);
    }
    PhraseQuery result = rebuilt.build();
    // The builder does not carry over the original boost, so copy it explicitly.
    result.setBoost(q.getBoost());
    return result;
}
/**
 * Dispatches to a type-specific strategy for turning {@code query} into a cheaper
 * "limiting" filter. Branch order matters: SpanQuery and Filter are checked before
 * broader query types so subtype relationships resolve to the intended handler.
 * Queries with no cheaper equivalent are passed through unchanged.
 */
private Optional<Query> limitingFilter(Query query, boolean isNegated) {
    if (query instanceof SpanQuery) {
        return limitingFilterForSpan((SpanQuery) query, isNegated);
    } else if (query instanceof Filter) {
        // Filters are already cheap; use as-is.
        return Optional.of(query);
    } else if (query instanceof BooleanQuery) {
        return boolQuery((BooleanQuery) query, isNegated);
    } else if (query instanceof TermQuery) {
        return Optional.of(query);
    } else if (query instanceof PhraseQuery) {
        return phraseFilter((PhraseQuery) query, isNegated);
    } else if (query instanceof MultiTermQuery) {
        return Optional.of(query);
    } else if (query instanceof WildcardPhraseQuery) {
        return wildcardPhraseFilter((WildcardPhraseQuery) query, isNegated);
    } else if (query instanceof ToParentBlockJoinQuery) {
        //This can be really bad for performance, if the nested query contains expensive operations (phrases/spans)
        //On the other hand, it is only slow if the field actually has any data, and we currently do not have
        // any data in the only nested text field (enrichments.sentences)
        return Optional.of(query);
    } else {
        //This should never happen, but if it does, it might be really bad for performance
        //logger.warn("failed to limit query, this should never happen. Query : [{}]", query.toString());
        return Optional.of(query);
    }
}
/**
 * Reduces a {@link PhraseQuery} to a cheaper limiting filter: a single-term phrase
 * becomes a plain TermQuery (even when negated); a non-negated multi-term phrase
 * becomes the conjunction of its terms; anything else cannot be limited (absent).
 */
private static Optional<Query> phraseFilter(PhraseQuery query, boolean isNegated) {
    Term[] terms = query.getTerms();
    // A one-term phrase degenerates to a plain term lookup regardless of negation.
    if (terms.length == 1) {
        return Optional.<Query>of(new TermQuery(terms[0]));
    }
    // Empty phrases and negated multi-term phrases cannot be limited safely.
    if (terms.length == 0 || isNegated) {
        return Optional.absent();
    }
    List<Query> termQueries = new ArrayList<>();
    for (Term term : terms) {
        termQueries.add(new TermQuery(term));
    }
    return Optional.<Query>of(all(termQueries));
}
/**
 * toPhraseQuery must split whitespace-separated text into one term per token,
 * each on the given field and in the original order.
 */
@Test
public void testBuildPhraseQuery() {
    String field = "testField";
    String text = "Foo Bar Baz";
    PhraseQuery pq = LuceneQueryFunctions.toPhraseQuery.apply(field, text);
    Term[] terms = pq.getTerms();
    // Fix: previously only indices 0-2 were checked, so a result with extra
    // trailing terms would still have passed. Pin the exact term count.
    assertThat(terms.length, is(3));
    String[] expected = {"Foo", "Bar", "Baz"};
    for (int i = 0; i < expected.length; i++) {
        assertThat(terms[i].field(), is(field));
        assertThat(terms[i].text(), is(expected[i]));
    }
}
/**
 * Builds the highlighter's per-field lookup structures from {@code query}:
 * the query is flattened into primitive queries, their terms are cached, and
 * overlapping phrases are expanded before being registered in the root maps.
 *
 * @param query           the top-level query to decompose
 * @param reader          used to rewrite multi-term queries while flattening/saving terms
 * @param phraseHighlight when false, each term of a multi-term phrase is ALSO added
 *                        individually so single-term matches still highlight
 * @param fieldMatch      whether term lookups are segregated per field
 */
FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
    this.fieldMatch = fieldMatch;
    // LinkedHashSet keeps flattening order deterministic.
    Set<Query> flatQueries = new LinkedHashSet<>();
    flatten( query, reader, flatQueries );
    saveTerms( flatQueries, reader );
    Collection<Query> expandQueries = expand( flatQueries );
    for( Query flatQuery : expandQueries ){
        QueryPhraseMap rootMap = getRootMap( flatQuery );
        rootMap.add( flatQuery, reader );
        // Without phrase highlighting, register each phrase term on its own as well.
        if( !phraseHighlight && flatQuery instanceof PhraseQuery ){
            PhraseQuery pq = (PhraseQuery)flatQuery;
            if( pq.getTerms().length > 1 ){
                for( Term term : pq.getTerms() )
                    rootMap.addTerm( term, flatQuery.getBoost() );
            }
        }
    }
}
/**
 * If the tail of {@code src} overlaps the head of {@code dest}, adds a merged
 * PhraseQuery (src followed by the non-overlapping remainder of dest) to
 * {@code expandQueries}, carrying the given slop and boost. The merged terms
 * reuse src's field name.
 */
private void checkOverlap( Collection<Query> expandQueries, Term[] src, Term[] dest, int slop, float boost ){
    // beginning from 1 (not 0) is safe because that the PhraseQuery has multiple terms
    // is guaranteed in flatten() method (if PhraseQuery has only one term, flatten()
    // converts PhraseQuery to TermQuery)
    for( int i = 1; i < src.length; i++ ){
        boolean overlap = true;
        // compare src[i..] against dest[0..]; any mismatch within dest's range kills the overlap
        for( int j = i; j < src.length; j++ ){
            if( ( j - i ) < dest.length && !src[j].text().equals( dest[j-i].text() ) ){
                overlap = false;
                break;
            }
        }
        // only merge when dest actually extends past the end of src
        if( overlap && src.length - i < dest.length ){
            PhraseQuery pq = new PhraseQuery();
            for( Term srcTerm : src )
                pq.add( srcTerm );
            // append the part of dest that was not covered by the overlap
            for( int k = src.length - i; k < dest.length; k++ ){
                pq.add( new Term( src[0].field(), dest[k].text() ) );
            }
            pq.setSlop( slop );
            pq.setBoost( boost );
            if(!expandQueries.contains( pq ) )
                expandQueries.add( pq );
        }
    }
}
/**
 * Records the term texts of each flattened query into the per-query term set.
 * Handles TermQuery and PhraseQuery directly; a MultiTermQuery is first rewritten
 * against {@code reader} (assumed to rewrite to a BooleanQuery of TermQuery clauses
 * — TODO confirm this holds for the rewrite method in use). Any other query type
 * indicates flatten() was not applied and is rejected.
 */
void saveTerms( Collection<Query> flatQueries, IndexReader reader ) throws IOException{
    for( Query query : flatQueries ){
        Set<String> termSet = getTermSet( query );
        if( query instanceof TermQuery )
            termSet.add( ((TermQuery)query).getTerm().text() );
        else if( query instanceof PhraseQuery ){
            for( Term term : ((PhraseQuery)query).getTerms() )
                termSet.add( term.text() );
        }
        else if (query instanceof MultiTermQuery && reader != null) {
            // rewrite expands the multi-term query into concrete term clauses
            BooleanQuery mtqTerms = (BooleanQuery) query.rewrite(reader);
            for (BooleanClause clause : mtqTerms.getClauses()) {
                termSet.add (((TermQuery) clause.getQuery()).getTerm().text());
            }
        }
        else
            throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
    }
}
/**
 * Registers a flattened query in this phrase map: a TermQuery adds a single term;
 * a PhraseQuery descends/creates one nested map per term and marks the final node
 * terminal with the phrase's slop and boost. Any other type means flatten() was
 * not applied and is rejected.
 */
void add( Query query, IndexReader reader ) {
    if( query instanceof TermQuery ){
        addTerm( ((TermQuery)query).getTerm(), query.getBoost() );
        return;
    }
    if( !(query instanceof PhraseQuery) ){
        throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." );
    }
    PhraseQuery phrase = (PhraseQuery) query;
    Map<String, QueryPhraseMap> currentLevel = subMap;
    QueryPhraseMap node = null;
    // Walk/create one map level per phrase term.
    for( Term term : phrase.getTerms() ){
        node = getOrNewMap( currentLevel, term.text() );
        currentLevel = node.subMap;
    }
    node.markTerminal( phrase.getSlop(), phrase.getBoost() );
}
/**
 * Adding the same field twice must concatenate the token streams: with no position
 * gap the phrase "fox jumps" matches across the boundary; once a position increment
 * gap is configured it only matches again with sufficient slop.
 */
public void testSameFieldAddedMultipleTimes() throws IOException {
    MemoryIndex memIndex = new MemoryIndex(random().nextBoolean(), random().nextInt(50) * 1024 * 1024);
    MockAnalyzer analyzer = new MockAnalyzer(random());
    memIndex.addField("field", "the quick brown fox", analyzer);
    memIndex.addField("field", "jumps over the", analyzer);
    AtomicReader reader = (AtomicReader) memIndex.createSearcher().getIndexReader();
    assertEquals(7, reader.terms("field").getSumTotalTermFreq());
    PhraseQuery phrase = new PhraseQuery();
    phrase.add(new Term("field", "fox"));
    phrase.add(new Term("field", "jumps"));
    // No gap: the phrase spans the boundary between the two added values.
    assertTrue(memIndex.search(phrase) > 0.1);
    memIndex.reset();
    analyzer.setPositionIncrementGap(1 + random().nextInt(10));
    memIndex.addField("field", "the quick brown fox", analyzer);
    memIndex.addField("field", "jumps over the", analyzer);
    // With a gap the exact phrase no longer matches...
    assertEquals(0, memIndex.search(phrase), 0.00001f);
    // ...but a generous slop bridges it again.
    phrase.setSlop(10);
    assertTrue("posGap" + analyzer.getPositionIncrementGap("field"), memIndex.search(phrase) > 0.0001);
}
public void testPositionIncrement() throws Exception { StandardQueryParser qp = new StandardQueryParser(); qp.setAnalyzer( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)); qp.setEnablePositionIncrements(true); String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\""; // 0 2 5 7 8 int expectedPositions[] = { 1, 3, 4, 6, 9 }; PhraseQuery pq = (PhraseQuery) qp.parse(qtxt, "a"); // System.out.println("Query text: "+qtxt); // System.out.println("Result: "+pq); Term t[] = pq.getTerms(); int pos[] = pq.getPositions(); for (int i = 0; i < t.length; i++) { // System.out.println(i+". "+t[i]+" pos: "+pos[i]); assertEquals("term " + i + " = " + t[i] + " has wrong term-position!", expectedPositions[i], pos[i]); } }
public void testStopwordsPosIncHole() throws Exception { Directory dir = newDirectory(); Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new MockTokenizer(reader); TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET); return new TokenStreamComponents(tokenizer, stream); } }; RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a); Document doc = new Document(); doc.add(new TextField("body", "just a", Field.Store.NO)); doc.add(new TextField("body", "test of gaps", Field.Store.NO)); iw.addDocument(doc); IndexReader ir = iw.getReader(); iw.close(); IndexSearcher is = newSearcher(ir); PhraseQuery pq = new PhraseQuery(); pq.add(new Term("body", "just"), 0); pq.add(new Term("body", "test"), 2); // body:"just ? test" assertEquals(1, is.search(pq, 5).totalHits); ir.close(); dir.close(); }
/**
 * Two values of the same not-tokenized field must be searchable as consecutive
 * phrase positions within a single document.
 */
public void testPositionIncrementMultiFields() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    writer.addDocument(makeDocumentWithFields());
    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);
    PhraseQuery phrase = new PhraseQuery();
    phrase.add(new Term("indexed_not_tokenized", "test1"));
    phrase.add(new Term("indexed_not_tokenized", "test2"));
    ScoreDoc[] hits = searcher.search(phrase, null, 1000).scoreDocs;
    assertEquals(1, hits.length);
    doAssert(searcher.doc(hits[0].doc), true);
    writer.close();
    reader.close();
    dir.close();
}
/** * Base implementation delegates to {@link #getFieldQuery(String,String,boolean)}. * This method may be overridden, for example, to return * a SpanNearQuery instead of a PhraseQuery. * */ protected Query getFieldQuery(String field, String queryText, int slop) throws SyntaxError { Query query = getFieldQuery(field, queryText, true); // only set slop of the phrase query was a result of this parser // and not a sub-parser. if (subQParser == null) { if (query instanceof PhraseQuery) { ((PhraseQuery) query).setSlop(slop); } if (query instanceof MultiPhraseQuery) { ((MultiPhraseQuery) query).setSlop(slop); } } return query; }
/**
 * Creates a simple phrase query from the cached tokenstream contents, placing
 * each token at its analyzed position (or at consecutive positions when position
 * increments are disabled) and applying the given slop.
 */
protected Query analyzePhrase(String field, TokenStream stream, int slop) throws IOException {
    PhraseQuery.Builder phraseBuilder = new PhraseQuery.Builder();
    phraseBuilder.setSlop(slop);
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    int position = -1;
    stream.reset();
    while (stream.incrementToken()) {
        // Honor analyzer gaps only when position increments are enabled.
        position += enablePositionIncrements ? posIncrAtt.getPositionIncrement() : 1;
        phraseBuilder.add(new Term(field, termAtt.getBytesRef()), position);
    }
    return phraseBuilder.build();
}
@Test public void testPositionIncrement() throws Exception { QueryParser qp = getParserConfig( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)); qp.setEnablePositionIncrements(true); String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\""; // 0 2 5 7 8 int expectedPositions[] = { 1, 3, 4, 6, 9 }; PhraseQuery pq = (PhraseQuery) getQuery(qtxt, qp); // System.out.println("Query text: "+qtxt); // System.out.println("Result: "+pq); Term t[] = pq.getTerms(); int pos[] = pq.getPositions(); for (int i = 0; i < t.length; i++) { // System.out.println(i+". "+t[i]+" pos: "+pos[i]); Assert.assertEquals("term " + i + " = " + t[i] + " has wrong term-position!", expectedPositions[i], pos[i]); } }
/**
 * Dispatches {@code q} to the type-specific rewrite overload.
 *
 * @throws IllegalArgumentException for query types without a rewrite overload
 */
public static Query rewrite(org.apache.lucene.search.Query q, Set<String> intFields) {
    if (q instanceof TermQuery) {
        return rewrite((TermQuery) q, intFields);
    }
    if (q instanceof BooleanQuery) {
        return rewrite((BooleanQuery) q, intFields);
    }
    if (q instanceof RangeQuery) {
        return rewrite((RangeQuery) q, intFields);
    }
    if (q instanceof ConstantScoreRangeQuery) {
        return rewrite((ConstantScoreRangeQuery) q, intFields);
    }
    if (q instanceof PrefixQuery) {
        return rewrite((PrefixQuery) q, intFields);
    }
    if (q instanceof PhraseQuery) {
        return rewrite((PhraseQuery) q, intFields);
    }
    throw new IllegalArgumentException("unsupported lucene query type: " + q.getClass().getSimpleName());
}
/**
 * Builds the highlighter's per-field lookup structures from {@code query}:
 * the query is flattened into primitive queries, their terms are cached, and
 * overlapping phrases are expanded before being registered in the root maps.
 *
 * @param query           the top-level query to decompose
 * @param reader          used to rewrite multi-term queries while flattening/saving terms
 * @param phraseHighlight when false, each term of a multi-term phrase is ALSO added
 *                        individually so single-term matches still highlight
 * @param fieldMatch      whether term lookups are segregated per field
 */
FieldQuery( Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch ) throws IOException {
    this.fieldMatch = fieldMatch;
    // LinkedHashSet keeps flattening order deterministic.
    Set<Query> flatQueries = new LinkedHashSet<Query>();
    flatten( query, reader, flatQueries );
    saveTerms( flatQueries, reader );
    Collection<Query> expandQueries = expand( flatQueries );
    for( Query flatQuery : expandQueries ){
        QueryPhraseMap rootMap = getRootMap( flatQuery );
        rootMap.add( flatQuery, reader );
        // Without phrase highlighting, register each phrase term on its own as well.
        if( !phraseHighlight && flatQuery instanceof PhraseQuery ){
            PhraseQuery pq = (PhraseQuery)flatQuery;
            if( pq.getTerms().length > 1 ){
                for( Term term : pq.getTerms() )
                    rootMap.addTerm( term, flatQuery.getBoost() );
            }
        }
    }
}
public void testPositionIncrement() throws Exception { StandardQueryParser qp = new StandardQueryParser(); qp.setAnalyzer( new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)); qp.setEnablePositionIncrements(true); String qtxt = "\"the words in poisitions pos02578 are stopped in this phrasequery\""; // 0 2 5 7 8 int expectedPositions[] = { 1, 3, 4, 6, 9 }; PhraseQuery pq = (PhraseQuery) qp.parse(qtxt, "a"); // System.out.println("Query text: "+qtxt); // System.out.println("Result: "+pq); Term t[] = pq.getTerms(); int pos[] = pq.getPositions(); for (int i = 0; i < t.length; i++) { // System.out.println(i+". "+t[i]+" pos: "+pos[i]); assertEquals("term " + i + " = " + t[i] + " has wrong term-position!", expectedPositions[i], pos[i]); } }
/**
 * Translates phrase queries produced by ngram analysis into an {@link NGramQuery}.
 * Non-phrase results from the default parser pass through untouched.
 */
protected Query getFieldQuery(String defaultField, String queryText) throws ParseException {
    Query parsed = super.getFieldQuery(defaultField, queryText);
    if (!(parsed instanceof PhraseQuery)) {
        log.debug("Returning default query. No phrase query translation.");
        return parsed;
    }
    // An ngram, when analyzed, becomes a series of smaller search terms that the
    // default parser groups into a PhraseQuery. Break each ngram term out of that
    // phrase and recombine them all into a BooleanQuery via NGramQuery.
    return new NGramQuery((PhraseQuery) parsed, useMust);
}
/**
 * Returns {@code q} with the given slop applied when it is a phrase-style query.
 * A PhraseQuery is rebuilt via its Builder (terms and positions preserved); a
 * MultiPhraseQuery is rebuilt only when the slop actually differs; any other
 * query is returned unchanged.
 */
private Query applySlop(Query q, int slop) {
    if (q instanceof MultiPhraseQuery) {
        MultiPhraseQuery mpq = (MultiPhraseQuery) q;
        // Rebuild only on an actual slop change to avoid needless copies.
        if (slop != mpq.getSlop()) {
            q = new MultiPhraseQuery.Builder(mpq).setSlop(slop).build();
        }
        return q;
    }
    if (!(q instanceof PhraseQuery)) {
        return q;
    }
    PhraseQuery phrase = (PhraseQuery) q;
    PhraseQuery.Builder rebuilt = new PhraseQuery.Builder();
    rebuilt.setSlop(slop);
    org.apache.lucene.index.Term[] terms = phrase.getTerms();
    int[] positions = phrase.getPositions();
    for (int i = 0; i < terms.length; i++) {
        rebuilt.add(terms[i], positions[i]);
    }
    return rebuilt.build();
}