protected Query doToQuery(QueryShardContext context) throws IOException {
    // Analyzer analyzer = context.getMapperService().searchAnalyzer();
    Analyzer analyzer = new WhitespaceAnalyzer();
    try (TokenStream source = analyzer.tokenStream(fieldName, value.toString())) {
        CachingTokenFilter stream = new CachingTokenFilter(new LowerCaseFilter(source));
        TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        if (termAtt == null) {
            return null;
        }
        List<CustomSpanTermQuery> clauses = new ArrayList<>();
        stream.reset();
        while (stream.incrementToken()) {
            // deep-copy: Term does not copy the BytesRef it is given, and the
            // attribute reuses its buffer on every incrementToken()
            Term term = new Term(fieldName, BytesRef.deepCopyOf(termAtt.getBytesRef()));
            clauses.add(new CustomSpanTermQuery(term));
        }
        return new PhraseCountQuery(clauses.toArray(new CustomSpanTermQuery[clauses.size()]),
                slop, inOrder, weightedCount);
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing query text", e);
    }
}
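The same token-to-span-terms loop can be written against stock Lucene, which may make the idiom easier to follow. The sketch below substitutes SpanTermQuery/SpanNearQuery for the plugin's CustomSpanTermQuery and PhraseCountQuery; the class name, field name, and import package locations (shown for a Lucene 5/6-era layout) are assumptions, not part of the example above.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.BytesRef;

public class SpanNearSketch {
    static SpanNearQuery toSpanNear(String field, String text, int slop, boolean inOrder)
            throws IOException {
        try (WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
                TokenStream source = analyzer.tokenStream(field, text);
                CachingTokenFilter stream = new CachingTokenFilter(new LowerCaseFilter(source))) {
            TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
            List<SpanQuery> clauses = new ArrayList<>();
            stream.reset();
            while (stream.incrementToken()) {
                // deep-copy for the same reason as above: the attribute's bytes are reused
                clauses.add(new SpanTermQuery(
                        new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()))));
            }
            stream.end();
            return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), slop, inOrder);
        }
    }
}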
public void testEndOffsetPositionWithCachingTokenFilter() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random());
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
    Document doc = new Document();
    try (TokenStream stream = analyzer.tokenStream("field", "abcd   ")) {
        stream.reset(); // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
        TokenStream cachedStream = new CachingTokenFilter(stream);
        FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
        customType.setStoreTermVectors(true);
        customType.setStoreTermVectorPositions(true);
        customType.setStoreTermVectorOffsets(true);
        Field f = new Field("field", cachedStream, customType);
        doc.add(f);
        doc.add(f);
        w.addDocument(doc);
    }
    w.close();

    IndexReader r = DirectoryReader.open(dir);
    TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
    assertNotNull(termsEnum.next());
    DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
    assertEquals(2, termsEnum.totalTermFreq());

    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.nextPosition();
    assertEquals(0, dpEnum.startOffset());
    assertEquals(4, dpEnum.endOffset());

    dpEnum.nextPosition();
    assertEquals(8, dpEnum.startOffset());
    assertEquals(12, dpEnum.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

    r.close();
    dir.close();
}
/**
 * Return a phrase {@link org.apache.lucene.search.highlight.Highlighter} appropriate for this field.
 *
 * @param query       the current Query
 * @param fieldName   the name of the field
 * @param request     the current SolrQueryRequest
 * @param tokenStream document text CachingTokenFilter
 * @return a phrase highlighter for the field
 * @throws IOException if there is a low-level I/O error
 */
protected Highlighter getPhraseHighlighter(Query query, String fieldName, SolrQueryRequest request,
        CachingTokenFilter tokenStream) throws IOException {
    SolrParams params = request.getParams();
    Highlighter highlighter = new Highlighter(
            getFormatter(fieldName, params),
            getEncoder(fieldName, params),
            getSpanQueryScorer(query, fieldName, tokenStream, request));
    highlighter.setTextFragmenter(getFragmenter(fieldName, params));
    return highlighter;
}
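For context, a hypothetical call site for the returned highlighter might look like the following. The docText variable and fragment count are illustrative assumptions, not part of the Solr method above; getBestFragments is the stock lucene-highlighter API and also throws InvalidTokenOffsetsException.

// Hypothetical usage; names other than getBestFragments are illustrative.
Highlighter highlighter = getPhraseHighlighter(query, fieldName, request, tokenStream);
String[] fragments = highlighter.getBestFragments(tokenStream, docText, 3);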
private void createChainProximityQueryClauses(BooleanQuery.Builder builder, String field,
        CachingTokenFilter stream, TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt)
        throws IOException {
    // sliding window over the two most recent terms and their start offsets
    Term[] termArr = new Term[2];
    long[] offsetArr = new long[2];

    while (stream.incrementToken()) {
        Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        if (termArr[0] == null) {
            termArr[0] = t;
            offsetArr[0] = offsetAtt.startOffset();
        } else if (termArr[1] == null) {
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();
        } else {
            // shift the window left and append the new term
            termArr[0] = termArr[1];
            offsetArr[0] = offsetArr[1];
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();
        }

        if (termArr[0] != null && termArr[1] != null) {
            long offsetDiff = offsetArr[1] - offsetArr[0];
            if (offsetDiff > 0) {
                // chain each term to its successor with a sloppy PhraseQuery
                PhraseQuery.Builder pq = new PhraseQuery.Builder();
                pq.setSlop((int) offsetDiff + 1);
                pq.add(termArr[0]);
                pq.add(termArr[1]);
                builder.add(pq.build(), BooleanClause.Occur.SHOULD);
            }
        }
    }
}
private void createNaiveKmerQueryClauses(BooleanQuery.Builder builder, String field,
        CachingTokenFilter stream, TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt)
        throws IOException {
    while (stream.incrementToken()) {
        Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        builder.add(new TermQuery(t), BooleanClause.Occur.SHOULD);
    }
}
private void createPairedProximityQueryClauses(BooleanQuery.Builder builder, String field,
        CachingTokenFilter stream, TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt)
        throws IOException {
    Term[] termArr = new Term[2];
    long[] offsetArr = new long[2];

    int count = 0;
    while (stream.incrementToken()) {
        Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        if (count % 2 == 0) {
            // first term of a pair
            termArr[0] = t;
            offsetArr[0] = offsetAtt.startOffset();
        } else {
            // second term of a pair: emit a sloppy PhraseQuery for the pair
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();

            long offsetDiff = offsetArr[1] - offsetArr[0];
            if (offsetDiff > 0) {
                PhraseQuery.Builder pq = new PhraseQuery.Builder();
                pq.setSlop((int) offsetDiff + 1);
                pq.add(termArr[0]);
                pq.add(termArr[1]);
                builder.add(pq.build(), BooleanClause.Occur.SHOULD);
            }
            termArr[0] = null;
            termArr[1] = null;
        }
        count++;
    }

    // odd token count: the final unpaired term becomes a plain TermQuery
    if (termArr[0] != null) {
        builder.add(new TermQuery(termArr[0]), BooleanClause.Occur.SHOULD);
        termArr[0] = null;
    }
}
protected BooleanQuery createQueryClauses(KmerQueryAnalyzer analyzer, String field, String queryText,
        QueryGenerationAlgorithm queryGenerationAlgorithm) {
    try (TokenStream source = analyzer.tokenStream(field, queryText);
            CachingTokenFilter stream = new CachingTokenFilter(source)) {
        TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        if (termAtt == null) {
            return null;
        }

        // phase 1: read through the stream and assess the situation:
        // counting the number of tokens/positions and marking if we have any synonyms.
        int numTokens = 0;
        stream.reset();
        while (stream.incrementToken()) {
            numTokens++;
        }

        // phase 2: based on token count, presence of synonyms, and options
        // formulate a single term, boolean, or phrase.
        if (numTokens == 0) {
            return null;
        } else if (numTokens == 1) {
            // single term
            return null;
        } else {
            BooleanQuery.Builder q = new BooleanQuery.Builder();
            q.setDisableCoord(false);

            TermToBytesRefAttribute termAttB = stream.getAttribute(TermToBytesRefAttribute.class);
            OffsetAttribute offsetAtt = stream.getAttribute(OffsetAttribute.class);
            // second pass: reset() replays the cached tokens instead of re-analyzing
            stream.reset();
            if (queryGenerationAlgorithm.equals(QueryGenerationAlgorithm.NAIVE_KMER)) {
                createNaiveKmerQueryClauses(q, field, stream, termAttB, offsetAtt);
            } else if (queryGenerationAlgorithm.equals(QueryGenerationAlgorithm.CHAIN_PROXIMITY)) {
                createChainProximityQueryClauses(q, field, stream, termAttB, offsetAtt);
            } else if (queryGenerationAlgorithm.equals(QueryGenerationAlgorithm.PAIRED_PROXIMITY)) {
                createPairedProximityQueryClauses(q, field, stream, termAttB, offsetAtt);
            }
            return q.build();
        }
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing query text", e);
    }
}
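The method above relies on CachingTokenFilter's replay behavior: the first pass consumes and caches the tokens, and the second reset() rewinds over the cache rather than re-running the analyzer. A minimal self-contained sketch of that two-pass idiom, using only stock Lucene classes (the class name, field name, and sample text are illustrative):

import java.io.IOException;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TwoPassSketch {
    public static void main(String[] args) throws IOException {
        try (WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
                TokenStream source = analyzer.tokenStream("field", "ACGT CGTA GTAC");
                CachingTokenFilter stream = new CachingTokenFilter(source)) {
            CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);

            // pass 1: consume and count; the filter caches tokens as a side effect
            int numTokens = 0;
            stream.reset();
            while (stream.incrementToken()) {
                numTokens++;
            }

            // pass 2: reset() rewinds over the cache instead of re-analyzing
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            stream.end();
            System.out.println(numTokens + " tokens");
        }
    }
}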
public void testEndOffsetPositionWithCachingTokenFilter() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random());
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    IOException priorException = null;
    TokenStream stream = analyzer.tokenStream("field", "abcd   ");
    try {
        stream.reset(); // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
        TokenStream cachedStream = new CachingTokenFilter(stream);
        FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
        customType.setStoreTermVectors(true);
        customType.setStoreTermVectorPositions(true);
        customType.setStoreTermVectorOffsets(true);
        Field f = new Field("field", cachedStream, customType);
        doc.add(f);
        doc.add(f);
        w.addDocument(doc);
    } catch (IOException e) {
        priorException = e;
    } finally {
        IOUtils.closeWhileHandlingException(priorException, stream);
    }
    w.close();

    IndexReader r = DirectoryReader.open(dir);
    TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
    assertNotNull(termsEnum.next());
    DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
    assertEquals(2, termsEnum.totalTermFreq());

    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    dpEnum.nextPosition();
    assertEquals(0, dpEnum.startOffset());
    assertEquals(4, dpEnum.endOffset());

    dpEnum.nextPosition();
    assertEquals(8, dpEnum.startOffset());
    assertEquals(12, dpEnum.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

    r.close();
    dir.close();
}