Example source code for the Java class org.apache.lucene.analysis.CachingTokenFilter

Project: pyramid    File: PhraseCountQueryBuilder.java
protected Query doToQuery(QueryShardContext context) throws IOException {
//        Analyzer analyzer = context.getMapperService().searchAnalyzer();
        Analyzer analyzer = new WhitespaceAnalyzer();
        try (TokenStream source = analyzer.tokenStream(fieldName, value.toString())) {
            CachingTokenFilter stream = new CachingTokenFilter(new LowerCaseFilter(source));
            TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
            if (termAtt == null) {
                return null;
            }
            List<CustomSpanTermQuery> clauses = new ArrayList<>();
            stream.reset();
            while (stream.incrementToken()) {
                // deepCopyOf is required: the attribute's BytesRef is reused
                // across incrementToken() calls, so each Term must own a copy.
                Term term = new Term(fieldName, BytesRef.deepCopyOf(termAtt.getBytesRef()));
                clauses.add(new CustomSpanTermQuery(term));
            }
            return new PhraseCountQuery(clauses.toArray(new CustomSpanTermQuery[clauses.size()]), slop, inOrder, weightedCount);
        } catch (IOException e) {
            throw new RuntimeException("Error analyzing query text", e);
        }
    }
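The query builder above walks the cached stream only once; the point of CachingTokenFilter shows up when the same tokens must be walked several times (as in the createQueryClauses example further down, which counts tokens in one pass and builds clauses in a second). Below is a minimal, self-contained sketch of the replay behavior, assuming only the Lucene core and analyzers-common jars on the classpath; the class and field names are illustrative, not taken from any project on this page. In recent Lucene versions the first reset() is forwarded to the wrapped stream; in older versions the input had to be reset before wrapping, which is why the TestTermVectorsWriter tests below reset the stream first.
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CachingTokenFilterDemo {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new WhitespaceAnalyzer();
    try (TokenStream source = analyzer.tokenStream("field", "quick brown fox");
         CachingTokenFilter cached = new CachingTokenFilter(source)) {
      CharTermAttribute termAtt = cached.getAttribute(CharTermAttribute.class);

      cached.reset();                   // first pass: pulls from the wrapped stream and fills the cache
      while (cached.incrementToken()) {
        System.out.println("pass 1: " + termAtt);
      }

      cached.reset();                   // second pass: replays the cache
      while (cached.incrementToken()) {
        System.out.println("pass 2: " + termAtt);  // wrapped stream is not consumed again
      }
    }
    analyzer.close();
  }
}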
Project: search    File: TestTermVectorsWriter.java
public void testEndOffsetPositionWithCachingTokenFilter() throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  Document doc = new Document();
  try (TokenStream stream = analyzer.tokenStream("field", "abcd   ")) {
    stream.reset(); // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
    TokenStream cachedStream = new CachingTokenFilter(stream);
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorOffsets(true);
    Field f = new Field("field", cachedStream, customType);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
  }
  w.close();

  IndexReader r = DirectoryReader.open(dir);
  TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
  assertNotNull(termsEnum.next());
  DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
  assertEquals(2, termsEnum.totalTermFreq());

  assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  dpEnum.nextPosition();
  assertEquals(0, dpEnum.startOffset());
  assertEquals(4, dpEnum.endOffset());

  dpEnum.nextPosition();
  assertEquals(8, dpEnum.startOffset());
  assertEquals(12, dpEnum.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

  r.close();
  dir.close();
}
Project: search    File: DefaultSolrHighlighter.java
/**
 * Return a phrase {@link org.apache.lucene.search.highlight.Highlighter} appropriate for this field.
 * @param query The current Query
 * @param fieldName The name of the field
 * @param request The current SolrQueryRequest
 * @param tokenStream the document text as a CachingTokenFilter
 * @throws IOException If there is a low-level I/O error.
 */
protected Highlighter getPhraseHighlighter(Query query, String fieldName, SolrQueryRequest request, CachingTokenFilter tokenStream) throws IOException {
  SolrParams params = request.getParams();
  Highlighter highlighter = new Highlighter(
      getFormatter(fieldName, params),
      getEncoder(fieldName, params),
      getSpanQueryScorer(query, fieldName, tokenStream, request));

  highlighter.setTextFragmenter(getFragmenter(fieldName, params));

  return highlighter;
}
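For orientation, a hedged sketch of how a caller typically drives the returned highlighter; docText and schemaAnalyzer are placeholders, and only getBestFragments(TokenStream, String, int) from the stock org.apache.lucene.search.highlight.Highlighter API is assumed. Wrapping the stream in a CachingTokenFilter lets the span scorer consume the tokens once to locate matches and lets the highlighter replay them to cut fragments, without re-analyzing the document text.
// Hypothetical caller; query, fieldName, request, schemaAnalyzer, and docText
// are assumed to be in scope.
CachingTokenFilter tstream =
    new CachingTokenFilter(schemaAnalyzer.tokenStream(fieldName, docText));
Highlighter highlighter = getPhraseHighlighter(query, fieldName, request, tstream);
// getBestFragments throws IOException and InvalidTokenOffsetsException
String[] fragments = highlighter.getBestFragments(tstream, docText, 3);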
Project: biospectra    File: Classifier.java
private void createChainProximityQueryClauses(BooleanQuery.Builder builder, String field, CachingTokenFilter stream, TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt) throws IOException {
    // sliding window of the two most recent terms and their start offsets
    Term[] termArr = new Term[2];    // elements default to null
    long[] offsetArr = new long[2];  // elements default to 0

    while (stream.incrementToken()) {
        Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        if(termArr[0] == null) {
            termArr[0] = t;
            offsetArr[0] = offsetAtt.startOffset();
        } else if(termArr[1] == null) {
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();
        } else {
            // shift
            termArr[0] = termArr[1];
            offsetArr[0] = offsetArr[1];
            // fill
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();
        }

        if(termArr[0] != null && termArr[1] != null) {
            long offsetDiff = offsetArr[1] - offsetArr[0];
            if(offsetDiff > 0) {
                PhraseQuery.Builder pq = new PhraseQuery.Builder();

                pq.setSlop((int) (offsetDiff) + 1);
                pq.add(termArr[0]);
                pq.add(termArr[1]);

                builder.add(pq.build(), BooleanClause.Occur.SHOULD);
            }
        }
    }
}
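To make the sliding window concrete, here is an illustrative trace (term texts, field name, and offsets are invented): tokens "ACG", "CGT", "GTA" at start offsets 0, 1, 2 yield two SHOULD clauses, each a two-term PhraseQuery whose slop is the offset gap plus one. The equivalent hand-built clauses:
// Illustrative only; "sequence" is a made-up field name.
PhraseQuery.Builder first = new PhraseQuery.Builder();
first.setSlop(2);                          // (1 - 0) + 1
first.add(new Term("sequence", "ACG"));
first.add(new Term("sequence", "CGT"));

PhraseQuery.Builder second = new PhraseQuery.Builder();
second.setSlop(2);                         // (2 - 1) + 1
second.add(new Term("sequence", "CGT"));
second.add(new Term("sequence", "GTA"));

BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(first.build(), BooleanClause.Occur.SHOULD);
builder.add(second.build(), BooleanClause.Occur.SHOULD);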
Project: NYBC / search-core / read-open-source-code    File: DefaultSolrHighlighter.java
(These projects contain a getPhraseHighlighter implementation identical to the one shown above for the search project.)
Project: biospectra    File: Classifier.java
private void createNaiveKmerQueryClauses(BooleanQuery.Builder builder, String field, CachingTokenFilter stream, TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt) throws IOException {
    while (stream.incrementToken()) {
        Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        builder.add(new TermQuery(t), BooleanClause.Occur.SHOULD);
    }
}
Project: biospectra    File: Classifier.java
private void createPairedProximityQueryClauses(BooleanQuery.Builder builder, String field, CachingTokenFilter stream, TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt) throws IOException {
    // current pair of terms and their start offsets
    Term[] termArr = new Term[2];    // elements default to null
    long[] offsetArr = new long[2];  // elements default to 0

    int count = 0;
    while (stream.incrementToken()) {
        Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        if(count % 2 == 0) {
            termArr[0] = t;
            offsetArr[0] = offsetAtt.startOffset();
        } else {
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();

            long offsetDiff = offsetArr[1] - offsetArr[0];
            if(offsetDiff > 0) {
                PhraseQuery.Builder pq = new PhraseQuery.Builder();

                pq.setSlop((int) (offsetDiff) + 1);
                pq.add(termArr[0]);
                pq.add(termArr[1]);

                builder.add(pq.build(), BooleanClause.Occur.SHOULD);
            }

            termArr[0] = null;
            termArr[1] = null;
        }

        count++;
    }

    if(termArr[0] != null) {
        builder.add(new TermQuery(termArr[0]), BooleanClause.Occur.SHOULD);
        termArr[0] = null;
    }
}
Project: biospectra    File: Classifier.java
protected BooleanQuery createQueryClauses(KmerQueryAnalyzer analyzer, String field, String queryText, QueryGenerationAlgorithm queryGenerationAlgorithm) {
    try (TokenStream source = analyzer.tokenStream(field, queryText);
        CachingTokenFilter stream = new CachingTokenFilter(source)) {

        TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);

        if (termAtt == null) {
            return null;
        }

        // phase 1: read through the stream once to count tokens;
        // CachingTokenFilter caches them for the second pass.
        int numTokens = 0;

        stream.reset();
        while (stream.incrementToken()) {
            numTokens++;
        }

        // phase 2: based on the token count, formulate the query;
        // the second reset() below replays the cached tokens.
        if (numTokens == 0) {
            return null;
        } else if (numTokens == 1) {
            // single term
            return null;
        } else {
            BooleanQuery.Builder q = new BooleanQuery.Builder();
            q.setDisableCoord(false);

            TermToBytesRefAttribute termAttB = stream.getAttribute(TermToBytesRefAttribute.class);
            OffsetAttribute offsetAtt = stream.getAttribute(OffsetAttribute.class);

            stream.reset();

            if (queryGenerationAlgorithm.equals(QueryGenerationAlgorithm.NAIVE_KMER)) {
                createNaiveKmerQueryClauses(q, field, stream, termAttB, offsetAtt);
            } else if(queryGenerationAlgorithm.equals(QueryGenerationAlgorithm.CHAIN_PROXIMITY)) {
                createChainProximityQueryClauses(q, field, stream, termAttB, offsetAtt);
            } else if(queryGenerationAlgorithm.equals(QueryGenerationAlgorithm.PAIRED_PROXIMITY)) {
                createPairedProximityQueryClauses(q, field, stream, termAttB, offsetAtt);
            }

            return q.build();
        }
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing query text", e);
    }
}
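A hedged invocation sketch for the dispatcher above; the KmerQueryAnalyzer construction is project-specific and elided, and the field name "sequence" is a guess, not taken from the biospectra sources:
// Hypothetical caller inside Classifier; analyzer construction elided.
BooleanQuery buildChainProximityQuery(KmerQueryAnalyzer analyzer, String sequenceText) {
    BooleanQuery q = createQueryClauses(analyzer, "sequence", sequenceText,
            QueryGenerationAlgorithm.CHAIN_PROXIMITY);
    // null signals that analysis produced zero tokens, or a single token,
    // which this method declines to turn into a boolean query.
    return q;
}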
Project: Maskana-Gestor-de-Conocimiento    File: TestTermVectorsWriter.java
public void testEndOffsetPositionWithCachingTokenFilter() throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
  Document doc = new Document();
  IOException priorException = null;
  TokenStream stream = analyzer.tokenStream("field", "abcd   ");
  try {
    stream.reset(); // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
    TokenStream cachedStream = new CachingTokenFilter(stream);
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setStoreTermVectors(true);
    customType.setStoreTermVectorPositions(true);
    customType.setStoreTermVectorOffsets(true);
    Field f = new Field("field", cachedStream, customType);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
  } catch (IOException e) {
    priorException = e;
  } finally {
    IOUtils.closeWhileHandlingException(priorException, stream);
  }
  w.close();

  IndexReader r = DirectoryReader.open(dir);
  TermsEnum termsEnum = r.getTermVectors(0).terms("field").iterator(null);
  assertNotNull(termsEnum.next());
  DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
  assertEquals(2, termsEnum.totalTermFreq());

  assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  dpEnum.nextPosition();
  assertEquals(0, dpEnum.startOffset());
  assertEquals(4, dpEnum.endOffset());

  dpEnum.nextPosition();
  assertEquals(8, dpEnum.startOffset());
  assertEquals(12, dpEnum.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());

  r.close();
  dir.close();
}