Java class org.apache.lucene.index.PostingsEnum: example source code

Project: elasticsearch_my    File: OrdinalsBuilder.java
/**
 * This method iterates over all terms in the given {@link TermsEnum} and
 * associates each term's ordinal with the term's documents. The caller must
 * exhaust the returned {@link BytesRefIterator}, which returns all values;
 * the first returned value is associated with the ordinal <tt>1</tt>, etc.
 * <p>
 * If the {@link TermsEnum} contains prefix-coded numerical values, the terms
 * enum should be wrapped with either {@link #wrapNumeric32Bit(TermsEnum)}
 * or {@link #wrapNumeric64Bit(TermsEnum)}, depending on its precision. If
 * the {@link TermsEnum} is not wrapped, the returned
 * {@link BytesRefIterator} will contain partial-precision terms rather than
 * only full-precision terms.
 * </p>
 */
public BytesRefIterator buildFromTerms(final TermsEnum termsEnum) throws IOException {
    return new BytesRefIterator() {
        private PostingsEnum docsEnum = null;

        @Override
        public BytesRef next() throws IOException {
            BytesRef ref;
            if ((ref = termsEnum.next()) != null) {
                docsEnum = termsEnum.postings(docsEnum, PostingsEnum.NONE);
                nextOrdinal();
                int docId;
                while ((docId = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    addDoc(docId);
                }
            }
            return ref;
        }
    };
}
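Usage note: as the javadoc says, callers must fully drain the returned iterator. A minimal sketch of the expected call pattern (the `builder` variable name is illustrative, not from the project):

static void consumeOrdinals(OrdinalsBuilder builder, TermsEnum termsEnum) throws IOException {
    BytesRefIterator it = builder.buildFromTerms(termsEnum);
    // the first returned value is associated with ordinal 1, the next with 2, ...
    for (BytesRef term = it.next(); term != null; term = it.next()) {
        // process term
    }
}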
Project: elasticsearch_my    File: FreqTermsEnum.java
public FreqTermsEnum(IndexReader reader, String field, boolean needDocFreq, boolean needTotalTermFreq, @Nullable Query filter, BigArrays bigArrays) throws IOException {
    super(reader, field, needTotalTermFreq ? PostingsEnum.FREQS : PostingsEnum.NONE, filter);
    this.bigArrays = bigArrays;
    this.needDocFreqs = needDocFreq;
    this.needTotalTermFreqs = needTotalTermFreq;
    if (needDocFreq) {
        termDocFreqs = bigArrays.newIntArray(INITIAL_NUM_TERM_FREQS_CACHED, false);
    } else {
        termDocFreqs = null;
    }
    if (needTotalTermFreq) {
        termsTotalFreqs = bigArrays.newLongArray(INITIAL_NUM_TERM_FREQS_CACHED, false);
    } else {
        termsTotalFreqs = null;
    }
    cachedTermOrds = new BytesRefHash(INITIAL_NUM_TERM_FREQS_CACHED, bigArrays);
}
Project: elasticsearch_my    File: TermVectorsWriter.java
private PostingsEnum writeTermWithDocsAndPos(TermsEnum iterator, PostingsEnum docsAndPosEnum, boolean positions,
                                                     boolean offsets, boolean payloads) throws IOException {
    docsAndPosEnum = iterator.postings(docsAndPosEnum, PostingsEnum.ALL);
    // for each term (iterator next) in this field (field)
    // iterate over the docs (should only be one)
    int nextDoc = docsAndPosEnum.nextDoc();
    assert nextDoc != DocIdSetIterator.NO_MORE_DOCS;
    final int freq = docsAndPosEnum.freq();
    writeFreq(freq);
    for (int j = 0; j < freq; j++) {
        int curPos = docsAndPosEnum.nextPosition();
        if (positions) {
            writePosition(curPos);
        }
        if (offsets) {
            writeOffsets(docsAndPosEnum.startOffset(), docsAndPosEnum.endOffset());
        }
        if (payloads) {
            writePayload(docsAndPosEnum.getPayload());
        }
    }
    nextDoc = docsAndPosEnum.nextDoc();
    assert nextDoc == DocIdSetIterator.NO_MORE_DOCS;
    return docsAndPosEnum;
}
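The method above follows the PostingsEnum read contract: after nextDoc() positions the enum on a document, nextPosition() must be called exactly freq() times, and offsets/payloads are only valid right after a nextPosition() call. The contract in isolation (a sketch):

static void readAllPositions(TermsEnum termsEnum) throws IOException {
    PostingsEnum pe = termsEnum.postings(null, PostingsEnum.ALL);
    while (pe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        int freq = pe.freq();
        for (int i = 0; i < freq; i++) {
            int pos = pe.nextPosition();
            int start = pe.startOffset();       // -1 if offsets were not indexed
            int end = pe.endOffset();           // -1 if offsets were not indexed
            BytesRef payload = pe.getPayload(); // null if no payload at this position
        }
    }
}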
Project: elasticsearch_my    File: TermVectorsResponse.java
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
    // start term, optimized writing
    BytesRef term = termIter.next();
    spare.copyUTF8Bytes(term);
    builder.startObject(spare.toString());
    buildTermStatistics(builder, termIter);
    // finally write the term vectors
    PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL);
    int termFreq = posEnum.freq();
    builder.field(FieldStrings.TERM_FREQ, termFreq);
    initMemory(curTerms, termFreq);
    initValues(curTerms, posEnum, termFreq);
    buildValues(builder, curTerms, termFreq);
    buildScore(builder, boostAtt);
    builder.endObject();
}
Project: elasticsearch_my    File: TermVectorsResponse.java
private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
    for (int j = 0; j < termFreq; j++) {
        int nextPos = posEnum.nextPosition();
        if (curTerms.hasPositions()) {
            currentPositions[j] = nextPos;
        }
        if (curTerms.hasOffsets()) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (curTerms.hasPayloads()) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }
        }
    }
}
Project: lucene-custom-query    File: PostingsAndFreq.java
public PostingsAndFreq(PostingsEnum postings, int position, Term... terms) {
  this.postings = postings;
  this.position = position;
  nTerms = terms==null ? 0 : terms.length;
  if (nTerms>0) {
    if (terms.length==1) {
      this.terms = terms;
    } else {
      Term[] terms2 = new Term[terms.length];
      System.arraycopy(terms, 0, terms2, 0, terms.length);
      Arrays.sort(terms2);
      this.terms = terms2;
    }
  } else {
    this.terms = null;
  }
}
Project: Elasticsearch    File: SignificantTermsAggregatorFactory.java
/**
 * Creates the TermsEnum (if not already created); must be called before any call to getBackgroundFrequency.
 * @param context The aggregation context
 * @return The number of documents in the index (after the optional filter, if any, has been applied)
 */
public long prepareBackground(AggregationContext context) {
    if (termsEnum != null) {
        // already prepared - return 
        return termsEnum.getNumDocs();
    }
    SearchContext searchContext = context.searchContext();
    IndexReader reader = searchContext.searcher().getIndexReader();
    try {
        if (numberOfAggregatorsCreated == 1) {
            // Set up a termsEnum for sole use by one aggregator
            termsEnum = new FilterableTermsEnum(reader, indexedFieldName, PostingsEnum.NONE, filter);
        } else {
            // When we have > 1 agg there is the possibility of duplicate term frequency lookups,
            // so we use a TermsEnum that caches the results of all term lookups
            termsEnum = new FreqTermsEnum(reader, indexedFieldName, true, false, filter, searchContext.bigArrays());
        }
    } catch (IOException e) {
        throw new ElasticsearchException("failed to build terms enumeration", e);
    }
    return termsEnum.getNumDocs();
}
Project: Elasticsearch    File: OrdinalsBuilder.java
/**
 * This method iterates over all terms in the given {@link TermsEnum} and
 * associates each term's ordinal with the term's documents. The caller must
 * exhaust the returned {@link BytesRefIterator}, which returns all values;
 * the first returned value is associated with the ordinal <tt>1</tt>, etc.
 * <p>
 * If the {@link TermsEnum} contains prefix-coded numerical values, the terms
 * enum should be wrapped with either {@link #wrapNumeric32Bit(TermsEnum)}
 * or {@link #wrapNumeric64Bit(TermsEnum)}, depending on its precision. If
 * the {@link TermsEnum} is not wrapped, the returned
 * {@link BytesRefIterator} will contain partial-precision terms rather than
 * only full-precision terms.
 * </p>
 */
public BytesRefIterator buildFromTerms(final TermsEnum termsEnum) throws IOException {
    return new BytesRefIterator() {
        private PostingsEnum docsEnum = null;

        @Override
        public BytesRef next() throws IOException {
            BytesRef ref;
            if ((ref = termsEnum.next()) != null) {
                docsEnum = termsEnum.postings(docsEnum, PostingsEnum.NONE);
                nextOrdinal();
                int docId;
                while ((docId = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    addDoc(docId);
                }
            }
            return ref;
        }
    };
}
Project: Elasticsearch    File: FreqTermsEnum.java
public FreqTermsEnum(IndexReader reader, String field, boolean needDocFreq, boolean needTotalTermFreq, @Nullable Query filter, BigArrays bigArrays) throws IOException {
    super(reader, field, needTotalTermFreq ? PostingsEnum.FREQS : PostingsEnum.NONE, filter);
    this.bigArrays = bigArrays;
    this.needDocFreqs = needDocFreq;
    this.needTotalTermFreqs = needTotalTermFreq;
    if (needDocFreq) {
        termDocFreqs = bigArrays.newIntArray(INITIAL_NUM_TERM_FREQS_CACHED, false);
    } else {
        termDocFreqs = null;
    }
    if (needTotalTermFreq) {
        termsTotalFreqs = bigArrays.newLongArray(INITIAL_NUM_TERM_FREQS_CACHED, false);
    } else {
        termsTotalFreqs = null;
    }
    cachedTermOrds = new BytesRefHash(INITIAL_NUM_TERM_FREQS_CACHED, bigArrays);
}
Project: Elasticsearch    File: TermVectorsResponse.java
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
    // start term, optimized writing
    BytesRef term = termIter.next();
    spare.copyUTF8Bytes(term);
    builder.startObject(spare.toString());
    buildTermStatistics(builder, termIter);
    // finally write the term vectors
    PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL);
    int termFreq = posEnum.freq();
    builder.field(FieldStrings.TERM_FREQ, termFreq);
    initMemory(curTerms, termFreq);
    initValues(curTerms, posEnum, termFreq);
    buildValues(builder, curTerms, termFreq);
    buildScore(builder, boostAtt);
    builder.endObject();
}
Project: Elasticsearch    File: TermVectorsResponse.java
private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
    for (int j = 0; j < termFreq; j++) {
        int nextPos = posEnum.nextPosition();
        if (curTerms.hasPositions()) {
            currentPositions[j] = nextPos;
        }
        if (curTerms.hasOffsets()) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (curTerms.hasPayloads()) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }
        }
    }
}
Project: solrplugins    File: ProofOfConceptPayloadHandler.java
private NamedList<Object> buildEntryValue(long count, Term t, List<Entry<LeafReader, Bits>> leaves) throws IOException {
  NamedList<Object> entry = new NamedList<>();
  entry.add("count", count);
  int i = -1;
  for (Entry<LeafReader, Bits> e : leaves) {
    PostingsEnum postings = e.getKey().postings(t, PostingsEnum.PAYLOADS);
    Bits liveDocs = e.getValue();
    while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      if (!liveDocs.get(postings.docID())) {
        continue;
      }
      i++;
      NamedList<Object> documentEntry = new NamedList<>();
      entry.add("doc" + i, documentEntry);
      for (int j = 0; j < postings.freq(); j++) {
        postings.nextPosition();
        String extra = postings.getPayload().utf8ToString();
        documentEntry.add("position" + j, extra);
      }
    }
  }
  return entry;
}
Project: semanticvectors    File: LuceneUtils.java
/**
 * Gets the 1 - entropy (i.e. 1 + sum(p log p)) of a term,
 * a function that favors terms that are focally distributed.
 * We use the definition of log-entropy weighting provided in
 * Martin and Berry (2007):
 * Entropy = 1 + sum((Pij log2(Pij)) / log2(n))
 * where Pij = frequency of term i in doc j / global frequency of term i
 *       n   = number of documents in the collection
 * @param term the term whose entropy you want
 * Thanks to Vidya Vasuki for adding the hash table to
 * eliminate redundant calculation
 */
private float getEntropy(Term term) {
  if (termEntropy.containsKey(term))
    return termEntropy.get(term);
  int gf = getGlobalTermFreq(term);
  double entropy = 0;
  try {
    PostingsEnum docsEnum = this.getDocsForTerm(term);
    while ((docsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
      double p = docsEnum.freq(); //frequency in this document
      p = p / gf;    //frequency across all documents
      entropy += p * (Math.log(p) / Math.log(2)); //sum of Plog(P)
    }
    int n = this.getNumDocs();
    double log2n = Math.log(n) / Math.log(2);
    entropy = entropy / log2n;
  } catch (IOException e) {
    logger.info("Couldn't get term entropy for term " + term.text());
  }
  termEntropy.put(term, 1 + (float) entropy);
  return (float) (1 + entropy);
}
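Restating the weighting from the javadoc in standard notation (tf_ij is the frequency of term i in document j, gf_i its global frequency, n the number of documents):

    w_i = 1 + \sum_j \frac{P_{ij} \log_2 P_{ij}}{\log_2 n}, \qquad P_{ij} = \frac{\mathrm{tf}_{ij}}{\mathrm{gf}_i}

Since each P_ij log2(P_ij) term is non-positive, the weight falls in [0, 1]: 1 for a term concentrated in a single document, 0 for a term spread evenly over all n documents.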
Project: clue    File: ReconstructCommand.java
public String reconstructNoPositions(TermsEnum te, int docid, Bits liveDocs) throws IOException{
  List<String> textList = new ArrayList<String>();
  BytesRef text;
  PostingsEnum postings = null;
  while ((text = te.next()) != null) {
    postings = te.postings(postings, PostingsEnum.FREQS);
    int iterDoc = postings.advance(docid);
    if (iterDoc == docid) {
      textList.add(text.utf8ToString());
    }
  }
  StringBuilder buf = new StringBuilder();
  for (String s : textList) {
    buf.append(s).append(' ');
  }
  return buf.toString();
}
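advance(docid) positions the enum on the first document whose ID is >= docid, which is why the code checks iterDoc == docid before accepting the term. The same check in isolation (a sketch):

static boolean termOccursInDoc(TermsEnum te, int docId) throws IOException {
    PostingsEnum pe = te.postings(null, PostingsEnum.NONE);
    return pe.advance(docId) == docId; // advance() may overshoot to a later doc
}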
Project: elasticsearch_my    File: ThrowingLeafReaderWrapper.java
@Override
public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
    if ((flags & PostingsEnum.POSITIONS) != 0) {
        thrower.maybeThrow(Flags.DocsAndPositionsEnum);
    } else {
        thrower.maybeThrow(Flags.DocsEnum);
    }
    return super.postings(reuse, flags);
}
Project: elasticsearch_my    File: SignificantTermsAggregatorFactory.java
private FilterableTermsEnum getTermsEnum(String field) throws IOException {
    if (termsEnum != null) {
        return termsEnum;
    }
    IndexReader reader = context.searcher().getIndexReader();
    if (numberOfAggregatorsCreated > 1) {
        termsEnum = new FreqTermsEnum(reader, field, true, false, filter, context.bigArrays());
    } else {
        termsEnum = new FilterableTermsEnum(reader, indexedFieldName, PostingsEnum.NONE, filter);
    }
    return termsEnum;
}
Project: elasticsearch_my    File: TermsSliceQuery.java
/**
 * Returns a DocIdSet per segments containing the matching docs for the specified slice.
 */
private DocIdSet build(LeafReader reader) throws IOException {
    final DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc());
    final Terms terms = reader.terms(getField());
    final TermsEnum te = terms.iterator();
    PostingsEnum docsEnum = null;
    for (BytesRef term = te.next(); term != null; term = te.next()) {
        int hashCode = term.hashCode();
        if (contains(hashCode)) {
            docsEnum = te.postings(docsEnum, PostingsEnum.NONE);
            builder.add(docsEnum);
        }
    }
    return builder.build();
}
Project: elasticsearch_my    File: IndexFieldTerm.java
private int convertToLuceneFlags(int flags) {
    int lucenePositionsFlags = PostingsEnum.NONE;
    lucenePositionsFlags |= (flags & IndexLookup.FLAG_FREQUENCIES) > 0 ? PostingsEnum.FREQS : 0x0;
    lucenePositionsFlags |= (flags & IndexLookup.FLAG_POSITIONS) > 0 ? PostingsEnum.POSITIONS : 0x0;
    lucenePositionsFlags |= (flags & IndexLookup.FLAG_PAYLOADS) > 0 ? PostingsEnum.PAYLOADS : 0x0;
    lucenePositionsFlags |= (flags & IndexLookup.FLAG_OFFSETS) > 0 ? PostingsEnum.OFFSETS : 0x0;
    return lucenePositionsFlags;
}
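This works because the PostingsEnum flag constants are bit flags designed to be OR-ed together (each richer flag already implies the cheaper ones, e.g. POSITIONS implies FREQS). A combined value can be tested with PostingsEnum.featureRequested:

int flags = PostingsEnum.FREQS | PostingsEnum.POSITIONS;
assert PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS);
assert !PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS);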
Project: elasticsearch_my    File: IndexFieldTerm.java
private PostingsEnum getPostings(int luceneFlags, LeafReader reader) throws IOException {
    assert identifier.field() != null;
    assert identifier.bytes() != null;
    final Fields fields = reader.fields();
    PostingsEnum newPostings = null;
    if (fields != null) {
        final Terms terms = fields.terms(identifier.field());
        if (terms != null) {
            TermsEnum termsEnum = terms.iterator();
            if (termsEnum.seekExact(identifier.bytes())) {
                newPostings = termsEnum.postings(postings, luceneFlags);
                final Bits liveDocs = reader.getLiveDocs();
                if (liveDocs != null) {
                    newPostings = new FilterPostingsEnum(newPostings) {
                        private int doNext(int d) throws IOException {
                            for (; d != NO_MORE_DOCS; d = super.nextDoc()) {
                                if (liveDocs.get(d)) {
                                    return d;
                                }
                            }
                            return NO_MORE_DOCS;
                        }
                        @Override
                        public int nextDoc() throws IOException {
                            return doNext(super.nextDoc());
                        }
                        @Override
                        public int advance(int target) throws IOException {
                            return doNext(super.advance(target));
                        }
                    };
                }
            }
        }
    }
    return newPostings;
}
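The anonymous FilterPostingsEnum above implements the standard post-filtering idiom: a PostingsEnum also enumerates deleted documents, so every candidate docID has to be checked against the reader's live-docs bitset. The same idiom written on the caller side (a sketch, not code from the project):

static void visitLiveDocs(PostingsEnum pe, Bits liveDocs) throws IOException {
    for (int d = pe.nextDoc(); d != DocIdSetIterator.NO_MORE_DOCS; d = pe.nextDoc()) {
        if (liveDocs == null || liveDocs.get(d)) { // a null Bits means "no deletions"
            // process live document d
        }
    }
}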
Project: elasticsearch_my    File: XMoreLikeThis.java
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName Optional field name of the terms for skip terms
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while((text = termsEnum.next()) != null) {
        spare.copyUTF8Bytes(text);
        final String term = spare.toString();
        if (isNoiseWord(term)) {
            continue;
        }
        if (isSkipTerm(fieldName, term)) {
            continue;
        }

        final PostingsEnum docs = termsEnum.postings(null);
        int freq = 0;
        while(docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            freq += docs.freq();
        }

        // increment frequency
        Int cnt = termFreqMap.get(term);
        if (cnt == null) {
            cnt = new Int();
            termFreqMap.put(term, cnt);
            cnt.x = freq;
        } else {
            cnt.x += freq;
        }
    }
}
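Note that `vector` here is a single document's term vector, so the nextDoc() loop visits at most one document and the summed freq is that document's term frequency; for a term-vector TermsEnum the same number is also available directly (a shortcut that holds under that assumption):

long freq = termsEnum.totalTermFreq(); // for a term vector: occurrences within the one document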
Project: elasticsearch_my    File: TermVectorsFields.java
private PostingsEnum reset(int[] positions, int[] startOffsets, int[] endOffsets, BytesRefBuilder[] payloads, int freq) {
    curPos = -1;
    doc = -1;
    this.hasPositions = positions != null;
    this.hasOffsets = startOffsets != null;
    this.hasPayloads = payloads != null;
    this.freq = freq;
    this.startOffsets = startOffsets;
    this.endOffsets = endOffsets;
    this.payloads = payloads;
    this.positions = positions;
    return this;
}
Project: elasticsearch_my    File: TermVectorsWriter.java
private PostingsEnum writeTermWithDocsOnly(TermsEnum iterator, PostingsEnum docsEnum) throws IOException {
    docsEnum = iterator.postings(docsEnum);
    int nextDoc = docsEnum.nextDoc();
    assert nextDoc != DocIdSetIterator.NO_MORE_DOCS;
    writeFreq(docsEnum.freq());
    nextDoc = docsEnum.nextDoc();
    assert nextDoc == DocIdSetIterator.NO_MORE_DOCS;
    return docsEnum;
}
Project: elasticsearch_my    File: TextFieldMapperTests.java
public void testDefaultPositionIncrementGap() throws IOException {
    String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
            .startObject("properties").startObject("field").field("type", "text").endObject().endObject()
            .endObject().endObject().string();

    DocumentMapper mapper = indexService.mapperService().merge("type",
            new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE, false);

    assertEquals(mapping, mapper.mappingSource().toString());

    ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder()
            .startObject()
            .array("field", new String[] {"a", "b"})
            .endObject()
            .bytes());

    IndexableField[] fields = doc.rootDoc().getFields("field");
    assertEquals(2, fields.length);

    assertEquals("a", fields[0].stringValue());
    assertEquals("b", fields[1].stringValue());

    IndexShard shard = indexService.getShard(0);
    shard.index(new Engine.Index(new Term("_uid", doc.uid()), doc));
    shard.refresh("test");
    try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
        LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader();
        TermsEnum terms = leaf.terms("field").iterator();
        assertTrue(terms.seekExact(new BytesRef("b")));
        PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS);
        assertEquals(0, postings.nextDoc());
        assertEquals(TextFieldMapper.Defaults.POSITION_INCREMENT_GAP + 1, postings.nextPosition());
    }
}
Project: elasticsearch_my    File: TextFieldMapperTests.java
public void testPositionIncrementGap() throws IOException {
    final int positionIncrementGap = randomIntBetween(1, 1000);
    String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
            .startObject("properties").startObject("field")
                .field("type", "text")
                .field("position_increment_gap", positionIncrementGap)
            .endObject().endObject()
            .endObject().endObject().string();

    DocumentMapper mapper = indexService.mapperService().merge("type",
            new CompressedXContent(mapping), MergeReason.MAPPING_UPDATE, false);

    assertEquals(mapping, mapper.mappingSource().toString());

    ParsedDocument doc = mapper.parse("test", "type", "1", XContentFactory.jsonBuilder()
            .startObject()
            .array("field", new String[] {"a", "b"})
            .endObject()
            .bytes());

    IndexableField[] fields = doc.rootDoc().getFields("field");
    assertEquals(2, fields.length);

    assertEquals("a", fields[0].stringValue());
    assertEquals("b", fields[1].stringValue());

    IndexShard shard = indexService.getShard(0);
    shard.index(new Engine.Index(new Term("_uid", doc.uid()), doc));
    shard.refresh("test");
    try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
        LeafReader leaf = searcher.getDirectoryReader().leaves().get(0).reader();
        TermsEnum terms = leaf.terms("field").iterator();
        assertTrue(terms.seekExact(new BytesRef("b")));
        PostingsEnum postings = terms.postings(null, PostingsEnum.POSITIONS);
        assertEquals(0, postings.nextDoc());
        assertEquals(positionIncrementGap + 1, postings.nextPosition());
    }
}
Project: elasticsearch_my    File: GetTermVectorsIT.java
private void checkBrownFoxTermVector(Fields fields, String fieldName, boolean withPayloads) throws IOException {
    String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"};
    int[] freq = {1, 1, 1, 1, 1, 1, 1, 2};
    int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}};
    int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}};
    int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}};

    Terms terms = fields.terms(fieldName);
    assertThat(terms.size(), equalTo(8L));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        assertThat(next, notNullValue());
        // do not test ttf or doc frequency, because here we have many
        // shards and do not know how documents are distributed
        PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            if (withPayloads) {
                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
            }
        }
    }
    assertThat(iterator.next(), nullValue());
}
Project: elasticsearch_my    File: GetTermVectorsIT.java
private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException {
    Terms terms0 = fields0.terms(fieldName);
    Terms terms1 = fields1.terms(fieldName);
    assertThat(terms0, notNullValue());
    assertThat(terms1, notNullValue());
    assertThat(terms0.size(), equalTo(terms1.size()));

    TermsEnum iter0 = terms0.iterator();
    TermsEnum iter1 = terms1.iterator();
    for (int i = 0; i < terms0.size(); i++) {
        BytesRef next0 = iter0.next();
        assertThat(next0, notNullValue());
        BytesRef next1 = iter1.next();
        assertThat(next1, notNullValue());

        // compare field value
        String string0 = next0.utf8ToString();
        String string1 = next1.utf8ToString();
        assertThat("expected: " + string0, string0, equalTo(string1));

        // compare df and ttf
        assertThat("term: " + string0, iter0.docFreq(), equalTo(iter1.docFreq()));
        assertThat("term: " + string0, iter0.totalTermFreq(), equalTo(iter1.totalTermFreq()));

        // compare freq and docs
        PostingsEnum docsAndPositions0 = iter0.postings(null, PostingsEnum.ALL);
        PostingsEnum docsAndPositions1 = iter1.postings(null, PostingsEnum.ALL);
        assertThat("term: " + string0, docsAndPositions0.nextDoc(), equalTo(docsAndPositions1.nextDoc()));
        assertThat("term: " + string0, docsAndPositions0.freq(), equalTo(docsAndPositions1.freq()));

        // compare position, start offsets and end offsets
        for (int j = 0; j < docsAndPositions0.freq(); j++) {
            assertThat("term: " + string0, docsAndPositions0.nextPosition(), equalTo(docsAndPositions1.nextPosition()));
            assertThat("term: " + string0, docsAndPositions0.startOffset(), equalTo(docsAndPositions1.startOffset()));
            assertThat("term: " + string0, docsAndPositions0.endOffset(), equalTo(docsAndPositions1.endOffset()));
        }
    }
    assertThat(iter0.next(), nullValue());
    assertThat(iter1.next(), nullValue());
}
Project: Elasticsearch    File: IndexFieldTerm.java
private int convertToLuceneFlags(int flags) {
    int lucenePositionsFlags = PostingsEnum.NONE;
    lucenePositionsFlags |= (flags & IndexLookup.FLAG_FREQUENCIES) > 0 ? PostingsEnum.FREQS : 0x0;
    lucenePositionsFlags |= (flags & IndexLookup.FLAG_POSITIONS) > 0 ? PostingsEnum.POSITIONS : 0x0;
    lucenePositionsFlags |= (flags & IndexLookup.FLAG_PAYLOADS) > 0 ? PostingsEnum.PAYLOADS : 0x0;
    lucenePositionsFlags |= (flags & IndexLookup.FLAG_OFFSETS) > 0 ? PostingsEnum.OFFSETS : 0x0;
    return lucenePositionsFlags;
}
Project: Elasticsearch    File: IndexFieldTerm.java
private PostingsEnum getPostings(int luceneFlags, LeafReader reader) throws IOException {
    assert identifier.field() != null;
    assert identifier.bytes() != null;
    final Fields fields = reader.fields();
    PostingsEnum newPostings = null;
    if (fields != null) {
        final Terms terms = fields.terms(identifier.field());
        if (terms != null) {
            TermsEnum termsEnum = terms.iterator();
            if (termsEnum.seekExact(identifier.bytes())) {
                newPostings = termsEnum.postings(postings, luceneFlags);
                final Bits liveDocs = reader.getLiveDocs();
                if (liveDocs != null) {
                    newPostings = new FilterPostingsEnum(newPostings) {
                        private int doNext(int d) throws IOException {
                            for (; d != NO_MORE_DOCS; d = super.nextDoc()) {
                                if (liveDocs.get(d)) {
                                    return d;
                                }
                            }
                            return NO_MORE_DOCS;
                        }
                        @Override
                        public int nextDoc() throws IOException {
                            return doNext(super.nextDoc());
                        }
                        @Override
                        public int advance(int target) throws IOException {
                            return doNext(super.advance(target));
                        }
                    };
                }
            }
        }
    }
    return newPostings;
}
Project: Elasticsearch    File: TermVectorsFields.java
private PostingsEnum reset(int[] positions, int[] startOffsets, int[] endOffsets, BytesRefBuilder[] payloads, int freq) {
    curPos = -1;
    doc = -1;
    this.hasPositions = positions != null;
    this.hasOffsets = startOffsets != null;
    this.hasPayloads = payloads != null;
    this.freq = freq;
    this.startOffsets = startOffsets;
    this.endOffsets = endOffsets;
    this.payloads = payloads;
    this.positions = positions;
    return this;
}
Project: ir-generalized-translation-models    File: AugmentedTermScorer.java
/**
 * Construct an {@link AugmentedTermScorer}.
 *
 * @param weight
 *          The weight of the <code>Term</code> in the query.
 * @param mainTerm
 *          An iterator over the documents matching the main <code>Term</code>.
 * @param similarPostings
 *          A list of <code>PostingsEnumWeightTuple</code>: term iterator, weight pairs
 * @param docScorer
 *          The <code>Similarity.SimScorer</code> implementation
 *          to be used for score computations.
 */
public AugmentedTermScorer(Weight weight, PostingsEnum mainTerm, List<PostingsEnumWeightTuple> similarPostings, Similarity.SimScorer docScorer) {
    super(weight);

    this.postings = new PostingsEnumWeightTuple[similarPostings.size() + 1];
    this.postings[0] = new PostingsEnumWeightTuple(mainTerm,1f);
    for (int i = 0; i < similarPostings.size(); i++) {
        this.postings[i + 1] = similarPostings.get(i);
    }

    this.iterator = new MultiDocIdSetIterator(this.postings);

    this.docScorer = docScorer;
}
Project: solr    File: FieldFeatureTFIDFExtractor.java
public FieldFeatureTFIDFExtractor(PostingsEnum pe, int numDocs, int docFreq){
  this.pe = pe;
  assert numDocs >= docFreq;
  this.numDocs = numDocs + 1;
  this.docFreq = docFreq <= 0 ? 1 : docFreq;
  idf = (float)Math.log((double)this.numDocs/(double)this.docFreq);
}
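Written out, the constructor computes a smoothed idf (this is a reading of the code above, not a formula from the project's documentation; N is the numDocs argument, df the docFreq argument):

    \mathrm{idf} = \ln\!\left(\frac{N + 1}{\max(\mathrm{df},\, 1)}\right)

The +1 in the numerator and the max(df, 1) guard keep the ratio strictly greater than 1 (given the asserted N >= df), so the idf is always positive and finite.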
Project: KEA-lucene    File: KeyphraseExtractor2.java
static double[] getFeatures(IndexReader ir, String fieldName, BytesRef rawPhrase, int docId, int docSize, int numDocs, boolean inc)
    throws IOException {
  PostingsEnum de = MultiFields.getTermDocsEnum(ir, fieldName, rawPhrase);
  int ret = de.advance(docId);
  if(ret == PostingsEnum.NO_MORE_DOCS){
    throw new RuntimeException("no more docs...");
  }
  else{
    int freq = de.freq();
    if(freq < 2) return null;

    PostingsEnum pe = MultiFields.getTermPositionsEnum(ir, fieldName, rawPhrase);
    int ret2 = pe.advance(docId);
    if(ret2 == PostingsEnum.NO_MORE_DOCS){
      throw new RuntimeException("no more docs...");
    }
    else{
      double[] features = new double[2];
      int pos = pe.nextPosition();
      int docFreq = ir.docFreq(new Term(fieldName, rawPhrase));
      if(inc){
        docFreq++;
        numDocs++;
      }
      features[0] = Commons.calcTfIdf(freq, docSize, docFreq, numDocs);
      features[1] = Commons.calcFirstOccurrence(pos, docSize);

      return features;
    }
  }
}
Project: KEA-lucene    File: KeyphraseExtractor.java
static double[] getFeatures(IndexReader ir, String fieldName, BytesRef rawPhrase, int docId, int docSize, int numDocs, boolean inc)
    throws IOException {
  PostingsEnum de = MultiFields.getTermDocsEnum(ir, fieldName, rawPhrase);
  int ret = de.advance(docId);
  if(ret == PostingsEnum.NO_MORE_DOCS){
    throw new RuntimeException("no more docs...");
  }
  else{
    int freq = de.freq();
    if(freq < 2) return null;

    PostingsEnum pe = MultiFields.getTermPositionsEnum(ir, fieldName, rawPhrase);
    int ret2 = pe.advance(docId);
    if(ret2 == PostingsEnum.NO_MORE_DOCS){
      throw new RuntimeException("no more docs...");
    }
    else{
      double[] features = new double[2];
      int pos = pe.nextPosition();
      int docFreq = ir.docFreq(new Term(fieldName, rawPhrase));
      if(inc){
        docFreq++;
        numDocs++;
      }
      features[0] = Commons.calcTfIdf(freq, docSize, docFreq, numDocs);
      features[1] = Commons.calcFirstOccurrence(pos, docSize);

      return features;
    }
  }
}
Project: elasticsearch-vectorize    File: VectorizeService.java
private void processTermVectorsFields(Vectorizer vectorizer, Fields termVectorsFields) throws IOException {
    for (String fieldName : termVectorsFields) {
        TermsEnum termsEnum = termVectorsFields.terms(fieldName).iterator();
        while (termsEnum.next() != null) {
            Term term = new Term(fieldName, termsEnum.term());
            TermStatistics termStatistics = new TermStatistics(termsEnum.term(), termsEnum.docFreq(), termsEnum.totalTermFreq());
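            // note: the three-argument postings(liveDocs, reuse, flags) overload below is
            // the early Lucene 5.x API; later Lucene versions drop the liveDocs
            // parameter and expose postings(reuse, flags) instead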
            int freq = termsEnum.postings(null, null, PostingsEnum.ALL).freq();
            vectorizer.add(term, termStatistics, freq);
        }
    }
}
Project: lucenelab    File: IndexUtils.java
/** Prints the terms indexed under the given fields with full postings information. */
public static void printFieldTermsWithInfo(LeafReader reader, String... fields) throws IOException {
    for (final String field : fields) {
        System.out.println(format("Terms for field [%s], with positional info:", field));
        final TermsEnum te = reader.terms(field).iterator();
        BytesRef scratch;
        PostingsEnum postings = null;
        while ((scratch = te.next()) != null) {
            System.out.println(format("  %s", scratch.utf8ToString()));
            postings = te.postings(postings, PostingsEnum.ALL);
            for (postings.nextDoc(); postings.docID() != DocIdSetIterator.NO_MORE_DOCS; postings.nextDoc()) {
                final Map<Integer, BytesRef> positions = Maps.newTreeMap();
                boolean addedPayload = false;
                for (int i = 0; i < postings.freq(); i++) {
                    final int pos = postings.nextPosition();
                    final BytesRef payload = postings.getPayload();
                    if (payload != null) {
                        positions.put(pos, BytesRef.deepCopyOf(payload));
                        addedPayload = true;
                    } else {
                        positions.put(pos, null);
                    }
                }
                if (addedPayload) {
                    System.out.println(format("    doc=%d, freq=%d", postings.docID(), postings.freq(), positions));
                    for (final Entry<Integer, BytesRef> e : positions.entrySet()) {
                        System.out.println(format("      pos=%d, payload=%s", e.getKey(), e.getValue()));
                    }
                } else {
                    System.out.println(format("    doc=%d, freq=%d, pos=%s", postings.docID(), postings.freq(),
                            positions.keySet()));
                }
            }
        }
    }
}
Project: lucenelab    File: AnnotationsUtils.java
public static void printAnnotations(LeafReader reader, Term term) throws IOException {
    System.out.println("Annotations for " + term);
    final ByteArrayDataInput in = new ByteArrayDataInput();
    final PostingsEnum postings = reader.postings(term, PostingsEnum.PAYLOADS);
    for (int docID = postings.nextDoc(); docID != DocIdSetIterator.NO_MORE_DOCS; docID = postings.nextDoc()) {
        final int freq = postings.freq();
        System.out.println("  doc=" + docID + ", freq=" + freq);
        for (int i = 0; i < freq; i++) {
            postings.nextPosition();
            final BytesRef payload = postings.getPayload();
            in.reset(payload.bytes, payload.offset, payload.length);
            System.out.println("    start=" + in.readVInt() + ", length=" + in.readVInt());
        }
    }
}
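The decoder above assumes each payload packs two vInts (start, length). A matching index-time encoder would look roughly like this (a sketch under that assumption; in real indexing code the resulting BytesRef would be set on a PayloadAttribute inside a TokenFilter):

static BytesRef encodeAnnotation(int start, int length) throws IOException {
    byte[] buffer = new byte[10]; // two vInts need at most 5 bytes each
    ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
    out.writeVInt(start);
    out.writeVInt(length);
    return new BytesRef(buffer, 0, out.getPosition());
}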
Project: pyramid    File: ESIndex.java
private Map<Integer,String> getTermVectorWithException(String field, String id) throws IOException {
    TermVectorsResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false).setPositions(true).setFieldStatistics(false)
            .setTermStatistics(false)
            .setSelectedFields(field)
            .execute().actionGet();

    Map<Integer,String> map = new HashMap<>();
    Terms terms = response.getFields().terms(field);
    if (terms==null){
        return map;
    }
    TermsEnum iterator = terms.iterator();
    PostingsEnum postings = null;

    for (BytesRef termBytes = null; (termBytes = iterator.next()) != null; ) {
        String term = termBytes.utf8ToString();

        postings = iterator.postings(postings, PostingsEnum.ALL);

        // there can only be one doc since we are fetching by id; get the doc and the positions
        postings.nextDoc();

        int tf = postings.freq();

        for (int i = 0; i < tf; i++) {
            int pos = postings.nextPosition();
            map.put(pos,term);
        }

    }

    return map;
}
Project: jate    File: FrequencyCtxSentenceBasedFBWorker.java
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
    List<MWESentenceContext> result = new ArrayList<>();

    TermsEnum tiRef= termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if(!allCandidates.contains(tString)) {
            luceneTerm=tiRef.next();
            continue;
        }


        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
        //PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.OFFSETS);

        int doc = postingsEnum.nextDoc(); //this should be just 1 doc, i.e., the constraint for getting this TV
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload=postingsEnum.getPayload();
                int sentenceId=-1;
                if(payload!=null){
                    sentenceId=new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
                }
                result.add(new MWESentenceContext(tString,sentenceId, start, end));
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}
Project: lucene-addons    File: TestSimpleAnalyzerUtil.java
private void executeNeedleTests(Analyzer analyzer) throws Exception {

    String needle = getNeedle(analyzer);
    int numFieldValues = 23;

    Directory directory = buildNeedleIndex(needle, analyzer, numFieldValues);

    IndexReader reader = DirectoryReader.open(directory);

    LeafReaderContext ctx = reader.leaves().get(0);
    LeafReader r = ctx.reader();

    PostingsEnum dpe = r.postings(new Term(FIELD, needle), PostingsEnum.ALL);
    int numTests = 0;
    try {
      while (dpe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        int frq = dpe.freq();
        int advanced = 0;

        String[] fieldValues = r.document(dpe.docID()).getValues(FIELD);
        while (++advanced < frq) {
          dpe.nextPosition();
          String rebuilt = SimpleAnalyzerUtil.substringFromMultiValuedFields(dpe.startOffset(),
              dpe.endOffset(), fieldValues, analyzer.getOffsetGap(FIELD), " | ");
          assertEquals(needle, rebuilt);
          numTests++;
        }
      }
    } finally {
      reader.close();
      directory.close();
    }
    assertEquals("number of tests", numFieldValues - 1, numTests);
  }
Project: lucene-addons    File: SearchSingleTerm.java
private Document getDoc(String s, IndexReader reader) throws IOException {
    //TODO: normalize s?
    BytesRef bytesRef = new BytesRef(s);

    PostingsEnum docsEnum = MultiFields.getTermDocsEnum(reader,
            SyntacticSynsConfig.getSynsTargetFieldName(), bytesRef);
    if (docsEnum == null) {
        //couldn't find search term
        return null;
    }

    int i = 0;
    int tmpDocID = docsEnum.nextDoc();
    int docID = -1;
    while (tmpDocID != PostingsEnum.NO_MORE_DOCS) {
        docID = tmpDocID;
        tmpDocID = docsEnum.nextDoc();
        i++;
    }
    if (i > 1) {
        //TODO: log or do something "there should only be one key term!"
    }
    if (docID > -1) {
        System.out.println(docID);
        return reader.document(docID);
    }
    return null;
}