private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter,
                       BoostAttribute boostAtt) throws IOException {
    // start term, optimized writing
    BytesRef term = termIter.next();
    spare.copyUTF8Bytes(term);
    builder.startObject(spare.toString());
    buildTermStatistics(builder, termIter);

    // finally write the term vectors
    PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL);
    int termFreq = posEnum.freq();
    builder.field(FieldStrings.TERM_FREQ, termFreq);
    initMemory(curTerms, termFreq);
    initValues(curTerms, posEnum, termFreq);
    buildValues(builder, curTerms, termFreq);
    buildScore(builder, boostAtt);
    builder.endObject();
}
/**
 * @return the estimate for loading the entire term set into field data, or 0 if unavailable
 */
public long estimateStringFieldData() {
    try {
        LeafReader reader = context.reader();
        Terms terms = reader.terms(getFieldName());

        Fields fields = reader.fields();
        final Terms fieldTerms = fields.terms(getFieldName());

        if (fieldTerms instanceof FieldReader) {
            final Stats stats = ((FieldReader) fieldTerms).getStats();
            long totalTermBytes = stats.totalTermBytes;
            if (logger.isTraceEnabled()) {
                logger.trace("totalTermBytes: {}, terms.size(): {}, terms.getSumDocFreq(): {}",
                        totalTermBytes, terms.size(), terms.getSumDocFreq());
            }
            long totalBytes = totalTermBytes + (2 * terms.size()) + (4 * terms.getSumDocFreq());
            return totalBytes;
        }
    } catch (Exception e) {
        logger.warn("Unable to estimate memory overhead", e);
    }
    return 0;
}
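// Illustrative arithmetic only (hypothetical numbers, not from the source): with
// totalTermBytes = 1_000_000, terms.size() = 200_000 and terms.getSumDocFreq() = 5_000_000,
// the estimate above comes out to 1_000_000 + 2 * 200_000 + 4 * 5_000_000 = 21_400_000 bytes (~20 MB).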
protected TermsEnum filter(Terms terms, TermsEnum iterator, LeafReader reader) throws IOException {
    if (iterator == null) {
        return null;
    }
    int docCount = terms.getDocCount();
    if (docCount == -1) {
        docCount = reader.maxDoc();
    }
    if (docCount >= minSegmentSize) {
        final int minFreq = minFrequency > 1.0 ? (int) minFrequency : (int) (docCount * minFrequency);
        final int maxFreq = maxFrequency > 1.0 ? (int) maxFrequency : (int) (docCount * maxFrequency);
        if (minFreq > 1 || maxFreq < docCount) {
            iterator = new FrequencyFilter(iterator, minFreq, maxFreq);
        }
    }
    return iterator;
}
public DfsOnlyRequest(Fields termVectorsFields, String[] indices, String[] types, Set<String> selectedFields) throws IOException {
    super(indices);

    // build a search request with a query of all the terms
    final BoolQueryBuilder boolBuilder = boolQuery();
    for (String fieldName : termVectorsFields) {
        if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
            continue;
        }
        Terms terms = termVectorsFields.terms(fieldName);
        TermsEnum iterator = terms.iterator();
        while (iterator.next() != null) {
            String text = iterator.term().utf8ToString();
            boolBuilder.should(QueryBuilders.termQuery(fieldName, text));
        }
    }
    // wrap a search request object
    this.searchRequest = new SearchRequest(indices).types(types).source(new SearchSourceBuilder().query(boolBuilder));
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
    Query rewritten = super.rewrite(reader);
    if (rewritten != this) {
        return rewritten;
    }
    boolean hasPayloads = false;
    for (LeafReaderContext context : reader.leaves()) {
        final Terms terms = context.reader().terms(term.field());
        if (terms != null) {
            if (terms.hasPayloads()) {
                hasPayloads = true;
                break;
            }
        }
    }
    // if the term does not exist we could return a MatchNoDocsQuery, but that would break the
    // unified highlighter, which rewrites the query with an empty reader.
    if (hasPayloads == false) {
        return new TermQuery(term);
    }
    return this;
}
static CodecReader wrap(CodecReader reader) throws IOException {
    final FieldInfos fieldInfos = reader.getFieldInfos();
    final FieldInfo versionInfo = fieldInfos.fieldInfo(VersionFieldMapper.NAME);
    if (versionInfo != null && versionInfo.getDocValuesType() != DocValuesType.NONE) {
        // the reader is a recent one, it has versions and they are stored
        // in a numeric doc values field
        return reader;
    }
    // The segment is an old one, look at the _uid field
    final Terms terms = reader.terms(UidFieldMapper.NAME);
    if (terms == null || !terms.hasPayloads()) {
        // The segment doesn't have an _uid field or doesn't have payloads,
        // don't try to do anything clever. If any other segment has versions,
        // all versions of this segment will be initialized to 0
        return reader;
    }
    // convert _uid payloads -> _version docvalues
    return new VersionFieldUpgrader(reader);
}
private void buildFieldStatistics(XContentBuilder builder, Terms curTerms) throws IOException {
    long sumDocFreq = curTerms.getSumDocFreq();
    int docCount = curTerms.getDocCount();
    long sumTotalTermFrequencies = curTerms.getSumTotalTermFreq();
    if (docCount > 0) {
        assert ((sumDocFreq > 0)) : "docCount > 0 but sumDocFreq isn't!";
        assert ((sumTotalTermFrequencies > 0)) : "docCount > 0 but sumTotalTermFrequencies isn't!";
        builder.startObject(FieldStrings.FIELD_STATISTICS);
        builder.field(FieldStrings.SUM_DOC_FREQ, sumDocFreq);
        builder.field(FieldStrings.DOC_COUNT, docCount);
        builder.field(FieldStrings.SUM_TTF, sumTotalTermFrequencies);
        builder.endObject();
    } else if (docCount == -1) {
        // this should only be -1 if the field
        // statistics were not requested at all. In
        // this case all 3 values should be -1
        assert ((sumDocFreq == -1)) : "docCount was -1 but sumDocFreq wasn't!";
        assert ((sumTotalTermFrequencies == -1)) : "docCount was -1 but sumTotalTermFrequencies wasn't!";
    } else {
        throw new IllegalStateException(
                "Something is wrong with the field statistics of the term vector request: Values are " + "\n"
                        + FieldStrings.SUM_DOC_FREQ + " " + sumDocFreq + "\n"
                        + FieldStrings.DOC_COUNT + " " + docCount + "\n"
                        + FieldStrings.SUM_TTF + " " + sumTotalTermFrequencies);
    }
}
private void buildValues(XContentBuilder builder, Terms curTerms, int termFreq) throws IOException {
    if (!(curTerms.hasPayloads() || curTerms.hasOffsets() || curTerms.hasPositions())) {
        return;
    }

    builder.startArray(FieldStrings.TOKENS);
    for (int i = 0; i < termFreq; i++) {
        builder.startObject();
        if (curTerms.hasPositions()) {
            builder.field(FieldStrings.POS, currentPositions[i]);
        }
        if (curTerms.hasOffsets()) {
            builder.field(FieldStrings.START_OFFSET, currentStartOffset[i]);
            builder.field(FieldStrings.END_OFFSET, currentEndOffset[i]);
        }
        if (curTerms.hasPayloads() && (currentPayloads[i].length() > 0)) {
            builder.field(FieldStrings.PAYLOAD, currentPayloads[i]);
        }
        builder.endObject();
    }
    builder.endArray();
}
private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
    for (int j = 0; j < termFreq; j++) {
        int nextPos = posEnum.nextPosition();
        if (curTerms.hasPositions()) {
            currentPositions[j] = nextPos;
        }
        if (curTerms.hasOffsets()) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (curTerms.hasPayloads()) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }
        }
    }
}
@Override
public Terms terms(String field) throws IOException {
    final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
    if (fieldInfo == null) {
        // No such field
        return null;
    }
    final Integer fieldIndex = fieldNumberToIndex.get(fieldInfo.number);
    if (fieldIndex == null) {
        // Term vectors were not indexed for this field
        return null;
    }
    return new TVTerms(fieldFPs[fieldIndex]);
}
@Override
protected FieldStatsShardResponse shardOperation(FieldStatsShardRequest request) {
    ShardId shardId = request.shardId();
    Map<String, FieldStats> fieldStats = new HashMap<>();
    IndexService indexServices = indicesService.indexServiceSafe(shardId.getIndex());
    MapperService mapperService = indexServices.mapperService();
    IndexShard shard = indexServices.shardSafe(shardId.id());
    try (Engine.Searcher searcher = shard.acquireSearcher("fieldstats")) {
        for (String field : request.getFields()) {
            MappedFieldType fieldType = mapperService.fullName(field);
            if (fieldType != null) {
                IndexReader reader = searcher.reader();
                Terms terms = MultiFields.getTerms(reader, field);
                if (terms != null) {
                    fieldStats.put(field, fieldType.stats(terms, reader.maxDoc()));
                }
            } else {
                throw new IllegalArgumentException("field [" + field + "] doesn't exist");
            }
        }
    } catch (IOException e) {
        throw ExceptionsHelper.convertToElastic(e);
    }
    return new FieldStatsShardResponse(shardId, fieldStats);
}
/**
 * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
 * given selection of fields from terms with a document frequency greater than
 * the given maxDocFreq
 *
 * @param delegate Analyzer whose TokenStream will be filtered
 * @param indexReader IndexReader to identify the stopwords from
 * @param fields Selection of fields to calculate stopwords for
 * @param maxDocFreq Document frequency terms should be above in order to be stopwords
 * @throws IOException Can be thrown while reading from the IndexReader
 */
public QueryAutoStopWordAnalyzer(
        Analyzer delegate,
        IndexReader indexReader,
        Collection<String> fields,
        int maxDocFreq) throws IOException {
    super(delegate.getReuseStrategy());
    this.delegate = delegate;

    for (String field : fields) {
        Set<String> stopWords = new HashSet<>();
        Terms terms = MultiFields.getTerms(indexReader, field);
        CharsRefBuilder spare = new CharsRefBuilder();
        if (terms != null) {
            TermsEnum te = terms.iterator(null);
            BytesRef text;
            while ((text = te.next()) != null) {
                if (te.docFreq() > maxDocFreq) {
                    spare.copyUTF8Bytes(text);
                    stopWords.add(spare.toString());
                }
            }
        }
        stopWordsPerField.put(field, stopWords);
    }
}
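// A minimal, hedged usage sketch for the constructor above. The delegate analyzer, the field
// name "body" and the document-frequency threshold 100 are illustrative assumptions, not taken
// from the source; the wrapping analyzer then drops terms of that field occurring in more than
// 100 documents from query token streams.
static Analyzer newStopWordFilteredAnalyzer(IndexReader reader) throws IOException {
    Analyzer delegate = new StandardAnalyzer();
    return new QueryAutoStopWordAnalyzer(delegate, reader, Collections.singleton("body"), 100);
}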
/** Return a {@link TermsEnum} intersecting the provided {@link Terms}
 *  with the terms accepted by this automaton. */
public TermsEnum getTermsEnum(Terms terms) throws IOException {
    switch (type) {
        case NONE:
            return TermsEnum.EMPTY;
        case ALL:
            return terms.iterator(null);
        case SINGLE:
            return new SingleTermsEnum(terms.iterator(null), term);
        case PREFIX:
            // TODO: this is very likely faster than .intersect,
            // but we should test and maybe cutover
            return new PrefixTermsEnum(terms.iterator(null), term);
        case NORMAL:
            return terms.intersect(this, null);
        default:
            // unreachable
            throw new RuntimeException("unhandled case");
    }
}
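// Hedged companion sketch: one way to consume the TermsEnum returned above, collecting the
// accepted terms of a single field from one leaf reader. `leafReader` and `field` are supplied
// by the (hypothetical) caller.
List<BytesRef> collectAcceptedTerms(LeafReader leafReader, String field) throws IOException {
    List<BytesRef> accepted = new ArrayList<>();
    Terms terms = leafReader.terms(field);
    if (terms != null) {
        TermsEnum te = getTermsEnum(terms);
        for (BytesRef term = te.next(); term != null; term = te.next()) {
            // the enum reuses its BytesRef, so take a copy before storing it
            accepted.add(BytesRef.deepCopyOf(term));
        }
    }
    return accepted;
}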
@Override
public void visitMatchingTerms(IndexReader reader, String fieldName, MatchingTermVisitor mtv) throws IOException {
    /* check term presence in index here for symmetry with other SimpleTerm's */
    Terms terms = MultiFields.getTerms(reader, fieldName);
    if (terms != null) {
        TermsEnum termsEnum = terms.iterator(null);

        TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(getTermText()));
        if (status == TermsEnum.SeekStatus.FOUND) {
            mtv.visitMatchingTerm(getLuceneTerm(fieldName));
        }
    }
}
@Override
public AtomicGeoPointFieldData loadDirect(LeafReaderContext context) throws Exception {
    LeafReader reader = context.reader();

    Terms terms = reader.terms(getFieldNames().indexName());
    AtomicGeoPointFieldData data = null;
    // TODO: Use an actual estimator to estimate before loading.
    NonEstimatingEstimator estimator = new NonEstimatingEstimator(breakerService.getBreaker(CircuitBreaker.FIELDDATA));
    if (terms == null) {
        data = AbstractAtomicGeoPointFieldData.empty(reader.maxDoc());
        estimator.afterLoad(null, data.ramBytesUsed());
        return data;
    }
    return (Version.indexCreated(indexSettings).before(Version.V_2_2_0))
            ? loadLegacyFieldData(reader, estimator, terms, data)
            : loadFieldData22(reader, estimator, terms, data);
}
Query createCandidateQuery(IndexReader indexReader) throws IOException {
    List<BytesRef> extractedTerms = new ArrayList<>();
    LeafReader reader = indexReader.leaves().get(0).reader();
    Fields fields = reader.fields();
    for (String field : fields) {
        Terms terms = fields.terms(field);
        if (terms == null) {
            continue;
        }

        BytesRef fieldBr = new BytesRef(field);
        TermsEnum tenum = terms.iterator();
        for (BytesRef term = tenum.next(); term != null; term = tenum.next()) {
            BytesRefBuilder builder = new BytesRefBuilder();
            builder.append(fieldBr);
            builder.append(FIELD_VALUE_SEPARATOR);
            builder.append(term);
            extractedTerms.add(builder.toBytesRef());
        }
    }
    Query extractionSuccess = new TermInSetQuery(queryTermsField.name(), extractedTerms);
    // include extractionResultField:failed, because docs with this term have no extractedTermsField
    // and otherwise we would fail to return these docs. Docs that failed query term extraction
    // always need to be verified by MemoryIndex:
    Query extractionFailure = new TermQuery(new Term(extractionResultField.name(), EXTRACTION_FAILED));

    return new BooleanQuery.Builder()
            .add(extractionSuccess, Occur.SHOULD)
            .add(extractionFailure, Occur.SHOULD)
            .build();
}
/**
 * Returns total in-heap bytes used by all suggesters. This method has CPU cost <code>O(numIndexedFields)</code>.
 *
 * @param fieldNamePatterns if non-null, any completion field name matching any of these patterns will break out its in-heap bytes
 * separately in the returned {@link CompletionStats}
 */
public static CompletionStats completionStats(IndexReader indexReader, String... fieldNamePatterns) {
    long sizeInBytes = 0;
    ObjectLongHashMap<String> completionFields = null;
    if (fieldNamePatterns != null && fieldNamePatterns.length > 0) {
        completionFields = new ObjectLongHashMap<>(fieldNamePatterns.length);
    }

    for (LeafReaderContext atomicReaderContext : indexReader.leaves()) {
        LeafReader atomicReader = atomicReaderContext.reader();
        try {
            Fields fields = atomicReader.fields();
            for (String fieldName : fields) {
                Terms terms = fields.terms(fieldName);
                if (terms instanceof CompletionTerms) {
                    // TODO: currently we load up the suggester for reporting its size
                    long fstSize = ((CompletionTerms) terms).suggester().ramBytesUsed();
                    if (fieldNamePatterns != null && fieldNamePatterns.length > 0
                            && Regex.simpleMatch(fieldNamePatterns, fieldName)) {
                        completionFields.addTo(fieldName, fstSize);
                    }
                    sizeInBytes += fstSize;
                }
            }
        } catch (IOException ioe) {
            throw new ElasticsearchException(ioe);
        }
    }

    return new CompletionStats(sizeInBytes, completionFields == null ? null : new FieldMemoryStats(completionFields));
}
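// Hedged usage sketch for completionStats above: total up the in-heap size of completion fields
// whose names match "suggest*". The field-name pattern and the getSizeInBytes() accessor are
// assumptions for illustration, not confirmed by the source.
static long suggestFieldHeapBytes(IndexReader indexReader) {
    CompletionStats stats = completionStats(indexReader, "suggest*");
    return stats.getSizeInBytes();
}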
public LinearInterpolatingScorer(IndexReader reader, Terms terms, String field, double realWordLikelyhood, BytesRef separator,
                                 double trigramLambda, double bigramLambda, double unigramLambda) throws IOException {
    super(reader, terms, field, realWordLikelyhood, separator);
    double sum = unigramLambda + bigramLambda + trigramLambda;
    this.unigramLambda = unigramLambda / sum;
    this.bigramLambda = bigramLambda / sum;
    this.trigramLambda = trigramLambda / sum;
}
@Override
public FieldStats stats(Terms terms, int maxDoc) throws IOException {
    long minValue = NumericUtils.getMinInt(terms);
    long maxValue = NumericUtils.getMaxInt(terms);
    return new FieldStats.Long(
            maxDoc, terms.getDocCount(), terms.getSumDocFreq(), terms.getSumTotalTermFreq(), minValue, maxValue
    );
}
private void initMemory(Terms curTerms, int termFreq) {
    // init memory for performance reasons
    if (curTerms.hasPositions()) {
        currentPositions = ArrayUtil.grow(currentPositions, termFreq);
    }
    if (curTerms.hasOffsets()) {
        currentStartOffset = ArrayUtil.grow(currentStartOffset, termFreq);
        currentEndOffset = ArrayUtil.grow(currentEndOffset, termFreq);
    }
    if (curTerms.hasPayloads()) {
        currentPayloads = new BytesArray[termFreq];
    }
}
/**
 * Returns a DocIdSet per segment containing the matching docs for the specified slice.
 */
private DocIdSet build(LeafReader reader) throws IOException {
    final DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc());
    final Terms terms = reader.terms(getField());
    final TermsEnum te = terms.iterator();
    PostingsEnum docsEnum = null;
    for (BytesRef term = te.next(); term != null; term = te.next()) {
        int hashCode = term.hashCode();
        if (contains(hashCode)) {
            docsEnum = te.postings(docsEnum, PostingsEnum.NONE);
            builder.add(docsEnum);
        }
    }
    return builder.build();
}
@Override
public FieldStats stats(Terms terms, int maxDoc) throws IOException {
    long minValue = NumericUtils.getMinLong(terms);
    long maxValue = NumericUtils.getMaxLong(terms);
    return new FieldStats.Long(
            maxDoc, terms.getDocCount(), terms.getSumDocFreq(), terms.getSumTotalTermFreq(), minValue, maxValue
    );
}
/**
 * Determine whether the BlockTreeTermsReader.FieldReader can be used
 * for estimating the field data, adding the estimate to the circuit
 * breaker if it can, otherwise wrapping the terms in a
 * RamAccountingTermsEnum to be estimated on a per-term basis.
 *
 * @param terms terms to be estimated
 * @return A possibly wrapped TermsEnum for the terms
 */
@Override
public TermsEnum beforeLoad(Terms terms) throws IOException {
    LeafReader reader = context.reader();

    TermsEnum iterator = terms.iterator();
    TermsEnum filteredIterator = filter(terms, iterator, reader);
    final boolean filtered = iterator != filteredIterator;
    iterator = filteredIterator;

    if (filtered) {
        if (logger.isTraceEnabled()) {
            logger.trace("Filter exists, can't circuit break normally, using RamAccountingTermsEnum");
        }
        return new RamAccountingTermsEnum(iterator, breaker, this, this.fieldName);
    } else {
        estimatedBytes = this.estimateStringFieldData();
        // If we weren't able to estimate, wrap in the RamAccountingTermsEnum
        if (estimatedBytes == 0) {
            iterator = new RamAccountingTermsEnum(iterator, breaker, this, this.fieldName);
        } else {
            breaker.addEstimateBytesAndMaybeBreak(estimatedBytes, fieldName);
        }
        return iterator;
    }
}
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
    Map<String, Int> termFreqMap = new HashMap<>();
    for (String fieldName : fieldNames) {
        final Fields vectors = ir.getTermVectors(docNum);
        final Terms vector;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        } else {
            vector = null;
        }

        // field does not store term vector info
        if (vector == null) {
            Document d = ir.document(docNum);
            IndexableField[] fields = d.getFields(fieldName);
            for (IndexableField field : fields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermFrequencies(new FastStringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermFrequencies(termFreqMap, vector, fieldName);
        }
    }
    return createQueue(termFreqMap);
}
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 * @param fieldName Optional field name of the terms for skip terms
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        spare.copyUTF8Bytes(text);
        final String term = spare.toString();
        if (isNoiseWord(term)) {
            continue;
        }
        if (isSkipTerm(fieldName, term)) {
            continue;
        }

        final PostingsEnum docs = termsEnum.postings(null);
        int freq = 0;
        while (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            freq += docs.freq();
        }

        // increment frequency
        Int cnt = termFreqMap.get(term);
        if (cnt == null) {
            cnt = new Int();
            termFreqMap.put(term, cnt);
            cnt.x = freq;
        } else {
            cnt.x += freq;
        }
    }
}
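// Hedged companion sketch: the same per-term frequency walk as addTermFrequencies above, but
// reduced to standard Lucene term-vector calls and a plain Map, for a single document and field
// chosen by the (hypothetical) caller.
Map<String, Integer> termVectorFrequencies(IndexReader reader, int docId, String field) throws IOException {
    Map<String, Integer> freqs = new HashMap<>();
    Terms vector = reader.getTermVector(docId, field);
    if (vector == null) {
        return freqs; // term vectors were not stored for this field
    }
    TermsEnum termsEnum = vector.iterator();
    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
        // for a single-document term vector, totalTermFreq() is the within-document frequency
        freqs.put(term.utf8ToString(), (int) termsEnum.totalTermFreq());
    }
    return freqs;
}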
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
    // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms
    // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually.
    List<LeafReaderContext> leaves = reader.leaves();
    for (LeafReaderContext leaf : leaves) {
        Terms _terms = leaf.reader().terms(field);
        if (_terms == null) {
            continue;
        }

        TermsEnum termsEnum = _terms.iterator();
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
        if (TermsEnum.SeekStatus.END == seekStatus) {
            continue;
        }

        for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
            if (!StringHelper.startsWith(term, prefix.bytes())) {
                break;
            }

            terms.add(new Term(field, BytesRef.deepCopyOf(term)));
            if (terms.size() >= maxExpansions) {
                return;
            }
        }
    }
}
@Override
public FieldStats stats(Terms terms, int maxDoc) throws IOException {
    double minValue = NumericUtils.sortableLongToDouble(NumericUtils.getMinLong(terms));
    double maxValue = NumericUtils.sortableLongToDouble(NumericUtils.getMaxLong(terms));
    return new FieldStats.Double(
            maxDoc, terms.getDocCount(), terms.getSumDocFreq(), terms.getSumTotalTermFreq(), minValue, maxValue
    );
}
private void writeFieldStatistics(Terms topLevelTerms) throws IOException {
    long sttf = topLevelTerms.getSumTotalTermFreq();
    assert (sttf >= -1);
    writePotentiallyNegativeVLong(sttf);

    long sdf = topLevelTerms.getSumDocFreq();
    assert (sdf >= -1);
    writePotentiallyNegativeVLong(sdf);

    int dc = topLevelTerms.getDocCount();
    assert (dc >= -1);
    writePotentiallyNegativeVInt(dc);
}
private void buildField(XContentBuilder builder, final CharsRefBuilder spare, Fields theFields, Iterator<String> fieldIter)
        throws IOException {
    String fieldName = fieldIter.next();
    builder.startObject(fieldName);
    Terms curTerms = theFields.terms(fieldName);
    // write field statistics
    buildFieldStatistics(builder, curTerms);
    builder.startObject(FieldStrings.TERMS);
    TermsEnum termIter = curTerms.iterator();
    BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class);
    for (int i = 0; i < curTerms.size(); i++) {
        buildTerm(builder, spare, curTerms, termIter, boostAtt);
    }
    builder.endObject();
    builder.endObject();
}
private void checkBrownFoxTermVector(Fields fields, String fieldName, boolean withPayloads) throws IOException {
    String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"};
    int[] freq = {1, 1, 1, 1, 1, 1, 1, 2};
    int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}};
    int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}};
    int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}};

    Terms terms = fields.terms(fieldName);
    assertThat(terms.size(), equalTo(8L));
    TermsEnum iterator = terms.iterator();
    for (int j = 0; j < values.length; j++) {
        String string = values[j];
        BytesRef next = iterator.next();
        assertThat(next, notNullValue());
        assertThat("expected " + string, string, equalTo(next.utf8ToString()));
        assertThat(next, notNullValue());
        // do not test ttf or doc frequency, because here we have many
        // shards and do not know how documents are distributed
        PostingsEnum docsAndPositions = iterator.postings(null, PostingsEnum.ALL);
        assertThat(docsAndPositions.nextDoc(), equalTo(0));
        assertThat(freq[j], equalTo(docsAndPositions.freq()));
        int[] termPos = pos[j];
        int[] termStartOffset = startOffset[j];
        int[] termEndOffset = endOffset[j];
        assertThat(termPos.length, equalTo(freq[j]));
        assertThat(termStartOffset.length, equalTo(freq[j]));
        assertThat(termEndOffset.length, equalTo(freq[j]));
        for (int k = 0; k < freq[j]; k++) {
            int nextPosition = docsAndPositions.nextPosition();
            assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
            assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
            assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
            if (withPayloads) {
                assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
            }
        }
    }
    assertThat(iterator.next(), nullValue());
}
private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException {
    Terms terms0 = fields0.terms(fieldName);
    Terms terms1 = fields1.terms(fieldName);
    assertThat(terms0, notNullValue());
    assertThat(terms1, notNullValue());
    assertThat(terms0.size(), equalTo(terms1.size()));

    TermsEnum iter0 = terms0.iterator();
    TermsEnum iter1 = terms1.iterator();
    for (int i = 0; i < terms0.size(); i++) {
        BytesRef next0 = iter0.next();
        assertThat(next0, notNullValue());
        BytesRef next1 = iter1.next();
        assertThat(next1, notNullValue());

        // compare field value
        String string0 = next0.utf8ToString();
        String string1 = next1.utf8ToString();
        assertThat("expected: " + string0, string0, equalTo(string1));

        // compare df and ttf
        assertThat("term: " + string0, iter0.docFreq(), equalTo(iter1.docFreq()));
        assertThat("term: " + string0, iter0.totalTermFreq(), equalTo(iter1.totalTermFreq()));

        // compare freq and docs
        PostingsEnum docsAndPositions0 = iter0.postings(null, PostingsEnum.ALL);
        PostingsEnum docsAndPositions1 = iter1.postings(null, PostingsEnum.ALL);
        assertThat("term: " + string0, docsAndPositions0.nextDoc(), equalTo(docsAndPositions1.nextDoc()));
        assertThat("term: " + string0, docsAndPositions0.freq(), equalTo(docsAndPositions1.freq()));

        // compare position, start offsets and end offsets
        for (int j = 0; j < docsAndPositions0.freq(); j++) {
            assertThat("term: " + string0, docsAndPositions0.nextPosition(), equalTo(docsAndPositions1.nextPosition()));
            assertThat("term: " + string0, docsAndPositions0.startOffset(), equalTo(docsAndPositions1.startOffset()));
            assertThat("term: " + string0, docsAndPositions0.endOffset(), equalTo(docsAndPositions1.endOffset()));
        }
    }
    assertThat(iter0.next(), nullValue());
    assertThat(iter1.next(), nullValue());
}
public void testArtificialNoDoc() throws IOException {
    // setup indices
    Settings.Builder settings = Settings.builder()
            .put(indexSettings())
            .put("index.analysis.analyzer", "standard");
    assertAcked(prepareCreate("test")
            .setSettings(settings)
            .addMapping("type1", "field1", "type=text"));
    ensureGreen();

    // request tvs from artificial document
    String text = "the quick brown fox jumps over the lazy dog";
    TermVectorsResponse resp = client().prepareTermVectors()
            .setIndex("test")
            .setType("type1")
            .setDoc(jsonBuilder()
                    .startObject()
                    .field("field1", text)
                    .endObject())
            .setOffsets(true)
            .setPositions(true)
            .setFieldStatistics(true)
            .setTermStatistics(true)
            .get();
    assertThat(resp.isExists(), equalTo(true));
    checkBrownFoxTermVector(resp.getFields(), "field1", false);

    // Since the index is empty, all of the artificial document's "term_statistics" should be 0/absent
    Terms terms = resp.getFields().terms("field1");
    assertEquals("sumDocFreq should be 0 for a non-existing field!", 0, terms.getSumDocFreq());
    assertEquals("sumTotalTermFreq should be 0 for a non-existing field!", 0, terms.getSumTotalTermFreq());
    TermsEnum termsEnum = terms.iterator(); // we're guaranteed to receive terms for that field
    while (termsEnum.next() != null) {
        String term = termsEnum.term().utf8ToString();
        assertEquals("term [" + term + "] does not exist in the index; ttf should be 0!", 0, termsEnum.totalTermFreq());
    }
}
private void checkBestTerms(Terms terms, List<String> expectedTerms) throws IOException {
    final TermsEnum termsEnum = terms.iterator();
    List<String> bestTerms = new ArrayList<>();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        bestTerms.add(text.utf8ToString());
    }
    Collections.sort(expectedTerms);
    Collections.sort(bestTerms);
    assertArrayEquals(expectedTerms.toArray(), bestTerms.toArray());
}
@Override
public Terms terms(String field) throws IOException {
    final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
    if (fieldInfo == null) {
        return null;
    }

    int idx = -1;
    for (int i = 0; i < fieldNumOffs.length; ++i) {
        if (fieldNums[fieldNumOffs[i]] == fieldInfo.number) {
            idx = i;
            break;
        }
    }

    if (idx == -1 || numTerms[idx] == 0) {
        // no term
        return null;
    }

    int fieldOff = 0, fieldLen = -1;
    for (int i = 0; i < fieldNumOffs.length; ++i) {
        if (i < idx) {
            fieldOff += fieldLengths[i];
        } else {
            fieldLen = fieldLengths[i];
            break;
        }
    }

    assert fieldLen >= 0;
    return new TVTerms(numTerms[idx], fieldFlags[idx], prefixLengths[idx], suffixLengths[idx], termFreqs[idx],
            positionIndex[idx], positions[idx], startOffsets[idx], lengths[idx], payloadIndex[idx], payloadBytes,
            new BytesRef(suffixBytes.bytes, suffixBytes.offset + fieldOff, fieldLen));
}