public static ObjectObjectHashMap<String, CollectionStatistics> readFieldStats(StreamInput in,
        ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics) throws IOException {
    final int numFieldStatistics = in.readVInt();
    if (fieldStatistics == null) {
        fieldStatistics = HppcMaps.newNoNullKeysMap(numFieldStatistics);
    }
    for (int i = 0; i < numFieldStatistics; i++) {
        final String field = in.readString();
        assert field != null;
        final long maxDoc = in.readVLong();
        final long docCount = subOne(in.readVLong());
        final long sumTotalTermFreq = subOne(in.readVLong());
        final long sumDocFreq = subOne(in.readVLong());
        CollectionStatistics stats = new CollectionStatistics(field, maxDoc, docCount, sumTotalTermFreq, sumDocFreq);
        fieldStatistics.put(field, stats);
    }
    return fieldStatistics;
}
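
// The subOne/addOne pair is not shown in these snippets. A minimal sketch of
// what they likely look like, assuming the usual convention that -1 marks an
// unsupported statistic and that VLong encoding cannot carry negatives, so
// values are shifted by one on the wire (addOne is used by writeFieldStats below):
private static long addOne(long value) {
    assert value + 1 >= 0;
    return value + 1;
}

private static long subOne(long value) {
    assert value >= 0;
    return value - 1;
}
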
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
    float N, n, idf, adl;
    idf = 0.0f; // start at zero so the phrase branch sums cleanly
    N = collectionStats.maxDoc();
    adl = collectionStats.sumTotalTermFreq() / N;
    if (termStats.length == 1) {
        n = termStats[0].docFreq();
        idf = log(N / n);
    } else {
        for (final TermStatistics stat : termStats) {
            n = stat.docFreq();
            idf += log(N / n);
        }
    }
    return new TFIDFWeight(collectionStats.field(), idf, adl);
}
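
// For a feel of the numbers (illustrative values only; natural log assumed
// for the log() helper above):
float N = 1_000_000f;                 // documents in the collection
float n = 1_000f;                     // documents containing the term
float idf = (float) Math.log(N / n);  // ~6.9: rarer terms score higher
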
/**
 * Computes a score factor for a phrase.
 *
 * <p>The default implementation sums the idf factor for each term in the phrase.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explain object that includes both an idf score factor for the
 *         phrase and an explanation for each term.
 */
public Explanation idfExplain(final CollectionStatistics collectionStats, final TermStatistics[] termStats) {
    final long max = collectionStats.maxDoc();
    float idf = 0.0f;
    final Explanation exp = new Explanation();
    exp.setDescription("idf(), sum of:");
    for (final TermStatistics stat : termStats) {
        final long docFreq = stat.docFreq();
        final float termIdf = idf(docFreq, max);
        exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + docFreq + ", maxDocs=" + max + ")"));
        idf += termIdf;
    }
    exp.setValue(idf);
    return exp;
}
public static void writeFieldStats(StreamOutput out,
        ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics) throws IOException {
    out.writeVInt(fieldStatistics.size());
    for (ObjectObjectCursor<String, CollectionStatistics> c : fieldStatistics) {
        out.writeString(c.key);
        CollectionStatistics statistics = c.value;
        assert statistics.maxDoc() >= 0;
        out.writeVLong(statistics.maxDoc());
        out.writeVLong(addOne(statistics.docCount()));
        out.writeVLong(addOne(statistics.sumTotalTermFreq()));
        out.writeVLong(addOne(statistics.sumDocFreq()));
    }
}
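
// A hypothetical round trip through the two methods, assuming Elasticsearch's
// BytesStreamOutput is available as the StreamOutput implementation; the map
// contents are made up for illustration:
ObjectObjectHashMap<String, CollectionStatistics> stats = HppcMaps.newNoNullKeysMap(1);
stats.put("body", new CollectionStatistics("body", 100, 90, 5_000, 2_500));

BytesStreamOutput out = new BytesStreamOutput();
writeFieldStats(out, stats);

ObjectObjectHashMap<String, CollectionStatistics> copy =
        readFieldStats(out.bytes().streamInput(), null); // null: let the reader allocate
assert copy.get("body").maxDoc() == 100;
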
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
    long sttf = fieldStats.sumTotalTermFreq();
    assert (sttf >= -1);
    writePotentiallyNegativeVLong(sttf);
    long sdf = fieldStats.sumDocFreq();
    assert (sdf >= -1);
    writePotentiallyNegativeVLong(sdf);
    int dc = (int) fieldStats.docCount();
    assert (dc >= -1);
    writePotentiallyNegativeVInt(dc);
}
@Override
public final SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    BasicStats[] stats = new BasicStats[termStats.length];
    for (int i = 0; i < termStats.length; i++) {
        stats[i] = newStats(collectionStats.field(), queryBoost);
        fillBasicStats(stats[i], collectionStats, termStats[i]);
    }
    return stats.length == 1 ? stats[0] : new MultiSimilarity.MultiStats(stats);
}
/** Fills all member fields defined in {@code BasicStats} in {@code stats}.
 *  Subclasses can override this method to fill additional stats. */
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
    // #positions(field) must be >= #positions(term)
    assert collectionStats.sumTotalTermFreq() == -1
            || collectionStats.sumTotalTermFreq() >= termStats.totalTermFreq();
    long numberOfDocuments = collectionStats.maxDoc();
    long docFreq = termStats.docFreq();
    long totalTermFreq = termStats.totalTermFreq();

    // codec does not supply totalTermFreq: substitute docFreq
    if (totalTermFreq == -1) {
        totalTermFreq = docFreq;
    }

    final long numberOfFieldTokens;
    final float avgFieldLength;
    long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
    if (sumTotalTermFreq <= 0) {
        // field does not exist;
        // we have to provide something if the codec doesn't supply these measures,
        // or if someone omitted frequencies for the field... negative values cause
        // NaN/Inf for some scorers.
        numberOfFieldTokens = docFreq;
        avgFieldLength = 1;
    } else {
        numberOfFieldTokens = sumTotalTermFreq;
        avgFieldLength = (float) numberOfFieldTokens / numberOfDocuments;
    }

    // TODO: add sumDocFreq for field (numberOfFieldPostings)
    stats.setNumberOfDocuments(numberOfDocuments);
    stats.setNumberOfFieldTokens(numberOfFieldTokens);
    stats.setAvgFieldLength(avgFieldLength);
    stats.setDocFreq(docFreq);
    stats.setTotalTermFreq(totalTermFreq);
}
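
// What the two -1 fallbacks above produce, with made-up numbers:
long docFreq = 42;
long totalTermFreq = -1;     // codec did not store per-term frequency
if (totalTermFreq == -1) {
    totalTermFreq = docFreq; // substituted: 42
}
long sumTotalTermFreq = -1;  // field-level stat unsupported
// fallback path: numberOfFieldTokens = docFreq (42), avgFieldLength = 1
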
/** The default implementation computes the average as <code>sumTotalTermFreq / maxDoc</code>,
 *  or returns <code>1</code> if the index does not store sumTotalTermFreq (Lucene 3.x indexes
 *  or any field that omits frequency information). */
protected float avgFieldLength(CollectionStatistics collectionStats) {
    final long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
    if (sumTotalTermFreq <= 0) {
        return 1f; // field does not exist, or stat is unsupported
    } else {
        return (float) (sumTotalTermFreq / (double) collectionStats.maxDoc());
    }
}
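
// With illustrative numbers, the average field length works out as:
long sumTotalTermFreq = 120_000L; // total tokens indexed in the field
long maxDoc = 10_000L;            // documents in the index
float avgdl = (float) (sumTotalTermFreq / (double) maxDoc); // 12.0f
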
/**
 * Computes a score factor for a phrase.
 *
 * <p>The default implementation sums the idf factor for each term in the phrase.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explain object that includes both an idf score factor for the
 *         phrase and an explanation for each term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
    final long max = collectionStats.maxDoc();
    float idf = 0.0f;
    final Explanation exp = new Explanation();
    exp.setDescription("idf(), sum of:");
    for (final TermStatistics stat : termStats) {
        final long df = stat.docFreq();
        final float termIdf = idf(df, max);
        exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
        idf += termIdf;
    }
    exp.setValue(idf);
    return exp;
}
@Override
public final SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    Explanation idf = termStats.length == 1
            ? idfExplain(collectionStats, termStats[0])
            : idfExplain(collectionStats, termStats);
    float avgdl = avgFieldLength(collectionStats);

    // compute freq-independent part of bm25 equation across all norm values
    float[] cache = new float[256];
    for (int i = 0; i < cache.length; i++) {
        cache[i] = k1 * ((1 - b) + b * decodeNormValue((byte) i) / avgdl);
    }
    return new BM25Stats(collectionStats.field(), idf, queryBoost, avgdl, cache);
}
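
// Sketch of how the 256-entry cache is consumed per document at scoring time,
// in the spirit of Lucene's BM25 scorer; the exact wiring (norms source,
// weightValue) is assumed here, not shown in the snippet above:
float score(int doc, float freq, byte[] norms, float weightValue, float[] cache) {
    // the document's encoded norm byte selects the precomputed
    // k1 * ((1 - b) + b * dl / avgdl) length-normalization term
    float norm = cache[norms[doc] & 0xFF];
    return weightValue * freq / (freq + norm); // BM25 tf saturation
}
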
@Override
public final SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    PerFieldSimWeight weight = new PerFieldSimWeight();
    weight.delegate = get(collectionStats.field());
    weight.delegateWeight = weight.delegate.computeWeight(queryBoost, collectionStats, termStats);
    return weight;
}
@Override
public SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    SimWeight[] subStats = new SimWeight[sims.length];
    for (int i = 0; i < subStats.length; i++) {
        subStats[i] = sims[i].computeWeight(queryBoost, collectionStats, termStats);
    }
    return new MultiStats(subStats);
}
/**
 * Computes the collection probability of the current term in addition to the
 * usual statistics.
 */
@Override
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
    super.fillBasicStats(stats, collectionStats, termStats);
    LMStats lmStats = (LMStats) stats;
    lmStats.setCollectionProbability(collectionModel.computeProbability(stats));
}
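
// collectionModel is not shown in this snippet. A minimal sketch of a
// maximum-likelihood model in the spirit of Lucene's
// LMSimilarity.DefaultCollectionModel: the term's share of all tokens in the
// field, with add-one smoothing so unseen terms do not get probability zero.
float computeProbability(BasicStats stats) {
    return (stats.getTotalTermFreq() + 1F) / (stats.getNumberOfFieldTokens() + 1F);
}
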
@Override
public final SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    final Explanation idf = termStats.length == 1
            ? idfExplain(collectionStats, termStats[0])
            : idfExplain(collectionStats, termStats);
    return new IDFStats(collectionStats.field(), idf, queryBoost);
}
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>,
 *  or returns <code>1</code> if the index does not store sumTotalTermFreq
 *  (any field that omits frequency information). */
protected float avgFieldLength(CollectionStatistics collectionStats) {
    final long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
    if (sumTotalTermFreq <= 0) {
        return 1f; // field does not exist, or stat is unsupported
    } else {
        final long docCount = collectionStats.docCount() == -1
                ? collectionStats.maxDoc()
                : collectionStats.docCount();
        return (float) (sumTotalTermFreq / (double) docCount);
    }
}
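
// Why dividing by docCount matters for sparse fields (illustrative numbers):
// only documents that actually contain the field should count toward its
// average length.
long sumTotalTermFreq = 10_000L; // tokens in the "title" field
long maxDoc = 10_000L;           // documents in the index
long docCount = 2_000L;          // documents that have a "title"
float byMaxDoc = (float) (sumTotalTermFreq / (double) maxDoc);     // 1.0f (too low)
float byDocCount = (float) (sumTotalTermFreq / (double) docCount); // 5.0f
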
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
    Explanation idf = termStats.length == 1
            ? idfExplain(collectionStats, termStats[0])
            : idfExplain(collectionStats, termStats);
    float avgdl = avgFieldLength(collectionStats);
    return new BM25StatsFixed(collectionStats.field(), k1, b, idf, avgdl);
}
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long df = termStats.docFreq();
    final long max = collectionStats.maxDoc();
    final float idf = idfManager.getIDF(termStats.term().utf8ToString());
    return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
}
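
// idfManager is external to this snippet; a hypothetical minimal interface it
// would need to satisfy (name and shape assumed, not taken from the source):
interface IDFManager {
    // Returns a precomputed idf for the term text, e.g. loaded from a side
    // file or a global statistics service, instead of deriving it from the
    // local index's docFreq/maxDoc.
    float getIDF(String term);
}
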
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
    Explanation idf = termStats.length == 1
            ? idfExplain(collectionStats, termStats[0])
            : idfExplain(collectionStats, termStats);
    float avgdl = avgFieldLength(collectionStats);

    // compute freq-independent part of bm25 equation across all norm values
    float[] cache = new float[256];
    for (int i = 0; i < cache.length; i++) {
        cache[i] = k1 * ((1 - b) + b * decodeNormValue((byte) i) / avgdl);
    }
    return new BM25Stats(collectionStats.field(), idf, avgdl, cache);
}
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
    long N, n;
    float idf_, avdl;
    idf_ = 0.0f; // start at zero so the phrase branch sums cleanly
    N = collectionStats.docCount();
    if (N == -1) {
        N = collectionStats.maxDoc();
    }
    avdl = (float) collectionStats.sumTotalTermFreq() / N; // cast avoids integer division
    if (termStats.length == 1) {
        n = termStats[0].docFreq();
        idf_ = idf(n, N);
    } else {
        /* computation for a phrase */
        for (final TermStatistics stat : termStats) {
            n = stat.docFreq();
            idf_ += idf(n, N);
        }
    }
    return new TFIDFWeight(collectionStats.field(), idf_, avdl);
}
public void reportCollectionStatistics() throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);

    CollectionStatistics collectionStats = searcher.collectionStatistics(Lucene4IRConstants.FIELD_ALL);
    long token_count = collectionStats.sumTotalTermFreq();
    long doc_count = collectionStats.docCount();
    long sum_doc_count = collectionStats.sumDocFreq();
    double avg_doc_length = (double) token_count / doc_count; // floating-point division keeps precision
    System.out.println("ALL: Token count: " + token_count + " Doc Count: " + doc_count
            + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);

    collectionStats = searcher.collectionStatistics(Lucene4IRConstants.FIELD_TITLE);
    token_count = collectionStats.sumTotalTermFreq();
    doc_count = collectionStats.docCount();
    sum_doc_count = collectionStats.sumDocFreq();
    avg_doc_length = (double) token_count / doc_count;
    System.out.println("TITLE: Token count: " + token_count + " Doc Count: " + doc_count
            + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);

    collectionStats = searcher.collectionStatistics(Lucene4IRConstants.FIELD_CONTENT);
    token_count = collectionStats.sumTotalTermFreq();
    doc_count = collectionStats.docCount();
    sum_doc_count = collectionStats.sumDocFreq();
    avg_doc_length = (double) token_count / doc_count;
    System.out.println("CONTENT: Token count: " + token_count + " Doc Count: " + doc_count
            + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
}
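
// The three blocks above differ only in the field constant; a sketch of how
// they could be factored into one helper (method name is illustrative):
private void reportFieldStatistics(IndexSearcher searcher, String label, String field) throws IOException {
    CollectionStatistics stats = searcher.collectionStatistics(field);
    long tokens = stats.sumTotalTermFreq();
    long docs = stats.docCount();
    System.out.println(label + ": Token count: " + tokens + " Doc Count: " + docs
            + " sum doc: " + stats.sumDocFreq()
            + " avg doc len: " + (double) tokens / docs);
}
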
@Override
public final SimWeight computeWeight(final float queryBoost, final CollectionStatistics collectionStats,
        final TermStatistics... termStats) {
    final Explanation idf = termStats.length == 1
            ? this.idfExplain(collectionStats, termStats[0])
            : this.idfExplain(collectionStats, termStats);
    return new IDFStats(collectionStats.field(), idf, queryBoost);
}
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    if (termStats.length == 1) {
        return new SimpleScore(boost, collectionStats, termStats[0]);
    } else {
        return new SimpleScore(boost, collectionStats, termStats);
    }
}
SimpleScore(float boost, CollectionStatistics collectionStats, TermStatistics[] termStats) {
    float total = 0.0f;
    List<Explanation> scores = new ArrayList<>();
    for (final TermStatistics stat : termStats) {
        String description = String.format("simple score for (%s:%s)",
                collectionStats.field(), stat.term().utf8ToString());
        scores.add(Explanation.match(1.0f, description));
        total += 1.0f;
    }
    this.score = Explanation.match(total, "total score, sum of:", scores);
    this.boost = Explanation.match(boost, "boost");
}
@Override
public SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    return new SimWeight() {
        @Override
        public void normalize(float queryNorm, float topLevelBoost) {
        }

        @Override
        public float getValueForNormalization() {
            return 0;
        }
    };
}