Java class org.apache.lucene.util.PriorityQueue: example source code
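The snippets below are collected from open-source projects and illustrate the recurring usage patterns of Lucene's fixed-capacity heap: defining the ordering with lessThan(), keeping a bounded top-N set via top()/updateTop() or insertWithOverflow(), and draining results with pop().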

Project: elasticsearch_my    File: CandidateScorer.java
private void updateTop(CandidateSet[] candidates, Candidate[] path, PriorityQueue<Correction> corrections, double cutoffScore, double score)
        throws IOException {
    score = Math.exp(score);
    assert Math.abs(score - score(path, candidates)) < 0.00001;
    if (score > cutoffScore) {
        if (corrections.size() < maxNumCorrections) {
            Candidate[] c = new Candidate[candidates.length];
            System.arraycopy(path, 0, c, 0, path.length);
            corrections.add(new Correction(score, c));
        } else if (corrections.top().compareTo(score, path) < 0) {
            Correction top = corrections.top();
            System.arraycopy(path, 0, top.candidates, 0, path.length);
            top.score = score;
            corrections.updateTop();
        }
    }
}
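The updateTop() pattern above relies on org.apache.lucene.util.PriorityQueue being an abstract, fixed-capacity heap whose ordering is supplied by lessThan(): top() is the least element currently kept, so a full top-N queue exposes its current worst entry for comparison. A minimal sketch of such a subclass, using a hypothetical ScoredEntry holder rather than any class from the projects on this page:

import org.apache.lucene.util.PriorityQueue;

// Hypothetical value holder, for illustration only.
class ScoredEntry {
    double score;
    ScoredEntry(double score) { this.score = score; }
}

// A bounded top-N queue: lessThan() defines the ordering, so the
// lowest-scoring entry sits at the top and a new candidate only has
// to beat top() to deserve a slot.
class ScoredEntryQueue extends PriorityQueue<ScoredEntry> {
    ScoredEntryQueue(int maxSize) {
        super(maxSize);
    }

    @Override
    protected boolean lessThan(ScoredEntry a, ScoredEntry b) {
        return a.score < b.score;
    }
}

With such a queue, updateTop() performs in-place replacement: once the queue is full, mutate the fields of top() and call updateTop() to restore heap order, which is cheaper than a pop()/add() pair and avoids allocating a new entry.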
Project: Elasticsearch    File: XMoreLikeThis.java
/**
 * Add the More Like This terms from this PriorityQueue to an existing boolean query
 */
private void addToQuery(PriorityQueue<ScoreTerm> q, BooleanQuery query) {
    ScoreTerm scoreTerm;
    float bestScore = -1;

    while ((scoreTerm = q.pop()) != null) {
        TermQuery tq = new TermQuery(new Term(scoreTerm.topField, scoreTerm.word));

        if (boost) {
            if (bestScore == -1) {
                bestScore = (scoreTerm.score);
            }
            float myScore = (scoreTerm.score);
            tq.setBoost(boostFactor * myScore / bestScore);
        }

        try {
            query.add(tq, BooleanClause.Occur.SHOULD);
        }
        catch (BooleanQuery.TooManyClauses ignore) {
            break;
        }
    }
}
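addToQuery() drains the queue with the standard pop() loop: pop() removes the current least element under lessThan() and returns null once the queue is empty, and bestScore is simply captured from the first popped term so that later boosts are scaled relative to it. A runnable sketch of the drain idiom, reusing the hypothetical ScoredEntry/ScoredEntryQueue classes sketched earlier:

public class DrainDemo {
    public static void main(String[] args) {
        ScoredEntryQueue queue = new ScoredEntryQueue(8);
        queue.add(new ScoredEntry(0.9));
        queue.add(new ScoredEntry(0.4));

        // Entries come out in ascending lessThan() order.
        ScoredEntry entry;
        while ((entry = queue.pop()) != null) {
            System.out.println(entry.score); // prints 0.4, then 0.9
        }
    }
}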
Project: Alix    File: MoreLikeThis.java
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum
 *          the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException
{
  Map<String, Int> termFreqMap = new HashMap<>();
  for (String fieldName : fieldNames) {
    final Terms vector = ir.getTermVector(docNum, fieldName);
    // field does not store term vector info
    if (vector == null) {
      Document d = ir.document(docNum);
      IndexableField[] fields = d.getFields(fieldName);
      for (IndexableField field : fields) {
        final String stringValue = field.stringValue();
        if (stringValue != null) {
          addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
        }
      }
    }
    else {
      addTermFrequencies(termFreqMap, vector);
    }
  }

  return createQueue(termFreqMap);
}
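As in Lucene's stock MoreLikeThis, this method prefers the stored term vector for each field and falls back to re-analyzing the stored field values only when no vector is available.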
Project: Alix    File: MoreLikeThis.java
private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> fields) throws IOException
{
  HashMap<String, Int> termFreqMap = new HashMap<>();
  for (String fieldName : fieldNames) {
    for (String field : fields.keySet()) {
      Collection<Object> fieldValues = fields.get(field);
      if (fieldValues == null)
        continue;
      for (Object fieldValue : fieldValues) {
        if (fieldValue != null) {
          addTermFrequencies(new StringReader(String.valueOf(fieldValue)), termFreqMap, fieldName);
        }
      }
    }
  }
  return createQueue(termFreqMap);
}
Project: search    File: MoreLikeThis.java
/**
 * Create the More Like This query from a PriorityQueue
 */
private Query createQuery(PriorityQueue<ScoreTerm> q) {
  BooleanQuery query = new BooleanQuery();
  ScoreTerm scoreTerm;
  float bestScore = -1;

  while ((scoreTerm = q.pop()) != null) {
    TermQuery tq = new TermQuery(new Term(scoreTerm.topField, scoreTerm.word));

    if (boost) {
      if (bestScore == -1) {
        bestScore = (scoreTerm.score);
      }
      float myScore = (scoreTerm.score);
      tq.setBoost(boostFactor * myScore / bestScore);
    }

    try {
      query.add(tq, BooleanClause.Occur.SHOULD);
    }
    catch (BooleanQuery.TooManyClauses ignore) {
      break;
    }
  }
  return query;
}
Project: NYBC    File: IntFacetResultsHandler.java
@Override
protected final int addSiblings(int ordinal, int[] siblings, PriorityQueue<FacetResultNode> pq) {
  FacetResultNode top = pq.top();
  int numResults = 0;
  while (ordinal != TaxonomyReader.INVALID_ORDINAL) {
    int value = values[ordinal];
    if (value > top.value) {
      top.value = value;
      top.ordinal = ordinal;
      top = pq.updateTop();
      ++numResults;
    }
    ordinal = siblings[ordinal];
  }
  return numResults;
}
Project: NYBC    File: FloatFacetResultsHandler.java
@Override
protected final int addSiblings(int ordinal, int[] siblings, PriorityQueue<FacetResultNode> pq) {
  FacetResultNode top = pq.top();
  int numResults = 0;
  while (ordinal != TaxonomyReader.INVALID_ORDINAL) {
    float value = values[ordinal];
    if (value > top.value) {
      top.value = value;
      top.ordinal = ordinal;
      top = pq.updateTop();
      ++numResults;
    }
    ordinal = siblings[ordinal];
  }
  return numResults;
}
Project: dash-xtf    File: GroupCounts.java
/** Construct the array of doc hits for the hit group. */
private void buildDocHits(int group, ResultGroup resultGroup) 
{
  PriorityQueue queue = hitQueue[group];
  int nFound = queue.size();
  DocHitImpl[] hitArray = new DocHitImpl[nFound];
  for (int i = 0; i < nFound; i++) {
    int index = nFound - i - 1;
    hitArray[index] = (DocHitImpl)queue.pop();
  }

  int start = startDoc[group];
  int max = maxDocs[group];

  int nHits = Math.max(0, Math.min(nFound - start, max));
  resultGroup.docHits = new DocHit[nHits];

  resultGroup.totalDocs = nDocHits(group);
  resultGroup.startDoc = start;
  resultGroup.endDoc = start + nHits;

  // Copy only the requested window; running the loop all the way to nFound
  // could overrun docHits when 'max' caps nHits.
  for (int i = start; i < start + nHits; i++)
    resultGroup.docHits[i - start] = hitArray[i];
}
Project: dash-xtf    File: MoreLikeThisQuery.java
/**
 * Create a PriorityQueue from a word->score map.
 *
 * @param words a map keyed on the word (String) with Flt score objects as the values.
 */
private PriorityQueue createQueue(IndexReader indexReader, Map words)
  throws IOException 
{
  // Will order words by score
  int queueSize = Math.min(words.size(), maxQueryTerms);
  QueryWordQueue queue = new QueryWordQueue(queueSize);

  // For each term...
  Iterator it = words.keySet().iterator();
  while (it.hasNext()) 
  {
    String word = (String)it.next();
    float score = ((Flt)words.get(word)).x;

    // Okay, add an entry to the queue.
    queue.insert(new QueryWord(word, score));
  }

  return queue;
}
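Note that queue.insert(...) appears to be the old (pre-2.9) Lucene PriorityQueue API, which returned true when the element was kept; in later Lucene versions the equivalent call is insertWithOverflow(...).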
Project: dash-xtf    File: DefaultQueryProcessor.java
public final boolean insertInto(PriorityQueue queue) 
{
  if (docHit == null)
    docHit = new DocHitImpl(doc, score);

  try 
  {
    docHit.setSpanSource(spanSrc);
    boolean inserted = queue.insert(docHit);

    // If we're keeping this hit, make sure spans have been grabbed. 
    if (inserted)
      docHit.totalSnippets();

    return inserted;
  }
  finally {
    docHit.setSpanSource(null); // prevent memory leaks
  }
}
Project: elasticsearch_my    File: CandidateScorer.java
public void findCandidates(CandidateSet[] candidates, Candidate[] path, int ord, int numMissspellingsLeft,
        PriorityQueue<Correction> corrections, double cutoffScore, final double pathScore) throws IOException {
    CandidateSet current = candidates[ord];
    if (ord == candidates.length - 1) {
        path[ord] = current.originalTerm;
        updateTop(candidates, path, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
        if (numMissspellingsLeft > 0) {
            for (int i = 0; i < current.candidates.length; i++) {
                path[ord] = current.candidates[i];
                updateTop(candidates, path, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
            }
        }
    } else {
        if (numMissspellingsLeft > 0) {
            path[ord] = current.originalTerm;
            findCandidates(candidates, path, ord + 1, numMissspellingsLeft, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
            for (int i = 0; i < current.candidates.length; i++) {
                path[ord] = current.candidates[i];
                findCandidates(candidates, path, ord + 1, numMissspellingsLeft - 1, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
            }
        } else {
            path[ord] = current.originalTerm;
            findCandidates(candidates, path, ord + 1, 0, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
        }
    }

}
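The recursion visits one CandidateSet per term position: keeping the original term preserves the misspelling budget, substituting a candidate spends one unit of numMissspellingsLeft, and every completed path is scored and offered to the corrections queue via updateTop() above.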
Project: elasticsearch_my    File: XMoreLikeThis.java
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
    Map<String, Int> termFreqMap = new HashMap<>();
    for (String fieldName : fieldNames) {
        final Fields vectors = ir.getTermVectors(docNum);
        final Terms vector;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        } else {
            vector = null;
        }

        // field does not store term vector info
        if (vector == null) {
            Document d = ir.document(docNum);
            IndexableField[] fields = d.getFields(fieldName);
            for (IndexableField field : fields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermFrequencies(new FastStringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermFrequencies(termFreqMap, vector, fieldName);
        }
    }

    return createQueue(termFreqMap);
}
Project: elasticsearch_my    File: XMoreLikeThis.java
/**
 * @see #retrieveInterestingTerms(java.io.Reader, String)
 */
public String[] retrieveInterestingTerms(int docNum) throws IOException {
    ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
    PriorityQueue<ScoreTerm> pq = retrieveTerms(docNum);
    ScoreTerm scoreTerm;
    int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
    // we just want to return the top words
    while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
        al.add(scoreTerm.word); // the 1st entry is the interesting word
    }
    String[] res = new String[al.size()];
    return al.toArray(res);
}
Project: elasticsearch_my    File: XMoreLikeThis.java
/**
 * Convenience routine to make it easy to return the most interesting words in a document.
 * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly.
 *
 * @param r the source document
 * @param fieldName field passed to analyzer to use when analyzing the content
 * @return the most interesting words in the document
 * @see #retrieveTerms(java.io.Reader, String)
 * @see #setMaxQueryTerms
 */
public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
    ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
    PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName);
    ScoreTerm scoreTerm;
    int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
    // we just want to return the top words
    while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
        al.add(scoreTerm.word); // the 1st entry is the interesting word
    }
    String[] res = new String[al.size()];
    return al.toArray(res);
}
Project: RelevancyFeedback    File: RelevancyFeedback.java
private RFQuery buildQueryFromFieldTermFrequencies(Map<String, Map<String, Flt>> fieldTermFreq, boolean contentStreamQuery) throws IOException {
    List<RFTerm> interestingTerms = new ArrayList<RFTerm>();
    for (String fieldName : fieldTermFreq.keySet()) {
        Map<String, Flt> words = fieldTermFreq.get(fieldName);
        PriorityQueue<RFTerm> queue = createQueue(fieldName, words, contentStreamQuery);
        interestingTerms.addAll(getMostInterestingTerms(queue));
    }
    return new RFQuery(interestingTerms, getMm());
}
Project: RelevancyFeedback    File: RelevancyFeedback.java
/**
 * Compute the most interesting terms from the priority queue of all RF terms.
 */
private List<RFTerm> getMostInterestingTerms(PriorityQueue<RFTerm> q) {

    int maxTerms = (maxQueryTermsPerField <= 0) ? Integer.MAX_VALUE : maxQueryTermsPerField;
    double sumSquaredBoost = 0.0;

    List<RFTerm> interestingTerms = new ArrayList<RFTerm>();
    RFTerm currentTerm;
    while ((currentTerm = q.pop()) != null
            && interestingTerms.size() < maxTerms) {
        // If boosting is disabled, the term weight is fixed at 1.0 rather than
        // tf.idf (handled inside RFTerm), so this simply adds 1 as desired.
        sumSquaredBoost += Math.pow(currentTerm.getTermWeight(), 2);
        interestingTerms.add(currentTerm);
    }

    float vectorLength = (float) Math.sqrt(sumSquaredBoost);
    if (vectorLength <= 0.0) {
        return new ArrayList<RFTerm>();
    }

    if (this.isNormalizeFieldBoosts()) {
        for (RFTerm term : interestingTerms) {
            term.setVectorLength(vectorLength);
        }
    }
    return interestingTerms;
}
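vectorLength here is the L2 norm of the kept term weights (the square root of the sum of squared weights); when isNormalizeFieldBoosts() is enabled, each term receives this length, presumably so the per-field boosts can be scaled to a unit vector.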
Project: Alix    File: MoreLikeThis.java
/**
 * @see #retrieveInterestingTerms(java.io.Reader, String)
 */
public String[] retrieveInterestingTerms(int docNum) throws IOException
{
  ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
  PriorityQueue<ScoreTerm> pq = retrieveTerms(docNum);
  ScoreTerm scoreTerm;
  int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not
                           // useful to our caller...
  // we just want to return the top words
  while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
    al.add(scoreTerm.word); // the 1st entry is the interesting word
  }
  String[] res = new String[al.size()];
  return al.toArray(res);
}
Project: Alix    File: MoreLikeThis.java
/**
 * Convenience routine to make it easy to return the most interesting words in a
 * document. More advanced users will call {@link #retrieveTerms(Reader, String)
 * retrieveTerms()} directly.
 *
 * @param r
 *          the source document
 * @param fieldName
 *          field passed to analyzer to use when analyzing the content
 * @return the most interesting words in the document
 * @see #retrieveTerms(java.io.Reader, String)
 * @see #setMaxQueryTerms
 */
public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException
{
  ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
  PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName);
  ScoreTerm scoreTerm;
  int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not
                           // useful to our caller...
  // we just want to return the top words
  while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
    al.add(scoreTerm.word); // the 1st entry is the interesting word
  }
  String[] res = new String[al.size()];
  return al.toArray(res);
}
Project: search    File: QualityQueriesFinder.java
private String[] bestTerms(String field, int numTerms) throws IOException {
  PriorityQueue<TermDf> pq = new TermsDfQueue(numTerms);
  IndexReader ir = DirectoryReader.open(dir);
  try {
    int threshold = ir.maxDoc() / 10; // ignore words that are too common
    Terms terms = MultiFields.getTerms(ir, field);
    if (terms != null) {
      TermsEnum termsEnum = terms.iterator(null);
      while (termsEnum.next() != null) {
        int df = termsEnum.docFreq();
        if (df < threshold) {
          String ttxt = termsEnum.term().utf8ToString();
          pq.insertWithOverflow(new TermDf(ttxt, df));
        }
      }
    }
  } finally {
    ir.close();
  }
  String[] res = new String[pq.size()];
  int i = 0;
  while (pq.size() > 0) {
    TermDf tdf = pq.pop();
    res[i++] = tdf.word;
    System.out.println(i + ".   word:  " + tdf.df + "   " + tdf.word);
  }
  return res;
}
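insertWithOverflow() is the key call here: it never grows the queue past its capacity and instead reports what fell off the heap. A small sketch of that contract, again using the hypothetical ScoredEntryQueue from earlier:

ScoredEntryQueue best = new ScoredEntryQueue(2); // keep the two highest scores
for (double s : new double[] { 0.3, 0.9, 0.1, 0.7 }) {
    ScoredEntry overflow = best.insertWithOverflow(new ScoredEntry(s));
    // overflow == null      -> the queue still had room (0.3 and 0.9)
    // overflow == the input -> the score was too low to make the cut (0.1)
    // anything else         -> a kept entry was displaced (0.3 pushed out by 0.7)
}
// best now holds 0.7 and 0.9; top() is 0.7, the smallest survivor.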
Project: search    File: MoreLikeThis.java
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
  Map<String, Int> termFreqMap = new HashMap<>();
  for (String fieldName : fieldNames) {
    final Fields vectors = ir.getTermVectors(docNum);
    final Terms vector;
    if (vectors != null) {
      vector = vectors.terms(fieldName);
    } else {
      vector = null;
    }

    // field does not store term vector info
    if (vector == null) {
      Document d = ir.document(docNum);
      IndexableField[] fields = d.getFields(fieldName);
      for (IndexableField field : fields) {
        final String stringValue = field.stringValue();
        if (stringValue != null) {
          addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
        }
      }
    } else {
      addTermFrequencies(termFreqMap, vector);
    }
  }

  return createQueue(termFreqMap);
}
Project: search    File: MoreLikeThis.java
/**
 * @see #retrieveInterestingTerms(java.io.Reader, String)
 */
public String[] retrieveInterestingTerms(int docNum) throws IOException {
  ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
  PriorityQueue<ScoreTerm> pq = retrieveTerms(docNum);
  ScoreTerm scoreTerm;
  int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
  // we just want to return the top words
  while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
    al.add(scoreTerm.word); // the 1st entry is the interesting word
  }
  String[] res = new String[al.size()];
  return al.toArray(res);
}
Project: search    File: MoreLikeThis.java
/**
 * Convenience routine to make it easy to return the most interesting words in a document.
 * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly.
 *
 * @param r the source document
 * @param fieldName field passed to analyzer to use when analyzing the content
 * @return the most interesting words in the document
 * @see #retrieveTerms(java.io.Reader, String)
 * @see #setMaxQueryTerms
 */
public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
  ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
  PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName);
  ScoreTerm scoreTerm;
  int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
  // we just want to return the top words
  while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
    al.add(scoreTerm.word); // the 1st entry is the interesting word
  }
  String[] res = new String[al.size()];
  return al.toArray(res);
}
Project: search    File: CommonTermsQueryTest.java
private static List<TermAndFreq> queueToList(PriorityQueue<TermAndFreq> queue) {
  List<TermAndFreq> terms = new ArrayList<>();
  while (queue.size() > 0) {
    terms.add(queue.pop());
  }
  return terms;
}
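Because pop() always removes the least element first, the returned list is in ascending lessThan() order.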
Project: SolrPlugins    File: MoreLikeThis.java
private MLTQuery buildQueryFromFieldTermFrequencies(Map<String, Map<String, Flt>> fieldTermFreq, boolean contentStreamQuery) throws IOException {
    List<MLTTerm> interestingTerms = new ArrayList<MLTTerm>();
    for (String fieldName : fieldTermFreq.keySet()) {
        Map<String, Flt> words = fieldTermFreq.get(fieldName);
        PriorityQueue<MLTTerm> queue = createQueue(fieldName, words, contentStreamQuery);
        interestingTerms.addAll(getMostInterestingTerms(queue));
    }
    return new MLTQuery(interestingTerms, getMm());
}
Project: SolrPlugins    File: MoreLikeThis.java
/**
 * Compute the most interesting terms from the priority queue of all MLT terms.
 */
private List<MLTTerm> getMostInterestingTerms(PriorityQueue<MLTTerm> q) {

    int maxTerms = (maxQueryTermsPerField <= 0) ? Integer.MAX_VALUE : maxQueryTermsPerField;
    double sumSquaredBoost = 0.0;

    List<MLTTerm> interestingTerms = new ArrayList<MLTTerm>();
    MLTTerm currentTerm;
    while ((currentTerm = q.pop()) != null
            && interestingTerms.size() < maxTerms) {
        // If boosting is disabled, the term weight is fixed at 1.0 rather than
        // tf.idf (handled inside MLTTerm), so this simply adds 1 as desired.
        sumSquaredBoost += Math.pow(currentTerm.getTermWeight(), 2);
        interestingTerms.add(currentTerm);
    }

    float vectorLength = (float) Math.sqrt(sumSquaredBoost);
    if (vectorLength <= 0.0) {
        return new ArrayList<MLTTerm>();
    }

    if (this.isNormalizeFieldBoosts()) {
        for (MLTTerm term : interestingTerms) {
            term.setVectorLength(vectorLength);
        }
    }
    return interestingTerms;
}
Project: NYBC    File: MoreLikeThis.java
/**
 * Create the More Like This query from a PriorityQueue
 */
private Query createQuery(PriorityQueue<Object[]> q) {
  BooleanQuery query = new BooleanQuery();
  Object cur;
  int qterms = 0;
  float bestScore = 0;

  while ((cur = q.pop()) != null) {
    Object[] ar = (Object[]) cur;
    TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));

    if (boost) {
      if (qterms == 0) {
        bestScore = ((Float) ar[2]);
      }
      float myScore = ((Float) ar[2]);

      tq.setBoost(boostFactor * myScore / bestScore);
    }

    try {
      query.add(tq, BooleanClause.Occur.SHOULD);
    }
    catch (BooleanQuery.TooManyClauses ignore) {
      break;
    }

    qterms++;
    if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
      break;
    }
  }

  return query;
}
Project: NYBC    File: MoreLikeThis.java
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
public PriorityQueue<Object[]> retrieveTerms(int docNum) throws IOException {
  Map<String, Int> termFreqMap = new HashMap<String, Int>();
  for (String fieldName : fieldNames) {
    final Fields vectors = ir.getTermVectors(docNum);
    final Terms vector;
    if (vectors != null) {
      vector = vectors.terms(fieldName);
    } else {
      vector = null;
    }

    // field does not store term vector info
    if (vector == null) {
      Document d = ir.document(docNum);
      IndexableField[] fields = d.getFields(fieldName);
      for (IndexableField field : fields) {
        final String stringValue = field.stringValue();
        if (stringValue != null) {
          addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
        }
      }
    } else {
      addTermFrequencies(termFreqMap, vector);
    }
  }

  return createQueue(termFreqMap);
}
Project: NYBC    File: MoreLikeThis.java
/**
 * @see #retrieveInterestingTerms(java.io.Reader, String)
 */
public String[] retrieveInterestingTerms(int docNum) throws IOException {
  ArrayList<Object> al = new ArrayList<Object>(maxQueryTerms);
  PriorityQueue<Object[]> pq = retrieveTerms(docNum);
  Object cur;
  int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
  // we just want to return the top words
  while (((cur = pq.pop()) != null) && lim-- > 0) {
    Object[] ar = (Object[]) cur;
    al.add(ar[0]); // the 1st entry is the interesting word
  }
  String[] res = new String[al.size()];
  return al.toArray(res);
}
Project: NYBC    File: MoreLikeThis.java
/**
 * Convenience routine to make it easy to return the most interesting words in a document.
 * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly.
 *
 * @param r the source document
 * @param fieldName field passed to analyzer to use when analyzing the content
 * @return the most interesting words in the document
 * @see #retrieveTerms(java.io.Reader, String)
 * @see #setMaxQueryTerms
 */
public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
  ArrayList<Object> al = new ArrayList<Object>(maxQueryTerms);
  PriorityQueue<Object[]> pq = retrieveTerms(r, fieldName);
  Object cur;
  int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
  // we just want to return the top words
  while (((cur = pq.pop()) != null) && lim-- > 0) {
    Object[] ar = (Object[]) cur;
    al.add(ar[0]); // the 1st entry is the interesting word
  }
  String[] res = new String[al.size()];
  return al.toArray(res);
}
Project: NYBC    File: TopKInEachNodeHandler.java
@Override
// Verifies that the children of each node are sorted in the order
// specified by the facetRequest. The values in these nodes may have
// changed due to a re-count (for example, following accumulation by
// Sampling), so we test and re-order here if necessary.
public FacetResult rearrangeFacetResult(FacetResult facetResult) {
  PriorityQueue<FacetResultNode> nodesHeap = 
    new ResultNodeHeap(this.facetRequest.numResults, this.getSuitableACComparator());
  FacetResultNode topFrn = facetResult.getFacetResultNode();
  rearrangeChilrenOfNode(topFrn, nodesHeap);
  return facetResult;
}
Project: dash-xtf    File: GroupCounts.java
/** Construct an object with all counts at zero */
public GroupCounts(GroupData groupData, FacetSpec spec,
                   HitQueueMaker hitQueueMaker) 
{
  // Record the input parameters for later use
  this.data = groupData;
  this.spec = spec;
  this.hitQueueMaker = hitQueueMaker;

  // Allocate our arrays of counts and such
  if (!data.isDynamic()) {
    count = new int[data.nGroups()];
    score = new float[data.nGroups()];
  }
  mark = new int[data.nGroups()];
  selection = new int[data.nGroups()];
  startDoc = new int[data.nGroups()];
  maxDocs = new int[data.nGroups()];
  hitQueue = new PriorityQueue[data.nGroups()];

  // For dynamic data, we can perform the final sort and selection
  // right now, since the group counts and scores are known.
  //
  if (data.isDynamic())
    sortAndSelect();

  // For static data, make a conservative selection.
  else
    conservativePrep();
}
Project: dash-xtf    File: MoreLikeThisQuery.java
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue retrieveTerms(IndexReader indexReader, int docNum,
                                    Analyzer analyzer)
  throws IOException 
{
  // Gather term frequencies for all fields.
  Map termFreqMap = new HashMap();
  Document d = indexReader.document(docNum);

  for (int i = 0; i < fieldNames.length; i++) 
  {
    String fieldName = fieldNames[i];
    String[] text = d.getValues(fieldName);
    if (text == null)
      continue;

    for (int j = 0; j < text.length; j++) {
      TokenStream tokens = analyzer.tokenStream(fieldName,
                                                new StringReader(text[j]));
      addTermFrequencies(tokens, fieldName, termFreqMap);
    } // for j
  } // for i

  // Combine like terms from each field and calculate a score for each.
  Map termScoreMap = condenseTerms(indexReader, termFreqMap);

  // Finally, make a queue by score.
  return createQueue(indexReader, termScoreMap);
}