private void updateTop(CandidateSet[] candidates, Candidate[] path, PriorityQueue<Correction> corrections,
    double cutoffScore, double score) throws IOException {
  score = Math.exp(score);
  assert Math.abs(score - score(path, candidates)) < 0.00001;
  if (score > cutoffScore) {
    if (corrections.size() < maxNumCorrections) {
      Candidate[] c = new Candidate[candidates.length];
      System.arraycopy(path, 0, c, 0, path.length);
      corrections.add(new Correction(score, c));
    } else if (corrections.top().compareTo(score, path) < 0) {
      Correction top = corrections.top();
      System.arraycopy(path, 0, top.candidates, 0, path.length);
      top.score = score;
      corrections.updateTop();
    }
  }
}
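Several snippets below rely on the same bounded top-N idiom as updateTop() above: Lucene's PriorityQueue is a min-heap under its lessThan() method, so once the queue is full, top() is the weakest entry; mutating it in place and calling updateTop() re-sinks it without a pop/add pair. A minimal sketch with an assumed ScoredEntry holder (not one of the types in these snippets):

import org.apache.lucene.util.PriorityQueue;

class ScoredEntry {
  double score;
  ScoredEntry(double score) { this.score = score; }
}

class TopNQueue extends PriorityQueue<ScoredEntry> {
  TopNQueue(int maxSize) { super(maxSize); }

  @Override
  protected boolean lessThan(ScoredEntry a, ScoredEntry b) {
    return a.score < b.score; // the weakest entry stays at top()
  }

  // Replace-top pattern, assuming the queue was pre-filled to capacity:
  void offer(double candidateScore) {
    ScoredEntry top = top();
    if (candidateScore > top.score) {
      top.score = candidateScore; // overwrite the weakest entry in place
      updateTop();                // re-sink it to restore heap order
    }
  }
}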
/**
 * Add to an existing boolean query the More Like This query from this PriorityQueue
 */
private void addToQuery(PriorityQueue<ScoreTerm> q, BooleanQuery query) {
  ScoreTerm scoreTerm;
  float bestScore = -1;
  while ((scoreTerm = q.pop()) != null) {
    TermQuery tq = new TermQuery(new Term(scoreTerm.topField, scoreTerm.word));
    if (boost) {
      if (bestScore == -1) {
        bestScore = scoreTerm.score;
      }
      float myScore = scoreTerm.score;
      tq.setBoost(boostFactor * myScore / bestScore);
    }
    try {
      query.add(tq, BooleanClause.Occur.SHOULD);
    } catch (BooleanQuery.TooManyClauses ignore) {
      break;
    }
  }
}
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
  Map<String, Int> termFreqMap = new HashMap<>();
  for (String fieldName : fieldNames) {
    final Terms vector = ir.getTermVector(docNum, fieldName);
    // field does not store term vector info
    if (vector == null) {
      Document d = ir.document(docNum);
      IndexableField[] fields = d.getFields(fieldName);
      for (IndexableField field : fields) {
        final String stringValue = field.stringValue();
        if (stringValue != null) {
          addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
        }
      }
    } else {
      addTermFrequencies(termFreqMap, vector);
    }
  }
  return createQueue(termFreqMap);
}
private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> fields) throws IOException {
  HashMap<String, Int> termFreqMap = new HashMap<>();
  for (String fieldName : fieldNames) {
    // Only analyze the values supplied for this field; iterating every key of
    // the map here would attribute other fields' values to fieldName.
    Collection<Object> fieldValues = fields.get(fieldName);
    if (fieldValues == null) {
      continue;
    }
    for (Object fieldValue : fieldValues) {
      if (fieldValue != null) {
        addTermFrequencies(new StringReader(String.valueOf(fieldValue)), termFreqMap, fieldName);
      }
    }
  }
  return createQueue(termFreqMap);
}
/**
 * Create the More like query from a PriorityQueue
 */
private Query createQuery(PriorityQueue<ScoreTerm> q) {
  BooleanQuery query = new BooleanQuery();
  ScoreTerm scoreTerm;
  float bestScore = -1;
  while ((scoreTerm = q.pop()) != null) {
    TermQuery tq = new TermQuery(new Term(scoreTerm.topField, scoreTerm.word));
    if (boost) {
      if (bestScore == -1) {
        bestScore = scoreTerm.score;
      }
      float myScore = scoreTerm.score;
      tq.setBoost(boostFactor * myScore / bestScore);
    }
    try {
      query.add(tq, BooleanClause.Occur.SHOULD);
    } catch (BooleanQuery.TooManyClauses ignore) {
      break;
    }
  }
  return query;
}
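A hedged usage sketch of where a query built this way typically surfaces: MoreLikeThis exposes it through its public like() methods. Note the boost math above assumes the queue pops its highest-scoring term first, so bestScore is the maximum and every boost lands in (0, boostFactor]. Here reader, searcher, and docNum are assumed to be in scope:

MoreLikeThis mlt = new MoreLikeThis(reader);
mlt.setFieldNames(new String[] { "body" });
mlt.setBoost(true); // enables the per-term boosting shown in createQuery()
Query like = mlt.like(docNum);
TopDocs hits = searcher.search(like, 10);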
@Override
protected final int addSiblings(int ordinal, int[] siblings, PriorityQueue<FacetResultNode> pq) {
  FacetResultNode top = pq.top();
  int numResults = 0;
  while (ordinal != TaxonomyReader.INVALID_ORDINAL) {
    int value = values[ordinal];
    if (value > top.value) {
      top.value = value;
      top.ordinal = ordinal;
      top = pq.updateTop();
      ++numResults;
    }
    ordinal = siblings[ordinal];
  }
  return numResults;
}
@Override
protected final int addSiblings(int ordinal, int[] siblings, PriorityQueue<FacetResultNode> pq) {
  FacetResultNode top = pq.top();
  int numResults = 0;
  while (ordinal != TaxonomyReader.INVALID_ORDINAL) {
    float value = values[ordinal];
    if (value > top.value) {
      top.value = value;
      top.ordinal = ordinal;
      top = pq.updateTop();
      ++numResults;
    }
    ordinal = siblings[ordinal];
  }
  return numResults;
}
/** Construct the array of doc hits for the hit group. */
private void buildDocHits(int group, ResultGroup resultGroup) {
  PriorityQueue queue = hitQueue[group];
  int nFound = queue.size();
  DocHitImpl[] hitArray = new DocHitImpl[nFound];

  // The queue pops weakest-first, so fill the array from the back to get
  // hits in descending order.
  for (int i = 0; i < nFound; i++) {
    int index = nFound - i - 1;
    hitArray[index] = (DocHitImpl) queue.pop();
  }

  int start = startDoc[group];
  int max = maxDocs[group];
  int nHits = Math.max(0, Math.min(nFound - start, max));
  resultGroup.docHits = new DocHit[nHits];
  resultGroup.totalDocs = nDocHits(group);
  resultGroup.startDoc = start;
  resultGroup.endDoc = start + nHits;

  // Copy only the requested window, so the loop cannot run past nHits.
  for (int i = start; i < start + nHits; i++)
    resultGroup.docHits[i - start] = hitArray[i];
}
/**
 * Create a PriorityQueue from a word->score map.
 *
 * @param words a map of words keyed on the word(String) with Flt objects as the values.
 */
private PriorityQueue createQueue(IndexReader indexReader, Map words) throws IOException {
  // Will order words by score
  int queueSize = Math.min(words.size(), maxQueryTerms);
  QueryWordQueue queue = new QueryWordQueue(queueSize);

  // For each term...
  Iterator it = words.keySet().iterator();
  while (it.hasNext()) {
    String word = (String) it.next();
    float score = ((Flt) words.get(word)).x;

    // Okay, add an entry to the queue.
    queue.insert(new QueryWord(word, score));
  }
  return queue;
}
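QueryWordQueue and QueryWord are not defined in these snippets; a hypothetical sketch in the same legacy raw-type style, where lessThan() keeps the lowest-scoring word at top() so insert() on a full queue can evict it:

// Hypothetical word/score holder.
class QueryWord {
  String word;
  float score;
  QueryWord(String word, float score) { this.word = word; this.score = score; }
}

// Hypothetical bounded queue ordered by score, using the pre-4.0
// PriorityQueue API in which subclasses call initialize(maxSize) themselves.
class QueryWordQueue extends org.apache.lucene.util.PriorityQueue {
  QueryWordQueue(int maxSize) { initialize(maxSize); }

  protected boolean lessThan(Object a, Object b) {
    return ((QueryWord) a).score < ((QueryWord) b).score;
  }
}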
public final boolean insertInto(PriorityQueue queue) {
  if (docHit == null)
    docHit = new DocHitImpl(doc, score);
  try {
    docHit.setSpanSource(spanSrc);
    boolean inserted = queue.insert(docHit);

    // If we're keeping this hit, make sure spans have been grabbed.
    if (inserted)
      docHit.totalSnippets();
    return inserted;
  } finally {
    docHit.setSpanSource(null); // prevent memory leaks
  }
}
public void findCandidates(CandidateSet[] candidates, Candidate[] path, int ord, int numMisspellingsLeft,
    PriorityQueue<Correction> corrections, double cutoffScore, final double pathScore) throws IOException {
  CandidateSet current = candidates[ord];
  if (ord == candidates.length - 1) {
    path[ord] = current.originalTerm;
    updateTop(candidates, path, corrections, cutoffScore,
        pathScore + scorer.score(path, candidates, ord, gramSize));
    if (numMisspellingsLeft > 0) {
      for (int i = 0; i < current.candidates.length; i++) {
        path[ord] = current.candidates[i];
        updateTop(candidates, path, corrections, cutoffScore,
            pathScore + scorer.score(path, candidates, ord, gramSize));
      }
    }
  } else {
    if (numMisspellingsLeft > 0) {
      path[ord] = current.originalTerm;
      findCandidates(candidates, path, ord + 1, numMisspellingsLeft, corrections, cutoffScore,
          pathScore + scorer.score(path, candidates, ord, gramSize));
      for (int i = 0; i < current.candidates.length; i++) {
        path[ord] = current.candidates[i];
        findCandidates(candidates, path, ord + 1, numMisspellingsLeft - 1, corrections, cutoffScore,
            pathScore + scorer.score(path, candidates, ord, gramSize));
      }
    } else {
      path[ord] = current.originalTerm;
      findCandidates(candidates, path, ord + 1, 0, corrections, cutoffScore,
          pathScore + scorer.score(path, candidates, ord, gramSize));
    }
  }
}
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
  Map<String, Int> termFreqMap = new HashMap<>();
  for (String fieldName : fieldNames) {
    final Fields vectors = ir.getTermVectors(docNum);
    final Terms vector;
    if (vectors != null) {
      vector = vectors.terms(fieldName);
    } else {
      vector = null;
    }
    // field does not store term vector info
    if (vector == null) {
      Document d = ir.document(docNum);
      IndexableField[] fields = d.getFields(fieldName);
      for (IndexableField field : fields) {
        final String stringValue = field.stringValue();
        if (stringValue != null) {
          addTermFrequencies(new FastStringReader(stringValue), termFreqMap, fieldName);
        }
      }
    } else {
      addTermFrequencies(termFreqMap, vector, fieldName);
    }
  }
  return createQueue(termFreqMap);
}
/**
 * @see #retrieveInterestingTerms(java.io.Reader, String)
 */
public String[] retrieveInterestingTerms(int docNum) throws IOException {
  ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
  PriorityQueue<ScoreTerm> pq = retrieveTerms(docNum);
  ScoreTerm scoreTerm;
  int lim = maxQueryTerms;
  // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
  // we just want to return the top words
  while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
    al.add(scoreTerm.word); // the 1st entry is the interesting word
  }
  String[] res = new String[al.size()];
  return al.toArray(res);
}
/**
 * Convenience routine to make it easy to return the most interesting words in a document.
 * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly.
 *
 * @param r the source document
 * @param fieldName field passed to analyzer to use when analyzing the content
 * @return the most interesting words in the document
 * @see #retrieveTerms(java.io.Reader, String)
 * @see #setMaxQueryTerms
 */
public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
  ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
  PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName);
  ScoreTerm scoreTerm;
  int lim = maxQueryTerms;
  // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
  // we just want to return the top words
  while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
    al.add(scoreTerm.word); // the 1st entry is the interesting word
  }
  String[] res = new String[al.size()];
  return al.toArray(res);
}
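A hedged usage sketch of the two convenience methods above; reader, analyzer, docNum, and text are assumed to be in scope:

MoreLikeThis mlt = new MoreLikeThis(reader);
mlt.setAnalyzer(analyzer); // needed when stored text is analyzed instead of term vectors
mlt.setFieldNames(new String[] { "body" });
mlt.setMaxQueryTerms(25);
String[] fromIndexedDoc = mlt.retrieveInterestingTerms(docNum);
String[] fromFreeText = mlt.retrieveInterestingTerms(new StringReader(text), "body");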
private RFQuery buildQueryFromFieldTermFrequencies(Map<String, Map<String, Flt>> fieldTermFreq,
    boolean contentStreamQuery) throws IOException {
  List<RFTerm> interestingTerms = new ArrayList<RFTerm>();
  for (String fieldName : fieldTermFreq.keySet()) {
    Map<String, Flt> words = fieldTermFreq.get(fieldName);
    PriorityQueue<RFTerm> queue = createQueue(fieldName, words, contentStreamQuery);
    interestingTerms.addAll(getMostInterestingTerms(queue));
  }
  RFQuery rfResult = new RFQuery(interestingTerms, getMm());
  return rfResult;
}
/**
 * Compute the top most interesting terms from the priority queue of all RF Terms
 */
private List<RFTerm> getMostInterestingTerms(PriorityQueue<RFTerm> q) {
  int maxTerms = (maxQueryTermsPerField <= 0) ? Integer.MAX_VALUE : maxQueryTermsPerField;
  double sumSquaredBoost = 0.0;
  List<RFTerm> interestingTerms = new ArrayList<RFTerm>();
  RFTerm currentTerm = null;
  while ((currentTerm = q.pop()) != null && interestingTerms.size() < maxTerms) {
    // When boosting is off, RFTerm reports a weight of 1.0 instead of tf.idf,
    // so this accumulates a plain count in that case.
    sumSquaredBoost += Math.pow(currentTerm.getTermWeight(), 2);
    interestingTerms.add(currentTerm);
  }

  float vectorLength = (float) Math.sqrt(sumSquaredBoost);
  if (vectorLength <= 0.0) {
    return new ArrayList<RFTerm>();
  }

  if (this.isNormalizeFieldBoosts()) {
    for (RFTerm term : interestingTerms) {
      term.setVectorLength(vectorLength);
    }
  }
  return interestingTerms;
}
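A small worked example of the normalization step above, in plain arithmetic: per-term weights of 3.0 and 4.0 give vectorLength = sqrt(9 + 16) = 5, so the normalized boosts 0.6 and 0.8 form a unit-length boost vector for the field.

float[] weights = { 3f, 4f };
double sumSquared = 0;
for (float w : weights) {
  sumSquared += w * w;
}
float vectorLength = (float) Math.sqrt(sumSquared); // 5.0
for (float w : weights) {
  System.out.println(w / vectorLength); // prints 0.6, then 0.8
}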
/**
 * @see #retrieveInterestingTerms(java.io.Reader, String)
 */
public String[] retrieveInterestingTerms(int docNum) throws IOException {
  ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
  PriorityQueue<ScoreTerm> pq = retrieveTerms(docNum);
  ScoreTerm scoreTerm;
  int lim = maxQueryTerms;
  // have to be careful, retrieveTerms returns all words but that's probably not
  // useful to our caller... we just want to return the top words
  while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
    al.add(scoreTerm.word); // the 1st entry is the interesting word
  }
  String[] res = new String[al.size()];
  return al.toArray(res);
}
/**
 * Convenience routine to make it easy to return the most interesting words in a
 * document. More advanced users will call {@link #retrieveTerms(Reader, String)
 * retrieveTerms()} directly.
 *
 * @param r the source document
 * @param fieldName field passed to analyzer to use when analyzing the content
 * @return the most interesting words in the document
 * @see #retrieveTerms(java.io.Reader, String)
 * @see #setMaxQueryTerms
 */
public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
  ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
  PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName);
  ScoreTerm scoreTerm;
  int lim = maxQueryTerms;
  // have to be careful, retrieveTerms returns all words but that's probably not
  // useful to our caller... we just want to return the top words
  while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
    al.add(scoreTerm.word); // the 1st entry is the interesting word
  }
  String[] res = new String[al.size()];
  return al.toArray(res);
}
private String[] bestTerms(String field, int numTerms) throws IOException {
  PriorityQueue<TermDf> pq = new TermsDfQueue(numTerms);
  IndexReader ir = DirectoryReader.open(dir);
  try {
    int threshold = ir.maxDoc() / 10; // ignore words too common.
    Terms terms = MultiFields.getTerms(ir, field);
    if (terms != null) {
      TermsEnum termsEnum = terms.iterator(null);
      while (termsEnum.next() != null) {
        int df = termsEnum.docFreq();
        if (df < threshold) {
          String ttxt = termsEnum.term().utf8ToString();
          pq.insertWithOverflow(new TermDf(ttxt, df));
        }
      }
    }
  } finally {
    ir.close();
  }
  String[] res = new String[pq.size()];
  int i = 0;
  while (pq.size() > 0) {
    TermDf tdf = pq.pop();
    res[i++] = tdf.word;
    System.out.println(i + ". word: " + tdf.df + " " + tdf.word);
  }
  return res;
}
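TermDf and TermsDfQueue are not shown in this set; a minimal sketch under that assumption. With the lowest document frequency at top(), insertWithOverflow() evicts the weakest entry on each overflow, so after the scan the queue holds the numTerms terms with the highest df:

// Hypothetical term/df holder matching the field accesses above.
class TermDf {
  final String word;
  final int df;
  TermDf(String word, int df) { this.word = word; this.df = df; }
}

// Hypothetical bounded queue keeping the highest-df terms.
class TermsDfQueue extends org.apache.lucene.util.PriorityQueue<TermDf> {
  TermsDfQueue(int maxSize) { super(maxSize); }

  @Override
  protected boolean lessThan(TermDf a, TermDf b) {
    return a.df < b.df; // rarest term sits at top(), first to be evicted
  }
}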
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
  Map<String, Int> termFreqMap = new HashMap<>();
  for (String fieldName : fieldNames) {
    final Fields vectors = ir.getTermVectors(docNum);
    final Terms vector;
    if (vectors != null) {
      vector = vectors.terms(fieldName);
    } else {
      vector = null;
    }
    // field does not store term vector info
    if (vector == null) {
      Document d = ir.document(docNum);
      IndexableField[] fields = d.getFields(fieldName);
      for (IndexableField field : fields) {
        final String stringValue = field.stringValue();
        if (stringValue != null) {
          addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
        }
      }
    } else {
      addTermFrequencies(termFreqMap, vector);
    }
  }
  return createQueue(termFreqMap);
}
private static List<TermAndFreq> queueToList(PriorityQueue<TermAndFreq> queue) {
  List<TermAndFreq> terms = new ArrayList<>();
  while (queue.size() > 0) {
    terms.add(queue.pop());
  }
  return terms;
}
private MLTQuery buildQueryFromFieldTermFrequencies(Map<String, Map<String, Flt>> fieldTermFreq,
    boolean contentStreamQuery) throws IOException {
  List<MLTTerm> interestingTerms = new ArrayList<MLTTerm>();
  for (String fieldName : fieldTermFreq.keySet()) {
    Map<String, Flt> words = fieldTermFreq.get(fieldName);
    PriorityQueue<MLTTerm> queue = createQueue(fieldName, words, contentStreamQuery);
    interestingTerms.addAll(getMostInterestingTerms(queue));
  }
  MLTQuery mltResult = new MLTQuery(interestingTerms, getMm());
  return mltResult;
}
/**
 * Compute the top most interesting terms from the priority queue of all MLT Terms
 */
private List<MLTTerm> getMostInterestingTerms(PriorityQueue<MLTTerm> q) {
  int maxTerms = (maxQueryTermsPerField <= 0) ? Integer.MAX_VALUE : maxQueryTermsPerField;
  double sumSquaredBoost = 0.0;
  List<MLTTerm> interestingTerms = new ArrayList<MLTTerm>();
  MLTTerm currentTerm = null;
  while ((currentTerm = q.pop()) != null && interestingTerms.size() < maxTerms) {
    // When boosting is off, MLTTerm reports a weight of 1.0 instead of tf.idf,
    // so this accumulates a plain count in that case.
    sumSquaredBoost += Math.pow(currentTerm.getTermWeight(), 2);
    interestingTerms.add(currentTerm);
  }

  float vectorLength = (float) Math.sqrt(sumSquaredBoost);
  if (vectorLength <= 0.0) {
    return new ArrayList<MLTTerm>();
  }

  if (this.isNormalizeFieldBoosts()) {
    for (MLTTerm term : interestingTerms) {
      term.setVectorLength(vectorLength);
    }
  }
  return interestingTerms;
}
/**
 * Create the More like query from a PriorityQueue
 */
private Query createQuery(PriorityQueue<Object[]> q) {
  BooleanQuery query = new BooleanQuery();
  Object cur;
  int qterms = 0;
  float bestScore = 0;
  while ((cur = q.pop()) != null) {
    Object[] ar = (Object[]) cur;
    TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));
    if (boost) {
      if (qterms == 0) {
        bestScore = ((Float) ar[2]);
      }
      float myScore = ((Float) ar[2]);
      tq.setBoost(boostFactor * myScore / bestScore);
    }
    try {
      query.add(tq, BooleanClause.Occur.SHOULD);
    } catch (BooleanQuery.TooManyClauses ignore) {
      break;
    }
    qterms++;
    if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
      break;
    }
  }
  return query;
}
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
public PriorityQueue<Object[]> retrieveTerms(int docNum) throws IOException {
  Map<String, Int> termFreqMap = new HashMap<String, Int>();
  for (String fieldName : fieldNames) {
    final Fields vectors = ir.getTermVectors(docNum);
    final Terms vector;
    if (vectors != null) {
      vector = vectors.terms(fieldName);
    } else {
      vector = null;
    }
    // field does not store term vector info
    if (vector == null) {
      Document d = ir.document(docNum);
      IndexableField[] fields = d.getFields(fieldName);
      for (IndexableField field : fields) {
        final String stringValue = field.stringValue();
        if (stringValue != null) {
          addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
        }
      }
    } else {
      addTermFrequencies(termFreqMap, vector);
    }
  }
  return createQueue(termFreqMap);
}
/**
 * @see #retrieveInterestingTerms(java.io.Reader, String)
 */
public String[] retrieveInterestingTerms(int docNum) throws IOException {
  ArrayList<Object> al = new ArrayList<Object>(maxQueryTerms);
  PriorityQueue<Object[]> pq = retrieveTerms(docNum);
  Object cur;
  int lim = maxQueryTerms;
  // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
  // we just want to return the top words
  while (((cur = pq.pop()) != null) && lim-- > 0) {
    Object[] ar = (Object[]) cur;
    al.add(ar[0]); // the 1st entry is the interesting word
  }
  String[] res = new String[al.size()];
  return al.toArray(res);
}
/**
 * Convenience routine to make it easy to return the most interesting words in a document.
 * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly.
 *
 * @param r the source document
 * @param fieldName field passed to analyzer to use when analyzing the content
 * @return the most interesting words in the document
 * @see #retrieveTerms(java.io.Reader, String)
 * @see #setMaxQueryTerms
 */
public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
  ArrayList<Object> al = new ArrayList<Object>(maxQueryTerms);
  PriorityQueue<Object[]> pq = retrieveTerms(r, fieldName);
  Object cur;
  int lim = maxQueryTerms;
  // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
  // we just want to return the top words
  while (((cur = pq.pop()) != null) && lim-- > 0) {
    Object[] ar = (Object[]) cur;
    al.add(ar[0]); // the 1st entry is the interesting word
  }
  String[] res = new String[al.size()];
  return al.toArray(res);
}
private static List<TermAndFreq> queueToList(PriorityQueue<TermAndFreq> queue) {
  List<TermAndFreq> terms = new ArrayList<CommonTermsQueryTest.TermAndFreq>();
  while (queue.size() > 0) {
    terms.add(queue.pop());
  }
  return terms;
}
@Override
// Verifies that the children of each node are sorted by the order specified
// by the facetRequest. The values in these nodes may have changed due to a
// re-count, for example following the accumulation by Sampling, so we test
// and re-order if necessary.
public FacetResult rearrangeFacetResult(FacetResult facetResult) {
  PriorityQueue<FacetResultNode> nodesHeap =
      new ResultNodeHeap(this.facetRequest.numResults, this.getSuitableACComparator());
  FacetResultNode topFrn = facetResult.getFacetResultNode();
  rearrangeChilrenOfNode(topFrn, nodesHeap);
  return facetResult;
}
/** Construct an object with all counts at zero */
public GroupCounts(GroupData groupData, FacetSpec spec, HitQueueMaker hitQueueMaker) {
  // Record the input parameters for later use
  this.data = groupData;
  this.spec = spec;
  this.hitQueueMaker = hitQueueMaker;

  // Allocate our arrays of counts and such
  if (!data.isDynamic()) {
    count = new int[data.nGroups()];
    score = new float[data.nGroups()];
  }
  mark = new int[data.nGroups()];
  selection = new int[data.nGroups()];
  startDoc = new int[data.nGroups()];
  maxDocs = new int[data.nGroups()];
  hitQueue = new PriorityQueue[data.nGroups()];

  // For dynamic data, we can perform the final sort and selection
  // right now, since the group counts and scores are known.
  if (data.isDynamic())
    sortAndSelect();

  // For static data, make a conservative selection.
  else
    conservativePrep();
}
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue retrieveTerms(IndexReader indexReader, int docNum, Analyzer analyzer) throws IOException {
  // Gather term frequencies for all fields.
  Map termFreqMap = new HashMap();
  Document d = indexReader.document(docNum);
  for (int i = 0; i < fieldNames.length; i++) {
    String fieldName = fieldNames[i];
    String[] text = d.getValues(fieldName);
    if (text == null)
      continue;
    for (int j = 0; j < text.length; j++) {
      TokenStream tokens = analyzer.tokenStream(fieldName, new StringReader(text[j]));
      addTermFrequencies(tokens, fieldName, termFreqMap);
    } // for j
  } // for i

  // Combine like terms from each field and calculate a score for each.
  Map termScoreMap = condenseTerms(indexReader, termFreqMap);

  // Finally, make a queue by score.
  return createQueue(indexReader, termScoreMap);
}
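The addTermFrequencies helper consuming those token streams is not shown; a minimal sketch under that assumption, following the standard TokenStream contract (reset, incrementToken, end, close) with CharTermAttribute from org.apache.lucene.analysis.tokenattributes. It uses plain Integer counts as a simplification of the Int holder type used elsewhere in these snippets, and keeps the fieldName parameter only for signature parity with the call above:

// Hypothetical helper: count how often each token appears, accumulating into termFreqMap.
private void addTermFrequencies(TokenStream tokens, String fieldName, Map<String, Integer> termFreqMap)
    throws IOException {
  CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
  tokens.reset();
  while (tokens.incrementToken()) {
    String word = termAtt.toString();
    Integer cnt = termFreqMap.get(word);
    termFreqMap.put(word, cnt == null ? 1 : cnt + 1);
  }
  tokens.end();
  tokens.close();
}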