/** * Provide spelling corrections based on several parameters. * * @param term The term to suggest spelling corrections for * @param numSug The maximum number of spelling corrections * @param ir The index reader to fetch the candidate spelling corrections from * @param docfreq The minimum document frequency a potential suggestion need to have in order to be included * @param editDistance The maximum edit distance candidates are allowed to have * @param accuracy The minimum accuracy a suggested spelling correction needs to have in order to be included * @param spare a chars scratch * @return a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. * @throws IOException If I/O related errors occur */ protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, final CharsRefBuilder spare) throws IOException { AttributeSource atts = new AttributeSource(); MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); Terms terms = MultiFields.getTerms(ir, term.field()); if (terms == null) { return Collections.emptyList(); } FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.max(minPrefix, editDistance-1), true); final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>(); BytesRef queryTerm = new BytesRef(term.text()); BytesRef candidateTerm; ScoreTerm st = new ScoreTerm(); BoostAttribute boostAtt = e.attributes().addAttribute(BoostAttribute.class); while ((candidateTerm = e.next()) != null) { final float boost = boostAtt.getBoost(); // ignore uncompetitive hits if (stQueue.size() >= numSug && boost <= stQueue.peek().boost) continue; // ignore exact match of the same term if (queryTerm.bytesEquals(candidateTerm)) continue; int df = e.docFreq(); // check docFreq if required if (df <= docfreq) continue; final float score; final String termAsString; if (distance == INTERNAL_LEVENSHTEIN) { // delay creating strings until the end termAsString = null; // undo FuzzyTermsEnum's scale factor for a real scaled lev score score = boost / e.getScaleFactor() + e.getMinSimilarity(); } else { spare.copyUTF8Bytes(candidateTerm); termAsString = spare.toString(); score = distance.getDistance(term.text(), termAsString); } if (score < accuracy) continue; // add new entry in PQ st.term = BytesRef.deepCopyOf(candidateTerm); st.boost = boost; st.docfreq = df; st.termAsString = termAsString; st.score = score; stQueue.offer(st); // possibly drop entries from queue st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm(); maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); } return stQueue; }
/** * Provide spelling corrections based on several parameters. * * @param term The term to suggest spelling corrections for * @param numSug The maximum number of spelling corrections * @param ir The index reader to fetch the candidate spelling corrections from * @param docfreq The minimum document frequency a potential suggestion need to have in order to be included * @param editDistance The maximum edit distance candidates are allowed to have * @param accuracy The minimum accuracy a suggested spelling correction needs to have in order to be included * @param spare a chars scratch * @return a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. * @throws IOException If I/O related errors occur */ protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, final CharsRef spare) throws IOException { AttributeSource atts = new AttributeSource(); MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); Terms terms = MultiFields.getTerms(ir, term.field()); if (terms == null) { return Collections.emptyList(); } FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.max(minPrefix, editDistance-1), true); final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>(); BytesRef queryTerm = new BytesRef(term.text()); BytesRef candidateTerm; ScoreTerm st = new ScoreTerm(); BoostAttribute boostAtt = e.attributes().addAttribute(BoostAttribute.class); while ((candidateTerm = e.next()) != null) { final float boost = boostAtt.getBoost(); // ignore uncompetitive hits if (stQueue.size() >= numSug && boost <= stQueue.peek().boost) continue; // ignore exact match of the same term if (queryTerm.bytesEquals(candidateTerm)) continue; int df = e.docFreq(); // check docFreq if required if (df <= docfreq) continue; final float score; final String termAsString; if (distance == INTERNAL_LEVENSHTEIN) { // delay creating strings until the end termAsString = null; // undo FuzzyTermsEnum's scale factor for a real scaled lev score score = boost / e.getScaleFactor() + e.getMinSimilarity(); } else { UnicodeUtil.UTF8toUTF16(candidateTerm, spare); termAsString = spare.toString(); score = distance.getDistance(term.text(), termAsString); } if (score < accuracy) continue; // add new entry in PQ st.term = BytesRef.deepCopyOf(candidateTerm); st.boost = boost; st.docfreq = df; st.termAsString = termAsString; st.score = score; stQueue.offer(st); // possibly drop entries from queue st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm(); maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); } return stQueue; }