public DirectSpellChecker createDirectSpellChecker() { DirectSpellChecker directSpellChecker = new DirectSpellChecker(); directSpellChecker.setAccuracy(accuracy()); Comparator<SuggestWord> comparator; switch (sort()) { case SCORE: comparator = SCORE_COMPARATOR; break; case FREQUENCY: comparator = LUCENE_FREQUENCY; break; default: throw new IllegalArgumentException("Illegal suggest sort: " + sort()); } directSpellChecker.setComparator(comparator); directSpellChecker.setDistance(stringDistance()); directSpellChecker.setMaxEdits(maxEdits()); directSpellChecker.setMaxInspections(maxInspections()); directSpellChecker.setMaxQueryFrequency(maxTermFreq()); directSpellChecker.setMinPrefix(prefixLength()); directSpellChecker.setMinQueryLength(minWordLength()); directSpellChecker.setThresholdFrequency(minDocFreq()); directSpellChecker.setLowerCaseTerms(false); return directSpellChecker; }
@Override public TermSuggestion innerExecute(String name, TermSuggestionContext suggestion, IndexSearcher searcher, CharsRefBuilder spare) throws IOException { DirectSpellChecker directSpellChecker = suggestion.getDirectSpellCheckerSettings().createDirectSpellChecker(); final IndexReader indexReader = searcher.getIndexReader(); TermSuggestion response = new TermSuggestion( name, suggestion.getSize(), suggestion.getDirectSpellCheckerSettings().sort() ); List<Token> tokens = queryTerms(suggestion, spare); for (Token token : tokens) { // TODO: Extend DirectSpellChecker in 4.1, to get the raw suggested words as BytesRef SuggestWord[] suggestedWords = directSpellChecker.suggestSimilar( token.term, suggestion.getShardSize(), indexReader, suggestion.getDirectSpellCheckerSettings().suggestMode() ); Text key = new Text(new BytesArray(token.term.bytes())); TermSuggestion.Entry resultEntry = new TermSuggestion.Entry(key, token.startOffset, token.endOffset - token.startOffset); for (SuggestWord suggestWord : suggestedWords) { Text word = new Text(suggestWord.string); resultEntry.addOption(new TermSuggestion.Entry.Option(word, suggestWord.freq, suggestWord.score)); } response.addTerm(resultEntry); } return response; }
private static StringDistance resolveDistance(String distanceVal) { distanceVal = distanceVal.toLowerCase(Locale.US); if ("internal".equals(distanceVal)) { return DirectSpellChecker.INTERNAL_LEVENSHTEIN; } else if ("damerau_levenshtein".equals(distanceVal) || "damerauLevenshtein".equals(distanceVal)) { return new LuceneLevenshteinDistance(); } else if ("levenstein".equals(distanceVal)) { return new LevensteinDistance(); // TODO Jaro and Winkler are 2 people - so apply same naming logic // as damerau_levenshtein } else if ("jarowinkler".equals(distanceVal)) { return new JaroWinklerDistance(); } else if ("ngram".equals(distanceVal)) { return new NGramDistance(); } else { throw new IllegalArgumentException("Illegal distance option " + distanceVal); } }
public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader, double nonErrorLikelihood, int numCandidates, Analyzer preFilter, Analyzer postFilter, Terms terms) throws IOException { if (terms == null) { throw new IllegalArgumentException("generator field [" + field + "] doesn't exist"); } this.spellchecker = spellchecker; this.field = field; this.numCandidates = numCandidates; this.suggestMode = suggestMode; this.reader = reader; final long dictSize = terms.getSumTotalTermFreq(); this.useTotalTermFrequency = dictSize != -1; this.dictSize = dictSize == -1 ? reader.maxDoc() : dictSize; this.preFilter = preFilter; this.postFilter = postFilter; this.nonErrorLikelihood = nonErrorLikelihood; float thresholdFrequency = spellchecker.getThresholdFrequency(); this.frequencyPlateau = thresholdFrequency >= 1.0f ? (int) thresholdFrequency: (int)(dictSize * thresholdFrequency); termsEnum = terms.iterator(); }
@Override public TermSuggestion innerExecute(String name, TermSuggestionContext suggestion, IndexSearcher searcher, CharsRefBuilder spare) throws IOException { DirectSpellChecker directSpellChecker = SuggestUtils.getDirectSpellChecker(suggestion.getDirectSpellCheckerSettings()); final IndexReader indexReader = searcher.getIndexReader(); TermSuggestion response = new TermSuggestion( name, suggestion.getSize(), suggestion.getDirectSpellCheckerSettings().sort() ); List<Token> tokens = queryTerms(suggestion, spare); for (Token token : tokens) { // TODO: Extend DirectSpellChecker in 4.1, to get the raw suggested words as BytesRef SuggestWord[] suggestedWords = directSpellChecker.suggestSimilar( token.term, suggestion.getShardSize(), indexReader, suggestion.getDirectSpellCheckerSettings().suggestMode() ); Text key = new Text(new BytesArray(token.term.bytes())); TermSuggestion.Entry resultEntry = new TermSuggestion.Entry(key, token.startOffset, token.endOffset - token.startOffset); for (SuggestWord suggestWord : suggestedWords) { Text word = new Text(suggestWord.string); resultEntry.addOption(new TermSuggestion.Entry.Option(word, suggestWord.freq, suggestWord.score)); } response.addTerm(resultEntry); } return response; }
@Override public void indexingDone() { try { spellChecker = new DirectSpellChecker(); spellChecker.setMaxEdits(2); spellChecker.setAccuracy(0.1f); spellChecker.setMinPrefix(0); reader = DirectoryReader.open(writer); fuzzySuggester = new FuzzySuggester(directory, "", writer.getAnalyzer()); Dictionary dict = new DocumentValueSourceDictionary(reader, WORD_FIELD, new LongValuesSource() { @Override public boolean needsScores() { return false; } @Override public LongValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException { return null; } }); fuzzySuggester.build(dict); writer.close(); searcher = new IndexSearcher(DirectoryReader.open(directory)); } catch (IOException e) { throw new RuntimeException(e); } }
@Override public StringDistance toLucene() { return DirectSpellChecker.INTERNAL_LEVENSHTEIN; }
public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader, double nonErrorLikelihood, int numCandidates) throws IOException { this(spellchecker, field, suggestMode, reader, nonErrorLikelihood, numCandidates, null, null, MultiFields.getTerms(reader, field)); }