/** * Rebuild a suggestion index from the document index. * * This method iterates through the entire document index and makes sure that only unique titles * are indexed. * * @param indexRoot The parent directory inside which both the document index and the suggestion * index lives. * @throws IOException */ public static void rebuild(String indexRoot) throws IOException { Path indexRootPath = Paths.get(indexRoot); Path suggestionPath = getSuggestionIndexPath(indexRootPath); // Delete the suggestion index if it exists. if (Files.exists(suggestionPath)) { Util.deletePath(suggestionPath); } // Create the suggestion index. Analyzer analyzer = Indexer.getAnalyzer(); Directory suggestionDir = FSDirectory.open(getSuggestionIndexPath(indexRootPath)); AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(suggestionDir, analyzer); // Open the document index. Directory indexDir = FSDirectory.open(Indexer.getMainIndexPath(indexRootPath)); IndexReader reader = DirectoryReader.open(indexDir); // Get a document iterator. DocumentDictionary docDict = new DocumentDictionary(reader, Indexer.TITLE_FIELD_NAME, null); InputIterator iterator = docDict.getEntryIterator(); Set<BytesRef> titleSet = new HashSet<>(); BytesRef next; while ((next = iterator.next()) != null) { if (titleSet.contains(next)) { continue; } titleSet.add(next); suggester.add(next, null, 0, null); } reader.close(); suggester.commit(); suggester.close(); }
@Override public void build(InputIterator iterator) throws IOException { if (iterator.hasPayloads()) { throw new IllegalArgumentException("this suggester doesn't support payloads"); } BytesRef scratch = new BytesRef(); InputIterator iter = new WFSTInputIterator(iterator); IntsRef scratchInts = new IntsRef(); BytesRef previous = null; PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs); while ((scratch = iter.next()) != null) { long cost = iter.weight(); if (previous == null) { previous = new BytesRef(); } else if (scratch.equals(previous)) { continue; // for duplicate suggestions, the best weight is actually // added } Util.toIntsRef(scratch, scratchInts); builder.add(scratchInts, cost); previous.copyBytes(scratch); } fst = builder.finish(); }
/**
 * Returns an iterator over the terms of {@code field} in {@code reader}.
 *
 * @return a wrapper around the field's terms iterator, or {@code InputIterator.EMPTY} when
 *     {@code MultiFields.getTerms} yields no terms for the field
 * @throws IOException propagated from the underlying term lookup
 */
@Override
public final InputIterator getEntryIterator() throws IOException {
  final Terms fieldTerms = MultiFields.getTerms(reader, field);
  if (fieldTerms == null) {
    return InputIterator.EMPTY;
  }
  return new InputIterator.InputIteratorWrapper(fieldTerms.iterator(null));
}
/**
 * Creates an {@code AnalyzingSuggester} and builds it from the term iterator.
 *
 * <p>The work is executed through {@code Display.syncExec}, wrapped in
 * {@code BusyIndicator.showWhile}. Any exception raised while building is passed to
 * {@code StatusUtil.showStatus}; in that case the returned value is whatever had been stored up
 * to that point (possibly {@code null}).
 *
 * @return the suggester stored by the build task, or {@code null} if it was never created
 */
private static Lookup setupSuggester_Analyzing() {

  setupIndexReader();

  // Single-element holder so that the anonymous task can hand its result back.
  final Lookup[] holder = new AnalyzingSuggester[1];

  final Runnable buildTask = new Runnable() {
    @Override
    public void run() {
      try {
        final Analyzer queryAnalyzer = new StandardAnalyzer(new CharArraySet(0, true));
        final InputIterator terms = createTermIterator();

        holder[0] = new AnalyzingSuggester(queryAnalyzer);
        holder[0].build(terms);

      } catch (final Exception e) {
        StatusUtil.showStatus(e);
      }
    }
  };

  Display.getDefault().syncExec(new Runnable() {
    @Override
    public void run() {
      BusyIndicator.showWhile(Display.getDefault(), buildTask);
    }
  });

  return holder[0];
}
/**
 * Like build(), but without flushing the old entries, and <em>ignoring duplicate entries</em>.
 *
 * <p>An entry counts as a duplicate when a lookup of its text (limited to one result) already
 * returns something; such entries are skipped.
 *
 * @param dict dictionary whose entries are added
 * @throws IOException propagated from the dictionary iterator, the lookup or the add
 */
public void add(Dictionary dict) throws IOException {
  final InputIterator entries = dict.getEntryIterator();
  for (BytesRef entry = entries.next(); entry != null; entry = entries.next()) {
    final boolean alreadyPresent = !lookup(entry.utf8ToString(), 1, true, false).isEmpty();
    if (!alreadyPresent) {
      add(entry, entries.contexts(), entries.weight(), entries.payload());
    }
  }
}
@Override public void build(InputIterator iterator) throws IOException { if (iterator.hasPayloads()) { throw new IllegalArgumentException("this suggester doesn't support payloads"); } count = 0; BytesRef scratch = new BytesRef(); InputIterator iter = new WFSTInputIterator(iterator); IntsRef scratchInts = new IntsRef(); BytesRef previous = null; PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs); while ((scratch = iter.next()) != null) { long cost = iter.weight(); if (previous == null) { previous = new BytesRef(); } else if (scratch.equals(previous)) { continue; // for duplicate suggestions, the best weight is actually // added } Util.toIntsRef(scratch, scratchInts); builder.add(scratchInts, cost); previous.copyBytes(scratch); count++; } fst = builder.finish(); }
/**
 * Builds from the given iterator by delegating to the two-argument {@code build}, passing
 * {@code IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB} as the second argument.
 *
 * @param iterator source of the entries
 * @throws IOException propagated from the delegated build
 */
@Override
public void build(InputIterator iterator) throws IOException {
  final double ramBufferSizeMB = IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB;
  build(iterator, ramBufferSizeMB);
}
/**
 * Creates the iterator on top of {@code source}.
 *
 * @param source underlying iterator; asserted (when assertions are enabled) to carry no payloads
 * @throws IOException propagated from the superclass constructor
 */
WFSTInputIterator(InputIterator source) throws IOException {
  super(source);
  assert !source.hasPayloads();
}
/**
 * Returns a fresh {@code FileIterator} wrapped as an {@code InputIterator}.
 *
 * @return the wrapping iterator
 * @throws IOException declared by the overridden method
 */
@Override
public InputIterator getEntryIterator() throws IOException {
  final FileIterator fileIterator = new FileIterator();
  return new InputIterator.InputIteratorWrapper(fileIterator);
}
/**
 * Returns a new {@code HighFrequencyIterator} on every call.
 *
 * @return the newly created iterator
 * @throws IOException declared by the overridden method
 */
@Override
public final InputIterator getEntryIterator() throws IOException {
  final InputIterator entries = new HighFrequencyIterator();
  return entries;
}
@Ignore public void testWiki() throws Exception { final LineFileDocs lfd = new LineFileDocs(null, "/lucenedata/enwiki/enwiki-20120502-lines-1k.txt", false); // Skip header: lfd.nextDoc(); FreeTextSuggester sug = new FreeTextSuggester(new MockAnalyzer(random())); sug.build(new InputIterator() { private int count; @Override public long weight() { return 1; } @Override public Comparator<BytesRef> getComparator() { return null; } @Override public BytesRef next() { Document doc; try { doc = lfd.nextDoc(); } catch (IOException ioe) { throw new RuntimeException(ioe); } if (doc == null) { return null; } if (count++ == 10000) { return null; } return new BytesRef(doc.get("body")); } @Override public BytesRef payload() { return null; } @Override public boolean hasPayloads() { return false; } @Override public Set<BytesRef> contexts() { return null; } @Override public boolean hasContexts() { return false; } }); if (VERBOSE) { System.out.println(sug.ramBytesUsed() + " bytes"); List<LookupResult> results = sug.lookup("general r", 10); System.out.println("results:"); for(LookupResult result : results) { System.out.println(" " + result); } } }
/**
 * Creates an iterator over the terms of the title and description search fields.
 *
 * <p>For every leaf of {@code _indexReader}, the terms enumeration of each field named
 * {@code SEARCH_FIELD_DESCRIPTION} or {@code SEARCH_FIELD_TITLE} is added to a
 * {@code TermFreqIteratorListWrapper}. Leaves without fields and fields without terms are
 * skipped instead of causing a {@code NullPointerException}.
 *
 * @return the wrapper holding all collected terms enumerations (empty when nothing matched)
 * @throws IOException propagated from reading the index
 */
private static InputIterator createTermIterator() throws IOException {

  final TermFreqIteratorListWrapper inputIterator = new TermFreqIteratorListWrapper();

  for (final AtomicReaderContext readerContext : _indexReader.leaves()) {

    final Fields fields = readerContext.reader().fields();
    if (fields == null) {
      // fields() may return null when the leaf has no postings at all.
      continue;
    }

    for (final String field : fields) {

      final boolean isSearchField =
          field.equals(SEARCH_FIELD_DESCRIPTION) || field.equals(SEARCH_FIELD_TITLE);
      if (!isSearchField) {
        continue;
      }

      final Terms terms = fields.terms(field);
      if (terms == null) {
        // terms() is documented to return null for a field that has no terms.
        continue;
      }

      inputIterator.add(terms.iterator(null));
    }
  }

  return inputIterator;
}
/**
 * Returns a new {@code MultiInputIterator} on every call.
 *
 * @return the newly created iterator
 * @throws IOException declared by the overridden method
 */
@Override
public InputIterator getEntryIterator() throws IOException {
  final InputIterator entries = new MultiInputIterator();
  return entries;
}
@Ignore public void testWiki() throws Exception { final LineFileDocs lfd = new LineFileDocs(null, "/lucenedata/enwiki/enwiki-20120502-lines-1k.txt", false); // Skip header: lfd.nextDoc(); FreeTextSuggester sug = new FreeTextSuggester(new MockAnalyzer(random())); sug.build(new InputIterator() { private int count; @Override public long weight() { return 1; } @Override public Comparator<BytesRef> getComparator() { return null; } @Override public BytesRef next() { Document doc; try { doc = lfd.nextDoc(); } catch (IOException ioe) { throw new RuntimeException(ioe); } if (doc == null) { return null; } if (count++ == 10000) { return null; } return new BytesRef(doc.get("body")); } @Override public BytesRef payload() { return null; } @Override public boolean hasPayloads() { return false; } }); if (VERBOSE) { System.out.println(sug.sizeInBytes() + " bytes"); List<LookupResult> results = sug.lookup("general r", 10); System.out.println("results:"); for(LookupResult result : results) { System.out.println(" " + result); } } }
/**
 * Returns an iterator over all the entries.
 *
 * @return an {@link InputIterator} over all the entries
 * @throws IOException if an I/O error occurs while creating the iterator
 */
InputIterator getEntryIterator() throws IOException;