/**
 * Test the WordScorer emitted by the smoothing model
 */
public void testBuildWordScorer() throws IOException {
  SmoothingModel testModel = createTestModel();
  Map<String, Analyzer> mapping = new HashMap<>();
  mapping.put("field", new WhitespaceAnalyzer());
  PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
  IndexWriter writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(wrapper));
  Document doc = new Document();
  doc.add(new Field("field", "someText", TextField.TYPE_NOT_STORED));
  writer.addDocument(doc);
  DirectoryReader ir = DirectoryReader.open(writer);
  WordScorer wordScorer = testModel.buildWordScorerFactory()
      .newScorer(ir, MultiFields.getTerms(ir, "field"), "field", 0.9d, BytesRefs.toBytesRef(" "));
  assertWordScorer(wordScorer, testModel);
}
/**
 * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the
 * given selection of fields from terms with a document frequency greater than
 * the given maxDocFreq
 *
 * @param delegate Analyzer whose TokenStream will be filtered
 * @param indexReader IndexReader to identify the stopwords from
 * @param fields Selection of fields to calculate stopwords for
 * @param maxDocFreq Document frequency terms should be above in order to be stopwords
 * @throws IOException Can be thrown while reading from the IndexReader
 */
public QueryAutoStopWordAnalyzer(
    Analyzer delegate,
    IndexReader indexReader,
    Collection<String> fields,
    int maxDocFreq) throws IOException {
  super(delegate.getReuseStrategy());
  this.delegate = delegate;

  for (String field : fields) {
    Set<String> stopWords = new HashSet<>();
    Terms terms = MultiFields.getTerms(indexReader, field);
    CharsRefBuilder spare = new CharsRefBuilder();
    if (terms != null) {
      TermsEnum te = terms.iterator(null);
      BytesRef text;
      while ((text = te.next()) != null) {
        if (te.docFreq() > maxDocFreq) {
          spare.copyUTF8Bytes(text);
          stopWords.add(spare.toString());
        }
      }
    }
    stopWordsPerField.put(field, stopWords);
  }
}
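/*
 * Usage sketch (not from the original source): wiring the analyzer above in
 * front of an existing index so that frequent terms are dropped at query time.
 * The "body" field name and the docFreq cutoff of 50 are illustrative
 * assumptions; the stopword sets are computed eagerly in the constructor, so
 * the reader may be closed afterwards.
 */
void stopWordAnalyzerUsageSketch(Directory dir) throws IOException {
  try (DirectoryReader reader = DirectoryReader.open(dir)) {
    Analyzer delegate = new WhitespaceAnalyzer();
    Analyzer stopWordAware = new QueryAutoStopWordAnalyzer(
        delegate, reader, Collections.singleton("body"), 50);
    // hand stopWordAware to a QueryParser; terms with docFreq > 50 in "body"
    // will then be filtered out of parsed queries
  }
}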
@Override
public void visitMatchingTerms(
    IndexReader reader,
    String fieldName,
    MatchingTermVisitor mtv) throws IOException {
  /* check term presence in index here for symmetry with other SimpleTerm's */
  Terms terms = MultiFields.getTerms(reader, fieldName);
  if (terms != null) {
    TermsEnum termsEnum = terms.iterator(null);
    TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(getTermText()));
    if (status == TermsEnum.SeekStatus.FOUND) {
      mtv.visitMatchingTerm(getLuceneTerm(fieldName));
    }
  }
}
@Override
protected FieldStatsShardResponse shardOperation(FieldStatsShardRequest request) {
  ShardId shardId = request.shardId();
  Map<String, FieldStats> fieldStats = new HashMap<>();
  IndexService indexServices = indicesService.indexServiceSafe(shardId.getIndex());
  MapperService mapperService = indexServices.mapperService();
  IndexShard shard = indexServices.shardSafe(shardId.id());
  try (Engine.Searcher searcher = shard.acquireSearcher("fieldstats")) {
    for (String field : request.getFields()) {
      MappedFieldType fieldType = mapperService.fullName(field);
      if (fieldType != null) {
        IndexReader reader = searcher.reader();
        Terms terms = MultiFields.getTerms(reader, field);
        if (terms != null) {
          fieldStats.put(field, fieldType.stats(terms, reader.maxDoc()));
        }
      } else {
        throw new IllegalArgumentException("field [" + field + "] doesn't exist");
      }
    }
  } catch (IOException e) {
    throw ExceptionsHelper.convertToElastic(e);
  }
  return new FieldStatsShardResponse(shardId, fieldStats);
}
public void testSeekCeilNotFound() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  // Get empty string in there!
  doc.add(newStringField("field", "", Field.Store.NO));
  w.addDocument(doc);
  for (int i = 0; i < 36; i++) {
    doc = new Document();
    String term = "" + (char) (97 + i);
    String term2 = "a" + (char) (97 + i);
    doc.add(newTextField("field", term + " " + term2, Field.Store.NO));
    w.addDocument(doc);
  }
  w.forceMerge(1);
  IndexReader r = w.getReader();
  TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);
  assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seekCeil(new BytesRef(new byte[] {0x22})));
  assertEquals("a", te.term().utf8ToString());
  assertEquals(1L, te.ord());
  r.close();
  w.close();
  dir.close();
}
/**
 * {@inheritDoc}
 */
@Override
public ClassificationResult<BytesRef> assignClass(String inputDocument) throws IOException {
  if (atomicReader == null) {
    throw new IOException("You must first call Classifier#train");
  }
  double max = -Double.MAX_VALUE;
  BytesRef foundClass = new BytesRef();
  Terms terms = MultiFields.getTerms(atomicReader, classFieldName);
  TermsEnum termsEnum = terms.iterator(null);
  BytesRef next;
  String[] tokenizedDoc = tokenizeDoc(inputDocument);
  while ((next = termsEnum.next()) != null) {
    double clVal = calculateLogPrior(next) + calculateLogLikelihood(tokenizedDoc, next);
    if (clVal > max) {
      max = clVal;
      foundClass = BytesRef.deepCopyOf(next);
    }
  }
  double score = 10 / Math.abs(max);
  return new ClassificationResult<>(foundClass, score);
}
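/*
 * Usage sketch (the train(...) signature is an assumption about Lucene's
 * classification module API of this era; the "text" and "category" field
 * names are illustrative): train on an existing AtomicReader, then classify
 * an unseen document with the assignClass(...) method above.
 */
void classifierUsageSketch(AtomicReader atomicReader) throws IOException {
  SimpleNaiveBayesClassifier classifier = new SimpleNaiveBayesClassifier();
  classifier.train(atomicReader, "text", "category", new WhitespaceAnalyzer());
  ClassificationResult<BytesRef> result = classifier.assignClass("a previously unseen document");
  System.out.println(result.getAssignedClass().utf8ToString() + " -> " + result.getScore());
}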
/**
 * checks that norms are the same across all fields
 */
public void assertNormsEquals(String info, IndexReader leftReader, IndexReader rightReader) throws IOException {
  Fields leftFields = MultiFields.getFields(leftReader);
  Fields rightFields = MultiFields.getFields(rightReader);
  // Fields could be null if there are no postings,
  // but then it must be null for both
  if (leftFields == null || rightFields == null) {
    assertNull(info, leftFields);
    assertNull(info, rightFields);
    return;
  }
  for (String field : leftFields) {
    NumericDocValues leftNorms = MultiDocValues.getNormValues(leftReader, field);
    NumericDocValues rightNorms = MultiDocValues.getNormValues(rightReader, field);
    if (leftNorms != null && rightNorms != null) {
      assertDocValuesEquals(info, leftReader.maxDoc(), leftNorms, rightNorms);
    } else {
      assertNull(info, leftNorms);
      assertNull(info, rightNorms);
    }
  }
}
private int countTerms(MultiTermQuery q) throws Exception {
  final Terms terms = MultiFields.getTerms(reader, q.getField());
  if (terms == null) return 0;
  final TermsEnum termEnum = q.getTermsEnum(terms);
  assertNotNull(termEnum);
  int count = 0;
  BytesRef cur, last = null;
  while ((cur = termEnum.next()) != null) {
    count++;
    if (last != null) {
      assertTrue(last.compareTo(cur) < 0);
    }
    last = BytesRef.deepCopyOf(cur);
  }
  // LUCENE-3314: the results after next() already returned null are undefined,
  // assertNull(termEnum.next());
  return count;
}
public void testAllDocs() throws Exception {
  initializeIndex(new String[]{"A", "B", "C", "D"});
  IndexReader reader = DirectoryReader.open(dir);
  IndexSearcher searcher = newSearcher(reader);
  TermRangeQuery query = new TermRangeQuery("content", null, null, true, true);
  Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "content");
  assertFalse(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
  assertEquals(4, searcher.search(query, null, 1000).scoreDocs.length);
  query = new TermRangeQuery("content", null, null, false, false);
  assertFalse(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
  assertEquals(4, searcher.search(query, null, 1000).scoreDocs.length);
  query = TermRangeQuery.newStringRange("content", "", null, true, false);
  assertFalse(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
  assertEquals(4, searcher.search(query, null, 1000).scoreDocs.length);
  // and now another one
  query = TermRangeQuery.newStringRange("content", "B", null, true, false);
  assertTrue(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
  assertEquals(3, searcher.search(query, null, 1000).scoreDocs.length);
  reader.close();
}
/**
 * Tests if a WildcardQuery that has only a trailing * in the term is
 * rewritten to a single PrefixQuery. The boost and rewriteMethod should be
 * preserved.
 */
public void testPrefixTerm() throws IOException {
  Directory indexStore = getIndexStore("field", new String[]{"prefix", "prefixx"});
  IndexReader reader = DirectoryReader.open(indexStore);
  IndexSearcher searcher = newSearcher(reader);

  MultiTermQuery wq = new WildcardQuery(new Term("field", "prefix*"));
  assertMatches(searcher, wq, 2);
  Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "field");
  assertTrue(wq.getTermsEnum(terms) instanceof PrefixTermsEnum);

  wq = new WildcardQuery(new Term("field", "*"));
  assertMatches(searcher, wq, 2);
  assertFalse(wq.getTermsEnum(terms) instanceof PrefixTermsEnum);
  assertFalse(wq.getTermsEnum(terms).getClass().getSimpleName().contains("AutomatonTermsEnum"));
  reader.close();
  indexStore.close();
}
protected int getFirstMatch(IndexReader r, Term t) throws IOException {
  Fields fields = MultiFields.getFields(r);
  if (fields == null) return -1;
  Terms terms = fields.terms(t.field());
  if (terms == null) return -1;
  BytesRef termBytes = t.bytes();
  final TermsEnum termsEnum = terms.iterator(null);
  if (!termsEnum.seekExact(termBytes)) {
    return -1;
  }
  DocsEnum docs = termsEnum.docs(MultiFields.getLiveDocs(r), null, DocsEnum.FLAG_NONE);
  int id = docs.nextDoc();
  if (id != DocIdSetIterator.NO_MORE_DOCS) {
    int next = docs.nextDoc();
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, next);
  }
  return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
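/*
 * Usage sketch (illustrative; assumes an "id" field that is unique per
 * document): getFirstMatch resolves the single document carrying a given
 * primary-key term, or -1 if no live document matches.
 */
void firstMatchUsageSketch(IndexReader reader) throws IOException {
  int docId = getFirstMatch(reader, new Term("id", "42"));
  if (docId != -1) {
    Document stored = reader.document(docId);
  }
}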
private DocIdSetIterator iterateAllDocs() {
  IndexReader reader = searcher.getIndexReader();
  final Bits liveDocs = MultiFields.getLiveDocs( reader );
  final DocIdSetIterator allDocs = DocIdSetIterator.all( reader.maxDoc() );
  if ( liveDocs == null ) {
    // no deletions: every doc id up to maxDoc is live
    return allDocs;
  }
  return new FilteredDocIdSetIterator( allDocs ) {
    @Override
    protected boolean match(int doc) {
      return liveDocs.get( doc );
    }
  };
}
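/*
 * Consuming the returned iterator follows the standard DocIdSetIterator
 * idiom: advance with nextDoc() until NO_MORE_DOCS (sketch; the loop body is
 * a placeholder).
 */
void iterateAllDocsUsageSketch() throws IOException {
  DocIdSetIterator it = iterateAllDocs();
  for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
    // "doc" is the id of a live (non-deleted) document
  }
}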
/**
 * @param flagConfig Contains all information necessary for configuring LuceneUtils.
 *        {@link FlagConfig#luceneindexpath()} must be non-empty.
 */
public LuceneUtils(FlagConfig flagConfig) throws IOException {
  if (flagConfig.luceneindexpath().isEmpty()) {
    throw new IllegalArgumentException(
        "-luceneindexpath is a required argument for initializing LuceneUtils instance.");
  }
  this.compositeReader = DirectoryReader.open(
      FSDirectory.open(FileSystems.getDefault().getPath(flagConfig.luceneindexpath())));
  this.leafReader = SlowCompositeReaderWrapper.wrap(compositeReader);
  MultiFields.getFields(compositeReader);
  this.flagConfig = flagConfig;
  if (!flagConfig.stoplistfile().isEmpty())
    loadStopWords(flagConfig.stoplistfile());
  if (!flagConfig.startlistfile().isEmpty())
    loadStartWords(flagConfig.startlistfile());
  VerbatimLogger.info("Initialized LuceneUtils from Lucene index in directory: "
      + flagConfig.luceneindexpath() + "\n");
  VerbatimLogger.info("Fields in index are: " + String.join(", ", this.getFieldNames()) + "\n");
}
private void testSearchSpeed(ArrayList<String> images, final Class featureClass) throws IOException {
  parallelIndexer = new ParallelIndexer(8, indexPath, testExtensive, true) {
    @Override
    public void addBuilders(ChainedDocumentBuilder builder) {
      builder.addBuilder(new GenericDocumentBuilder(featureClass, "feature"));
    }
  };
  parallelIndexer.run();
  IndexReader reader = DirectoryReader.open(new RAMDirectory(FSDirectory.open(new File(indexPath)), IOContext.READONCE));
  Bits liveDocs = MultiFields.getLiveDocs(reader);
  double queryCount = 0d;
  ImageSearcher searcher = new GenericFastImageSearcher(100, featureClass, "feature");
  long ms = System.currentTimeMillis();
  for (int i = 0; i < reader.maxDoc(); i++) {
    if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.
    String fileName = getIDfromFileName(reader.document(i).getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]);
    if (queries.keySet().contains(fileName)) {
      queryCount += 1d;
      // ok, we've got a query here for a document ...
      Document queryDoc = reader.document(i);
      ImageSearchHits hits = searcher.search(queryDoc, reader);
    }
  }
  ms = System.currentTimeMillis() - ms;
  System.out.printf("%s \t %3.1f \n",
      featureClass.getName().substring(featureClass.getName().lastIndexOf('.') + 1),
      (double) ms / queryCount);
}
/**
 * Computes the inverse document frequency of every term in the index.
 *
 * @param reader the IndexReader to read terms from
 * @return a map from each indexed term to its inverse document frequency
 * @throws IOException if reading from the index fails
 */
public Map<String, Float> getIdfs(IndexReader reader) throws IOException {
  Fields fields = MultiFields.getFields(reader); // get the fields of the index
  for (String field : fields) {
    TermsEnum termEnum = MultiFields.getTerms(reader, field).iterator(null);
    BytesRef bytesRef;
    while ((bytesRef = termEnum.next()) != null) {
      // no seekExact needed here: next() already positioned the enum on this term
      String term = bytesRef.utf8ToString();
      float idf = tfidfSIM.idf(termEnum.docFreq(), reader.numDocs());
      inverseDocFreq.put(term, idf);
      System.out.println(term + " idf= " + idf);
    }
  }
  return inverseDocFreq;
}
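/*
 * For reference, Lucene's classic TF-IDF similarity (DefaultSimilarity /
 * ClassicSimilarity) defines idf as 1 + log(numDocs / (docFreq + 1)), so a
 * value from the map above can be checked by hand (sketch; the counts are
 * illustrative).
 */
void idfByHandSketch() {
  long docFreq = 3;   // documents containing the term
  long numDocs = 100; // total documents in the index
  float expectedIdf = (float) (Math.log(numDocs / (double) (docFreq + 1)) + 1.0);
  System.out.println("expected idf = " + expectedIdf);
}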
/**
 * Internal utility: recount for a facet result node
 *
 * @param fresNode result node to be recounted
 * @param docIds full set of matching documents.
 * @throws IOException If there is a low-level I/O error.
 */
private void recount(FacetResultNode fresNode, ScoredDocIDs docIds) throws IOException {
  // TODO (Facet): change from void to return the new, smaller docSet, and use
  // that for the children, as this will make their intersection ops faster.
  // can do this only when the new set is "sufficiently" smaller.

  /* We need the category's path name in order to do its recounting.
   * If it is missing, because the option to label only part of the
   * facet results was exercised, we need to calculate them anyway, so
   * in essence sampling with recounting spends some extra cycles for
   * labeling results for which labels are not required. */
  if (fresNode.label == null) {
    fresNode.label = taxonomyReader.getPath(fresNode.ordinal);
  }
  CategoryPath catPath = fresNode.label;

  Term drillDownTerm = DrillDownQuery.term(searchParams.indexingParams, catPath);
  // TODO (Facet): avoid Multi*?
  Bits liveDocs = MultiFields.getLiveDocs(indexReader);
  int updatedCount = countIntersection(
      MultiFields.getTermDocsEnum(indexReader, liveDocs, drillDownTerm.field(), drillDownTerm.bytes(), 0),
      docIds.iterator());
  fresNode.value = updatedCount;
}
/**
 * {@inheritDoc}
 */
@Override
public ClassificationResult<BytesRef> assignClass(String inputDocument) throws IOException {
  if (atomicReader == null) {
    throw new RuntimeException("need to train the classifier first");
  }
  double max = 0d;
  BytesRef foundClass = new BytesRef();
  Terms terms = MultiFields.getTerms(atomicReader, classFieldName);
  TermsEnum termsEnum = terms.iterator(null);
  BytesRef next;
  String[] tokenizedDoc = tokenizeDoc(inputDocument);
  while ((next = termsEnum.next()) != null) {
    // TODO : turn it to be in log scale
    double clVal = calculatePrior(next) * calculateLikelihood(tokenizedDoc, next);
    if (clVal > max) {
      max = clVal;
      // deep copy: the enum may reuse the bytes of the returned ref,
      // so a shallow clone() could be silently overwritten by next()
      foundClass = BytesRef.deepCopyOf(next);
    }
  }
  return new ClassificationResult<BytesRef>(foundClass, max);
}
@Override
public void learnVocab() throws IOException {
  super.learnVocab();

  final String field = ((LuceneIndexConfig) config).getField();
  final Terms terms = MultiFields.getTerms(reader, field);
  final BytesRef maxTerm = terms.getMax();
  final BytesRef minTerm = terms.getMin();
  Query q = new TermRangeQuery(field, minTerm, maxTerm, true, true);
  IndexSearcher searcher = new IndexSearcher(reader);
  topDocs = searcher.search(q, Integer.MAX_VALUE);

  TermsEnum termsEnum = terms.iterator(null);
  termsEnum.seekCeil(new BytesRef());
  BytesRef term = termsEnum.term();
  while (term != null) {
    int p = addWordToVocab(term.utf8ToString());
    vocab[p].setCn((int) termsEnum.totalTermFreq());
    term = termsEnum.next();
  }
}
public void testAllDocs() throws Exception {
  initializeIndex(new String[]{"A", "B", "C", "D"});
  IndexReader reader = DirectoryReader.open(dir);
  IndexSearcher searcher = new IndexSearcher(reader);
  TermRangeQuery query = new TermRangeQuery("content", null, null, true, true);
  Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "content");
  assertFalse(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
  assertEquals(4, searcher.search(query, null, 1000).scoreDocs.length);
  query = new TermRangeQuery("content", null, null, false, false);
  assertFalse(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
  assertEquals(4, searcher.search(query, null, 1000).scoreDocs.length);
  query = TermRangeQuery.newStringRange("content", "", null, true, false);
  assertFalse(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
  assertEquals(4, searcher.search(query, null, 1000).scoreDocs.length);
  // and now another one
  query = TermRangeQuery.newStringRange("content", "B", null, true, false);
  assertTrue(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
  assertEquals(3, searcher.search(query, null, 1000).scoreDocs.length);
  reader.close();
}
/**
 * Tests if a WildcardQuery that has only a trailing * in the term is
 * rewritten to a single PrefixQuery. The boost and rewriteMethod should be
 * preserved.
 */
public void testPrefixTerm() throws IOException {
  Directory indexStore = getIndexStore("field", new String[]{"prefix", "prefixx"});
  IndexReader reader = DirectoryReader.open(indexStore);
  IndexSearcher searcher = new IndexSearcher(reader);

  MultiTermQuery wq = new WildcardQuery(new Term("field", "prefix*"));
  assertMatches(searcher, wq, 2);
  Terms terms = MultiFields.getTerms(searcher.getIndexReader(), "field");
  assertTrue(wq.getTermsEnum(terms) instanceof PrefixTermsEnum);

  wq = new WildcardQuery(new Term("field", "*"));
  assertMatches(searcher, wq, 2);
  assertFalse(wq.getTermsEnum(terms) instanceof PrefixTermsEnum);
  assertFalse(wq.getTermsEnum(terms).getClass().getSimpleName().contains("AutomatonTermsEnum"));
  reader.close();
  indexStore.close();
}
protected int getFirstMatch(IndexReader r, Term t) throws IOException {
  Fields fields = MultiFields.getFields(r);
  if (fields == null) return -1;
  Terms terms = fields.terms(t.field());
  if (terms == null) return -1;
  BytesRef termBytes = t.bytes();
  final TermsEnum termsEnum = terms.iterator(null);
  if (!termsEnum.seekExact(termBytes, false)) {
    return -1;
  }
  DocsEnum docs = termsEnum.docs(MultiFields.getLiveDocs(r), null, DocsEnum.FLAG_NONE);
  int id = docs.nextDoc();
  if (id != DocIdSetIterator.NO_MORE_DOCS) {
    int next = docs.nextDoc();
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, next);
  }
  return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
/**
 * Searches the Lucene index of a particular appid.
 * @param <P> type
 * @param dao {@link DAO}
 * @param appid appid
 * @param type type
 * @param query a query
 * @param pager a {@link Pager}
 * @return a list of ParaObjects
 */
public static <P extends ParaObject> List<P> searchQuery(DAO dao, String appid, String type, String query, Pager... pager) {
  if (StringUtils.isBlank(appid)) {
    return Collections.emptyList();
  }
  DirectoryReader ireader = null;
  try {
    ireader = getIndexReader(appid);
    if (ireader != null) {
      Pager page = getPager(pager);
      List<P> docs = searchQuery(dao, appid,
          searchQueryRaw(ireader, appid, type, qs(query, MultiFields.getIndexedFields(ireader)), page), page);
      return docs;
    }
  } catch (Exception e) {
    logger.error(null, e);
  } finally {
    closeIndexReader(ireader);
  }
  return Collections.emptyList();
}
private boolean getNext() {
  try {
    int next = docsEnum.nextDoc();
    if (next == DocIdSetIterator.NO_MORE_DOCS) {
      return false;
    }
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    if (liveDocs != null) {
      // skip deleted documents, stopping once the enum is exhausted
      // (the original looped on liveDocs.get(docID()) without guarding
      // against NO_MORE_DOCS, which could read past the end of the Bits)
      while (next != DocIdSetIterator.NO_MORE_DOCS && !liveDocs.get(next)) {
        next = docsEnum.nextDoc();
      }
    }
    return next != DocIdSetIterator.NO_MORE_DOCS;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
/**
 * {@inheritDoc}
 */
@Override
public ClassificationResult<BytesRef> assignClass(String inputDocument) throws IOException {
  if (atomicReader == null) {
    throw new IOException("You must first call Classifier#train");
  }
  double max = -Double.MAX_VALUE;
  BytesRef foundClass = new BytesRef();
  Terms terms = MultiFields.getTerms(atomicReader, classFieldName);
  TermsEnum termsEnum = terms.iterator(null);
  BytesRef next;
  String[] tokenizedDoc = tokenizeDoc(inputDocument);
  while ((next = termsEnum.next()) != null) {
    double clVal = calculateLogPrior(next) + calculateLogLikelihood(tokenizedDoc, next);
    if (clVal > max) {
      max = clVal;
      foundClass = BytesRef.deepCopyOf(next);
    }
  }
  double score = 10 / Math.abs(max);
  return new ClassificationResult<BytesRef>(foundClass, score);
}
/**
 * Return a query that will return docs like the passed lucene document ID.
 *
 * @param docNum the documentID of the lucene doc to generate the 'More Like This' query for.
 * @return a query that will return docs like the passed lucene document ID.
 */
public Query like(int docNum) throws IOException {
  if (fieldNames == null) {
    // gather list of valid fields from lucene
    Collection<String> fields = MultiFields.getIndexedFields(ir);
    fieldNames = fields.toArray(new String[fields.size()]);
  }
  return createQuery(retrieveTerms(docNum));
}
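/*
 * Usage sketch (illustrative): building and running a MoreLikeThis query for
 * a known document id against the same index. Depending on configuration,
 * MoreLikeThis needs term vectors or analyzable stored fields on the source
 * document; the tuning values below are assumptions.
 */
void moreLikeThisUsageSketch(IndexReader reader, IndexSearcher searcher, int docNum) throws IOException {
  MoreLikeThis mlt = new MoreLikeThis(reader);
  mlt.setMinTermFreq(1);
  mlt.setMinDocFreq(1);
  Query likeQuery = mlt.like(docNum);
  TopDocs similar = searcher.search(likeQuery, 10);
}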
/**
 * Here we could go overboard and use a pre-generated indexed random document for a given Item,
 * but for now we'd prefer to simply return the id as the content of the document and that for
 * every field.
 */
private static Fields generateFields(String[] fieldNames, String text) throws IOException {
  MemoryIndex index = new MemoryIndex();
  for (String fieldName : fieldNames) {
    index.addField(fieldName, text, new WhitespaceAnalyzer());
  }
  return MultiFields.getFields(index.createSearcher().getIndexReader());
}
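/*
 * Usage sketch (illustrative): the MemoryIndex-backed Fields returned above
 * can be walked like any other Fields instance, e.g. to enumerate the terms
 * of one synthetic field. The field names and text are placeholders.
 */
void generatedFieldsUsageSketch() throws IOException {
  Fields fields = generateFields(new String[] {"title", "body"}, "item-1");
  Terms titleTerms = fields.terms("title");
  TermsEnum te = titleTerms.iterator(null);
  BytesRef text;
  while ((text = te.next()) != null) {
    System.out.println(text.utf8ToString());
  }
}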
@Override
public void visitMatchingTerms(
    IndexReader reader,
    String fieldName,
    MatchingTermVisitor mtv) throws IOException {
  int prefixLength = prefix.length();
  Terms terms = MultiFields.getTerms(reader, fieldName);
  if (terms != null) {
    Matcher matcher = pattern.matcher("");
    try {
      TermsEnum termsEnum = terms.iterator(null);

      TermsEnum.SeekStatus status = termsEnum.seekCeil(prefixRef);
      BytesRef text;
      if (status == TermsEnum.SeekStatus.FOUND) {
        text = prefixRef;
      } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
        text = termsEnum.term();
      } else { // EOF: no terms at or after the prefix
        text = null;
      }

      while (text != null) {
        if (StringHelper.startsWith(text, prefixRef)) {
          String textString = text.utf8ToString();
          matcher.reset(textString.substring(prefixLength));
          if (matcher.matches()) {
            mtv.visitMatchingTerm(new Term(fieldName, textString));
          }
        } else {
          break;
        }
        text = termsEnum.next();
      }
    } finally {
      matcher.reset();
    }
  }
}
@Override
public void visitMatchingTerms(
    IndexReader reader,
    String fieldName,
    MatchingTermVisitor mtv) throws IOException {
  /* inspired by PrefixQuery.rewrite(): */
  Terms terms = MultiFields.getTerms(reader, fieldName);
  if (terms != null) {
    TermsEnum termsEnum = terms.iterator(null);

    boolean skip = false;
    TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(getPrefix()));
    if (status == TermsEnum.SeekStatus.FOUND) {
      mtv.visitMatchingTerm(getLucenePrefixTerm(fieldName));
    } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
      if (StringHelper.startsWith(termsEnum.term(), prefixRef)) {
        mtv.visitMatchingTerm(new Term(fieldName, termsEnum.term().utf8ToString()));
      } else {
        skip = true;
      }
    } else { // EOF
      skip = true;
    }

    if (!skip) {
      while (true) {
        BytesRef text = termsEnum.next();
        if (text != null && StringHelper.startsWith(text, prefixRef)) {
          mtv.visitMatchingTerm(new Term(fieldName, text.utf8ToString()));
        } else {
          break;
        }
      }
    }
  }
}