/**
 * Builds a "more like this" query for a single page.
 *
 * When {@code hocrContent} is available it is encoded as UTF-8 and handed to
 * {@link MoreLikeThis} as a stream; otherwise (or if the UTF-8 encoding is reported as
 * unsupported) the page file under {@code actualFolderLocation} is used instead.
 *
 * @param actualFolderLocation folder that contains the page file
 * @param query incoming query; it is never part of the result because every path
 *            replaces it with a freshly built query
 * @param moreLikeThis configured query generator
 * @param eachPage file name of the page inside {@code actualFolderLocation}
 * @param hocrContent page content, or {@code null} to read the page file
 * @return the query produced by {@code moreLikeThis}
 * @throws IOException if {@code moreLikeThis} fails to read its input
 */
private Query addPageToMoreLike(String actualFolderLocation, Query query, MoreLikeThis moreLikeThis, String eachPage,
        String hocrContent) throws IOException {
    if (null == hocrContent) {
        // No in-memory content: fall back to the page file on disk.
        return moreLikeThis.like(new File(actualFolderLocation + File.separator + eachPage));
    }

    Query pageQuery;
    try {
        InputStream hocrStream = new ByteArrayInputStream(hocrContent.getBytes("UTF-8"));
        pageQuery = moreLikeThis.like(hocrStream);
    } catch (UnsupportedEncodingException e) {
        LOGGER.error(e.getMessage(), e);
        // Encoding failed: use the page file instead of the in-memory content.
        pageQuery = moreLikeThis.like(new File(actualFolderLocation + File.separator + eachPage));
    }
    return pageQuery;
}
private void settingMoreLikeParameters(String minTermFreq, String minDocFreq, String minWordLength, String maxQueryTerms, String[] allIndexFields, String[] allStopWords, MoreLikeThis moreLikeThis) { moreLikeThis.setFieldNames(allIndexFields); // moreLikeThis.setBoost(true); // moreLikeThis.setBoostFactor(10.0f); moreLikeThis.setMinTermFreq(Integer.valueOf(minTermFreq)); moreLikeThis.setMinDocFreq(Integer.valueOf(minDocFreq)); moreLikeThis.setMinWordLen(Integer.valueOf(minWordLength)); moreLikeThis.setMaxQueryTerms(Integer.valueOf(maxQueryTerms)); if (allStopWords != null && allStopWords.length > 0) { Set<String> stopWordsTemp = new HashSet<String>(); for (int i = 0; i < allStopWords.length; i++) { stopWordsTemp.add(allStopWords[i]); } moreLikeThis.setStopWords(stopWordsTemp); } }
/**
 * Creates a {@link MoreLikeThis} bound to {@code reader} and configures it from the
 * supplied parameters.
 *
 * The numeric parameters are parsed with {@link Integer#parseInt(String)}, so a value
 * that is not a valid integer results in a {@link NumberFormatException}.
 *
 * @param reader index reader the new instance is created on
 * @param minTermFreq minimum term frequency, as a decimal string
 * @param minDocFreq minimum document frequency, as a decimal string
 * @param minWordLength minimum word length, as a decimal string
 * @param maxQueryTerms maximum number of query terms, as a decimal string
 * @param allStopWords stop words; skipped when {@code null} or empty
 * @param allIndexFields field names to set on the new instance
 * @return the configured instance
 */
private MoreLikeThis updateQueryInfo(IndexReader reader, String minTermFreq, String minDocFreq, String minWordLength,
        String maxQueryTerms, String[] allStopWords, String[] allIndexFields) {
    final MoreLikeThis configured = new MoreLikeThis(reader);
    configured.setFieldNames(allIndexFields);
    configured.setMinTermFreq(Integer.parseInt(minTermFreq));
    configured.setMinDocFreq(Integer.parseInt(minDocFreq));
    configured.setMinWordLen(Integer.parseInt(minWordLength));
    configured.setMaxQueryTerms(Integer.parseInt(maxQueryTerms));

    final boolean hasStopWords = allStopWords != null && allStopWords.length > 0;
    if (hasStopWords) {
        Set<String> stopWordSet = new HashSet<String>();
        for (String stopWord : allStopWords) {
            stopWordSet.add(stopWord);
        }
        configured.setStopWords(stopWordSet);
    }
    return configured;
}
/**
 * Resets this request to its default state: the superclass defaults first, then the
 * fields of this class. The numeric term/document limits are taken from the
 * {@code DEFAULT_*} constants of {@link MoreLikeThis}.
 */
@Override
protected void setDefaultValues() {
    super.setDefaultValues();

    // Filters and field lists start out as fresh, empty containers.
    filterList = new FilterList(config);
    returnFieldList = new ReturnFieldList();

    // No source document, text, language or analyzer is selected by default.
    docQuery = null;
    likeText = null;
    lang = LanguageEnum.UNDEFINED;
    analyzerName = null;
    fieldList = new ReturnFieldList();

    // Limits mirror the MoreLikeThis defaults.
    minWordLen = MoreLikeThis.DEFAULT_MIN_WORD_LENGTH;
    maxWordLen = MoreLikeThis.DEFAULT_MAX_WORD_LENGTH;
    minDocFreq = MoreLikeThis.DEFAULT_MIN_DOC_FREQ;
    minTermFreq = MoreLikeThis.DEFAULT_MIN_TERM_FREQ;
    maxNumTokensParsed = MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED;
    maxQueryTerms = MoreLikeThis.DEFAULT_MAX_QUERY_TERMS;
    boost = true;
    stopWords = null;

    // NOTE(review): start/rows look like result paging (offset and page size) - confirm.
    start = 0;
    rows = 10;

    // Any previously built query is discarded.
    mltQuery = null;
}
/** * Constructor. * * @param distanceQuery Distance query */ public LuceneDistanceKNNQuery(DistanceQuery<DBID> distanceQuery, IndexReader ir, DBIDRange range) { super(distanceQuery); this.range = range; this.mlt = new MoreLikeThis(ir); this.is = new IndexSearcher(ir); mlt.setAnalyzer(new StandardAnalyzer(Version.LUCENE_36)); }
/** * Constructor. * * @param distanceQuery Distance query */ public LuceneDistanceRangeQuery(DistanceQuery<DBID> distanceQuery, IndexReader ir, DBIDRange ids) { super(distanceQuery); this.ids = ids; this.mlt = new MoreLikeThis(ir); this.is = new IndexSearcher(ir); mlt.setAnalyzer(new StandardAnalyzer(Version.LUCENE_36)); }
/**
 * Replaces the current parsed query with a "more like this" query built from
 * {@code likeText}.
 *
 * The parameters are also recorded, comma-separated, in {@code moreLikeThisParams}.
 * An {@link IOException} raised while building the query is logged and not rethrown;
 * in that case the parsed query is left unchanged. The query is flagged for update in
 * either case.
 *
 * @param likeText text to find similar documents for
 * @param minWordLen minimum word length
 * @param maxWordLen maximum word length
 * @param minDocFreq minimum document frequency
 * @param maxDocFreqPct maximum document frequency, as a percentage
 * @param minTermFreq minimum term frequency
 * @param maxQueryTerms maximum number of query terms
 * @return this instance, cast to {@code F}
 */
@SuppressWarnings( "unchecked" )
public F moreLikeThis( String likeText, int minWordLen, int maxWordLen, int minDocFreq, int maxDocFreqPct, int minTermFreq,
        int maxQueryTerms )
{
    moreLikeThisParams = likeText + "," + minWordLen + "," + maxWordLen + "," + minDocFreq + "," + maxDocFreqPct + ","
            + minTermFreq + "," + maxQueryTerms;

    IndexReader ir = getReader( );
    // The try starts immediately after the reader is obtained so that the reader is
    // released even if constructing or configuring MoreLikeThis throws.
    try
    {
        MoreLikeThis mlt = new MoreLikeThis( ir );
        mlt.setFieldNames( mltSearchFields );
        mlt.setAnalyzer( analyzer );
        mlt.setMinWordLen( minWordLen );
        mlt.setMaxWordLen( maxWordLen );
        mlt.setMaxDocFreqPct( maxDocFreqPct );
        mlt.setMinDocFreq( minDocFreq );
        mlt.setMinTermFreq( minTermFreq );
        mlt.setMaxQueryTerms( maxQueryTerms );

        currentQD.parsedQuery( mlt.like( new StringReader( likeText ) ) );
    }
    catch ( IOException e )
    {
        Logger.error(e);
    }
    finally
    {
        closeReader( ir );
    }

    updateLuceneQuery = true;
    return ( F ) this;
}
/**
 * Rebuilds the {@code moreLikeThis} field.
 *
 * Calls {@code closeMoreLikeThis()} and {@code createIndexSearcher()} first, then
 * creates a new {@link MoreLikeThis} on {@code indexReader} that uses {@code analyzer}
 * and has both the minimum term frequency and the minimum document frequency set to 1.
 *
 * @throws IOException declared by this method; NOTE(review): presumably raised by
 *             createIndexSearcher() - confirm
 */
protected void createMoreLikeThis() throws IOException {
    closeMoreLikeThis();
    createIndexSearcher();

    final MoreLikeThis fresh = new MoreLikeThis(indexReader);
    moreLikeThis = fresh;
    fresh.setAnalyzer(analyzer);
    fresh.setMinTermFreq(1);
    fresh.setMinDocFreq(1);
}
/** * Find related (similar) documents based on given value and fields to examine * * @param r The Record Object (Document Class) * @param testStr The input value to use as the basis for similarity * @param fieldNames The names of the fields to examine * @param numHits The number of similar documents to retrieve * @param excludeDocId Optional "primary key" to exclude from results * * @return List<String> The list of matching records primary keys * * @throws IOException * @throws ClassNotFoundException * @throws KiraException */ public List<String> relatedObjects(Record r, String testStr, String[] fieldNames, int numHits, String excludeDocId) throws IOException, ClassNotFoundException, KiraException { String key = makeKey(r.descriptor(), r.getPrimaryKeyName()); List<String> results = new ArrayList<String>(); FSDirectory idx; idx = FSDirectory.open(indexDirectory); IndexReader ir = IndexReader.open(idx); IndexSearcher is = new IndexSearcher(idx, true); MoreLikeThis mlt = new MoreLikeThis(ir); //lower some settings to MoreLikeThis will work with very short titles mlt.setMinTermFreq(1); mlt.setMinDocFreq(1); mlt.setMinWordLen(3); //String[] fieldNames = { "fulltext" }; mlt.setFieldNames(fieldNames ); Reader reader = new StringReader(testStr); org.apache.lucene.search.Query query = mlt.like( reader); //Search the index using the query and get the top 5 results TopDocs topDocs = is.search(query, numHits); //logger.info("found " + topDocs.totalHits + " topDocs for q:" + testStr); for ( ScoreDoc scoreDoc : topDocs.scoreDocs ) { Document doc = is.doc( scoreDoc.doc ); String docId = doc.get(key); if (docId != null) { if (excludeDocId == null || !docId.equals(excludeDocId)) { results.add(docId); } } else { logger.warning("found other document type? " + doc); } } is.close(); return results; }
/**
 * Obtains a {@link MoreLikeThis} from a local reader.
 *
 * The reader is acquired for the duration of the call and always released afterwards,
 * including when the reader throws.
 *
 * @return the instance supplied by the acquired reader
 * @throws SearchLibException declared by this method; NOTE(review): presumably raised
 *             by checkOnline(true) when the index is not online - confirm
 */
@Override
public MoreLikeThis getMoreLikeThis() throws SearchLibException {
    checkOnline(true);
    final ReaderLocal localReader = acquire();
    try {
        final MoreLikeThis result = localReader.getMoreLikeThis();
        return result;
    } finally {
        release(localReader);
    }
}
/**
 * Creates a new {@link MoreLikeThis} on this object's {@code indexReader}.
 *
 * @return a new instance on every call
 */
@Override
public MoreLikeThis getMoreLikeThis() {
    final MoreLikeThis created = new MoreLikeThis(indexReader);
    return created;
}
/**
 * Returns a {@link MoreLikeThis} instance.
 *
 * @return the {@code MoreLikeThis} provided by the implementation
 * @throws SearchLibException if the implementation cannot provide one
 */
MoreLikeThis getMoreLikeThis() throws SearchLibException;