public int search(String queryString, SearchField field) throws Exception {
    // a StandardAnalyzer was used here originally; replaced by a StopAnalyzer
    // driven by the application's stop-word file
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_20,
            new File(ApplicationConstants.STOP_WORD_FILE));

    QueryParser parser = new QueryParser(Version.LUCENE_20, field.Value(), analyzer);
    Query query = parser.parse(queryString);

    results = searcher.search(query, 100);
    return results.totalHits;
}
private IndexerManager(final CaseFacade caseFacade) throws IOException {
    File indexDir = new File(caseFacade.getCaseIndexFolderLocation());
    if (!indexDir.exists()) {
        throw new IOException("indexing folder not found: " + indexDir);
    }

    this.caseFacade = caseFacade;

    // use the stop analyzer with the application's stop-word file
    this.writer = new IndexWriter(
            FSDirectory.open(indexDir),
            new StopAnalyzer(Version.LUCENE_30, new File(ApplicationConstants.STOP_WORD_FILE)),
            true,
            IndexWriter.MaxFieldLength.UNLIMITED);
    this.writer.setUseCompoundFile(false);
    this.writer.setRAMBufferSizeMB(500);
}
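The four-argument IndexWriter constructor above is the deprecated pre-3.1 form. A minimal sketch of the equivalent IndexWriterConfig-based setup, assuming a Lucene 3.1+ jar and the same stop-word file (the merge-policy choice is an assumption made to carry over the compound-file setting):

    // sketch: same writer settings expressed via IndexWriterConfig (Lucene 3.1+)
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_31,
            new StopAnalyzer(Version.LUCENE_31, new File(ApplicationConstants.STOP_WORD_FILE)));
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); // same effect as the 'true' create flag
    config.setRAMBufferSizeMB(500);
    LogByteSizeMergePolicy mergePolicy = new LogByteSizeMergePolicy(); // assumption: explicit policy
    mergePolicy.setUseCompoundFile(false);
    config.setMergePolicy(mergePolicy);
    IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir), config);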
public List<Document> search(String queryString, SearchScope luceneFields) throws Exception {
    // use the stop analyzer in search as well, so queries match the indexed tokens
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_30,
            new File(ApplicationConstants.STOP_WORD_FILE));

    String[] fields = getSupportedFileds(luceneFields);
    MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_30, fields, analyzer);

    // escape special characters in the raw query string
    queryString = QueryParser.escape(queryString);

    Query query = parser.parse(queryString);
    TopDocs topDocs = searcher.search(query, MAX_RESULT);

    List<Document> documents = new ArrayList<Document>();
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
        documents.add(searcher.doc(scoreDoc.doc));
    }

    return documents;
}
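A call-site sketch for the method above. Note that QueryParser.escape neutralizes operators and wildcards, so the query is treated as plain terms; the SearchScope value and the stored field name read from each hit are hypothetical placeholders:

    // hypothetical usage; SearchScope.ALL and the "file-path" field are assumptions
    List<Document> hits = search("confidential report", SearchScope.ALL);
    for (Document hit : hits) {
        System.out.println(hit.get("file-path"));
    }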
/**
 * Read a Lucene index and make a spelling dictionary from it. A minimal token
 * analyzer will be used, which is usually just what is needed for the
 * dictionary. The default set of English stop words will be used (see
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS}).
 *
 * @param indexDir directory containing the Lucene index
 * @param dictDir  directory to receive the spelling dictionary
 * @param prog     tracker called periodically to display progress
 */
public static void createDict(Directory indexDir, File dictDir, ProgressTracker prog)
        throws IOException {
    // Open and clear the dictionary (since we're going to totally rebuild it)
    SpellWriter spellWriter = SpellWriter.open(dictDir);
    spellWriter.clearDictionary();
    spellWriter.setStopwords(StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));

    // Now re-tokenize all the fields and queue the words for the dictionary.
    IndexReader indexReader = IndexReader.open(indexDir);
    createDict(indexReader, new MinimalAnalyzer(), spellWriter, prog);

    // All done.
    spellWriter.close();
}
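A call-site sketch, assuming a 2.9-era Lucene jar (where FSDirectory.open and StopAnalyzer.ENGLISH_STOP_WORDS both exist); SpellWriter and ProgressTracker come from the surrounding project, so the tracker instance is left as an assumption:

    // hypothetical call site; the paths are placeholders
    Directory indexDir = FSDirectory.open(new File("data/index"));
    File dictDir = new File("data/spelldict");
    createDict(indexDir, dictDir, prog); // prog: project-specific ProgressTracker instance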
public StopwordAnnotator(String annotatorClass, Properties props) {
    this.props = props;
    this.checkLemma = Boolean.parseBoolean(props.getProperty(CHECK_LEMMA, "false"));

    if (this.props.containsKey(STOPWORDS_LIST)) {
        String stopwordList = props.getProperty(STOPWORDS_LIST);
        boolean ignoreCase = Boolean.parseBoolean(props.getProperty(IGNORE_STOPWORD_CASE, "false"));
        this.stopwords = getStopWordList(Version.LUCENE_36, stopwordList, ignoreCase);
    } else {
        // fall back to Lucene's built-in English stop-word set
        this.stopwords = (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }
}
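A configuration sketch for this constructor. The annotator wiring matches the test further below; the literal property keys for the custom list are assumptions standing in for the STOPWORDS_LIST and IGNORE_STOPWORD_CASE constants:

    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, stopword");
    props.setProperty("customAnnotatorClass.stopword",
            "intoxicant.analytics.coreNlp.StopwordAnnotator");
    // custom comma-separated stop list; both key strings below are assumptions
    props.setProperty("stopword-list", "the,of,and");
    props.setProperty("ignore-stopword-case", "true");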
private Set<String> getChatFilePathFast() throws IOException {
    Set<String> result = new HashSet<String>();

    try {
        Directory directory = FSDirectory.open(new File(
                this.panel.getCaseFacade().getCaseIndexFolderLocation()));
        IndexSearcher searcher = new IndexSearcher(directory);

        QueryParser parser = new QueryParser(Version.LUCENE_30,
                IndexingConstant.CHAT_AGENT, new StopAnalyzer(Version.LUCENE_30));
        parser.setAllowLeadingWildcard(true);

        Query query = parser.parse(panel.getAgent());
        TopDocs topDocs = searcher.search(query, 5000);

        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document document = searcher.doc(scoreDoc.doc);
            String chatFile = document.get(IndexingConstant.CHAT_FILE);

            if (chatFile != null && !chatFile.trim().isEmpty()) {
                chatFile = this.panel.getCaseFacade().getFullPath(chatFile);
                result.add(new File(chatFile).getName());
            }
        }

        searcher.close();
    } catch (ParseException ex) {
        Logger.getLogger(ChatRefreshTask.class.getName()).log(Level.SEVERE, null, ex);
    }

    return result;
}
/**
 * Test to validate that stopwords are properly annotated in the token list.
 *
 * @throws Exception
 */
@org.junit.Test
public void testLuceneStopwordList() throws Exception {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, stopword");
    props.setProperty("customAnnotatorClass.stopword",
            "intoxicant.analytics.coreNlp.StopwordAnnotator");

    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(example);
    pipeline.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    // get the standard Lucene stop-word set
    Set<?> stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    for (CoreLabel token : tokens) {
        // get the stopword annotation
        Pair<Boolean, Boolean> stopword = token.get(StopwordAnnotator.class);

        String word = token.word().toLowerCase();
        if (stopWords.contains(word)) {
            assertTrue(stopword.first());
        } else {
            assertFalse(stopword.first());
        }

        // not checking lemma, so always false
        assertFalse(stopword.second());
    }
}
public TokenStream tokenStream(String fieldName, Reader reader) {
    // analysis chain: tokenize -> standard filter -> lowercase -> stop words -> synonyms;
    // the filter constructors now take the Version argument required since Lucene 4.0
    // (the original mixed 3.x-style constructors with the LUCENE_41 constant)
    TokenStream result = new SynonymFilter(
            new StopFilter(Version.LUCENE_41,
                    new LowerCaseFilter(Version.LUCENE_41,
                            new StandardFilter(Version.LUCENE_41,
                                    new StandardTokenizer(Version.LUCENE_41, reader))),
                    StopAnalyzer.ENGLISH_STOP_WORDS_SET),
            engine);
    return result;
}
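To see which tokens survive the stop filter, the same chain (minus the project-specific SynonymFilter and its engine) can be consumed directly; a minimal sketch, assuming Lucene 4.1:

    TokenStream ts = new StopFilter(Version.LUCENE_41,
            new LowerCaseFilter(Version.LUCENE_41,
                    new StandardFilter(Version.LUCENE_41,
                            new StandardTokenizer(Version.LUCENE_41,
                                    new StringReader("The quick brown fox")))),
            StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term.toString()); // prints: quick, brown, fox ("the" is removed)
    }
    ts.end();
    ts.close();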
public static ReviewExtractor newInstance(String[] domainsToLoad, boolean polite) throws IOException {
    ReviewExtractor re = new ReviewExtractor();

    // copy Lucene's English stop-word set into a plain String set
    stopWords = new HashSet<String>();
    for (Object w : StopAnalyzer.ENGLISH_STOP_WORDS_SET) {
        stopWords.add(w.toString());
    }

    VocabUtils vocabUtils = VocabUtils.newInstance();
    re.hardOverridesSQ = vocabUtils.getHardOverrides();
    re.queries = new HashMap<String, Map<Byte, SpanQuery>>();
    re.vocabs = new HashMap<String, VocabularyReview>();

    String[] ds = domains;
    if (domainsToLoad != null) {
        ds = domainsToLoad;
    }
    re.loadedDomains = ds;

    for (String domain : ds) {
        VocabularyReview vocab = VocabularyReview.newInstance(domain);
        Map<Byte, SpanQuery> type2query = new HashMap<Byte, SpanQuery>();
        type2query.put(PLUS_ONE, vocabUtils.getPlusOneQueries(vocab));
        type2query.put(MINUS_ONE, vocabUtils.getMinusOneQueries(vocab));
        type2query.put(TITLE_PLUS_ONE, vocabUtils.getTitlePlusOneQueries(vocab));
        type2query.put(TITLE_MINUS_ONE, vocabUtils.getTitleMinusOneQueries(vocab));
        re.queries.put(domain, type2query);
        re.vocabs.put(domain, vocab);
    }

    re.polite = polite;
    if (polite) {
        re.tabooSQ = vocabUtils.getTaboo();
    }

    return re;
}
public static DocVector[] getCosineSimilarityMatrix(List<String> fileSentences) throws IOException {
    RAMDirectory ramDir = new RAMDirectory();

    // stop-word list loaded from file
    FileReader fr = new FileReader(new File("lib/stoplists/en.txt"));
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, fr);

    // index the full text of every sentence, storing term vectors
    IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
    for (String s : fileSentences) {
        Document doc = new Document();
        doc.add(new Field("contents", new StringReader(s), TermVector.YES));
        writer.addDocument(doc);
    }
    writer.close();

    DocVector[] docs = new DocVector[fileSentences.size()];

    // build the global term -> position map for the "contents" field
    IndexReader RAMreader = IndexReader.open(ramDir);
    Map<String, Integer> terms = new HashMap<String, Integer>();
    TermEnum termEnum = RAMreader.terms(new Term("contents"));
    int pos = 0;
    while (termEnum.next()) {
        Term term = termEnum.term();
        if (!"contents".equals(term.field()))
            break;
        terms.put(term.text(), pos++);
    }

    // build a tf-idf weighted, normalized term vector for each document
    for (int i = 0; i < fileSentences.size(); i++) {
        TermFreqVector[] tfvs = RAMreader.getTermFreqVectors(i);
        docs[i] = new DocVector(terms);
        if (tfvs == null)
            continue;
        for (TermFreqVector tfv : tfvs) {
            String[] termTexts = tfv.getTerms();
            int[] termFreqs = tfv.getTermFrequencies();
            for (int j = 0; j < termTexts.length; j++) {
                double idfValue = getIDF(RAMreader, termTexts[j]);
                docs[i].setEntry(termTexts[j], termFreqs[j] * idfValue);
            }
        }
        docs[i].normalize();
    }

    RAMreader.close();
    ramDir.close();
    return docs;
}
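Since each DocVector is normalized, the cosine similarity of two sentences reduces to a dot product; a minimal sketch, assuming a hypothetical getEntries() accessor on the project's DocVector class:

    // hypothetical helper; getEntries() is an assumed accessor on DocVector
    public static double calcCosineSimilarity(DocVector a, DocVector b) {
        double dot = 0.0;
        double[] va = a.getEntries();
        double[] vb = b.getEntries();
        for (int i = 0; i < va.length; i++) {
            dot += va[i] * vb[i];
        }
        return dot; // vectors are already normalized, so the dot product is the cosine
    }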
private Set<ImagePathAndId> loadItemsFast(int from, int size) throws IOException {
    Set<ImagePathAndId> files = new HashSet<ImagePathAndId>();
    int counter = 0;

    try {
        Directory directory = FSDirectory.open(new File(this.caseFacade.getCaseIndexFolderLocation()));
        IndexSearcher searcher = new IndexSearcher(directory);

        QueryParser parser = new QueryParser(Version.LUCENE_30,
                IndexingConstant.DOCUMENT_TYPE, new StopAnalyzer(Version.LUCENE_30));
        parser.setAllowLeadingWildcard(true);

        Query query = parser.parse(IndexingConstant.fromDocumentTypeToString(
                IndexingConstant.DOCUMENT_GENERAL_TYPE.IMAGE));
        TopDocs topDocs = searcher.search(query, 500000);

        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document document = searcher.doc(scoreDoc.doc);
            String imageExtension = document.get(IndexingConstant.FILE_MIME);

            if (imageExtension == null || imageExtension.trim().isEmpty()
                    || !Arrays.asList(imageExtensions).contains(imageExtension)) {
                continue;
            }

            String fullpath = "";
            int id = Integer.parseInt(document.get(IndexingConstant.DOCUMENT_ID));

            if (IndexingConstant.isImageDocument(document)) {
                String path = document.get(IndexingConstant.FILE_PATH);
                if (path.contains(this.aCase.getCaseName() + File.separator
                        + ApplicationConstants.CASE_ARCHIVE_FOLDER)) {
                    fullpath = path;
                } else {
                    fullpath = this.caseFacade.getFullPath(path);
                }
            }

            if (!fullpath.isEmpty()) {
                counter++;
                if (files.size() >= size) {
                    break;
                }
                if (counter >= from) {
                    files.add(new ImagePathAndId(fullpath, Integer.valueOf(id)));
                }
            }
        }

        searcher.close();
    } catch (ParseException ex) {
        Logger.getLogger(ChatRefreshTask.class.getName()).log(Level.SEVERE, null, ex);
    }

    return files;
}
private int getNumberOfImagesFast() throws IOException {
    int numberOfImages = 0;

    try {
        Directory directory = FSDirectory.open(new File(
                this.caseFacade.getCaseIndexFolderLocation()));
        IndexSearcher searcher = new IndexSearcher(directory);

        QueryParser parser = new QueryParser(Version.LUCENE_30,
                IndexingConstant.DOCUMENT_TYPE, new StopAnalyzer(Version.LUCENE_30));
        parser.setAllowLeadingWildcard(true);

        Query query = parser.parse(IndexingConstant.fromDocumentTypeToString(
                IndexingConstant.DOCUMENT_GENERAL_TYPE.IMAGE));
        TopDocs topDocs = searcher.search(query, 500000);

        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document document = searcher.doc(scoreDoc.doc);
            String imageExtension = document.get(IndexingConstant.FILE_MIME);

            if (imageExtension == null || imageExtension.trim().isEmpty()
                    || !Arrays.asList(imageExtensions).contains(imageExtension)) {
                continue;
            }

            String fullpath = "";
            if (IndexingConstant.isImageDocument(document)) {
                String path = document.get(IndexingConstant.FILE_PATH);
                if (path.contains(this.aCase.getCaseName() + File.separator
                        + ApplicationConstants.CASE_ARCHIVE_FOLDER)) {
                    fullpath = path;
                } else {
                    fullpath = this.caseFacade.getFullPath(path);
                }
            }

            if (!fullpath.isEmpty()) {
                numberOfImages++;
            }
        }

        searcher.close();
    } catch (ParseException ex) {
        Logger.getLogger(ChatRefreshTask.class.getName()).log(Level.SEVERE, null, ex);
    }

    return numberOfImages;
}
private void getAllEmailMessagesFast(final String path, final String constant, final String type)
        throws IOException {
    List<Integer> ids = new ArrayList<Integer>();

    try {
        Directory directory = FSDirectory.open(new File(
                this.panel.getCaseFacade().getCaseIndexFolderLocation()));
        IndexSearcher searcher = new IndexSearcher(directory);

        QueryParser parser = new QueryParser(Version.LUCENE_30,
                IndexingConstant.DOCUMENT_TYPE, new StopAnalyzer(Version.LUCENE_30));
        parser.setAllowLeadingWildcard(true);

        Query query = parser.parse("email");
        TopDocs topDocs = searcher.search(query, 100000);

        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document document = searcher.doc(scoreDoc.doc);
            String emailPath = document.get(constant);

            if (emailPath != null && !emailPath.trim().isEmpty() && emailPath.endsWith(path)) {
                final EmailItem item = (EmailItem) ItemFactory.newInstance(document,
                        panel.getCaseFacade(), false);

                // update the table on the Swing event dispatch thread
                EventQueue.invokeLater(new Runnable() {
                    @Override
                    public void run() {
                        JTableUtil.addRowToJTable(panel.getTable(), item.getFullDisplayData());
                    }
                });

                ids.add(Integer.valueOf(item.getDocumentId()));
            }
        }

        searcher.close();
    } catch (ParseException ex) {
        Logger.getLogger(ChatRefreshTask.class.getName()).log(Level.SEVERE, null, ex);
    }

    this.panel.setResultIds(ids);
}
private void displayChatSessionFast() throws IOException {
    try {
        Directory directory = FSDirectory.open(new File(
                this.panel.getCaseFacade().getCaseIndexFolderLocation()));
        IndexSearcher searcher = new IndexSearcher(directory);

        QueryParser parser = new QueryParser(Version.LUCENE_30,
                IndexingConstant.CHAT_AGENT, new StopAnalyzer(Version.LUCENE_30));
        parser.setAllowLeadingWildcard(true);

        Query query = parser.parse(panel.getAgent());
        TopDocs topDocs = searcher.search(query, 5000);

        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            final Document document = searcher.doc(scoreDoc.doc);
            String chatFile = document.get(IndexingConstant.CHAT_FILE);

            if (chatFile != null && !chatFile.trim().isEmpty()
                    && chatFile.endsWith(this.fileName)) {
                // build the row and update the table on the event dispatch thread
                EventQueue.invokeLater(new Runnable() {
                    @Override
                    public void run() {
                        ChatItem item = (ChatItem) ItemFactory.newInstance(document,
                                panel.getCaseFacade(), false);
                        Object[] data = new Object[] { item.getFrom(), item.getTo(),
                                item.getMessageText(), item.getDate() };
                        JTableUtil.addRowToJTable(panel.getTable(), data);
                    }
                });
            }
        }

        searcher.close();
    } catch (ParseException ex) {
        Logger.getLogger(ChatRefreshTask.class.getName()).log(Level.SEVERE, null, ex);
    }
}
private Map<String, Double> getExtensionFreqFast() throws IOException {
    Map<String, Double> map = new HashMap<String, Double>();

    try {
        Directory directory = FSDirectory.open(new File(
                this.caseFacade.getCaseIndexFolderLocation()));
        IndexSearcher searcher = new IndexSearcher(directory);

        QueryParser parser = new QueryParser(Version.LUCENE_30,
                IndexingConstant.DOCUMENT_TYPE, new StopAnalyzer(Version.LUCENE_30));
        parser.setAllowLeadingWildcard(true);

        Query query = parser.parse("file");
        TopDocs topDocs = searcher.search(query, 100000);

        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document document = searcher.doc(scoreDoc.doc);
            String filePath = document.get(IndexingConstant.FILE_PATH);

            if (filePath != null && !filePath.trim().isEmpty()) {
                String ext = FileUtil.getExtension(new File(filePath));
                if (ext == null || ext.length() > 6) // skip implausibly long extensions
                    continue;

                ext = ext.toLowerCase();
                Double count = map.get(ext);
                map.put(ext, count == null ? 1.0 : count + 1);
            }
        }

        searcher.close();
    } catch (ParseException ex) {
        Logger.getLogger(ChatRefreshTask.class.getName()).log(Level.SEVERE, null, ex);
    }

    return map;
}
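The *Fast methods above all repeat the same open/parse/search/iterate/close boilerplate. A refactoring sketch that extracts it into one helper; the runLuceneQuery name and the DocumentVisitor callback interface are hypothetical:

    // hypothetical helper consolidating the repeated search boilerplate above
    interface DocumentVisitor {
        void visit(Document document);
    }

    private void runLuceneQuery(String fieldName, String queryText, int maxHits,
            DocumentVisitor visitor) throws IOException {
        Directory directory = FSDirectory.open(new File(
                this.caseFacade.getCaseIndexFolderLocation()));
        IndexSearcher searcher = new IndexSearcher(directory);
        try {
            QueryParser parser = new QueryParser(Version.LUCENE_30, fieldName,
                    new StopAnalyzer(Version.LUCENE_30));
            parser.setAllowLeadingWildcard(true);
            TopDocs topDocs = searcher.search(parser.parse(queryText), maxHits);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                visitor.visit(searcher.doc(scoreDoc.doc));
            }
        } catch (ParseException ex) {
            Logger.getLogger(getClass().getName()).log(Level.SEVERE, null, ex);
        } finally {
            searcher.close(); // always release the searcher, even on parse failure
        }
    }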
@SuppressWarnings("unchecked") // ENGLISH_STOP_WORDS_SET declared as Set public JWNLDistances() throws OntoSimException { if ( stopWords == null ) { stopWords = (Set<String>)StopAnalyzer.ENGLISH_STOP_WORDS_SET; } }
public StopAnalyzer2() {
    stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}

public StopAnalyzer1() {
    stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}

public StopAnalyzerFlawed() {
    stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}
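These constructors only capture the default stop set; the analyzer body that consumes it is not shown. A minimal sketch of how such an analyzer typically uses the field, mirroring what Lucene's own StopAnalyzer does (3.x-style tokenStream override, assuming a Lucene 3.1+ jar; the surrounding class shape is an assumption):

    // sketch: lowercase tokenization followed by stop-word removal over the captured set
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return new StopFilter(Version.LUCENE_36,
                new LowerCaseTokenizer(Version.LUCENE_36, reader),
                stopWords);
    }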