Java class org.apache.lucene.analysis.StopAnalyzer example source code
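StopAnalyzer splits text with a lower-casing letter tokenizer and drops stop words, taken either from its built-in English list (StopAnalyzer.ENGLISH_STOP_WORDS_SET) or from a caller-supplied set, file, or reader. Before the project examples, here is a minimal, self-contained sketch of that behavior against the Lucene 3.6 API that several snippets below target; the field name "contents" and the sample sentence are illustrative only:

import java.io.StringReader;

import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class StopAnalyzerDemo {
    public static void main(String[] args) throws Exception {
        // No-arg stop set: uses the built-in English list.
        StopAnalyzer analyzer = new StopAnalyzer(Version.LUCENE_36);
        TokenStream stream = analyzer.tokenStream("contents",
                new StringReader("The quick brown fox jumps over the lazy dog"));
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // "The" is lower-cased and filtered out; the remaining terms pass through:
            // quick brown fox jumps over lazy dog
            System.out.println(term.toString());
        }
        stream.end();
        stream.close();
    }
}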

Project: DEM    File: PSTSearcher.java
public int search (String queryString, SearchField field) throws Exception {
    // Parse the query with a StopAnalyzer backed by a custom stop word file.
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_20, new File(ApplicationConstants.STOP_WORD_FILE));

    QueryParser parser = new QueryParser(Version.LUCENE_20, field.Value(), analyzer);
    Query query = parser.parse(queryString);

    results = searcher.search(query, 100);

    return results.totalHits;
}
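The DEM examples load their stop words from a file on disk; for a small list, an in-memory set works just as well. A hypothetical sketch against the same Lucene 3.x API (the words shown are illustrative, not DEM's actual list):

// makeStopSet(Version, String...) is available from Lucene 3.1 on.
Set<?> stopSet = StopFilter.makeStopSet(Version.LUCENE_30, "the", "a", "of", "and");
Analyzer analyzer = new StopAnalyzer(Version.LUCENE_30, stopSet);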
Project: DEM    File: IndexerManager.java
private IndexerManager (final CaseFacade caseFacade) throws IOException {
    File indexDir = new File(caseFacade.getCaseIndexFolderLocation());

    if ( !indexDir.exists() ) {
        throw new IOException("indexing folder not found");
    }

    this.caseFacade = caseFacade;

    // use a stop analyzer with a custom stop word file for indexing
    this.writer = new IndexWriter(
        FSDirectory.open(indexDir),
        new StopAnalyzer(Version.LUCENE_30, new File(ApplicationConstants.STOP_WORD_FILE)),
        true,
        IndexWriter.MaxFieldLength.UNLIMITED
    );

    this.writer.setUseCompoundFile(false);
    this.writer.setRAMBufferSizeMB(500);
}
Project: DEM    File: LuceneSearcher.java
public List<Document> search (String queryString, SearchScope luceneFields) throws Exception {
    // using stop analyzer in search
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_30,  new File(ApplicationConstants.STOP_WORD_FILE));

    String[] fields = getSupportedFileds(luceneFields);

    MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_30, fields, analyzer);

    // escaping special characters
    queryString = QueryParser.escape(queryString);

    Query query = parser.parse(queryString);
    TopDocs topDocs = searcher.search(query, MAX_RESULT);

    List<Document> documents = new ArrayList<Document>();
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
        Document document = searcher.doc(scoreDoc.doc);
        documents.add(document);
    }

    return documents;
}
Project: dash-xtf    File: LuceneIndexToDict.java
/**
 * Read a Lucene index and make a spelling dictionary from it. A minimal token
 * analyzer will be used, which is usually just what is needed for the
 * dictionary. The default set of English stop words will be used (see
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS}).
 * 
 * @param indexDir directory containing the Lucene index
 * @param dictDir directory to receive the spelling dictionary
 * @param prog tracker called periodically to display progress
 */
public static void createDict(Directory indexDir, 
                              File dictDir, 
                              ProgressTracker prog)
  throws IOException
{
  // Open and clear the dictionary (since we're going to totally rebuild it)
  SpellWriter spellWriter = SpellWriter.open(dictDir);
  spellWriter.clearDictionary();
  spellWriter.setStopwords(StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));

  // Now re-tokenize all the fields and queue the words for the dictionary.
  IndexReader indexReader = IndexReader.open(indexDir);
  createDict(indexReader, new MinimalAnalyzer(), spellWriter, prog);

  // All done.
  spellWriter.close();
}
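For context, a hypothetical call site for the method above; the paths and the ProgressTracker instance are placeholders, not from the dash-xtf sources:

// Hypothetical usage (paths and "prog" are placeholders):
Directory indexDir = FSDirectory.open(new File("path/to/lucene-index"));
File dictDir = new File("path/to/spelling-dict");
LuceneIndexToDict.createDict(indexDir, dictDir, prog); // prog: caller-supplied ProgressTracker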
Project: handytrowel    File: StopwordAnnotator.java
public StopwordAnnotator(String annotatorClass, Properties props) {
    this.props = props;

    this.checkLemma = Boolean.parseBoolean(props.getProperty(CHECK_LEMMA, "false"));

    if (this.props.containsKey(STOPWORDS_LIST)) {
        String stopwordList = props.getProperty(STOPWORDS_LIST);
        boolean ignoreCase = Boolean.parseBoolean(props.getProperty(IGNORE_STOPWORD_CASE, "false"));
        this.stopwords = getStopWordList(Version.LUCENE_36, stopwordList, ignoreCase);
    } else {
        this.stopwords = (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }
}
Project: DEM    File: ChatRefreshTask.java
private Set<String> getChatFilePathFast() throws IOException {
    Set<String> result = new HashSet<String>();

    try {
        Directory directory = FSDirectory.open(new File(
                this.panel.getCaseFacade().getCaseIndexFolderLocation()
                ));

        IndexSearcher searcher = new IndexSearcher(directory);
        QueryParser parser = new QueryParser(Version.LUCENE_30, 
                IndexingConstant.CHAT_AGENT, new StopAnalyzer(Version.LUCENE_30));
        parser.setAllowLeadingWildcard(true);
        Query query = parser.parse(panel.getAgent());

        TopDocs topDocs = searcher.search(query, 5000);

        for(ScoreDoc scoreDoc: topDocs.scoreDocs) {
            Document document = searcher.doc(scoreDoc.doc);
            String chatFile = document.get(IndexingConstant.CHAT_FILE);

            if ( chatFile != null && !chatFile.trim().isEmpty()) {
                chatFile = this.panel.getCaseFacade().getFullPath(chatFile);
                final File path = new File(chatFile);
                result.add(path.getName());
            }
        }

        searcher.close();
    } catch (ParseException ex) {
        Logger.getLogger(ChatRefreshTask.class.getName()).log(Level.SEVERE, null, ex);
    }

    return result;
}
Project: coreNlp    File: StopwordAnnotator.java
public StopwordAnnotator(String annotatorClass, Properties props) {
    this.props = props;

    this.checkLemma = Boolean.parseBoolean(props.getProperty(CHECK_LEMMA, "false"));

    if (this.props.containsKey(STOPWORDS_LIST)) {
        String stopwordList = props.getProperty(STOPWORDS_LIST);
        boolean ignoreCase = Boolean.parseBoolean(props.getProperty(IGNORE_STOPWORD_CASE, "false"));
        this.stopwords = getStopWordList(Version.LUCENE_36, stopwordList, ignoreCase);
    } else {
        this.stopwords = (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }
}
Project: coreNlp    File: StopwordAnnotatorTest.java
/**
 * Test to validate that stopwords are properly annotated in the token list
 * @throws Exception
 */
@org.junit.Test
public void testLuceneStopwordList() throws Exception {
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, stopword");
    props.setProperty("customAnnotatorClass.stopword", "intoxicant.analytics.coreNlp.StopwordAnnotator");

    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document = new Annotation(example);
    pipeline.annotate(document);
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);

    //get the standard lucene stopword set
    Set<?> stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    for (CoreLabel token : tokens) {

        //get the stopword annotation
        Pair<Boolean, Boolean> stopword = token.get(StopwordAnnotator.class);

        String word = token.word().toLowerCase();
        if (stopWords.contains(word)) {
            assertTrue(stopword.first());
        }
        else {
            assertFalse(stopword.first());
        }

        //not checking lemma, so always false
        assertFalse(stopword.second());
    }
}
Project: t4f-data    File: SynonymAnalyzer.java
public TokenStream tokenStream(String fieldName, Reader reader) {
  return new SynonymFilter(
      new StopFilter(true,
          new LowerCaseFilter(
              new StandardFilter(
                  new StandardTokenizer(Version.LUCENE_41, reader))),
          StopAnalyzer.ENGLISH_STOP_WORDS_SET),
      engine);
}
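One caveat worth flagging: the StopFilter(boolean, ...) constructor used above predates Lucene 4.0, even though the tokenizer is built with Version.LUCENE_41. Against a real 4.1 classpath the chain would look roughly like this sketch (SynonymFilter and engine are the project's own classes):

TokenStream ts = new StandardTokenizer(Version.LUCENE_41, reader);
ts = new StandardFilter(Version.LUCENE_41, ts);
ts = new LowerCaseFilter(Version.LUCENE_41, ts);
ts = new StopFilter(Version.LUCENE_41, ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
return new SynonymFilter(ts, engine);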
Project: nakala    File: ReviewExtractor.java
public static ReviewExtractor newInstance(String[] domainsToLoad, boolean polite) throws IOException {
    ReviewExtractor re = new ReviewExtractor();
    stopWords = new HashSet<String>();
    for (Object w : StopAnalyzer.ENGLISH_STOP_WORDS_SET) {
        stopWords.add(w.toString());
    }

    VocabUtils vocabUtils = VocabUtils.newInstance();
    re.hardOverridesSQ = vocabUtils.getHardOverrides();
    re.queries = new HashMap<String, Map<Byte, SpanQuery>>();
    re.vocabs = new HashMap<String, VocabularyReview>();

    Map<Byte, SpanQuery> type2query = new HashMap<Byte, SpanQuery>();

    String[] ds = domains;
    if (domainsToLoad != null) {
        ds = domainsToLoad;
    }

    re.loadedDomains = ds;

    for (String domain : ds) {
        VocabularyReview vocab = VocabularyReview.newInstance(domain);
        type2query = new HashMap<Byte, SpanQuery>();
        type2query.put(PLUS_ONE, vocabUtils.getPlusOneQueries(vocab));
        type2query.put(MINUS_ONE, vocabUtils.getMinusOneQueries(vocab));
        type2query.put(TITLE_PLUS_ONE, vocabUtils.getTitlePlusOneQueries(vocab));
        type2query.put(TITLE_MINUS_ONE, vocabUtils.getTitleMinusOneQueries(vocab));
        re.queries.put(domain, type2query);
        re.vocabs.put(domain, vocab);
    }

    re.polite = polite;
    if (polite) {
        re.tabooSQ = vocabUtils.getTaboo();
    }

    return re;
}
Project: WikiKreator    File: CosineDocumentSimilarity.java
public static DocVector[] getCosineSimilarityMatrix(List<String> fileSentences) throws IOException {

    RAMDirectory ramDir = new RAMDirectory();
    FileReader stopWordsReader = new FileReader(new File("lib/stoplists/en.txt"));
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_36, stopWordsReader);

    // Index the full text of every sentence, storing term vectors.
    IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
    for (String sentence : fileSentences) {
        Document doc = new Document();
        doc.add(new Field("contents", new StringReader(sentence), TermVector.YES));
        writer.addDocument(doc);
    }
    writer.close();

    // Collect every term in the "contents" field and assign it a position.
    DocVector[] docs = new DocVector[fileSentences.size()];
    IndexReader ramReader = IndexReader.open(ramDir);
    Map<String, Integer> terms = new HashMap<String, Integer>();
    TermEnum termEnum = ramReader.terms(new Term("contents"));

    int pos = 0;
    while (termEnum.next()) {
        Term term = termEnum.term();
        if (!"contents".equals(term.field()))
            break;
        terms.put(term.text(), pos++);
    }

    // Build a tf-idf weighted term vector for each document.
    for (int i = 0; i < fileSentences.size(); i++) {
        TermFreqVector[] tfvs = ramReader.getTermFreqVectors(i);
        docs[i] = new DocVector(terms);
        if (tfvs == null)
            continue;
        for (TermFreqVector tfv : tfvs) {
            String[] termTexts = tfv.getTerms();
            int[] termFreqs = tfv.getTermFrequencies();
            for (int j = 0; j < termTexts.length; j++) {
                double idfValue = getIDF(ramReader, termTexts[j]);
                double tfIdfValue = termFreqs[j] * idfValue;
                docs[i].setEntry(termTexts[j], tfIdfValue);
            }
        }
        docs[i].normalize();
    }

    ramReader.close();
    ramDir.close();
    return docs;
}
Project: DEM    File: ImageLoadingTask.java
private Set<ImagePathAndId> loadItemsFast(int from, int size) throws IOException {
    Set<ImagePathAndId> files = new HashSet<ImagePathAndId>();
    int counter = 0;

    try {
        Directory directory = FSDirectory.open(new File(this.caseFacade.getCaseIndexFolderLocation()));   
        IndexSearcher searcher = new IndexSearcher(directory);
        QueryParser parser = new QueryParser(Version.LUCENE_30, IndexingConstant.DOCUMENT_TYPE, new StopAnalyzer(Version.LUCENE_30));     
        parser.setAllowLeadingWildcard(true);
        Query query = parser.parse(IndexingConstant.fromDocumentTypeToString(IndexingConstant.DOCUMENT_GENERAL_TYPE.IMAGE));

        TopDocs topDocs = searcher.search(query, 500000);

        for(ScoreDoc scoreDoc: topDocs.scoreDocs) {
            Document document = searcher.doc(scoreDoc.doc);
            String imageExtension = document.get(IndexingConstant.FILE_MIME);

            if ( imageExtension != null && !imageExtension.trim().isEmpty() &&
                    Arrays.asList(imageExtensions).contains(imageExtension ) ) {

                String fullpath = "";
                int id = Integer.parseInt(document.get(IndexingConstant.DOCUMENT_ID));

                if ( IndexingConstant.isImageDocument(document) ) {
                    String path = document.get(IndexingConstant.FILE_PATH);

                    if ( path.contains(this.aCase.getCaseName() + File.separator + ApplicationConstants.CASE_ARCHIVE_FOLDER) ) 
                        fullpath = path;
                    else
                        fullpath = this.caseFacade.getFullPath(document.get(IndexingConstant.FILE_PATH));
                }

                if ( ! fullpath.isEmpty() ) {
                    counter++;

                    if ( files.size() >= size) 
                        break;

                    if ( counter >= from ) {
                        files.add(new ImagePathAndId(fullpath, Integer.valueOf(id)));
                    }
                }
            }
        }

        searcher.close();
    } catch (ParseException ex) {
        Logger.getLogger(ImageLoadingTask.class.getName()).log(Level.SEVERE, null, ex);
    }

    return files;
}
Project: DEM    File: ImageLoadingTask.java
private int getNumberOfImagesFast() throws IOException {
    int numberOfImages = 0;

    try {
        Directory directory = FSDirectory.open(new File(
            this.caseFacade.getCaseIndexFolderLocation()
        ));

        IndexSearcher searcher = new IndexSearcher(directory);
        QueryParser parser = new QueryParser(Version.LUCENE_30, 
                IndexingConstant.DOCUMENT_TYPE, new StopAnalyzer(Version.LUCENE_30));
        parser.setAllowLeadingWildcard(true);
        Query query = parser.parse(IndexingConstant.fromDocumentTypeToString(IndexingConstant.DOCUMENT_GENERAL_TYPE.IMAGE));

        TopDocs topDocs = searcher.search(query, 500000);

        for(ScoreDoc scoreDoc: topDocs.scoreDocs) {
            Document document = searcher.doc(scoreDoc.doc);
            String imageExtension = document.get(IndexingConstant.FILE_MIME);

            if ( imageExtension != null && !imageExtension.trim().isEmpty() &&
                    Arrays.asList(imageExtensions).contains(imageExtension )  ) {
                String fullpath = "";

                if ( IndexingConstant.isImageDocument(document) ) {
                    String path = document.get(IndexingConstant.FILE_PATH);
                    if ( path.contains(this.aCase.getCaseName() + File.separator + ApplicationConstants.CASE_ARCHIVE_FOLDER) ) 
                        fullpath = path;
                    else
                        fullpath = this.caseFacade.getFullPath(document.get(IndexingConstant.FILE_PATH));
                }

                if ( ! fullpath.isEmpty() ) {
                    numberOfImages++;
                }
            }
        }

        searcher.close();
    } catch (ParseException ex) {
        Logger.getLogger(ImageLoadingTask.class.getName()).log(Level.SEVERE, null, ex);
    }

    return numberOfImages;
}
Project: DEM    File: EmailLoadingTask.java
private void getAllEmailMessagesFast(final String path, final String constant, final String type) throws IOException {
    List<Integer> ids = new ArrayList<Integer>();

    try {
        Directory directory = FSDirectory.open(new File(
                this.panel.getCaseFacade().getCaseIndexFolderLocation()
                ));

        IndexSearcher searcher = new IndexSearcher(directory);
        QueryParser parser = new QueryParser(Version.LUCENE_30, 
                IndexingConstant.DOCUMENT_TYPE, new StopAnalyzer(Version.LUCENE_30));
        parser.setAllowLeadingWildcard(true);
        Query query = parser.parse("email");

        TopDocs topDocs = searcher.search(query, 100000);

        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            Document document = searcher.doc(scoreDoc.doc);
            String emailPath = document.get(constant);

            if ( emailPath != null && !emailPath.trim().isEmpty()) {

                if ( emailPath.endsWith(path) ) {
                    final EmailItem item = (EmailItem) ItemFactory.newInstance(document, panel.getCaseFacade(), false);

                    EventQueue.invokeLater(new Runnable() { 
                        @Override
                        public void run() {
                            JTableUtil.addRowToJTable(panel.getTable(), item.getFullDisplayData());
                        }
                    });

                    ids.add(Integer.valueOf(item.getDocumentId()));
                }
            }
        }

        searcher.close();
    } catch (ParseException ex) {
        Logger.getLogger(EmailLoadingTask.class.getName()).log(Level.SEVERE, null, ex);
    }

    this.panel.setResultIds(ids);
}
Project: DEM    File: ChatLoadingTask.java
private void displayChatSessionFast() throws IOException {
    try {
        Directory directory = FSDirectory.open(new File(
                this.panel.getCaseFacade().getCaseIndexFolderLocation()
                ));

        IndexSearcher searcher = new IndexSearcher(directory);
        QueryParser parser = new QueryParser(Version.LUCENE_30, 
                IndexingConstant.CHAT_AGENT, new StopAnalyzer(Version.LUCENE_30));
        parser.setAllowLeadingWildcard(true);
        Query query = parser.parse(panel.getAgent());

        TopDocs topDocs = searcher.search(query, 5000);

        for(ScoreDoc scoreDoc: topDocs.scoreDocs) {
            final Document document = searcher.doc(scoreDoc.doc);
            String chatFile = document.get(IndexingConstant.CHAT_FILE);

            if ( chatFile != null && !chatFile.trim().isEmpty()) {

                if ( chatFile.endsWith(this.fileName) ) {

                    EventQueue.invokeLater(new Runnable() { 
                        @Override
                        public void run() {
                            ChatItem item = (ChatItem) ItemFactory.newInstance(document, panel.getCaseFacade()
                                    , false);
                            Object[] data = new Object[] {item.getFrom(), item.getTo(), item.getMessageText(),
                                    item.getDate()};
                            JTableUtil.addRowToJTable(panel.getTable(), data);
                        }
                    });

                }
            }
        }

        searcher.close();
    } catch (ParseException ex) {
        Logger.getLogger(ChatLoadingTask.class.getName()).log(Level.SEVERE, null, ex);
    }
}
Project: DEM    File: ExtensionFrequencyTask.java
private Map<String,Double> getExtensionFreqFast() throws IOException {
    Map<String,Double> map = new HashMap<String,Double>();

    try {
        Directory directory = FSDirectory.open(new File(
                this.caseFacade.getCaseIndexFolderLocation()
                ));

        IndexSearcher searcher = new IndexSearcher(directory);
        QueryParser parser = new QueryParser(Version.LUCENE_30, 
                IndexingConstant.DOCUMENT_TYPE, new StopAnalyzer(Version.LUCENE_30));
        parser.setAllowLeadingWildcard(true);
        Query query = parser.parse("file");

        TopDocs topDocs = searcher.search(query, 100000);

        for(ScoreDoc scoreDoc: topDocs.scoreDocs) {
            Document document = searcher.doc(scoreDoc.doc);
            String filePath = document.get(IndexingConstant.FILE_PATH);

            if ( filePath != null && !filePath.trim().isEmpty()) {
                final File path = new File(filePath);
                String ext = FileUtil.getExtension(path);

                if ( ext == null || ext.length() > 6 ) // skip extensions longer than six characters
                    continue;

                ext = ext.toLowerCase();

                if ( map.get(ext) == null ) {
                    map.put(ext, 1.0);
                } else {
                    map.put(ext, map.get(ext) + 1);
                }
            }
        }

        searcher.close();
    } catch (ParseException ex) {
        Logger.getLogger(ExtensionFrequencyTask.class.getName()).log(Level.SEVERE, null, ex);
    }

    return map ;
}
Project: align-api-project    File: JWNLDistances.java
@SuppressWarnings("unchecked") // ENGLISH_STOP_WORDS_SET is declared as Set
public JWNLDistances() throws OntoSimException {
    if ( stopWords == null ) {
        stopWords = (Set<String>) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }
}
Project: t4f-data    File: StopAnalyzer2.java
public StopAnalyzer2() {
  stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}
Project: t4f-data    File: StopAnalyzer1.java
public StopAnalyzer1() {
  stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}
Project: t4f-data    File: StopAnalyzerFlawed.java
public StopAnalyzerFlawed() {
  stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}