/**
 * Builds the provider's {@link CJKAnalyzer}, honouring any stop words
 * configured in the index settings and falling back to Lucene's default
 * CJK stop-word set when none are configured.
 */
public CjkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    final CharArraySet stops = Analysis.parseStopWords(
            env, indexSettings.getIndexVersionCreated(), settings, CJKAnalyzer.getDefaultStopSet());
    analyzer = new CJKAnalyzer(stops);
    analyzer.setVersion(version);
}
/**
 * Guice-injected variant: builds the {@link CJKAnalyzer} from the analyzer's
 * {@code settings}, defaulting to Lucene's CJK stop-word set when no stop
 * words are configured.
 */
@Inject
public CjkAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env,
                           @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    final CharArraySet stops = Analysis.parseStopWords(env, settings, CJKAnalyzer.getDefaultStopSet());
    analyzer = new CJKAnalyzer(stops);
    analyzer.setVersion(version);
}
/**
 * Builds a single-entry field-pattern-to-analyzer map that routes any field
 * name matching the pattern {@code "cjk"} to a {@link CJKAnalyzer}.
 *
 * @return an insertion-ordered map with one pattern/analyzer pair
 */
private Map<Pattern, Analyzer> createCJKAnalyzer() {
    Map<Pattern, Analyzer> analyzersByPattern = new LinkedHashMap<Pattern, Analyzer>();
    analyzersByPattern.put(Pattern.compile("cjk"), new CJKAnalyzer());
    return analyzersByPattern;
}
/** * Create index */ @SuppressWarnings("deprecation") public boolean createIndex(int num) { boolean signal = false; Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_35); String path = "E:/topicIndex/querytopic" + num; // System.out.println("--------createIndex--------"); // File file = new File(path); if(file.exists()){ file.delete(); } file.mkdir(); try{ Directory dir = FSDirectory.open(file); IndexWriter TextIndex = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED); Document doc; for(int i = 0; i < this.arrQueryTopic.size(); ++i) { doc = new Document(); doc.add(new Field("ID", this.arrQueryTopic.get(i).toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); // ע������ID��String���͵� doc.add(new Field("QueryWord", this.arrQueryWord.get(i), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("Times", this.arrQueryWordTimes.get(i), Field.Store.YES, Field.Index.ANALYZED)); TextIndex.addDocument(doc); } TextIndex.optimize(); TextIndex.close(); signal = true; }catch(Exception e) { signal = false; e.printStackTrace(); } return signal; }
/** * Search */ @SuppressWarnings({ "deprecation", "static-access" }) public boolean search(int num, String str) { boolean signal = false; Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_35); String path = "E:/topicIndex/querytopic" + num; File file = new File(path); try{ FSDirectory dir = FSDirectory.open(file); IndexSearcher indexSearcher = new IndexSearcher(dir); QueryParser parse = new QueryParser(Version.LUCENE_35, "QueryWord", analyzer); Query query = parse.parse(str); TopDocs topDocs = indexSearcher.search(query, this.QUERYWORDNUM); ScoreDoc[] docs = topDocs.scoreDocs; // for(int i = 0; i < docs.length; ++i) { System.out.println(indexSearcher.doc(docs[i].doc).get("ID") + " " + indexSearcher.doc(docs[i].doc).get("QueryWord") + " " + indexSearcher.doc(docs[i].doc).get("Times")); } // indexSearcher.close(); signal = true; }catch(Exception e) { e.printStackTrace(); signal = false; } return signal; }
/**
 * Opens the data and index directories and wires up two analyzer/writer
 * pairs: a {@link StandardAnalyzer}-backed writer for the main index and a
 * {@link CJKAnalyzer}-backed writer for the Chinese sub-index under
 * {@code <indexDir>/cn}.
 *
 * @param dataDir_s  path of the directory containing the source documents
 * @param indexDir_s path of the directory that will hold the index
 * @throws Exception if a directory or writer cannot be opened
 */
public TxtFileIndexer(String dataDir_s, String indexDir_s) throws Exception {
    dataDir = new File(dataDir_s);
    indexDir = new File(indexDir_s);
    // FSDirectory.open(File) creates a filesystem-backed Lucene directory.
    index_dir = FSDirectory.open(indexDir);
    // The Chinese (CJK-analyzed) index lives in its own "cn" sub-directory.
    indexDirCN = new File(indexDir_s + File.separator + "cn");
    index_dir_cn = FSDirectory.open(indexDirCN);
    // --- analyzers and index writers ---
    // Content is tokenized by an Analyzer before indexing; StandardAnalyzer
    // (with its default stop-word set) serves the main index.
    luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_47);
    indexWriter = new IndexWriter(index_dir,
            new IndexWriterConfig(Version.LUCENE_47, luceneAnalyzer));
    // CJK bigram analyzer for the Chinese index.
    chineseAnalyzer = new CJKAnalyzer(Version.LUCENE_47);
    indexWriter_cn = new IndexWriter(index_dir_cn,
            new IndexWriterConfig(Version.LUCENE_47, chineseAnalyzer));
}
/** Returns the {@link CJKAnalyzer} instance held by this provider. */
@Override
public CJKAnalyzer get() {
    return analyzer;
}
/** * Search -->���ݸ�����һ�����֣��Ӷ���ö�Ӧ��topic����Ȼ����Ӧ�������ļ��в�ѯ����ȡǰ10����� */ @SuppressWarnings({ "deprecation", "static-access" }) public boolean search(int num) { boolean signal = false; Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_35); String path = "E:/topicIndex/querytopic" + (num + 1); File file = new File(path); try{ String topic = this.arrTopic.get(num); // System.out.println("----------" + topic + "--------" + num); // FSDirectory dir = FSDirectory.open(file); IndexSearcher indexSearcher = new IndexSearcher(dir); QueryParser parse = new QueryParser(Version.LUCENE_35, "QueryWord", analyzer); Query query = parse.parse(topic); TopDocs topDocs = indexSearcher.search(query, this.QUERYWORDNUM); ScoreDoc[] docs = topDocs.scoreDocs; System.out.println(docs.length); this.countTmp = docs.length; System.out.println(countTmp); // ������ŵ�this.listResult��ȥ for(int i = 0; i < this.countTmp; ++i) { TopicSearchResult resultTmp = new TopicSearchResult(); resultTmp.setID(Integer.parseInt(indexSearcher.doc(docs[i].doc).get("ID"))); resultTmp.setQueryWord(indexSearcher.doc(docs[i].doc).get("QueryWord")); resultTmp.setTimes(indexSearcher.doc(docs[i].doc).get("Times")); this.listResult[i] = resultTmp; } System.out.println("---------------" + this.listResult.length + "----------------------"); signal = true; }catch(Exception e) { signal = false; e.printStackTrace(); } return signal; }
/** * Search -->���ݸ�����һ�����֣��Ӷ���ö�Ӧ��topic����Ȼ����Ӧ�������ļ��в�ѯ����ȡǰ10����� */ @SuppressWarnings({ "deprecation", "static-access" }) public boolean search(int num) { boolean signal = false; Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_35); String path = "C:/data/topicIndex/querytopic" + (num + 1); File file = new File(path); try{ String topic = this.arrTopic.get(num); // System.out.println("----------" + topic + "--------" + num); // FSDirectory dir = FSDirectory.open(file); IndexSearcher indexSearcher = new IndexSearcher(dir); QueryParser parse = new QueryParser(Version.LUCENE_35, "QueryWord", analyzer); Query query = parse.parse(topic); TopDocs topDocs = indexSearcher.search(query, this.QUERYWORDNUM); ScoreDoc[] docs = topDocs.scoreDocs; System.out.println(docs.length); this.countTmp = docs.length; System.out.println(countTmp); // ������ŵ�this.listResult��ȥ for(int i = 0; i < this.countTmp; ++i) { TopicSearchResult resultTmp = new TopicSearchResult(); resultTmp.setID(Integer.parseInt(indexSearcher.doc(docs[i].doc).get("ID"))); resultTmp.setQueryWord(indexSearcher.doc(docs[i].doc).get("QueryWord")); resultTmp.setTimes(indexSearcher.doc(docs[i].doc).get("Times")); this.listResult[i] = resultTmp; } System.out.println("---------------" + this.listResult.length + "----------------------"); signal = true; }catch(Exception e) { signal = false; e.printStackTrace(); } return signal; }
public static void IndexerInAdvance(String indexDir_s) throws Exception { // --------indexWriter初始化--------------- chineseAnalyzer = new CJKAnalyzer(Version.LUCENE_47); // 中文分析器,可以使用其他,庖丁解牛分词器 code.google.com/p/paoding/ indexDir = new File(indexDir_s); nioD = new NIOFSDirectory(indexDir); iwc = new IndexWriterConfig(Version.LUCENE_47, chineseAnalyzer); iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); // Creates a new index if one does not exist // otherwise it opens the index and documents will be appended. iwc.setRAMBufferSizeMB(2048);// 内存上限 IndexWriterConfig.setDefaultWriteLockTimeout(10); // http://space.itpub.net/28624388/viewspace-766134 }