/**
 * Builds a Lucene index over the given corpus file.
 *
 * @param source   Which corpus {source, target} is being indexed; the index
 *                 is written to the directory named source + "Index".
 * @param filePath Path of the corpus file whose documents should be indexed.
 */
public void buildIndex(String source, String filePath) {
    long start = System.currentTimeMillis();
    IndexWriter writer = null;
    try {
        // true => create a fresh index, overwriting any existing one
        writer = new IndexWriter(source + "Index", new SimpleAnalyzer(), true);
        indexDocs(writer, new File(filePath));
        writer.optimize();
        System.out.print(System.currentTimeMillis() - start);
        System.out.println(" total milliseconds");
    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass()
                + "\n with message: " + e.getMessage());
    } finally {
        // Always close the writer so the index lock is released even when
        // indexing fails part-way through.
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException ignored) {
                // best-effort close; the primary error was already reported
            }
        }
    }
}
/** * getSentencesContaining returns a Vector of Integers containing the * numbers of the sentences that contain the given words in the source * language corpus. * * @param words_ The words to be found * @return A Vector of the sentence numbers */ public Vector<String> getSentencesContaining(String words_) { Vector<String> sentenceNums = new Vector<>(); words_ = requireAll(words_); try { Analyzer analyzer = new SimpleAnalyzer(); Query query = QueryParser.parse(words_, "contents", analyzer); Hits hits = sourceSearcher.search(query); // Add the numbers of all the hits to the Vector for (int i = 0; i < hits.length(); i++) { Document sentence = hits.doc(i); sentenceNums.add(sentence.get("snum")); //DEBUG System.out.println(sentence.get("snum") + ": " + sentence.get("contents")); } } catch (Exception e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } return sentenceNums; }
/** * numSentencesContaining returns the number of sentences containing the * given words. * * @param words_ The words to be found * @param searcher The searcher to be searched. * @return The number of sentences containing the words */ public int numSentencesContaining(String words_, Searcher searcher) { int num = 0; words_ = requireAll(words_); //DEBUG System.out.println("Finding hits for " + words_); try { Analyzer analyzer = new SimpleAnalyzer(); Query query = QueryParser.parse(words_, "contents", analyzer); Hits hits = searcher.search(query); num = hits.length(); } catch (Exception e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } return num; }
public void testDefaultOperator() throws Exception { Query query = new MultiFieldQueryParser(Version.LUCENE_41, new String[]{"title", "subject"}, new SimpleAnalyzer()).parse("development"); Directory dir = TestUtil.getBookIndexDirectory(); IndexSearcher searcher = new IndexSearcher( dir, true); TopDocs hits = searcher.search(query, 10); assertTrue(TestUtil.hitsIncludeTitle( searcher, hits, "Ant in Action")); assertTrue(TestUtil.hitsIncludeTitle( //A searcher, //A hits, //A "Extreme Programming Explained")); //A searcher.close(); dir.close(); }
/**
 * Verifies that an explicit MUST operator on both fields requires the term
 * in title AND subject, yielding exactly one matching book.
 *
 * @throws Exception if the index cannot be opened or searched
 */
public void testSpecifiedOperator() throws Exception {
    Query query = MultiFieldQueryParser.parse(Version.LUCENE_41,
            "lucene",
            new String[]{"title", "subject"},
            new BooleanClause.Occur[]{BooleanClause.Occur.MUST,
                                      BooleanClause.Occur.MUST},
            new SimpleAnalyzer());
    Directory dir = TestUtil.getBookIndexDirectory();
    IndexSearcher searcher = new IndexSearcher(dir, true);
    try {
        TopDocs hits = searcher.search(query, 10);
        assertTrue(TestUtil.hitsIncludeTitle(searcher, hits,
                "Lucene in Action, Second Edition"));
        assertEquals("one and only one", 1, hits.scoreDocs.length);
    } finally {
        // Close even when an assertion fails, so the index isn't left open.
        searcher.close();
        dir.close();
    }
}
public void setUp() throws Exception { Directory directory = new RAMDirectory(); IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); Document doc = new Document(); doc.add(new Field("partnum", "Q36", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS)); //A doc.add(new Field("description", "Illidium Space Modulator", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); writer.close(); searcher = new IndexSearcher(directory); }
/**
 * Parses the prefix query "test*" against the "title" field.
 *
 * <p>The original declared a {@code QueryParser} return type but contained
 * no return statement at all (a compile error); the method name and body
 * show the parsed {@link Query} is the intended result, so that is what is
 * returned here.
 *
 * @return the parsed prefix query for "test*"
 * @throws ParseException if the query string cannot be parsed
 */
public Query getQuery() throws ParseException {
    Analyzer analyzer = new SimpleAnalyzer();
    QueryParser parser = new QueryParser(
            org.apache.lucene.util.Version.LUCENE_4_0, "title", analyzer);
    String querystr = "test*";
    return parser.parse(querystr);
}
public void testBasicQueryParser() throws Exception { Query query = new QueryParser(Version.LUCENE_41, //1 "description", //1 new SimpleAnalyzer()) //1 .parse("partnum:Q36 AND SPACE"); //1 assertEquals("note Q36 -> q", "+partnum:q +space", query.toString("description")); //2 assertEquals("doc not found :(", 0, TestUtil.hitCount(searcher, query)); }
/**
 * Shows the PerFieldAnalyzerWrapper fix for the not-analyzed field: routing
 * "partnum" through KeywordAnalyzer keeps "Q36" verbatim, so the same query
 * string now finds the document.
 *
 * @throws Exception if parsing or searching fails
 */
public void testPerFieldAnalyzer() throws Exception {
    PerFieldAnalyzerWrapper analyzer =
            new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
    analyzer.addAnalyzer("partnum", new KeywordAnalyzer());
    QueryParser parser =
            new QueryParser(Version.LUCENE_41, "description", analyzer);
    Query query = parser.parse("partnum:Q36 AND SPACE");
    assertEquals("Q36 kept as-is",
            "+partnum:Q36 +space",
            query.toString("description"));
    assertEquals("doc found!",
            1,
            TestUtil.hitCount(searcher, query));
}
/**
 * Instantiates the analyzer class configured under the ANALYZER key,
 * falling back to SimpleAnalyzer when none is configured.
 *
 * @param conf the configuration to read the analyzer class from
 * @return a new analyzer instance built via ReflectionUtils
 * @throws IOException declared for interface compatibility
 */
public static Analyzer getAnalyzer(Configuration conf) throws IOException {
    Class<? extends Analyzer> analyzerType =
            conf.getClass(ANALYZER, SimpleAnalyzer.class, Analyzer.class);
    return ReflectionUtils.newInstance(analyzerType, conf);
}
/** * Counts the intersection between the sentences containing S in the source * corpus and the sentences containing T in the target corpus. * * @param S * The words in the source corpus, separated by spaces. * @param T * The words in the target corpus, separated by spaces. * @return The number of sentences containing both all of the words in S and * all of the words in T. */ public int countIntersections(String S, String T) { int retNum = 0; // Require all terms S = requireAll(S); T = requireAll(T); try { // Get all sentences for the source terms Analyzer sanalyzer = new SimpleAnalyzer(); Query squery = QueryParser.parse(S, "contents", sanalyzer); Hits sHits = sourceSearcher.search(squery, new Sort("snum")); // Get all sentences for the target terms Analyzer tanalyzer = new SimpleAnalyzer(); Query tquery = QueryParser.parse(T, "contents", tanalyzer); Hits tHits = targetSearcher.search(tquery, new Sort("snum")); int sCount = 0; int tCount = 0; // Compare the sentences, and count how many match while (sCount < sHits.length() && tCount < tHits.length()) { Document sSentence = sHits.doc(sCount); int sSentNum = Integer.valueOf(sSentence.get("snum")); Document tSentence = tHits.doc(tCount); int tSentNum = Integer.valueOf(tSentence.get("snum")); //DEBUG System.out.println("s " + sSentNum + "\tt " + tSentNum); if (sSentNum == tSentNum) { retNum++; sCount++; tCount++; } else if (sSentNum > tSentNum) { tCount++; } else if (sSentNum < tSentNum) { sCount++; } } } catch (Exception e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } return retNum; }