/**
 * Construct an empty IndexManager.
 */
public IndexManager() {
    SimpleAnalyzer analyzer = new SimpleAnalyzer(Version.LUCENE_47);
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    try {
        System.out.println("Building the Index...");
        this.indexWriter = new IndexWriter(FSDirectory.open(new File(PATH)), indexWriterConfig);
        // first ask the database to give me all of the tweets.
        OracleDAL db = new OracleDAL();
        ArrayList<Tweet> list = (ArrayList<Tweet>) db.getAllTweets();
        // now build the index
        int indexedDocumentCount = this.indexDocsFromList(indexWriter, list);
        indexWriter.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
@NotNull
private static SpellChecker createIndexSpellchecker(@NotNull final Directory index) throws IOException {
    final Directory spellCheckerDirectory = new RAMDirectory();
    final IndexReader indexReader = DirectoryReader.open(index);
    final Analyzer analyzer = new SimpleAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    final Dictionary dictionary = new HighFrequencyDictionary(indexReader, DRUG_TERMS_FIELD, 0.0f);
    final SpellChecker spellChecker = new SpellChecker(spellCheckerDirectory);

    spellChecker.indexDictionary(dictionary, config, false);
    spellChecker.setAccuracy(SPELLCHECK_ACCURACY);
    return spellChecker;
}
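Given a Directory that already holds the indexed drug terms, the spell checker built above can be asked for corrections. A minimal sketch of such a caller; the misspelled term and suggestion count are illustrative assumptions:

// Hypothetical caller: ask for up to five corrections of a misspelled term.
SpellChecker spellChecker = createIndexSpellchecker(index);
String[] suggestions = spellChecker.suggestSimilar("ibuprofn", 5);
for (String suggestion : suggestions) {
    System.out.println(suggestion);
}
spellChecker.close();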
public static void main(String[] args) throws Exception {
    Analyzer analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(), 9);
    String content = KeepEverythingExtractor.INSTANCE.getText(new InputStreamReader(System.in));
    TokenStream ts = analyzer.tokenStream("extracted_text", content);
    CharTermAttribute cattr = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(cattr.toString());
    }
    ts.close();
}
@Override
protected JndiRegistry createRegistry() throws Exception {
    JndiRegistry registry = new JndiRegistry(createJndiContext());
    registry.bind("std", new File("target/stdindexDir"));
    registry.bind("load_dir", new File("src/test/resources/sources"));
    registry.bind("stdAnalyzer", new StandardAnalyzer());
    registry.bind("simple", new File("target/simpleindexDir"));
    registry.bind("simpleAnalyzer", new SimpleAnalyzer());
    registry.bind("whitespace", new File("target/whitespaceindexDir"));
    registry.bind("whitespaceAnalyzer", new WhitespaceAnalyzer());
    return registry;
}
public LuceneService(String directoryPath) {
    try {
        File indexFiles = new File(directoryPath);
        // location where the index files are stored
        dir = FSDirectory.open(indexFiles);
        // analyzer
        analyzer = new SimpleAnalyzer(DEFAULT_VERSION);
        // configuration class
    } catch (IOException e) {
        e.printStackTrace();
    }
}
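The trailing "configuration class" comment suggests the constructor goes on to build an IndexWriterConfig. A minimal sketch of that step, assuming dir and analyzer are fields of LuceneService and the same version-aware Lucene API is in use:

// Hypothetical continuation: build the configuration and open a writer on the same directory.
IndexWriterConfig config = new IndexWriterConfig(DEFAULT_VERSION, analyzer);
IndexWriter writer = new IndexWriter(dir, config);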
public void testIndexing() throws IOException, ParserConfigurationException, SAXException {
    IndexWriterConfig iwConf = new IndexWriterConfig(Version.LUCENE_42, new SimpleAnalyzer(Version.LUCENE_42));
    // if you want to append the index to a pre-existing one, use the next line.
    // iwConf.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
    IndexWriter iw = new IndexWriter(FSDirectory.open(testIndex), iwConf);
    // create a LIRE DocumentBuilder for extracting FCTH (just an example, every other feature will do).
    DocumentBuilder builder = DocumentBuilderFactory.getFCTHDocumentBuilder();
    ArrayList<File> files = FileUtils.getAllImageFiles(new File("testdata/ferrari"), true);
    // for handling the XML of the test data set
    SAXParserFactory spf = SAXParserFactory.newInstance();
    spf.setNamespaceAware(true);
    SAXParser saxParser = spf.newSAXParser();
    XMLReader xmlReader = saxParser.getXMLReader();
    for (Iterator<File> iterator = files.iterator(); iterator.hasNext(); ) {
        File img = iterator.next();
        String path = img.getCanonicalPath();
        // create the document with the LIRE DocumentBuilder; this adds the image features to the document.
        Document d = builder.createDocument(new FileInputStream(img), path);
        // handling the XML of the test data set
        path = path.substring(0, path.lastIndexOf('.')) + ".xml";
        TagHandler handler = new TagHandler();
        xmlReader.setContentHandler(handler);
        xmlReader.parse(new InputSource(new File(path).toURI().toString()));
        // add the text to the document ...
        d.add(new TextField("tags", handler.getTags(), Field.Store.YES));
        // don't forget to add the document to the index.
        iw.addDocument(d);
    }
    iw.close();
}
public ContextSuggestDemo() throws IOException {
    indexDir = new RAMDirectory();
    suggestDir = new RAMDirectory();
    analyzer = new SimpleAnalyzer();
    suggester = new AnalyzingInfixSuggester(suggestDir, analyzer, analyzer, 1, true);
    buildSearchIndex();
    buildSuggesterIndex();
}
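Once buildSuggesterIndex() has run, the AnalyzingInfixSuggester can serve lookups. A minimal sketch of querying it; the lookup term, result count, and printed fields are illustrative assumptions:

// Hypothetical lookup: top five suggestions whose indexed text contains "lucene".
List<Lookup.LookupResult> results = suggester.lookup("lucene", 5, true, false);
for (Lookup.LookupResult result : results) {
    System.out.println(result.key + " (weight=" + result.value + ")");
}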
/**
 * Creates a query based on the given term, field, and type.
 *
 * @param term  search term for the query
 * @param field document field which the term is matched against
 * @param type  the type of query to be created, either QUERY_BOOLEAN or QUERY_STANDARD
 * @return a query for the given field and term, using either a BooleanQuery of
 *         WildcardQuery clauses or a Query built from a QueryParser and SimpleAnalyzer
 */
private Query getQuery(String term, String field, int type) {
    Query qry = null;
    if (type == FileSearcher.QUERY_BOOLEAN) {
        qry = new BooleanQuery();
        String[] words = term.split(" ");
        ((BooleanQuery) qry).add(new WildcardQuery(new Term(field, "*" + words[0])), BooleanClause.Occur.MUST);
        if (words.length > 1) {
            for (int i = 1; i < words.length - 1; i++) {
                ((BooleanQuery) qry).add(new WildcardQuery(new Term(field, words[i])), BooleanClause.Occur.MUST);
            }
            ((BooleanQuery) qry).add(new WildcardQuery(new Term(field, words[words.length - 1] + "*")), BooleanClause.Occur.MUST);
        }
    } else if (type == FileSearcher.QUERY_STANDARD) {
        try {
            qry = new QueryParser(Version.LUCENE_47, field, new SimpleAnalyzer(Version.LUCENE_47)).parse(term);
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
    return qry;
}
public void testPerField() throws Exception {
    String text = "Qwerty";
    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put("special", new SimpleAnalyzer(TEST_VERSION_CURRENT));
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), analyzerPerField);

    TokenStream tokenStream = analyzer.tokenStream("field", new StringReader(text));
    CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    assertTrue(tokenStream.incrementToken());
    assertEquals("WhitespaceAnalyzer does not lowercase", "Qwerty", termAtt.toString());

    tokenStream = analyzer.tokenStream("special", new StringReader(text));
    termAtt = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    assertTrue(tokenStream.incrementToken());
    assertEquals("SimpleAnalyzer lowercases", "qwerty", termAtt.toString());
}
public void createIndex(List<File> files, String idxDirectory, String baseURI) {
    try {
        urlAnalyzer = new SimpleAnalyzer(LUCENE_VERSION);
        literalAnalyzer = new LiteralAnalyzer(LUCENE_VERSION);
        Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
        mapping.put(TripleIndex.FIELD_NAME_SUBJECT, urlAnalyzer);
        mapping.put(TripleIndex.FIELD_NAME_PREDICATE, urlAnalyzer);
        mapping.put(TripleIndex.FIELD_NAME_OBJECT_URI, urlAnalyzer);
        mapping.put(TripleIndex.FIELD_NAME_OBJECT_LITERAL, literalAnalyzer);
        PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(urlAnalyzer, mapping);

        File indexDirectory = new File(idxDirectory);
        indexDirectory.mkdir();
        directory = new MMapDirectory(indexDirectory);
        IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, perFieldAnalyzer);
        iwriter = new IndexWriter(directory, config);
        iwriter.commit();
        for (File file : files) {
            String type = FileUtil.getFileExtension(file.getName());
            if (type.equals(TTL))
                indexTTLFile(file, baseURI);
            if (type.equals(TSV))
                indexTSVFile(file);
            iwriter.commit();
        }
        iwriter.close();
        ireader = DirectoryReader.open(directory);
    } catch (Exception e) {
        log.error("Error while creating TripleIndex.", e);
    }
}
public void createIndex(List<File> files, String idxDirectory, String baseURI) {
    try {
        urlAnalyzer = new SimpleAnalyzer(LUCENE_VERSION);
        literalAnalyzer = new LiteralAnalyzer(LUCENE_VERSION);
        Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
        mapping.put(FIELD_NAME_URI, urlAnalyzer);
        mapping.put(FIELD_NAME_SURFACE_FORM, literalAnalyzer);
        mapping.put(FIELD_NAME_URI_COUNT, literalAnalyzer);
        mapping.put(FIELD_NAME_CONTEXT, literalAnalyzer);
        PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(urlAnalyzer, mapping);

        File indexDirectory = new File(idxDirectory);
        indexDirectory.mkdir();
        directory = new MMapDirectory(indexDirectory);
        IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, perFieldAnalyzer);
        iwriter = new IndexWriter(directory, config);
        iwriter.commit();
        for (File file : files) {
            String type = FileUtil.getFileExtension(file.getName());
            if (type.equals(TTL))
                indexTTLFile(file, baseURI);
            iwriter.commit();
        }
    } catch (Exception e) {
        log.error("Error while creating TripleIndex.", e);
    }
}
public void addIndexes() throws Exception {
    Directory otherDir = null; // placeholder: should point at the target index directory
    Directory ramDir = null;   // placeholder: should point at the index to be merged in
    IndexWriter writer = new IndexWriter(otherDir,
            new IndexWriterConfig(Version.LUCENE_41, new SimpleAnalyzer(Version.LUCENE_41)));
    writer.addIndexes(new Directory[] { ramDir });
}
public SimpleAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.simpleAnalyzer = new SimpleAnalyzer();
    this.simpleAnalyzer.setVersion(version);
}
@Override
public SimpleAnalyzer get() {
    return this.simpleAnalyzer;
}
@Inject
public SimpleAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    this.simpleAnalyzer = new SimpleAnalyzer();
    this.simpleAnalyzer.setVersion(version);
}
/**
 * We assume that the initial indexing has been done and a set of reference objects has been
 * found and indexed in the separate fileList. However, further documents were added and they
 * now need to get a ranked list of reference objects. So we (i) get all these new documents
 * missing the field "ro-order" and (ii) add this field.
 *
 * @param indexPath the index to update
 * @throws IOException
 */
public void updateIndex(String indexPath) throws IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int numDocs = reader.numDocs();
    boolean hasDeletions = reader.hasDeletions();
    int countUpdated = 0;

    IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
    ImageSearcher searcher = new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
    Map<String, Analyzer> perField = new HashMap<String, Analyzer>(1);
    perField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
    PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), perField);

    IndexWriter iw = new IndexWriter(FSDirectory.open(new File(indexPath)),
            new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    StringBuilder sb = new StringBuilder(256);
    // Needed to check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (int i = 0; i < numDocs; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.
        Document document = reader.document(i);
        if (document.getField("ro-order") == null) { // if the field is not here we create it.
            ImageSearchHits hits = searcher.search(document, readerRo);
            sb.delete(0, sb.length());
            for (int j = 0; j < numReferenceObjectsUsed; j++) {
                sb.append(hits.doc(j).getValues("ro-id")[0]);
                sb.append(' ');
            }
            // System.out.println(sb.toString());
            document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
            iw.updateDocument(new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                    document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), document);
            countUpdated++;
        }
        // progress report
        progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);
        // debug: System.out.println("countUpdated = " + countUpdated);
    }
    iw.commit();
    iw.close();
}
public void testSearch() throws IOException, ParseException {
    // create a Lucene IndexReader and the corresponding IndexSearcher:
    IndexReader reader = DirectoryReader.open(FSDirectory.open(testIndex));
    IndexSearcher searcher = new IndexSearcher(reader);

    // The QueryParser takes a String and creates a query out of it. Make sure you use the same field
    // as for indexing, in this case "tags".
    QueryParser q = new QueryParser(Version.LUCENE_42, "tags", new SimpleAnalyzer(Version.LUCENE_42));
    // let's just take the tags of the first document in the index:
    Query query = q.parse(reader.document(1).getValues("tags")[0]);

    // now that's the actual search:
    // NOTE: The number of results here is critical. The fewer documents are returned here, the
    // less the image re-ranking can mess up. However, the recall (the absolute number of relevant
    // documents returned) is also influenced by this. Best to try several values like 10, 100, 200, 500, ...
    TopDocs results = searcher.search(query, 10);

    // here we print the results of the text search, just for the win.
    System.out.println("-----------> SEARCH RESULTS ...");
    for (int i = 0; i < results.scoreDocs.length; i++) {
        ScoreDoc scoreDoc = results.scoreDocs[i];
        System.out.print(scoreDoc.score + "\t: ");
        // reader.document(scoreDoc.doc).getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0] gets you the actual image file path.
        // LIRE manages all needed field names as static Strings in DocumentBuilder ...
        System.out.print(reader.document(scoreDoc.doc).getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0] + " -> ");
        System.out.println(reader.document(scoreDoc.doc).getValues("tags")[0]);
    }
    // just for a visual example ... this will pop up a browser window
    FileUtils.browseUri(FileUtils.saveImageResultsToHtml("text", results, reader,
            reader.document(1).getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]));

    // and now for the re-ranking:
    // make sure to use a low-level feature that has been indexed -- check the DocumentBuilder in the method above.
    RerankFilter rerank = new RerankFilter(FCTH.class, DocumentBuilder.FIELD_NAME_FCTH);
    // note that you need the document here, it contains the low-level feature ...
    // if you don't have it but just the image, you need to create a new one with the
    // appropriate DocumentBuilder -- check the DocumentBuilder in the method above.
    ImageSearchHits hitsReranked = rerank.filter(results, reader, reader.document(1));

    // and here we print the re-ranked hits:
    System.out.println("-----------> RERANKED ...");
    for (int i = 0; i < hitsReranked.length(); i++) {
        System.out.print(hitsReranked.score(i) + "\t: ");
        System.out.print(hitsReranked.doc(i).getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0] + " -> ");
        System.out.println(hitsReranked.doc(i).getValues("tags")[0]);
    }
    // just for a visual example ... this will pop up a browser window.
    FileUtils.browseUri(FileUtils.saveImageResultsToHtml("reranked", hitsReranked,
            reader.document(1).getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]));
}
@Test
public void testGetSimple() {
    Analyzer analyzer = PreBuiltAnalyzers.SIMPLE.get();
    Assert.assertEquals(SimpleAnalyzer.class, analyzer.getClass());
}
private static IndexWriterConfig config() {
    return GaeLuceneUtil.getIndexWriterConfig(new SimpleAnalyzer());
}
private static Analyzer getAnalyzer() {
    return new SimpleAnalyzer(Version.LUCENE_41);
}
/**
 * This constructor loads the SKOS model from a given InputStream using the
 * given serialization language parameter, which must be either N3, RDF/XML,
 * or TURTLE.
 *
 * @param inputStream the input stream
 * @param lang        the serialization language
 * @throws IOException if the model cannot be loaded
 */
public SKOSEngineImpl(final Version version, InputStream inputStream, String lang) throws IOException {
    if (!("N3".equals(lang) || "RDF/XML".equals(lang) || "TURTLE".equals(lang))) {
        throw new IOException("Invalid RDF serialization format");
    }
    matchVersion = version;
    analyzer = new SimpleAnalyzer(matchVersion);
    skosModel = ModelFactory.createDefaultModel();
    skosModel.read(inputStream, null, lang);
    indexDir = new RAMDirectory();
    entailSKOSModel();
    indexSKOSModel();
    searcher = new IndexSearcher(DirectoryReader.open(indexDir));
}
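A minimal sketch of calling this constructor from client code; the file name and Lucene version are assumptions for illustration only:

// Hypothetical usage: load a SKOS thesaurus serialized as N3 and build the in-memory index.
InputStream in = new FileInputStream("thesaurus.n3");
SKOSEngineImpl engine = new SKOSEngineImpl(Version.LUCENE_45, in, "N3");
in.close();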
/**
 * This constructor loads the SKOS model from a given filename or URI, starts
 * the indexing process, and sets up the index searcher.
 *
 * @param languages the languages to be considered
 * @param filenameOrURI
 * @throws IOException
 */
public SKOSEngineImpl(final Version version, String filenameOrURI, String... languages) throws IOException {
    matchVersion = version;
    analyzer = new SimpleAnalyzer(matchVersion);

    String langSig = "";
    if (languages != null) {
        this.languages = new TreeSet<String>(Arrays.asList(languages));
        langSig = "-" + StringUtils.join(this.languages, ".");
    }
    String name = FilenameUtils.getName(filenameOrURI);
    //File dir = new File("skosdata/" + name + langSig);
    File dir = new File("/opt/webapps/solr4/ehri/portal/data/skosdata/" + name + langSig);
    indexDir = FSDirectory.open(dir);

    // TODO: Generate also if source file is modified
    if (!dir.isDirectory()) {
        // load the skos model from the given file
        FileManager fileManager = new FileManager();
        fileManager.addLocatorFile();
        fileManager.addLocatorURL();
        fileManager.addLocatorClassLoader(SKOSEngineImpl.class.getClassLoader());
        if (FilenameUtils.getExtension(filenameOrURI).equals("zip")) {
            fileManager.addLocatorZip(filenameOrURI);
            filenameOrURI = FilenameUtils.getBaseName(filenameOrURI);
        }
        skosModel = fileManager.loadModel(filenameOrURI);
        entailSKOSModel();
        indexSKOSModel();
    }
    searcher = new IndexSearcher(DirectoryReader.open(indexDir));
}
/**
 * This test indexes a sample metadata record (= Lucene document) having a
 * "title", "description", and "subject" field, which contains plain subject
 * terms.
 *
 * A search for "arms" doesn't return that record because the term "arms" is
 * not explicitly contained in the record (document).
 *
 * @throws IOException
 * @throws LockObtainFailedException
 * @throws CorruptIndexException
 */
@Test
public void noExpansion() throws CorruptIndexException, LockObtainFailedException, IOException {
    /* defining the document to be indexed */
    Document doc = new Document();
    doc.add(new Field("title", "Spearhead", TextField.TYPE_STORED));
    doc.add(new Field(
            "description",
            "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..."
                    + "The spear was mainly a thrusting weapon, but could also be thrown. "
                    + "It was the principal weapon of the auxiliary soldier... "
                    + "(second - fourth century, Arbeia Roman Fort).",
            TextField.TYPE_NOT_STORED));
    doc.add(new Field("subject", "weapons", TextField.TYPE_NOT_STORED));

    /* setting up a writer with a default (simple) analyzer */
    writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(
            Version.LUCENE_45, new SimpleAnalyzer(Version.LUCENE_45)));

    /* adding the document to the index */
    writer.addDocument(doc);

    /* defining a query that searches over all fields */
    BooleanQuery query = new BooleanQuery();
    query.add(new TermQuery(new Term("title", "arms")), BooleanClause.Occur.SHOULD);
    query.add(new TermQuery(new Term("description", "arms")), BooleanClause.Occur.SHOULD);
    query.add(new TermQuery(new Term("subject", "arms")), BooleanClause.Occur.SHOULD);

    /* creating a new searcher */
    searcher = new IndexSearcher(DirectoryReader.open(writer, false));
    TopDocs results = searcher.search(query, 10);

    /* no results are returned since there is no term match */
    Assert.assertEquals(0, results.totalHits);
}
@Test
public void testTermQuery() throws CorruptIndexException, IOException, QueryNodeException {
    Document doc = new Document();
    doc.add(new Field("content", "I work for the united nations", TextField.TYPE_STORED));
    writer.addDocument(doc);

    searcher = new IndexSearcher(DirectoryReader.open(writer, false));

    StandardQueryParser parser = new StandardQueryParser(new SimpleAnalyzer(matchVersion));
    Query query = parser.parse("united nations", "content");
    Assert.assertEquals(1, TestUtil.hitCount(searcher, query));
}
/**
 * This test indexes a sample metadata record (= Lucene document) having a
 * "title", "description", and "subject" field, which is semantically enriched
 * by a URI pointing to the SKOS concept "weapons".
 *
 * A search for "arms" returns that record as a result because "arms" is
 * defined as an alternative label (altLabel) for the concept "weapons".
 *
 * @throws IOException
 */
@Test
public void uriBasedTermExpansion() throws IOException {
    /* defining the document to be indexed */
    Document doc = new Document();
    doc.add(new Field("title", "Spearhead", TextField.TYPE_STORED));
    doc.add(new Field(
            "description",
            "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..."
                    + "The spear was mainly a thrusting weapon, but could also be thrown. "
                    + "It was the principal weapon of the auxiliary soldier... "
                    + "(second - fourth century, Arbeia Roman Fort).",
            TextField.TYPE_NOT_STORED));
    doc.add(new Field("subject", "http://www.ukat.org.uk/thesaurus/concept/859", TextField.TYPE_NOT_STORED));

    /* setting up the SKOS analyzer */
    String skosFile = "src/test/resources/skos_samples/ukat_examples.n3";
    /* ExpansionType.URI -> the field to be analyzed (expanded) contains URIs */
    Analyzer skosAnalyzer = new SKOSAnalyzer(matchVersion, skosFile, ExpansionType.URI);

    /* Define different analyzers for different fields */
    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put("subject", skosAnalyzer);
    PerFieldAnalyzerWrapper indexAnalyzer = new PerFieldAnalyzerWrapper(
            new SimpleAnalyzer(matchVersion), analyzerPerField);

    /* setting up a writer with a default (simple) analyzer */
    writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(matchVersion, indexAnalyzer));

    /* adding the document to the index */
    writer.addDocument(doc);

    /* defining a query that searches over all fields */
    BooleanQuery query1 = new BooleanQuery();
    query1.add(new TermQuery(new Term("title", "arms")), BooleanClause.Occur.SHOULD);
    query1.add(new TermQuery(new Term("description", "arms")), BooleanClause.Occur.SHOULD);
    query1.add(new TermQuery(new Term("subject", "arms")), BooleanClause.Occur.SHOULD);

    /* creating a new searcher */
    searcher = new IndexSearcher(DirectoryReader.open(writer, false));
    TopDocs results = searcher.search(query1, 10);

    /* the document matches because "arms" is among the expanded terms */
    Assert.assertEquals(1, results.totalHits);

    /* defining a query that searches for a broader concept */
    Query query2 = new TermQuery(new Term("subject", "military equipment"));
    results = searcher.search(query2, 10);

    /* ... also returns the document as result */
    Assert.assertEquals(1, results.totalHits);
}
/**
 * This test indexes a sample metadata record (= Lucene document) having a
 * "title", "description", and "subject" field.
 *
 * A search for "arms" returns that record as a result because "arms" is
 * defined as an alternative label for "weapons", the term which is contained
 * in the subject field.
 *
 * @throws IOException
 */
@Test
public void labelBasedTermExpansion() throws IOException {
    /* defining the document to be indexed */
    Document doc = new Document();
    doc.add(new Field("title", "Spearhead", TextField.TYPE_STORED));
    doc.add(new Field(
            "description",
            "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..."
                    + "The spear was mainly a thrusting weapon, but could also be thrown. "
                    + "It was the principal weapon of the auxiliary soldier... "
                    + "(second - fourth century, Arbeia Roman Fort).",
            TextField.TYPE_NOT_STORED));
    doc.add(new Field("subject", "weapons", TextField.TYPE_NOT_STORED));

    /* setting up the SKOS analyzer */
    String skosFile = "src/test/resources/skos_samples/ukat_examples.n3";
    /* ExpansionType.LABEL -> the field to be analyzed (expanded) contains plain terms */
    Analyzer skosAnalyzer = new SKOSAnalyzer(matchVersion, skosFile, ExpansionType.LABEL);

    /* Define different analyzers for different fields */
    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put("subject", skosAnalyzer);
    PerFieldAnalyzerWrapper indexAnalyzer = new PerFieldAnalyzerWrapper(
            new SimpleAnalyzer(matchVersion), analyzerPerField);

    /* setting up a writer with a default (simple) analyzer */
    writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(matchVersion, indexAnalyzer));

    /* adding the document to the index */
    writer.addDocument(doc);

    /* defining a query that searches over all fields */
    BooleanQuery query1 = new BooleanQuery();
    query1.add(new TermQuery(new Term("title", "arms")), BooleanClause.Occur.SHOULD);
    query1.add(new TermQuery(new Term("description", "arms")), BooleanClause.Occur.SHOULD);
    query1.add(new TermQuery(new Term("subject", "arms")), BooleanClause.Occur.SHOULD);

    /* creating a new searcher */
    searcher = new IndexSearcher(DirectoryReader.open(writer, false));
    TopDocs results = searcher.search(query1, 10);

    /* the document matches because "arms" is among the expanded terms */
    Assert.assertEquals(1, results.totalHits);

    /* defining a query that searches for a broader concept */
    Query query2 = new TermQuery(new Term("subject", "military equipment"));
    results = searcher.search(query2, 10);

    /* ... also returns the document as result */
    Assert.assertEquals(1, results.totalHits);
}
/**
 * Returns the result of a tag cloud computed over a text. Since every term that
 * appears in the cloud represents a group of terms, the system makes two passes.
 * The goal is to gather under a single representative term all tokens that share
 * the same tokenization. In practice, given two terms such as "tecnico" and
 * "tecnica", the analyzer returns the same token "tecnic". We do not want the
 * cloud to show the word "tecnic", but always the first of the terms that
 * actually appeared in the text; the cloud will therefore show the word
 * "tecnico", representing both "tecnico" and "tecnica".
 *
 * @param ret      the tag cloud result to be filled
 * @param text     the text on which to compute the tag cloud
 * @param id       unique id of the document whose text is passed as parameter
 * @param analyzer the analyzer
 * @throws Exception on error
 */
public static void getTagClasses(final TagCloudResults ret, String text, String id, Analyzer analyzer) throws Exception {
    TokenizerFilter tf = (String term) -> {
        try {
            String newTerm = tokenize(term.trim(), analyzer, -1);
            if (newTerm.length() > 0) {
                ret.add(newTerm, term, id);
            }
        } catch (Exception exception) {
        }
    };
    tokenize(text, new SimpleAnalyzer(), -1, tf);
}
/**
 * Creates an IndexWriter for the given index path, using a SimpleAnalyzer.
 *
 * @param indexPath the path to the index directory
 * @param create    set to true if you want to create a new index
 * @return the IndexWriter
 * @throws IOException
 */
public static IndexWriter createIndexWriter(String indexPath, boolean create) throws IOException {
    return createIndexWriter(indexPath, create, AnalyzerType.SimpleAnalyzer);
}
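A minimal sketch of calling this helper; the index path is illustrative, and the three-argument overload it delegates to is assumed to live in the same utility class:

// Hypothetical usage: open (or create) an index backed by a SimpleAnalyzer writer.
IndexWriter writer = createIndexWriter("index/images", true);
try {
    // ... add documents here ...
} finally {
    writer.close();
}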