/**
 * Builds the analysis chain for a field: an {@link IKTokenizer} over the supplied reader whose
 * output is passed through a filter created by a {@link SynonymFilterFactory}.
 *
 * <p>The synonym factory is configured with the analyzer's {@code luceneMatchVersion}, the first
 * entry returned by {@code Configuration#getExtSynonymDictionarys()} as the synonyms resource,
 * and case-insensitive matching. The synonyms resource is resolved through a
 * {@link ClasspathResourceLoader}.
 *
 * @param fieldName name of the field being analysed (not used by this implementation)
 * @param reader    source of the text to tokenize
 * @return the tokenizer together with the synonym-filtered token stream
 * @throws IllegalStateException if the synonym resource cannot be loaded
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer token = new IKTokenizer(reader, useSmart);

    Map<String, String> paramsMap = new HashMap<String, String>();
    Configuration cfg = DefaultConfig.getInstance();
    // NOTE(review): get(0) presumes at least one external synonym dictionary is configured.
    String synonymsResource = cfg.getExtSynonymDictionarys().get(0);
    paramsMap.put("luceneMatchVersion", luceneMatchVersion.toString());
    paramsMap.put("synonyms", synonymsResource);
    paramsMap.put("ignoreCase", "true");

    SynonymFilterFactory factory = new SynonymFilterFactory(paramsMap);
    ResourceLoader loader = new ClasspathResourceLoader();
    try {
        factory.inform(loader);
    } catch (IOException e) {
        // Fail loudly instead of printing the stack trace and carrying on: continuing would
        // call create() on a factory whose synonyms were never loaded. The overridden method
        // declares no checked exceptions, so the failure is wrapped with its cause preserved.
        throw new IllegalStateException(
                "Unable to load synonym dictionary '" + synonymsResource + "'", e);
    }
    return new TokenStreamComponents(token, factory.create(token));
}
/**
 * Analyzer bean used to index elements into the training set and to compare them with the
 * document to categorise.
 *
 * <p>The bean is only registered when the {@code lucene.categoriser.useTSetBasedCategoriser}
 * property condition is satisfied. Each filter factory is handed to the analyser only when the
 * matching switch ({@code useStopFilter} / {@code useSynonymFilter}) is on; otherwise
 * {@code null} is passed in its place.
 *
 * @param stopFilterFactory    stop-word filter factory, forwarded only if {@code useStopFilter}
 * @param synonymFilterFactory synonym filter factory, forwarded only if {@code useSynonymFilter}
 * @return a {@link TaxonomyTrainingSetAnalyser} configured with the selected factories and the
 *         configured maximum shingle size
 * @throws NumberFormatException declared for the conversion of {@code maxShingleSize} through
 *                               {@code Integer.valueOf}
 * @throws ParseException        declared by the original signature; no statement in this method
 *                               visibly raises it
 */
@ConditionalOnProperty(prefix = "lucene.categoriser.", value = "useTSetBasedCategoriser")
@Bean
public Analyzer trainingSetAnalyser(StopFilterFactory stopFilterFactory,
        SynonymFilterFactory synonymFilterFactory) throws NumberFormatException, ParseException {
    // A disabled filter is represented by a null factory.
    StopFilterFactory activeStopFilter = useStopFilter ? stopFilterFactory : null;
    SynonymFilterFactory activeSynonymFilter = useSynonymFilter ? synonymFilterFactory : null;

    return new TaxonomyTrainingSetAnalyser(activeStopFilter, activeSynonymFilter,
            Integer.valueOf(maxShingleSize));
}
public void testMultiWordSynonyms() throws IOException { SynonymFilterFactory factory = new SynonymFilterFactory(); Map<String,String> args = new HashMap<String,String>(); args.put("synonyms", "synonyms.txt"); factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); factory.init(args); factory.inform(new StringMockResourceLoader("a b c,d")); TokenStream ts = factory.create(new MockTokenizer(new StringReader("a e"), MockTokenizer.WHITESPACE, false)); // This fails because ["e","e"] is the value of the token stream assertTokenStreamContents(ts, new String[] { "a", "e" }); }
/**
 * Creates the analyser and stores the supplied collaborators in the corresponding fields.
 *
 * @param synonymFilterFactory       factory kept in {@code this.synonymFilterFactory}
 * @param wordDelimiterFilterFactory factory kept in {@code this.wordDelimiterFilterFactory}
 * @param analyzerType               value kept in {@code this.analyzerType}
 */
public IAViewTextGenAnalyser(SynonymFilterFactory synonymFilterFactory,
        WordDelimiterFilterFactory wordDelimiterFilterFactory, AnalyzerType analyzerType) {
    this.analyzerType = analyzerType;
    this.wordDelimiterFilterFactory = wordDelimiterFilterFactory;
    this.synonymFilterFactory = synonymFilterFactory;
}
/**
 * Creates the training-set analyser and stores the supplied settings in the corresponding
 * fields.
 *
 * @param stopFilterFactory    factory kept in {@code this.stopFilterFactory}; the
 *                             {@code trainingSetAnalyser} bean method passes {@code null} when
 *                             stop filtering is switched off
 * @param synonymFilterFactory factory kept in {@code this.synonymFilterFactory}; the
 *                             {@code trainingSetAnalyser} bean method passes {@code null} when
 *                             synonym filtering is switched off
 * @param maxShingleSize       value kept in {@code this.maxShingleSize}
 */
public TaxonomyTrainingSetAnalyser(StopFilterFactory stopFilterFactory,
        SynonymFilterFactory synonymFilterFactory, Integer maxShingleSize) {
    this.maxShingleSize = maxShingleSize;
    this.synonymFilterFactory = synonymFilterFactory;
    this.stopFilterFactory = stopFilterFactory;
}
/**
 * Creates the analyser and stores the supplied collaborators in the corresponding fields.
 *
 * @param stopFilterFactory    factory kept in {@code this.stopFilterFactory}
 * @param synonymFilterFactory factory kept in {@code this.synonymFilterFactory}
 * @param analyzerType         value kept in {@code this.analyzerType}
 */
public IAViewTextCasPuncAnalyser(StopFilterFactory stopFilterFactory,
        SynonymFilterFactory synonymFilterFactory, AnalyzerType analyzerType) {
    this.analyzerType = analyzerType;
    this.synonymFilterFactory = synonymFilterFactory;
    this.stopFilterFactory = stopFilterFactory;
}
/**
 * Creates the analyser and stores the supplied collaborators in the corresponding fields.
 *
 * @param synonymFilterFactory       factory kept in {@code this.synonymFilterFactory}
 * @param wordDelimiterFilterFactory factory kept in {@code this.wordDelimiterFilterFactory}
 * @param analyzerType               value kept in {@code this.analyzerType}
 */
public IAViewTextCasNoPuncAnalyser(SynonymFilterFactory synonymFilterFactory,
        WordDelimiterFilterFactory wordDelimiterFilterFactory, AnalyzerType analyzerType) {
    this.analyzerType = analyzerType;
    this.wordDelimiterFilterFactory = wordDelimiterFilterFactory;
    this.synonymFilterFactory = synonymFilterFactory;
}
/**
 * Creates the analyser and stores the supplied collaborators in the corresponding fields.
 *
 * @param synonymFilterFactory       factory kept in {@code this.synonymFilterFactory}
 * @param wordDelimiterFilterFactory factory kept in {@code this.wordDelimiterFilterFactory}
 * @param analyzerType               value kept in {@code this.analyzerType}
 */
public IAViewTextNoCasNoPuncAnalyser(SynonymFilterFactory synonymFilterFactory,
        WordDelimiterFilterFactory wordDelimiterFilterFactory, AnalyzerType analyzerType) {
    this.analyzerType = analyzerType;
    this.wordDelimiterFilterFactory = wordDelimiterFilterFactory;
    this.synonymFilterFactory = synonymFilterFactory;
}