@Factory
public SearchMapping getSearchMapping() {
    SearchMapping mapping = new SearchMapping();
    mapping.analyzerDef("autocompleteEdgeAnalyzer", PatternTokenizerFactory.class)
            // capture the whole input as one token, then build leading-edge n-grams from it
            .tokenizerParam("pattern", "(.*)")
            .tokenizerParam("group", "1")
            .filter(LowerCaseFilterFactory.class)
            .filter(StopFilterFactory.class)
            .filter(EdgeNGramFilterFactory.class)
            .param("minGramSize", "3")
            .param("maxGramSize", "50")
        .analyzerDef("autocompletePhoneticAnalyzer", StandardTokenizerFactory.class)
            .filter(StandardFilterFactory.class)
            .filter(StopFilterFactory.class)
            .filter(PhoneticFilterFactory.class)
            .param("encoder", "DoubleMetaphone")
            .filter(SnowballPorterFilterFactory.class)
            .param("language", "English")
        .analyzerDef("autocompleteNGramAnalyzer", StandardTokenizerFactory.class)
            .filter(WordDelimiterFilterFactory.class)
            .filter(LowerCaseFilterFactory.class)
            .filter(NGramFilterFactory.class)
            .param("minGramSize", "3")
            .param("maxGramSize", "20")
        .analyzerDef("standardAnalyzer", StandardTokenizerFactory.class)
            .filter(LowerCaseFilterFactory.class)
        .analyzerDef("exactAnalyzer", StandardTokenizerFactory.class)
        .analyzerDef("conceptParentPidsAnalyzer", WhitespaceTokenizerFactory.class);
    return mapping;
}
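These definitions only take effect once fields opt into them. A minimal sketch of how an entity might bind them, assuming Hibernate Search 5's annotation API (the Suggestion entity and its field names are hypothetical):

import javax.persistence.Entity;
import javax.persistence.Id;
import org.hibernate.search.annotations.Analyzer;
import org.hibernate.search.annotations.Field;
import org.hibernate.search.annotations.Fields;
import org.hibernate.search.annotations.Indexed;

// Hypothetical entity: each sub-field is indexed with one of the definitions above.
@Entity
@Indexed
public class Suggestion {

    @Id
    private Long id;

    @Fields({
        @Field(name = "textEdge", analyzer = @Analyzer(definition = "autocompleteEdgeAnalyzer")),
        @Field(name = "textNGram", analyzer = @Analyzer(definition = "autocompleteNGramAnalyzer")),
        @Field(name = "textPhonetic", analyzer = @Analyzer(definition = "autocompletePhoneticAnalyzer")),
        @Field(name = "textExact", analyzer = @Analyzer(definition = "exactAnalyzer"))
    })
    private String text;
}

The @Factory method itself is picked up by pointing the hibernate.search.model_mapping configuration property at the class that declares it.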
/**
 * Creates a new analyzer that delegates to the given synonym and word-delimiter filter factories.
 */
public IAViewTextGenAnalyser(SynonymFilterFactory synonymFilterFactory,
        WordDelimiterFilterFactory wordDelimiterFilterFactory, AnalyzerType analyzerType) {
    this.synonymFilterFactory = synonymFilterFactory;
    this.wordDelimiterFilterFactory = wordDelimiterFilterFactory;
    this.analyzerType = analyzerType;
}

/**
 * Creates a new analyzer that delegates to the given synonym and word-delimiter filter factories.
 */
public IAViewTextCasNoPuncAnalyser(SynonymFilterFactory synonymFilterFactory,
        WordDelimiterFilterFactory wordDelimiterFilterFactory, AnalyzerType analyzerType) {
    this.synonymFilterFactory = synonymFilterFactory;
    this.wordDelimiterFilterFactory = wordDelimiterFilterFactory;
    this.analyzerType = analyzerType;
}

/**
 * Creates a new analyzer that delegates to the given synonym and word-delimiter filter factories.
 */
public IAViewTextNoCasNoPuncAnalyser(SynonymFilterFactory synonymFilterFactory,
        WordDelimiterFilterFactory wordDelimiterFilterFactory, AnalyzerType analyzerType) {
    this.synonymFilterFactory = synonymFilterFactory;
    this.wordDelimiterFilterFactory = wordDelimiterFilterFactory;
    this.analyzerType = analyzerType;
}
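All three analyzers take the same collaborators, so construction differs only in the class name. A hypothetical wiring, assuming the Lucene factory constructors that take a parameter map (the synonym file name, the parameter values, and the AnalyzerType constant are all assumptions):

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;

static IAViewTextGenAnalyser buildGeneralAnalyser() throws IOException {
    Map<String, String> synonymArgs = new HashMap<>();
    synonymArgs.put("synonyms", "synonyms.txt"); // assumed classpath resource
    synonymArgs.put("ignoreCase", "true");
    SynonymFilterFactory synonyms = new SynonymFilterFactory(synonymArgs);
    // the factory needs a ResourceLoader to actually read the synonyms file
    synonyms.inform(new ClasspathResourceLoader(IAViewTextGenAnalyser.class));

    Map<String, String> wdfArgs = new HashMap<>();
    wdfArgs.put("generateWordParts", "1");
    wdfArgs.put("preserveOriginal", "1");
    WordDelimiterFilterFactory wordDelimiter = new WordDelimiterFilterFactory(wdfArgs);

    // AnalyzerType.GENERAL is a placeholder for whatever constants the enum defines.
    return new IAViewTextGenAnalyser(synonyms, wordDelimiter, AnalyzerType.GENERAL);
}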
protected void registerWithPrefix(String prefix, LuceneAnalyzerDefinitionRegistryBuilder builder) {
    // verbatim keyword analyzer, plus a case- and accent-insensitive variant
    builder.analyzer(prefix + HibernateSearchAnalyzer.KEYWORD)
            .tokenizer(KeywordTokenizerFactory.class);
    builder.analyzer(prefix + HibernateSearchAnalyzer.KEYWORD_CLEAN)
            .tokenizer(KeywordTokenizerFactory.class)
            .tokenFilter(ASCIIFoldingFilterFactory.class)
            .tokenFilter(LowerCaseFilterFactory.class);
    // general text analyzer: whitespace tokens, folded and lower-cased,
    // split on delimiters while preserving the original token
    builder.analyzer(prefix + HibernateSearchAnalyzer.TEXT)
            .tokenizer(WhitespaceTokenizerFactory.class)
            .tokenFilter(ASCIIFoldingFilterFactory.class)
            .tokenFilter(WordDelimiterFilterFactory.class)
                    .param("generateWordParts", "1")
                    .param("generateNumberParts", "1")
                    .param("catenateWords", "0")
                    .param("catenateNumbers", "0")
                    .param("catenateAll", "0")
                    .param("splitOnCaseChange", "0")
                    .param("splitOnNumerics", "0")
                    .param("preserveOriginal", "1")
            .tokenFilter(LowerCaseFilterFactory.class);
    // same pipeline with French minimal stemming appended
    builder.analyzer(prefix + HibernateSearchAnalyzer.TEXT_STEMMING)
            .tokenizer(WhitespaceTokenizerFactory.class)
            .tokenFilter(ASCIIFoldingFilterFactory.class)
            .tokenFilter(WordDelimiterFilterFactory.class)
                    .param("generateWordParts", "1")
                    .param("generateNumberParts", "1")
                    .param("catenateWords", "0")
                    .param("catenateNumbers", "0")
                    .param("catenateAll", "0")
                    .param("splitOnCaseChange", "0")
                    .param("splitOnNumerics", "0")
                    .param("preserveOriginal", "1")
            .tokenFilter(LowerCaseFilterFactory.class)
            .tokenFilter(CoreFrenchMinimalStemFilterFactory.class);
    // sort analyzer: a single normalized token with punctuation stripped out
    builder.analyzer(prefix + HibernateSearchAnalyzer.TEXT_SORT)
            .tokenizer(KeywordTokenizerFactory.class)
            .tokenFilter(ASCIIFoldingFilterFactory.class)
            .tokenFilter(LowerCaseFilterFactory.class)
            .tokenFilter(PatternReplaceFilterFactory.class)
                    .param("pattern", "('-&\\.,\\(\\))")
                    .param("replacement", " ")
                    .param("replace", "all")
            .tokenFilter(PatternReplaceFilterFactory.class)
                    .param("pattern", "([^0-9\\p{L} ])")
                    .param("replacement", "")
                    .param("replace", "all")
            .tokenFilter(TrimFilterFactory.class);
}
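Since registerWithPrefix is protected, it is meant to be invoked from a concrete definition provider, typically once per prefix. A minimal sketch of such a subclass (the class name, the abstract parent, and the register hook are assumptions; only the builder type and registerWithPrefix come from the code above):

public class MyAnalyzersDefinitionProvider extends AbstractAnalyzersDefinitionProvider {

    @Override
    public void register(LuceneAnalyzerDefinitionRegistryBuilder builder) {
        // one unprefixed set for the default analyzer names...
        registerWithPrefix("", builder);
        // ...and a prefixed copy, producing names like "fr_" + HibernateSearchAnalyzer.TEXT
        registerWithPrefix("fr_", builder);
    }
}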
@Test
public void testCustomTypes() throws Exception {
    String testText = "I borrowed $5,400.00 at 25% interest-rate";
    ResourceLoader loader = new SolrResourceLoader("solr/collection1");
    Map<String, String> args = new HashMap<>();
    args.put("luceneMatchVersion", TEST_VERSION_CURRENT.toString());
    args.put("generateWordParts", "1");
    args.put("generateNumberParts", "1");
    args.put("catenateWords", "1");
    args.put("catenateNumbers", "1");
    args.put("catenateAll", "0");
    args.put("splitOnCaseChange", "1");

    /* default behavior */
    WordDelimiterFilterFactory factoryDefault = new WordDelimiterFilterFactory(args);
    factoryDefault.inform(loader);

    TokenStream ts = factoryDefault.create(
            new MockTokenizer(new StringReader(testText), MockTokenizer.WHITESPACE, false));
    BaseTokenStreamTestCase.assertTokenStreamContents(ts,
            new String[] { "I", "borrowed", "5", "540000", "400", "00",
                    "at", "25", "interest", "interestrate", "rate" });

    ts = factoryDefault.create(
            new MockTokenizer(new StringReader("foo\u200Dbar"), MockTokenizer.WHITESPACE, false));
    BaseTokenStreamTestCase.assertTokenStreamContents(ts,
            new String[] { "foo", "foobar", "bar" });

    /* custom behavior */
    args = new HashMap<>();
    // use a custom type mapping
    args.put("luceneMatchVersion", TEST_VERSION_CURRENT.toString());
    args.put("generateWordParts", "1");
    args.put("generateNumberParts", "1");
    args.put("catenateWords", "1");
    args.put("catenateNumbers", "1");
    args.put("catenateAll", "0");
    args.put("splitOnCaseChange", "1");
    args.put("types", "wdftypes.txt");
    WordDelimiterFilterFactory factoryCustom = new WordDelimiterFilterFactory(args);
    factoryCustom.inform(loader);

    ts = factoryCustom.create(
            new MockTokenizer(new StringReader(testText), MockTokenizer.WHITESPACE, false));
    BaseTokenStreamTestCase.assertTokenStreamContents(ts,
            new String[] { "I", "borrowed", "$5,400.00", "at", "25%",
                    "interest", "interestrate", "rate" });

    /* test custom behavior with a char > 0x7F, because we had to make a larger byte[] */
    ts = factoryCustom.create(
            new MockTokenizer(new StringReader("foo\u200Dbar"), MockTokenizer.WHITESPACE, false));
    BaseTokenStreamTestCase.assertTokenStreamContents(ts, new String[] { "foo\u200Dbar" });
}
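The assertions above pin down what the custom mapping in wdftypes.txt has to contain: '$', '%', '.' and ',' must be remapped to DIGIT so that "$5,400.00" and "25%" come through as single tokens, and U+200D (the zero-width joiner) must become ALPHANUM so that "foo\u200Dbar" is not split. A mapping file consistent with those assertions (a sketch, not necessarily the verbatim test resource):

# custom character-type mappings for WordDelimiterFilterFactory
$ => DIGIT
% => DIGIT
. => DIGIT
\u002C => DIGIT
# zero-width joiner; also exercises the larger byte[] code path
\u200D => ALPHANUM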
@Test
public void testCustomTypes() throws Exception {
    String testText = "I borrowed $5,400.00 at 25% interest-rate";
    WordDelimiterFilterFactory factoryDefault = new WordDelimiterFilterFactory();
    ResourceLoader loader = new SolrResourceLoader("solr/collection1");
    Map<String, String> args = new HashMap<String, String>();
    args.put("generateWordParts", "1");
    args.put("generateNumberParts", "1");
    args.put("catenateWords", "1");
    args.put("catenateNumbers", "1");
    args.put("catenateAll", "0");
    args.put("splitOnCaseChange", "1");

    /* default behavior */
    factoryDefault.init(args);
    factoryDefault.inform(loader);

    TokenStream ts = factoryDefault.create(
            new MockTokenizer(new StringReader(testText), MockTokenizer.WHITESPACE, false));
    BaseTokenStreamTestCase.assertTokenStreamContents(ts,
            new String[] { "I", "borrowed", "5", "400", "00", "540000",
                    "at", "25", "interest", "rate", "interestrate" });

    ts = factoryDefault.create(
            new MockTokenizer(new StringReader("foo\u200Dbar"), MockTokenizer.WHITESPACE, false));
    BaseTokenStreamTestCase.assertTokenStreamContents(ts,
            new String[] { "foo", "bar", "foobar" });

    /* custom behavior */
    WordDelimiterFilterFactory factoryCustom = new WordDelimiterFilterFactory();
    // use a custom type mapping
    args.put("types", "wdftypes.txt");
    factoryCustom.init(args);
    factoryCustom.inform(loader);

    ts = factoryCustom.create(
            new MockTokenizer(new StringReader(testText), MockTokenizer.WHITESPACE, false));
    BaseTokenStreamTestCase.assertTokenStreamContents(ts,
            new String[] { "I", "borrowed", "$5,400.00", "at", "25%",
                    "interest", "rate", "interestrate" });

    /* test custom behavior with a char > 0x7F, because we had to make a larger byte[] */
    ts = factoryCustom.create(
            new MockTokenizer(new StringReader("foo\u200Dbar"), MockTokenizer.WHITESPACE, false));
    BaseTokenStreamTestCase.assertTokenStreamContents(ts, new String[] { "foo\u200Dbar" });
}