@Test
public void testDefaultCopiedToMulti() {
    // Verify that the default analyzer settings for "content_ws" were copied into
    // the multi-term analyzer: keyword tokenizer, only folding/lowercase filters,
    // and no char filters.
    SchemaField contentField = h.getCore().getLatestSchema().getField("content_ws");
    Analyzer multiTermAnalyzer = ((TextField) contentField.getType()).getMultiTermAnalyzer();
    assertTrue(multiTermAnalyzer instanceof TokenizerChain);
    TokenizerChain chain = (TokenizerChain) multiTermAnalyzer;
    assertTrue(chain.getTokenizerFactory() instanceof KeywordTokenizerFactory);
    // Every token filter must be one of the two multi-term-safe factories.
    for (TokenFilterFactory filterFactory : chain.getTokenFilterFactories()) {
        boolean isExpectedFilter =
                filterFactory instanceof ASCIIFoldingFilterFactory
                        || filterFactory instanceof LowerCaseFilterFactory;
        assertTrue(isExpectedFilter);
    }
    // No char filters should have been carried over.
    assertTrue(chain.getCharFilterFactories() == null);
}
/**
 * Ensure the ASCIIFoldingFilterFactory works: non-ASCII characters with
 * diacritics are folded to their closest ASCII equivalents.
 */
public void testASCIIFolding() throws Exception {
    // Tokenize an input containing accented characters.
    Tokenizer source =
            new MockTokenizer(new StringReader("Česká"), MockTokenizer.WHITESPACE, false);
    ASCIIFoldingFilterFactory foldingFactory = new ASCIIFoldingFilterFactory();
    foldingFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
    foldingFactory.init(Collections.<String, String>emptyMap());
    TokenStream folded = foldingFactory.create(source);
    // "Česká" folds to "Ceska" once diacritics are stripped.
    assertTokenStreamContents(folded, new String[] { "Ceska" });
}
@Test
public void testDefaultCopiedToMulti() {
    // The multi-term analyzer derived for "content_ws" should consist of a
    // keyword tokenizer plus only ASCII-folding / lowercase filters, and no
    // char filters.
    SchemaField contentWs = h.getCore().getSchema().getField("content_ws");
    TextField fieldType = (TextField) contentWs.getType();
    Analyzer analyzer = fieldType.getMultiTermAnalyzer();
    assertTrue(analyzer instanceof TokenizerChain);
    TokenizerChain chain = (TokenizerChain) analyzer;
    assertTrue(chain.getTokenizerFactory() instanceof KeywordTokenizerFactory);
    // Only the two multi-term-safe filter factories are permitted in the chain.
    for (TokenFilterFactory candidate : chain.getTokenFilterFactories()) {
        assertTrue(candidate instanceof ASCIIFoldingFilterFactory
                || candidate instanceof LowerCaseFilterFactory);
    }
    // Char filters must not have been copied across.
    assertTrue(chain.getCharFilterFactories() == null);
}
/**
 * Registers the standard set of Hibernate Search analyzer definitions under the
 * given name prefix on the supplied registry builder.
 *
 * Definitions registered:
 * - KEYWORD:       whole input as a single token, unchanged.
 * - KEYWORD_CLEAN: single token, ASCII-folded and lowercased.
 * - TEXT:          whitespace-tokenized, folded, word-delimited, lowercased.
 * - TEXT_STEMMING: as TEXT, plus French minimal stemming.
 * - TEXT_SORT:     single normalized token suitable for sorting (folded,
 *                  lowercased, punctuation stripped, trimmed).
 *
 * @param prefix  name prefix prepended to every analyzer definition name
 * @param builder registry builder the definitions are added to
 */
protected void registerWithPrefix(String prefix, LuceneAnalyzerDefinitionRegistryBuilder builder) {
    // Pass-through keyword analyzer: no filtering at all.
    builder.analyzer(prefix + HibernateSearchAnalyzer.KEYWORD).tokenizer(KeywordTokenizerFactory.class);
    // Keyword analyzer normalized for case/diacritic-insensitive exact matching.
    builder.analyzer(prefix + HibernateSearchAnalyzer.KEYWORD_CLEAN).tokenizer(KeywordTokenizerFactory.class)
        .tokenFilter(ASCIIFoldingFilterFactory.class)
        .tokenFilter(LowerCaseFilterFactory.class);
    // Full-text analyzer: split on whitespace, fold diacritics, then split
    // compound tokens while keeping the original (preserveOriginal=1) so both
    // forms are indexed; catenation and case/numeric splitting are disabled.
    builder.analyzer(prefix + HibernateSearchAnalyzer.TEXT).tokenizer(WhitespaceTokenizerFactory.class)
        .tokenFilter(ASCIIFoldingFilterFactory.class)
        .tokenFilter(WordDelimiterFilterFactory.class)
            .param("generateWordParts", "1")
            .param("generateNumberParts", "1")
            .param("catenateWords", "0")
            .param("catenateNumbers", "0")
            .param("catenateAll", "0")
            .param("splitOnCaseChange", "0")
            .param("splitOnNumerics", "0")
            .param("preserveOriginal", "1")
        .tokenFilter(LowerCaseFilterFactory.class);
    // Same pipeline as TEXT, with French minimal stemming appended.
    builder.analyzer(prefix + HibernateSearchAnalyzer.TEXT_STEMMING).tokenizer(WhitespaceTokenizerFactory.class)
        .tokenFilter(ASCIIFoldingFilterFactory.class)
        .tokenFilter(WordDelimiterFilterFactory.class)
            .param("generateWordParts", "1")
            .param("generateNumberParts", "1")
            .param("catenateWords", "0")
            .param("catenateNumbers", "0")
            .param("catenateAll", "0")
            .param("splitOnCaseChange", "0")
            .param("splitOnNumerics", "0")
            .param("preserveOriginal", "1")
        .tokenFilter(LowerCaseFilterFactory.class)
        .tokenFilter(CoreFrenchMinimalStemFilterFactory.class);
    // Sort analyzer: one token, folded + lowercased, then punctuation replaced
    // by spaces, remaining non-alphanumerics removed, and the result trimmed.
    // NOTE(review): "('-&\.,\(\))" is a regex *group*, not a character class —
    // as written it matches the literal sequence "'-&.,()". If the intent was
    // "any one of these characters", the pattern should be "['-&.,()]"; confirm
    // against existing indexed data before changing, since reindexing would be
    // required.
    builder.analyzer(prefix + HibernateSearchAnalyzer.TEXT_SORT).tokenizer(KeywordTokenizerFactory.class)
        .tokenFilter(ASCIIFoldingFilterFactory.class)
        .tokenFilter(LowerCaseFilterFactory.class)
        .tokenFilter(PatternReplaceFilterFactory.class)
            .param("pattern", "('-&\\.,\\(\\))")
            .param("replacement", " ")
            .param("replace", "all")
        .tokenFilter(PatternReplaceFilterFactory.class)
            .param("pattern", "([^0-9\\p{L} ])")
            .param("replacement", "")
            .param("replace", "all")
        .tokenFilter(TrimFilterFactory.class);
}