/**
 * Builds the test fixture: stubs the searcher's schema to expose a single
 * field type under "test", and assembles the index/query analyzer chains
 * (whitespace tokenizer + the respective mock token filter) plus the reloader.
 */
@Before
public void setUp() {
    fieldType = new TextField();

    // Schema lookups performed by the code under test resolve to our field type.
    Map<String, FieldType> typesByName = Maps.newHashMap();
    typesByName.put("test", fieldType);
    when(searcher.getSchema()).thenReturn(schema);
    when(schema.getFieldTypes()).thenReturn(typesByName);

    // Each chain gets its own (empty) args map — factories may consume the map.
    indexAnalyzer = new TokenizerChain(
            new WhitespaceTokenizerFactory(Maps.<String, String>newHashMap()),
            new TokenFilterFactory[] { indexTokenFilterFactory });
    queryAnalyzer = new TokenizerChain(
            new WhitespaceTokenizerFactory(Maps.<String, String>newHashMap()),
            new TokenFilterFactory[] { queryTokenFilterFactory });

    reloader = new SearcherAwareReloader(null);
}
/**
 * Programmatic Hibernate Search analyzer definitions for autocomplete and
 * exact-match fields.
 *
 * NOTE: in this fluent API each {@code .param(...)} applies to the
 * immediately preceding {@code .filter(...)}/tokenizer, so call order is
 * significant — do not reorder.
 *
 * @return the mapping holding all analyzer definitions below
 */
@Factory
public SearchMapping getSearchMapping() {
    SearchMapping mapping = new SearchMapping();
    mapping
        // Edge n-grams (3..50) over the whole input — prefix-style autocomplete.
        .analyzerDef("autocompleteEdgeAnalyzer", PatternTokenizerFactory.class)
            .tokenizerParam("pattern", "(.*)")
            .tokenizerParam("group", "1")
            .filter(LowerCaseFilterFactory.class)
            .filter(StopFilterFactory.class)
            .filter(EdgeNGramFilterFactory.class)
                .param("minGramSize", "3")
                .param("maxGramSize", "50")
        // Sounds-like matching via DoubleMetaphone plus English stemming.
        .analyzerDef("autocompletePhoneticAnalyzer", StandardTokenizerFactory.class)
            .filter(StandardFilterFactory.class)
            .filter(StopFilterFactory.class)
            .filter(PhoneticFilterFactory.class)
                .param("encoder", "DoubleMetaphone")
            .filter(SnowballPorterFilterFactory.class)
                .param("language", "English")
        // Interior n-grams (3..20) — substring-style autocomplete.
        .analyzerDef("autocompleteNGramAnalyzer", StandardTokenizerFactory.class)
            .filter(WordDelimiterFilterFactory.class)
            .filter(LowerCaseFilterFactory.class)
            .filter(NGramFilterFactory.class)
                .param("minGramSize", "3")
                .param("maxGramSize", "20")
        // Plain lowercased tokens for ordinary full-text search.
        .analyzerDef("standardAnalyzer", StandardTokenizerFactory.class)
            .filter(LowerCaseFilterFactory.class)
        // Tokenized but otherwise untouched (case-sensitive exact matching).
        .analyzerDef("exactAnalyzer", StandardTokenizerFactory.class)
        // Whitespace-separated parent PID lists.
        .analyzerDef("conceptParentPidsAnalyzer", WhitespaceTokenizerFactory.class);
    return mapping;
}
/**
 * WhitespaceTokenizerFactory must split on whitespace only, leaving
 * punctuation attached to the surrounding token.
 */
public void testWhitespaceTokenizer() throws Exception {
    WhitespaceTokenizerFactory tokenizerFactory = new WhitespaceTokenizerFactory();
    tokenizerFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
    tokenizerFactory.init(Collections.<String, String>emptyMap());

    Tokenizer tokenStream = tokenizerFactory.create(new StringReader("What's this thing do?"));
    assertTokenStreamContents(tokenStream, new String[] { "What's", "this", "thing", "do?" });
}
/** * Separates tokens from query. Treats each quote as a separate token, since that makes it easier to examine the query. * * @param queryString . * @param tokens . * @return number of quotes in the query */ public static int tokenizeQueryString(String queryString, List<String> tokens) { int countOfQuotes = 0; try { // first tokenize words and treat each quote as a separate token Map<String,String> args = new HashMap<String, String>(); args.put(WhitespaceTokenizerFactory.LUCENE_MATCH_VERSION_PARAM, Version.LUCENE_6_3_0.toString()); WhitespaceTokenizerFactory f = new WhitespaceTokenizerFactory(args); WhitespaceTokenizer s = (WhitespaceTokenizer)f.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); s.setReader(new StringReader(queryString)); s.reset(); while (true) { CharTermAttribute t = s.getAttribute(CharTermAttribute.class); if (t == null) { break; } String tokentText = new String(t.toString()); if (tokentText.equals("\"")) { tokens.add("\""); countOfQuotes++; } else if (tokentText.startsWith("\"")) { tokens.add("\""); countOfQuotes++; if (tokentText.endsWith("\"")) { tokens.add(tokentText.substring(1, tokentText.length() - 1)); tokens.add("\""); countOfQuotes++; } else { tokens.add(tokentText.substring(1)); } } else if (tokentText.endsWith("\"")) { tokens.add(tokentText.substring(0, tokentText.length() - 1)); tokens.add("\""); countOfQuotes++; } else if (!tokentText.trim().equals("")) { // take into account only if different than empty string tokens.add(tokentText); } if (!s.incrementToken()) { break; } } s.end(); s.close(); } catch (IOException e) { throw new RuntimeException(e); } return countOfQuotes; }
/**
 * Registers the standard set of Hibernate Search analyzers under the given
 * name prefix: raw keyword, cleaned keyword, word-split text, stemmed text
 * (French), and a sort-friendly normalized form.
 *
 * NOTE: in this builder each {@code .param(...)} applies to the immediately
 * preceding {@code .tokenFilter(...)} — call order is significant.
 *
 * @param prefix prepended to every analyzer name
 * @param builder registry builder the definitions are added to
 */
protected void registerWithPrefix(String prefix, LuceneAnalyzerDefinitionRegistryBuilder builder) {
    // Whole input as a single token, untouched.
    builder.analyzer(prefix + HibernateSearchAnalyzer.KEYWORD)
            .tokenizer(KeywordTokenizerFactory.class);

    // Whole input as a single token, accent-stripped and lowercased.
    builder.analyzer(prefix + HibernateSearchAnalyzer.KEYWORD_CLEAN)
            .tokenizer(KeywordTokenizerFactory.class)
            .tokenFilter(ASCIIFoldingFilterFactory.class)
            .tokenFilter(LowerCaseFilterFactory.class);

    // General text: whitespace split, accents folded, sub-words generated
    // (originals preserved), lowercased.
    builder.analyzer(prefix + HibernateSearchAnalyzer.TEXT)
            .tokenizer(WhitespaceTokenizerFactory.class)
            .tokenFilter(ASCIIFoldingFilterFactory.class)
            .tokenFilter(WordDelimiterFilterFactory.class)
                .param("generateWordParts", "1")
                .param("generateNumberParts", "1")
                .param("catenateWords", "0")
                .param("catenateNumbers", "0")
                .param("catenateAll", "0")
                .param("splitOnCaseChange", "0")
                .param("splitOnNumerics", "0")
                .param("preserveOriginal", "1")
            .tokenFilter(LowerCaseFilterFactory.class);

    // Same as TEXT, plus French minimal stemming.
    builder.analyzer(prefix + HibernateSearchAnalyzer.TEXT_STEMMING)
            .tokenizer(WhitespaceTokenizerFactory.class)
            .tokenFilter(ASCIIFoldingFilterFactory.class)
            .tokenFilter(WordDelimiterFilterFactory.class)
                .param("generateWordParts", "1")
                .param("generateNumberParts", "1")
                .param("catenateWords", "0")
                .param("catenateNumbers", "0")
                .param("catenateAll", "0")
                .param("splitOnCaseChange", "0")
                .param("splitOnNumerics", "0")
                .param("preserveOriginal", "1")
            .tokenFilter(LowerCaseFilterFactory.class)
            .tokenFilter(CoreFrenchMinimalStemFilterFactory.class);

    // Sort key: single token, folded/lowercased, punctuation replaced by
    // spaces, everything non-alphanumeric removed, then trimmed.
    builder.analyzer(prefix + HibernateSearchAnalyzer.TEXT_SORT)
            .tokenizer(KeywordTokenizerFactory.class)
            .tokenFilter(ASCIIFoldingFilterFactory.class)
            .tokenFilter(LowerCaseFilterFactory.class)
            .tokenFilter(PatternReplaceFilterFactory.class)
                .param("pattern", "('-&\\.,\\(\\))")
                .param("replacement", " ")
                .param("replace", "all")
            .tokenFilter(PatternReplaceFilterFactory.class)
                .param("pattern", "([^0-9\\p{L} ])")
                .param("replacement", "")
                .param("replace", "all")
            .tokenFilter(TrimFilterFactory.class);
}
/**
 * Builds the analyzer used here: whitespace tokenization followed by
 * lower-casing.
 *
 * @return the assembled custom analyzer
 * @throws IOException if a factory fails to initialize
 */
private Analyzer makeAnalyzer() throws IOException {
    CustomAnalyzer.Builder analyzerBuilder = CustomAnalyzer.builder();
    analyzerBuilder.withTokenizer(WhitespaceTokenizerFactory.class);
    analyzerBuilder.addTokenFilter(LowerCaseFilterFactory.class);
    return analyzerBuilder.build();
}
/** forName(...) lookup is case-insensitive and yields WhitespaceTokenizerFactory. */
public void testLookupTokenizer() {
    for (String spelling : new String[] { "Whitespace", "WHITESPACE", "whitespace" }) {
        assertSame(WhitespaceTokenizerFactory.class,
                TokenizerFactory.forName(spelling, versionArgOnly()).getClass());
    }
}
/** lookupClass(...) is case-insensitive and resolves to WhitespaceTokenizerFactory. */
public void testLookupTokenizerClass() {
    for (String spelling : new String[] { "Whitespace", "WHITESPACE", "whitespace" }) {
        assertSame(WhitespaceTokenizerFactory.class, TokenizerFactory.lookupClass(spelling));
    }
}
/** forName(...) lookup is case-insensitive and yields WhitespaceTokenizerFactory. */
public void testLookupTokenizer() {
    for (String spelling : new String[] { "Whitespace", "WHITESPACE", "whitespace" }) {
        assertSame(WhitespaceTokenizerFactory.class,
                TokenizerFactory.forName(spelling).getClass());
    }
}