public void testIgnoreWhitespace() throws Exception { String withSpace = "foo bar"; String withoutSpace = "foobar"; String withPunctuation = "foo-bar"; TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey", "locale", "en", "strength", "primary", "alternate", "shifted", "variableTop", " "); TokenStream tsWithSpace = factory.create( new KeywordTokenizer(new StringReader(withSpace))); TokenStream tsWithoutSpace = factory.create( new KeywordTokenizer(new StringReader(withoutSpace))); assertCollatesToSame(tsWithSpace, tsWithoutSpace); // now assert that punctuation still matters: foo-bar < foo bar tsWithSpace = factory.create( new KeywordTokenizer(new StringReader(withSpace))); TokenStream tsWithPunctuation = factory.create( new KeywordTokenizer(new StringReader(withPunctuation))); assertCollation(tsWithPunctuation, tsWithSpace, -1); }
private void doTestTokenFilter(String tokenfilter) throws IOException { Class<? extends TokenFilterFactory> factoryClazz = TokenFilterFactory.lookupClass(tokenfilter); TokenFilterFactory factory = (TokenFilterFactory) initialize(factoryClazz); if (factory != null) { // we managed to fully create an instance. check a few more things: // if it implements MultiTermAware, sanity check its impl if (factory instanceof MultiTermAwareComponent) { AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent(); assertNotNull(mtc); // its not ok to return a charfilter or tokenizer here, this makes no sense assertTrue(mtc instanceof TokenFilterFactory); } // beast it just a little, it shouldnt throw exceptions: // (it should have thrown them in initialize) checkRandomData(random(), new FactoryAnalyzer(assertingTokenizer, factory, null), 100, 20, false, false); } }
/** Wires up mock schema/searcher and builds the index and query analyzer chains for each test. */
@Before
public void setUp() {
  fieldType = new TextField();
  Map<String, FieldType> typesByName = Maps.newHashMap();
  typesByName.put("test", fieldType);
  // stub the searcher to hand back our mock schema with the single "test" type
  when(searcher.getSchema()).thenReturn(schema);
  when(schema.getFieldTypes()).thenReturn(typesByName);
  indexAnalyzer = new TokenizerChain(
      new WhitespaceTokenizerFactory(Maps.<String, String> newHashMap()),
      new TokenFilterFactory[] { indexTokenFilterFactory });
  queryAnalyzer = new TokenizerChain(
      new WhitespaceTokenizerFactory(Maps.<String, String> newHashMap()),
      new TokenFilterFactory[] { queryTokenFilterFactory });
  reloader = new SearcherAwareReloader(null);
}
private void doTestTokenFilter(String tokenfilter) throws IOException { TokenFilterFactory factory = TokenFilterFactory.forName(tokenfilter); if (initialize(factory)) { // we managed to fully create an instance. check a few more things: // if it implements MultiTermAware, sanity check its impl if (factory instanceof MultiTermAwareComponent) { AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent(); assertNotNull(mtc); // its not ok to return a charfilter or tokenizer here, this makes no sense assertTrue(mtc instanceof TokenFilterFactory); } // beast it just a little, it shouldnt throw exceptions: // (it should have thrown them in initialize) checkRandomData(random(), new FactoryAnalyzer(assertingTokenizer, factory, null), 100, 20, false, false); } }
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after the new classloader has been created to
 * register the services for use.
 *
 * @param loader the classloader whose SPI service entries should be (re)scanned
 */
static void reloadLuceneSPI(ClassLoader loader) {
  // do NOT change the order of these method calls!
  // Codecs:
  PostingsFormat.reloadPostingsFormats(loader);
  DocValuesFormat.reloadDocValuesFormats(loader);
  Codec.reloadCodecs(loader);
  // Analysis:
  CharFilterFactory.reloadCharFilters(loader);
  TokenFilterFactory.reloadTokenFilters(loader);
  TokenizerFactory.reloadTokenizers(loader);
}
/** * This method looks up a class with its fully qualified name (FQN), or a short-name * class-simplename, or with a package suffix, assuming "org.apache.lucene.analysis." * as the package prefix (e.g. "standard.ClassicTokenizerFactory" -> * "org.apache.lucene.analysis.standard.ClassicTokenizerFactory"). * * If className contains a period, the class is first looked up as-is, assuming that it * is an FQN. If this fails, lookup is retried after prepending the Lucene analysis * package prefix to the class name. * * If className does not contain a period, the analysis SPI *Factory.lookupClass() * methods are used to find the class. * * @param className The name or the short name of the class. * @param expectedType The superclass className is expected to extend * @return the loaded class. * @throws ClassNotFoundException if lookup fails */ public <T> Class<? extends T> lookupAnalysisClass(String className, Class<T> expectedType) throws ClassNotFoundException { if (className.contains(".")) { try { // First, try className == FQN return Class.forName(className).asSubclass(expectedType); } catch (ClassNotFoundException e) { try { // Second, retry lookup after prepending the Lucene analysis package prefix return Class.forName(LUCENE_ANALYSIS_PACKAGE_PREFIX + className).asSubclass(expectedType); } catch (ClassNotFoundException e1) { throw new ClassNotFoundException("Can't find class '" + className + "' or '" + LUCENE_ANALYSIS_PACKAGE_PREFIX + className + "'"); } } } // No dot - use analysis SPI lookup final String analysisComponentName = ANALYSIS_COMPONENT_SUFFIX_PATTERN.matcher(className).replaceFirst(""); if (CharFilterFactory.class.isAssignableFrom(expectedType)) { return CharFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType); } else if (TokenizerFactory.class.isAssignableFrom(expectedType)) { return TokenizerFactory.lookupClass(analysisComponentName).asSubclass(expectedType); } else if (TokenFilterFactory.class.isAssignableFrom(expectedType)) { 
return TokenFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType); } throw new ClassNotFoundException("Can't find class '" + className + "'"); }
/**
 * Builds an analyzer factory from its component factories.
 *
 * @param charFilterFactories char filters applied before tokenization (may be empty)
 * @param tokenizerFactory the tokenizer; must not be null
 * @param tokenFilterFactories token filters applied after tokenization (may be empty)
 */
public AnalyzerFactory(List<CharFilterFactory> charFilterFactories,
                       TokenizerFactory tokenizerFactory,
                       List<TokenFilterFactory> tokenFilterFactories) {
  assert null != tokenizerFactory;
  this.tokenizerFactory = tokenizerFactory;
  this.charFilterFactories = charFilterFactories;
  this.tokenFilterFactories = tokenFilterFactories;
}
/**
 * Renders this factory as {@code AnalyzerFactory(...)}, listing the optional
 * name/gap settings, then the char filters, tokenizer, and token filters in
 * analysis order.
 */
@Override
public String toString() {
  StringBuilder builder = new StringBuilder("AnalyzerFactory(");
  if (null != name) {
    builder.append("name:").append(name).append(", ");
  }
  if (null != positionIncrementGap) {
    builder.append("positionIncrementGap:").append(positionIncrementGap).append(", ");
  }
  if (null != offsetGap) {
    builder.append("offsetGap:").append(offsetGap).append(", ");
  }
  for (CharFilterFactory charFilter : charFilterFactories) {
    builder.append(charFilter).append(", ");
  }
  builder.append(tokenizerFactory);
  for (TokenFilterFactory tokenFilter : tokenFilterFactories) {
    builder.append(", ").append(tokenFilter);
  }
  return builder.append(')').toString();
}
/** Primary strength with Turkish rules: upper- and lower-case text collate identically. */
public void testBasicUsage() throws Exception {
  final String upper = "I WİLL USE TURKİSH CASING";
  final String lower = "ı will use turkish casıng";
  TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
      "locale", "tr",
      "strength", "primary");
  assertCollatesToSame(
      factory.create(new KeywordTokenizer(new StringReader(upper))),
      factory.create(new KeywordTokenizer(new StringReader(lower))));
}
/**
 * Canonical decomposition: I + combining-dot-above (\u0049\u0307) is normalized
 * so it collates the same as the precomposed lower-case Turkish text.
 */
public void testNormalization() throws Exception {
  final String upper = "I W\u0049\u0307LL USE TURKİSH CASING";
  final String lower = "ı will use turkish casıng";
  TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
      "locale", "tr",
      "strength", "primary",
      "decomposition", "canonical");
  assertCollatesToSame(
      factory.create(new KeywordTokenizer(new StringReader(upper))),
      factory.create(new KeywordTokenizer(new StringReader(lower))));
}
/** Secondary strength ignores case: "TESTING" and "testing" collate identically. */
public void testSecondaryStrength() throws Exception {
  final String upper = "TESTING";
  final String lower = "testing";
  TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
      "locale", "en",
      "strength", "secondary",
      "decomposition", "no");
  assertCollatesToSame(
      factory.create(new KeywordTokenizer(new StringReader(upper))),
      factory.create(new KeywordTokenizer(new StringReader(lower))));
}
/** With alternate=shifted, punctuation is ignored: "foo-bar" collates like "foo bar". */
public void testIgnorePunctuation() throws Exception {
  final String hyphenated = "foo-bar";
  final String spaced = "foo bar";
  TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
      "locale", "en",
      "strength", "primary",
      "alternate", "shifted");
  assertCollatesToSame(
      factory.create(new KeywordTokenizer(new StringReader(hyphenated))),
      factory.create(new KeywordTokenizer(new StringReader(spaced))));
}
/** Numeric collation: embedded numbers sort by value, so foobar-9 &lt; foobar-10. */
public void testNumerics() throws Exception {
  final String nine = "foobar-9";
  final String ten = "foobar-10";
  TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
      "locale", "en",
      "numeric", "true");
  assertCollation(
      factory.create(new KeywordTokenizer(new StringReader(nine))),
      factory.create(new KeywordTokenizer(new StringReader(ten))),
      -1);
}
public void testIgnoreAccentsButNotCase() throws Exception { String withAccents = "résumé"; String withoutAccents = "resume"; String withAccentsUpperCase = "Résumé"; String withoutAccentsUpperCase = "Resume"; TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey", "locale", "en", "strength", "primary", "caseLevel", "true"); TokenStream tsWithAccents = factory.create( new KeywordTokenizer(new StringReader(withAccents))); TokenStream tsWithoutAccents = factory.create( new KeywordTokenizer(new StringReader(withoutAccents))); assertCollatesToSame(tsWithAccents, tsWithoutAccents); TokenStream tsWithAccentsUpperCase = factory.create( new KeywordTokenizer(new StringReader(withAccentsUpperCase))); TokenStream tsWithoutAccentsUpperCase = factory.create( new KeywordTokenizer(new StringReader(withoutAccentsUpperCase))); assertCollatesToSame(tsWithAccentsUpperCase, tsWithoutAccentsUpperCase); // now assert that case still matters: resume < Resume TokenStream tsLower = factory.create( new KeywordTokenizer(new StringReader(withoutAccents))); TokenStream tsUpper = factory.create( new KeywordTokenizer(new StringReader(withoutAccentsUpperCase))); assertCollation(tsLower, tsUpper, -1); }
/** caseFirst=upper makes "Resume" sort before "resume" at tertiary strength. */
public void testUpperCaseFirst() throws Exception {
  final String lower = "resume";
  final String upper = "Resume";
  TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
      "locale", "en",
      "strength", "tertiary",
      "caseFirst", "upper");
  assertCollation(
      factory.create(new KeywordTokenizer(new StringReader(upper))),
      factory.create(new KeywordTokenizer(new StringReader(lower))),
      -1);
}
/** checks for synonyms of "GB" in synonyms.txt */
private void checkSolrSynonyms(TokenFilterFactory factory) throws Exception {
  TokenStream stream = factory.create(
      new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
  assertTrue(stream instanceof SynonymFilter);
  // "GB" plus its three synonyms, all stacked at the same position
  assertTokenStreamContents(stream,
      new String[] { "GB", "gib", "gigabyte", "gigabytes" },
      new int[] { 1, 0, 0, 0 });
}
/** checks for synonyms of "second" in synonyms-wordnet.txt */
private void checkWordnetSynonyms(TokenFilterFactory factory) throws Exception {
  TokenStream stream = factory.create(
      new MockTokenizer(new StringReader("second"), MockTokenizer.WHITESPACE, false));
  assertTrue(stream instanceof SynonymFilter);
  // "second" plus its two synonyms, all stacked at the same position
  assertTokenStreamContents(stream,
      new String[] { "second", "2nd", "two" },
      new int[] { 1, 0, 0 });
}
/**
 * Asserts that the factory is a {@link SynonymFilterFactory} whose delegator is an
 * instance of the expected class.
 *
 * @param factory the factory under test; must not be null
 * @param delegatorClass the class the delegator is expected to be an instance of
 */
private static void assertDelegator(final TokenFilterFactory factory,
                                    final Class<?> delegatorClass) {
  // Class<?> instead of the raw Class type; callers passing class literals are unaffected
  assertNotNull(factory);
  assertTrue("factory not expected class: " + factory.getClass(),
      factory instanceof SynonymFilterFactory);
  SynonymFilterFactory synFac = (SynonymFilterFactory) factory;
  Object delegator = synFac.getDelegator();
  assertNotNull(delegator);
  assertTrue("delegator not expected class: " + delegator.getClass(),
      delegatorClass.isInstance(delegator));
}
/** Runs every registered tokenizer, token filter, and char filter through its smoke test. */
public void test() throws IOException {
  for (String name : TokenizerFactory.availableTokenizers()) {
    doTestTokenizer(name);
  }
  for (String name : TokenFilterFactory.availableTokenFilters()) {
    doTestTokenFilter(name);
  }
  for (String name : CharFilterFactory.availableCharFilters()) {
    doTestCharFilter(name);
  }
}
/** Factory creation with multiple type files acting as a blacklist must not throw. */
public void testCreationWithBlackList() throws Exception {
  TokenFilterFactory factory = tokenFilterFactory("Type",
      "types", "stoptypes-1.txt, stoptypes-2.txt",
      "enablePositionIncrements", "true");
  NumericTokenStream stream = new NumericTokenStream();
  stream.setIntValue(123);
  factory.create(stream);
}
/** Factory creation with useWhitelist=true over multiple type files must not throw. */
public void testCreationWithWhiteList() throws Exception {
  TokenFilterFactory factory = tokenFilterFactory("Type",
      "types", "stoptypes-1.txt, stoptypes-2.txt",
      "enablePositionIncrements", "true",
      "useWhitelist", "true");
  NumericTokenStream stream = new NumericTokenStream();
  stream.setIntValue(123);
  factory.create(stream);
}
/** Primary strength with Turkish rules: upper- and lower-case text collate identically. */
public void testBasicUsage() throws Exception {
  final String upper = "I WİLL USE TURKİSH CASING";
  final String lower = "ı will use turkish casıng";
  TokenFilterFactory factory = tokenFilterFactory("CollationKey",
      "language", "tr",
      "strength", "primary");
  assertCollatesToSame(
      factory.create(new MockTokenizer(new StringReader(upper), MockTokenizer.KEYWORD, false)),
      factory.create(new MockTokenizer(new StringReader(lower), MockTokenizer.KEYWORD, false)));
}
/**
 * Canonical decomposition: I + combining-dot-above (\u0049\u0307) is normalized
 * so it collates the same as the precomposed lower-case Turkish text.
 */
public void testNormalization() throws Exception {
  final String upper = "I W\u0049\u0307LL USE TURKİSH CASING";
  final String lower = "ı will use turkish casıng";
  TokenFilterFactory factory = tokenFilterFactory("CollationKey",
      "language", "tr",
      "strength", "primary",
      "decomposition", "canonical");
  assertCollatesToSame(
      factory.create(new MockTokenizer(new StringReader(upper), MockTokenizer.KEYWORD, false)),
      factory.create(new MockTokenizer(new StringReader(lower), MockTokenizer.KEYWORD, false)));
}
/** Full decomposition maps full-width forms onto half-width, so both collate identically. */
public void testFullDecomposition() throws Exception {
  final String fullWidth = "Testing";
  final String halfWidth = "Testing";
  TokenFilterFactory factory = tokenFilterFactory("CollationKey",
      "language", "zh",
      "strength", "identical",
      "decomposition", "full");
  assertCollatesToSame(
      factory.create(new MockTokenizer(new StringReader(fullWidth), MockTokenizer.KEYWORD, false)),
      factory.create(new MockTokenizer(new StringReader(halfWidth), MockTokenizer.KEYWORD, false)));
}
/** Secondary strength ignores case: "TESTING" and "testing" collate identically. */
public void testSecondaryStrength() throws Exception {
  final String upper = "TESTING";
  final String lower = "testing";
  TokenFilterFactory factory = tokenFilterFactory("CollationKey",
      "language", "en",
      "strength", "secondary",
      "decomposition", "no");
  assertCollatesToSame(
      factory.create(new MockTokenizer(new StringReader(upper), MockTokenizer.KEYWORD, false)),
      factory.create(new MockTokenizer(new StringReader(lower), MockTokenizer.KEYWORD, false)));
}
/** * Obtains stop words for a field from the associated * {@link StopFilterFactory}, if any. */ private Collection<CharArraySet> getSolrStopWordsForField(String fieldName) { // No need to synchronize here, Carrot2 ensures that instances // of this class are not used by multiple threads at a time. if (!solrStopWords.containsKey(fieldName)) { final Analyzer fieldAnalyzer = core.getLatestSchema().getFieldType(fieldName) .getIndexAnalyzer(); if (fieldAnalyzer instanceof TokenizerChain) { final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer) .getTokenFilterFactories(); for (TokenFilterFactory factory : filterFactories) { if (factory instanceof StopFilterFactory) { // StopFilterFactory holds the stop words in a CharArraySet solrStopWords.put(fieldName, ((StopFilterFactory) factory).getStopWords()); } if (factory instanceof CommonGramsFilterFactory) { solrStopWords.put(fieldName, ((CommonGramsFilterFactory) factory) .getCommonWords()); } } } } return solrStopWords.get(fieldName); }
/** Builds the analysis chain: tokenizer first, then each token filter in order. */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader aReader) {
  Tokenizer source = tokenizer.create(aReader);
  TokenStream sink = source;
  for (TokenFilterFactory filterFactory : filters) {
    sink = filterFactory.create(sink);
  }
  return new TokenStreamComponents(source, sink);
}
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after all calls to
 * {@link #addToClassLoader(String,FileFilter,boolean)} have augmented the
 * classloader, and before using this ResourceLoader.
 */
void reloadLuceneSPI() {
  // NOTE(review): the order below mirrors reloadLuceneSPI(ClassLoader), which is
  // marked order-sensitive there — do not reorder without confirming.
  // Codecs:
  PostingsFormat.reloadPostingsFormats(this.classLoader);
  DocValuesFormat.reloadDocValuesFormats(this.classLoader);
  Codec.reloadCodecs(this.classLoader);
  // Analysis:
  CharFilterFactory.reloadCharFilters(this.classLoader);
  TokenFilterFactory.reloadTokenFilters(this.classLoader);
  TokenizerFactory.reloadTokenizers(this.classLoader);
}
/**
 * Verifies that the explicit multi-term analyzer for "content_charfilter" is a
 * TokenizerChain of KeywordTokenizer + LowerCaseFilter with exactly one
 * MappingCharFilter.
 */
@Test
public void testQueryCopiedToMulti() {
  SchemaField field = h.getCore().getLatestSchema().getField("content_charfilter");
  Analyzer analyzer = ((TextField) field.getType()).getMultiTermAnalyzer();
  assertTrue(analyzer instanceof TokenizerChain);
  TokenizerChain tc = (TokenizerChain) analyzer;
  assertTrue(tc.getTokenizerFactory() instanceof KeywordTokenizerFactory);
  for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
    assertTrue(factory instanceof LowerCaseFilterFactory);
  }
  // assertEquals reports the actual length on failure, unlike assertTrue(x == 1)
  assertEquals(1, tc.getCharFilterFactories().length);
  assertTrue(tc.getCharFilterFactories()[0] instanceof MappingCharFilterFactory);
}
/**
 * Verifies that the default-derived multi-term analyzer for "content_ws" is a
 * TokenizerChain of KeywordTokenizer + (ASCIIFolding | LowerCase) filters and
 * no char filters.
 */
@Test
public void testDefaultCopiedToMulti() {
  SchemaField field = h.getCore().getLatestSchema().getField("content_ws");
  Analyzer analyzer = ((TextField) field.getType()).getMultiTermAnalyzer();
  assertTrue(analyzer instanceof TokenizerChain);
  TokenizerChain tc = (TokenizerChain) analyzer;
  assertTrue(tc.getTokenizerFactory() instanceof KeywordTokenizerFactory);
  for (TokenFilterFactory factory : tc.getTokenFilterFactories()) {
    assertTrue((factory instanceof ASCIIFoldingFilterFactory)
        || (factory instanceof LowerCaseFilterFactory));
  }
  // assertNull reports the actual value on failure, unlike assertTrue(x == null)
  assertNull(tc.getCharFilterFactories());
}
public void testLoadDeprecatedFactory() throws Exception { SolrResourceLoader loader = new SolrResourceLoader("solr/collection1"); // ensure we get our exception loader.newInstance(DeprecatedTokenFilterFactory.class.getName(), TokenFilterFactory.class, null, new Class[] { Map.class }, new Object[] { new HashMap<String,String>() }); // TODO: How to check that a warning was printed to log file? loader.close(); }
/**
 * Asynchronously builds an in-memory Lucene index over the given songs and
 * publishes it under the given index type.
 *
 * Runs on the executor's background thread; the (possibly partial) index is
 * always published via putIndex, even when analysis or writing fails.
 */
@Override public void index(final IndexType indexType, final Collection<Song> songs) {
  executor.execute(new Runnable() {
    @Override
    public void run() {
      Stopwatch stopwatch = Stopwatch.createStarted();
      Directory directory = new RAMDirectory();
      try {
        LOG.debug("available tokenizers: {}", TokenizerFactory.availableTokenizers());
        LOG.debug("available token filters: {}", TokenFilterFactory.availableTokenFilters());
        // lowercase + 1..25 ngrams — presumably to support substring matching; TODO confirm
        Analyzer analyzer = CustomAnalyzer.builder()
          .withTokenizer("standard")
          .addTokenFilter("lowercase")
          .addTokenFilter("ngram", "minGramSize", "1", "maxGramSize", "25")
          .build();
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        // try-with-resources commits/closes the writer before the index is published
        try (IndexWriter writer = new IndexWriter(directory, config)) {
          for (Song song : songs) {
            Document document = createDocument(song);
            writer.addDocument(document);
            // remember the song so lookups by UUID can resolve search hits
            songByUuid.put(song.getUUID(), song);
          }
        } catch (IOException e) {
          LOG.warn("couldn't index songs", e);
        }
      } catch (IOException e1) {
        LOG.warn("couldn't create analyzer", e1);
      } finally {
        // always publish the directory (even if empty/partial) and log timing
        putIndex(indexType, directory);
        stopwatch.stop();
        LOG.info("indexing songs in background thread took {}", stopwatch.toString());
      }
    }
  });
}
/**
 * Folds the multi-term variant of the given analysis component into this chain.
 * Components that are not MultiTermAwareComponent are silently ignored; a
 * multi-term component of an unrecognized factory type is a server error.
 */
public void add(Object current) {
  if (!(current instanceof MultiTermAwareComponent)) return;
  AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) current).getMultiTermComponent();
  if (mtc instanceof TokenFilterFactory) {
    if (filters == null) {
      filters = new ArrayList<TokenFilterFactory>(2);
    }
    filters.add((TokenFilterFactory) mtc);
  } else if (mtc instanceof TokenizerFactory) {
    tokenizer = (TokenizerFactory) mtc;
  } else if (mtc instanceof CharFilterFactory) {
    if (charFilters == null) {
      charFilters = new ArrayList<CharFilterFactory>(1);
    }
    charFilters.add((CharFilterFactory) mtc);
  } else {
    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
        "Unknown analysis component from MultiTermAwareComponent: " + mtc);
  }
}