public void testBigramTokenizer() throws Exception { SlowSynonymMap synMap; // prepare bi-gram tokenizer factory Map<String, String> args = new HashMap<>(); args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, "4.4"); args.put("minGramSize","2"); args.put("maxGramSize","2"); TokenizerFactory tf = new NGramTokenizerFactory(args); // (ab)->(bc)->(cd)->[ef][fg][gh] List<String> rules = new ArrayList<>(); rules.add( "abcd=>efgh" ); synMap = new SlowSynonymMap( true ); SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf); assertEquals( 1, synMap.submap.size() ); assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() ); assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() ); assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "ef" ); assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "fg" ); assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "gh" ); }
private void doTestTokenizer(String tokenizer) throws IOException { Class<? extends TokenizerFactory> factoryClazz = TokenizerFactory.lookupClass(tokenizer); TokenizerFactory factory = (TokenizerFactory) initialize(factoryClazz); if (factory != null) { // we managed to fully create an instance. check a few more things: // if it implements MultiTermAware, sanity check its impl if (factory instanceof MultiTermAwareComponent) { AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent(); assertNotNull(mtc); // its not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it assertFalse(mtc instanceof CharFilterFactory); } // beast it just a little, it shouldnt throw exceptions: // (it should have thrown them in initialize) checkRandomData(random(), new FactoryAnalyzer(factory, null, null), 100, 20, false, false); } }
private void doTestTokenFilter(String tokenfilter) throws IOException { Class<? extends TokenFilterFactory> factoryClazz = TokenFilterFactory.lookupClass(tokenfilter); TokenFilterFactory factory = (TokenFilterFactory) initialize(factoryClazz); if (factory != null) { // we managed to fully create an instance. check a few more things: // if it implements MultiTermAware, sanity check its impl if (factory instanceof MultiTermAwareComponent) { AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent(); assertNotNull(mtc); // its not ok to return a charfilter or tokenizer here, this makes no sense assertTrue(mtc instanceof TokenFilterFactory); } // beast it just a little, it shouldnt throw exceptions: // (it should have thrown them in initialize) checkRandomData(random(), new FactoryAnalyzer(assertingTokenizer, factory, null), 100, 20, false, false); } }
private void doTestCharFilter(String charfilter) throws IOException { Class<? extends CharFilterFactory> factoryClazz = CharFilterFactory.lookupClass(charfilter); CharFilterFactory factory = (CharFilterFactory) initialize(factoryClazz); if (factory != null) { // we managed to fully create an instance. check a few more things: // if it implements MultiTermAware, sanity check its impl if (factory instanceof MultiTermAwareComponent) { AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent(); assertNotNull(mtc); // its not ok to return a tokenizer or tokenfilter here, this makes no sense assertTrue(mtc instanceof CharFilterFactory); } // beast it just a little, it shouldnt throw exceptions: // (it should have thrown them in initialize) checkRandomData(random(), new FactoryAnalyzer(assertingTokenizer, null, factory), 100, 20, false, false); } }
/** * Test for {@link JdbcSynonymFilterFactory#create(TokenStream)}. */ @Test public void create() throws Exception { Map<String, String> args = new HashMap<>(); args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, Version.LATEST.toString()); args.put(JdbcReaderFactoryParams.DATASOURCE, "java:comp/env/dataSource"); args.put(JdbcReaderFactoryParams.SQL, "select stopword from stopwords"); // White space tokenizer, to lower case tokenizer. MockTokenizer tokenizer = new MockTokenizer(); tokenizer.setReader(new StringReader("test1 somestring test2 anotherstring")); JdbcStopFilterFactory factory = new JdbcStopFilterFactory(args); factory.inform(new ClasspathResourceLoader(getClass().getClassLoader())); try (TokenStream stream = factory.create(tokenizer)) { CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class); stream.reset(); assertTrue(stream.incrementToken()); assertEquals("test1", attribute.toString()); assertTrue(stream.incrementToken()); assertEquals("test2", attribute.toString()); assertFalse(stream.incrementToken()); stream.end(); } }
/**
 * Builds the analyzer components: a pattern tokenizer emitting group 0
 * (the whole match), optionally followed by a lower-case filter.
 *
 * @param lowerCase whether tokens should additionally be lower-cased
 */
public SimpleAnalyzer(boolean lowerCase) {
  Map<String, String> tokenizerParams = new HashMap<>();
  tokenizerParams.put(PatternTokenizerFactory.PATTERN, PATTERN);
  tokenizerParams.put(PatternTokenizerFactory.GROUP, "0");
  tokenizerParams.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, version.name());
  tokenizerFactory = new PatternTokenizerFactory(tokenizerParams);
  if (lowerCase) {
    // Use a separate map for the filter so the tokenizer's args are not reused.
    Map<String, String> filterParams = new HashMap<>();
    filterParams.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, version.name());
    lowerCaseFilterFactory = new LowerCaseFilterFactory(filterParams);
  } else {
    lowerCaseFilterFactory = null;
  }
}
private void doTestTokenizer(String tokenizer) throws IOException { TokenizerFactory factory = TokenizerFactory.forName(tokenizer); if (initialize(factory)) { // we managed to fully create an instance. check a few more things: // if it implements MultiTermAware, sanity check its impl if (factory instanceof MultiTermAwareComponent) { AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent(); assertNotNull(mtc); // its not ok to return e.g. a charfilter here: but a tokenizer could wrap a filter around it assertFalse(mtc instanceof CharFilterFactory); } // beast it just a little, it shouldnt throw exceptions: // (it should have thrown them in initialize) checkRandomData(random(), new FactoryAnalyzer(factory, null, null), 100, 20, false, false); } }
private void doTestTokenFilter(String tokenfilter) throws IOException { TokenFilterFactory factory = TokenFilterFactory.forName(tokenfilter); if (initialize(factory)) { // we managed to fully create an instance. check a few more things: // if it implements MultiTermAware, sanity check its impl if (factory instanceof MultiTermAwareComponent) { AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent(); assertNotNull(mtc); // its not ok to return a charfilter or tokenizer here, this makes no sense assertTrue(mtc instanceof TokenFilterFactory); } // beast it just a little, it shouldnt throw exceptions: // (it should have thrown them in initialize) checkRandomData(random(), new FactoryAnalyzer(assertingTokenizer, factory, null), 100, 20, false, false); } }
private void doTestCharFilter(String charfilter) throws IOException { CharFilterFactory factory = CharFilterFactory.forName(charfilter); if (initialize(factory)) { // we managed to fully create an instance. check a few more things: // if it implements MultiTermAware, sanity check its impl if (factory instanceof MultiTermAwareComponent) { AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent(); assertNotNull(mtc); // its not ok to return a tokenizer or tokenfilter here, this makes no sense assertTrue(mtc instanceof CharFilterFactory); } // beast it just a little, it shouldnt throw exceptions: // (it should have thrown them in initialize) checkRandomData(random(), new FactoryAnalyzer(assertingTokenizer, null, factory), 100, 20, false, false); } }
/** * Test for {@link JdbcSynonymFilterFactory#create(TokenStream)}. */ @Test public void create() throws Exception { Map<String, String> args = new HashMap<>(); args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, Version.LUCENE_5_0_0.toString()); args.put(JdbcReaderFactoryParams.DATASOURCE, "java:comp/env/dataSource"); args.put(JdbcReaderFactoryParams.SQL, "select stopword from stopwords"); // White space tokenizer, to lower case tokenizer. MockTokenizer tokenizer = new MockTokenizer(); tokenizer.setReader(new StringReader("test1 somestring test2 anotherstring")); JdbcStopFilterFactory factory = new JdbcStopFilterFactory(args); factory.inform(new ClasspathResourceLoader()); try (TokenStream stream = factory.create(tokenizer)) { CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class); stream.reset(); assertTrue(stream.incrementToken()); assertEquals("test1", attribute.toString()); assertTrue(stream.incrementToken()); assertEquals("test2", attribute.toString()); assertFalse(stream.incrementToken()); stream.end(); } }
public void testBigramTokenizer() throws Exception { SlowSynonymMap synMap; // prepare bi-gram tokenizer factory Map<String, String> args = new HashMap<String, String>(); args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, "4.4"); args.put("minGramSize","2"); args.put("maxGramSize","2"); TokenizerFactory tf = new NGramTokenizerFactory(args); // (ab)->(bc)->(cd)->[ef][fg][gh] List<String> rules = new ArrayList<String>(); rules.add( "abcd=>efgh" ); synMap = new SlowSynonymMap( true ); SlowSynonymFilterFactory.parseRules( rules, synMap, "=>", ",", true, tf); assertEquals( 1, synMap.submap.size() ); assertEquals( 1, getSubSynonymMap( synMap, "ab" ).submap.size() ); assertEquals( 1, getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ).submap.size() ); assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "ef" ); assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "fg" ); assertTokIncludes( getSubSynonymMap( getSubSynonymMap( synMap, "ab" ), "bc" ), "cd", "gh" ); }
/** * Test for {@link JdbcSynonymFilterFactory#create(TokenStream)}. */ @Test public void create() throws Exception { Map<String, String> args = new HashMap<>(); args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, Version.LATEST.toString()); args.put(JdbcReaderFactoryParams.DATASOURCE, "java:comp/env/dataSource"); args.put(JdbcReaderFactoryParams.SQL, "select synonyms from synonyms"); // White space tokenizer, to lower case tokenizer. MockTokenizer tokenizer = new MockTokenizer(); tokenizer.setReader(new StringReader("test1 test2")); JdbcSynonymFilterFactory factory = new JdbcSynonymFilterFactory(args); factory.inform(new ClasspathResourceLoader(getClass().getClassLoader())); try (TokenStream stream = factory.create(tokenizer)) { CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class); stream.reset(); assertTrue(stream.incrementToken()); assertEquals("testA", attribute.toString()); assertTrue(stream.incrementToken()); assertEquals("testB", attribute.toString()); assertTrue(stream.incrementToken()); assertEquals("testC", attribute.toString()); assertTrue(stream.incrementToken()); assertEquals("testD", attribute.toString()); assertFalse(stream.incrementToken()); stream.end(); } }
/**
 * Collects the multi-term variant of an analysis component.
 *
 * <p>Components that are not {@link MultiTermAwareComponent} are ignored.
 * Otherwise the component's multi-term counterpart is routed into the
 * matching slot: token filters are appended, the tokenizer replaces any
 * previous one, and char filters are appended. Any other factory type is
 * a server error.</p>
 *
 * @param current the analysis component to inspect
 */
public void add(Object current) {
  if (!(current instanceof MultiTermAwareComponent)) return;
  AbstractAnalysisFactory newComponent = ((MultiTermAwareComponent) current).getMultiTermComponent();
  if (newComponent instanceof TokenFilterFactory) {
    if (filters == null) {
      // lazily created; most chains have at most a couple of filters
      filters = new ArrayList<>(2);
    }
    filters.add((TokenFilterFactory) newComponent);
  } else if (newComponent instanceof TokenizerFactory) {
    tokenizer = (TokenizerFactory) newComponent;
  } else if (newComponent instanceof CharFilterFactory) {
    if (charFilters == null) {
      charFilters = new ArrayList<>(1);
    }
    charFilters.add((CharFilterFactory) newComponent);
  } else {
    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unknown analysis component from MultiTermAwareComponent: " + newComponent);
  }
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
  // Multi-term queries should be lower-cased the same way as regular
  // analysis, so hand back a fresh lower-case filter factory configured
  // with our own version and args.
  LowerCaseFilterFactory factory = new LowerCaseFilterFactory();
  factory.setLuceneMatchVersion(luceneMatchVersion);
  factory.init(args);
  return factory;
}
/** * Test for {@link JdbcSynonymFilterFactory#create(TokenStream)}. */ @Test public void create() throws Exception { Map<String, String> args = new HashMap<>(); args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, Version.LUCENE_5_0_0.toString()); args.put(JdbcReaderFactoryParams.DATASOURCE, "java:comp/env/dataSource"); args.put(JdbcReaderFactoryParams.SQL, "select synonyms from synonyms"); // White space tokenizer, to lower case tokenizer. MockTokenizer tokenizer = new MockTokenizer(); tokenizer.setReader(new StringReader("test1 test2")); JdbcSynonymFilterFactory factory = new JdbcSynonymFilterFactory(args); factory.inform(new ClasspathResourceLoader()); try (TokenStream stream = factory.create(tokenizer)) { CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class); stream.reset(); assertTrue(stream.incrementToken()); assertEquals("testA", attribute.toString()); assertTrue(stream.incrementToken()); assertEquals("testB", attribute.toString()); assertTrue(stream.incrementToken()); assertEquals("testC", attribute.toString()); assertTrue(stream.incrementToken()); assertEquals("testD", attribute.toString()); assertFalse(stream.incrementToken()); stream.end(); } }
/**
 * Rewrites a factory argument map in place: moves the {@code BASE_CLASS}
 * entry to the {@code CLASS} key and pins the Lucene match version.
 *
 * @param baseArgs      argument map to rewrite (mutated in place)
 * @param luceneVersion Lucene match version to record in the map
 * @return the base class name that was moved to the {@code CLASS} key
 * @throws IllegalArgumentException if {@code baseArgs} has no base-class entry
 */
public static String initBaseArgs(final Map<String, String> baseArgs, final String luceneVersion) {
  final String baseClass = baseArgs.remove(BASE_CLASS);
  if (baseClass == null) {
    // Previously a missing base class was silently stored as a null CLASS
    // value; fail fast so the misconfiguration is reported at setup time.
    throw new IllegalArgumentException("Missing required argument: " + BASE_CLASS);
  }
  baseArgs.put(CLASS, baseClass);
  baseArgs.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, luceneVersion);
  return baseClass;
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
  // This factory is safe to apply to multi-term queries as-is, so it acts
  // as its own multi-term component.
  return this;
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
  // Build the multi-term component from a defensive copy of the original
  // args, since factory constructors may consume the map they receive.
  Map<String, String> argsCopy = new HashMap<>(getOriginalArgs());
  return new LowerCaseFilterFactory(argsCopy);
}
/** Specifying which Lucene version we need */
private void addLuceneVersionParam() {
  // Stamp the configured Lucene compatibility version into the tokenizer
  // class args so the factory built from them targets a known version.
  mTokClassArgs.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, UtilConst.LUCENE_VERSION);
}