@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  Tokenizer token = new IKTokenizer(reader, useSmart);
  // Build the SynonymFilterFactory arguments from the first configured synonym dictionary.
  Map<String, String> paramsMap = new HashMap<String, String>();
  Configuration cfg = DefaultConfig.getInstance();
  paramsMap.put("luceneMatchVersion", luceneMatchVersion.toString());
  paramsMap.put("synonyms", cfg.getExtSynonymDictionarys().get(0));
  paramsMap.put("ignoreCase", "true");
  SynonymFilterFactory factory = new SynonymFilterFactory(paramsMap);
  // Load the synonym file from the classpath.
  ResourceLoader loader = new ClasspathResourceLoader();
  try {
    factory.inform(loader);
  } catch (IOException e) {
    e.printStackTrace();
  }
  return new TokenStreamComponents(token, factory.create(token));
}
public void testInform() throws Exception {
  ResourceLoader loader = new ClasspathResourceLoader(getClass());
  assertTrue("loader is null and it shouldn't be", loader != null);
  KeepWordFilterFactory factory = (KeepWordFilterFactory) tokenFilterFactory("KeepWord",
      "words", "keep-1.txt",
      "ignoreCase", "true");
  CharArraySet words = factory.getWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);

  factory = (KeepWordFilterFactory) tokenFilterFactory("KeepWord",
      "words", "keep-1.txt, keep-2.txt",
      "ignoreCase", "true");
  words = factory.getWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
}
/**
 * Test for {@link JdbcStopFilterFactory#create(TokenStream)}.
 */
@Test
public void create() throws Exception {
  Map<String, String> args = new HashMap<>();
  args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, Version.LATEST.toString());
  args.put(JdbcReaderFactoryParams.DATASOURCE, "java:comp/env/dataSource");
  args.put(JdbcReaderFactoryParams.SQL, "select stopword from stopwords");

  // Whitespace tokenizer that also lower-cases its tokens.
  MockTokenizer tokenizer = new MockTokenizer();
  tokenizer.setReader(new StringReader("test1 somestring test2 anotherstring"));

  JdbcStopFilterFactory factory = new JdbcStopFilterFactory(args);
  factory.inform(new ClasspathResourceLoader(getClass().getClassLoader()));

  try (TokenStream stream = factory.create(tokenizer)) {
    CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    assertTrue(stream.incrementToken());
    assertEquals("test1", attribute.toString());
    assertTrue(stream.incrementToken());
    assertEquals("test2", attribute.toString());
    assertFalse(stream.incrementToken());
    stream.end();
  }
}
/**
 * Case: default
 */
public void testFactory() throws IOException {
  Map<String, String> args = new HashMap<String, String>();
  PhoneticFilterFactory ff = new PhoneticFilterFactory();

  args.put(PhoneticFilterFactory.ENCODER, "Metaphone");
  ff.init(args);
  ff.inform(new ClasspathResourceLoader(ff.getClass()));
  assertTrue(ff.getEncoder() instanceof Metaphone);
  assertTrue(ff.inject); // default

  args.put(PhoneticFilterFactory.INJECT, "false");
  ff.init(args);
  ff.inform(new ClasspathResourceLoader(ff.getClass()));
  assertFalse(ff.inject);

  args.put(PhoneticFilterFactory.MAX_CODE_LENGTH, "2");
  ff.init(args);
  ff.inform(new ClasspathResourceLoader(ff.getClass()));
  assertEquals(2, ((Metaphone) ff.getEncoder()).getMaxCodeLen());
}
/**
 * Ensure the factory works with hyphenation grammar + dictionary, using default options.
 */
public void testHyphenationWithDictionary() throws Exception {
  Reader reader = new StringReader("min veninde som er lidt af en læsehest");
  Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  HyphenationCompoundWordTokenFilterFactory factory = new HyphenationCompoundWordTokenFilterFactory();
  ResourceLoader loader = new ClasspathResourceLoader(getClass());
  Map<String, String> args = new HashMap<String, String>();
  args.put("hyphenator", "da_UTF8.xml");
  args.put("dictionary", "da_compoundDictionary.txt");
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  factory.init(args);
  factory.inform(loader);

  TokenStream stream = factory.create(tokenizer);
  assertTokenStreamContents(stream,
      new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 });
}
/**
 * Ensure the factory works with no dictionary, using the hyphenation grammar only.
 * Also change the min/max subword sizes from the defaults: when no dictionary is used,
 * it's generally necessary to tweak these, or you get lots of expansions.
 */
public void testHyphenationOnly() throws Exception {
  Reader reader = new StringReader("basketballkurv");
  Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  HyphenationCompoundWordTokenFilterFactory factory = new HyphenationCompoundWordTokenFilterFactory();
  ResourceLoader loader = new ClasspathResourceLoader(getClass());
  Map<String, String> args = new HashMap<String, String>();
  args.put("hyphenator", "da_UTF8.xml");
  args.put("minSubwordSize", "2");
  args.put("maxSubwordSize", "4");
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  factory.init(args);
  factory.inform(loader);

  TokenStream stream = factory.create(tokenizer);
  assertTokenStreamContents(stream,
      new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" });
}
/**
 * If no words are provided, a set of default English stopwords is used.
 */
public void testDefaults() throws Exception {
  ResourceLoader loader = new ClasspathResourceLoader(TestStopFilter.class);
  assertTrue("loader is null and it shouldn't be", loader != null);
  CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  Map<String, String> args = Collections.emptyMap();
  factory.init(args);
  factory.inform(loader);

  CharArraySet words = factory.getCommonWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue(words.contains("the"));

  Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false);
  TokenStream stream = factory.create(tokenizer);
  assertTokenStreamContents(stream,
      new String[] { "testing", "testing_the", "the", "the_factory", "factory" });
}
/**
 * If no words are provided, a set of default English stopwords is used.
 */
public void testDefaults() throws Exception {
  ResourceLoader loader = new ClasspathResourceLoader(TestStopFilter.class);
  assertTrue("loader is null and it shouldn't be", loader != null);
  CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  Map<String, String> args = Collections.emptyMap();
  factory.init(args);
  factory.inform(loader);

  CharArraySet words = factory.getCommonWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue(words.contains("the"));

  Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false);
  TokenStream stream = factory.create(tokenizer);
  assertTokenStreamContents(stream,
      new String[] { "testing_the", "the_factory" });
}
@Test
public void testInform() throws Exception {
  ResourceLoader loader = new ClasspathResourceLoader(getClass());
  TypeTokenFilterFactory factory = new TypeTokenFilterFactory();
  Map<String, String> args = new HashMap<String, String>();
  args.put("types", "stoptypes-1.txt");
  args.put("enablePositionIncrements", "true");
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  factory.init(args);
  factory.inform(loader);

  Set<String> types = factory.getStopTypes();
  assertTrue("types is null and it shouldn't be", types != null);
  assertTrue("types Size: " + types.size() + " is not: " + 2, types.size() == 2);
  assertTrue("enablePositionIncrements was set to true but not correctly parsed",
      factory.isEnablePositionIncrements());

  factory = new TypeTokenFilterFactory();
  args.put("types", "stoptypes-1.txt, stoptypes-2.txt");
  args.put("enablePositionIncrements", "false");
  args.put("useWhitelist", "true");
  factory.init(args);
  factory.inform(loader);

  types = factory.getStopTypes();
  assertTrue("types is null and it shouldn't be", types != null);
  assertTrue("types Size: " + types.size() + " is not: " + 4, types.size() == 4);
  assertTrue("enablePositionIncrements was set to false but not correctly parsed",
      !factory.isEnablePositionIncrements());
}
/**
 * Test for {@link JdbcStopFilterFactory#create(TokenStream)}.
 */
@Test
public void create() throws Exception {
  Map<String, String> args = new HashMap<>();
  args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, Version.LUCENE_5_0_0.toString());
  args.put(JdbcReaderFactoryParams.DATASOURCE, "java:comp/env/dataSource");
  args.put(JdbcReaderFactoryParams.SQL, "select stopword from stopwords");

  // Whitespace tokenizer that also lower-cases its tokens.
  MockTokenizer tokenizer = new MockTokenizer();
  tokenizer.setReader(new StringReader("test1 somestring test2 anotherstring"));

  JdbcStopFilterFactory factory = new JdbcStopFilterFactory(args);
  factory.inform(new ClasspathResourceLoader());

  try (TokenStream stream = factory.create(tokenizer)) {
    CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    assertTrue(stream.incrementToken());
    assertEquals("test1", attribute.toString());
    assertTrue(stream.incrementToken());
    assertEquals("test2", attribute.toString());
    assertFalse(stream.incrementToken());
    stream.end();
  }
}
public void testMixedText() throws Exception {
  Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ");
  ICUTokenizerFactory factory = new ICUTokenizerFactory(new HashMap<String, String>());
  factory.inform(new ClasspathResourceLoader(getClass()));
  TokenStream stream = factory.create(newAttributeFactory(), reader);
  assertTokenStreamContents(stream,
      new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี",
          "This", "is", "a", "test", "ກວ່າ", "ດອກ" });
}
public void testTokenizeLatinDontBreakOnHyphens() throws Exception {
  Reader reader = new StringReader(
      "One-two punch. Brang-, not brung-it. This one--not that one--is the right one, -ish.");
  final Map<String, String> args = new HashMap<>();
  args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-dont-break-on-hyphens.rbbi");
  ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
  factory.inform(new ClasspathResourceLoader(getClass()));
  TokenStream stream = factory.create(newAttributeFactory(), reader);
  assertTokenStreamContents(stream,
      new String[] { "One-two", "punch",
          "Brang", "not", "brung-it",
          "This", "one", "not", "that", "one", "is", "the", "right", "one", "ish" });
}
/**
 * Specify more than one script/rule file pair.
 * Override the default DefaultICUTokenizerConfig Thai script tokenization.
 * Use the same rule file for both scripts.
 */
public void testKeywordTokenizeCyrillicAndThai() throws Exception {
  Reader reader = new StringReader(
      "Some English. Немного русский. ข้อความภาษาไทยเล็ก ๆ น้อย ๆ More English.");
  final Map<String, String> args = new HashMap<>();
  args.put(ICUTokenizerFactory.RULEFILES, "Cyrl:KeywordTokenizer.rbbi,Thai:KeywordTokenizer.rbbi");
  ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
  factory.inform(new ClasspathResourceLoader(getClass()));
  TokenStream stream = factory.create(newAttributeFactory(), reader);
  assertTokenStreamContents(stream,
      new String[] { "Some", "English",
          "Немного русский. ",
          "ข้อความภาษาไทยเล็ก ๆ น้อย ๆ ",
          "More", "English" });
}
/**
 * Case: default
 */
public void testFactoryDefaults() throws IOException {
  Map<String, String> args = new HashMap<>();
  args.put(PhoneticFilterFactory.ENCODER, "Metaphone");
  PhoneticFilterFactory factory = new PhoneticFilterFactory(args);
  factory.inform(new ClasspathResourceLoader(factory.getClass()));
  assertTrue(factory.getEncoder() instanceof Metaphone);
  assertTrue(factory.inject); // default
}
public void testInjectFalse() throws IOException {
  Map<String, String> args = new HashMap<>();
  args.put(PhoneticFilterFactory.ENCODER, "Metaphone");
  args.put(PhoneticFilterFactory.INJECT, "false");
  PhoneticFilterFactory factory = new PhoneticFilterFactory(args);
  factory.inform(new ClasspathResourceLoader(factory.getClass()));
  assertFalse(factory.inject);
}
public void testMaxCodeLength() throws IOException {
  Map<String, String> args = new HashMap<>();
  args.put(PhoneticFilterFactory.ENCODER, "Metaphone");
  args.put(PhoneticFilterFactory.MAX_CODE_LENGTH, "2");
  PhoneticFilterFactory factory = new PhoneticFilterFactory(args);
  factory.inform(new ClasspathResourceLoader(factory.getClass()));
  assertEquals(2, ((Metaphone) factory.getEncoder()).getMaxCodeLen());
}
public void testUnknownEncoder() throws IOException {
  try {
    Map<String, String> args = new HashMap<>();
    args.put("encoder", "XXX");
    PhoneticFilterFactory factory = new PhoneticFilterFactory(args);
    factory.inform(new ClasspathResourceLoader(factory.getClass()));
    fail();
  } catch (IllegalArgumentException expected) {
    assertTrue(expected.getMessage().contains("Error loading encoder"));
  }
}
public void testUnknownEncoderReflection() throws IOException {
  try {
    Map<String, String> args = new HashMap<>();
    args.put("encoder", "org.apache.commons.codec.language.NonExistence");
    PhoneticFilterFactory factory = new PhoneticFilterFactory(args);
    factory.inform(new ClasspathResourceLoader(factory.getClass()));
    fail();
  } catch (IllegalArgumentException expected) {
    assertTrue(expected.getMessage().contains("Error loading encoder"));
  }
}
/**
 * Case: Reflection
 */
public void testFactoryReflection() throws IOException {
  Map<String, String> args = new HashMap<>();
  args.put(PhoneticFilterFactory.ENCODER, "org.apache.commons.codec.language.Metaphone");
  PhoneticFilterFactory factory = new PhoneticFilterFactory(args);
  factory.inform(new ClasspathResourceLoader(factory.getClass()));
  assertTrue(factory.getEncoder() instanceof Metaphone);
  assertTrue(factory.inject); // default
}
/**
 * We use "Caverphone2" as it is registered in the REGISTRY as Caverphone,
 * so this effectively tests reflection without a package name.
 */
public void testFactoryReflectionCaverphone2() throws IOException {
  Map<String, String> args = new HashMap<>();
  args.put(PhoneticFilterFactory.ENCODER, "Caverphone2");
  PhoneticFilterFactory factory = new PhoneticFilterFactory(args);
  factory.inform(new ClasspathResourceLoader(factory.getClass()));
  assertTrue(factory.getEncoder() instanceof Caverphone2);
  assertTrue(factory.inject); // default
}
public void testFactoryReflectionCaverphone() throws IOException {
  Map<String, String> args = new HashMap<>();
  args.put(PhoneticFilterFactory.ENCODER, "Caverphone");
  PhoneticFilterFactory factory = new PhoneticFilterFactory(args);
  factory.inform(new ClasspathResourceLoader(factory.getClass()));
  assertTrue(factory.getEncoder() instanceof Caverphone2);
  assertTrue(factory.inject); // default
}
static void assertAlgorithm(String algName, String inject, String input, String[] expected) throws Exception {
  Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
  Map<String, String> args = new HashMap<>();
  args.put("encoder", algName);
  args.put("inject", inject);
  PhoneticFilterFactory factory = new PhoneticFilterFactory(args);
  factory.inform(new ClasspathResourceLoader(factory.getClass()));
  TokenStream stream = factory.create(tokenizer);
  assertTokenStreamContents(stream, expected);
}
/**
 * Test that we can parse and use the Solr synonyms file with the old implementation.
 * @deprecated Remove this test in Lucene 5.0
 */
@Deprecated
public void testSynonymsOld() throws Exception {
  Reader reader = new StringReader("GB");
  TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  stream = tokenFilterFactory("Synonym", Version.LUCENE_3_3,
      new ClasspathResourceLoader(getClass()),
      "synonyms", "synonyms.txt").create(stream);
  assertTrue(stream instanceof SlowSynonymFilter);
  assertTokenStreamContents(stream,
      new String[] { "GB", "gib", "gigabyte", "gigabytes" },
      new int[] { 1, 0, 0, 0 });
}
/** @deprecated Remove this test in Lucene 5.0 */
@Deprecated
public void testMatchVersion() throws Exception {
  Reader reader = new StringReader("ざ");
  TokenStream stream = tokenizerFactory("UAX29URLEmail").create(reader);
  assertTokenStreamContents(stream, new String[] { "ざ" });

  reader = new StringReader("ざ");
  stream = tokenizerFactory("UAX29URLEmail", Version.LUCENE_3_1,
      new ClasspathResourceLoader(getClass())).create(reader);
  assertTokenStreamContents(stream, new String[] { "さ" }); // old broken behavior
}
public void testInform() throws Exception {
  ResourceLoader loader = new ClasspathResourceLoader(TestStopFilter.class);
  assertTrue("loader is null and it shouldn't be", loader != null);
  CommonGramsFilterFactory factory = (CommonGramsFilterFactory) tokenFilterFactory("CommonGrams",
      TEST_VERSION_CURRENT, loader,
      "words", "stop-1.txt",
      "ignoreCase", "true");
  CharArraySet words = factory.getCommonWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
  assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);

  factory = (CommonGramsFilterFactory) tokenFilterFactory("CommonGrams",
      TEST_VERSION_CURRENT, loader,
      "words", "stop-1.txt, stop-2.txt",
      "ignoreCase", "true");
  words = factory.getCommonWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
  assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);

  factory = (CommonGramsFilterFactory) tokenFilterFactory("CommonGrams",
      TEST_VERSION_CURRENT, loader,
      "words", "stop-snowball.txt",
      "format", "snowball",
      "ignoreCase", "true");
  words = factory.getCommonWords();
  assertEquals(8, words.size());
  assertTrue(words.contains("he"));
  assertTrue(words.contains("him"));
  assertTrue(words.contains("his"));
  assertTrue(words.contains("himself"));
  assertTrue(words.contains("she"));
  assertTrue(words.contains("her"));
  assertTrue(words.contains("hers"));
  assertTrue(words.contains("herself"));
}
public void testInform() throws Exception {
  ResourceLoader loader = new ClasspathResourceLoader(TestStopFilter.class);
  assertTrue("loader is null and it shouldn't be", loader != null);
  CommonGramsQueryFilterFactory factory = (CommonGramsQueryFilterFactory) tokenFilterFactory("CommonGramsQuery",
      TEST_VERSION_CURRENT, loader,
      "words", "stop-1.txt",
      "ignoreCase", "true");
  CharArraySet words = factory.getCommonWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
  assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);

  factory = (CommonGramsQueryFilterFactory) tokenFilterFactory("CommonGramsQuery",
      TEST_VERSION_CURRENT, loader,
      "words", "stop-1.txt, stop-2.txt",
      "ignoreCase", "true");
  words = factory.getCommonWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
  assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);

  factory = (CommonGramsQueryFilterFactory) tokenFilterFactory("CommonGramsQuery",
      TEST_VERSION_CURRENT, loader,
      "words", "stop-snowball.txt",
      "format", "snowball",
      "ignoreCase", "true");
  words = factory.getCommonWords();
  assertEquals(8, words.size());
  assertTrue(words.contains("he"));
  assertTrue(words.contains("him"));
  assertTrue(words.contains("his"));
  assertTrue(words.contains("himself"));
  assertTrue(words.contains("she"));
  assertTrue(words.contains("her"));
  assertTrue(words.contains("hers"));
  assertTrue(words.contains("herself"));
}
public void test() throws Exception {
  Reader reader = new StringReader("foo foobar super-duper-trooper");
  TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
  stream = tokenFilterFactory("Length", Version.LUCENE_4_3,
      new ClasspathResourceLoader(getClass()),
      "min", "4",
      "max", "10",
      "enablePositionIncrements", "false").create(stream);
  assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 });
}
public void testInform() throws Exception {
  ResourceLoader loader = new ClasspathResourceLoader(getClass());
  assertTrue("loader is null and it shouldn't be", loader != null);
  StopFilterFactory factory = (StopFilterFactory) tokenFilterFactory("Stop",
      "words", "stop-1.txt",
      "ignoreCase", "true");
  CharArraySet words = factory.getStopWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
  assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);

  factory = (StopFilterFactory) tokenFilterFactory("Stop",
      "words", "stop-1.txt, stop-2.txt",
      "ignoreCase", "true");
  words = factory.getStopWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
  assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);

  factory = (StopFilterFactory) tokenFilterFactory("Stop",
      "words", "stop-snowball.txt",
      "format", "snowball",
      "ignoreCase", "true");
  words = factory.getStopWords();
  assertEquals(8, words.size());
  assertTrue(words.contains("he"));
  assertTrue(words.contains("him"));
  assertTrue(words.contains("his"));
  assertTrue(words.contains("himself"));
  assertTrue(words.contains("she"));
  assertTrue(words.contains("her"));
  assertTrue(words.contains("hers"));
  assertTrue(words.contains("herself"));

  // defaults
  factory = (StopFilterFactory) tokenFilterFactory("Stop");
  assertEquals(StopAnalyzer.ENGLISH_STOP_WORDS_SET, factory.getStopWords());
  assertEquals(false, factory.isIgnoreCase());
}
/**
 * Test for {@link JdbcSynonymFilterFactory#create(TokenStream)}.
 */
@Test
public void create() throws Exception {
  Map<String, String> args = new HashMap<>();
  args.put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM, Version.LATEST.toString());
  args.put(JdbcReaderFactoryParams.DATASOURCE, "java:comp/env/dataSource");
  args.put(JdbcReaderFactoryParams.SQL, "select synonyms from synonyms");

  // Whitespace tokenizer that also lower-cases its tokens.
  MockTokenizer tokenizer = new MockTokenizer();
  tokenizer.setReader(new StringReader("test1 test2"));

  JdbcSynonymFilterFactory factory = new JdbcSynonymFilterFactory(args);
  factory.inform(new ClasspathResourceLoader(getClass().getClassLoader()));

  try (TokenStream stream = factory.create(tokenizer)) {
    CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    assertTrue(stream.incrementToken());
    assertEquals("testA", attribute.toString());
    assertTrue(stream.incrementToken());
    assertEquals("testB", attribute.toString());
    assertTrue(stream.incrementToken());
    assertEquals("testC", attribute.toString());
    assertTrue(stream.incrementToken());
    assertEquals("testD", attribute.toString());
    assertFalse(stream.incrementToken());
    stream.end();
  }
}
@Test
public void openResource() throws Exception {
  ClasspathResourceLoader parent = new ClasspathResourceLoader(getClass().getClassLoader());
  JdbcReader reader = new TestJdbcReader("test=>test1,test2");
  Charset charset = Charset.forName("UTF-8");
  JdbcResourceLoader loader = new JdbcResourceLoader(parent, reader, charset);

  InputStream resource = loader.openResource(JdbcResourceLoader.DATABASE);
  StringWriter writer = new StringWriter();
  IOUtils.copy(resource, writer, charset);
  assertEquals("test=>test1,test2", writer.toString());
}
public void testMixedText() throws Exception {
  Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ");
  ICUTokenizerFactory factory = new ICUTokenizerFactory();
  factory.init(new HashMap<String, String>());
  factory.inform(new ClasspathResourceLoader(getClass()));
  TokenStream stream = factory.create(reader);
  assertTokenStreamContents(stream,
      new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี",
          "This", "is", "a", "test", "ກວ່າ", "ດອກ" });
}
public void testTokenizeLatinDontBreakOnHyphens() throws Exception {
  Reader reader = new StringReader(
      "One-two punch. Brang-, not brung-it. This one--not that one--is the right one, -ish.");
  ICUTokenizerFactory factory = new ICUTokenizerFactory();
  final Map<String, String> args = new HashMap<String, String>();
  args.put(ICUTokenizerFactory.RULEFILES, "Latn:Latin-dont-break-on-hyphens.rbbi");
  factory.init(args);
  factory.inform(new ClasspathResourceLoader(getClass()));
  TokenStream stream = factory.create(reader);
  assertTokenStreamContents(stream,
      new String[] { "One-two", "punch",
          "Brang", "not", "brung-it",
          "This", "one", "not", "that", "one", "is", "the", "right", "one", "ish" });
}
/**
 * Specify more than one script/rule file pair.
 * Override the default DefaultICUTokenizerConfig Thai script tokenization.
 * Use the same rule file for both scripts.
 */
public void testKeywordTokenizeCyrillicAndThai() throws Exception {
  Reader reader = new StringReader(
      "Some English. Немного русский. ข้อความภาษาไทยเล็ก ๆ น้อย ๆ More English.");
  ICUTokenizerFactory factory = new ICUTokenizerFactory();
  final Map<String, String> args = new HashMap<String, String>();
  args.put(ICUTokenizerFactory.RULEFILES, "Cyrl:KeywordTokenizer.rbbi,Thai:KeywordTokenizer.rbbi");
  factory.init(args);
  factory.inform(new ClasspathResourceLoader(getClass()));
  TokenStream stream = factory.create(reader);
  assertTokenStreamContents(stream,
      new String[] { "Some", "English",
          "Немного русский. ",
          "ข้อความภาษาไทยเล็ก ๆ น้อย ๆ ",
          "More", "English" });
}
/**
 * Case: Reflection
 */
public void testFactoryCaseReflection() throws IOException {
  Map<String, String> args = new HashMap<String, String>();
  PhoneticFilterFactory ff = new PhoneticFilterFactory();
  ClasspathResourceLoader loader = new ClasspathResourceLoader(ff.getClass());

  args.put(PhoneticFilterFactory.ENCODER, "org.apache.commons.codec.language.Metaphone");
  ff.init(args);
  ff.inform(loader);
  assertTrue(ff.getEncoder() instanceof Metaphone);
  assertTrue(ff.inject); // default

  // we use "Caverphone2" as it is registered in the REGISTRY as Caverphone,
  // so this effectively tests reflection without package name
  args.put(PhoneticFilterFactory.ENCODER, "Caverphone2");
  ff.init(args);
  ff.inform(loader);
  assertTrue(ff.getEncoder() instanceof Caverphone2);
  assertTrue(ff.inject); // default

  // cross check with registry
  args.put(PhoneticFilterFactory.ENCODER, "Caverphone");
  ff.init(args);
  ff.inform(loader);
  assertTrue(ff.getEncoder() instanceof Caverphone2);
  assertTrue(ff.inject); // default
}
static void assertAlgorithm(String algName, String inject, String input, String[] expected) throws Exception {
  Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
  Map<String, String> args = new HashMap<String, String>();
  args.put("encoder", algName);
  args.put("inject", inject);
  PhoneticFilterFactory factory = new PhoneticFilterFactory();
  factory.init(args);
  factory.inform(new ClasspathResourceLoader(factory.getClass()));
  TokenStream stream = factory.create(tokenizer);
  assertTokenStreamContents(stream, expected);
}
/** Test that we can parse and use the Solr synonyms file. */
public void testSynonyms() throws Exception {
  SynonymFilterFactory factory = new SynonymFilterFactory();
  Map<String, String> args = new HashMap<String, String>();
  args.put("synonyms", "synonyms.txt");
  factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
  factory.init(args);
  factory.inform(new ClasspathResourceLoader(getClass()));
  TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
  assertTrue(ts instanceof SynonymFilter);
  assertTokenStreamContents(ts,
      new String[] { "GB", "gib", "gigabyte", "gigabytes" },
      new int[] { 1, 0, 0, 0 });
}
/**
 * Test that we can parse and use the Solr synonyms file with the old implementation.
 * @deprecated Remove this test in Lucene 5.0
 */
@Deprecated
public void testSynonymsOld() throws Exception {
  SynonymFilterFactory factory = new SynonymFilterFactory();
  Map<String, String> args = new HashMap<String, String>();
  args.put("synonyms", "synonyms.txt");
  factory.setLuceneMatchVersion(Version.LUCENE_33);
  factory.init(args);
  factory.inform(new ClasspathResourceLoader(getClass()));
  TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
  assertTrue(ts instanceof SlowSynonymFilter);
  assertTokenStreamContents(ts,
      new String[] { "GB", "gib", "gigabyte", "gigabytes" },
      new int[] { 1, 0, 0, 0 });
}