@Test
public void testNormalizerCharFilter() throws Exception {
    // The normalizer fixes the typo coda in "입니닼ㅋ" ("입니다" + "ㅋ") and caps the
    // repeated-jamo run ("ㅎㅎㅎㅎㅎㅎㅎ" -> "ㅎㅎㅎ"), as the expected string shows.
    String query = "한국어를 처리하는 예시입니닼ㅋ. 오픈코리안텍스틓ㅎㅎㅎㅎㅎㅎㅎ";
    String expected = "한국어를 처리하는 예시입니다ㅋ. 오픈코리안텍스트ㅎㅎㅎ";
    CharFilter inputReader = new OpenKoreanTextNormalizer(new StringReader(query));

    char[] tempBuff = new char[10];
    StringBuilder actual = new StringBuilder();
    while (true) {
        int length = inputReader.read(tempBuff);
        if (length == -1) break;
        actual.append(tempBuff, 0, length);
    }
    Assert.assertEquals(expected, actual.toString());
}
public void testDefaultSetting() throws Exception {
    Settings settings = Settings.builder()
            .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
            .build();
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());
    CharFilterFactory charFilterFactory = analysis.charFilter.get("myNormalizerChar");

    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    String expectedOutput = normalizer.normalize(input);

    CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input));
    char[] tempBuff = new char[10];
    StringBuilder output = new StringBuilder();
    while (true) {
        int length = inputReader.read(tempBuff);
        if (length == -1) break;
        output.append(tempBuff, 0, length);
        // After each buffered read, the output so far must equal the normalization
        // of the input prefix the filter has consumed (via correctOffset).
        assertEquals(output.toString(),
                normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length()))));
    }
    assertEquals(expectedOutput, output.toString());
}
public void testNameAndModeSetting() throws Exception {
    Settings settings = Settings.builder()
            .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
            .put("index.analysis.char_filter.myNormalizerChar.name", "nfkc")
            .put("index.analysis.char_filter.myNormalizerChar.mode", "decompose")
            .build();
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());
    CharFilterFactory charFilterFactory = analysis.charFilter.get("myNormalizerChar");

    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
    String expectedOutput = normalizer.normalize(input);

    CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input));
    char[] tempBuff = new char[10];
    StringBuilder output = new StringBuilder();
    while (true) {
        int length = inputReader.read(tempBuff);
        if (length == -1) break;
        output.append(tempBuff, 0, length);
        assertEquals(output.toString(),
                normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length()))));
    }
    assertEquals(expectedOutput, output.toString());
}
public void testNormalization() throws IOException {
    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    String expectedOutput = normalizer.normalize(input);

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer);
    char[] tempBuff = new char[10];
    StringBuilder output = new StringBuilder();
    while (true) {
        int length = reader.read(tempBuff);
        if (length == -1) {
            break;
        }
        output.append(tempBuff, 0, length);
        assertEquals(output.toString(),
                normalizer.normalize(input.substring(0, reader.correctOffset(output.length()))));
    }
    assertEquals(expectedOutput, output.toString());
}
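// The 10-char buffered drain loop above recurs throughout these tests. A small
// hypothetical helper (not part of the original tests) capturing the same pattern:
private static String readFully(Reader reader) throws IOException {
    char[] tempBuff = new char[10]; // deliberately tiny, to exercise incremental reads
    StringBuilder sb = new StringBuilder();
    int length;
    while ((length = reader.read(tempBuff)) != -1) {
        sb.append(tempBuff, 0, length);
    }
    return sb.toString();
}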
private CharFilterSpec newCharFilterChain(Random random, Reader reader) {
    CharFilterSpec spec = new CharFilterSpec();
    spec.reader = reader;
    StringBuilder descr = new StringBuilder();
    int numFilters = random.nextInt(3);
    for (int i = 0; i < numFilters; i++) {
        // Keep drawing random CharFilter constructors until one produces a usable
        // component, skipping argument combinations flagged as broken.
        while (true) {
            final Constructor<? extends CharFilter> ctor = charfilters.get(random.nextInt(charfilters.size()));
            final Object[] args = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
            if (broken(ctor, args)) {
                continue;
            }
            reader = createComponent(ctor, args, descr);
            if (reader != null) {
                spec.reader = reader;
                break;
            }
        }
    }
    spec.toString = descr.toString();
    return spec;
}
public void testIterationMarksWithJapaneseTokenizer() throws IOException {
    JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory();
    Map<String, String> tokenizerArgs = Collections.emptyMap();
    tokenizerFactory.init(tokenizerArgs);
    tokenizerFactory.inform(new StringMockResourceLoader(""));

    JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory();
    Map<String, String> filterArgs = Collections.emptyMap();
    filterFactory.init(filterArgs);

    CharFilter filter = filterFactory.create(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"));
    TokenStream tokenStream = tokenizerFactory.create(filter);
    assertTokenStreamContents(tokenStream, new String[]{"時時", "馬鹿馬鹿しい", "ところどころ", "ミ", "スズ"});
}
public void testKanjiOnlyIterationMarksWithJapaneseTokenizer() throws IOException {
    JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory();
    Map<String, String> tokenizerArgs = Collections.emptyMap();
    tokenizerFactory.init(tokenizerArgs);
    tokenizerFactory.inform(new StringMockResourceLoader(""));

    JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory();
    Map<String, String> filterArgs = new HashMap<String, String>();
    filterArgs.put("normalizeKanji", "true");
    filterArgs.put("normalizeKana", "false");
    filterFactory.init(filterArgs);

    CharFilter filter = filterFactory.create(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"));
    TokenStream tokenStream = tokenizerFactory.create(filter);
    assertTokenStreamContents(tokenStream, new String[]{"時時", "馬鹿馬鹿しい", "ところ", "ゞ", "ゝ", "ゝ", "ミス", "ヾ"});
}
public void testKanaOnlyIterationMarksWithJapaneseTokenizer() throws IOException {
    JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory();
    Map<String, String> tokenizerArgs = Collections.emptyMap();
    tokenizerFactory.init(tokenizerArgs);
    tokenizerFactory.inform(new StringMockResourceLoader(""));

    JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory();
    Map<String, String> filterArgs = new HashMap<String, String>();
    filterArgs.put("normalizeKanji", "false");
    filterArgs.put("normalizeKana", "true");
    filterFactory.init(filterArgs);

    CharFilter filter = filterFactory.create(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"));
    TokenStream tokenStream = tokenizerFactory.create(filter);
    assertTokenStreamContents(tokenStream, new String[]{"時々", "馬鹿", "々", "々", "しい", "ところどころ", "ミ", "スズ"});
}
public void testIterationMarksWithKeywordTokenizer() throws IOException {
    final String text = "時々馬鹿々々しいところゞゝゝミスヾ";
    JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(new HashMap<String, String>());
    CharFilter filter = filterFactory.create(new StringReader(text));
    TokenStream tokenStream = new MockTokenizer(filter, MockTokenizer.KEYWORD, false);
    assertTokenStreamContents(tokenStream, new String[]{"時時馬鹿馬鹿しいところどころミスズ"});
}
public void testIterationMarksWithJapaneseTokenizer() throws IOException {
    JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String, String>());
    tokenizerFactory.inform(new StringMockResourceLoader(""));

    JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(new HashMap<String, String>());
    CharFilter filter = filterFactory.create(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"));
    TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory(), filter);
    assertTokenStreamContents(tokenStream, new String[]{"時時", "馬鹿馬鹿しい", "ところどころ", "ミ", "スズ"});
}
public void testKanjiOnlyIterationMarksWithJapaneseTokenizer() throws IOException {
    JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String, String>());
    tokenizerFactory.inform(new StringMockResourceLoader(""));

    Map<String, String> filterArgs = new HashMap<>();
    filterArgs.put("normalizeKanji", "true");
    filterArgs.put("normalizeKana", "false");
    JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);

    CharFilter filter = filterFactory.create(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"));
    TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory(), filter);
    assertTokenStreamContents(tokenStream, new String[]{"時時", "馬鹿馬鹿しい", "ところ", "ゞ", "ゝ", "ゝ", "ミス", "ヾ"});
}
public void testKanaOnlyIterationMarksWithJapaneseTokenizer() throws IOException {
    JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String, String>());
    tokenizerFactory.inform(new StringMockResourceLoader(""));

    Map<String, String> filterArgs = new HashMap<>();
    filterArgs.put("normalizeKanji", "false");
    filterArgs.put("normalizeKana", "true");
    JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);

    CharFilter filter = filterFactory.create(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"));
    TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory(), filter);
    assertTokenStreamContents(tokenStream, new String[]{"時々", "馬鹿", "々", "々", "しい", "ところどころ", "ミ", "スズ"});
}
public void testKanjiOnly() throws IOException {
    // Test kanji only repetition marks
    CharFilter filter = new JapaneseIterationMarkCharFilter(
            new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
            true,  // kanji
            false  // no kana
    );
    assertCharFilterEquals(filter, "時時、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。");
}
public void testKanaOnly() throws IOException {
    // Test kana only repetition marks
    CharFilter filter = new JapaneseIterationMarkCharFilter(
            new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
            false, // no kanji
            true   // kana
    );
    assertCharFilterEquals(filter, "時々、おおのさんと一緒にお寿司が食べたいです。abcところどころ。");
}
public void testNone() throws IOException {
    // Test no repetition marks
    CharFilter filter = new JapaneseIterationMarkCharFilter(
            new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
            false, // no kanji
            false  // no kana
    );
    assertCharFilterEquals(filter, "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。");
}
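// assertCharFilterEquals is used by the three tests above but not defined in this
// excerpt. A minimal sketch, assuming it simply drains the filter character by
// character and compares the result against the expected string:
private void assertCharFilterEquals(CharFilter filter, String expected) throws IOException {
    StringBuilder actual = new StringBuilder();
    int ch;
    while ((ch = filter.read()) != -1) { // read() returns -1 at end of stream
        actual.append((char) ch);
    }
    assertEquals(expected, actual.toString());
}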
public void testTokenStream() throws IOException {
    // Input: '℃', '№', '㈱', '㌘', then 'サ', 'ソ', and '㌰' each followed by a
    // combining voiced sound mark (dakuten, U+3099), i.e. the kana are decomposed.
    String input = "℃ № ㈱ ㌘ ザ ゾ ㌰゙";
    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
            Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE));
    Tokenizer tokenStream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(tokenStream,
            new String[] {"°C", "No", "(株)", "グラム", "ザ", "ゾ", "ピゴ"},
            new int[] {0, 2, 4, 6, 8, 11, 14},
            new int[] {1, 3, 5, 7, 10, 13, 16},
            input.length());
}
public void testTokenStream2() throws IOException {
    // Input: '㌰' + combining voiced sound mark (U+3099), '5', '℃', '№', '㈱',
    // '㌘', then 'サ' + U+3099 and 'ソ' + U+3099 (decomposed kana).
    String input = "㌰゙5℃№㈱㌘ザゾ";
    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
            Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
    Tokenizer tokenStream = new NGramTokenizer(newAttributeFactory(), reader, 1, 1);
    assertTokenStreamContents(tokenStream,
            new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"},
            new int[] {0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
            new int[] {1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
            input.length()
    );
}
public void testMassiveLigature() throws IOException {
    // U+FDFA (ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM) expands under NFKC
    // into a four-word phrase, every word mapping back to the single input char.
    String input = "\uFDFA";
    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
            Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
    Tokenizer tokenStream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(tokenStream,
            new String[] {"صلى", "الله", "عليه", "وسلم"},
            new int[] {0, 0, 0, 0},
            new int[] {0, 0, 0, 1},
            input.length()
    );
}
private void checkOutput(String input, String pattern, String replacement,
                         String expectedOutput, String expectedIndexMatchedOutput) throws IOException {
    CharFilter cs = new PatternReplaceCharFilter(pattern(pattern), replacement, new StringReader(input));

    StringBuilder output = new StringBuilder();
    for (int chr = cs.read(); chr >= 0; chr = cs.read()) { // read() returns -1 at end of stream
        output.append((char) chr);
    }

    // For each output position, record the input character it maps back to
    // ("-" where no input position corresponds).
    StringBuilder indexMatched = new StringBuilder();
    for (int i = 0; i < output.length(); i++) {
        indexMatched.append((cs.correctOffset(i) < 0 ? "-" : input.charAt(cs.correctOffset(i))));
    }

    boolean outputGood = expectedOutput.equals(output.toString());
    boolean indexMatchedGood = expectedIndexMatchedOutput.equals(indexMatched.toString());

    if (!outputGood || !indexMatchedGood || false) { // flip "false" to "true" to always dump diagnostics
        System.out.println("Pattern : " + pattern);
        System.out.println("Replac. : " + replacement);
        System.out.println("Input   : " + input);
        System.out.println("Output  : " + output);
        System.out.println("Expected: " + expectedOutput);
        System.out.println("Output/i: " + indexMatched);
        System.out.println("Expected: " + expectedIndexMatchedOutput);
        System.out.println();
    }

    assertTrue("Output doesn't match.", outputGood);
    assertTrue("Index-matched output doesn't match.", indexMatchedGood);
}
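// pattern(...) is used by checkOutput and the tests below but not defined in this
// excerpt. A minimal sketch, assuming it simply compiles the regular expression
// (java.util.regex.Pattern):
private Pattern pattern(String p) {
    return Pattern.compile(p);
}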
public void testNothingChange() throws IOException {
    final String BLOCK = "this is test.";
    CharFilter cs = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3",
            new StringReader(BLOCK));
    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(ts,
            new String[] { "this", "is", "test." },
            new int[] { 0, 5, 8 },
            new int[] { 4, 7, 13 },
            BLOCK.length());
}
public void testReplaceByEmpty() throws IOException {
    final String BLOCK = "aa bb cc";
    CharFilter cs = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "",
            new StringReader(BLOCK));
    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(ts, new String[] {});
}
public void test1block1matchSameLength() throws IOException {
    final String BLOCK = "aa bb cc";
    CharFilter cs = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3",
            new StringReader(BLOCK));
    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(ts,
            new String[] { "aa#bb#cc" },
            new int[] { 0 },
            new int[] { 8 },
            BLOCK.length());
}
public void test1block1matchLonger() throws IOException {
    final String BLOCK = "aa bb cc dd";
    CharFilter cs = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3",
            new StringReader(BLOCK));
    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(ts,
            new String[] { "aa##bb###cc", "dd" },
            new int[] { 0, 9 },
            new int[] { 8, 11 },
            BLOCK.length());
}
public void test1block2matchLonger() throws IOException {
    final String BLOCK = " a a";
    CharFilter cs = new PatternReplaceCharFilter(pattern("a"), "aa",
            new StringReader(BLOCK));
    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(ts,
            new String[] { "aa", "aa" },
            new int[] { 1, 4 },
            new int[] { 2, 5 },
            BLOCK.length());
}
public void test1block1matchShorter() throws IOException {
    final String BLOCK = "aa bb cc dd";
    CharFilter cs = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2",
            new StringReader(BLOCK));
    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(ts,
            new String[] { "aa#bb", "dd" },
            new int[] { 0, 12 },
            new int[] { 11, 14 },
            BLOCK.length());
}
public void test1blockMultiMatches() throws IOException {
    final String BLOCK = " aa bb cc --- aa bb aa bb cc";
    CharFilter cs = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "$1 $2 $3",
            new StringReader(BLOCK));
    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(ts,
            new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" },
            new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 },
            new int[] { 4, 8, 10, 14, 17, 20, 23, 27, 33 },
            BLOCK.length());
}
public void test2blocksMultiMatches() throws IOException {
    final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc";
    CharFilter cs = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)"), "$1##$2",
            new StringReader(BLOCK));
    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(ts,
            new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" },
            new int[] { 2, 8, 11, 15, 21, 25, 28, 36 },
            new int[] { 7, 10, 14, 20, 24, 27, 35, 38 },
            BLOCK.length());
}
public void testChain() throws IOException {
    final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb";
    CharFilter cs = new PatternReplaceCharFilter(pattern("a"), "aa", new StringReader(BLOCK));
    cs = new PatternReplaceCharFilter(pattern("bb"), "b", cs);
    cs = new PatternReplaceCharFilter(pattern("ccc"), "c", cs);
    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(ts,
            new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
            new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },
            new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 },
            BLOCK.length());
}
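// The MappingCharFilter tests below rely on a normMap fixture that is not shown in
// this excerpt. A sketch consistent with their expected token streams; the mappings
// are inferred from the assertions, not copied from the original setUp:
private NormalizeCharMap normMap;

@Override
public void setUp() throws Exception {
    super.setUp();
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("aa", "a");
    builder.add("bbb", "b");
    builder.add("cccc", "cc");
    builder.add("h", "i");
    builder.add("j", "jj");
    builder.add("k", "kkk");
    builder.add("ll", "llll");
    normMap = builder.build();
}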
public void testReaderReset() throws Exception {
    CharFilter cs = new MappingCharFilter(normMap, new StringReader("x"));
    char[] buf = new char[10];
    int len = cs.read(buf, 0, 10);
    assertEquals(1, len);
    assertEquals('x', buf[0]);
    len = cs.read(buf, 0, 10);
    assertEquals(-1, len);

    // rewind
    cs.reset();
    len = cs.read(buf, 0, 10);
    assertEquals(1, len);
    assertEquals('x', buf[0]);
}
public void testTokenStream() throws Exception {
    String testString = "h i j k ll cccc bbb aa";
    CharFilter cs = new MappingCharFilter(normMap, new StringReader(testString));
    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(ts,
            new String[]{"i", "i", "jj", "kkk", "llll", "cc", "b", "a"},
            new int[]{0, 2, 4, 6, 8, 11, 16, 20},
            new int[]{1, 3, 5, 7, 10, 15, 19, 22},
            testString.length()
    );
}
public void testChained() throws Exception {
    String testString = "aaaa ll h";
    CharFilter cs = new MappingCharFilter(normMap,
            new MappingCharFilter(normMap, new StringReader(testString)));
    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
    assertTokenStreamContents(ts,
            new String[]{"a", "llllllll", "i"},
            new int[]{0, 5, 8},
            new int[]{4, 7, 9},
            testString.length()
    );
}
public void testIterationMarksWithKeywordTokenizer() throws IOException {
    final String text = "時々馬鹿々々しいところゞゝゝミスヾ";
    JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory();
    CharFilter filter = filterFactory.create(new StringReader(text));
    TokenStream tokenStream = new MockTokenizer(filter, MockTokenizer.KEYWORD, false);
    assertTokenStreamContents(tokenStream, new String[]{"時時馬鹿馬鹿しいところどころミスズ"});
}