/** @deprecated remove this and sophisticated backwards layer in 5.0 */ @Deprecated public void testCombiningMarksBackwards() throws Exception { Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents (String fieldName, Reader reader) { Tokenizer tokenizer = new UAX29URLEmailTokenizer(Version.LUCENE_3_1, reader); return new TokenStreamComponents(tokenizer); } }; checkOneTerm(a, "ざ", "さ"); // hiragana Bug checkOneTerm(a, "ザ", "ザ"); // katakana Works checkOneTerm(a, "壹゙", "壹"); // ideographic Bug checkOneTerm(a, "아゙", "아゙"); // hangul Works }
/** @deprecated remove this and sophisticated backwards layer in 5.0 */ @Deprecated public void testCombiningMarksBackwards() throws Exception { Analyzer a = new Analyzer() { @Override protected TokenStreamComponents createComponents (String fieldName, Reader reader) { Tokenizer tokenizer = new UAX29URLEmailTokenizer(Version.LUCENE_31, reader); return new TokenStreamComponents(tokenizer); } }; checkOneTerm(a, "ざ", "さ"); // hiragana Bug checkOneTerm(a, "ザ", "ザ"); // katakana Works checkOneTerm(a, "壹゙", "壹"); // ideographic Bug checkOneTerm(a, "아゙", "아゙"); // hangul Works }
public void testLongEMAILatomText() throws Exception { // EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~] char[] emailAtomChars = "!#$%&'*+,-./0123456789=?ABCDEFGHIJKLMNOPQRSTUVWXYZ^_`abcdefghijklmnopqrstuvwxyz{|}~".toCharArray(); StringBuilder builder = new StringBuilder(); int numChars = TestUtil.nextInt(random(), 100 * 1024, 3 * 1024 * 1024); for (int i = 0 ; i < numChars ; ++i) { builder.append(emailAtomChars[random().nextInt(emailAtomChars.length)]); } int tokenCount = 0; String text = builder.toString(); UAX29URLEmailTokenizer ts = new UAX29URLEmailTokenizer(new StringReader(text)); ts.reset(); while (ts.incrementToken()) { tokenCount++; } ts.end(); ts.close(); assertTrue(tokenCount > 0); tokenCount = 0; int newBufferSize = TestUtil.nextInt(random(), 200, 8192); ts.setMaxTokenLength(newBufferSize); ts.setReader(new StringReader(text)); ts.reset(); while (ts.incrementToken()) { tokenCount++; } ts.end(); ts.close(); assertTrue(tokenCount > 0); }
public void testHugeDoc() throws IOException {
  // A long run of leading whitespace followed by real content exercises
  // the tokenizer's buffer-boundary handling.
  char[] whitespace = new char[4094];
  Arrays.fill(whitespace, ' ');
  String input = new StringBuilder().append(whitespace).append("testing 1234").toString();
  UAX29URLEmailTokenizer tokenizer =
      new UAX29URLEmailTokenizer(newAttributeFactory(), new StringReader(input));
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] {"testing", "1234"});
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  // Plain UAX#29 URL/email tokenizer with no downstream filters.
  Tokenizer source = new UAX29URLEmailTokenizer(newAttributeFactory(), reader);
  return new TokenStreamComponents(source);
}
@Override
public final boolean incrementToken() throws java.io.IOException {
  // Pass through only tokens typed as URLs; skip everything else.
  while (input.incrementToken()) {
    // Compare type strings with equals() rather than ==: reference equality
    // only works by accident when both sides are the same interned constant.
    if (UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.URL].equals(typeAtt.type())) {
      return true;
    }
  }
  return false;
}
@Override
public final boolean incrementToken() throws java.io.IOException {
  // Pass through only tokens typed as email addresses; skip everything else.
  while (input.incrementToken()) {
    // Compare type strings with equals() rather than ==: reference equality
    // only works by accident when both sides are the same interned constant.
    if (UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.EMAIL].equals(typeAtt.type())) {
      return true;
    }
  }
  return false;
}
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory(), reader); tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // Tokenize arbitrary length URLs TokenFilter filter = new URLFilter(tokenizer); return new TokenStreamComponents(tokenizer, filter); }
public void testHugeDoc() throws IOException {
  // A long run of leading whitespace followed by real content exercises
  // the tokenizer's buffer-boundary handling.
  char[] whitespace = new char[4094];
  Arrays.fill(whitespace, ' ');
  String input = new StringBuilder().append(whitespace).append("testing 1234").toString();
  UAX29URLEmailTokenizer tokenizer =
      new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] {"testing", "1234"});
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  // Plain UAX#29 URL/email tokenizer with no downstream filters.
  Tokenizer source = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader);
  return new TokenStreamComponents(source);
}
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader); tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // Tokenize arbitrary length URLs TokenFilter filter = new URLFilter(tokenizer); return new TokenStreamComponents(tokenizer, filter); }
public void testLemmagenFilterFactoryWithDefaultLexicon() throws IOException {
  ESTestCase.TestAnalysis analysis = createAnalysis();
  TokenFilterFactory factory = analysis.tokenFilter.get("lemmagen_default_filter");
  assertThat(factory, instanceOf(LemmagenFilterFactory.class));

  // Default lexicon: English inflected forms are reduced to lemmas.
  String source = "I was late.";
  String[] expected = {"I", "be", "late"};

  Tokenizer tokenizer = new UAX29URLEmailTokenizer();
  tokenizer.setReader(new StringReader(source));
  assertTokenStreamContents(factory.create(tokenizer), expected);
}
public void testLemmagenFilterFactoryWithCustomLexicon() throws IOException {
  ESTestCase.TestAnalysis analysis = createAnalysis();
  TokenFilterFactory factory = analysis.tokenFilter.get("lemmagen_cs_filter");
  assertThat(factory, instanceOf(LemmagenFilterFactory.class));

  // Czech lexicon: inflected forms are reduced to lemmas.
  String source = "Děkuji, že jsi přišel.";
  String[] expected = {"Děkovat", "že", "být", "přijít"};

  Tokenizer tokenizer = new UAX29URLEmailTokenizer();
  tokenizer.setReader(new StringReader(source));
  assertTokenStreamContents(factory.create(tokenizer), expected);
}
public void testLemmagenFilterFactoryWithShortLexiconCode() throws IOException {
  ESTestCase.TestAnalysis analysis = createAnalysis();
  TokenFilterFactory factory = analysis.tokenFilter.get("lemmagen_fr_filter");
  assertThat(factory, instanceOf(LemmagenFilterFactory.class));

  // Lexicon selected by short language code: French forms reduced to lemmas.
  String source = "Il faut encore ajouter une pincée de sel.";
  String[] expected = {"Il", "falloir", "encore", "ajouter", "un", "pincer", "de", "sel"};

  Tokenizer tokenizer = new UAX29URLEmailTokenizer();
  tokenizer.setReader(new StringReader(source));
  assertTokenStreamContents(factory.create(tokenizer), expected);
}
public void testLemmagenFilterFactoryWithPath() throws IOException {
  ESTestCase.TestAnalysis analysis = createAnalysis();
  TokenFilterFactory factory = analysis.tokenFilter.get("lemmagen_cs_path_filter");
  assertThat(factory, instanceOf(LemmagenFilterFactory.class));

  // Lexicon loaded from an explicit path: same Czech lemmatization results.
  String source = "Děkuji, že jsi přišel.";
  String[] expected = {"Děkovat", "že", "být", "přijít"};

  Tokenizer tokenizer = new UAX29URLEmailTokenizer();
  tokenizer.setReader(new StringReader(source));
  assertTokenStreamContents(factory.create(tokenizer), expected);
}
@Override
public Tokenizer create() {
  // Fresh tokenizer per call, capped at the configured maximum token length.
  UAX29URLEmailTokenizer result = new UAX29URLEmailTokenizer();
  result.setMaxTokenLength(maxTokenLength);
  return result;
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  // Tokenize, then keep only EMAIL-typed tokens.
  UAX29URLEmailTokenizer source = new UAX29URLEmailTokenizer(newAttributeFactory(), reader);
  return new TokenStreamComponents(source, new EmailFilter(source));
}
@Override
public UAX29URLEmailTokenizer create(Reader input) {
  // Honor the configured match version and maximum token length.
  UAX29URLEmailTokenizer result = new UAX29URLEmailTokenizer(luceneMatchVersion, input);
  result.setMaxTokenLength(maxTokenLength);
  return result;
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  // Tokenize, then keep only EMAIL-typed tokens.
  UAX29URLEmailTokenizer source = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader);
  return new TokenStreamComponents(source, new EmailFilter(source));
}