public static void main(String[] args) throws IOException {
    // Demonstrate NumberUtils parsing rules: isDigits accepts only pure digit
    // strings; isNumber additionally accepts a decimal point, but not a comma.
    System.out.println(NumberUtils.isDigits("12345"));
    System.out.println(NumberUtils.isDigits("12345.1"));
    System.out.println(NumberUtils.isDigits("12345,2"));
    System.out.println(NumberUtils.isNumber("12345"));
    System.out.println(NumberUtils.isNumber("12345.1"));
    System.out.println(NumberUtils.isNumber("12345,2".replace(",", ".")));
    System.out.println(NumberUtils.isNumber("12345,2"));

    // Bulgarian sample text; the literal "br2n" artifacts are stripped before
    // analysis. Plain replace() is used — no regex semantics are needed here.
    StringReader input = new StringReader(
            "Правя тест на класификатор и после др.Дулитъл, пада.br2n ще се оправя с данните! които,са много зле. Но това е по-добре. Но24"
                    .replace("br2n", ""));

    // Analysis chain: letter tokenizer -> Bulgarian stop-word removal ->
    // length filter (3..1000 chars) -> Bulgarian stemmer -> 2-word shingles.
    LetterTokenizer tokenizer = new LetterTokenizer();
    tokenizer.setReader(input);
    TokenFilter stopFilter = new StopFilter(tokenizer, BULGARIAN_STOP_WORDS_SET);
    TokenFilter length = new LengthFilter(stopFilter, 3, 1000);
    TokenFilter stemmer = new BulgarianStemFilter(length);
    TokenFilter ngrams = new ShingleFilter(stemmer, 2, 2);

    // try-with-resources closes the whole chain via the outermost filter.
    try (TokenFilter filter = ngrams) {
        Attribute termAtt = filter.addAttribute(CharTermAttribute.class);
        filter.reset();
        while (filter.incrementToken()) {
            // Commas become dots; CR/LF characters are removed entirely.
            String word = termAtt.toString()
                    .replace(",", ".")
                    .replace("\n", "")
                    .replace("\r", "");
            System.out.println(word);
        }
        // TokenStream contract: end() must be called once incrementToken()
        // returns false, before close(). The original omitted this.
        filter.end();
    }
}
public void testCrossPlaneNormalization() throws IOException { Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory(), reader) { @Override protected int normalize(int c) { if (c > 0xffff) { return 'δ'; } else { return c; } } }; return new TokenStreamComponents(tokenizer, tokenizer); } }; int num = 1000 * RANDOM_MULTIPLIER; for (int i = 0; i < num; i++) { String s = TestUtil.randomUnicodeString(random()); TokenStream ts = analyzer.tokenStream("foo", s); try { ts.reset(); OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); while (ts.incrementToken()) { String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset()); for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) { cp = highlightedText.codePointAt(j); assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp)); } } ts.end(); } finally { IOUtils.closeWhileHandlingException(ts); } } // just for fun checkRandomData(random(), analyzer, num); }
public void testCrossPlaneNormalization2() throws IOException { Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory(), reader) { @Override protected int normalize(int c) { if (c <= 0xffff) { return 0x1043C; } else { return c; } } }; return new TokenStreamComponents(tokenizer, tokenizer); } }; int num = 1000 * RANDOM_MULTIPLIER; for (int i = 0; i < num; i++) { String s = TestUtil.randomUnicodeString(random()); TokenStream ts = analyzer.tokenStream("foo", s); try { ts.reset(); OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); while (ts.incrementToken()) { String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset()); for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) { cp = highlightedText.codePointAt(j); assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp)); } } ts.end(); } finally { IOUtils.closeWhileHandlingException(ts); } } // just for fun checkRandomData(random(), analyzer, num); }
public void testCrossPlaneNormalization() throws IOException { Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader) { @Override protected int normalize(int c) { if (c > 0xffff) { return 'δ'; } else { return c; } } }; return new TokenStreamComponents(tokenizer, tokenizer); } }; int num = 1000 * RANDOM_MULTIPLIER; for (int i = 0; i < num; i++) { String s = _TestUtil.randomUnicodeString(random()); TokenStream ts = analyzer.tokenStream("foo", new StringReader(s)); ts.reset(); OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); while (ts.incrementToken()) { String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset()); for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) { cp = highlightedText.codePointAt(j); assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp)); } } ts.end(); ts.close(); } // just for fun checkRandomData(random(), analyzer, num); }
public void testCrossPlaneNormalization2() throws IOException { Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader) { @Override protected int normalize(int c) { if (c <= 0xffff) { return 0x1043C; } else { return c; } } }; return new TokenStreamComponents(tokenizer, tokenizer); } }; int num = 1000 * RANDOM_MULTIPLIER; for (int i = 0; i < num; i++) { String s = _TestUtil.randomUnicodeString(random()); TokenStream ts = analyzer.tokenStream("foo", new StringReader(s)); ts.reset(); OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); while (ts.incrementToken()) { String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset()); for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) { cp = highlightedText.codePointAt(j); assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp)); } } ts.end(); ts.close(); } // just for fun checkRandomData(random(), analyzer, num); }
public void testCrossPlaneNormalization() throws IOException { Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader) { @Override protected int normalize(int c) { if (c > 0xffff) { return 'δ'; } else { return c; } } }; return new TokenStreamComponents(tokenizer, tokenizer); } }; int num = 1000 * RANDOM_MULTIPLIER; for (int i = 0; i < num; i++) { String s = _TestUtil.randomUnicodeString(random()); TokenStream ts = analyzer.tokenStream("foo", s); try { ts.reset(); OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); while (ts.incrementToken()) { String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset()); for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) { cp = highlightedText.codePointAt(j); assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp)); } } ts.end(); } finally { IOUtils.closeWhileHandlingException(ts); } } // just for fun checkRandomData(random(), analyzer, num); }
public void testCrossPlaneNormalization2() throws IOException { Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader) { @Override protected int normalize(int c) { if (c <= 0xffff) { return 0x1043C; } else { return c; } } }; return new TokenStreamComponents(tokenizer, tokenizer); } }; int num = 1000 * RANDOM_MULTIPLIER; for (int i = 0; i < num; i++) { String s = _TestUtil.randomUnicodeString(random()); TokenStream ts = analyzer.tokenStream("foo", s); try { ts.reset(); OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); while (ts.incrementToken()) { String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset()); for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) { cp = highlightedText.codePointAt(j); assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp)); } } ts.end(); } finally { IOUtils.closeWhileHandlingException(ts); } } // just for fun checkRandomData(random(), analyzer, num); }
@Override
public Tokenizer create() {
    // Factory hook: hand back a fresh letter-based tokenizer instance.
    LetterTokenizer tokenizer = new LetterTokenizer();
    return tokenizer;
}
@Override
public LetterTokenizer create(Reader input) {
    // Build a tokenizer over the given reader, bound to the factory's
    // configured Lucene match version.
    LetterTokenizer tokenizer = new LetterTokenizer(luceneMatchVersion, input);
    return tokenizer;
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    // Tokenize on letters, then substitute each token with its Metaphone
    // phonetic replacement.
    Tokenizer letters = new LetterTokenizer(reader);
    return new MetaphoneReplacementFilter(letters);
}