public void testInvalidOffset() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
      filters = new WordTokenFilter(filters);
      return new TokenStreamComponents(tokenizer, filters);
    }
  };

  assertAnalyzesTo(analyzer, "mosfellsbær",
      new String[] { "mosfellsbaer" },
      new int[]    { 0 },
      new int[]    { 11 });
}
public void testInvalidOffsets() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
      filters = new NGramTokenFilter(filters, 2, 2);
      return new TokenStreamComponents(tokenizer, filters);
    }
  };

  assertAnalyzesTo(analyzer, "mosfellsbær",
      new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
      new int[]    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      new int[]    { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
      new int[]    { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
}
public void testFirstPosInc() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenFilter filter = new MockSynonymFilter(tokenizer);
      StopFilter stopfilter = new StopFilter(Version.LUCENE_4_3, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
      stopfilter.setEnablePositionIncrements(false);
      return new TokenStreamComponents(tokenizer, stopfilter);
    }
  };

  assertAnalyzesTo(analyzer, "the quick brown fox",
      new String[] { "hte", "quick", "brown", "fox" },
      new int[]    { 1, 1, 1, 1 });
}
@Override
public TokenStream create(TokenStream input) {
  return new TokenFilter(input) {
    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        try {
          throw exceptionClass.newInstance();
        } catch (IllegalAccessException iae) {
          throw new RuntimeException(iae);
        } catch (InstantiationException ie) {
          throw new RuntimeException(ie);
        }
      }
      return false;
    }
  };
}
@Override
public List<Annotation> annotate(String text) throws Exception {
  text = SimpleTokenizer.format(text);
  Analyzer analyser = new EnglishAnalyzer(Version.LUCENE_47, CharArraySet.EMPTY_SET);
  TokenFilter filter = new EnglishMinimalStemFilter(analyser.tokenStream("text", new StringReader(text)));
  List<Annotation> out = Lists.newArrayList();
  filter.reset(); // the TokenStream contract requires reset() before the first incrementToken()
  while (filter.incrementToken()) {
    CharTermAttribute az = filter.getAttribute(CharTermAttribute.class);
    OffsetAttribute o = filter.getAttribute(OffsetAttribute.class);
    String token = text.substring(o.startOffset(), o.endOffset());
    String lemma = az.toString();
    Annotation t = new Annotation();
    t.setForm(token);
    t.setLemma(lemma);
    out.add(t);
  }
  if (out.size() == 0) {
    log.debug("Input string is empty");
  }
  filter.end(); // record the final offset state before closing
  filter.close();
  analyser.close();
  return out;
}
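// Hedged sketch (added for illustration, not part of the original source): the standard
// TokenStream consumption contract that annotate() above depends on — reset() before the
// first incrementToken(), end() after the last token, then close(). The analyzer, field
// name and helper name are placeholders assumed here.
private static List<String> collectTokens(Analyzer analyzer, String text) throws IOException {
  List<String> result = new ArrayList<>();
  try (TokenStream ts = analyzer.tokenStream("text", new StringReader(text))) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                       // mandatory before consuming the stream
    while (ts.incrementToken()) {
      result.add(term.toString());
    }
    ts.end();                         // records the final offset/position state
  }                                   // try-with-resources closes the stream
  return result;
}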
public void testFirstPosInc() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenFilter filter = new MockSynonymFilter(tokenizer);
      StopFilter stopfilter = new StopFilter(TEST_VERSION_CURRENT, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
      stopfilter.setEnablePositionIncrements(false);
      return new TokenStreamComponents(tokenizer, stopfilter);
    }
  };

  assertAnalyzesTo(analyzer, "the quick brown fox",
      new String[] { "hte", "quick", "brown", "fox" },
      new int[]    { 1, 1, 1, 1 });
}
public void testInvalidOffsets() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
      filters = new NGramTokenFilter(TEST_VERSION_CURRENT, filters, 2, 2);
      return new TokenStreamComponents(tokenizer, filters);
    }
  };

  assertAnalyzesTo(analyzer, "mosfellsbær",
      new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
      new int[]    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      new int[]    { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
      new int[]    { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
}
public void testFirstPosInc() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenFilter filter = new MockSynonymFilter(tokenizer);
      StopFilter stopfilter = new StopFilter(Version.LUCENE_43, filter, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
      stopfilter.setEnablePositionIncrements(false);
      return new TokenStreamComponents(tokenizer, stopfilter);
    }
  };

  assertAnalyzesTo(analyzer, "the quick brown fox",
      new String[] { "hte", "quick", "brown", "fox" },
      new int[]    { 1, 1, 1, 1 });
}
@Override
public TokenFilter create(TokenStream input) {
  if (luceneMatchVersion == null) {
    return new NGramTokenFilter(input, minGramSize, maxGramSize);
  }
  return new NGramTokenFilter(luceneMatchVersion, input, minGramSize, maxGramSize);
}
@Override
public TokenFilter create(TokenStream input) {
  if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_4_0)) {
    return new HyphenationCompoundWordTokenFilter(input, hyphenator, dictionary,
        minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }
  return new Lucene43HyphenationCompoundWordTokenFilter(input, hyphenator, dictionary,
      minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
@Override
public TokenFilter create(TokenStream input) {
  if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_8_0)) {
    return new WordDelimiterFilter(luceneMatchVersion, input,
        typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
        flags, protectedWords);
  } else {
    return new Lucene47WordDelimiterFilter(input,
        typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
        flags, protectedWords);
  }
}
@Test
public void affixedFilterTest() throws IOException {
  System.out.println("Testing TibAffixedFilter()");
  String input = "དག། གའམ། གའིའོ། དགའ། དགའི། དགའོ། དགའིས། དགའང་། དགའམ། དགའིའོ།";
  Reader reader = new StringReader(input);
  List<String> expected = Arrays.asList("དག", "ག", "ག", "དགའ", "དགའ", "དགའ", "དགའ", "དགའ", "དགའ", "དགའ");
  System.out.print(input + " => ");
  TokenStream syllables = tokenize(reader, new TibSyllableTokenizer());
  TokenFilter res = new TibAffixedFilter(syllables);
  assertTokenStream(res, expected);
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  JiebaTokenizer tokenizer = new JiebaTokenizer();
  if (userDictIn != null) {
    try {
      tokenizer.loadUserDict(userDictIn);
    } catch (IOException e) {
      // preserve the original IOException as the cause for easier diagnosis
      throw new RuntimeException("failed to load user dict", e);
    }
  }
  TokenFilter stopFilter = new JiebaStopTokenFilter(tokenizer);
  return new TokenStreamComponents(tokenizer, stopFilter);
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  final AutocompleteTokenizer tokenizer = new AutocompleteTokenizer(reader);
  TokenFilter filter = new StandardFilter(tokenizer);
  return new TokenStreamComponents(tokenizer, filter);
}
@NotNull
private static Analyzer spellcheckAnalyzer(@NotNull final SpellChecker spellChecker) {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(@NotNull final String field) {
      // Note: "field" is the field name, not the text to analyze. The Reader is supplied later
      // by Analyzer#tokenStream(), so the tokenizer must not be pre-bound to the field name here.
      final Tokenizer source = new WhitespaceTokenizer();
      final SpellCheckerTokenFilter spellCheckFilter =
          new SpellCheckerTokenFilter(defaultTokenFilter(source), spellChecker);
      final TokenFilter concatenatingFilter = new ConcatenatingFilter(spellCheckFilter, ' ');
      return new TokenStreamComponents(source, concatenatingFilter);
    }
  };
}
@NotNull
private static Analyzer concatenatingAnalyzer() {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(@NotNull final String field) {
      // As above, the Reader is injected by Analyzer#tokenStream(); it is not set here from the field name.
      final Tokenizer source = new WhitespaceTokenizer();
      final TokenFilter concatenatingFilter = new ConcatenatingFilter(defaultTokenFilter(source), ' ');
      return new TokenStreamComponents(source, concatenatingFilter);
    }
  };
}
public Tuple2<Double, Multiset<String>> transform(Row row) throws IOException {
  Double label = row.getDouble(1);
  StringReader document = new StringReader(row.getString(0).replaceAll("br2n", ""));
  List<String> wordsList = new ArrayList<>();
  try (BulgarianAnalyzer analyzer = new BulgarianAnalyzer(BULGARIAN_STOP_WORDS_SET)) {
    TokenStream stream = analyzer.tokenStream("words", document);
    TokenFilter lowerFilter = new LowerCaseFilter(stream);
    TokenFilter numbers = new NumberFilter(lowerFilter);
    TokenFilter length = new LengthFilter(numbers, 3, 1000);
    TokenFilter stemmer = new BulgarianStemFilter(length);
    TokenFilter ngrams = new ShingleFilter(stemmer, 2, 3);
    try (TokenFilter filter = ngrams) {
      Attribute termAtt = filter.addAttribute(CharTermAttribute.class);
      filter.reset();
      while (filter.incrementToken()) {
        String word = termAtt.toString().replace(",", "(comma)").replaceAll("\n|\r", "");
        if (word.contains("_")) {
          continue;
        }
        wordsList.add(word);
      }
    }
  }
  Multiset<String> words = ConcurrentHashMultiset.create(wordsList);
  return new Tuple2<>(label, words);
}
public static void main(String[] args) throws IOException {
  System.out.println(NumberUtils.isDigits("12345"));
  System.out.println(NumberUtils.isDigits("12345.1"));
  System.out.println(NumberUtils.isDigits("12345,2"));
  System.out.println(NumberUtils.isNumber("12345"));
  System.out.println(NumberUtils.isNumber("12345.1"));
  System.out.println(NumberUtils.isNumber("12345,2".replace(",", ".")));
  System.out.println(NumberUtils.isNumber("12345,2"));

  StringReader input = new StringReader(
      "Правя тест на класификатор и после др.Дулитъл, пада.br2n ще се оправя с данните! които,са много зле. Но това е по-добре. Но24"
          .replaceAll("br2n", ""));

  LetterTokenizer tokenizer = new LetterTokenizer();
  tokenizer.setReader(input);
  TokenFilter stopFilter = new StopFilter(tokenizer, BULGARIAN_STOP_WORDS_SET);
  TokenFilter length = new LengthFilter(stopFilter, 3, 1000);
  TokenFilter stemmer = new BulgarianStemFilter(length);
  TokenFilter ngrams = new ShingleFilter(stemmer, 2, 2);
  try (TokenFilter filter = ngrams) {
    Attribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();
    while (filter.incrementToken()) {
      String word = termAtt.toString().replaceAll(",", "\\.").replaceAll("\n|\r", "");
      System.out.println(word);
    }
  }
}
private synchronized TokenFilter maybePayload(TokenFilter stream, String fieldName) {
  Integer val = previousMappings.get(fieldName);
  if (val == null) {
    val = -1; // no payloads
    previousMappings.put(fieldName, val); // save it so we are consistent for this field
  }
  return stream;
}
public void testInvalidOffsets() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
      filters = new EdgeNGramTokenFilter(Version.LUCENE_4_3, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
      return new TokenStreamComponents(tokenizer, filters);
    }
  };

  assertAnalyzesTo(analyzer, "mosfellsbær",
      new String[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" },
      new int[]    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      new int[]    { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
}
public void testLucene43() throws IOException {
  TokenFilter filter = new Lucene43NGramTokenFilter(input, 2, 3);
  assertTokenStreamContents(filter,
      new String[] { "ab", "bc", "cd", "de", "abc", "bcd", "cde" },
      new int[]    { 0, 1, 2, 3, 0, 1, 2 },
      new int[]    { 2, 3, 4, 5, 3, 4, 5 },
      null,
      new int[]    { 1, 1, 1, 1, 1, 1, 1 },
      null, null, false);
}
public void testElision() throws Exception {
  String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
  Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory(), new StringReader(test));
  CharArraySet articles = new CharArraySet(asSet("l", "M"), false);
  TokenFilter filter = new ElisionFilter(tokenizer, articles);
  List<String> tas = filter(filter);
  assertEquals("embrouille", tas.get(4));
  assertEquals("O'brian", tas.get(6));
  assertEquals("enfin", tas.get(7));
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory(), reader);
  tokenizer.setMaxTokenLength(Integer.MAX_VALUE); // tokenize arbitrary-length URLs
  TokenFilter filter = new URLFilter(tokenizer);
  return new TokenStreamComponents(tokenizer, filter);
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader input) {
  Tokenizer tokenizer = new MockTokenizer(input);
  if (fieldName.equals("distinctiveFieldName")) {
    TokenFilter tosser = new TokenFilter(tokenizer) {
      @Override
      public boolean incrementToken() throws IOException {
        throw new BadNews("Something is icky.");
      }
    };
    return new TokenStreamComponents(tokenizer, tosser);
  } else {
    return new TokenStreamComponents(tokenizer);
  }
}
private synchronized TokenFilter maybePayload(TokenFilter stream, String fieldName) {
  Integer val = previousMappings.get(fieldName);
  if (val == null) {
    val = -1; // no payloads
    if (rarely(random)) {
      switch (random.nextInt(3)) {
        case 0:
          val = -1; // no payloads
          break;
        case 1:
          val = Integer.MAX_VALUE; // variable length payload
          break;
        case 2:
          val = random.nextInt(12); // fixed length payload
          break;
      }
    }
    if (VERBOSE) {
      if (val == Integer.MAX_VALUE) {
        System.out.println("MockAnalyzer: field=" + fieldName + " gets variable length payloads");
      } else if (val != -1) {
        System.out.println("MockAnalyzer: field=" + fieldName + " gets fixed length=" + val + " payloads");
      }
    }
    previousMappings.put(fieldName, val); // save it so we are consistent for this field
  }
  if (val == -1) {
    return stream;
  } else if (val == Integer.MAX_VALUE) {
    return new MockVariableLengthPayloadFilter(random, stream);
  } else {
    return new MockFixedLengthPayloadFilter(random, stream, val);
  }
}
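// Hedged sketch (an assumption, not taken from MockAnalyzer itself): how the payloads attached
// by MockFixedLengthPayloadFilter / MockVariableLengthPayloadFilter can be inspected through
// PayloadAttribute. The analyzer, field name and sample text are placeholders.
private static void dumpPayloads(Analyzer analyzer, String field, String text) throws IOException {
  try (TokenStream ts = analyzer.tokenStream(field, text)) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      BytesRef payload = payloadAtt.getPayload(); // null when the token carries no payload
      System.out.println(term + (payload == null ? " (no payload)" : " payload bytes=" + payload.length));
    }
    ts.end();
  }
}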
@Test
public void returns_false_when_no_more_tokens() throws IOException {
  try (TokenFilter f = new AnnotatorTokenFilter(new EmptyTokenStream(), annotator)) {
    f.reset();
    assertThat(f.incrementToken()).isFalse();
  }
}
@Test
public void does_not_return_any_token_if_no_accepted_tokens() throws IOException {
  try (Tokenizer tok = new WhitespaceTokenizer();
       TokenFilter f = new AnnotatorTokenFilter(tok, annotator)) {
    tok.setReader(new StringReader(ONE));
    assertTokenInfos(f);
  }
}
@Test
public void returns_accepted_token() throws IOException {
  try (Tokenizer tok = new WhitespaceTokenizer();
       TokenFilter f = new AnnotatorTokenFilter(tok, annotator)) {
    stubAnnotator(ONE);
    tok.setReader(new StringReader(ONE));
    assertTokenInfos(f, new TokenInfo(ONE, 0));
  }
}
@Test
public void returns_all_accepted_tokens() throws IOException {
  try (Tokenizer tok = new WhitespaceTokenizer();
       TokenFilter f = new AnnotatorTokenFilter(tok, annotator)) {
    stubAnnotator(ONE, THREE);
    tok.setReader(new StringReader(ONE_TWO_THREE));
    assertTokenInfos(f, new TokenInfo(ONE, 0), new TokenInfo(THREE, 2));
  }
}
@Test
public void returns_tokens_when_only_accepted_tokens() throws IOException {
  try (Tokenizer tok = new WhitespaceTokenizer();
       TokenFilter f = new AnnotatorTokenFilter(tok, annotator)) {
    stubAnnotator(ONE, TWO);
    tok.setReader(new StringReader(ONE_TWO));
    assertTokenInfos(f, new TokenInfo(ONE, 0), new TokenInfo(TWO, 1));
  }
}
@Test
public void returns_tokens_when_underlying_stream_skips_over_tokens() throws IOException {
  try (Tokenizer tok = new WhitespaceTokenizer();
       TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE), false));
       TokenFilter f = new AnnotatorTokenFilter(stop, annotator)) {
    stubAnnotator(TWO);
    tok.setReader(new StringReader(ONE_TWO));
    assertTokenInfos(f, new TokenInfo(TWO, 1));
  }
}
@Test
public void returns_token_when_underlying_stream_skips_multiple_tokens() throws IOException {
  try (Tokenizer tok = new WhitespaceTokenizer();
       TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE, THREE), false));
       TokenFilter f = new AnnotatorTokenFilter(stop, annotator)) {
    stubAnnotator(TWO, FOUR);
    tok.setReader(new StringReader(ONE_TWO_THREE_FOUR));
    assertTokenInfos(f, new TokenInfo(TWO, 1), new TokenInfo(FOUR, 3));
  }
}
@Test
public void returns_false_when_no_more_tokens() throws IOException {
  try (TokenFilter f = new PreAnnotatedTokenFilter(new EmptyTokenStream(), 1, 2)) {
    f.reset();
    assertThat(f.incrementToken()).isFalse();
  }
}
@Test
public void returns_annotated_token() throws IOException {
  try (Tokenizer tok = new WhitespaceTokenizer();
       TokenFilter f = new PreAnnotatedTokenFilter(tok, 0, 1)) {
    tok.setReader(new StringReader(ONE));
    assertTokenInfos(f, new TokenInfo(ANY_ANNOTATION_TERM, 0, 1), new TokenInfo(ONE, 0));
  }
}
@Test
public void returns_all_annotated_tokens() throws IOException {
  try (Tokenizer tok = new WhitespaceTokenizer();
       TokenFilter f = new PreAnnotatedTokenFilter(tok, 0, 1, 2, 1)) {
    tok.setReader(new StringReader(ONE_TWO_THREE));
    assertTokenInfos(f,
        new TokenInfo(ANY_ANNOTATION_TERM, 0, 1), new TokenInfo(ONE, 0),
        new TokenInfo(ANY_ANNOTATION_TERM, 2, 1), new TokenInfo(THREE, 2));
  }
}