@Override
public void inform(ResourceLoader loader) throws IOException {
  final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);

  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader) : factory.create(reader);
      TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer) : tokenizer;
      return new TokenStreamComponents(tokenizer, stream);
    }
  };

  try {
    String formatClass = format;
    if (format == null || format.equals("solr")) {
      formatClass = SolrSynonymParser.class.getName();
    } else if (format.equals("wordnet")) {
      formatClass = WordnetSynonymParser.class.getName();
    }
    // TODO: expose dedup as a parameter?
    map = loadSynonyms(loader, formatClass, true, analyzer);
  } catch (ParseException e) {
    throw new IOException("Error parsing synonyms file:", e);
  }
}
@Override
public void inform(ResourceLoader loader) throws IOException {
  final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);

  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_43, reader) : factory.create(reader);
      TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_43, tokenizer) : tokenizer;
      return new TokenStreamComponents(tokenizer, stream);
    }
  };

  try {
    String formatClass = format;
    if (format == null || format.equals("solr")) {
      formatClass = SolrSynonymParser.class.getName();
    } else if (format.equals("wordnet")) {
      formatClass = WordnetSynonymParser.class.getName();
    }
    // TODO: expose dedup as a parameter?
    map = loadSynonyms(loader, formatClass, true, analyzer);
  } catch (ParseException e) {
    throw new IOException("Error parsing synonyms file:", e);
  }
}
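// Illustrative sketch, not the factory's actual loadSynonyms implementation:
// parsing one inline Solr-format rule with SolrSynonymParser (the 4.x
// SynonymMap.Parser API) to show what the analyzer built above is used for.
// The rule text is invented; dedup and expand are both true here.
// Requires: java.io.StringReader, java.text.ParseException,
// org.apache.lucene.analysis.synonym.SolrSynonymParser,
// org.apache.lucene.analysis.synonym.SynonymMap.
static SynonymMap parseInlineRule(Analyzer analyzer) throws IOException, ParseException {
  SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
  parser.parse(new StringReader("huge, gigantic, enormous"));
  return parser.build();
}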
/**
 * @deprecated Use {@link #ThaiWordFilter(TokenStream)}
 */
@Deprecated
public ThaiWordFilter(Version matchVersion, TokenStream input) {
  super(matchVersion.onOrAfter(Version.LUCENE_3_1) ? input : new LowerCaseFilter(matchVersion, input));
  if (!DBBI_AVAILABLE)
    throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
  handlePosIncr = matchVersion.onOrAfter(Version.LUCENE_3_1);
}
@Override
public LowerCaseFilter create(TokenStream input) {
  if (luceneMatchVersion == null) {
    return new LowerCaseFilter(input);
  }
  return new LowerCaseFilter(luceneMatchVersion, input);
}
@Override
public TokenStream create(TokenStream tokenStream) {
  if (lang == null) {
    return new LowerCaseFilter(tokenStream);
  } else if (lang.equalsIgnoreCase("greek")) {
    return new GreekLowerCaseFilter(tokenStream);
  } else if (lang.equalsIgnoreCase("irish")) {
    return new IrishLowerCaseFilter(tokenStream);
  } else if (lang.equalsIgnoreCase("turkish")) {
    return new TurkishLowerCaseFilter(tokenStream);
  } else {
    throw new IllegalArgumentException("language [" + lang + "] not supported for lower case");
  }
}
public Tuple2<Double, Multiset<String>> transform(Row row) throws IOException {
  Double label = row.getDouble(1);
  StringReader document = new StringReader(row.getString(0).replaceAll("br2n", ""));
  List<String> wordsList = new ArrayList<>();
  try (BulgarianAnalyzer analyzer = new BulgarianAnalyzer(BULGARIAN_STOP_WORDS_SET)) {
    TokenStream stream = analyzer.tokenStream("words", document);
    TokenFilter lowerFilter = new LowerCaseFilter(stream);
    TokenFilter numbers = new NumberFilter(lowerFilter);
    TokenFilter length = new LengthFilter(numbers, 3, 1000);
    TokenFilter stemmer = new BulgarianStemFilter(length);
    TokenFilter ngrams = new ShingleFilter(stemmer, 2, 3);
    try (TokenFilter filter = ngrams) {
      CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
      filter.reset();
      while (filter.incrementToken()) {
        String word = termAtt.toString().replace(",", "(comma)").replaceAll("\n|\r", "");
        if (word.contains("_")) {
          continue;
        }
        wordsList.add(word);
      }
    }
  }
  Multiset<String> words = ConcurrentHashMultiset.create(wordsList);
  return new Tuple2<>(label, words);
}
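// Illustrative sketch, not part of the original source: exercising transform(Row)
// with a hand-built Spark SQL Row (RowFactory.create is the standard Java-side
// constructor). The sample text and label are invented; "br2n" is the marker the
// method strips before tokenizing.
// Requires: org.apache.spark.sql.Row, org.apache.spark.sql.RowFactory,
// scala.Tuple2, com.google.common.collect.Multiset.
void transformDemo() throws IOException {
  Row row = RowFactory.create("Някакъв примерен текст br2n за анализ", 1.0);
  Tuple2<Double, Multiset<String>> labeled = transform(row);
  System.out.println(labeled._1() + " -> " + labeled._2());
}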
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  Tokenizer t = new MockTokenizer(reader, MockTokenizer.KEYWORD, false);
  return new TokenStreamComponents(t, new GermanStemFilter(new LowerCaseFilter(t)));
}
public void testMultipleSources() throws Exception {
  final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
  final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
  final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
  tee1.reset();
  final TokenStream source1 = new CachingTokenFilter(tee1);

  tee1.addAttribute(CheckClearAttributesAttribute.class);
  dogDetector.addAttribute(CheckClearAttributesAttribute.class);
  theDetector.addAttribute(CheckClearAttributesAttribute.class);

  MockTokenizer tokenizer = new MockTokenizer(tee1.getAttributeFactory(), new StringReader(buffer2.toString()), MockTokenizer.WHITESPACE, false);
  final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(tokenizer);
  tee2.addSinkTokenStream(dogDetector);
  tee2.addSinkTokenStream(theDetector);
  final TokenStream source2 = tee2;

  assertTokenStreamContents(source1, tokens1);
  assertTokenStreamContents(source2, tokens2);

  assertTokenStreamContents(theDetector, new String[]{"The", "the", "The", "the"});
  assertTokenStreamContents(dogDetector, new String[]{"Dogs", "Dogs"});

  source1.reset();
  TokenStream lowerCasing = new LowerCaseFilter(source1);
  String[] lowerCaseTokens = new String[tokens1.length];
  for (int i = 0; i < tokens1.length; i++) {
    lowerCaseTokens[i] = tokens1[i].toLowerCase(Locale.ROOT);
  }
  assertTokenStreamContents(lowerCasing, lowerCaseTokens);
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  called = true;
  Tokenizer source = new WhitespaceTokenizer();
  return new TokenStreamComponents(source, new LowerCaseFilter(source));
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final LowerCaseTokenizer src = new LowerCaseTokenizer();
  final TokenStream tok = new LowerCaseFilter(src);
  return new TokenStreamComponents(src, tok);
}
protected static Analyzer getAnalyzer(final boolean ignoreCase) {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(final String fieldName) {
      final Tokenizer tokenizer = new KeywordTokenizer();
      final TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
}
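// Illustrative sketch, not from the original source: consuming the analyzer built
// by getAnalyzer(true) through the standard reset/incrementToken/end loop. The
// field name and sample text are invented; with ignoreCase=true the single
// keyword token comes out lowercased.
// Requires: org.apache.lucene.analysis.TokenStream,
// org.apache.lucene.analysis.tokenattributes.CharTermAttribute.
static void printTokens() throws IOException {
  Analyzer analyzer = getAnalyzer(true);
  try (TokenStream ts = analyzer.tokenStream("field", "Hello World")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // prints "hello world" as one token
    }
    ts.end();
  }
}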
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  Tokenizer tokenizer;
  if (DocumentIndex.FIELD__KEYWORD.equals(fieldName)) {
    tokenizer = new NullTokenizer(reader);
  } else {
    tokenizer = new LetterOrDigitTokenizer(reader);
  }
  return new TokenStreamComponents(tokenizer, new LowerCaseFilter(Version.LUCENE_40, tokenizer));
}
@SuppressWarnings("resource") @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer tokenizer = new LemmatizingTokenizer(); TokenStream stream = new LowerCaseFilter(tokenizer); // stream = new KeywordRepeatFilter(stream); stream = new LemmaTokenFilter(stream, true); return new TokenStreamComponents(tokenizer, stream); }
@SuppressWarnings("resource") @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer tokenizer = new LemmatizingTokenizer(); TokenStream stream = new LowerCaseFilter(tokenizer); stream = new LemmaTokenFilter(stream, false); return new TokenStreamComponents(tokenizer, stream); }
@Override
public void inform(ResourceLoader loader) throws IOException {
  final boolean ignoreCase = getBoolean("ignoreCase", false);
  this.ignoreCase = ignoreCase;

  String tf = args.get("tokenizerFactory");
  final TokenizerFactory factory = tf == null ? null : loadTokenizerFactory(loader, tf);

  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_31, reader) : factory.create(reader);
      TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_31, tokenizer) : tokenizer;
      return new TokenStreamComponents(tokenizer, stream);
    }
  };

  String format = args.get("format");
  try {
    if (format == null || format.equals("solr")) {
      // TODO: expose dedup as a parameter?
      map = loadSolrSynonyms(loader, true, analyzer);
    } else if (format.equals("wordnet")) {
      map = loadWordnetSynonyms(loader, true, analyzer);
    } else {
      // TODO: somehow make this more pluggable
      throw new IllegalArgumentException("Unrecognized synonyms format: " + format);
    }
  } catch (ParseException e) {
    throw new IOException("Exception thrown while loading synonyms", e);
  }
}
/** Creates a new ThaiWordFilter with the specified match version. */
public ThaiWordFilter(Version matchVersion, TokenStream input) {
  super(matchVersion.onOrAfter(Version.LUCENE_31) ? input : new LowerCaseFilter(matchVersion, input));
  if (!DBBI_AVAILABLE)
    throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
  handlePosIncr = matchVersion.onOrAfter(Version.LUCENE_31);
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  Tokenizer t = new KeywordTokenizer(reader);
  return new TokenStreamComponents(t, new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, t)));
}
public void testMultipleSources() throws Exception {
  final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
  final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
  final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
  tee1.reset();
  final TokenStream source1 = new CachingTokenFilter(tee1);

  tee1.addAttribute(CheckClearAttributesAttribute.class);
  dogDetector.addAttribute(CheckClearAttributesAttribute.class);
  theDetector.addAttribute(CheckClearAttributesAttribute.class);

  final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.toString()), MockTokenizer.WHITESPACE, false));
  tee2.addSinkTokenStream(dogDetector);
  tee2.addSinkTokenStream(theDetector);
  final TokenStream source2 = tee2;

  assertTokenStreamContents(source1, tokens1);
  assertTokenStreamContents(source2, tokens2);

  assertTokenStreamContents(theDetector, new String[]{"The", "the", "The", "the"});
  assertTokenStreamContents(dogDetector, new String[]{"Dogs", "Dogs"});

  source1.reset();
  TokenStream lowerCasing = new LowerCaseFilter(TEST_VERSION_CURRENT, source1);
  String[] lowerCaseTokens = new String[tokens1.length];
  for (int i = 0; i < tokens1.length; i++) {
    lowerCaseTokens[i] = tokens1[i].toLowerCase(Locale.ROOT);
  }
  assertTokenStreamContents(lowerCasing, lowerCaseTokens);
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
  TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
  return new TokenStreamComponents(source, new GalicianStemFilter(result));
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
  TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
  return new TokenStreamComponents(source, new PortugueseLightStemFilter(result));
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
  TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
  return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(result));
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
  TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
  return new TokenStreamComponents(source, new PortugueseStemFilter(result));
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  Tokenizer source = new WhitespaceTokenizer(version, reader);
  TokenStream filter = new LowerCaseFilter(version, source);
  filter = new InlineAnnotationFilter(filter);
  return new TokenStreamComponents(source, filter);
}
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer source = new StandardTokenizer();
  TokenStream src = new LowerCaseFilter(source);
  ShingleFilter filter = new ShingleFilter(src, this.minShingleSize, this.maxShingleSize);
  return new TokenStreamComponents(source, filter);
}
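// Illustrative sketch, not from the original source: the same
// StandardTokenizer -> LowerCaseFilter -> ShingleFilter chain wired by hand with
// assumed sizes 2..3, to show what the shingle analyzer above emits. By default
// ShingleFilter outputs the unigrams as well as the shingles.
// Requires: org.apache.lucene.analysis.TokenStream,
// org.apache.lucene.analysis.shingle.ShingleFilter,
// org.apache.lucene.analysis.tokenattributes.CharTermAttribute.
static void shingleDemo() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new StandardTokenizer();
      TokenStream sink = new ShingleFilter(new LowerCaseFilter(source), 2, 3);
      return new TokenStreamComponents(source, sink);
    }
  };
  try (TokenStream ts = analyzer.tokenStream("f", "The Quick Brown Fox")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // "the", "the quick", "the quick brown", "quick", "quick brown", ...
      System.out.println(term.toString());
    }
    ts.end();
  }
}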
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer source = new StandardTokenizer();
  TokenStream src = new LowerCaseFilter(source);
  src = new AddWordBoundaryFilter(src);
  NGramTokenFilter filter = new NGramTokenFilter(src, this.minShingleSize, this.maxShingleSize);
  return new TokenStreamComponents(source, filter);
}
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
  final LowerCaseTokenizer src = new LowerCaseTokenizer();
  final TokenStream tok = new LowerCaseFilter(src);
  return new TokenStreamComponents(src, tok);
}
public static Analyzer getAnalyzer(final boolean ignoreCase) {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
      final Tokenizer tokenizer = new KeywordTokenizer(reader);
      @SuppressWarnings("resource")
      final TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
}