/**
 * Reads shingle configuration from the factory args and validates it.
 * <p>
 * Both {@code minShingleSize} and {@code maxShingleSize} must be at least 2,
 * and min must not exceed max; otherwise an {@link IllegalArgumentException}
 * is thrown at init time rather than failing later during analysis.
 */
@Override
public void init(Map<String, String> args) {
  super.init(args);

  maxShingleSize = getInt("maxShingleSize", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
  if (maxShingleSize < 2) {
    throw new IllegalArgumentException("Invalid maxShingleSize (" + maxShingleSize
        + ") - must be at least 2");
  }

  minShingleSize = getInt("minShingleSize", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
  if (minShingleSize < 2) {
    throw new IllegalArgumentException("Invalid minShingleSize (" + minShingleSize
        + ") - must be at least 2");
  }
  if (minShingleSize > maxShingleSize) {
    throw new IllegalArgumentException("Invalid minShingleSize (" + minShingleSize
        + ") - must be no greater than maxShingleSize (" + maxShingleSize + ")");
  }

  outputUnigrams = getBoolean("outputUnigrams", true);
  outputUnigramsIfNoShingles = getBoolean("outputUnigramsIfNoShingles", false);
  // getOrDefault matches the original containsKey/get pair, including a null mapping.
  tokenSeparator = args.getOrDefault("tokenSeparator", ShingleFilter.TOKEN_SEPARATOR);
}
/**
 * Builds a shingle token-filter factory from index settings.
 * <p>
 * Reads {@code min_shingle_size}/{@code max_shingle_size} (Lucene defaults),
 * the unigram output flags (lenient boolean parsing for pre-6.0 indices),
 * and the token separator / filler token strings, then captures them in an
 * immutable {@code Factory} named {@code "shingle"}.
 */
public ShingleTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    final int maxSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
    final int minSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
    final boolean unigrams = settings.getAsBooleanLenientForPreEs6Indices(
            indexSettings.getIndexVersionCreated(), "output_unigrams", true, deprecationLogger);
    final boolean unigramsIfNoShingles = settings.getAsBooleanLenientForPreEs6Indices(
            indexSettings.getIndexVersionCreated(), "output_unigrams_if_no_shingles", false, deprecationLogger);
    final String separator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
    final String filler = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
    factory = new Factory("shingle", minSize, maxSize, unigrams, unigramsIfNoShingles, separator, filler);
}
/**
 * Wraps the given stream in a {@link ShingleFilter} configured with this
 * factory's shingle sizes, unigram flags, token separator and filler token.
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    final ShingleFilter shingles = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
    shingles.setOutputUnigrams(outputUnigrams);
    shingles.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    shingles.setTokenSeparator(tokenSeparator);
    shingles.setFillerToken(fillerToken);
    return shingles;
}
/**
 * Consumes the entire upstream once, grouping terms into position buckets,
 * then emits each generated phrase as a single token, one per call.
 * <p>
 * Phase 1 (first call only): drains {@code input}; terms with a position
 * increment of 0 are appended to the previous bucket (synonym stacking),
 * otherwise a new bucket is started. Buckets accumulate in {@code words}.
 * Phase 2: {@code makePhrases} expands the buckets into {@code phrases};
 * each subsequent call pops one phrase, restores the previously captured
 * attribute state, and returns it typed as a shingle.
 * <p>
 * NOTE(review): assumes a position-increment-0 token never arrives first —
 * {@code words.removeLast()} on an empty deque would throw; confirm upstream
 * guarantees this. The {@code concat} flag is reset when all phrases are
 * emitted, so the filter can be reused after {@code reset()}.
 */
@Override public final boolean incrementToken() throws IOException { //TODO make sure this works with synonyms and stop words
  int i = 0;
  while (input.incrementToken()) {
    String term = new String(termAttr.buffer(), 0, termAttr.length());
    // posIncr > 0 starts a new position bucket; posIncr == 0 stacks onto the last one.
    List<String> word = posIncrAttr.getPositionIncrement() > 0 ? new ArrayList<String>() : words.removeLast();
    word.add(term);
    words.add(word);
    i++;
  }
  // now write out as a single token
  if (! concat) {
    makePhrases(words, phrases, 0);
    concat = true;
  }
  while (phrases.size() > 0) {
    String phrase = phrases.removeFirst();
    restoreState(current);
    clearAttributes();
    termAttr.setEmpty();
    termAttr.append(phrase);
    termAttr.setLength(phrase.length());
    //posIncrAttr.setPositionIncrement(0);
    typeAtt.setType(ShingleFilter.DEFAULT_TOKEN_TYPE);//"shingle"
    current = captureState();
    return true;
  }
  concat = false;
  return false;
}
/**
 * Guice-injected constructor (legacy Elasticsearch wiring): resolves the
 * shingle settings ({@code min_shingle_size}/{@code max_shingle_size},
 * unigram flags, separator and filler token) and stores them in an
 * immutable {@code Factory} named {@code "shingle"}.
 */
@Inject
public ShingleTokenFilterFactory(Index index, IndexSettingsService indexSettingsService,
                                 @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    final int maxSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
    final int minSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
    final boolean unigrams = settings.getAsBoolean("output_unigrams", true);
    final boolean unigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
    final String separator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
    final String filler = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
    factory = new Factory("shingle", minSize, maxSize, unigrams, unigramsIfNoShingles, separator, filler);
}
/**
 * Creates an analyzer that whitespace-tokenizes input and emits shingles
 * (word n-grams) up to {@code maxShingles} terms long, unigrams included.
 *
 * @param maxShingles maximum shingle size passed to {@link ShingleFilter}
 * @return a reusable {@link Analyzer} producing shingled token streams
 */
@NotNull
private static Analyzer createShingleAnalyzer(final int maxShingles) {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(@NotNull final String fieldName) {
            // BUG FIX: the parameter here is the field NAME, not the text to analyze.
            // The original called source.setReader(new StringReader(field)), wrongly
            // treating the name as content; Analyzer.tokenStream() supplies the real
            // reader itself, so no setReader call belongs in createComponents.
            final Tokenizer source = new WhitespaceTokenizer();
            final ShingleFilter shingleFilter = new ShingleFilter(defaultTokenFilter(source), maxShingles);
            shingleFilter.setOutputUnigrams(true);
            return new TokenStreamComponents(source, shingleFilter);
        }
    };
}
/**
 * Converts one row into a (label, bag-of-terms) pair.
 * <p>
 * Column 1 supplies the numeric label; column 0 is cleaned of the literal
 * marker {@code "br2n"}, then run through a Bulgarian analysis chain
 * (lowercase, number filter, length 3-1000, Bulgarian stemmer) and shingled
 * into 2-3 grams. Commas in terms become {@code "(comma)"}, newlines are
 * stripped, and terms containing {@code '_'} (filler shingles) are skipped.
 *
 * @param row input row: string document at index 0, double label at index 1
 * @return tuple of label and a multiset of surviving terms
 * @throws IOException if the token stream fails
 */
public Tuple2<Double, Multiset<String>> transform(Row row) throws IOException {
    final Double label = row.getDouble(1);
    final StringReader document = new StringReader(row.getString(0).replaceAll("br2n", ""));
    final List<String> extracted = new ArrayList<>();
    try (BulgarianAnalyzer analyzer = new BulgarianAnalyzer(BULGARIAN_STOP_WORDS_SET)) {
        TokenStream stream = analyzer.tokenStream("words", document);
        // Build the filter chain step by step rather than one variable per stage.
        TokenFilter chain = new LowerCaseFilter(stream);
        chain = new NumberFilter(chain);
        chain = new LengthFilter(chain, 3, 1000);
        chain = new BulgarianStemFilter(chain);
        try (TokenFilter shingles = new ShingleFilter(chain, 2, 3)) {
            final Attribute term = shingles.addAttribute(CharTermAttribute.class);
            shingles.reset();
            while (shingles.incrementToken()) {
                final String token = term.toString().replace(",", "(comma)").replaceAll("\n|\r", "");
                if (!token.contains("_")) {
                    extracted.add(token);
                }
            }
        }
    }
    return new Tuple2<>(label, ConcurrentHashMultiset.create(extracted));
}
/**
 * Ad-hoc demo: prints NumberUtils digit/number parsing results for a few
 * literals, then tokenizes a Bulgarian sample sentence through a
 * stop-word / length / stemmer / 2-gram shingle chain and prints each term
 * with commas escaped and newlines stripped.
 */
public static void main(String[] args) throws IOException {
    // Digit vs. number recognition, with and without locale-style commas.
    System.out.println(NumberUtils.isDigits("12345"));
    System.out.println(NumberUtils.isDigits("12345.1"));
    System.out.println(NumberUtils.isDigits("12345,2"));
    System.out.println(NumberUtils.isNumber("12345"));
    System.out.println(NumberUtils.isNumber("12345.1"));
    System.out.println(NumberUtils.isNumber("12345,2".replace(",", ".")));
    System.out.println(NumberUtils.isNumber("12345,2"));

    final String sample =
            "Правя тест на класификатор и после др.Дулитъл, пада.br2n ще се оправя с данните! които,са много зле. Но това е по-добре. Но24"
                    .replaceAll("br2n", "");
    final LetterTokenizer tokenizer = new LetterTokenizer();
    tokenizer.setReader(new StringReader(sample));

    // Chain: stop words -> length 3-1000 -> Bulgarian stemmer -> bigram shingles.
    TokenFilter chain = new StopFilter(tokenizer, BULGARIAN_STOP_WORDS_SET);
    chain = new LengthFilter(chain, 3, 1000);
    chain = new BulgarianStemFilter(chain);
    try (TokenFilter shingles = new ShingleFilter(chain, 2, 2)) {
        final Attribute term = shingles.addAttribute(CharTermAttribute.class);
        shingles.reset();
        while (shingles.incrementToken()) {
            System.out.println(term.toString().replaceAll(",", "\\.").replaceAll("\n|\r", ""));
        }
    }
}
/**
 * Tests ShingleFilter producing up to six-term shingles over six input terms,
 * with PositionFilter collapsing every position increment after the first to
 * zero.
 *
 * @throws Exception if the token stream assertion fails
 */
public void test6GramFilterNoPositions() throws Exception {
  final ShingleFilter shingles = new ShingleFilter(new TestTokenStream(TEST_TOKEN), 6);
  final TokenStream flattened = new PositionFilter(shingles);
  assertTokenStreamContents(flattened,
      SIX_GRAM_NO_POSITIONS_TOKENS, SIX_GRAM_NO_POSITIONS_INCREMENTS);
}
/**
 * Builds a {@link ShingleFilter} over {@code input} using this factory's
 * shingle sizes, unigram flags and token separator.
 * <p>
 * NOTE(review): no filler token is applied here, unlike the other shingle
 * factory in this file — confirm whether omitting setFillerToken(...) is
 * intentional for this class.
 */
@Override
public ShingleFilter create(TokenStream input) {
  final ShingleFilter shingles = new ShingleFilter(input, minShingleSize, maxShingleSize);
  shingles.setOutputUnigrams(outputUnigrams);
  shingles.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
  shingles.setTokenSeparator(tokenSeparator);
  return shingles;
}
/**
 * Analysis chain: standard tokenizer, lowercasing, then shingles between
 * this analyzer's configured minimum and maximum sizes.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer tokenizer = new StandardTokenizer();
  final TokenStream lowercased = new LowerCaseFilter(tokenizer);
  final TokenStream shingled = new ShingleFilter(lowercased, this.minShingleSize, this.maxShingleSize);
  return new TokenStreamComponents(tokenizer, shingled);
}
@Override public final boolean incrementToken() throws IOException { if (done) return false; done = true; buf.setLength(0); boolean firstTerm = true; while (input.incrementToken()) { if (!firstTerm) { buf.append(separator); } //TODO consider indexing special chars when posInc > 1 (stop words). We ignore for now. #13 buf.append(termAtt); firstTerm = false; } input.end();//call here so we can see end of stream offsets termAtt.setEmpty().append(buf); //Setting the other attributes ultimately won't have much effect but lets be thorough offsetAtt.setOffset(0, offsetAtt.endOffset()); posIncrAtt.setPositionIncrement(1); posLenAtt.setPositionLength(1);//or do we add up the positions? Probably not used any way. typeAtt.setType(ShingleFilter.DEFAULT_TOKEN_TYPE);//"shingle" return true; }
/**
 * Creates a factory with Lucene's shingle defaults: min/max shingle size
 * from {@link ShingleFilter}, unigrams output enabled, unigrams-if-no-shingles
 * disabled, and the default token separator and filler token.
 *
 * @param name the registered name of this token filter
 */
public Factory(String name) {
  this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
      true, false, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, ShingleFilter.DEFAULT_FILLER_TOKEN);
}
public static List<String> getNgrams(String text, int N) throws IOException { List<String> tokens = new ArrayList<String>(); Reader reader = new StringReader(text); // Tokenizer //StandardTokenizer tokenizer = new StandardTokenizer(Version.LUCENE_46, reader); LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_46, reader); // Filters LowerCaseFilter lowerCaseFilter = new LowerCaseFilter(Version.LUCENE_46, tokenizer); KStemFilter kStemFilter = new KStemFilter(lowerCaseFilter); CharArraySet stopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET; StopFilter stopFilter = new StopFilter(Version.LUCENE_46, kStemFilter, stopwords); TokenStream ts; if(N > 1) { PositionFilter positionFilter = new PositionFilter(stopFilter); //@SuppressWarnings("resource") //ShingleFilter shingleFilter = new ShingleFilter(positionFilter, N, N); //shingleFilter.setOutputUnigrams(false); @SuppressWarnings("resource") ShingleFilter shingleFilter = new ShingleFilter(positionFilter, 2, N); shingleFilter.setOutputUnigrams(true); ts = shingleFilter; } else { ts = stopFilter; } CharTermAttribute charTermAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); while (ts.incrementToken()) { String token = charTermAtt.toString(); if(token.length()>1) tokens.add(token); } ts.end(); ts.close(); return tokens; }