/**
 * Converts one row into a (label, bag-of-terms) pair.
 *
 * <p>The label is read from column 1 as a double and the document text from
 * column 0 as a string. Every literal occurrence of {@code "br2n"} is removed
 * from the text before analysis. The text is then tokenized by a
 * {@link BulgarianAnalyzer} built with {@code BULGARIAN_STOP_WORDS_SET} and
 * passed through, in order: {@code LowerCaseFilter}, {@code NumberFilter},
 * {@code LengthFilter(3, 1000)}, {@code BulgarianStemFilter} and
 * {@code ShingleFilter(2, 3)}.
 *
 * <p>Each emitted term has commas replaced by the literal text
 * {@code "(comma)"} and all {@code \n} / {@code \r} characters removed. Terms
 * containing an underscore are dropped.
 *
 * @param row the input row; column 0 must hold the document text and column 1 the label
 * @return a tuple of the label and a multiset counting every retained term
 * @throws IOException if the token stream fails while being read
 */
public Tuple2<Double, Multiset<String>> transform(Row row) throws IOException {
    Double label = row.getDouble(1);
    StringReader document = new StringReader(row.getString(0).replaceAll("br2n", ""));
    Multiset<String> words = ConcurrentHashMultiset.create();

    try (BulgarianAnalyzer analyzer = new BulgarianAnalyzer(BULGARIAN_STOP_WORDS_SET)) {
        TokenStream stream = analyzer.tokenStream("words", document);
        TokenFilter lowerFilter = new LowerCaseFilter(stream);
        TokenFilter numbers = new NumberFilter(lowerFilter);
        TokenFilter length = new LengthFilter(numbers, 3, 1000);
        TokenFilter stemmer = new BulgarianStemFilter(length);
        TokenFilter ngrams = new ShingleFilter(stemmer, 2, 3);

        // Closing the outermost filter also closes the wrapped chain.
        try (TokenFilter filter = ngrams) {
            CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
            filter.reset();
            while (filter.incrementToken()) {
                // Literal replacements: no regex is needed for these fixed characters.
                String word = termAtt.toString()
                        .replace(",", "(comma)")
                        .replace("\n", "")
                        .replace("\r", "");
                // NOTE(review): "_" is presumably ShingleFilter's filler token for
                // positions left empty by removed tokens - confirm against the
                // Lucene version in use.
                if (word.contains("_")) {
                    continue;
                }
                words.add(word);
            }
            // The TokenStream contract requires end() after the last
            // incrementToken() and before close().
            filter.end();
        }
    }

    return new Tuple2<>(label, words);
}
/**
 * Ad-hoc manual check that prints to standard output.
 *
 * <p>First prints the results of several {@code NumberUtils.isDigits} and
 * {@code NumberUtils.isNumber} calls on sample numeric strings. Then runs a
 * hard-coded Bulgarian sample sentence (with the literal {@code "br2n"}
 * removed) through {@code LetterTokenizer}, a {@code StopFilter} using
 * {@code BULGARIAN_STOP_WORDS_SET}, {@code LengthFilter(3, 1000)},
 * {@code BulgarianStemFilter} and {@code ShingleFilter(2, 2)}, printing each
 * resulting term on its own line.
 *
 * @param args unused
 * @throws IOException if the token stream fails while being read
 */
public static void main(String[] args) throws IOException {
    System.out.println(NumberUtils.isDigits("12345"));
    System.out.println(NumberUtils.isDigits("12345.1"));
    System.out.println(NumberUtils.isDigits("12345,2"));
    System.out.println(NumberUtils.isNumber("12345"));
    System.out.println(NumberUtils.isNumber("12345.1"));
    System.out.println(NumberUtils.isNumber("12345,2".replace(",", ".")));
    System.out.println(NumberUtils.isNumber("12345,2"));

    StringReader input = new StringReader(
            "Правя тест на класификатор и после др.Дулитъл, пада.br2n ще се оправя с данните! които,са много зле. Но това е по-добре. Но24"
                    .replaceAll("br2n", ""));

    LetterTokenizer tokenizer = new LetterTokenizer();
    tokenizer.setReader(input);
    TokenFilter stopFilter = new StopFilter(tokenizer, BULGARIAN_STOP_WORDS_SET);
    TokenFilter length = new LengthFilter(stopFilter, 3, 1000);
    TokenFilter stemmer = new BulgarianStemFilter(length);
    TokenFilter ngrams = new ShingleFilter(stemmer, 2, 2);

    // Closing the outermost filter also closes the wrapped tokenizer.
    try (TokenFilter filter = ngrams) {
        CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
        filter.reset();
        while (filter.incrementToken()) {
            // Commas become dots and line breaks are removed; literal
            // replacements give the same result as the regex forms.
            String word = termAtt.toString()
                    .replace(",", ".")
                    .replace("\n", "")
                    .replace("\r", "");
            System.out.println(word);
        }
        // The TokenStream contract requires end() after the last
        // incrementToken() and before close().
        filter.end();
    }
}
/**
 * Wraps the given token stream in a {@link BulgarianStemFilter}.
 *
 * @param input the upstream token stream to wrap
 * @return a new {@code BulgarianStemFilter} reading from {@code input}
 */
@Override
public TokenStream create(TokenStream input) {
    final TokenStream stemmed = new BulgarianStemFilter(input);
    return stemmed;
}