Java 类org.apache.lucene.analysis.bg.BulgarianStemFilter 实例源码

项目:news-credibility    文件:TokenTransform.java   
/**
 * Tokenizes the text held in column 0 of {@code row} and pairs the collected terms with the
 * label held in column 1.
 *
 * <p>Every occurrence of the marker {@code "br2n"} is stripped from the text first. The text is
 * then fed through a {@code BulgarianAnalyzer} built with {@code BULGARIAN_STOP_WORDS_SET}, and
 * the resulting stream is wrapped, in order, by a {@code LowerCaseFilter}, a {@code NumberFilter},
 * a {@code LengthFilter(3, 1000)}, a {@code BulgarianStemFilter} and a
 * {@code ShingleFilter(2, 3)}. In each emitted term, commas are replaced by {@code "(comma)"} and
 * line-break characters are removed; terms containing an underscore are dropped.
 *
 * @param row the input row; column 0 is read as a String and column 1 as a double
 * @return a tuple of the label and a multiset of the collected terms
 * @throws IOException if reading from or closing the token stream fails
 */
public Tuple2<Double, Multiset<String>> transform(Row row) throws IOException {
    Double label = row.getDouble(1);
    // Literal replacement: "br2n" contains no regex metacharacters, so no regex engine is needed.
    StringReader document = new StringReader(row.getString(0).replace("br2n", ""));
    List<String> wordsList = new ArrayList<>();

    try (BulgarianAnalyzer analyzer = new BulgarianAnalyzer(BULGARIAN_STOP_WORDS_SET)) {
        TokenStream stream = analyzer.tokenStream("words", document);

        TokenFilter lowerFilter = new LowerCaseFilter(stream);
        TokenFilter numbers = new NumberFilter(lowerFilter);
        TokenFilter length = new LengthFilter(numbers, 3, 1000);
        TokenFilter stemmer = new BulgarianStemFilter(length);
        TokenFilter ngrams = new ShingleFilter(stemmer, 2, 3);

        try (TokenFilter filter = ngrams) {
            CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
            filter.reset();
            while (filter.incrementToken()) {
                String word = termAtt.toString().replace(",", "(comma)").replaceAll("\n|\r", "");
                // Skip terms containing '_'
                // NOTE(review): presumably these are ShingleFilter filler tokens - confirm.
                if (word.contains("_")) {
                    continue;
                }
                wordsList.add(word);
            }
            // The TokenStream consumer workflow requires end() after the last
            // incrementToken() and before close().
            filter.end();
        }
    }

    Multiset<String> words = ConcurrentHashMultiset.create(wordsList);

    return new Tuple2<>(label, words);
}
项目:news-credibility    文件:EgdeMain.java   
/**
 * Scratch entry point: prints the results of several {@code NumberUtils.isDigits} /
 * {@code NumberUtils.isNumber} probes, then tokenizes a fixed Bulgarian sample sentence and
 * prints every term produced by the filter chain.
 *
 * <p>The chain is a {@code LetterTokenizer} wrapped, in order, by a {@code StopFilter} using
 * {@code BULGARIAN_STOP_WORDS_SET}, a {@code LengthFilter(3, 1000)}, a
 * {@code BulgarianStemFilter} and a {@code ShingleFilter(2, 2)}.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if reading from or closing the token stream fails
 */
public static void main(String[] args) throws IOException {
    System.out.println(NumberUtils.isDigits("12345"));
    System.out.println(NumberUtils.isDigits("12345.1"));
    System.out.println(NumberUtils.isDigits("12345,2"));

    System.out.println(NumberUtils.isNumber("12345"));
    System.out.println(NumberUtils.isNumber("12345.1"));
    System.out.println(NumberUtils.isNumber("12345,2".replace(",", ".")));
    System.out.println(NumberUtils.isNumber("12345,2"));
    // Literal replacement: "br2n" contains no regex metacharacters, so no regex engine is needed.
    StringReader input = new StringReader(
            "Правя тест на класификатор и после др.Дулитъл, пада.br2n ще се оправя с данните! които,са много зле. Но това е по-добре. Но24"
                    .replace("br2n", ""));

    LetterTokenizer tokenizer = new LetterTokenizer();
    tokenizer.setReader(input);

    TokenFilter stopFilter = new StopFilter(tokenizer, BULGARIAN_STOP_WORDS_SET);
    TokenFilter length = new LengthFilter(stopFilter, 3, 1000);
    TokenFilter stemmer = new BulgarianStemFilter(length);
    TokenFilter ngrams = new ShingleFilter(stemmer, 2, 2);

    try (TokenFilter filter = ngrams) {

        CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
        filter.reset();
        while (filter.incrementToken()) {
            String word = termAtt.toString().replaceAll(",", "\\.").replaceAll("\n|\r", "");
            System.out.println(word);
        }
        // The TokenStream consumer workflow requires end() after the last
        // incrementToken() and before close().
        filter.end();
    }
}
项目:lams    文件:BulgarianStemFilterFactory.java   
/**
 * Wraps the supplied token stream in a new {@link BulgarianStemFilter}.
 *
 * @param input the upstream token stream to decorate
 * @return a {@code BulgarianStemFilter} reading from {@code input}
 */
@Override
public TokenStream create(TokenStream input) {
  final BulgarianStemFilter stemFilter = new BulgarianStemFilter(input);
  return stemFilter;
}
项目:search    文件:BulgarianStemFilterFactory.java   
/**
 * Builds a {@link BulgarianStemFilter} on top of the given token stream.
 *
 * @param input the token stream the new filter will consume
 * @return the newly created filter wrapping {@code input}
 */
@Override
public TokenStream create(TokenStream input) {
  final TokenStream wrapped = new BulgarianStemFilter(input);
  return wrapped;
}
项目:NYBC    文件:BulgarianStemFilterFactory.java   
/**
 * Decorates {@code input} with a freshly constructed {@link BulgarianStemFilter}.
 *
 * @param input the source token stream
 * @return a {@code BulgarianStemFilter} whose input is {@code input}
 */
@Override
public TokenStream create(TokenStream input) {
  final BulgarianStemFilter result = new BulgarianStemFilter(input);
  return result;
}
项目:read-open-source-code    文件:BulgarianStemFilterFactory.java   
/**
 * Creates a {@link BulgarianStemFilter} that reads from the supplied stream.
 *
 * @param input the token stream to wrap
 * @return a new filter instance wrapping {@code input}
 */
@Override
public TokenStream create(TokenStream input) {
  final TokenStream stemmed = new BulgarianStemFilter(input);
  return stemmed;
}
项目:read-open-source-code    文件:BulgarianStemFilterFactory.java   
/**
 * Returns a new {@link BulgarianStemFilter} layered over {@code input}.
 *
 * @param input the token stream that feeds the filter
 * @return the filter wrapping {@code input}
 */
@Override
public TokenStream create(TokenStream input) {
  final BulgarianStemFilter filter = new BulgarianStemFilter(input);
  return filter;
}
项目:read-open-source-code    文件:BulgarianStemFilterFactory.java   
/**
 * Produces a {@link BulgarianStemFilter} for the given token stream.
 *
 * @param input the upstream token stream
 * @return a new {@code BulgarianStemFilter} constructed from {@code input}
 */
@Override
public TokenStream create(TokenStream input) {
  final TokenStream output = new BulgarianStemFilter(input);
  return output;
}
项目:Maskana-Gestor-de-Conocimiento    文件:BulgarianStemFilterFactory.java   
/**
 * Instantiates a {@link BulgarianStemFilter} around the provided token stream.
 *
 * @param input the token stream to be wrapped
 * @return the new filter, which consumes {@code input}
 */
@Override
public TokenStream create(TokenStream input) {
  final BulgarianStemFilter bulgarianStemmer = new BulgarianStemFilter(input);
  return bulgarianStemmer;
}