Java Class org.apache.lucene.analysis.miscellaneous.LengthFilter Example Source Code
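LengthFilter is a Lucene token filter that removes tokens whose character length falls outside a configured [min, max] range. The project excerpts below show how it is wired into real analysis chains. First, a minimal self-contained sketch, assuming a Lucene 5.x-style API; the class name LengthFilterDemo and the sample text are illustrative:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class LengthFilterDemo {
    public static void main(String[] args) throws Exception {
        // Keep only tokens between 3 and 5 characters long.
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("a quick brown fox jumps"));

        try (TokenStream stream = new LengthFilter(tokenizer, 3, 5)) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(termAtt.toString()); // quick, brown, fox, jumps ("a" is dropped)
            }
            stream.end();
        }
    }
}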

Project: Elasticsearch    File: LengthTokenFilterFactory.java
@Override
public TokenStream create(TokenStream tokenStream) {
    if (version.onOrAfter(Version.LUCENE_4_4)) {
        // Since Lucene 4.4, LengthFilter always increments positions.
        return new LengthFilter(tokenStream, min, max);
    } else {
        // Older indices fall back to the deprecated back-compat variant,
        // which still honors the enablePositionIncrements flag.
        @SuppressWarnings("deprecation")
        final TokenStream filter = new Lucene43LengthFilter(enablePositionIncrements, tokenStream, min, max);
        return filter;
    }
}
Project: news-credibility    File: TokenTransform.java
public Tuple2<Double, Multiset<String>> transform(Row row) throws IOException {
    Double label = row.getDouble(1);
    // Strip the dataset's "br2n" line-break markers before tokenizing.
    StringReader document = new StringReader(row.getString(0).replaceAll("br2n", ""));
    List<String> wordsList = new ArrayList<>();

    try (BulgarianAnalyzer analyzer = new BulgarianAnalyzer(BULGARIAN_STOP_WORDS_SET)) {
        TokenStream stream = analyzer.tokenStream("words", document);

        // Analysis chain: lowercase -> drop numbers -> keep tokens of
        // 3..1000 characters -> Bulgarian stemming -> 2- and 3-word shingles.
        TokenFilter lowerFilter = new LowerCaseFilter(stream);
        TokenFilter numbers = new NumberFilter(lowerFilter);
        TokenFilter length = new LengthFilter(numbers, 3, 1000);
        TokenFilter stemmer = new BulgarianStemFilter(length);
        TokenFilter ngrams = new ShingleFilter(stemmer, 2, 3);

        try (TokenFilter filter = ngrams) {
            CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
            filter.reset();
            while (filter.incrementToken()) {
                // Escape commas and strip newlines so tokens stay CSV-safe.
                String word = termAtt.toString().replace(",", "(comma)").replaceAll("\n|\r", "");
                if (word.contains("_")) {
                    // ShingleFilter marks stop-word gaps with "_"; skip those shingles.
                    continue;
                }
                wordsList.add(word);
            }
            filter.end();
        }
    }

    Multiset<String> words = ConcurrentHashMultiset.create(wordsList);

    return new Tuple2<>(label, words);
}
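A hypothetical invocation of transform, assuming a two-column Row of (text, label) built with Spark's RowFactory and a no-arg TokenTransform constructor (both assumptions):

import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

import com.google.common.collect.Multiset;

import scala.Tuple2;

public class TokenTransformDemo {
    public static void main(String[] args) throws Exception {
        // Column 0 holds the document text, column 1 the numeric label,
        // matching row.getString(0) and row.getDouble(1) in transform().
        Row row = RowFactory.create("Това е примерен текст за класификация.", 1.0);
        Tuple2<Double, Multiset<String>> labeled = new TokenTransform().transform(row);
        System.out.println(labeled._1() + " -> " + labeled._2());
    }
}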
Project: news-credibility    File: EgdeMain.java
public static void main(String[] args) throws IOException {
    // isDigits accepts only unsigned whole numbers:
    System.out.println(NumberUtils.isDigits("12345"));    // true
    System.out.println(NumberUtils.isDigits("12345.1"));  // false
    System.out.println(NumberUtils.isDigits("12345,2"));  // false

    // isNumber accepts valid Java numbers, but not comma decimals:
    System.out.println(NumberUtils.isNumber("12345"));    // true
    System.out.println(NumberUtils.isNumber("12345.1"));  // true
    System.out.println(NumberUtils.isNumber("12345,2".replace(",", ".")));  // true
    System.out.println(NumberUtils.isNumber("12345,2"));  // false
    StringReader input = new StringReader(
            "Правя тест на класификатор и после др.Дулитъл, пада.br2n ще се оправя с данните! които,са много зле. Но това е по-добре. Но24"
                    .replaceAll("br2n", ""));

    LetterTokenizer tokenizer = new LetterTokenizer();
    tokenizer.setReader(input);

    // Chain: drop stop words -> keep tokens of 3..1000 characters ->
    // Bulgarian stemming -> 2-word shingles.
    TokenFilter stopFilter = new StopFilter(tokenizer, BULGARIAN_STOP_WORDS_SET);
    TokenFilter length = new LengthFilter(stopFilter, 3, 1000);
    TokenFilter stemmer = new BulgarianStemFilter(length);
    TokenFilter ngrams = new ShingleFilter(stemmer, 2, 2);

    try (TokenFilter filter = ngrams) {
        CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
        filter.reset();
        while (filter.incrementToken()) {
            // Replace commas with periods and strip newlines before printing.
            String word = termAtt.toString().replaceAll(",", "\\.").replaceAll("\n|\r", "");
            System.out.println(word);
        }
        filter.end();
    }
}
Project: elasticsearch_my    File: LengthTokenFilterFactory.java
@Override
public TokenStream create(TokenStream tokenStream) {
    // Modern Lucene: positions are always incremented, so no legacy flag is needed.
    return new LengthFilter(tokenStream, min, max);
}
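For context, min and max in a factory like this are typically read from the index settings. A sketch of how that wiring might look; the exact constructor signature varies across Elasticsearch versions, so IndexSettings, Environment, and the defaults shown are assumptions:

public LengthTokenFilterFactory(IndexSettings indexSettings, Environment environment,
                                String name, Settings settings) {
    super(indexSettings, name, settings);
    // Assumed defaults: no lower bound, effectively no upper bound.
    this.min = settings.getAsInt("min", 0);
    this.max = settings.getAsInt("max", Integer.MAX_VALUE);
}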
Project: basekb-search    File: SafetyAnalyzer.java
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
    TokenStream ts = components.getTokenStream();
    // Drop any token longer than 1024 characters so oversized junk never reaches the index.
    LengthFilter dropLongTokens = new LengthFilter(ts, 0, 1024);
    return new TokenStreamComponents(components.getTokenizer(), dropLongTokens);
}
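The excerpt overrides only wrapComponents; a complete wrapper in the same spirit also needs a delegate analyzer and getWrappedAnalyzer. A minimal sketch against the pre-8 Lucene API the excerpt itself uses (the class name MaxLengthAnalyzer and the delegate wiring are illustrative, not the actual SafetyAnalyzer source):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;

public final class MaxLengthAnalyzer extends AnalyzerWrapper {
    private final Analyzer delegate;

    public MaxLengthAnalyzer(Analyzer delegate) {
        super(delegate.getReuseStrategy());
        this.delegate = delegate;
    }

    @Override
    protected Analyzer getWrappedAnalyzer(String fieldName) {
        return delegate;
    }

    @Override
    protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
        // Cap token length at 1024 characters, as in the excerpt above.
        TokenStream filtered = new LengthFilter(components.getTokenStream(), 0, 1024);
        return new TokenStreamComponents(components.getTokenizer(), filtered);
    }
}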
Project: NYBC    File: LengthFilterFactory.java
@Override
public LengthFilter create(TokenStream input) {
  // Pre-4.4 signature: the enablePositionIncrements flag still applies.
  return new LengthFilter(enablePositionIncrements, input, min, max);
}