Java 类 org.apache.lucene.analysis.shingle.ShingleFilter 实例源码 (usage examples of the Lucene ShingleFilter class, collected from open-source projects)

项目:NYBC    文件:ShingleFilterFactory.java   
@Override
public void init(Map<String, String> args) {
  super.init(args);

  // Shingle sizes: both bounds must be >= 2, and min may not exceed max.
  maxShingleSize = getInt("maxShingleSize", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
  if (maxShingleSize < 2) {
    throw new IllegalArgumentException(
        "Invalid maxShingleSize (" + maxShingleSize + ") - must be at least 2");
  }

  minShingleSize = getInt("minShingleSize", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
  if (minShingleSize < 2) {
    throw new IllegalArgumentException(
        "Invalid minShingleSize (" + minShingleSize + ") - must be at least 2");
  }
  if (minShingleSize > maxShingleSize) {
    throw new IllegalArgumentException(
        "Invalid minShingleSize (" + minShingleSize
            + ") - must be no greater than maxShingleSize (" + maxShingleSize + ")");
  }

  outputUnigrams = getBoolean("outputUnigrams", true);
  outputUnigramsIfNoShingles = getBoolean("outputUnigramsIfNoShingles", false);

  // Only fall back to the default separator when the key is absent entirely,
  // so an explicitly configured empty-string separator is honored.
  if (args.containsKey("tokenSeparator")) {
    tokenSeparator = args.get("tokenSeparator");
  } else {
    tokenSeparator = ShingleFilter.TOKEN_SEPARATOR;
  }
}
项目:elasticsearch_my    文件:ShingleTokenFilterFactory.java   
public ShingleTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    // Pull every shingle option from the index settings, defaulting to the
    // constants that ShingleFilter itself advertises.
    Integer maxSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
    Integer minSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
    Boolean unigrams = settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "output_unigrams", true, deprecationLogger);
    Boolean unigramsIfNoShingles = settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "output_unigrams_if_no_shingles", false, deprecationLogger);
    String separator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
    String filler = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
    // Capture the resolved configuration in an immutable factory.
    factory = new Factory("shingle", minSize, maxSize, unigrams, unigramsIfNoShingles, separator, filler);
}
项目:elasticsearch_my    文件:ShingleTokenFilterFactory.java   
@Override
public TokenStream create(TokenStream tokenStream) {
    // Wrap the incoming stream in a ShingleFilter configured with the
    // options captured at construction time.
    ShingleFilter shingles = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
    shingles.setOutputUnigrams(outputUnigrams);
    shingles.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    shingles.setTokenSeparator(tokenSeparator);
    shingles.setFillerToken(fillerToken);
    return shingles;
}
项目:RelevancyFeedback    文件:ConcatenateTokenFilter.java   
@Override
public final boolean incrementToken() throws IOException {
    // TODO make sure this works with synonyms and stop words

    // Drain the wrapped stream, grouping terms that share a position
    // (position increment == 0, e.g. synonyms) into the same word slot.
    while (input.incrementToken()) {
        String term = new String(termAttr.buffer(), 0, termAttr.length());
        List<String> word = posIncrAttr.getPositionIncrement() > 0
                ? new ArrayList<String>()
                : words.removeLast();
        word.add(term);
        words.add(word);
    }

    // Expand the grouped words into concatenated phrases exactly once per stream.
    if (!concat) {
        makePhrases(words, phrases, 0);
        concat = true;
    }

    // Emit one phrase per call until the queue is drained.
    // (Changed from while-with-unconditional-return to a plain if; also
    // removed the counter 'i', which was incremented but never read.)
    if (!phrases.isEmpty()) {
        String phrase = phrases.removeFirst();
        // NOTE(review): 'current' may still be null on the first emission;
        // Lucene's restoreState(null) is a no-op, so this is safe.
        restoreState(current);
        clearAttributes();

        termAttr.setEmpty();
        termAttr.append(phrase);
        termAttr.setLength(phrase.length());

        typeAtt.setType(ShingleFilter.DEFAULT_TOKEN_TYPE); // "shingle"

        current = captureState();
        return true;
    }

    // Queue exhausted: reset for the next stream and signal end-of-stream.
    concat = false;
    return false;
}
项目:Elasticsearch    文件:ShingleTokenFilterFactory.java   
@Inject
public ShingleTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    // Resolve shingle options from the settings, with ShingleFilter's own
    // constants as the defaults.
    Integer maxSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
    Integer minSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
    Boolean unigrams = settings.getAsBoolean("output_unigrams", true);
    Boolean unigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
    String separator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
    String filler = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
    // Freeze the resolved configuration into the shared factory.
    factory = new Factory("shingle", minSize, maxSize, unigrams, unigramsIfNoShingles, separator, filler);
}
项目:Elasticsearch    文件:ShingleTokenFilterFactory.java   
@Override
public TokenStream create(TokenStream tokenStream) {
    // Build a ShingleFilter over the input and push the stored
    // configuration onto it before handing it back.
    ShingleFilter result = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
    result.setOutputUnigrams(outputUnigrams);
    result.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    result.setTokenSeparator(tokenSeparator);
    result.setFillerToken(fillerToken);
    return result;
}
项目:hmftools    文件:TreatmentCurator.java   
@NotNull
// Builds a throwaway Analyzer that whitespace-tokenizes its input and emits
// shingles (word n-grams) up to maxShingles tokens long, alongside the
// original unigrams.
private static Analyzer createShingleAnalyzer(final int maxShingles) {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(@NotNull final String field) {
            final Tokenizer source = new WhitespaceTokenizer();
            // NOTE(review): the 'field' argument (normally the field *name* in
            // Analyzer.createComponents) is used here as the text to tokenize.
            // Presumably callers pass the content through this parameter —
            // confirm against call sites.
            source.setReader(new StringReader(field));
            final ShingleFilter shingleFilter = new ShingleFilter(defaultTokenFilter(source), maxShingles);
            shingleFilter.setOutputUnigrams(true);
            return new TokenStreamComponents(source, shingleFilter);
        }
    };
}
项目:news-credibility    文件:TokenTransform.java   
// Tokenizes one row's text into stemmed Bulgarian word 2/3-gram shingles and
// returns them as (label, multiset-of-terms). Column 1 holds the label,
// column 0 the raw document text.
public Tuple2<Double, Multiset<String>> transform(Row row) throws IOException {
    Double label = row.getDouble(1);
    StringReader document = new StringReader(row.getString(0).replaceAll("br2n", ""));
    List<String> wordsList = new ArrayList<>();

    try (BulgarianAnalyzer analyzer = new BulgarianAnalyzer(BULGARIAN_STOP_WORDS_SET)) {
        TokenStream stream = analyzer.tokenStream("words", document);

        // Pipeline: lowercase -> number removal -> length 3..1000 ->
        // Bulgarian stemming -> 2- and 3-gram shingles.
        TokenFilter lowerFilter = new LowerCaseFilter(stream);
        TokenFilter numbers = new NumberFilter(lowerFilter);
        TokenFilter length = new LengthFilter(numbers, 3, 1000);
        TokenFilter stemmer = new BulgarianStemFilter(length);
        TokenFilter ngrams = new ShingleFilter(stemmer, 2, 3);

        try (TokenFilter filter = ngrams) {
            // Use the concrete CharTermAttribute rather than the raw
            // Attribute interface; toString() on a generic Attribute only
            // happened to work.
            CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
            filter.reset();
            while (filter.incrementToken()) {
                String word = termAtt.toString().replace(",", "(comma)").replaceAll("\n|\r", "");
                // Presumably skips shingles containing the filler token ("_")
                // inserted for removed stop words — confirm intent.
                if (word.contains("_")) {
                    continue;
                }
                wordsList.add(word);
            }
            // TokenStream contract: end() must follow the last
            // incrementToken() and precede close().
            filter.end();
        }
    }

    Multiset<String> words = ConcurrentHashMultiset.create(wordsList);

    return new Tuple2<>(label, words);
}
项目:news-credibility    文件:EgdeMain.java   
// Ad-hoc demo: prints NumberUtils digit/number checks, then runs a small
// Bulgarian text through tokenize -> stop-word -> length -> stem -> 2-gram
// shingle pipeline, printing each emitted term.
public static void main(String[] args) throws IOException {
    System.out.println(NumberUtils.isDigits("12345"));
    System.out.println(NumberUtils.isDigits("12345.1"));
    System.out.println(NumberUtils.isDigits("12345,2"));

    System.out.println(NumberUtils.isNumber("12345"));
    System.out.println(NumberUtils.isNumber("12345.1"));
    System.out.println(NumberUtils.isNumber("12345,2".replace(",", ".")));
    System.out.println(NumberUtils.isNumber("12345,2"));
    StringReader input = new StringReader(
            "Правя тест на класификатор и после др.Дулитъл, пада.br2n ще се оправя с данните! които,са много зле. Но това е по-добре. Но24"
                    .replaceAll("br2n", ""));

    LetterTokenizer tokenizer = new LetterTokenizer();
    tokenizer.setReader(input);

    // Pipeline: stop-word removal -> length 3..1000 -> Bulgarian stemming
    // -> bigram shingles.
    TokenFilter stopFilter = new StopFilter(tokenizer, BULGARIAN_STOP_WORDS_SET);
    TokenFilter length = new LengthFilter(stopFilter, 3, 1000);
    TokenFilter stemmer = new BulgarianStemFilter(length);
    TokenFilter ngrams = new ShingleFilter(stemmer, 2, 2);

    try (TokenFilter filter = ngrams) {
        // Concrete CharTermAttribute instead of the raw Attribute interface,
        // so the term text access is explicit.
        CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
        filter.reset();
        while (filter.incrementToken()) {
            String word = termAtt.toString().replaceAll(",", "\\.").replaceAll("\n|\r", "");
            System.out.println(word);
        }
        // TokenStream contract: end() after the last incrementToken(),
        // before the implicit close() of try-with-resources.
        filter.end();
    }
}
项目:search    文件:PositionFilterTest.java   
/** Runs a 6-gram ShingleFilter over six test terms, then verifies that
 *  PositionFilter zeroes every position increment except the first.
 * @throws java.io.IOException @see Token#next(Token)
 */
public void test6GramFilterNoPositions() throws Exception {
  ShingleFilter sixGrams = new ShingleFilter(new TestTokenStream(TEST_TOKEN), 6);
  PositionFilter positioned = new PositionFilter(sixGrams);
  assertTokenStreamContents(positioned,
             SIX_GRAM_NO_POSITIONS_TOKENS,
             SIX_GRAM_NO_POSITIONS_INCREMENTS);
}
项目:SolrPlugins    文件:ConcatenateTokenFilter.java   
@Override
public final boolean incrementToken() throws IOException {
    // TODO make sure this works with synonyms and stop words

    // Consume the whole upstream, bucketing terms that share a position
    // (position increment == 0, e.g. synonyms) into one word slot.
    while (input.incrementToken()) {
        String term = new String(termAttr.buffer(), 0, termAttr.length());
        List<String> word = posIncrAttr.getPositionIncrement() > 0
                ? new ArrayList<String>()
                : words.removeLast();
        word.add(term);
        words.add(word);
    }

    // Build the concatenated phrase queue exactly once per stream.
    if (!concat) {
        makePhrases(words, phrases, 0);
        concat = true;
    }

    // Emit one queued phrase per call.
    // (Changed from while-with-unconditional-return to a plain if; also
    // removed the counter 'i', which was incremented but never read.)
    if (!phrases.isEmpty()) {
        String phrase = phrases.removeFirst();
        // NOTE(review): 'current' may still be null on the first emission;
        // Lucene's restoreState(null) is a no-op, so this is safe.
        restoreState(current);
        clearAttributes();

        termAttr.setEmpty();
        termAttr.append(phrase);
        termAttr.setLength(phrase.length());

        typeAtt.setType(ShingleFilter.DEFAULT_TOKEN_TYPE); // "shingle"

        current = captureState();
        return true;
    }

    // Queue drained: reset the flag for reuse and report end-of-stream.
    concat = false;
    return false;
}
项目:NYBC    文件:ShingleFilterFactory.java   
@Override
public ShingleFilter create(TokenStream input) {
  // Apply the configuration gathered in init() to a fresh ShingleFilter.
  ShingleFilter shingles = new ShingleFilter(input, minShingleSize, maxShingleSize);
  shingles.setOutputUnigrams(outputUnigrams);
  shingles.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
  shingles.setTokenSeparator(tokenSeparator);
  return shingles;
}
项目:NYBC    文件:PositionFilterTest.java   
/** Feeds six terms through a 6-gram ShingleFilter and asserts that
 *  PositionFilter sets every position increment after the first to zero.
 * @throws java.io.IOException @see Token#next(Token)
 */
public void test6GramFilterNoPositions() throws Exception {
  TokenStream shingled = new ShingleFilter(new TestTokenStream(TEST_TOKEN), 6);
  assertTokenStreamContents(new PositionFilter(shingled),
             SIX_GRAM_NO_POSITIONS_TOKENS,
             SIX_GRAM_NO_POSITIONS_INCREMENTS);
}
项目:meresco-lucene    文件:ShingleAnalyzer.java   
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Standard tokenization, lowercasing, then shingling within the
    // configured size bounds.
    Tokenizer tokenizer = new StandardTokenizer();
    TokenStream lowercased = new LowerCaseFilter(tokenizer);
    ShingleFilter shingled = new ShingleFilter(lowercased, this.minShingleSize, this.maxShingleSize);
    return new TokenStreamComponents(tokenizer, shingled);
}
项目:SolrTextTagger    文件:ConcatenateFilter.java   
@Override
public final boolean incrementToken() throws IOException {
  // This filter emits exactly one token: the whole stream concatenated.
  if (done)
    return false;
  done = true;

  // Join every upstream term with the separator.
  buf.setLength(0);
  boolean needSeparator = false;
  while (input.incrementToken()) {
    if (needSeparator) {
      buf.append(separator);
    }
    //TODO consider indexing special chars when posInc > 1 (stop words). We ignore for now. #13
    buf.append(termAtt);
    needSeparator = true;
  }
  input.end();//call here so we can see end of stream offsets

  termAtt.setEmpty().append(buf);
  // Setting the remaining attributes ultimately won't have much effect,
  // but be thorough.
  offsetAtt.setOffset(0, offsetAtt.endOffset());
  posIncrAtt.setPositionIncrement(1);
  posLenAtt.setPositionLength(1);//or do we add up the positions?  Probably not used any way.
  typeAtt.setType(ShingleFilter.DEFAULT_TOKEN_TYPE);//"shingle"

  return true;
}
项目:Maskana-Gestor-de-Conocimiento    文件:PositionFilterTest.java   
/** Verifies PositionFilter behavior over a 6-gram ShingleFilter: all
 *  position increments except the first must be reduced to zero.
 * @throws java.io.IOException @see Token#next(Token)
 */
public void test6GramFilterNoPositions() throws Exception {
  ShingleFilter shingles = new ShingleFilter(new TestTokenStream(TEST_TOKEN), 6);
  assertTokenStreamContents(new PositionFilter(shingles),
             SIX_GRAM_NO_POSITIONS_TOKENS,
             SIX_GRAM_NO_POSITIONS_INCREMENTS);
}
项目:elasticsearch_my    文件:ShingleTokenFilterFactory.java   
// Convenience constructor: a shingle factory using Lucene's default min/max
// shingle sizes, unigram output enabled, no unigrams-if-no-shingles fallback,
// and the default token separator and filler token.
public Factory(String name) {
    this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, ShingleFilter.DEFAULT_FILLER_TOKEN);
}
项目:Elasticsearch    文件:ShingleTokenFilterFactory.java   
// Convenience constructor: delegates to the full constructor with Lucene's
// default shingle sizes, unigrams enabled, no unigrams-if-no-shingles
// fallback, and the default separator and filler token.
public Factory(String name) {
    this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, ShingleFilter.DEFAULT_FILLER_TOKEN);
}
项目:mgraph-summarization    文件:TextAnalyser.java   
// Tokenizes text into lowercased, KStem-stemmed, stop-word-filtered terms and
// returns n-grams: for N > 1, shingles of size 2..N plus unigrams; for N <= 1,
// plain unigrams. Terms of length <= 1 are dropped.
public static List<String> getNgrams(String text, int N) throws IOException {

    List<String> tokens = new ArrayList<String>();

    Reader reader = new StringReader(text);
    LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_46, reader);

    // Filters: lowercase (redundant with LowerCaseTokenizer, kept for parity
    // with the original), KStem stemming, then English stop-word removal.
    LowerCaseFilter lowerCaseFilter = new LowerCaseFilter(Version.LUCENE_46, tokenizer);
    KStemFilter kStemFilter = new KStemFilter(lowerCaseFilter);

    CharArraySet stopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    StopFilter stopFilter = new StopFilter(Version.LUCENE_46, kStemFilter, stopwords);

    TokenStream pipeline;
    if (N > 1) {
        PositionFilter positionFilter = new PositionFilter(stopFilter);
        ShingleFilter shingleFilter = new ShingleFilter(positionFilter, 2, N);
        shingleFilter.setOutputUnigrams(true);
        pipeline = shingleFilter;
    } else {
        pipeline = stopFilter;
    }

    // try-with-resources guarantees close() even when incrementToken()
    // throws — the original leaked the stream on exception.
    try (TokenStream ts = pipeline) {
        CharTermAttribute charTermAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String token = charTermAtt.toString();
            if (token.length() > 1) {
                tokens.add(token);
            }
        }
        ts.end();
    }

    return tokens;
}