Java Class org.apache.lucene.analysis.ngram.NGramTokenizer Example Source Code

Project: DrakkarKeel    File: NGramAnalyzer.java
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws
        IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        streams.source = new NGramTokenizer(reader, 1, 30);
        streams.result = new LowerCaseFilter(streams.source);
        streams.result = new PorterStemFilter(streams.result);
        streams.result = new StopFilter(false, streams.result, stopwords, true);

        setPreviousTokenStream(streams);
    } else {
        streams.source.reset(reader);
    }
    return streams.result;
}
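This is the legacy (pre-Lucene-4.0) reuse pattern, where the analyzer recycles its saved tokenizer across documents instead of rebuilding the whole chain. A minimal consumption sketch, assuming NGramAnalyzer exposes a no-arg constructor (an assumption; the snippet above does not show one):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ReusableStreamDemo {
    public static void main(String[] args) throws Exception {
        NGramAnalyzer analyzer = new NGramAnalyzer(); // no-arg constructor assumed
        for (String doc : new String[] {"first document", "second document"}) {
            // Each call reuses the saved tokenizer, reset onto the new reader.
            TokenStream stream = analyzer.reusableTokenStream("body", new StringReader(doc));
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term);
            }
        }
    }
}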
Project: elasticsearch_my    File: NGramTokenizerFactory.java
@Override
public Tokenizer create() {
    if (matcher == null) {
        return new NGramTokenizer(minGram, maxGram);
    } else {
        return new NGramTokenizer(minGram, maxGram) {
            @Override
            protected boolean isTokenChar(int chr) {
                return matcher.isTokenChar(chr);
            }
        };
    }
}
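When a character matcher is configured, the factory narrows gram emission by overriding isTokenChar; characters that fail the test split the input before grams are built. The same hook works standalone. A sketch restricting grams to letters and digits (Lucene 4.4+ NGramTokenizer API):

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class LetterDigitNGramDemo {
    public static void main(String[] args) throws Exception {
        // 2..3-grams, but only over letter/digit runs; anything else splits the input.
        Tokenizer tokenizer = new NGramTokenizer(2, 3) {
            @Override
            protected boolean isTokenChar(int chr) {
                return Character.isLetterOrDigit(chr);
            }
        };
        tokenizer.setReader(new StringReader("ab-cd"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term); // prints "ab", then "cd"; the '-' never lands inside a gram
        }
        tokenizer.end();
        tokenizer.close();
    }
}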
Project: Elasticsearch    File: EdgeNGramTokenizerFactory.java
public EdgeNGramTokenizerFactory(Index index, Settings indexSettings, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.side = Lucene43EdgeNGramTokenizer.Side.getSide(settings.get("side", Lucene43EdgeNGramTokenizer.DEFAULT_SIDE.getLabel()));
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
    this.esVersion = org.elasticsearch.Version.indexCreated(indexSettings);
}
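For reference, these are the settings keys the constructor reads, with illustrative values; the fallbacks come from the Lucene constants named above (DEFAULT_MIN_NGRAM_SIZE is 1, DEFAULT_MAX_NGRAM_SIZE is 2). A sketch, not real index configuration:

import java.util.Map;

public class EdgeNGramSettingsSketch {
    public static void main(String[] args) {
        // Illustrative values only; "token_chars" is really an array setting,
        // shown comma-separated here for brevity.
        Map<String, String> tokenizerSettings = Map.of(
            "min_gram", "2",              // default: NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE (1)
            "max_gram", "4",              // default: NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE (2)
            "side", "front",              // "front" or "back", parsed by Lucene43EdgeNGramTokenizer.Side
            "token_chars", "letter,digit" // parsed into the character matcher
        );
        System.out.println(tokenizerSettings);
    }
}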
Project: Elasticsearch    File: NGramTokenizerFactory.java
NGramTokenizerFactory(Index index, Settings indexSettings, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
    this.esVersion = org.elasticsearch.Version.indexCreated(indexSettings);
}
Project: gitplex-mit    File: NGramLuceneQuery.java
public NGramLuceneQuery(String fieldName, String fieldValue, int gramSize) {
    super(gramSize);

    Preconditions.checkArgument(fieldValue.length()>=gramSize);

    try (NGramTokenizer tokenizer = new NGramTokenizer(new StringReader(fieldValue.toLowerCase()), gramSize, gramSize)) {
        tokenizer.reset();
        while (tokenizer.incrementToken()) { 
            add(new Term(fieldName, 
                    tokenizer.getAttribute(CharTermAttribute.class).toString()));
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
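Because the constructor lowercases the value and splits it into fixed-size grams, the query only matches fields that were indexed with the same gram size. A hedged usage sketch (field name and gram size are illustrative):

import java.io.IOException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

public class NGramQueryDemo {
    // Assumes the "blob" field was indexed through NGramTokenizer(3, 3),
    // matching the gram size handed to the query.
    static TopDocs search(IndexSearcher searcher, String text) throws IOException {
        Query query = new NGramLuceneQuery("blob", text, 3);
        return searcher.search(query, 10);
    }
}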
Project: crawl-eval    File: MinHashDupesByCharNgram.java
public int[] docToMinHashes(String raw_html) throws Exception {
    HashSet<Integer> doc = new HashSet<Integer>();
    int count = 0;

    NGramTokenizer gramTokenizer = new NGramTokenizer(factory, gram_length, gram_length);
    gramTokenizer.setReader(new StringReader(raw_html));
    CharTermAttribute cattr = gramTokenizer.addAttribute(CharTermAttribute.class);
    gramTokenizer.reset();

    while (gramTokenizer.incrementToken()) {
        count++;
        if ((count % skip_interval) == 0)
            doc.add(murmur.hashString(cattr.toString(), Charsets.UTF_8).asInt());
    }
    gramTokenizer.close();
    if (hasher == null)
        hasher = new MinHasher(num_hashes);
    return hasher.hash(doc);

}
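The point of the signature is cheap near-duplicate detection: the fraction of positions where two documents' signatures agree approximates the Jaccard similarity of their sampled gram sets. A sketch, assuming the class has a usable no-arg constructor (not shown above):

public class DedupeDemo {
    public static void main(String[] args) throws Exception {
        MinHashDupesByCharNgram dedupe = new MinHashDupesByCharNgram(); // constructor assumed
        int[] a = dedupe.docToMinHashes("<html><body>hello world</body></html>");
        int[] b = dedupe.docToMinHashes("<html><body>hello world!</body></html>");

        // Fraction of matching hash positions estimates Jaccard similarity.
        int matches = 0;
        for (int i = 0; i < a.length; i++) {
            if (a[i] == b[i]) matches++;
        }
        System.out.println("estimated similarity: " + (double) matches / a.length);
    }
}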
Project: analyzers-ja    File: AlphaNumWordFilterTest.java
private Analyzer createAnalzyer(final int length) {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new NGramTokenizer(1, 1);
            final AlphaNumWordFilter filter = new AlphaNumWordFilter(tokenizer);
            filter.setMaxTokenLength(length);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
    return analyzer;
}
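A sketch of consuming such an analyzer outside the test framework, reading the filtered terms back (the field name is illustrative):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalyzerDemo {
    static void printTokens(Analyzer analyzer, String text) throws IOException {
        try (TokenStream stream = analyzer.tokenStream("field", text)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term);
            }
            stream.end();
        }
    }
}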
Project: search    File: TestICUNormalizer2CharFilter.java
public void testTokenStream2() throws IOException {
  // '㌰', '゙', '5', '℃', '№', '㈱', '㌘', 'サ', '゙', 'ソ', '゙'
  String input = "㌰゙5℃№㈱㌘ザゾ";

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenStream = new NGramTokenizer(newAttributeFactory(), reader, 1, 1);

  assertTokenStreamContents(tokenStream,
    new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"},
    new int[]{0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
    new int[]{1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
    input.length()
  );
}
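The char filter applies ICU NFKC case-folding before tokenization, which is why '㌰' surfaces as the unigrams 'ピ' and 'コ' (voiced to 'ゴ' by the combining mark) while offsets keep pointing at the original characters; note the repeated start offsets above. The same pipeline outside the test harness, written against the Lucene 5+ tokenizer API and assuming lucene-analyzers-icu and ICU4J on the classpath:

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.ICUNormalizer2CharFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.ibm.icu.text.Normalizer2;

public class IcuNGramDemo {
    public static void main(String[] args) throws Exception {
        // Normalize (NFKC, case-folded) before the tokenizer ever sees the text.
        Reader reader = new ICUNormalizer2CharFilter(new StringReader("㌰゙5℃"),
            Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
        Tokenizer tokenizer = new NGramTokenizer(1, 1); // unigrams over the normalized text
        tokenizer.setReader(reader);
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term);
        }
        tokenizer.end();
        tokenizer.close();
    }
}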
Project: NYBC    File: NGramTokenizerFactory.java
/** Initializes the n-gram min and max sizes and the side from which one should start tokenizing. */
@Override
public void init(Map<String, String> args) {
  super.init(args);
  String maxArg = args.get("maxGramSize");
  maxGramSize = (maxArg != null ? Integer.parseInt(maxArg) : NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);

  String minArg = args.get("minGramSize");
  minGramSize = (minArg != null ? Integer.parseInt(minArg) : NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
}
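This is the Solr-style factory initialization, where arguments arrive as a string map built from the schema. A sketch of driving init directly (no-arg construction is assumed here, and the values are illustrative):

import java.util.HashMap;
import java.util.Map;

public class FactoryInitDemo {
    public static void main(String[] args) {
        Map<String, String> factoryArgs = new HashMap<>();
        factoryArgs.put("minGramSize", "2"); // omit to fall back to DEFAULT_MIN_NGRAM_SIZE (1)
        factoryArgs.put("maxGramSize", "4"); // omit to fall back to DEFAULT_MAX_NGRAM_SIZE (2)

        NGramTokenizerFactory factory = new NGramTokenizerFactory(); // no-arg construction assumed
        factory.init(factoryArgs);
    }
}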
Project: DrakkarKeel    File: NGramAnalyzer.java
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new NGramTokenizer(reader, 1, 30);
    stream = new LowerCaseFilter(stream);
    stream = new PorterStemFilter(stream);
    stream = new StopFilter(false, stream, stopwords, true);
    return stream;
}
Project: DrakkarKeel    File: NGramAnalyzer.java
/**
 * Builds a lowercased 1..30-gram token stream over the given reader.
 *
 * @param fieldName the name of the field being analyzed (unused)
 * @param reader the character stream to tokenize
 * @return the filtered token stream
 */
public TokenStream tokenStream(String fieldName, Reader reader) {

    TokenStream stream = new NGramTokenizer(reader, 1, 30);
    stream = new LowerCaseFilter(stream);

    return stream;
}
Project: DrakkarKeel    File: NGramAnalyzer.java
/**
 * Returns a reusable token stream, recycling the saved tokenizer across calls.
 *
 * @param fieldName the name of the field being analyzed (unused)
 * @param reader the character stream to tokenize
 * @return the filtered token stream
 * @throws IOException if the saved tokenizer cannot be reset onto the reader
 */
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        streams.source = new NGramTokenizer(reader, 1, 30);
        streams.result = new LowerCaseFilter(streams.source);
        setPreviousTokenStream(streams);
    } else {
        streams.source.reset(reader);
    }
    return streams.result;
}
Project: elasticsearch_my    File: EdgeNGramTokenizerFactory.java
public EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
}
Project: elasticsearch_my    File: NGramTokenizerFactory.java
public NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
}
Project: LuceneDB    File: NgramAnalyzer.java
@Override
protected TokenStreamComponents createComponents(String paramString) {
    Tokenizer source = new NGramTokenizer(n, n);
    TokenStream result = new StandardFilter(source);
    return new TokenStreamComponents(source, result);
}
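StandardFilter is a pass-through for modern index formats, so this analyzer effectively emits raw fixed-size grams. A sketch of indexing with it, assuming NgramAnalyzer has a usable constructor (the index path and field name are illustrative):

import java.nio.file.Paths;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class NgramIndexDemo {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(Paths.get("/tmp/ngram-index")); // path illustrative
        IndexWriterConfig config = new IndexWriterConfig(new NgramAnalyzer()); // constructor assumed
        try (IndexWriter writer = new IndexWriter(dir, config)) {
            Document doc = new Document();
            doc.add(new TextField("content", "hello", Field.Store.YES));
            writer.addDocument(doc); // "hello" is indexed as its n-grams
        }
    }
}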
Project: NYBC    File: NGramTokenizerFactory.java
/** Creates the {@link TokenStream} of n-grams from the given {@link Reader}. */
@Override
public NGramTokenizer create(Reader input) {
  return new NGramTokenizer(input, minGramSize, maxGramSize);
}
Project: QMAClone    File: NGramAnalyzer.java
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  return new TokenStreamComponents(new NGramTokenizer(MIN_NGRAM_WEIGHT, MAX_NGRAM_WEIGHT));
}
Project: DrakkarKeel    File: NGramAnalyzerCaseSensitive.java
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new NGramTokenizer(reader, 1, 30);
    return stream;
}