@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        streams.source = new NGramTokenizer(reader, 1, 30);
        // Chain each filter on the previous result so lower-casing, stemming, and stop-word
        // removal are all applied (wrapping streams.source every time would discard the first two).
        streams.result = new LowerCaseFilter(streams.source);
        streams.result = new PorterStemFilter(streams.result);
        streams.result = new StopFilter(false, streams.result, stopwords, true);
        setPreviousTokenStream(streams);
    } else {
        streams.source.reset(reader);
    }
    return streams.result;
}
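// Hedged usage sketch for the Lucene 3.x-style analyzer above; the field name and sample text
// are placeholders, and the analyzer is assumed to expose reusableTokenStream as shown.
void printNGrams(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.reusableTokenStream("body", new StringReader(text));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        System.out.println(term.toString()); // lower-cased, stemmed, stop-filtered 1..30-grams
    }
    stream.end();
    stream.close();
}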
@Override
public Tokenizer create() {
    if (matcher == null) {
        return new NGramTokenizer(minGram, maxGram);
    } else {
        return new NGramTokenizer(minGram, maxGram) {
            @Override
            protected boolean isTokenChar(int chr) {
                return matcher.isTokenChar(chr);
            }
        };
    }
}
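// Illustrative sketch (not from the factory above): the same isTokenChar override applied
// directly, limiting grams to letters and digits; the 2..3 gram sizes are arbitrary.
Tokenizer createAlphanumericNGramTokenizer() {
    return new NGramTokenizer(2, 3) {
        @Override
        protected boolean isTokenChar(int chr) {
            // Anything that is not a letter or digit acts as a gram boundary.
            return Character.isLetterOrDigit(chr);
        }
    };
}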
public EdgeNGramTokenizerFactory(Index index, Settings indexSettings, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.side = Lucene43EdgeNGramTokenizer.Side.getSide(settings.get("side", Lucene43EdgeNGramTokenizer.DEFAULT_SIDE.getLabel()));
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
    this.esVersion = org.elasticsearch.Version.indexCreated(indexSettings);
}
NGramTokenizerFactory(Index index, Settings indexSettings, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
    this.esVersion = org.elasticsearch.Version.indexCreated(indexSettings);
}
public NGramLuceneQuery(String fieldName, String fieldValue, int gramSize) {
    super(gramSize);
    Preconditions.checkArgument(fieldValue.length() >= gramSize);
    try (NGramTokenizer tokenizer = new NGramTokenizer(new StringReader(fieldValue.toLowerCase()), gramSize, gramSize)) {
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            add(new Term(fieldName, tokenizer.getAttribute(CharTermAttribute.class).toString()));
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
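// Hedged usage sketch for the query constructor above; the field name, value, and gram size 3
// are placeholders, and the searcher is assumed to point at an index analyzed with matching n-grams.
TopDocs searchByNGrams(IndexSearcher searcher, String field, String value) throws IOException {
    NGramLuceneQuery query = new NGramLuceneQuery(field, value, 3);
    return searcher.search(query, 10); // top 10 hits
}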
public int[] docToMinHashes(String raw_html) throws Exception {
    HashSet<Integer> doc = new HashSet<Integer>();
    int count = 0;
    NGramTokenizer gramTokenizer = new NGramTokenizer(factory, gram_length, gram_length);
    gramTokenizer.setReader(new StringReader(raw_html));
    CharTermAttribute cattr = gramTokenizer.addAttribute(CharTermAttribute.class);
    gramTokenizer.reset();
    while (gramTokenizer.incrementToken()) {
        count++;
        if ((count % skip_interval) == 0) {
            doc.add(murmur.hashString(cattr.toString(), Charsets.UTF_8).asInt());
        }
    }
    gramTokenizer.close();
    if (hasher == null) {
        hasher = new MinHasher(num_hashes);
    }
    return hasher.hash(doc);
}
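// Hedged sketch of comparing two signatures produced by docToMinHashes above: the fraction of
// agreeing minhash positions approximates the Jaccard similarity of the sampled shingle sets.
double estimateSimilarity(String htmlA, String htmlB) throws Exception {
    int[] sigA = docToMinHashes(htmlA);
    int[] sigB = docToMinHashes(htmlB);
    int matches = 0;
    for (int i = 0; i < sigA.length; i++) {
        if (sigA[i] == sigB[i]) {
            matches++;
        }
    }
    return (double) matches / sigA.length;
}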
private Analyzer createAnalzyer(final int length) {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new NGramTokenizer(1, 1);
            final AlphaNumWordFilter filter = new AlphaNumWordFilter(tokenizer);
            filter.setMaxTokenLength(length);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
    return analyzer;
}
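// Hedged sketch of driving the analyzer above with the standard Lucene 4+ token-stream loop;
// the field name and input text are placeholders.
List<String> analyze(Analyzer analyzer, String text) throws IOException {
    List<String> tokens = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream("field", text)) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(term.toString());
        }
        stream.end();
    }
    return tokens;
}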
public void testTokenStream2() throws IOException {
    // Input code points: '㌰', U+3099 (combining voiced sound mark), '5', '℃', '№', '㈱', '㌘',
    // then 'サ' + U+3099 and 'ソ' + U+3099 (decomposed ザ and ゾ), 11 code units in total.
    String input = "㌰゙5℃№㈱㌘ザゾ";
    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
        Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
    Tokenizer tokenStream = new NGramTokenizer(newAttributeFactory(), reader, 1, 1);
    assertTokenStreamContents(tokenStream,
        new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"},
        new int[]{0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
        new int[]{1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
        input.length()
    );
}
/** Initializes the n-gram min and max sizes from the factory arguments. */
@Override
public void init(Map<String, String> args) {
    super.init(args);
    String maxArg = args.get("maxGramSize");
    maxGramSize = (maxArg != null ? Integer.parseInt(maxArg) : NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    String minArg = args.get("minGramSize");
    minGramSize = (minArg != null ? Integer.parseInt(minArg) : NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
}
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new NGramTokenizer(reader, 1, 30);
    stream = new LowerCaseFilter(stream);
    stream = new PorterStemFilter(stream);
    stream = new StopFilter(false, stream, stopwords, true);
    return stream;
}
/**
 * Builds the analysis chain for a field: 1..30 character n-grams, lower-cased.
 *
 * @param fieldName the name of the field being analyzed
 * @param reader the source of the text to tokenize
 * @return the resulting token stream
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new NGramTokenizer(reader, 1, 30);
    stream = new LowerCaseFilter(stream);
    return stream;
}
/**
 * Reusable variant of the analysis chain: 1..30 character n-grams, lower-cased.
 *
 * @param fieldName the name of the field being analyzed
 * @param reader the source of the text to tokenize
 * @return the cached, reset token stream
 * @throws IOException if the underlying reader cannot be reset
 */
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        streams.source = new NGramTokenizer(reader, 1, 30);
        streams.result = new LowerCaseFilter(streams.source);
        setPreviousTokenStream(streams);
    } else {
        streams.source.reset(reader);
    }
    return streams.result;
}
public EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
}
public NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
}
@Override
protected TokenStreamComponents createComponents(String paramString) {
    Tokenizer source = new NGramTokenizer(n, n);
    TokenStream result = new StandardFilter(source);
    return new TokenStreamComponents(source, result);
}
/** Creates the {@link TokenStream} of n-grams from the given {@link Reader}. */
@Override
public NGramTokenizer create(Reader input) {
    return new NGramTokenizer(input, minGramSize, maxGramSize);
}
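// Hedged wiring sketch tying the init(Map) and create(Reader) factory methods above together,
// assuming the pre-4.x factory API where a factory is constructed with no arguments, configured
// via init(Map), and then asked for a tokenizer; the gram sizes 2..3 are arbitrary.
Tokenizer buildNGramTokenizer(Reader reader) {
    NGramTokenizerFactory factory = new NGramTokenizerFactory();
    Map<String, String> args = new HashMap<String, String>();
    args.put("minGramSize", "2");
    args.put("maxGramSize", "3");
    factory.init(args);
    return factory.create(reader);
}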
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    return new TokenStreamComponents(new NGramTokenizer(MIN_NGRAM_WEIGHT, MAX_NGRAM_WEIGHT));
}
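// Hedged indexing sketch showing where an n-gram analyzer like the ones above plugs in; the
// directory, field name, and text are placeholders, and IndexWriterConfig(Analyzer) is the
// Lucene 5+ constructor (older versions also take a Version argument).
void indexDocument(Directory dir, Analyzer analyzer, String text) throws IOException {
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    try (IndexWriter writer = new IndexWriter(dir, config)) {
        Document doc = new Document();
        doc.add(new TextField("body", text, Field.Store.NO)); // n-grams are produced at index time
        writer.addDocument(doc);
    }
}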
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new NGramTokenizer(reader, 1, 30);
    return stream;
}
/**
 * Builds the analysis chain for a field: raw 1..30 character n-grams, no further filtering.
 *
 * @param fieldName the name of the field being analyzed
 * @param reader the source of the text to tokenize
 * @return the resulting token stream
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new NGramTokenizer(reader, 1, 30);
    return stream;
}