private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
    CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
    CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);
    stream1.reset();
    stream2.reset();
    assertThat(stream1.incrementToken(), equalTo(true));
    assertThat(stream2.incrementToken(), equalTo(true));
    assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison)));
    assertThat(stream1.incrementToken(), equalTo(false));
    assertThat(stream2.incrementToken(), equalTo(false));
    stream1.end();
    stream2.end();
    stream1.close();
    stream2.close();
}
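A minimal usage sketch for the helper above. The `analyzer` field and the French collator are illustrative assumptions, not part of the original test; they only make sense if the analyzer wraps a collation-aware filter such as the CollationKeyFilter shown further down.

// Hedged sketch: 'analyzer' is assumed to be a collation-aware analyzer held by the test class.
Collator collator = Collator.getInstance(Locale.FRENCH);
assertCollation(analyzer.tokenStream("field", "cote"),
        analyzer.tokenStream("field", "côte"),
        collator.compare("cote", "côte"));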
@After
public void after() {
    if (analyzer != null) {
        try {
            TokenStream ts = analyzer.tokenStream("field", text);
            CharTermAttribute ch = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int i = 0;
            while (ts.incrementToken()) {
                i++;
                System.out.print(ch.toString() + "\t");
                if (i % 7 == 0) {
                    System.out.println();
                }
            }
            ts.end();
            ts.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
private String[] splitStringIntoTerms(String value) {
    try {
        List<String> results = new ArrayList<>();
        try (TokenStream tokens = analyzer.tokenStream("", value)) {
            CharTermAttribute term = tokens.getAttribute(CharTermAttribute.class);
            tokens.reset();
            while (tokens.incrementToken()) {
                String t = term.toString().trim();
                if (t.length() > 0) {
                    results.add(t);
                }
            }
        }
        return results.toArray(new String[results.size()]);
    } catch (IOException e) {
        throw new MemgraphException("Could not tokenize string: " + value, e);
    }
}
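For illustration, a hedged usage sketch of the helper above; the input value and the expected output are assumptions that hold for a StandardAnalyzer-like `analyzer` field.

String[] terms = splitStringIntoTerms("John Smith, Accounting");
// with a lowercasing, punctuation-stripping analyzer this would yield something like
// ["john", "smith", "accounting"]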
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) {
    super(suffix);
    this.suffix = suffix;
    this.prefix = prefix;
    prefixExhausted = false;

    termAtt = addAttribute(CharTermAttribute.class);
    posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    payloadAtt = addAttribute(PayloadAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
    flagsAtt = addAttribute(FlagsAttribute.class);

    p_termAtt = prefix.addAttribute(CharTermAttribute.class);
    p_posIncrAtt = prefix.addAttribute(PositionIncrementAttribute.class);
    p_payloadAtt = prefix.addAttribute(PayloadAttribute.class);
    p_offsetAtt = prefix.addAttribute(OffsetAttribute.class);
    p_typeAtt = prefix.addAttribute(TypeAttribute.class);
    p_flagsAtt = prefix.addAttribute(FlagsAttribute.class);
}
@Override
public Filter getFilter(Element e) throws ParserException {
    List<BytesRef> terms = new ArrayList<>();
    String text = DOMUtils.getNonBlankTextOrFail(e);
    String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");

    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream(fieldName, text);
        TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        ts.reset();
        while (ts.incrementToken()) {
            termAtt.fillBytesRef();
            terms.add(BytesRef.deepCopyOf(bytes));
        }
        ts.end();
    } catch (IOException ioe) {
        throw new RuntimeException("Error constructing terms from index:" + ioe);
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
    return new TermsFilter(fieldName, terms);
}
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName,
        String contents) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}
protected String lemmatize(String query) {
    StringBuilder sb = new StringBuilder();
    // Use try-with-resources so the analyzer and the stream are always released,
    // and addAttribute() so the term attribute is present even for an empty stream.
    try (ItalianAnalyzer analyzer = new ItalianAnalyzer();
         TokenStream tokenStream = analyzer.tokenStream("label", query)) {
        CharTermAttribute token = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            if (sb.length() > 0) {
                sb.append(" ");
            }
            sb.append(token.toString());
        }
        tokenStream.end();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return sb.toString();
}
/**
 * Create field with TokenStream value.
 * @param name field name
 * @param tokenStream TokenStream value
 * @param type field type
 * @throws IllegalArgumentException if either the name or type
 *         is null, or if the field's type is stored(), or
 *         if tokenized() is false, or if indexed() is false.
 * @throws NullPointerException if the tokenStream is null
 */
public Field(String name, TokenStream tokenStream, FieldType type) {
    if (name == null) {
        throw new IllegalArgumentException("name cannot be null");
    }
    if (tokenStream == null) {
        throw new NullPointerException("tokenStream cannot be null");
    }
    if (!type.indexed() || !type.tokenized()) {
        throw new IllegalArgumentException("TokenStream fields must be indexed and tokenized");
    }
    if (type.stored()) {
        throw new IllegalArgumentException("TokenStream fields cannot be stored");
    }

    this.name = name;
    this.fieldsData = null;
    this.tokenStream = tokenStream;
    this.type = type;
}
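A hedged usage sketch for the constructor above; the analyzer, document, and field name are assumptions. TextField.TYPE_NOT_STORED is indexed, tokenized, and not stored, so it passes the checks in the constructor.

// Hedged sketch: 'analyzer' and 'doc' are assumed to be in scope.
TokenStream ts = analyzer.tokenStream("body", "some pre-analyzed text");
Field field = new Field("body", ts, TextField.TYPE_NOT_STORED);
doc.add(field);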
private final Query parseQueryString(ExtendedCommonTermsQuery query, String queryString, String field,
        QueryParseContext parseContext, Analyzer analyzer, String lowFreqMinimumShouldMatch,
        String highFreqMinimumShouldMatch) throws IOException {
    // Logic similar to QueryParser#getFieldQuery
    int count = 0;
    try (TokenStream source = analyzer.tokenStream(field, queryString.toString())) {
        source.reset();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        BytesRefBuilder builder = new BytesRefBuilder();
        while (source.incrementToken()) {
            // UTF-8
            builder.copyChars(termAtt);
            query.add(new Term(field, builder.toBytesRef()));
            count++;
        }
    }

    if (count == 0) {
        return null;
    }
    query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch);
    query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch);
    return query;
}
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
    IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
    AnalysisModule analysisModule = new AnalysisModule(new Environment(settings), singletonList(new AnalysisPlugin() {
        @Override
        public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
            return singletonMap("myfilter", MyFilterTokenFilterFactory::new);
        }
    }));
    IndexAnalyzers indexAnalyzers = analysisModule.getAnalysisRegistry().build(idxSettings);
    Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();

    AllEntries allEntries = new AllEntries();
    allEntries.addText("field1", text, 1.0f);

    TokenStream stream = AllTokenStream.allTokenStream("_all", text, 1.0f, analyzer);
    stream.reset();
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

    List<String> terms = new ArrayList<>();
    while (stream.incrementToken()) {
        String tokText = termAtt.toString();
        terms.add(tokText);
    }
    return terms;
}
protected Lucene43CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary, int minWordSize,
        int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input);
    this.tokens = new LinkedList<>();
    if (minWordSize < 0) {
        throw new IllegalArgumentException("minWordSize cannot be negative");
    }
    this.minWordSize = minWordSize;
    if (minSubwordSize < 0) {
        throw new IllegalArgumentException("minSubwordSize cannot be negative");
    }
    this.minSubwordSize = minSubwordSize;
    if (maxSubwordSize < 0) {
        throw new IllegalArgumentException("maxSubwordSize cannot be negative");
    }
    this.maxSubwordSize = maxSubwordSize;
    this.onlyLongestMatch = onlyLongestMatch;
    this.dictionary = dictionary;
}
/**
 * Extracts the remaining (non-core) attributes of the current token,
 * keyed by attribute name.
 *
 * @param stream            current TokenStream
 * @param includeAttributes attribute names to include (lower case); null or empty means include all
 * @return map of attribute name to attribute value
 */
private static Map<String, Object> extractExtendedAttributes(TokenStream stream, final Set<String> includeAttributes) {
    final Map<String, Object> extendedAttributes = new TreeMap<>();

    stream.reflectWith((attClass, key, value) -> {
        if (CharTermAttribute.class.isAssignableFrom(attClass)) {
            return;
        }
        if (PositionIncrementAttribute.class.isAssignableFrom(attClass)) {
            return;
        }
        if (OffsetAttribute.class.isAssignableFrom(attClass)) {
            return;
        }
        if (TypeAttribute.class.isAssignableFrom(attClass)) {
            return;
        }
        if (includeAttributes == null || includeAttributes.isEmpty()
                || includeAttributes.contains(key.toLowerCase(Locale.ROOT))) {
            if (value instanceof BytesRef) {
                final BytesRef p = (BytesRef) value;
                value = p.toString();
            }
            extendedAttributes.put(key, value);
        }
    });

    return extendedAttributes;
}
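A hedged sketch of how the helper above might be driven per token; the analyzer, field name, and text are assumptions.

try (TokenStream ts = analyzer.tokenStream("field", "some text")) { // 'analyzer' assumed in scope
    ts.reset();
    while (ts.incrementToken()) {
        // null includeAttributes means "include everything that is not a core attribute"
        Map<String, Object> extended = extractExtendedAttributes(ts, null);
        System.out.println(extended);
    }
    ts.end();
}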
public AlternateSpellingFilter(TokenStream tokenStream) {
    super(tokenStream);
    this.previousTokens = new ArrayList<String>();
    this.alternateSpellings = new HashMap<String, String>();
    this.alternateSpellings.put("היא", "הוא");
}
private static TokenStream createStackedTokenStream(String source, CharFilterFactory[] charFilterFactories,
        TokenizerFactory tokenizerFactory, TokenFilterFactory[] tokenFilterFactories, int current) {
    Reader reader = new FastStringReader(source);
    for (CharFilterFactory charFilterFactory : charFilterFactories) {
        reader = charFilterFactory.create(reader);
    }
    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(reader);
    TokenStream tokenStream = tokenizer;
    for (int i = 0; i < current; i++) {
        tokenStream = tokenFilterFactories[i].create(tokenStream);
    }
    return tokenStream;
}
@Test
public void testMetaphoneWords() throws Exception {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.myStemmer.type", "br_metaphone")
            .build();

    AnalysisService analysisService = createAnalysisService(index, settings, new AnalysisMetaphonePlugin());

    TokenFilterFactory filterFactory = analysisService.tokenFilter("br_metaphone");

    Tokenizer tokenizer = new KeywordTokenizer();

    Map<String, String> words = buildWordList();

    Set<String> inputWords = words.keySet();
    for (String word : inputWords) {
        tokenizer.setReader(new StringReader(word));
        TokenStream ts = filterFactory.create(tokenizer);

        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(words.get(word)));
        ts.close();
    }
}
@Test
public void testMetaphonePhrases() throws Exception {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.analyzer.myAnalyzer.type", "custom")
            .put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
            .put("index.analysis.analyzer.myAnalyzer.filter", "br_metaphone")
            .build();

    AnalysisService analysisService = createAnalysisService(index, settings, new AnalysisMetaphonePlugin());

    Analyzer analyzer = analysisService.analyzer("myAnalyzer");

    Map<String, List<String>> phrases = buildPhraseList();

    for (String phrase : phrases.keySet()) {
        List<String> outputWords = phrases.get(phrase);

        TokenStream ts = analyzer.tokenStream("test", phrase);

        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();

        for (String expected : outputWords) {
            assertThat(ts.incrementToken(), equalTo(true));
            assertThat(term1.toString(), equalTo(expected));
        }
        ts.close();
    }
}
/**
 * Returns the analysis result (the terms) produced by the given analyzer for a keyword.
 *
 * @param analyzer the analyzer to use
 * @param keyWord  the keyword to analyze
 * @return the terms produced by the analyzer
 * @throws Exception if analysis fails
 */
private static List<String> analysisResult(Analyzer analyzer, String keyWord) throws Exception {
    List<String> stringList = new ArrayList<String>();
    try (TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(keyWord))) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset(); // required before the first incrementToken() call
        while (tokenStream.incrementToken()) {
            stringList.add(charTermAttribute.toString());
        }
        tokenStream.end();
    }
    return stringList;
}
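A hedged usage sketch; the no-argument StandardAnalyzer constructor and its default stop set depend on the Lucene version in use.

List<String> tokens = analysisResult(new StandardAnalyzer(), "Lucene in Action");
// with an English stop filter in place, "in" is dropped, leaving roughly [lucene, action]
System.out.println(tokens);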
static TokenStream tokenize(Reader reader, Tokenizer tokenizer) throws IOException {
    // Clear out any previously attached reader so the tokenizer can be reused,
    // then attach the new reader and reset the stream before it is consumed.
    tokenizer.close();
    tokenizer.end();
    tokenizer.setReader(reader);
    tokenizer.reset();
    return tokenizer;
}
private void assertTokenFilter(String name, Class<?> clazz) throws IOException {
    Settings settings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .build();
    TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get(name);
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("foo bar"));
    TokenStream stream = tokenFilter.create(tokenizer);
    assertThat(stream, instanceOf(clazz));
}
public void testFillerToken() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("shingle_filler");
    String source = "simon the sorcerer";
    String[] expected = new String[]{"simon FILLER", "simon FILLER sorcerer", "FILLER sorcerer"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("the"));
    assertTokenStreamContents(tokenFilter.create(stream), expected);
}
@Override
public SpanQuery getSpanQuery(Element e) throws ParserException {
    String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
    String value = DOMUtils.getNonBlankTextOrFail(e);

    List<SpanQuery> clausesList = new ArrayList<>();

    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream(fieldName, value);
        TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef();
        ts.reset();
        while (ts.incrementToken()) {
            termAtt.fillBytesRef();
            SpanTermQuery stq = new SpanTermQuery(new Term(fieldName, BytesRef.deepCopyOf(bytes)));
            clausesList.add(stq);
        }
        ts.end();
        SpanOrQuery soq = new SpanOrQuery(clausesList.toArray(new SpanQuery[clausesList.size()]));
        soq.setBoost(DOMUtils.getAttribute(e, "boost", 1.0f));
        return soq;
    } catch (IOException ioe) {
        throw new ParserException("IOException parsing value:" + value);
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}
@Override
public TokenStream create(TokenStream tokenStream) {
    if (version.onOrAfter(Version.LUCENE_4_4_0)) {
        return new DictionaryCompoundWordTokenFilter(tokenStream, wordList, minWordSize,
                minSubwordSize, maxSubwordSize, onlyLongestMatch);
    } else {
        return new Lucene43DictionaryCompoundWordTokenFilter(tokenStream, wordList, minWordSize,
                minSubwordSize, maxSubwordSize, onlyLongestMatch);
    }
}
private String[] termsFromTokenStream(TokenStream stream) throws IOException {
    List<String> outputTemp = new ArrayList<>();
    CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        outputTemp.add(charTermAttribute.toString());
    }
    stream.end();
    stream.close();

    return outputTemp.toArray(new String[0]);
}
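A hedged usage sketch; the WhitespaceAnalyzer no-argument constructor is assumed to be available in the Lucene version at hand.

Analyzer whitespace = new WhitespaceAnalyzer();
String[] terms = termsFromTokenStream(whitespace.tokenStream("f", "foo bar baz"));
// terms == {"foo", "bar", "baz"}; the helper consumes, ends, and closes the stream it is given
whitespace.close();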
@Test
public void test() throws IOException {
    TokenStream ts = analyzer.tokenStream("field", new StringReader("大大大战争"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken() call
    while (ts.incrementToken()) {
        System.out.println("token : " + term.toString());
    }
    ts.end();
    ts.close();
}
public PrefixAndSuffixAwareTokenFilter(TokenStream prefix, TokenStream input, TokenStream suffix) {
    super(suffix);
    prefix = new PrefixAwareTokenFilter(prefix, input) {
        @Override
        public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
            return PrefixAndSuffixAwareTokenFilter.this.updateInputToken(suffixToken, lastInputToken);
        }
    };
    this.suffix = new PrefixAwareTokenFilter(prefix, suffix) {
        @Override
        public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
            return PrefixAndSuffixAwareTokenFilter.this.updateSuffixToken(suffixToken, lastInputToken);
        }
    };
}
@Override
public TokenStream create(TokenStream tokenStream) {
    if (version.onOrAfter(Version.LUCENE_4_4)) {
        return new KeepWordFilter(tokenStream, keepWords);
    } else {
        @SuppressWarnings("deprecation")
        final TokenStream filter = new Lucene43KeepWordFilter(enablePositionIncrements, tokenStream, keepWords);
        return filter;
    }
}
/**
 * @param input    Source token stream
 * @param collator CollationKey generator
 */
public CollationKeyFilter(TokenStream input, Collator collator) {
    super(input);
    // Clone in case the JRE doesn't properly sync,
    // or to reduce contention in case it does.
    this.collator = (Collator) collator.clone();
}
@Override
public TokenStream create(TokenStream tokenStream) {
    if (version.onOrAfter(Version.LUCENE_4_4_0)) {
        return new TrimFilter(tokenStream);
    } else {
        @SuppressWarnings("deprecation")
        final TokenStream filter = new Lucene43TrimFilter(tokenStream, updateOffsets);
        return filter;
    }
}
@Override
public ShingleFilter create(TokenStream input) {
    ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize);
    r.setOutputUnigrams(outputUnigrams);
    r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    r.setTokenSeparator(tokenSeparator);
    r.setFillerToken(fillerToken);
    return r;
}
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream previous) {
    if (fieldType().indexOptions() != IndexOptions.NONE) {
        return getCachedStream().setDoubleValue(number);
    }
    return null;
}
public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    assertThat(termAttr, notNullValue());
    int i = 0;
    while (stream.incrementToken()) {
        assertThat(expected.length, greaterThan(i));
        assertThat("expected different term at index " + i, expected[i++], equalTo(termAttr.toString()));
    }
    assertThat("not all tokens produced", i, equalTo(expected.length));
}
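A hedged usage sketch for the assertion helper above; the analyzer is an assumption about the surrounding test class.

assertSimpleTSOutput(analyzer.tokenStream("field", "foo bar"), // 'analyzer' assumed in scope
        new String[]{"foo", "bar"});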
/**
 * @deprecated Use {@link #DictionaryCompoundWordTokenFilter(TokenStream,CharArraySet,int,int,int,boolean)}
 */
@Deprecated
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary,
        int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
    if (dictionary == null) {
        throw new IllegalArgumentException("dictionary cannot be null");
    }
}
/**
 * Count position increments in a token stream. Package private for testing.
 * @param analyzer analyzer to create token stream
 * @param fieldName field name to pass to analyzer
 * @param fieldValue field value to pass to analyzer
 * @return number of position increments in a token stream
 * @throws IOException if tokenStream throws it
 */
static int countPositions(Analyzer analyzer, String fieldName, String fieldValue) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, fieldValue)) {
        int count = 0;
        PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            count += position.getPositionIncrement();
        }
        tokenStream.end();
        count += position.getPositionIncrement();
        return count;
    }
}
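A hedged usage sketch; the concrete analyzer and value are assumptions. The point of summing increments is that removed stop words still advance the position even though they are never emitted.

// Assuming an analyzer whose stop filter drops "the": "quick" arrives with increment 2
// and "fox" with increment 1, so the count is 3 even though only two tokens are emitted.
int positions = countPositions(new StandardAnalyzer(), "body", "the quick fox");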
@Override
public TokenFilter create(TokenStream input) {
    if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_4_0)) {
        return new HyphenationCompoundWordTokenFilter(input, hyphenator, dictionary, minWordSize,
                minSubwordSize, maxSubwordSize, onlyLongestMatch);
    }
    return new Lucene43HyphenationCompoundWordTokenFilter(input, hyphenator, dictionary, minWordSize,
            minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
@Override
public PorterStemFilter create(TokenStream input) {
    return new PorterStemFilter(input);
}
@Override
public TokenStream create(TokenStream input) {
    return new GermanNormalizationFilter(input);
}
public CJKWidthFilter(TokenStream input) {
    super(input);
}
AllTokenStream(TokenStream input, AllEntries allEntries) {
    super(input);
    this.allEntries = allEntries;
    offsetAttribute = addAttribute(OffsetAttribute.class);
    payloadAttribute = addAttribute(PayloadAttribute.class);
}
@Override
public GreekLowerCaseFilter create(TokenStream in) {
    return new GreekLowerCaseFilter(in);
}