public static void assertTokenStream(TokenStream tokenStream, String[] expectedCharTerms, String[] expectedTypes, int[] expectedStartOffsets, int[] expectedEndOffsets) throws IOException { tokenStream.reset(); int index = 0; while (tokenStream.incrementToken()) { assertEquals(expectedCharTerms[index], tokenStream.getAttribute(CharTermAttribute.class).toString()); if (expectedTypes != null) { assertEquals(expectedTypes[index], tokenStream.getAttribute(TypeAttribute.class).type()); } OffsetAttribute offsets = tokenStream.getAttribute(OffsetAttribute.class); if (expectedStartOffsets != null) { assertEquals(expectedStartOffsets[index], offsets.startOffset()); } if (expectedEndOffsets != null) { assertEquals(expectedEndOffsets[index], offsets.endOffset()); } index++; } assertEquals(expectedCharTerms.length, index); tokenStream.end(); tokenStream.close(); }
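For illustration, a call to this helper against a plain WhitespaceTokenizer might look like the sketch below; the input text and expected values are made up for the example and are not taken from the original tests.

    Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("hello world"));
    assertTokenStream(tok,
        new String[] { "hello", "world" },
        null,                      // no type checks
        new int[] { 0, 6 },        // expected start offsets
        new int[] { 5, 11 });      // expected end offsets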
public static void main(String[] args) throws IOException { List<Term> parse = ToAnalysis.parse("中华人民 共和国 成立了 "); System.out.println(parse); List<Term> parse1 = IndexAnalysis.parse("你吃过饭了没有!!!!!吃过无妨论文"); //System.out.println(parse1); String text11 = "ZW321282050000000325"; Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text11), 0, true); CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class); OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class); PositionIncrementAttribute positionIncrementAtt = tokenizer.addAttribute(PositionIncrementAttribute.class); tokenizer.reset(); while (tokenizer.incrementToken()) { System.out.print(termAtt.toString() + " "); // System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-"); //System.out.print(positionIncrementAtt.getPositionIncrement() + "/"); } tokenizer.end(); tokenizer.close(); }
private String[] splitStringIntoTerms(String value) { try { List<String> results = new ArrayList<>(); try (TokenStream tokens = analyzer.tokenStream("", value)) { CharTermAttribute term = tokens.addAttribute(CharTermAttribute.class); tokens.reset(); while (tokens.incrementToken()) { String t = term.toString().trim(); if (t.length() > 0) { results.add(t); } } tokens.end(); } return results.toArray(new String[results.size()]); } catch (IOException e) { throw new MemgraphException("Could not tokenize string: " + value, e); } }
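A hypothetical call, assuming the enclosing class was configured with a StandardAnalyzer (the analyzer field is whatever the class was actually constructed with):

    // analyzer = new StandardAnalyzer() for this example
    String[] terms = splitStringIntoTerms("Quick Brown Fox");
    // -> ["quick", "brown", "fox"]: StandardAnalyzer lower-cases, and the helper drops blank tokens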
private List<TokenData> parse(String text) { NamedAnalyzer analyzer = getAnalysisService().indexAnalyzers.get("test"); try { try (TokenStream ts = analyzer.tokenStream("test", new StringReader(text))) { List<TokenData> result = new ArrayList<>(); CharTermAttribute charTerm = ts.addAttribute(CharTermAttribute.class); OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); PositionIncrementAttribute position = ts.addAttribute(PositionIncrementAttribute.class); ts.reset(); while (ts.incrementToken()) { String original = text.substring(offset.startOffset(), offset.endOffset()); result.add(token(original, charTerm.toString(), position.getPositionIncrement())); } ts.end(); return result; } } catch (IOException e) { throw new RuntimeException(e); } }
private static Query parseQueryString(ExtendedCommonTermsQuery query, Object queryString, String field, Analyzer analyzer, String lowFreqMinimumShouldMatch, String highFreqMinimumShouldMatch) throws IOException { // Logic similar to QueryParser#getFieldQuery try (TokenStream source = analyzer.tokenStream(field, queryString.toString())) { source.reset(); CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class); BytesRefBuilder builder = new BytesRefBuilder(); while (source.incrementToken()) { // UTF-8 builder.copyChars(termAtt); query.add(new Term(field, builder.toBytesRef())); } } query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch); query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch); return query; }
public void testSimple() throws IOException { Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false); return new TokenStreamComponents(t, new UniqueTokenFilter(t)); } }; TokenStream test = analyzer.tokenStream("test", "this test with test"); CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class); test.reset(); assertThat(test.incrementToken(), equalTo(true)); assertThat(termAttribute.toString(), equalTo("this")); assertThat(test.incrementToken(), equalTo(true)); assertThat(termAttribute.toString(), equalTo("test")); assertThat(test.incrementToken(), equalTo(true)); assertThat(termAttribute.toString(), equalTo("with")); assertThat(test.incrementToken(), equalTo(false)); test.end(); test.close(); }
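UniqueTokenFilter itself is not shown in this collection; a minimal sketch of a duplicate-removing filter that would satisfy the assertions above (the real Elasticsearch implementation may differ) could look like this, using only standard Lucene classes:

    final class SimpleUniqueTokenFilter extends TokenFilter {
        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        private final CharArraySet seen = new CharArraySet(128, false);

        SimpleUniqueTokenFilter(TokenStream in) { super(in); }

        @Override
        public boolean incrementToken() throws IOException {
            while (input.incrementToken()) {
                if (!seen.contains(termAtt.buffer(), 0, termAtt.length())) {
                    seen.add(termAtt.toString());   // first occurrence: keep it
                    return true;
                }
                // duplicate: skip it and keep pulling from the input
            }
            return false;
        }

        @Override
        public void reset() throws IOException {
            super.reset();
            seen.clear();
        }
    }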
private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException { IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings); AnalysisModule analysisModule = new AnalysisModule(new Environment(settings), singletonList(new AnalysisPlugin() { @Override public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() { return singletonMap("myfilter", MyFilterTokenFilterFactory::new); } })); IndexAnalyzers indexAnalyzers = analysisModule.getAnalysisRegistry().build(idxSettings); Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer(); TokenStream stream = AllTokenStream.allTokenStream("_all", text, 1.0f, analyzer); CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); stream.reset(); List<String> terms = new ArrayList<>(); while (stream.incrementToken()) { terms.add(termAtt.toString()); } stream.end(); stream.close(); return terms; }
@Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new Tokenizer() { boolean incremented = false; CharTermAttribute term = addAttribute(CharTermAttribute.class); @Override public boolean incrementToken() throws IOException { if (incremented) { return false; } clearAttributes(); term.setEmpty().append(output); incremented = true; return true; } @Override public void reset() throws IOException { super.reset(); incremented = false; } }; return new TokenStreamComponents(tokenizer); }
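A small usage sketch for such a single-token analyzer; the FixedOutputAnalyzer name and its output field are assumptions standing in for whatever class actually declares this createComponents:

    try (Analyzer fixed = new FixedOutputAnalyzer("canonical-value");            // hypothetical class
         TokenStream ts = fixed.tokenStream("field", "any input is ignored")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term);   // prints the single fixed token
        }
        ts.end();
    }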
private void testToken(String source, String expected) throws IOException { Index index = new Index("test", "_na_"); Settings settings = Settings.builder() .put("index.analysis.filter.myStemmer.type", "polish_stem") .build(); TestAnalysis analysis = createTestAnalysis(index, settings, new AnalysisStempelPlugin()); TokenFilterFactory filterFactory = analysis.tokenFilter.get("myStemmer"); Tokenizer tokenizer = new KeywordTokenizer(); tokenizer.setReader(new StringReader(source)); TokenStream ts = filterFactory.create(tokenizer); CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class); ts.reset(); assertThat(ts.incrementToken(), equalTo(true)); assertThat(term1.toString(), equalTo(expected)); }
private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException { CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class); CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class); stream1.reset(); stream2.reset(); assertThat(stream1.incrementToken(), equalTo(true)); assertThat(stream2.incrementToken(), equalTo(true)); assertThat(Integer.signum(term1.toString().compareTo(term2.toString())), equalTo(Integer.signum(comparison))); assertThat(stream1.incrementToken(), equalTo(false)); assertThat(stream2.incrementToken(), equalTo(false)); stream1.end(); stream2.end(); stream1.close(); stream2.close(); }
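A sketch of how this assertion might be driven with Lucene's CollationKeyAnalyzer (constructor signatures differ slightly between Lucene versions, so treat this as illustrative); two analyzer instances are used so that both token streams can be open at the same time:

    Collator collator = Collator.getInstance(Locale.GERMAN);
    Analyzer left = new CollationKeyAnalyzer(collator);
    Analyzer right = new CollationKeyAnalyzer(collator);
    assertCollation(
        left.tokenStream("field", "Tübingen"),
        right.tokenStream("field", "Tubingen"),
        collator.compare("Tübingen", "Tubingen"));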
/** * analyzes string like the given field * @param field the name of the field * @param value the string to analyze * @return the analyzed string */ public static String analyzeString(SolrCore core, String field, String value) { try { StringBuilder b = new StringBuilder(); try (TokenStream ts = core.getLatestSchema().getFieldType(field).getQueryAnalyzer().tokenStream(field, new StringReader(value))) { CharTermAttribute attr = ts.addAttribute(CharTermAttribute.class); ts.reset(); while (ts.incrementToken()) { b.append(" "); b.append(attr); } ts.end(); } return b.toString().trim(); } catch (IOException e) { //FIXME: This error should be properly logged! e.printStackTrace(); return value; } }
@After public void after(){ if(analyzer != null){ try { TokenStream ts = analyzer.tokenStream("field", text); CharTermAttribute ch = ts.addAttribute(CharTermAttribute.class); ts.reset(); int i = 0; while (ts.incrementToken()) { i++; System.out.print(ch.toString() + "\t"); if(i % 7 == 0){ System.out.println(); } } ts.end(); ts.close(); } catch (IOException e) { e.printStackTrace(); } } }
private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) throws IOException { StringReader reader = new StringReader(source); TokenStream ts = loadTokenizer(tokFactory, reader); List<String> tokList = new ArrayList<>(); try { CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); while (ts.incrementToken()) { if (termAtt.length() > 0) { tokList.add(termAtt.toString()); } } ts.end(); } finally { ts.close(); reader.close(); } return tokList; }
@Override public boolean accept(AttributeSource source) { if (termAtt == null) { termAtt = source.addAttribute(CharTermAttribute.class); } try { Date date = dateFormat.parse(termAtt.toString());//We don't care about the date, just that we can parse it as a date if (date != null) { return true; } } catch (ParseException e) { } return false; }
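This accept(AttributeSource) method can be exercised directly with a standalone AttributeSource, as in the sketch below; the enclosing class name and its constructor are assumptions, so adapt them to whatever class actually holds this method and its dateFormat field:

    // hypothetical construction of the enclosing sink filter
    DateRecognizerSinkFilter filter =
        new DateRecognizerSinkFilter(new SimpleDateFormat("yyyy-MM-dd", Locale.ROOT));
    AttributeSource source = new AttributeSource();
    CharTermAttribute term = source.addAttribute(CharTermAttribute.class);
    term.setEmpty().append("2015-03-17");
    System.out.println(filter.accept(source));   // true: the term parses as a date
    term.setEmpty().append("not-a-date");
    System.out.println(filter.accept(source));   // false: the ParseException is swallowed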
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) { super(suffix); this.suffix = suffix; this.prefix = prefix; prefixExhausted = false; termAtt = addAttribute(CharTermAttribute.class); posIncrAtt = addAttribute(PositionIncrementAttribute.class); payloadAtt = addAttribute(PayloadAttribute.class); offsetAtt = addAttribute(OffsetAttribute.class); typeAtt = addAttribute(TypeAttribute.class); flagsAtt = addAttribute(FlagsAttribute.class); p_termAtt = prefix.addAttribute(CharTermAttribute.class); p_posIncrAtt = prefix.addAttribute(PositionIncrementAttribute.class); p_payloadAtt = prefix.addAttribute(PayloadAttribute.class); p_offsetAtt = prefix.addAttribute(OffsetAttribute.class); p_typeAtt = prefix.addAttribute(TypeAttribute.class); p_flagsAtt = prefix.addAttribute(FlagsAttribute.class); }
static private void assertTokenStream(TokenStream tokenStream, List<String> expected) { try { List<String> termList = new ArrayList<String>(); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { termList.add(charTermAttribute.toString()); } tokenStream.end(); tokenStream.close(); System.out.println(String.join(" ", termList)); assertThat(termList, is(expected)); } catch (IOException e) { fail("could not consume token stream: " + e.getMessage()); } }
private final Query parseQueryString(ExtendedCommonTermsQuery query, String queryString, String field, QueryParseContext parseContext, Analyzer analyzer, String lowFreqMinimumShouldMatch, String highFreqMinimumShouldMatch) throws IOException { // Logic similar to QueryParser#getFieldQuery int count = 0; try (TokenStream source = analyzer.tokenStream(field, queryString.toString())) { source.reset(); CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class); BytesRefBuilder builder = new BytesRefBuilder(); while (source.incrementToken()) { // UTF-8 builder.copyChars(termAtt); query.add(new Term(field, builder.toBytesRef())); count++; } } if (count == 0) { return null; } query.setLowFreqMinimumNumberShouldMatch(lowFreqMinimumShouldMatch); query.setHighFreqMinimumNumberShouldMatch(highFreqMinimumShouldMatch); return query; }
protected String lemmatize(String query) { ItalianAnalyzer analyzer = new ItalianAnalyzer(); StringBuilder sb = new StringBuilder(); try (TokenStream tokenStream = analyzer.tokenStream("label", query)) { CharTermAttribute token = tokenStream.addAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { if (sb.length() > 0) { sb.append(" "); } sb.append(token.toString()); } tokenStream.end(); } catch (IOException e) { e.printStackTrace(); } finally { analyzer.close(); } return sb.toString(); }
private Set<String> analyze(String text) throws IOException { Set<String> result = new HashSet<String>(); Analyzer analyzer = configuration.getAnalyzer(); try (TokenStream ts = analyzer.tokenStream("", text)) { CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); ts.reset(); while (ts.incrementToken()) { int length = termAtt.length(); if (length == 0) { throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); } if (posIncAtt.getPositionIncrement() != 1) { throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1"); } result.add(new String(termAtt.buffer(), 0, termAtt.length())); } ts.end(); return result; } }
public static void stemHinglish(CharTermAttribute termAtt) { String strInput = termAtt.toString(); //System.out.println("Before " + strInput); Iterator itr = lsRegexs.iterator(); while (itr.hasNext()) { List<Object> lsInputs = (List<Object>) itr.next(); Matcher matcher = ((Pattern) lsInputs.get(0)).matcher(strInput); if (matcher.matches()) { Matcher replMatcher = ((Pattern) lsInputs.get(1)).matcher(strInput); strInput = replMatcher.replaceAll((String) lsInputs.get(2)); } } //strInput = strInput.trim(); /* write the result back through the attribute API: copying into termAtt.buffer() directly can overflow if a replacement made the term longer */ termAtt.setEmpty().append(strInput); //System.out.println("After " + strInput + " " + termAtt.toString()); }
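To use stemHinglish inside an analysis chain, it could be wrapped in a small TokenFilter like the sketch below; the filter class is illustrative and not part of the original code, and stemHinglish is assumed to be visible from the same class or via a static import:

    final class HinglishStemFilter extends TokenFilter {
        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

        HinglishStemFilter(TokenStream in) { super(in); }

        @Override
        public boolean incrementToken() throws IOException {
            if (!input.incrementToken()) {
                return false;
            }
            stemHinglish(termAtt);   // rewrites the term text in place
            return true;
        }
    }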
@Override public boolean incrementToken() throws IOException { if (savePrevToken()) return true; // Reached the end of the token stream being processed if ( ! this.input.incrementToken()) { return false; } // Get text of the current token and remove any leading/trailing whitespace. String currToken = this.input.getAttribute(CharTermAttribute.class).toString().trim(); if (! hebTokAttribute.isExact()) { List<String> prefixedStrippedList = getPrefixStrippedList(currToken); for (String preStrip : prefixedStrippedList) { previousTokens.add(preStrip); } savePrevToken(); } return true; }
@Override public boolean incrementToken() throws IOException { // Reached the end of the token stream being processed if ( ! this.input.incrementToken()) { return false; } String currToken = this.input.getAttribute(CharTermAttribute.class).toString().trim(); if ( ! this.hebTokAttribute.isExact()) { this.charTermAttribute.setEmpty().append(this.slr.filterStopLetters(currToken)); this.hebTokAttribute.setExact(false); } return true; }
@Override public boolean incrementToken() throws IOException { if (savePrevToken()) return true; // Reached the end of the token stream being processed if ( ! this.input.incrementToken()) { return false; } // Get text of the current token and remove any leading/trailing whitespace. String currToken = this.input.getAttribute(CharTermAttribute.class).toString().trim(); if (! hebTokAttribute.isExact()) { List<String> ngrams = ngramizer.ngramize(currToken); for (String ngram : ngrams) { previousTokens.add(ngram); } savePrevToken(); } return true; }
@Override public boolean incrementToken() throws IOException { // Reached the end of the token stream being processed if ( ! this.input.incrementToken()) { return false; } String currToken = this.input.getAttribute(CharTermAttribute.class).toString().trim(); if ( ! this.hebTokAttribute.isExact()) { this.charTermAttribute.setEmpty().append(replaceSofits(currToken)); this.hebTokAttribute.setExact(false); } return true; }
@Override public boolean incrementToken() throws IOException { // Reached the end of the token stream being processed if ( ! this.input.incrementToken()) { return false; } String currToken = this.input.getAttribute(CharTermAttribute.class).toString().trim(); if ( ! this.hebTokAttribute.isExact()) { this.charTermAttribute.setEmpty().append(filterStopLetters(currToken)); this.hebTokAttribute.setExact(false); } return true; }
private String removeStopWords(String action) { StringBuilder builder = new StringBuilder(); try (FrenchAnalyzer frenchAnalyzer = new FrenchAnalyzer(); TokenStream tokenStream = frenchAnalyzer.tokenStream("contents", action)) { CharTermAttribute attribute = tokenStream.addAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { builder.append(attribute.toString()).append(' '); } tokenStream.end(); } catch (IOException e) { e.printStackTrace(); } return builder.toString(); }
/** * Parses the query. Using this instead of a QueryParser in order * to avoid thread-safety issues with Lucene's query parser. * * @param fieldName the name of the field * @param value the value of the field * @return the parsed query */ private Query parseTokens(String fieldName, String value) { BooleanQuery searchQuery = new BooleanQuery(); if (value != null) { Analyzer analyzer = new KeywordAnalyzer(); try (TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(value))) { tokenStream.reset(); CharTermAttribute attr = tokenStream.getAttribute(CharTermAttribute.class); while (tokenStream.incrementToken()) { String term = attr.toString(); Query termQuery = new TermQuery(new Term(fieldName, term)); searchQuery.add(termQuery, Occur.SHOULD); } tokenStream.end(); } catch (IOException e) { throw new DukeException("Error parsing input string '" + value + "' " + "in field " + fieldName); } } return searchQuery; }
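Note that BooleanQuery stopped being mutable in Lucene 5.3; on newer versions the same loop would collect clauses through BooleanQuery.Builder, roughly as sketched here:

    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    // ... inside the token loop:
    builder.add(new TermQuery(new Term(fieldName, attr.toString())), Occur.SHOULD);
    // ... after the loop:
    BooleanQuery searchQuery = builder.build();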
/** * Lucene constructor * * @throws UnirestException * @throws JSONException * @throws IOException */ public LTPTokenizer(Set<String> filter) throws IOException, JSONException, UnirestException { super(); logger.info("LTPTokenizer Initialize......"); // Add token offset attribute offsetAttr = addAttribute(OffsetAttribute.class); // Add token content attribute charTermAttr = addAttribute(CharTermAttribute.class); // Add token type attribute typeAttr = addAttribute(TypeAttribute.class); // Add token position attribute piAttr = addAttribute(PositionIncrementAttribute.class); // Create a new word segmenter to get tokens LTPSeg = new LTPWordSegmenter(input); // Add filter words set this.filter = filter; }
@Test public void testSearch() throws IOException { LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search); TokenStream tokenStream = analyzer.tokenStream("lc", "重qing"); CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class); PositionIncrementAttribute positionIncrementAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class); tokenStream.reset(); Assert.assertTrue(tokenStream.incrementToken()); Assert.assertEquals(charTermAttribute.toString(), "重"); Assert.assertEquals(offsetAttribute.startOffset(), 0); Assert.assertEquals(offsetAttribute.endOffset(), 1); Assert.assertEquals(positionIncrementAttribute.getPositionIncrement(), 1); Assert.assertTrue(tokenStream.incrementToken()); Assert.assertEquals(charTermAttribute.toString(), "qing"); Assert.assertEquals(offsetAttribute.startOffset(), 1); Assert.assertEquals(offsetAttribute.endOffset(), 5); Assert.assertEquals(positionIncrementAttribute.getPositionIncrement(), 1); tokenStream.close(); }
public void testFullPinyinFilter() throws IOException { LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search); TokenStream tokenStream = analyzer.tokenStream("lc", "作者 : 陈楠"); LcPinyinTokenFilter lcPinyinTokenFilter = new LcPinyinTokenFilter(tokenStream, PinyinFilterSetting.full_pinyin); CharTermAttribute charTermAttribute = lcPinyinTokenFilter.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAttribute = lcPinyinTokenFilter.getAttribute(OffsetAttribute.class); PositionIncrementAttribute positionIncrementAttribute = lcPinyinTokenFilter.getAttribute(PositionIncrementAttribute.class); lcPinyinTokenFilter.reset(); while (lcPinyinTokenFilter.incrementToken()) { System.out.println(charTermAttribute.toString() + ":" + offsetAttribute.startOffset() + "," + offsetAttribute.endOffset() + ":" + positionIncrementAttribute.getPositionIncrement()); } lcPinyinTokenFilter.close(); }
public void testFirstLetterFilter() throws IOException { LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search); TokenStream tokenStream = analyzer.tokenStream("lc", "作者 : 陈楠"); LcPinyinTokenFilter lcPinyinTokenFilter = new LcPinyinTokenFilter(tokenStream, PinyinFilterSetting.first_letter); CharTermAttribute charTermAttribute = lcPinyinTokenFilter.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAttribute = lcPinyinTokenFilter.getAttribute(OffsetAttribute.class); PositionIncrementAttribute positionIncrementAttribute = lcPinyinTokenFilter.getAttribute(PositionIncrementAttribute.class); lcPinyinTokenFilter.reset(); while (lcPinyinTokenFilter.incrementToken()) { System.out.println(charTermAttribute.toString() + ":" + offsetAttribute.startOffset() + "," + offsetAttribute.endOffset() + ":" + positionIncrementAttribute.getPositionIncrement()); } lcPinyinTokenFilter.close(); }
public WrappedTokenStream(TokenStream tokenStream, String pText) { this.pText = pText; this.tokenStream = tokenStream; if(tokenStream.hasAttribute(CharTermAttribute.class)) { charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class); } if(tokenStream.hasAttribute(OffsetAttribute.class)) { offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class); } if(tokenStream.hasAttribute(CharsRefTermAttribute.class)) { charsRefTermAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class); } if(tokenStream.hasAttribute(AdditionalTermAttribute.class)) { additionalTermAttribute = tokenStream.getAttribute(AdditionalTermAttribute.class); } additionalTermAttributeLocal.init(this); }
@Test public void test() throws IOException { PrimaryWordAnalyzer analyzer = new PrimaryWordAnalyzer(); String text = "서울 지하철(300만명)"; TokenStream tokenStream = analyzer.tokenStream("", new StringReader(text)); CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class); tokenStream.reset(); assertTrue(tokenStream.incrementToken()); assertEquals("서울", charTermAttribute.toString()); assertTrue(tokenStream.incrementToken()); assertEquals("지하철", charTermAttribute.toString()); assertTrue(tokenStream.incrementToken()); assertEquals("300", charTermAttribute.toString()); assertTrue(tokenStream.incrementToken()); assertEquals("만명", charTermAttribute.toString()); assertFalse(tokenStream.incrementToken()); tokenStream.end(); tokenStream.close(); }
@Test public void testBulk() throws IOException { String str = "SK, 하이닉스"; //str = "하이닉스"; StringReader input = new StringReader(str); CSVAnalyzer analyzer = new CSVAnalyzer(); TokenStream tokenStream = analyzer.tokenStream("", input); tokenStream.reset(); logger.debug("tokenStream:{}", tokenStream); CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class); for (int inx = 0; tokenStream.incrementToken(); inx++) { String term = charTermAttribute.toString(); logger.debug("[{}] \"{}\" {}~{}", inx, term, offsetAttribute.startOffset(), offsetAttribute.endOffset()); } tokenStream.end(); tokenStream.close(); analyzer.close(); }