@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("dark sea green sea green"));

    final SynonymMap.Builder builder = new SynonymMap.Builder(true);
    addSynonym("dark sea green", "color", builder);
    addSynonym("green", "color", builder);
    addSynonym("dark sea", "color", builder);
    addSynonym("sea green", "color", builder);
    final SynonymMap synMap = builder.build();

    final TokenStream ts = new SynonymFilter(tok, synMap, true);
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class);

    ts.reset();
    int pos = -1;
    while (ts.incrementToken()) {
        pos += posIncrAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + pos + ", posLen=" + posLengthAtt.getPositionLength());
    }
    ts.end();
    ts.close();
}
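// The addSynonym helper called above is not shown in this snippet. A minimal
// sketch of what it presumably does, mapping a (possibly multi-word) input onto
// a single output term; the name and signature are assumptions, not the
// original implementation:
private static void addSynonym(String input, String output, SynonymMap.Builder builder) {
    // SynonymMap.Builder.join encodes multi-word entries with SynonymMap's word separator.
    CharsRef inputRef = SynonymMap.Builder.join(input.split(" "), new CharsRefBuilder());
    CharsRef outputRef = SynonymMap.Builder.join(output.split(" "), new CharsRefBuilder());
    builder.add(inputRef, outputRef, true); // includeOrig = true keeps the original tokens
}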
public static void main(String[] args) throws IOException {
    List<Term> parse = ToAnalysis.parse("中华人民 共和国 成立了 ");
    System.out.println(parse);
    List<Term> parse1 = IndexAnalysis.parse("你吃过饭了没有!!!!!吃过无妨论文");
    //System.out.println(parse1);

    String text11 = "ZW321282050000000325";
    Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text11), 0, true);
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);

    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        System.out.print(termAtt.toString() + " ");
        // System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
        // System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
    }
    tokenizer.end();
    tokenizer.close();
}
private List<TokenData> parse(String text) {
    NamedAnalyzer analyzer = getAnalysisService().indexAnalyzers.get("test");
    try (TokenStream ts = analyzer.tokenStream("test", new StringReader(text))) {
        List<TokenData> result = new ArrayList<>();
        CharTermAttribute charTerm = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute position = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String original = text.substring(offset.startOffset(), offset.endOffset());
            result.add(token(original, charTerm.toString(), position.getPositionIncrement()));
        }
        ts.end();
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) {
    super(suffix);
    this.suffix = suffix;
    this.prefix = prefix;
    prefixExhausted = false;

    // Attributes of the suffix stream (this filter's own attribute source)
    termAtt = addAttribute(CharTermAttribute.class);
    posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    payloadAtt = addAttribute(PayloadAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
    flagsAtt = addAttribute(FlagsAttribute.class);

    // Mirror attributes registered on the prefix stream
    p_termAtt = prefix.addAttribute(CharTermAttribute.class);
    p_posIncrAtt = prefix.addAttribute(PositionIncrementAttribute.class);
    p_payloadAtt = prefix.addAttribute(PayloadAttribute.class);
    p_offsetAtt = prefix.addAttribute(OffsetAttribute.class);
    p_typeAtt = prefix.addAttribute(TypeAttribute.class);
    p_flagsAtt = prefix.addAttribute(FlagsAttribute.class);
}
private Set<String> analyze(String text) throws IOException {
    Set<String> result = new HashSet<String>();
    Analyzer analyzer = configuration.getAnalyzer();
    try (TokenStream ts = analyzer.tokenStream("", text)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            int length = termAtt.length();
            if (length == 0) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
            }
            if (posIncAtt.getPositionIncrement() != 1) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
            }
            result.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
        ts.end();
        return result;
    }
}
/**
 * Lucene constructor
 *
 * @throws UnirestException
 * @throws JSONException
 * @throws IOException
 */
public LTPTokenizer(Set<String> filter)
        throws IOException, JSONException, UnirestException {
    super();
    logger.info("LTPTokenizer Initialize......");
    // Add token offset attribute
    offsetAttr = addAttribute(OffsetAttribute.class);
    // Add token content attribute
    charTermAttr = addAttribute(CharTermAttribute.class);
    // Add token type attribute
    typeAttr = addAttribute(TypeAttribute.class);
    // Add token position attribute
    piAttr = addAttribute(PositionIncrementAttribute.class);
    // Create a new word segmenter to get tokens
    LTPSeg = new LTPWordSegmenter(input);
    // Add filter words set
    this.filter = filter;
}
@Test
public void testSearch() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "重qing");
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

    tokenStream.reset();

    // JUnit's assertEquals takes the expected value first.
    Assert.assertTrue(tokenStream.incrementToken());
    Assert.assertEquals("重", charTermAttribute.toString());
    Assert.assertEquals(0, offsetAttribute.startOffset());
    Assert.assertEquals(1, offsetAttribute.endOffset());
    Assert.assertEquals(1, positionIncrementAttribute.getPositionIncrement());

    Assert.assertTrue(tokenStream.incrementToken());
    Assert.assertEquals("qing", charTermAttribute.toString());
    Assert.assertEquals(1, offsetAttribute.startOffset());
    Assert.assertEquals(5, offsetAttribute.endOffset());
    Assert.assertEquals(1, positionIncrementAttribute.getPositionIncrement());

    tokenStream.end();
    tokenStream.close();
}
public void testFullPinyinFilter() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "作者 : 陈楠");
    LcPinyinTokenFilter lcPinyinTokenFilter = new LcPinyinTokenFilter(tokenStream, PinyinFilterSetting.full_pinyin);
    CharTermAttribute charTermAttribute = lcPinyinTokenFilter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = lcPinyinTokenFilter.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = lcPinyinTokenFilter.getAttribute(PositionIncrementAttribute.class);

    lcPinyinTokenFilter.reset();
    while (lcPinyinTokenFilter.incrementToken()) {
        System.out.println(charTermAttribute.toString() + ":" + offsetAttribute.startOffset() + ","
                + offsetAttribute.endOffset() + ":" + positionIncrementAttribute.getPositionIncrement());
    }
    lcPinyinTokenFilter.end();
    lcPinyinTokenFilter.close();
}
public void testFirstLetterFilter() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "作者 : 陈楠");
    LcPinyinTokenFilter lcPinyinTokenFilter = new LcPinyinTokenFilter(tokenStream, PinyinFilterSetting.first_letter);
    CharTermAttribute charTermAttribute = lcPinyinTokenFilter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = lcPinyinTokenFilter.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = lcPinyinTokenFilter.getAttribute(PositionIncrementAttribute.class);

    lcPinyinTokenFilter.reset();
    while (lcPinyinTokenFilter.incrementToken()) {
        System.out.println(charTermAttribute.toString() + ":" + offsetAttribute.startOffset() + ","
                + offsetAttribute.endOffset() + ":" + positionIncrementAttribute.getPositionIncrement());
    }
    lcPinyinTokenFilter.end();
    lcPinyinTokenFilter.close();
}
@Override
public void copyTo(AttributeImpl target) {
    if (target instanceof Token) {
        final Token to = (Token) target;
        to.reinit(this);
        // reinit shares the payload, so clone it:
        if (payload != null) {
            to.payload = payload.clone();
        }
    } else {
        super.copyTo(target);
        ((OffsetAttribute) target).setOffset(startOffset, endOffset);
        ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
        ((PayloadAttribute) target).setPayload((payload == null) ? null : payload.clone());
        ((FlagsAttribute) target).setFlags(flags);
        ((TypeAttribute) target).setType(type);
    }
}
@Test
public void testShorthand2() throws IOException {
    JsonReferencePayloadTokenizer tokenizer = new JsonReferencePayloadTokenizer();
    tokenizer.setReader(new StringReader("{\"filing\": \"something\", \"prefix\": \"The \"}"));
    tokenizer.reset();

    assertTrue(tokenizer.incrementToken());
    assertEquals("something", tokenizer.getAttribute(CharTermAttribute.class).toString());
    assertEquals(JsonReferencePayloadTokenizer.TYPE_FILING, tokenizer.getAttribute(TypeAttribute.class).type());
    assertEquals(1, tokenizer.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
    assertNull(tokenizer.getAttribute(PayloadAttribute.class).getPayload());

    assertTrue(tokenizer.incrementToken());
    assertEquals("The ", tokenizer.getAttribute(CharTermAttribute.class).toString());
    assertEquals(JsonReferencePayloadTokenizer.TYPE_PREFIX, tokenizer.getAttribute(TypeAttribute.class).type());
    assertEquals(0, tokenizer.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
    assertNull(tokenizer.getAttribute(PayloadAttribute.class).getPayload());

    assertFalse(tokenizer.incrementToken());
}
@Test
public void testShorthand3() throws IOException {
    JsonReferencePayloadTokenizer tokenizer = new JsonReferencePayloadTokenizer();
    tokenizer.setReader(new StringReader("{\"prefix\": \"The \", \"filing\": \"something\"}"));
    tokenizer.reset();

    assertTrue(tokenizer.incrementToken());
    assertEquals("something", tokenizer.getAttribute(CharTermAttribute.class).toString());
    assertEquals(JsonReferencePayloadTokenizer.TYPE_FILING, tokenizer.getAttribute(TypeAttribute.class).type());
    assertEquals(1, tokenizer.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
    assertNull(tokenizer.getAttribute(PayloadAttribute.class).getPayload());

    assertTrue(tokenizer.incrementToken());
    assertEquals("The ", tokenizer.getAttribute(CharTermAttribute.class).toString());
    assertEquals(JsonReferencePayloadTokenizer.TYPE_PREFIX, tokenizer.getAttribute(TypeAttribute.class).type());
    assertEquals(0, tokenizer.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
    assertNull(tokenizer.getAttribute(PayloadAttribute.class).getPayload());

    assertFalse(tokenizer.incrementToken());
}
public static MyToken[] tokensFromAnalysis(Analyzer analyzer, String text, String field) throws IOException {
    TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute positionIncrementAttr = stream.addAttribute(PositionIncrementAttribute.class);
    TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class);
    OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class);

    ArrayList<MyToken> tokenList = new ArrayList<MyToken>();
    stream.reset(); // the TokenStream contract requires reset() before the first incrementToken()
    while (stream.incrementToken()) {
        tokenList.add(new MyToken(term.toString(), positionIncrementAttr.getPositionIncrement(),
                typeAttr.type(), offsetAttr.startOffset(), offsetAttr.endOffset()));
    }
    stream.end();
    stream.close();
    return tokenList.toArray(new MyToken[0]);
}
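// MyToken is referenced above but not defined in this snippet. A minimal sketch
// of a value class that would satisfy the constructor calls; the field names are
// assumptions, not the original definition:
public class MyToken {
    public final String term;
    public final int positionIncrement;
    public final String type;
    public final int startOffset;
    public final int endOffset;

    public MyToken(String term, int positionIncrement, String type, int startOffset, int endOffset) {
        this.term = term;
        this.positionIncrement = positionIncrement;
        this.type = type;
        this.startOffset = startOffset;
        this.endOffset = endOffset;
    }
}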
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();

    Analyzer analyzer = new HanLPAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(),
                positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
public void testIssue() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLPTokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();

    String text = "會辦台星保證最低價的原因?";
    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(),
                positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();

    Analyzer analyzer = new HanLPIndexAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(),
                positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
public void testDups(final String expected, final Token... tokens) throws Exception {
    final Iterator<Token> toks = Arrays.asList(tokens).iterator();
    final TokenStream ts = new RemoveDuplicatesTokenFilter(
        new TokenStream() {
            CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
            PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

            @Override
            public boolean incrementToken() {
                if (toks.hasNext()) {
                    clearAttributes();
                    Token tok = toks.next();
                    termAtt.setEmpty().append(tok);
                    offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
                    posIncAtt.setPositionIncrement(tok.getPositionIncrement());
                    return true;
                } else {
                    return false;
                }
            }
        });
    assertTokenStreamContents(ts, expected.split("\\s"));
}
public void assertEquals(String s, TokenStream left, TokenStream right) throws Exception {
    left.reset();
    right.reset();
    CharTermAttribute leftTerm = left.addAttribute(CharTermAttribute.class);
    CharTermAttribute rightTerm = right.addAttribute(CharTermAttribute.class);
    OffsetAttribute leftOffset = left.addAttribute(OffsetAttribute.class);
    OffsetAttribute rightOffset = right.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute leftPos = left.addAttribute(PositionIncrementAttribute.class);
    PositionIncrementAttribute rightPos = right.addAttribute(PositionIncrementAttribute.class);

    while (left.incrementToken()) {
        assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
        assertEquals("wrong term text for input: " + s, leftTerm.toString(), rightTerm.toString());
        assertEquals("wrong position for input: " + s, leftPos.getPositionIncrement(), rightPos.getPositionIncrement());
        assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
        assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
    }
    assertFalse("wrong number of tokens for input: " + s, right.incrementToken());

    left.end();
    right.end();
    assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());

    left.close();
    right.close();
}
private void doTestStopPositions(StopFilter stpf, boolean enableIncrements) throws IOException {
    log("---> test with enable-increments-" + (enableIncrements ? "enabled" : "disabled"));
    stpf.setEnablePositionIncrements(enableIncrements);
    CharTermAttribute termAtt = stpf.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
    stpf.reset();
    for (int i = 0; i < 20; i += 3) {
        assertTrue(stpf.incrementToken());
        log("Token " + i + ": " + stpf);
        String w = English.intToEnglish(i).trim();
        assertEquals("expecting token " + i + " to be " + w, w, termAtt.toString());
        assertEquals("all but first token must have position increment of 3",
                enableIncrements ? (i == 0 ? 1 : 3) : 1, posIncrAtt.getPositionIncrement());
    }
    assertFalse(stpf.incrementToken());
    stpf.end();
    stpf.close();
}
public void testStopListPositions() throws IOException {
    CharArraySet stopWordsSet = new CharArraySet(asSet("good", "test", "analyzer"), false);
    StopAnalyzer newStop = new StopAnalyzer(stopWordsSet);
    String s = "This is a good test of the english stop analyzer with positions";
    int[] expectedIncr = {1, 1, 1, 3, 1, 1, 1, 2, 1};
    TokenStream stream = newStop.tokenStream("test", s);
    try {
        assertNotNull(stream);
        int i = 0;
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String text = termAtt.toString();
            assertFalse(stopWordsSet.contains(text));
            assertEquals(expectedIncr[i++], posIncrAtt.getPositionIncrement());
        }
        stream.end();
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
}
public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
    PhraseQuery q = new PhraseQuery();
    TokenStream ts = analyzer.tokenStream("content", "this sentence");
    try {
        int j = -1;
        PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            j += posIncrAtt.getPositionIncrement();
            String termText = termAtt.toString();
            q.add(new Term("content", termText), j);
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }

    ScoreDoc[] hits = searcher.search(q, null, 1000).scoreDocs;
    int[] ranks = new int[] { 0 };
    compareRanks(hits, ranks);
}
public void testFilterTokens() throws Exception {
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
    FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);

    filter.incrementToken();

    // The stemmer should rewrite the term while passing every other attribute through untouched.
    assertEquals("accent", termAtt.toString());
    assertEquals(2, offsetAtt.startOffset());
    assertEquals(7, offsetAtt.endOffset());
    assertEquals("wrd", typeAtt.type());
    assertEquals(3, posIncAtt.getPositionIncrement());
    assertEquals(77, flagsAtt.getFlags());
    assertEquals(new BytesRef(new byte[]{0, 1, 2, 3}), payloadAtt.getPayload());
}
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
    TokenStream stream = analyzer.tokenStream("", text);
    // TODO: support custom attributes
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
        Token token = new Token();
        token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
        token.setFlags(flagsAttValue); // overwriting any flags already set...
        token.setType(typeAtt.type());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        result.add(token);
    }
    stream.end();
    stream.close();
}
private CharsRef analyze(Analyzer analyzer, String text) throws IOException {
    CharsRefBuilder charsRefBuilder = new CharsRefBuilder();
    try (TokenStream ts = analyzer.tokenStream("", text)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            int length = termAtt.length();
            if (length == 0) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
            }
            charsRefBuilder.grow(charsRefBuilder.length() + length + 1); /* current + word + separator */
            if (charsRefBuilder.length() > 0) {
                charsRefBuilder.append(CcWordSet.WORD_SEPARATOR);
            }
            charsRefBuilder.append(termAtt);
        }
        ts.end();
    }
    if (charsRefBuilder.length() == 0) {
        return null;
    }
    charsRefBuilder.append(CcWordSet.WORD_END);
    return charsRefBuilder.get();
}
private void handleTokenStream(Map<Integer, List<Token>> tokenPosMap, TokenStream tokenStream) throws IOException {
    tokenStream.reset();
    int pos = 0;
    CharTermAttribute charTermAttribute = getCharTermAttribute(tokenStream);
    OffsetAttribute offsetAttribute = getOffsetAttribute(tokenStream);
    TypeAttribute typeAttribute = getTypeAttribute(tokenStream);
    PositionIncrementAttribute positionIncrementAttribute = getPositionIncrementAttribute(tokenStream);

    // Without term and offset attributes there is nothing to collect; check once,
    // before the loop, so the stream is still closed on the early return.
    if (null == charTermAttribute || null == offsetAttribute) {
        tokenStream.close();
        return;
    }

    while (tokenStream.incrementToken()) {
        Token token = new Token(charTermAttribute.buffer(), 0, charTermAttribute.length(),
                offsetAttribute.startOffset(), offsetAttribute.endOffset());
        if (null != typeAttribute) {
            token.setType(typeAttribute.type());
        }
        pos += null != positionIncrementAttribute ? positionIncrementAttribute.getPositionIncrement() : 1;
        if (!tokenPosMap.containsKey(pos)) {
            tokenPosMap.put(pos, new LinkedList<Token>());
        }
        tokenPosMap.get(pos).add(token);
    }
    tokenStream.close();
}
/**
 * Creates a complex boolean query from the cached tokenstream contents
 */
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
    BooleanQuery.Builder q = newBooleanQuery();
    List<Term> currentQuery = new ArrayList<>();
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        // A non-zero increment starts a new position; flush the terms collected so far
        // (tokens with increment 0 are synonyms stacked at the same position).
        if (posIncrAtt.getPositionIncrement() != 0) {
            add(q, currentQuery, operator);
            currentQuery.clear();
        }
        currentQuery.add(new Term(field, termAtt.getBytesRef()));
    }
    add(q, currentQuery, operator);
    return q.build();
}
/**
 * Creates a simple phrase query from the cached tokenstream contents
 */
protected Query analyzePhrase(String field, TokenStream stream, int slop) throws IOException {
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    builder.setSlop(slop);
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    int position = -1;
    stream.reset();
    while (stream.incrementToken()) {
        if (enablePositionIncrements) {
            position += posIncrAtt.getPositionIncrement();
        } else {
            position += 1;
        }
        builder.add(new Term(field, termAtt.getBytesRef()), position);
    }
    return builder.build();
}
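// A minimal sketch of how analyzePhrase might be driven; the helper name, field,
// and analyzer are placeholders for illustration, not part of the original code:
protected Query phraseQueryFor(Analyzer analyzer, String field, String text) throws IOException {
    try (TokenStream stream = analyzer.tokenStream(field, text)) {
        return analyzePhrase(field, stream, 0); // slop 0: terms must appear adjacently
    }
}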
public void testAttributesAfterStreamEnd() throws IOException {
    final String path = "uri1:one";
    StringReader reader = new StringReader(path);
    PathTokenFilter ts = new PathTokenFilter(reader, PathTokenFilter.PATH_SEPARATOR,
            PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
            PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);

    // PathTokenFilter.end() will be called after all tokens consumed.
    tokenise(ts, new String[]{"uri1", "one"});

    // Check attributes cleaned up
    assertEquals("", termAtt.toString());
    assertEquals("word", typeAtt.type()); // the default
    assertEquals(0, posIncAtt.getPositionIncrement());
    // Final offset...
    assertEquals(path.length(), offsetAtt.startOffset());
    assertEquals(path.length(), offsetAtt.endOffset());
}
@Test
public void testFull() throws IOException {
    this.filter = new PinyinTransformTokenFilter(tokenizer);
    this.filter.reset();
    int position = 0;
    while (this.filter.incrementToken()) {
        CharTermAttribute termAtt = this.filter.getAttribute(CharTermAttribute.class);
        String token = termAtt.toString();
        int increment = this.filter.getAttribute(PositionIncrementAttribute.class).getPositionIncrement();
        position += increment;
        OffsetAttribute offset = this.filter.getAttribute(OffsetAttribute.class);
        TypeAttribute type = this.filter.getAttribute(TypeAttribute.class);
        System.out.println(position + " [" + offset.startOffset() + "," + offset.endOffset() + ") ("
                + type.type() + ") " + token);
    }
    assertEquals(4, position);
}
@Test
public void testFullWithNoChineseOut() throws IOException {
    this.filter = new PinyinTransformTokenFilter(tokenizer, false, 1, false);
    this.filter.reset();
    int position = 0;
    while (this.filter.incrementToken()) {
        CharTermAttribute termAtt = this.filter.getAttribute(CharTermAttribute.class);
        String token = termAtt.toString();
        int increment = this.filter.getAttribute(PositionIncrementAttribute.class).getPositionIncrement();
        position += increment;
        OffsetAttribute offset = this.filter.getAttribute(OffsetAttribute.class);
        TypeAttribute type = this.filter.getAttribute(TypeAttribute.class);
        System.out.println(position + " [" + offset.startOffset() + "," + offset.endOffset() + ") ("
                + type.type() + ") " + token);
    }
    assertEquals(3, position);
}
@Test
public void testShort() throws IOException {
    this.filter = new PinyinTransformTokenFilter(tokenizer, true);
    this.filter.reset();
    int position = 0;
    while (this.filter.incrementToken()) {
        CharTermAttribute termAtt = this.filter.getAttribute(CharTermAttribute.class);
        String token = termAtt.toString();
        int increment = this.filter.getAttribute(PositionIncrementAttribute.class).getPositionIncrement();
        position += increment;
        OffsetAttribute offset = this.filter.getAttribute(OffsetAttribute.class);
        TypeAttribute type = this.filter.getAttribute(TypeAttribute.class);
        System.out.println(position + " [" + offset.startOffset() + "," + offset.endOffset() + ") ("
                + type.type() + ") " + token);
    }
    assertEquals(4, position);
}
private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException {
    ts.reset();
    final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    final ByteArrayDataInput in = new ByteArrayDataInput();
    int pos = -1;
    for (final TokenInfo info : infos) {
        assertThat(ts.incrementToken()).isTrue();
        pos += posIncrAtt.getPositionIncrement();
        int len = -1;
        final BytesRef payload = payloadAtt.getPayload();
        if (info.len != -1) {
            assertThat(payload).isNotNull();
            in.reset(payload.bytes);
            len = in.readVInt();
        } else {
            assertThat(payload).isNull();
        }
        assertThat(new TokenInfo(term.toString(), pos, len)).isEqualTo(info);
    }
    assertThat(ts.incrementToken()).isFalse();
}
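// TokenInfo is used above but not defined in this snippet. A minimal sketch that
// would satisfy the field access and the equality-based assertion; the names are
// assumptions, not the original definition:
private static final class TokenInfo {
    final String term;
    final int pos;
    final int len;

    TokenInfo(String term, int pos, int len) {
        this.term = term;
        this.pos = pos;
        this.len = len;
    }

    @Override
    public boolean equals(Object o) {
        if (!(o instanceof TokenInfo)) return false;
        TokenInfo other = (TokenInfo) o;
        return term.equals(other.term) && pos == other.pos && len == other.len;
    }

    @Override
    public int hashCode() {
        return java.util.Objects.hash(term, pos, len);
    }
}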
private void emit(char[] token) {
    System.out.println("emit: " + new String(token));
    if (replaceWhitespaceWith != null) {
        token = replaceWhiteSpace(token);
    }

    CharTermAttribute termAttr = getTermAttribute();
    termAttr.setEmpty();
    termAttr.append(new StringBuilder().append(token));

    OffsetAttribute offAttr = getOffsetAttribute();
    if (offAttr != null && offAttr.endOffset() >= token.length) {
        int start = offAttr.endOffset() - token.length;
        offAttr.setOffset(start, offAttr.endOffset());
    }

    PositionIncrementAttribute pia = getPositionIncrementAttribute();
    if (pia != null) {
        pia.setPositionIncrement(++positionIncr);
    }
    lastEmitted = token;
}
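// The attribute accessors used by emit() are not shown. A plausible sketch,
// assuming they are null-safe wrappers around the filter's own AttributeSource;
// the hasAttribute guards are an assumption about the original intent:
private CharTermAttribute getTermAttribute() {
    return addAttribute(CharTermAttribute.class);
}

private OffsetAttribute getOffsetAttribute() {
    return hasAttribute(OffsetAttribute.class) ? getAttribute(OffsetAttribute.class) : null;
}

private PositionIncrementAttribute getPositionIncrementAttribute() {
    return hasAttribute(PositionIncrementAttribute.class) ? getAttribute(PositionIncrementAttribute.class) : null;
}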
public void testStandardTokenizer() throws Exception {
    String source = "우리나라라면에서부터 일본라면이 파생되었잖니?";
    source = "너는 너는 다시 내게 돌아 올거야. school is a good place 呵呵大笑 呵呵大笑";

    long start = System.currentTimeMillis();
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    TokenStream stream = analyzer.tokenStream("s", new StringReader(source));
    TokenStream tok = new StandardFilter(Version.LUCENE_36, stream);

    // A TokenFilter shares its attribute source with the wrapped stream, so the
    // attributes can be fetched once, outside the loop, from the filter itself.
    CharTermAttribute termAttr = tok.getAttribute(CharTermAttribute.class);
    OffsetAttribute offAttr = tok.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posAttr = tok.getAttribute(PositionIncrementAttribute.class);
    TypeAttribute typeAttr = tok.getAttribute(TypeAttribute.class);

    tok.reset();
    while (tok.incrementToken()) {
        System.out.println(new String(termAttr.buffer(), 0, termAttr.length()));
    }
    tok.end();
    tok.close();

    System.out.println((System.currentTimeMillis() - start) + "ms");
}
public void testHanjaConvert() throws Exception {
    String source = "呵呵大笑 落落長松 ";
    long start = System.currentTimeMillis();
    KoreanAnalyzer analyzer = new KoreanAnalyzer();
    TokenStream stream = analyzer.tokenStream("s", new StringReader(source));
    TokenStream tok = new KoreanFilter(stream);

    CharTermAttribute termAttr = tok.getAttribute(CharTermAttribute.class);
    OffsetAttribute offAttr = tok.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posAttr = tok.getAttribute(PositionIncrementAttribute.class);
    TypeAttribute typeAttr = tok.getAttribute(TypeAttribute.class);

    tok.reset();
    while (tok.incrementToken()) {
        // Use only the valid region of the term buffer; the raw buffer may hold stale characters.
        System.out.println(new String(termAttr.buffer(), 0, termAttr.length()));
    }
    tok.end();
    tok.close();

    System.out.println((System.currentTimeMillis() - start) + "ms");
}