public static void assertTokenStream(TokenStream tokenStream, String[] expectedCharTerms, String[] expectedTypes,
        int[] expectedStartOffsets, int[] expectedEndOffsets) throws IOException {
    tokenStream.reset();
    int index = 0;
    while (tokenStream.incrementToken()) {
        assertEquals(expectedCharTerms[index], tokenStream.getAttribute(CharTermAttribute.class).toString());
        if (expectedTypes != null) {
            assertEquals(expectedTypes[index], tokenStream.getAttribute(TypeAttribute.class).type());
        }
        OffsetAttribute offsets = tokenStream.getAttribute(OffsetAttribute.class);
        if (expectedStartOffsets != null) {
            assertEquals(expectedStartOffsets[index], offsets.startOffset());
        }
        if (expectedEndOffsets != null) {
            assertEquals(expectedEndOffsets[index], offsets.endOffset());
        }
        index++;
    }
    tokenStream.end();
}
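// A minimal usage sketch for assertTokenStream above (hedged: WhitespaceAnalyzer, the field name,
// and the expected terms/offsets are illustrative; passing null skips the corresponding checks).
public void testAssertTokenStreamUsage() throws IOException {
    Analyzer analyzer = new WhitespaceAnalyzer();
    TokenStream ts = analyzer.tokenStream("field", "hello world");
    assertTokenStream(ts,
            new String[]{"hello", "world"}, // expected terms
            null,                           // skip type checks
            new int[]{0, 6},                // expected start offsets
            new int[]{5, 11});              // expected end offsets
    ts.close();
    analyzer.close();
}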
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) {
    super(suffix);
    this.suffix = suffix;
    this.prefix = prefix;
    prefixExhausted = false;

    termAtt = addAttribute(CharTermAttribute.class);
    posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    payloadAtt = addAttribute(PayloadAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
    flagsAtt = addAttribute(FlagsAttribute.class);

    p_termAtt = prefix.addAttribute(CharTermAttribute.class);
    p_posIncrAtt = prefix.addAttribute(PositionIncrementAttribute.class);
    p_payloadAtt = prefix.addAttribute(PayloadAttribute.class);
    p_offsetAtt = prefix.addAttribute(OffsetAttribute.class);
    p_typeAtt = prefix.addAttribute(TypeAttribute.class);
    p_flagsAtt = prefix.addAttribute(FlagsAttribute.class);
}
/**
 * Lucene constructor
 *
 * @throws UnirestException
 * @throws JSONException
 * @throws IOException
 */
public LTPTokenizer(Set<String> filter) throws IOException, JSONException, UnirestException {
    super();
    logger.info("LTPTokenizer Initialize......");
    // Add token offset attribute
    offsetAttr = addAttribute(OffsetAttribute.class);
    // Add token content attribute
    charTermAttr = addAttribute(CharTermAttribute.class);
    // Add token type attribute
    typeAttr = addAttribute(TypeAttribute.class);
    // Add token position attribute
    piAttr = addAttribute(PositionIncrementAttribute.class);
    // Create a new word segmenter to get tokens
    LTPSeg = new LTPWordSegmenter(input);
    // Add filter words set
    this.filter = filter;
}
@Override
public void copyTo(AttributeImpl target) {
    if (target instanceof Token) {
        final Token to = (Token) target;
        to.reinit(this);
        // reinit shares the payload, so clone it:
        if (payload != null) {
            to.payload = payload.clone();
        }
    } else {
        super.copyTo(target);
        ((OffsetAttribute) target).setOffset(startOffset, endOffset);
        ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
        ((PayloadAttribute) target).setPayload((payload == null) ? null : payload.clone());
        ((FlagsAttribute) target).setFlags(flags);
        ((TypeAttribute) target).setType(type);
    }
}
@Override
public BytesRef normalizeQueryTarget(String val, boolean strict, String fieldName, boolean appendExtraDelim) throws IOException {
    TokenStream ts = getQueryAnalyzer().tokenStream(fieldName, val);
    try {
        ts.reset();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
        String matchType = strict ? INDEXED_TOKEN_TYPE : NORMALIZED_TOKEN_TYPE;
        while (ts.incrementToken()) {
            if (matchType.equals(typeAtt.type())) {
                BytesRefBuilder ret = new BytesRefBuilder();
                ret.copyChars(termAtt.toString());
                if (!strict || appendExtraDelim) {
                    ret.append(delimBytes, 0, delimBytes.length);
                }
                return ret.get();
            }
        }
        return new BytesRef(BytesRef.EMPTY_BYTES);
    } finally {
        ts.close();
    }
}
/** verify that payload gets picked up for 1st group of tokens */
public void testTypeForPayload1() throws IOException {
    TokenTypeJoinFilter ttjf = new TokenTypeJoinFilter(new TokenArrayTokenizer(tokensWithPayloads),
            new String[] {"normalized", "filing", "prefix"}, "joined", "normalized", "!", false, false);
    CharTermAttribute termAtt = ttjf.getAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ttjf.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = ttjf.getAttribute(PayloadAttribute.class);
    ttjf.reset();

    assertTrue(ttjf.incrementToken());
    assertEquals("unconsoled!Unconsoled!The ", termAtt.toString());
    assertEquals("joined", typeAtt.type());
    assertEquals("payload1", payloadAtt.getPayload().utf8ToString());

    assertTrue(ttjf.incrementToken());
    assertEquals("room with a view!Room With A View!A ", termAtt.toString());
    assertEquals("joined", typeAtt.type());
    assertNull(payloadAtt.getPayload());

    assertFalse(ttjf.incrementToken());
}
/** verify that payload gets picked up for 2nd group of tokens */
public void testTypeForPayload2() throws IOException {
    TokenTypeJoinFilter ttjf = new TokenTypeJoinFilter(new TokenArrayTokenizer(tokensWithPayloads),
            new String[] {"normalized", "filing", "prefix"}, "joined", "filing", "!", false, false);
    CharTermAttribute termAtt = ttjf.getAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ttjf.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = ttjf.getAttribute(PayloadAttribute.class);
    ttjf.reset();

    assertTrue(ttjf.incrementToken());
    assertEquals("unconsoled!Unconsoled!The ", termAtt.toString());
    assertEquals("joined", typeAtt.type());
    assertNull(payloadAtt.getPayload());

    assertTrue(ttjf.incrementToken());
    assertEquals("room with a view!Room With A View!A ", termAtt.toString());
    assertEquals("joined", typeAtt.type());
    assertEquals("payload2", payloadAtt.getPayload().utf8ToString());

    assertFalse(ttjf.incrementToken());
}
@Test
public void testShorthand2() throws IOException {
    JsonReferencePayloadTokenizer tokenizer = new JsonReferencePayloadTokenizer();
    tokenizer.setReader(new StringReader("{\"filing\": \"something\", \"prefix\": \"The \"}"));
    tokenizer.reset();

    assertTrue(tokenizer.incrementToken());
    assertEquals("something", tokenizer.getAttribute(CharTermAttribute.class).toString());
    assertEquals(JsonReferencePayloadTokenizer.TYPE_FILING, tokenizer.getAttribute(TypeAttribute.class).type());
    assertEquals(1, tokenizer.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
    assertNull(tokenizer.getAttribute(PayloadAttribute.class).getPayload());

    assertTrue(tokenizer.incrementToken());
    assertEquals("The ", tokenizer.getAttribute(CharTermAttribute.class).toString());
    assertEquals(JsonReferencePayloadTokenizer.TYPE_PREFIX, tokenizer.getAttribute(TypeAttribute.class).type());
    assertEquals(0, tokenizer.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
    assertNull(tokenizer.getAttribute(PayloadAttribute.class).getPayload());

    assertFalse(tokenizer.incrementToken());
}
@Test
public void testShorthand3() throws IOException {
    JsonReferencePayloadTokenizer tokenizer = new JsonReferencePayloadTokenizer();
    tokenizer.setReader(new StringReader("{\"prefix\": \"The \", \"filing\": \"something\"}"));
    tokenizer.reset();

    assertTrue(tokenizer.incrementToken());
    assertEquals("something", tokenizer.getAttribute(CharTermAttribute.class).toString());
    assertEquals(JsonReferencePayloadTokenizer.TYPE_FILING, tokenizer.getAttribute(TypeAttribute.class).type());
    assertEquals(1, tokenizer.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
    assertNull(tokenizer.getAttribute(PayloadAttribute.class).getPayload());

    assertTrue(tokenizer.incrementToken());
    assertEquals("The ", tokenizer.getAttribute(CharTermAttribute.class).toString());
    assertEquals(JsonReferencePayloadTokenizer.TYPE_PREFIX, tokenizer.getAttribute(TypeAttribute.class).type());
    assertEquals(0, tokenizer.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
    assertNull(tokenizer.getAttribute(PayloadAttribute.class).getPayload());

    assertFalse(tokenizer.incrementToken());
}
public static MyToken[] tokensFromAnalysis(Analyzer analyzer, String text, String field) throws IOException {
    TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute positionIncrementAttr = stream.addAttribute(PositionIncrementAttribute.class);
    TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class);
    OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class);

    stream.reset(); // required before the first incrementToken() call on recent Lucene versions
    ArrayList<MyToken> tokenList = new ArrayList<MyToken>();
    while (stream.incrementToken()) {
        tokenList.add(new MyToken(term.toString(), positionIncrementAttr.getPositionIncrement(),
                typeAttr.type(), offsetAttr.startOffset(), offsetAttr.endOffset()));
    }
    stream.end();
    stream.close();
    return tokenList.toArray(new MyToken[0]);
}
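// A minimal usage sketch for tokensFromAnalysis above (hedged: the analyzer choice, field name,
// and sample text are illustrative; older Lucene versions need a Version argument for StandardAnalyzer).
public static void printTokensExample() throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    MyToken[] tokens = tokensFromAnalysis(analyzer, "The quick brown fox", "body");
    for (MyToken token : tokens) {
        System.out.println(token);
    }
    analyzer.close();
}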
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();

    Analyzer analyzer = new HanLPAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(),
                positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
public void testIssue() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLPTokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();

    String text = "會辦台星保證最低價的原因?";
    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(),
                positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();

    Analyzer analyzer = new HanLPIndexAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(),
                positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
/**
 * Tokenize the given input using a {@link URLTokenizer}. Settings which have been set on this {@link URLTokenFilter}
 * will be passed along to the tokenizer.
 * @param input a string to be tokenized
 * @return a list of tokens extracted from the input string
 * @throws IOException
 */
private List<Token> tokenize(String input) throws IOException {
    List<Token> tokens = new ArrayList<>();
    URLTokenizer tokenizer = new URLTokenizer();
    // create a copy of the parts list to avoid ConcurrentModificationException when sorting
    tokenizer.setParts(new ArrayList<>(parts));
    tokenizer.setUrlDecode(urlDeocde);
    tokenizer.setTokenizeHost(tokenizeHost);
    tokenizer.setTokenizePath(tokenizePath);
    tokenizer.setTokenizeQuery(tokenizeQuery);
    tokenizer.setAllowMalformed(allowMalformed || passthrough);
    tokenizer.setTokenizeMalformed(tokenizeMalformed);
    tokenizer.setReader(new StringReader(input));
    tokenizer.reset();

    String term;
    URLPart part;
    OffsetAttribute offset;
    while (tokenizer.incrementToken()) {
        term = tokenizer.getAttribute(CharTermAttribute.class).toString();
        part = URLPart.fromString(tokenizer.getAttribute(TypeAttribute.class).type());
        offset = tokenizer.getAttribute(OffsetAttribute.class);
        tokens.add(new Token(term, part, offset.startOffset(), offset.endOffset()));
    }
    return tokens;
}
protected List<TestToken> collectExtractedNouns(TokenStream stream) throws IOException {
    CharTermAttribute charTermAtt = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offSetAtt = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class);

    List<TestToken> extractedTokens = Lists.newArrayList();
    while (stream.incrementToken()) {
        TestToken t = getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset());
        System.out.println("termAtt.term() : " + charTermAtt.toString());
        System.out.println("startoffSetAtt : " + offSetAtt.startOffset());
        System.out.println("endoffSetAtt : " + offSetAtt.endOffset());
        System.out.println("typeAttr : " + typeAttr.toString());
        extractedTokens.add(t);
    }
    return extractedTokens;
}
public void testFilterTokens() throws Exception {
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
    FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);

    filter.incrementToken();

    assertEquals("accent", termAtt.toString());
    assertEquals(2, offsetAtt.startOffset());
    assertEquals(7, offsetAtt.endOffset());
    assertEquals("wrd", typeAtt.type());
    assertEquals(3, posIncAtt.getPositionIncrement());
    assertEquals(77, flagsAtt.getFlags());
    assertEquals(new BytesRef(new byte[]{0, 1, 2, 3}), payloadAtt.getPayload());
}
public void testLongStream() throws Exception {
    final NumericTokenStream stream = new NumericTokenStream().setLongValue(lvalue);
    final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    assertNotNull(bytesAtt);
    final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
    assertNotNull(typeAtt);
    final NumericTokenStream.NumericTermAttribute numericAtt = stream.getAttribute(NumericTokenStream.NumericTermAttribute.class);
    assertNotNull(numericAtt);
    final BytesRef bytes = bytesAtt.getBytesRef();
    stream.reset();
    assertEquals(64, numericAtt.getValueSize());
    for (int shift = 0; shift < 64; shift += NumericUtils.PRECISION_STEP_DEFAULT) {
        assertTrue("New token is available", stream.incrementToken());
        assertEquals("Shift value wrong", shift, numericAtt.getShift());
        bytesAtt.fillBytesRef();
        assertEquals("Term is incorrectly encoded", lvalue & ~((1L << shift) - 1L), NumericUtils.prefixCodedToLong(bytes));
        assertEquals("Term raw value is incorrectly encoded", lvalue & ~((1L << shift) - 1L), numericAtt.getRawValue());
        assertEquals("Type incorrect", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type());
    }
    assertFalse("More tokens available", stream.incrementToken());
    stream.end();
    stream.close();
}
public void testIntStream() throws Exception {
    final NumericTokenStream stream = new NumericTokenStream().setIntValue(ivalue);
    final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    assertNotNull(bytesAtt);
    final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
    assertNotNull(typeAtt);
    final NumericTokenStream.NumericTermAttribute numericAtt = stream.getAttribute(NumericTokenStream.NumericTermAttribute.class);
    assertNotNull(numericAtt);
    final BytesRef bytes = bytesAtt.getBytesRef();
    stream.reset();
    assertEquals(32, numericAtt.getValueSize());
    for (int shift = 0; shift < 32; shift += NumericUtils.PRECISION_STEP_DEFAULT) {
        assertTrue("New token is available", stream.incrementToken());
        assertEquals("Shift value wrong", shift, numericAtt.getShift());
        bytesAtt.fillBytesRef();
        assertEquals("Term is incorrectly encoded", ivalue & ~((1 << shift) - 1), NumericUtils.prefixCodedToInt(bytes));
        assertEquals("Term raw value is incorrectly encoded", ((long) ivalue) & ~((1L << shift) - 1L), numericAtt.getRawValue());
        assertEquals("Type incorrect", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type());
    }
    assertFalse("More tokens available", stream.incrementToken());
    stream.end();
    stream.close();
}
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
    TokenStream stream = analyzer.tokenStream("", text);
    // TODO: support custom attributes
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        Token token = new Token();
        token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
        token.setFlags(flagsAttValue); // overwriting any flags already set...
        token.setType(typeAtt.type());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        result.add(token);
    }
    stream.end();
    stream.close();
}
private void handleTokenStream(Map<Integer, List<Token>> tokenPosMap, TokenStream tokenStream) throws IOException {
    tokenStream.reset();
    int pos = 0;

    CharTermAttribute charTermAttribute = getCharTermAttribute(tokenStream);
    OffsetAttribute offsetAttribute = getOffsetAttribute(tokenStream);
    TypeAttribute typeAttribute = getTypeAttribute(tokenStream);
    PositionIncrementAttribute positionIncrementAttribute = getPositionIncrementAttribute(tokenStream);

    while (tokenStream.incrementToken()) {
        if (null == charTermAttribute || null == offsetAttribute) {
            return;
        }
        Token token = new Token(charTermAttribute.buffer(), 0, charTermAttribute.length(),
                offsetAttribute.startOffset(), offsetAttribute.endOffset());
        if (null != typeAttribute) {
            token.setType(typeAttribute.type());
        }
        pos += null != positionIncrementAttribute ? positionIncrementAttribute.getPositionIncrement() : 1;
        if (!tokenPosMap.containsKey(pos)) {
            tokenPosMap.put(pos, new LinkedList<Token>());
        }
        tokenPosMap.get(pos).add(token);
    }
    tokenStream.close();
}
public void testAttributesAfterStreamEnd() throws IOException {
    final String path = "uri1:one";
    StringReader reader = new StringReader(path);
    PathTokenFilter ts = new PathTokenFilter(reader, PathTokenFilter.PATH_SEPARATOR,
            PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
            PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);

    // PathTokenFilter.end() will be called after all tokens consumed.
    tokenise(ts, new String[]{"uri1", "one"});

    // Check attributes cleaned up
    assertEquals("", termAtt.toString());
    assertEquals("word", typeAtt.type()); // the default
    assertEquals(0, posIncAtt.getPositionIncrement());
    // Final offset...
    assertEquals(path.length(), offsetAtt.startOffset());
    assertEquals(path.length(), offsetAtt.endOffset());
}
@Test
public void testFull() throws IOException {
    this.filter = new PinyinTransformTokenFilter(tokenizer);
    this.filter.reset();
    int position = 0;
    while (this.filter.incrementToken()) {
        CharTermAttribute termAtt = this.filter.getAttribute(CharTermAttribute.class);
        String token = termAtt.toString();
        int increment = this.filter.getAttribute(PositionIncrementAttribute.class).getPositionIncrement();
        position += increment;
        OffsetAttribute offset = this.filter.getAttribute(OffsetAttribute.class);
        TypeAttribute type = this.filter.getAttribute(TypeAttribute.class);
        System.out.println(position + "[" + offset.startOffset() + "," + offset.endOffset() + "} ("
                + type.type() + ") " + token);
    }
    assertTrue(position == 4);
}
@Test
public void testFullWithNoChineseOut() throws IOException {
    this.filter = new PinyinTransformTokenFilter(tokenizer, false, 1, false);
    this.filter.reset();
    int position = 0;
    while (this.filter.incrementToken()) {
        CharTermAttribute termAtt = this.filter.getAttribute(CharTermAttribute.class);
        String token = termAtt.toString();
        int increment = this.filter.getAttribute(PositionIncrementAttribute.class).getPositionIncrement();
        position += increment;
        OffsetAttribute offset = this.filter.getAttribute(OffsetAttribute.class);
        TypeAttribute type = this.filter.getAttribute(TypeAttribute.class);
        System.out.println(position + "[" + offset.startOffset() + "," + offset.endOffset() + "} ("
                + type.type() + ") " + token);
    }
    assertTrue(position == 3);
}
@Test
public void testShort() throws IOException {
    this.filter = new PinyinTransformTokenFilter(tokenizer, true);
    this.filter.reset();
    int position = 0;
    while (this.filter.incrementToken()) {
        CharTermAttribute termAtt = this.filter.getAttribute(CharTermAttribute.class);
        String token = termAtt.toString();
        int increment = this.filter.getAttribute(PositionIncrementAttribute.class).getPositionIncrement();
        position += increment;
        OffsetAttribute offset = this.filter.getAttribute(OffsetAttribute.class);
        TypeAttribute type = this.filter.getAttribute(TypeAttribute.class);
        System.out.println(position + "[" + offset.startOffset() + "," + offset.endOffset() + "} ("
                + type.type() + ") " + token);
    }
    assertTrue(position == 4);
}
public void testStandardTokenizer() throws Exception {
    String source = "우리나라라면에서부터 일본라면이 파생되었잖니?";
    source = "너는 너는 다시 내게 돌아 올거야. school is a good place 呵呵大笑 呵呵大笑";

    long start = System.currentTimeMillis();

    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    TokenStream stream = analyzer.tokenStream("s", new StringReader(source));
    TokenStream tok = new StandardFilter(Version.LUCENE_36, stream);

    while (tok.incrementToken()) {
        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offAttr = stream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posAttr = stream.getAttribute(PositionIncrementAttribute.class);
        TypeAttribute typeAttr = stream.getAttribute(TypeAttribute.class);
        System.out.println(new String(termAttr.buffer(), 0, termAttr.length()));
    }
    System.out.println((System.currentTimeMillis() - start) + "ms");
}
public void testHanjaConvert() throws Exception {
    String source = "呵呵大笑 落落長松 ";

    long start = System.currentTimeMillis();

    KoreanAnalyzer analyzer = new KoreanAnalyzer();
    TokenStream stream = analyzer.tokenStream("s", new StringReader(source));
    TokenStream tok = new KoreanFilter(stream);

    while (tok.incrementToken()) {
        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offAttr = stream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posAttr = stream.getAttribute(PositionIncrementAttribute.class);
        TypeAttribute typeAttr = stream.getAttribute(TypeAttribute.class);
        // print only the valid portion of the term buffer, not the whole backing array
        System.out.println(new String(termAttr.buffer(), 0, termAttr.length()));
    }
    System.out.println((System.currentTimeMillis() - start) + "ms");
}
public void testLongStream() throws Exception {
    final NumericTokenStream stream = new NumericTokenStream().setLongValue(lvalue);
    // use getAttribute to test if attributes really exist; if not, an IAE will be thrown
    final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
    final NumericTokenStream.NumericTermAttribute numericAtt = stream.getAttribute(NumericTokenStream.NumericTermAttribute.class);
    final BytesRef bytes = bytesAtt.getBytesRef();
    stream.reset();
    assertEquals(64, numericAtt.getValueSize());
    for (int shift = 0; shift < 64; shift += NumericUtils.PRECISION_STEP_DEFAULT) {
        assertTrue("New token is available", stream.incrementToken());
        assertEquals("Shift value wrong", shift, numericAtt.getShift());
        final int hash = bytesAtt.fillBytesRef();
        assertEquals("Hash incorrect", bytes.hashCode(), hash);
        assertEquals("Term is incorrectly encoded", lvalue & ~((1L << shift) - 1L), NumericUtils.prefixCodedToLong(bytes));
        assertEquals("Term raw value is incorrectly encoded", lvalue & ~((1L << shift) - 1L), numericAtt.getRawValue());
        assertEquals("Type incorrect", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type());
    }
    assertFalse("More tokens available", stream.incrementToken());
    stream.end();
    stream.close();
}
public void testIntStream() throws Exception {
    final NumericTokenStream stream = new NumericTokenStream().setIntValue(ivalue);
    // use getAttribute to test if attributes really exist; if not, an IAE will be thrown
    final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
    final NumericTokenStream.NumericTermAttribute numericAtt = stream.getAttribute(NumericTokenStream.NumericTermAttribute.class);
    final BytesRef bytes = bytesAtt.getBytesRef();
    stream.reset();
    assertEquals(32, numericAtt.getValueSize());
    for (int shift = 0; shift < 32; shift += NumericUtils.PRECISION_STEP_DEFAULT) {
        assertTrue("New token is available", stream.incrementToken());
        assertEquals("Shift value wrong", shift, numericAtt.getShift());
        final int hash = bytesAtt.fillBytesRef();
        assertEquals("Hash incorrect", bytes.hashCode(), hash);
        assertEquals("Term is incorrectly encoded", ivalue & ~((1 << shift) - 1), NumericUtils.prefixCodedToInt(bytes));
        assertEquals("Term raw value is incorrectly encoded", ((long) ivalue) & ~((1L << shift) - 1L), numericAtt.getRawValue());
        assertEquals("Type incorrect", (shift == 0) ? NumericTokenStream.TOKEN_TYPE_FULL_PREC : NumericTokenStream.TOKEN_TYPE_LOWER_PREC, typeAtt.type());
    }
    assertFalse("More tokens available", stream.incrementToken());
    stream.end();
    stream.close();
}
protected void analyze(Collection<Token> result, Reader text, int offset, int flagsAttValue) throws IOException {
    TokenStream stream = analyzer.tokenStream("", text);
    // TODO: support custom attributes
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        Token token = new Token();
        token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
        token.setFlags(flagsAttValue); // overwriting any flags already set...
        token.setType(typeAtt.type());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        result.add(token);
    }
    stream.end();
    stream.close();
}
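// A hedged usage sketch for the Reader-based analyze(...) overload above, called from within
// the owning class; the fragment text, offset of 100, and flags value of 0 are illustrative.
protected List<Token> analyzeFragmentExample() throws IOException {
    List<Token> out = new ArrayList<>();
    // Tokens from this fragment get their offsets shifted by 100 to map back into the full document.
    analyze(out, new StringReader("quick brown fox"), 100, 0);
    return out;
}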