public static void assertTokenStream(TokenStream tokenStream, String[] expectedCharTerms, String[] expectedTypes,
        int[] expectedStartOffsets, int[] expectedEndOffsets) throws IOException {
    tokenStream.reset();
    int index = 0;
    while (tokenStream.incrementToken()) {
        assertEquals(expectedCharTerms[index], tokenStream.getAttribute(CharTermAttribute.class).toString());
        if (expectedTypes != null) {
            assertEquals(expectedTypes[index], tokenStream.getAttribute(TypeAttribute.class).type());
        }
        OffsetAttribute offsets = tokenStream.getAttribute(OffsetAttribute.class);
        if (expectedStartOffsets != null) {
            assertEquals(expectedStartOffsets[index], offsets.startOffset());
        }
        if (expectedEndOffsets != null) {
            assertEquals(expectedEndOffsets[index], offsets.endOffset());
        }
        index++;
    }
    tokenStream.end();
}
public static void main(String[] args) throws IOException {
    List<Term> parse = ToAnalysis.parse("中华人民 共和国 成立了 ");
    System.out.println(parse);
    List<Term> parse1 = IndexAnalysis.parse("你吃过饭了没有!!!!!吃过无妨论文");
    //System.out.println(parse1);
    String text11 = "ZW321282050000000325";
    Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text11), 0, true);
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        System.out.print(termAtt.toString() + " ");
        // System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
        // System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
    }
    tokenizer.close();
}
private List<TokenData> parse(String text) {
    NamedAnalyzer analyzer = getAnalysisService().indexAnalyzers.get("test");
    try (TokenStream ts = analyzer.tokenStream("test", new StringReader(text))) {
        List<TokenData> result = new ArrayList<>();
        CharTermAttribute charTerm = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute position = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String original = text.substring(offset.startOffset(), offset.endOffset());
            result.add(token(original, charTerm.toString(), position.getPositionIncrement()));
        }
        ts.end();
        return result;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName,
        String contents) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}
public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) {
    super(suffix);
    this.suffix = suffix;
    this.prefix = prefix;
    prefixExhausted = false;

    termAtt = addAttribute(CharTermAttribute.class);
    posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    payloadAtt = addAttribute(PayloadAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
    flagsAtt = addAttribute(FlagsAttribute.class);

    p_termAtt = prefix.addAttribute(CharTermAttribute.class);
    p_posIncrAtt = prefix.addAttribute(PositionIncrementAttribute.class);
    p_payloadAtt = prefix.addAttribute(PayloadAttribute.class);
    p_offsetAtt = prefix.addAttribute(OffsetAttribute.class);
    p_typeAtt = prefix.addAttribute(TypeAttribute.class);
    p_flagsAtt = prefix.addAttribute(FlagsAttribute.class);
}
private static void assertOffsets(String inputStr, TokenStream tokenStream, List<String> expected) {
    try {
        List<String> termList = new ArrayList<String>();
        // CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttr = tokenStream.addAttribute(OffsetAttribute.class);
        while (tokenStream.incrementToken()) {
            int start = offsetAttr.startOffset();
            int end = offsetAttr.endOffset();
            termList.add(inputStr.substring(start, end));
        }
        System.out.println(String.join(" ", termList));
        assertThat(termList, is(expected));
    } catch (IOException e) {
        assertTrue(false);
    }
}
/**
 * Lucene constructor
 *
 * @throws UnirestException
 * @throws JSONException
 * @throws IOException
 */
public LTPTokenizer(Set<String> filter) throws IOException, JSONException, UnirestException {
    super();
    logger.info("LTPTokenizer Initialize......");
    // Add token offset attribute
    offsetAttr = addAttribute(OffsetAttribute.class);
    // Add token content attribute
    charTermAttr = addAttribute(CharTermAttribute.class);
    // Add token type attribute
    typeAttr = addAttribute(TypeAttribute.class);
    // Add token position attribute
    piAttr = addAttribute(PositionIncrementAttribute.class);
    // Create a new word segmenter to get tokens
    LTPSeg = new LTPWordSegmenter(input);
    // Add filter words set
    this.filter = filter;
}
@Test
public void testSearch() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "重qing");
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

    tokenStream.reset();

    Assert.assertTrue(tokenStream.incrementToken());
    Assert.assertEquals(charTermAttribute.toString(), "重");
    Assert.assertEquals(offsetAttribute.startOffset(), 0);
    Assert.assertEquals(offsetAttribute.endOffset(), 1);
    Assert.assertEquals(positionIncrementAttribute.getPositionIncrement(), 1);

    Assert.assertTrue(tokenStream.incrementToken());
    Assert.assertEquals(charTermAttribute.toString(), "qing");
    Assert.assertEquals(offsetAttribute.startOffset(), 1);
    Assert.assertEquals(offsetAttribute.endOffset(), 5);
    Assert.assertEquals(positionIncrementAttribute.getPositionIncrement(), 1);

    tokenStream.close();
}
public void testFullPinyinFilter() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "作者 : 陈楠");
    LcPinyinTokenFilter lcPinyinTokenFilter = new LcPinyinTokenFilter(tokenStream, PinyinFilterSetting.full_pinyin);
    CharTermAttribute charTermAttribute = lcPinyinTokenFilter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = lcPinyinTokenFilter.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = lcPinyinTokenFilter.getAttribute(PositionIncrementAttribute.class);
    lcPinyinTokenFilter.reset();
    while (lcPinyinTokenFilter.incrementToken()) {
        System.out.println(charTermAttribute.toString() + ":" + offsetAttribute.startOffset() + ","
                + offsetAttribute.endOffset() + ":" + positionIncrementAttribute.getPositionIncrement());
    }
    lcPinyinTokenFilter.close();
}
public void testFirstLetterFilter() throws IOException {
    LcPinyinAnalyzer analyzer = new LcPinyinAnalyzer(AnalysisSetting.search);
    TokenStream tokenStream = analyzer.tokenStream("lc", "作者 : 陈楠");
    LcPinyinTokenFilter lcPinyinTokenFilter = new LcPinyinTokenFilter(tokenStream, PinyinFilterSetting.first_letter);
    CharTermAttribute charTermAttribute = lcPinyinTokenFilter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = lcPinyinTokenFilter.getAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = lcPinyinTokenFilter.getAttribute(PositionIncrementAttribute.class);
    lcPinyinTokenFilter.reset();
    while (lcPinyinTokenFilter.incrementToken()) {
        System.out.println(charTermAttribute.toString() + ":" + offsetAttribute.startOffset() + ","
                + offsetAttribute.endOffset() + ":" + positionIncrementAttribute.getPositionIncrement());
    }
    lcPinyinTokenFilter.close();
}
@Override
public void copyTo(AttributeImpl target) {
    if (target instanceof Token) {
        final Token to = (Token) target;
        to.reinit(this);
        // reinit shares the payload, so clone it:
        if (payload != null) {
            to.payload = payload.clone();
        }
    } else {
        super.copyTo(target);
        ((OffsetAttribute) target).setOffset(startOffset, endOffset);
        ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
        ((PayloadAttribute) target).setPayload((payload == null) ? null : payload.clone());
        ((FlagsAttribute) target).setFlags(flags);
        ((TypeAttribute) target).setType(type);
    }
}
public WrappedTokenStream(TokenStream tokenStream, String pText) {
    this.pText = pText;
    this.tokenStream = tokenStream;
    if (tokenStream.hasAttribute(CharTermAttribute.class)) {
        charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    }
    if (tokenStream.hasAttribute(OffsetAttribute.class)) {
        offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    }
    if (tokenStream.hasAttribute(CharsRefTermAttribute.class)) {
        charsRefTermAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class);
    }
    if (tokenStream.hasAttribute(AdditionalTermAttribute.class)) {
        additionalTermAttribute = tokenStream.getAttribute(AdditionalTermAttribute.class);
    }
    additionalTermAttributeLocal.init(this);
}
@Test
public void testBulk() throws IOException {
    String str = "SK, 하이닉스";
    //str = "하이닉스";
    StringReader input = new StringReader(str);
    CSVAnalyzer analyzer = new CSVAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("", input);
    tokenStream.reset();
    logger.debug("tokenStream:{}", tokenStream);
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    for (int inx = 0; tokenStream.incrementToken(); inx++) {
        String term = charTermAttribute.toString();
        logger.debug("[{}] \"{}\" {}~{}", inx, term, offsetAttribute.startOffset(), offsetAttribute.endOffset());
    }
    analyzer.close();
}
/**
 * @param input            token stream input
 * @param type             whether to output the pinyin abbreviation, the full pinyin, or both;
 *                         one of {@link #TYPE_ABBREVIATION}, {@link #TYPE_PINYIN}, {@link #TYPE_BOTH}
 * @param minTermLength    minimum length of a Chinese term to be converted
 * @param maxPolyphoneFreq maximum number of polyphone (multi-reading) expansions
 * @param isOutChinese     whether to also emit the original Chinese term
 */
public PinyinTransformTokenFilter(TokenStream input, int type, int minTermLength, int maxPolyphoneFreq,
        boolean isOutChinese) {
    super(input);
    this._minTermLength = minTermLength;
    this.maxPolyphoneFreq = maxPolyphoneFreq;
    if (this._minTermLength < 1) {
        this._minTermLength = 1;
    }
    if (this.maxPolyphoneFreq < 1) {
        this.maxPolyphoneFreq = Integer.MAX_VALUE;
    }
    this.isOutChinese = isOutChinese;
    this.outputFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
    this.outputFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
    this.type = type;
    addAttribute(OffsetAttribute.class); // offset attribute
}
@NotNull
private static List<SearchToken> generateSearchTokens(@NotNull final String searchTerm) throws IOException {
    final Set<SearchToken> searchTokens = Sets.newHashSet();
    final TokenStream tokenStream = getSpellCheckedShingleStream(searchTerm);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        final String searchToken = tokenStream.getAttribute(CharTermAttribute.class).toString();
        final OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        searchTokens.add(ImmutableSearchToken.of(searchToken, offsetAttribute.startOffset(), offsetAttribute.endOffset()));
    }
    tokenStream.end();
    tokenStream.close();
    return searchTokens.stream()
            .sorted(Comparator.comparing(SearchToken::length).reversed().thenComparing(SearchToken::startOffset))
            .collect(Collectors.toList());
}
protected void displayTokens(String text, String elementId) throws IOException {
    if (log.isDebugEnabled()) {
        Analyzer analyzer = getConfiguredAnalyzer();
        StringBuilder sb = new StringBuilder();
        sb.append(elementId).append(": ").append(text).append(": ");
        TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            int startOffset = offsetAttribute.startOffset();
            int endOffset = offsetAttribute.endOffset();
            String term = charTermAttribute.toString();
            sb.append("[" + term + "](" + startOffset + "," + endOffset + ") ");
        }
        // Release the stream so the analyzer can be reused on the next call.
        tokenStream.end();
        tokenStream.close();
        log.debug(sb);
    }
}
@Override
public void apply(Document doc) {
    try {
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(doc.text()));
        OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            Token tok = new Token(doc).setRange(offsetAttribute.startOffset(), offsetAttribute.endOffset());
            tok.putProperty(TokenProperties.STEM, charTermAttribute.toString());
        }
        stream.close();
    } catch (IOException ex) {
        throw new LangforiaRuntimeException(ex);
    }
}
private String[] walkTokens() throws IOException {
    List<String> wordList = new ArrayList<>();
    while (input.incrementToken()) {
        CharTermAttribute textAtt = input.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = input.getAttribute(OffsetAttribute.class);
        char[] buffer = textAtt.buffer();
        // Assumes the term length equals the offset span, i.e. no upstream filter altered the token text.
        String word = new String(buffer, 0, offsetAtt.endOffset() - offsetAtt.startOffset());
        wordList.add(word);
        AttributeSource attrs = input.cloneAttributes();
        tokenAttrs.add(attrs);
    }
    String[] words = new String[wordList.size()];
    for (int i = 0; i < words.length; i++) {
        words[i] = wordList.get(i);
    }
    return words;
}
public static MyToken[] tokensFromAnalysis(Analyzer analyzer, String text, String field) throws IOException {
    TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute positionIncrementAttr = stream.addAttribute(PositionIncrementAttribute.class);
    TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class);
    OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class);

    ArrayList<MyToken> tokenList = new ArrayList<MyToken>();
    // The TokenStream contract requires reset() before the first incrementToken().
    stream.reset();
    while (stream.incrementToken()) {
        tokenList.add(new MyToken(term.toString(), positionIncrementAttr.getPositionIncrement(), typeAttr.type(),
                offsetAttr.startOffset(), offsetAttr.endOffset()));
    }
    stream.end();
    stream.close();

    return tokenList.toArray(new MyToken[0]);
}
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    Analyzer analyzer = new HanLPAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(),
                positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
public void testIssue() throws Exception {
    Map<String, String> args = new TreeMap<>();
    args.put("enableTraditionalChineseMode", "true");
    args.put("enableNormalization", "true");
    HanLPTokenizerFactory factory = new HanLPTokenizerFactory(args);
    Tokenizer tokenizer = factory.create();
    String text = "會辦台星保證最低價的原因?";
    tokenizer.setReader(new StringReader(text));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        CharTermAttribute attribute = tokenizer.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenizer.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenizer.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(),
                positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
public void testCreateComponents() throws Exception {
    String text = "中华人民共和国很辽阔";
    for (int i = 0; i < text.length(); ++i) {
        System.out.print(text.charAt(i) + "" + i + " ");
    }
    System.out.println();
    Analyzer analyzer = new HanLPIndexAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream("field", text);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class);
        // offsets
        OffsetAttribute offsetAtt = tokenStream.getAttribute(OffsetAttribute.class);
        // position increment
        PositionIncrementAttribute positionAttr = tokenStream.getAttribute(PositionIncrementAttribute.class);
        // part of speech
        TypeAttribute typeAttr = tokenStream.getAttribute(TypeAttribute.class);
        System.out.printf("[%d:%d %d] %s/%s\n", offsetAtt.startOffset(), offsetAtt.endOffset(),
                positionAttr.getPositionIncrement(), attribute, typeAttr.type());
    }
}
/**
 * Tokenize the given input using a {@link URLTokenizer}. Settings which have been set on this {@link URLTokenFilter}
 * will be passed along to the tokenizer.
 * @param input a string to be tokenized
 * @return a list of tokens extracted from the input string
 * @throws IOException
 */
private List<Token> tokenize(String input) throws IOException {
    List<Token> tokens = new ArrayList<>();
    URLTokenizer tokenizer = new URLTokenizer();
    // create a copy of the parts list to avoid ConcurrentModificationException when sorting
    tokenizer.setParts(new ArrayList<>(parts));
    tokenizer.setUrlDecode(urlDeocde);
    tokenizer.setTokenizeHost(tokenizeHost);
    tokenizer.setTokenizePath(tokenizePath);
    tokenizer.setTokenizeQuery(tokenizeQuery);
    tokenizer.setAllowMalformed(allowMalformed || passthrough);
    tokenizer.setTokenizeMalformed(tokenizeMalformed);
    tokenizer.setReader(new StringReader(input));
    tokenizer.reset();
    String term;
    URLPart part;
    OffsetAttribute offset;
    while (tokenizer.incrementToken()) {
        term = tokenizer.getAttribute(CharTermAttribute.class).toString();
        part = URLPart.fromString(tokenizer.getAttribute(TypeAttribute.class).type());
        offset = tokenizer.getAttribute(OffsetAttribute.class);
        tokens.add(new Token(term, part, offset.startOffset(), offset.endOffset()));
    }
    return tokens;
}
protected List<TestToken> collectExtractedNouns(TokenStream stream) throws IOException {
    CharTermAttribute charTermAtt = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offSetAtt = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class);

    List<TestToken> extractedTokens = Lists.newArrayList();
    while (stream.incrementToken()) {
        TestToken t = getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset());
        System.out.println("termAtt.term() : " + charTermAtt.toString());
        System.out.println("startoffSetAtt : " + offSetAtt.startOffset());
        System.out.println("endoffSetAtt : " + offSetAtt.endOffset());
        System.out.println("typeAttr : " + typeAttr.toString());
        extractedTokens.add(t);
    }
    return extractedTokens;
}
@Test
public void testIncrementToken() throws IOException {
    CharTermAttribute charTermAtt = tokenizer.getAttribute(CharTermAttribute.class);
    OffsetAttribute offSetAtt = tokenizer.getAttribute(OffsetAttribute.class);

    int expected_token_count = 6;
    int observed_token_count = 0;
    while (tokenizer.incrementToken()) {
        observed_token_count++;
        TestToken t = getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset());
        System.out.println("termAtt.term() : " + charTermAtt.toString());
        System.out.println("startOffset : " + offSetAtt.startOffset());
        System.out.println("endOffset : " + offSetAtt.endOffset());
        Assert.assertTrue(tokenizedToken.contains(t));
    }
    Assert.assertEquals(expected_token_count, observed_token_count);
}
public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new EdgeNGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
        assertTrue(tk.incrementToken());
        assertEquals(0, offsetAtt.startOffset());
        assertEquals(s.length(), offsetAtt.endOffset());
        final int end = Character.offsetByCodePoints(s, 0, i);
        assertEquals(s.substring(0, end), termAtt.toString());
    }
    assertFalse(tk.incrementToken());
}
public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new NGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int start = 0; start < codePointCount; ++start) {
        for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
            assertTrue(tk.incrementToken());
            assertEquals(0, offsetAtt.startOffset());
            assertEquals(s.length(), offsetAtt.endOffset());
            final int startIndex = Character.offsetByCodePoints(s, 0, start);
            final int endIndex = Character.offsetByCodePoints(s, 0, end);
            assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
        }
    }
    assertFalse(tk.incrementToken());
}
public void testOtherLetterOffset() throws IOException {
    String s = "a天b";
    ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));
    int correctStartOffset = 0;
    int correctEndOffset = 1;
    OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        assertEquals(correctStartOffset, offsetAtt.startOffset());
        assertEquals(correctEndOffset, offsetAtt.endOffset());
        correctStartOffset++;
        correctEndOffset++;
    }
    tokenizer.end();
    tokenizer.close();
}
public void testDups(final String expected, final Token... tokens) throws Exception {
    final Iterator<Token> toks = Arrays.asList(tokens).iterator();
    final TokenStream ts = new RemoveDuplicatesTokenFilter(new TokenStream() {
        CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

        @Override
        public boolean incrementToken() {
            if (toks.hasNext()) {
                clearAttributes();
                Token tok = toks.next();
                termAtt.setEmpty().append(tok);
                offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
                posIncAtt.setPositionIncrement(tok.getPositionIncrement());
                return true;
            } else {
                return false;
            }
        }
    });
    assertTokenStreamContents(ts, expected.split("\\s"));
}
public void assertEquals(String s, TokenStream left, TokenStream right) throws Exception {
    left.reset();
    right.reset();
    CharTermAttribute leftTerm = left.addAttribute(CharTermAttribute.class);
    CharTermAttribute rightTerm = right.addAttribute(CharTermAttribute.class);
    OffsetAttribute leftOffset = left.addAttribute(OffsetAttribute.class);
    OffsetAttribute rightOffset = right.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute leftPos = left.addAttribute(PositionIncrementAttribute.class);
    PositionIncrementAttribute rightPos = right.addAttribute(PositionIncrementAttribute.class);

    while (left.incrementToken()) {
        assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
        assertEquals("wrong term text for input: " + s, leftTerm.toString(), rightTerm.toString());
        assertEquals("wrong position for input: " + s, leftPos.getPositionIncrement(), rightPos.getPositionIncrement());
        assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
        assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
    }
    assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
    left.end();
    right.end();
    assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
    left.close();
    right.close();
}
public void testFilterTokens() throws Exception {
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
    FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);

    filter.incrementToken();

    assertEquals("accent", termAtt.toString());
    assertEquals(2, offsetAtt.startOffset());
    assertEquals(7, offsetAtt.endOffset());
    assertEquals("wrd", typeAtt.type());
    assertEquals(3, posIncAtt.getPositionIncrement());
    assertEquals(77, flagsAtt.getFlags());
    assertEquals(new BytesRef(new byte[]{0, 1, 2, 3}), payloadAtt.getPayload());
}
protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
    TokenStream stream = analyzer.tokenStream("", text);
    // TODO: support custom attributes
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        Token token = new Token();
        token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
        token.setFlags(flagsAttValue); // overwriting any flags already set...
        token.setType(typeAtt.type());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        result.add(token);
    }
    stream.end();
    stream.close();
}
@Override
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer ts = new Tokenizer(reader) {
        final char[] cbuf = new char[maxChars];
        final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        @Override
        public boolean incrementToken() throws IOException {
            clearAttributes();
            int n = input.read(cbuf, 0, maxChars);
            if (n <= 0) return false;
            String s = toInternal(new String(cbuf, 0, n));
            termAtt.setEmpty().append(s);
            offsetAtt.setOffset(correctOffset(0), correctOffset(n));
            return true;
        }
    };
    return new TokenStreamComponents(ts);
}
private void handleTokenStream(Map<Integer, List<Token>> tokenPosMap, TokenStream tokenStream) throws IOException {
    tokenStream.reset();
    int pos = 0;

    CharTermAttribute charTermAttribute = getCharTermAttribute(tokenStream);
    OffsetAttribute offsetAttribute = getOffsetAttribute(tokenStream);
    TypeAttribute typeAttribute = getTypeAttribute(tokenStream);
    PositionIncrementAttribute positionIncrementAttribute = getPositionIncrementAttribute(tokenStream);

    while (tokenStream.incrementToken()) {
        if (null == charTermAttribute || null == offsetAttribute) {
            return;
        }
        Token token = new Token(charTermAttribute.buffer(), 0, charTermAttribute.length(),
                offsetAttribute.startOffset(), offsetAttribute.endOffset());
        if (null != typeAttribute) {
            token.setType(typeAttribute.type());
        }
        pos += null != positionIncrementAttribute ? positionIncrementAttribute.getPositionIncrement() : 1;
        if (!tokenPosMap.containsKey(pos)) {
            tokenPosMap.put(pos, new LinkedList<Token>());
        }
        tokenPosMap.get(pos).add(token);
    }
    tokenStream.close();
}
@Override
public List<Annotation> annotate(String text) throws Exception {
    text = SimpleTokenizer.format(text);
    Analyzer analyser = new EnglishAnalyzer(Version.LUCENE_47, CharArraySet.EMPTY_SET);
    TokenFilter filter = new EnglishMinimalStemFilter(analyser.tokenStream("text", new StringReader(text)));
    List<Annotation> out = Lists.newArrayList();
    // The TokenStream contract requires reset() before the first incrementToken().
    filter.reset();
    while (filter.incrementToken()) {
        CharTermAttribute az = filter.getAttribute(CharTermAttribute.class);
        OffsetAttribute o = filter.getAttribute(OffsetAttribute.class);
        String token = text.substring(o.startOffset(), o.endOffset());
        String lemma = az.toString();
        Annotation t = new Annotation();
        t.setForm(token);
        t.setLemma(lemma);
        out.add(t);
    }
    if (out.size() == 0) {
        log.debug("Input string is empty");
    }
    filter.close();
    analyser.close();
    return out;
}
@Override
public int compare(OffsetAttribute offsetA, OffsetAttribute offsetB) {
    int lenA = offsetA.endOffset() - offsetA.startOffset();
    int lenB = offsetB.endOffset() - offsetB.startOffset();
    if (lenA < lenB) {
        return 1;
    } else if (lenA > lenB) {
        return -1;
    }
    // by here, the length is the same
    if (offsetA.startOffset() < offsetB.startOffset()) {
        return -1;
    } else if (offsetA.startOffset() > offsetB.startOffset()) {
        return 1;
    }
    return 0;
}
public void testTokenizerReuse() throws IOException {
    // We should be able to use the same Tokenizer twice.
    final String path = "uri1:one";
    StringReader reader = new StringReader(path);
    PathTokenFilter ts = new PathTokenFilter(reader, PathTokenFilter.PATH_SEPARATOR,
            PathTokenFilter.SEPARATOR_TOKEN_TEXT, PathTokenFilter.NO_NS_TOKEN_TEXT,
            PathTokenFilter.NAMESPACE_START_DELIMITER, PathTokenFilter.NAMESPACE_END_DELIMITER, true);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);

    // First use
    tokenise(ts, new String[]{"uri1", "one"});
    assertEquals(path.length(), offsetAtt.startOffset());
    assertEquals(path.length(), offsetAtt.endOffset());

    // Second use
    final String path2 = "/{uri1}one/uri2:two/";
    StringReader reader2 = new StringReader(path2);
    ts.setReader(reader2);
    tokenise(ts, new String[]{"uri1", "one", "uri2", "two"});
    assertEquals(path2.length(), offsetAtt.startOffset());
    assertEquals(path2.length(), offsetAtt.endOffset());
}