public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) {
  super(suffix);
  this.suffix = suffix;
  this.prefix = prefix;
  prefixExhausted = false;

  termAtt = addAttribute(CharTermAttribute.class);
  posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  payloadAtt = addAttribute(PayloadAttribute.class);
  offsetAtt = addAttribute(OffsetAttribute.class);
  typeAtt = addAttribute(TypeAttribute.class);
  flagsAtt = addAttribute(FlagsAttribute.class);

  p_termAtt = prefix.addAttribute(CharTermAttribute.class);
  p_posIncrAtt = prefix.addAttribute(PositionIncrementAttribute.class);
  p_payloadAtt = prefix.addAttribute(PayloadAttribute.class);
  p_offsetAtt = prefix.addAttribute(OffsetAttribute.class);
  p_typeAtt = prefix.addAttribute(TypeAttribute.class);
  p_flagsAtt = prefix.addAttribute(FlagsAttribute.class);
}
@Override
public void copyTo(AttributeImpl target) {
  if (target instanceof Token) {
    final Token to = (Token) target;
    to.reinit(this);
    // reinit shares the payload, so clone it:
    if (payload != null) {
      to.payload = payload.clone();
    }
  } else {
    super.copyTo(target);
    ((OffsetAttribute) target).setOffset(startOffset, endOffset);
    ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
    ((PayloadAttribute) target).setPayload((payload == null) ? null : payload.clone());
    ((FlagsAttribute) target).setFlags(flags);
    ((TypeAttribute) target).setType(type);
  }
}
public void testFilterTokens() throws Exception {
  SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
  TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
  FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);

  filter.incrementToken();

  assertEquals("accent", termAtt.toString());
  assertEquals(2, offsetAtt.startOffset());
  assertEquals(7, offsetAtt.endOffset());
  assertEquals("wrd", typeAtt.type());
  assertEquals(3, posIncAtt.getPositionIncrement());
  assertEquals(77, flagsAtt.getFlags());
  assertEquals(new BytesRef(new byte[]{0, 1, 2, 3}), payloadAtt.getPayload());
}
@Override
public void reflectWith(AttributeReflector reflector) {
  super.reflectWith(reflector);
  reflector.reflect(OffsetAttribute.class, "startOffset", startOffset);
  reflector.reflect(OffsetAttribute.class, "endOffset", endOffset);
  reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement);
  reflector.reflect(PayloadAttribute.class, "payload", payload);
  reflector.reflect(FlagsAttribute.class, "flags", flags);
  reflector.reflect(TypeAttribute.class, "type", type);
}
public void testBoth() throws Exception {
  Set<String> untoks = new HashSet<>();
  untoks.add(WikipediaTokenizer.CATEGORY);
  untoks.add(WikipediaTokenizer.ITALICS);
  String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
  // should output all the individual tokens plus the untokenized tokens as well
  WikipediaTokenizer tf = new WikipediaTokenizer(newAttributeFactory(), new StringReader(test), WikipediaTokenizer.BOTH, untoks);
  assertTokenStreamContents(tf,
      new String[] { "a b c d", "a", "b", "c", "d", "e f g", "e", "f", "g", "link", "here", "link", "there", "italics here", "italics", "here", "something", "more italics", "more", "italics", "h i j", "h", "i", "j" },
      new int[] { 11, 11, 13, 15, 17, 32, 32, 34, 36, 42, 47, 56, 61, 71, 71, 79, 86, 98, 98, 103, 124, 124, 128, 132 },
      new int[] { 18, 12, 14, 16, 18, 37, 33, 35, 37, 46, 51, 60, 66, 83, 78, 83, 95, 110, 102, 110, 133, 125, 129, 133 },
      new int[] { 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1 }
  );

  // now check the flags, TODO: add way to check flags from BaseTokenStreamTestCase?
  tf = new WikipediaTokenizer(newAttributeFactory(), new StringReader(test), WikipediaTokenizer.BOTH, untoks);
  int[] expectedFlags = new int[] { UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0 };
  FlagsAttribute flagsAtt = tf.addAttribute(FlagsAttribute.class);
  tf.reset();
  for (int i = 0; i < expectedFlags.length; i++) {
    assertTrue(tf.incrementToken());
    assertEquals("flags " + i, expectedFlags[i], flagsAtt.getFlags());
  }
  assertFalse(tf.incrementToken());
  tf.close();
}
public void testAttributeReuse() throws Exception {
  ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_3_0);
  // just consume
  TokenStream ts = analyzer.tokenStream("dummy", "ภาษาไทย");
  assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
  // this consumer adds flagsAtt, which this analyzer does not use.
  ts = analyzer.tokenStream("dummy", "ภาษาไทย");
  ts.addAttribute(FlagsAttribute.class);
  assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
}
@Override
public Collection<Token> convert(String origQuery) {
  Collection<Token> result = new HashSet<>();
  WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
  TokenStream ts = null;
  try {
    ts = analyzer.tokenStream("", origQuery);
    // TODO: support custom attributes
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      Token tok = new Token();
      tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
      tok.setFlags(flagsAtt.getFlags());
      tok.setPayload(payloadAtt.getPayload());
      tok.setPositionIncrement(posIncAtt.getPositionIncrement());
      tok.setType(typeAtt.type());
      result.add(tok);
    }
    ts.end();
    return result;
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeWhileHandlingException(ts);
  }
}
public void testBoth() throws Exception {
  Set<String> untoks = new HashSet<String>();
  untoks.add(WikipediaTokenizer.CATEGORY);
  untoks.add(WikipediaTokenizer.ITALICS);
  String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
  // should output all the individual tokens plus the untokenized tokens as well
  WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
  assertTokenStreamContents(tf,
      new String[] { "a b c d", "a", "b", "c", "d", "e f g", "e", "f", "g", "link", "here", "link", "there", "italics here", "italics", "here", "something", "more italics", "more", "italics", "h i j", "h", "i", "j" },
      new int[] { 11, 11, 13, 15, 17, 32, 32, 34, 36, 42, 47, 56, 61, 71, 71, 79, 86, 98, 98, 103, 124, 124, 128, 132 },
      new int[] { 18, 12, 14, 16, 18, 37, 33, 35, 37, 46, 51, 60, 66, 83, 78, 83, 95, 110, 102, 110, 133, 125, 129, 133 },
      new int[] { 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1 }
  );

  // now check the flags, TODO: add way to check flags from BaseTokenStreamTestCase?
  tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
  int[] expectedFlags = new int[] { UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0 };
  FlagsAttribute flagsAtt = tf.addAttribute(FlagsAttribute.class);
  tf.reset();
  for (int i = 0; i < expectedFlags.length; i++) {
    assertTrue(tf.incrementToken());
    assertEquals("flags " + i, expectedFlags[i], flagsAtt.getFlags());
  }
  assertFalse(tf.incrementToken());
  tf.close();
}
public void testAttributeReuse() throws Exception {
  ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
  // just consume
  TokenStream ts = analyzer.tokenStream("dummy", new StringReader("ภาษาไทย"));
  assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
  // this consumer adds flagsAtt, which this analyzer does not use.
  ts = analyzer.tokenStream("dummy", new StringReader("ภาษาไทย"));
  ts.addAttribute(FlagsAttribute.class);
  assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
}
@Override
public Collection<Token> convert(String origQuery) {
  try {
    Collection<Token> result = new HashSet<Token>();
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_40);
    TokenStream ts = analyzer.tokenStream("", new StringReader(origQuery));
    // TODO: support custom attributes
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      Token tok = new Token();
      tok.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      tok.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
      tok.setFlags(flagsAtt.getFlags());
      tok.setPayload(payloadAtt.getPayload());
      tok.setPositionIncrement(posIncAtt.getPositionIncrement());
      tok.setType(typeAtt.type());
      result.add(tok);
    }
    ts.end();
    ts.close();
    return result;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
protected NamedEntityPopulateFilter(ResultNamedEntityExtraction result, TokenStream input) {
  super(input);
  this.result = result;
  this.termAtt = (CharTermAttribute) addAttribute(CharTermAttribute.class);
  this.flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
  this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
}
public TokenTerm(final CharTermAttribute termAtt, final PositionIncrementAttribute posIncrAtt,
    final OffsetAttribute offsetAtt, final TypeAttribute typeAtt, final FlagsAttribute flagsAtt) {
  this.term = termAtt != null ? termAtt.toString() : null;
  this.start = offsetAtt != null ? offsetAtt.startOffset() : 0;
  this.end = offsetAtt != null ? offsetAtt.endOffset() : 0;
  this.increment = posIncrAtt != null ? posIncrAtt.getPositionIncrement() : 0;
  this.type = typeAtt != null ? typeAtt.type() : null;
  this.flags = flagsAtt != null ? flagsAtt.getFlags() : 0;
}
protected AbstractTermFilter(TokenStream input) {
  super(input);
  termAtt = (CharTermAttribute) addAttribute(CharTermAttribute.class);
  posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
  offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
  flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
}
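The filter constructors above only register the attributes; the flags themselves are read or written later in incrementToken(). As a minimal sketch of that pattern (not taken from any of the sources above; the class name, flag constant, and length threshold are invented for illustration), a filter might mark long tokens by setting an application-defined flag bit while preserving bits set upstream:

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;

/** Hypothetical example: flags tokens at or above a length threshold. */
public final class LongTokenFlagFilter extends TokenFilter {
  // application-defined bit; FlagsAttribute carries an arbitrary int, Lucene assigns no meaning to it
  public static final int LONG_TOKEN_FLAG = 1;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
  private final int minLength;

  public LongTokenFlagFilter(TokenStream input, int minLength) {
    super(input);
    this.minLength = minLength;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (termAtt.length() >= minLength) {
      // OR the new bit in so any flags set by earlier stages survive
      flagsAtt.setFlags(flagsAtt.getFlags() | LONG_TOKEN_FLAG);
    }
    return true;
  }
}

A downstream consumer would then test the bit with something like (flagsAtt.getFlags() & LongTokenFlagFilter.LONG_TOKEN_FLAG) != 0, in the same way the WikipediaTokenizer tests above check UNTOKENIZED_TOKEN_FLAG.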
public void testAttributeReuse() throws Exception {
  ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
  // just consume
  TokenStream ts = analyzer.tokenStream("dummy", "ภาษาไทย");
  assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
  // this consumer adds flagsAtt, which this analyzer does not use.
  ts = analyzer.tokenStream("dummy", "ภาษาไทย");
  ts.addAttribute(FlagsAttribute.class);
  assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });
}
private void copyToWithoutPayloadClone(AttributeImpl target) {
  super.copyTo(target);
  ((FlagsAttribute) target).setFlags(flags);
  ((PayloadAttribute) target).setPayload(payload);
}
@Override
public void copyTo(AttributeImpl target) {
  super.copyTo(target);
  ((FlagsAttribute) target).setFlags(flags);
  ((PayloadAttribute) target).setPayload((payload == null) ? null : payload.clone());
}
@Override
public void reflectWith(AttributeReflector reflector) {
  super.reflectWith(reflector);
  reflector.reflect(FlagsAttribute.class, "flags", flags);
  reflector.reflect(PayloadAttribute.class, "payload", payload);
}
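These reflectWith implementations are what back AttributeSource.reflectAsString(boolean), which is handy when debugging which flags a chain actually emits. A rough usage sketch, assuming an existing analyzer and placeholder field name and text:

// "body" and the sample text are placeholders for illustration
TokenStream ts = analyzer.tokenStream("body", "some sample text");
ts.reset();
while (ts.incrementToken()) {
  // dumps every registered attribute as key=value pairs,
  // e.g. the flags value contributed by the reflect(FlagsAttribute.class, "flags", flags) call above
  System.out.println(ts.reflectAsString(true));
}
ts.end();
ts.close();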