public static void printlnToken(String txt, Analyzer analyzer) throws IOException {
    System.out.println("---------" + txt.length() + "\n" + txt);
    TokenStream ts = analyzer.tokenStream("text", new StringReader(txt));
    /*
    // Lucene 2.9 and below
    for (Token t = new Token(); (t = ts.next(t)) != null;) {
        System.out.println(t);
    }
    */
    /*
    while (ts.incrementToken()) {
        TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
        OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
        System.out.println("(" + termAtt.term() + "," + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ",type=" + typeAtt.type() + ")");
    }
    */
    ts.reset();
    // Print every token pulled from the stream via the TokenUtils helper.
    for (PackedTokenAttributeImpl t = new PackedTokenAttributeImpl(); (t = TokenUtils.nextToken(ts, t)) != null;) {
        System.out.println(t);
    }
    ts.close();
}
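// Usage sketch (added; not part of the original source). TokenDumpUtil is a
// hypothetical name for the class that declares the printlnToken helper above;
// WhitespaceAnalyzer and the sample text are illustrative stand-ins for any
// Analyzer and input.
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;

public class PrintlnTokenDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new WhitespaceAnalyzer()) {
            // Dumps each PackedTokenAttributeImpl produced for the sample text.
            TokenDumpUtil.printlnToken("hello lucene token stream", analyzer);
        }
    }
}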
@Override public final boolean incrementToken() throws IOException { if (!tokens.isEmpty()) { if (current == null) { throw new IllegalArgumentException("current is null"); } PackedTokenAttributeImpl token = tokens.removeFirst(); restoreState(current); termAtt.setEmpty().append(token); posIncAtt.setPositionIncrement(0); return true; } if (input.incrementToken()) { process(); if (!tokens.isEmpty()) { current = captureState(); } return true; } else { return false; } }
@Override public final boolean incrementToken() throws IOException { if (!tokens.isEmpty()) { if (current == null) { throw new IllegalArgumentException("current is null"); } PackedTokenAttributeImpl token = tokens.removeFirst(); restoreState(current); termAtt.setEmpty().append(token); posIncAtt.setPositionIncrement(0); return true; } if (input.incrementToken()) { detect(); if (!tokens.isEmpty()) { current = captureState(); } return true; } else { return false; } }
@Override
public boolean incrementToken() throws IOException {
    while (true) {
        if (curTermBuffer == null) {
            if (!input.incrementToken()) {
                return false;
            } else {
                curTermBuffer = termAtt.buffer().clone();
                curLen = ((PackedTokenAttributeImpl) termAtt).endOffset() - ((PackedTokenAttributeImpl) termAtt).startOffset();
            }
        } else {
            if (curPos < curLen) {
                // Emit the buffered term one character at a time.
                termAtt.copyBuffer(curTermBuffer, curPos, 1);
                curPos++;
                return true;
            } else {
                curTermBuffer = null;
                curPos = 0;
            }
        }
    }
}
@Override
public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
        PackedTokenAttributeImpl token = eudex();
        restoreState(current);
        termAtt.setEmpty().append(token);
        offsetAtt.setOffset(token.startOffset(), token.endOffset());
        posIncAtt.setPositionIncrement(0);
        current = captureState();
        return true;
    } else {
        return false;
    }
}
protected PackedTokenAttributeImpl eudex() throws CharacterCodingException {
    String term = new String(termAtt.buffer(), 0, termAtt.length());
    CharSequence s = Long.toHexString(eudex.encode(term));
    PackedTokenAttributeImpl impl = new PackedTokenAttributeImpl();
    impl.append(s);
    return impl;
}
public CutLetterDigitFilter(TokenStream input) {
    super(input);
    reusableToken = new PackedTokenAttributeImpl();
    termAtt = addAttribute(CharTermAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
}
private void addToken(PackedTokenAttributeImpl oriToken, int termBufferOffset, int termBufferLength, byte type) {
    PackedTokenAttributeImpl token = TokenUtils.subToken(oriToken, termBufferOffset, termBufferLength);
    if (type == Character.DECIMAL_DIGIT_NUMBER) {
        token.setType(Word.TYPE_DIGIT);
    } else {
        token.setType(Word.TYPE_LETTER);
    }
    tokenQueue.offer(token);
}
public final boolean incrementToken() throws IOException {
    clearAttributes();
    PackedTokenAttributeImpl token = nextToken(reusableToken);
    if (token != null) {
        termAtt.copyBuffer(token.buffer(), 0, token.length());
        offsetAtt.setOffset(token.startOffset(), token.endOffset());
        typeAtt.setType(token.type());
        return true;
    } else {
        return false;
    }
}
protected void process() throws CharacterCodingException {
    String term = new String(termAtt.buffer(), 0, termAtt.length());
    for (CharSequence charSequence : process(term)) {
        if (charSequence != null) {
            PackedTokenAttributeImpl token = new PackedTokenAttributeImpl();
            token.append(charSequence);
            tokens.add(token);
        }
    }
}
private void detect() throws CharacterCodingException {
    CharSequence term = new String(termAtt.buffer(), 0, termAtt.length());
    Collection<CharSequence> variants = service.lookup(settings, term);
    for (CharSequence ch : variants) {
        if (ch != null) {
            PackedTokenAttributeImpl token = new PackedTokenAttributeImpl();
            token.append(ch);
            tokens.add(token);
        }
    }
}
protected void baseform() throws CharacterCodingException {
    CharSequence term = new String(termAtt.buffer(), 0, termAtt.length());
    CharSequence s = dictionary.lookup(term);
    if (s != null && s.length() > 0) {
        PackedTokenAttributeImpl impl = new PackedTokenAttributeImpl();
        impl.append(s);
        tokens.add(impl);
    }
}
protected void detect() throws CharacterCodingException {
    CharSequence term = new String(termAtt.buffer(), 0, termAtt.length());
    Collection<CharSequence> variants = standardNumberService.lookup(term);
    for (CharSequence ch : variants) {
        if (ch != null) {
            PackedTokenAttributeImpl token = new PackedTokenAttributeImpl();
            token.append(ch);
            tokens.add(token);
        }
    }
}
protected void detect() throws CharacterCodingException {
    CharSequence term = new String(termAtt.buffer(), 0, termAtt.length());
    Collection<CharSequence> variants = service.lookup(settings, term);
    for (CharSequence ch : variants) {
        if (ch != null) {
            PackedTokenAttributeImpl token = new PackedTokenAttributeImpl();
            token.append(ch);
            tokens.add(token);
        }
    }
}
private PackedTokenAttributeImpl nextToken(PackedTokenAttributeImpl reusableToken) throws IOException {
    assert reusableToken != null;
    // Return any token left over from the previous call first.
    PackedTokenAttributeImpl nextToken = tokenQueue.poll();
    if (nextToken != null) {
        return nextToken;
    }
    nextToken = TokenUtils.nextToken(input, reusableToken);
    if (nextToken != null
            && (Word.TYPE_LETTER_OR_DIGIT.equalsIgnoreCase(nextToken.type())
                || Word.TYPE_DIGIT_OR_LETTER.equalsIgnoreCase(nextToken.type()))) {
        final char[] buffer = nextToken.buffer();
        final int length = nextToken.length();
        byte lastType = (byte) Character.getType(buffer[0]); // type of the previous character, used to detect run boundaries
        int termBufferOffset = 0;
        int termBufferLength = 0;
        for (int i = 0; i < length; i++) {
            byte type = (byte) Character.getType(buffer[i]);
            if (type <= Character.MODIFIER_LETTER) {
                type = Character.LOWERCASE_LETTER;
            }
            if (type != lastType) {
                // Different from the previous character's type: flush the current run.
                addToken(nextToken, termBufferOffset, termBufferLength, lastType);
                termBufferOffset += termBufferLength;
                termBufferLength = 0;
                lastType = type;
            }
            termBufferLength++;
        }
        if (termBufferLength > 0) {
            // Flush the final run.
            addToken(nextToken, termBufferOffset, termBufferLength, lastType);
        }
        nextToken = tokenQueue.poll();
    }
    return nextToken;
}
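// Usage sketch (added; not part of the original source). It wires the CutLetterDigitFilter
// defined by the constructor, incrementToken, addToken and nextToken snippets above behind
// a plain WhitespaceTokenizer. The filter only splits tokens whose TypeAttribute is
// Word.TYPE_LETTER_OR_DIGIT or Word.TYPE_DIGIT_OR_LETTER, so a tiny anonymous TokenFilter
// marks every token that way first. Imports for CutLetterDigitFilter and Word come from the
// snippet's own project and are omitted here; the sample input is illustrative.
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class CutLetterDigitFilterDemo {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("abc123 x2y"));
        // Mark every token so the filter's letter/digit splitting kicks in.
        TokenStream typed = new TokenFilter(tokenizer) {
            private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
            @Override
            public boolean incrementToken() throws IOException {
                if (!input.incrementToken()) {
                    return false;
                }
                typeAtt.setType(Word.TYPE_LETTER_OR_DIGIT);
                return true;
            }
        };
        TokenStream ts = new CutLetterDigitFilter(typed);
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // Given the splitting logic above, expect: abc / 123 / x / 2 / y.
            System.out.println(termAtt.toString() + "  type=" + typeAtt.type());
        }
        ts.end();
        ts.close();
    }
}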
protected StandardNumberTokenFilter(TokenStream input, StandardNumberService standardNumberService) {
    super(input);
    this.tokens = new LinkedList<PackedTokenAttributeImpl>();
    this.standardNumberService = standardNumberService;
}
protected BaseformTokenFilter(TokenStream input, Dictionary dictionary) {
    super(input);
    this.tokens = new LinkedList<PackedTokenAttributeImpl>();
    this.dictionary = dictionary;
}