/**
 * Creates a PathHierarchyTokenizer with the supplied AttributeFactory.
 *
 * @param factory     attribute factory to use
 * @param input       reader supplying the path to tokenize
 * @param bufferSize  initial term-buffer capacity; must not be negative
 * @param delimiter   path separator character
 * @param replacement character emitted in place of the delimiter
 * @param skip        number of leading path components to drop; must not be negative
 * @throws IllegalArgumentException if {@code bufferSize} or {@code skip} is negative
 */
public PathHierarchyTokenizer (AttributeFactory factory, Reader input, int bufferSize, char delimiter, char replacement, int skip) {
  super(factory, input);
  if (bufferSize < 0) {
    throw new IllegalArgumentException("bufferSize cannot be negative");
  }
  if (skip < 0) {
    throw new IllegalArgumentException("skip cannot be negative");
  }
  // Pre-size the term attribute so typical paths avoid buffer growth.
  termAtt.resizeBuffer(bufferSize);
  this.skip = skip;
  this.delimiter = delimiter;
  this.replacement = replacement;
  resultToken = new StringBuilder(bufferSize);
}
/**
 * Creates a ReversePathHierarchyTokenizer with the supplied AttributeFactory.
 *
 * @param factory     attribute factory to use
 * @param input       reader supplying the path to tokenize
 * @param bufferSize  initial term-buffer capacity; must not be negative
 * @param delimiter   path separator character
 * @param replacement character emitted in place of the delimiter
 * @param skip        number of trailing path components to drop; must not be negative
 * @throws IllegalArgumentException if {@code bufferSize} or {@code skip} is negative
 */
public ReversePathHierarchyTokenizer (AttributeFactory factory, Reader input, int bufferSize, char delimiter, char replacement, int skip) {
  super(factory, input);
  if (bufferSize < 0) {
    throw new IllegalArgumentException("bufferSize cannot be negative");
  }
  if (skip < 0) {
    throw new IllegalArgumentException("skip cannot be negative");
  }
  // Pre-size the term attribute so typical paths avoid buffer growth.
  termAtt.resizeBuffer(bufferSize);
  this.skip = skip;
  this.delimiter = delimiter;
  this.replacement = replacement;
  resultToken = new StringBuilder(bufferSize);
  resultTokenBuffer = new char[bufferSize];
  // Heuristic initial capacity: roughly one delimiter per ten characters.
  delimiterPositions = new ArrayList<>(bufferSize/10);
}
/**
 * Creates an IKTokenizer backed by the given AttributeFactory.
 *
 * @param factory  attribute factory for term/offset/type attributes
 * @param useSmart true to enable the segmenter's smart segmentation mode
 */
public IKTokenizer(AttributeFactory factory, boolean useSmart){
  super(factory);
  // Register the attributes this tokenizer populates for each token.
  offsetAtt = addAttribute(OffsetAttribute.class);
  termAtt = addAttribute(CharTermAttribute.class);
  typeAtt = addAttribute(TypeAttribute.class);
  // The segmenter reads from the inherited `input` reader.
  _IKImplement = new IKSegmenter(input, useSmart);
}
/** creates a new PatternTokenizer returning tokens from group (-1 for split functionality) */ public PatternTokenizer(AttributeFactory factory, Reader input, Pattern pattern, int group) { super(factory, input); this.group = group; // Use "" instead of str so don't consume chars // (fillBuffer) from the input on throwing IAE below: matcher = pattern.matcher(""); // confusingly group count depends ENTIRELY on the pattern but is only accessible via matcher if (group >= 0 && group > matcher.groupCount()) { throw new IllegalArgumentException("invalid group specified: pattern only has: " + matcher.groupCount() + " capturing groups"); } }
/**
 * Creates the edge-n-gram tokenizer for the configured match version.
 *
 * @param factory attribute factory to use for the returned tokenizer
 * @param input   reader supplying the text to tokenize
 * @return an {@link EdgeNGramTokenizer} (4.4+) or the legacy
 *         {@code Lucene43EdgeNGramTokenizer} for older match versions
 * @throws IllegalArgumentException if a non-front side is configured on 4.4+,
 *         which only supports prefix n-grams
 */
@Override
public Tokenizer create(AttributeFactory factory, Reader input) {
  if (luceneMatchVersion.onOrAfter(Version.LUCENE_4_4_0)) {
    // As of Lucene 4.4 only front (prefix) edge n-grams are supported.
    if (!EdgeNGramTokenFilter.Side.FRONT.getLabel().equals(side)) {
      throw new IllegalArgumentException(EdgeNGramTokenizer.class.getSimpleName() + " does not support backward n-grams as of Lucene 4.4");
    }
    // Bug fix: pass the supplied AttributeFactory through instead of
    // silently dropping it — previously a custom factory was ignored.
    return new EdgeNGramTokenizer(factory, input, minGramSize, maxGramSize);
  } else {
    // Same fix for the legacy path: forward the factory.
    return new Lucene43EdgeNGramTokenizer(luceneMatchVersion, factory, input, side, minGramSize, maxGramSize);
  }
}
/**
 * Creates the {@link TokenStream} of n-grams from the given {@link Reader}
 * and {@link AttributeFactory}, choosing the implementation that matches
 * the configured Lucene compatibility version.
 */
@Override
public Tokenizer create(AttributeFactory factory, Reader input) {
  return luceneMatchVersion.onOrAfter(Version.LUCENE_4_4_0)
      ? new NGramTokenizer(luceneMatchVersion, factory, input, minGramSize, maxGramSize)
      : new Lucene43NGramTokenizer(factory, input, minGramSize, maxGramSize);
}
/**
 * Expert: Creates a token stream for numeric values with the specified
 * <code>precisionStep</code> using the given
 * {@link org.apache.lucene.util.AttributeFactory}.
 * The stream is not yet initialized; before using it, set a value via the
 * various set<em>???</em>Value() methods.
 *
 * @param factory       delegate attribute factory
 * @param precisionStep bits of precision per token; must be &gt;= 1
 * @throws IllegalArgumentException if {@code precisionStep} is below 1
 */
public NumericTokenStream(AttributeFactory factory, final int precisionStep) {
  super(new NumericAttributeFactory(factory));
  if (precisionStep < 1) {
    throw new IllegalArgumentException("precisionStep must be >=1");
  }
  this.precisionStep = precisionStep;
  // NOTE(review): the negative shift presumably marks the stream as
  // not-yet-initialized — confirm against incrementToken()/set*Value().
  numericAtt.setShift(-precisionStep);
}
/**
 * Builds a StandardTokenizer over {@code input}, applying the configured
 * maximum token length. Falls back to the version-less constructor when no
 * match version was configured.
 */
@Override
public StandardTokenizer create(AttributeFactory factory, Reader input) {
  final StandardTokenizer tokenizer = (luceneMatchVersion == null)
      ? new StandardTokenizer(factory, input)
      : new StandardTokenizer(luceneMatchVersion, factory, input);
  tokenizer.setMaxTokenLength(maxTokenLength);
  return tokenizer;
}
/**
 * Builds a ClassicTokenizer over {@code input}, applying the configured
 * maximum token length. Falls back to the version-less constructor when no
 * match version was configured.
 */
@Override
public ClassicTokenizer create(AttributeFactory factory, Reader input) {
  final ClassicTokenizer tokenizer = (luceneMatchVersion == null)
      ? new ClassicTokenizer(factory, input)
      : new ClassicTokenizer(luceneMatchVersion, factory, input);
  tokenizer.setMaxTokenLength(maxTokenLength);
  return tokenizer;
}
/**
 * Builds a UAX29URLEmailTokenizer over {@code input}, applying the
 * configured maximum token length. Falls back to the version-less
 * constructor when no match version was configured.
 */
@Override
public UAX29URLEmailTokenizer create(AttributeFactory factory, Reader input) {
  final UAX29URLEmailTokenizer tokenizer = (luceneMatchVersion == null)
      ? new UAX29URLEmailTokenizer(factory, input)
      : new UAX29URLEmailTokenizer(luceneMatchVersion, factory, input);
  tokenizer.setMaxTokenLength(maxTokenLength);
  return tokenizer;
}
/**
 * Construct a token stream processing the given input using the given
 * AttributeFactory.
 *
 * @param factory attribute factory passed to the superclass
 * @param input   character source; must not be null
 * @throws NullPointerException if {@code input} is null
 */
protected Tokenizer(AttributeFactory factory, Reader input) {
  super(factory);
  if (input == null) {
    throw new NullPointerException("input must not be null");
  }
  // Stored as "pending" — presumably promoted to the active input
  // elsewhere (e.g. on reset); confirm against the enclosing class.
  this.inputPending = input;
}
/**
 * Creates a new ThaiTokenizer using the supplied AttributeFactory.
 *
 * @throws UnsupportedOperationException if the running JRE lacks Thai
 *         segmentation support
 */
public ThaiTokenizer(AttributeFactory factory, Reader reader) {
  super(factory, reader, (BreakIterator) sentenceProto.clone());
  // Fail fast on JREs whose BreakIterator cannot segment Thai text.
  if (!DBBI_AVAILABLE) {
    throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
  }
  // Each tokenizer gets its own word breaker; BreakIterator is stateful.
  wordBreaker = (BreakIterator) proto.clone();
}
/**
 * Creates a CharTokenizer bound to a specific Lucene match version.
 *
 * @deprecated Use {@link #CharTokenizer(AttributeFactory, Reader)}
 */
@Deprecated
public CharTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
  super(factory, input);
  // Character utilities are version-dependent, hence the deprecated overload.
  charUtils = CharacterUtils.getInstance(matchVersion);
}
/** Builds a WhitespaceTokenizer, using the version-less constructor when no match version is set. */
@Override
public WhitespaceTokenizer create(AttributeFactory factory, Reader input) {
  return (luceneMatchVersion == null)
      ? new WhitespaceTokenizer(factory, input)
      : new WhitespaceTokenizer(luceneMatchVersion, factory, input);
}
/**
 * Creates a KeywordTokenizer with the given term-buffer size.
 *
 * @param factory    attribute factory to use
 * @param input      reader supplying the text to tokenize
 * @param bufferSize initial term-buffer capacity; must be positive
 * @throws IllegalArgumentException if {@code bufferSize} is zero or negative
 */
public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) {
  super(factory, input);
  if (bufferSize <= 0) {
    throw new IllegalArgumentException("bufferSize must be > 0");
  }
  // Pre-size the term attribute; the whole input becomes a single token.
  termAtt.resizeBuffer(bufferSize);
}
/** Builds a LetterTokenizer, using the version-less constructor when no match version is set. */
@Override
public LetterTokenizer create(AttributeFactory factory, Reader input) {
  return (luceneMatchVersion == null)
      ? new LetterTokenizer(factory, input)
      : new LetterTokenizer(luceneMatchVersion, factory, input);
}
/** Builds a LowerCaseTokenizer, using the version-less constructor when no match version is set. */
@Override
public LowerCaseTokenizer create(AttributeFactory factory, Reader input) {
  return (luceneMatchVersion == null)
      ? new LowerCaseTokenizer(factory, input)
      : new LowerCaseTokenizer(luceneMatchVersion, factory, input);
}
/** Builds a forward or reverse path-hierarchy tokenizer, per the configured direction. */
@Override
public Tokenizer create(AttributeFactory factory, Reader input) {
  return reverse
      ? new ReversePathHierarchyTokenizer(factory, input, delimiter, replacement, skip)
      : new PathHierarchyTokenizer(factory, input, delimiter, replacement, skip);
}
/**
 * Make this tokenizer get attributes from the delegate token stream.
 * The returned factory does not create fresh attribute instances; every
 * request is answered by adding (or reusing) the attribute on {@code source},
 * so attributes are shared with the delegate.
 *
 * Note: the redundant {@code final} modifier was removed — a private static
 * method cannot be overridden, so {@code final} had no effect.
 */
private static AttributeFactory delegatingAttributeFactory(final AttributeSource source) {
  return new AttributeFactory() {
    @Override
    public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
      // Delegate: share the attribute instance owned by `source`.
      return (AttributeImpl) source.addAttribute(attClass);
    }
  };
}
/**
 * MeCabKoTokenizer constructor using the default AttributeFactory.
 *
 * @param option   tokenizer options
 * @param appender PosAppender; pass TokenGenerator.NO_DECOMPOUND when
 *                 compound-noun decomposition is not required
 */
public MeCabKoTokenizer(
    TokenizerOption option,
    PosAppender appender) {
  // Delegate to the main constructor with the default attribute factory.
  this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, option, appender);
}
/**
 * MeCabKoTokenizer constructor.
 *
 * @param factory  the AttributeFactory to use
 * @param option   MeCabTokenizer options
 * @param appender PosAppender; pass TokenGenerator.NO_DECOMPOUND when
 *                 compound-noun decomposition is not required
 */
public MeCabKoTokenizer(
    AttributeFactory factory,
    TokenizerOption option,
    PosAppender appender) {
  super(factory);
  this.option = option;
  posAppender = appender;
  // Initialize the MeCab binding, then register token attributes.
  setMeCab();
  setAttributes();
}
/** Builds a MeCabKoTokenizer with a StandardPosAppender for the configured options. */
@Override
public Tokenizer create(AttributeFactory factory) {
  return new MeCabKoTokenizer(factory, option, new StandardPosAppender(option));
}
/**
 * Creates a MockTokenizer driven by the given automaton.
 *
 * @param factory        attribute factory to use
 * @param runAutomaton   automaton that accepts token characters
 * @param lowerCase      whether emitted tokens are lower-cased
 * @param maxTokenLength maximum length of an emitted token
 */
public MockTokenizer(AttributeFactory factory, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
  super(factory);
  this.runAutomaton = runAutomaton;
  this.lowerCase = lowerCase;
  this.maxTokenLength = maxTokenLength;
  // Start matching from the automaton's initial state.
  this.state = runAutomaton.getInitialState();
}
@Override public List<String> parseQuery(String queryStr) { // tokenize queryStr, remove stop word, stemming List<String> tokens = new ArrayList<String>(); AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY; Tokenizer tokenizer = new StandardTokenizer(factory); tokenizer.setReader(new StringReader(queryStr)); CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet(); TokenStream tokenStream = new StopFilter(tokenizer, stopWords); // StringBuilder sb = new StringBuilder(); CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class); try { tokenStream.reset(); while (tokenStream.incrementToken()) { String term = charTermAttribute.toString(); tokens.add(term); // sb.append(term + " "); } tokenStream.end(); tokenStream.close(); tokenizer.close(); } catch (IOException e) { e.printStackTrace(); } // System.out.println("QU="+ sb.toString()); return tokens; }
/**
 * Creates a MockTokenizer driven by the given automaton, registering the
 * term and offset attributes it populates.
 *
 * @param factory        attribute factory to use
 * @param runAutomaton   automaton that accepts token characters
 * @param lowerCase      whether emitted tokens are lower-cased
 * @param maxTokenLength maximum length of an emitted token
 */
public MockTokenizer(AttributeFactory factory, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
  super(factory);
  this.runAutomaton = runAutomaton;
  this.lowerCase = lowerCase;
  this.maxTokenLength = maxTokenLength;
  // Start matching from the automaton's initial state.
  this.state = runAutomaton.getInitialState();
  termAtt = addAttribute(CharTermAttribute.class);
  offsetAtt = addAttribute(OffsetAttribute.class);
}
/**
 * Creates a UIMAAnnotationsTokenizer that emits one token per annotation
 * of the given UIMA type.
 *
 * @param descriptorPath          path to the UIMA analysis-engine descriptor
 * @param tokenType               UIMA type whose annotations become tokens
 * @param configurationParameters parameters for the analysis engine
 * @param factory                 attribute factory to use
 * @param input                   reader supplying the text to analyze
 */
public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Map<String, Object> configurationParameters,
    AttributeFactory factory, Reader input) {
  super(factory, input, descriptorPath, configurationParameters);
  this.tokenTypeString = tokenType;
  // Attributes populated per emitted annotation.
  this.termAttr = addAttribute(CharTermAttribute.class);
  this.offsetAttr = addAttribute(OffsetAttribute.class);
}
/**
 * Creates a UIMATypeAwareAnnotationsTokenizer that emits one token per
 * annotation of the given UIMA type, also exposing a token-type attribute
 * read from the given feature path.
 *
 * @param descriptorPath           path to the UIMA analysis-engine descriptor
 * @param tokenType                UIMA type whose annotations become tokens
 * @param typeAttributeFeaturePath feature path used to fill the type attribute
 * @param configurationParameters  parameters for the analysis engine
 * @param factory                  attribute factory to use
 * @param input                    reader supplying the text to analyze
 */
public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath,
    Map<String, Object> configurationParameters, AttributeFactory factory, Reader input) {
  super(factory, input, descriptorPath, configurationParameters);
  this.tokenTypeString = tokenType;
  this.typeAttributeFeaturePath = typeAttributeFeaturePath;
  // Attributes populated per emitted annotation.
  this.termAttr = addAttribute(CharTermAttribute.class);
  this.typeAttr = addAttribute(TypeAttribute.class);
  this.offsetAttr = addAttribute(OffsetAttribute.class);
}