/**
 * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range.
 *
 * @param input {@link org.apache.lucene.analysis.TokenStream} holding the input to be tokenized
 * @param minGram the smallest n-gram to generate
 * @param maxGram the largest n-gram to generate
 */
public Lucene43EdgeNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
  super(input);
  if (minGram < 1) {
    throw new IllegalArgumentException("minGram must be greater than zero");
  }
  if (minGram > maxGram) {
    throw new IllegalArgumentException("minGram must not be greater than maxGram");
  }
  this.charUtils = CharacterUtils.getJava4Instance();
  this.minGram = minGram;
  this.maxGram = maxGram;
}
private void init(Version version, int minGram, int maxGram, boolean edgesOnly) {
  if (!version.onOrAfter(Version.LUCENE_4_4_0)) {
    throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
  }
  charUtils = version.onOrAfter(Version.LUCENE_4_4_0)
      ? CharacterUtils.getInstance(version)
      : CharacterUtils.getJava4Instance();
  if (minGram < 1) {
    throw new IllegalArgumentException("minGram must be greater than zero");
  }
  if (minGram > maxGram) {
    throw new IllegalArgumentException("minGram must not be greater than maxGram");
  }
  this.minGram = minGram;
  this.maxGram = maxGram;
  this.edgesOnly = edgesOnly;
  // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
  charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024);
  buffer = new int[charBuffer.getBuffer().length];
  // Make the term att large enough
  termAtt.resizeBuffer(2 * maxGram);
}
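The 2 * maxGram sizing above allows for the worst case in which every code point is a supplementary character and therefore occupies two Java chars (a surrogate pair). A self-contained sketch of that arithmetic using only the JDK; the emoji literal is just an illustration:

// Sketch: why a buffer of 2 * maxGram chars can be needed.
// Supplementary code points (above U+FFFF) take two Java chars each.
public class SurrogateSizingDemo {
  public static void main(String[] args) {
    String bmp = "abc";                    // three BMP code points
    String supplementary = "\uD83D\uDE00"; // U+1F600: one code point, two chars
    System.out.println(bmp.codePointCount(0, bmp.length()));   // 3
    System.out.println(bmp.length());                          // 3
    System.out.println(supplementary.codePointCount(0, supplementary.length())); // 1
    System.out.println(supplementary.length());                // 2
    // So a gram of maxGram code points can require up to 2 * maxGram chars.
  }
}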
/**
 * @deprecated Use {@link #MorfologikFilter(TokenStream,String)}
 */
@Deprecated
public MorfologikFilter(final TokenStream in, final String dict, final Version version) {
  super(in);
  this.input = in;
  // SOLR-4007: temporarily substitute context class loader to allow finding dictionary resources.
  Thread me = Thread.currentThread();
  ClassLoader cl = me.getContextClassLoader();
  try {
    me.setContextClassLoader(morfologik.stemming.Dictionary.class.getClassLoader());
    this.stemmer = new DictionaryLookup(morfologik.stemming.Dictionary.getForLanguage(dict));
    this.charUtils = CharacterUtils.getInstance(version);
    this.lemmaList = Collections.emptyList();
  } finally {
    me.setContextClassLoader(cl);
  }
}
/**
 * Creates PinyinNGramTokenFilter that can generate n-grams in the sizes of
 * the given range.
 *
 * @param input
 *          {@link TokenStream} holding the input to be tokenized
 * @param minGram
 *          the smallest n-gram to generate
 * @param maxGram
 *          the largest n-gram to generate
 */
public PinyinNGramTokenFilter(TokenStream input, int minGram, int maxGram) {
  super(input);
  if (minGram < 1) {
    throw new IllegalArgumentException("minGram must be greater than zero");
  }
  if (minGram > maxGram) {
    throw new IllegalArgumentException("minGram must not be greater than maxGram");
  }
  this.charUtils = CharacterUtils.getInstance();
  this.minGram = minGram;
  this.maxGram = maxGram;
}
/**
 * Builds a filter for given PolishStemmer.DICTIONARY enum.
 *
 * @param in input token stream
 * @param dict PolishStemmer.DICTIONARY enum
 * @param version Lucene version compatibility for lowercasing.
 */
public MorfologikFilter(final TokenStream in, final DICTIONARY dict, final Version version) {
  super(in);
  this.input = in;
  // SOLR-4007: temporarily substitute context class loader to allow finding dictionary resources.
  Thread me = Thread.currentThread();
  ClassLoader cl = me.getContextClassLoader();
  try {
    me.setContextClassLoader(PolishStemmer.class.getClassLoader());
    this.stemmer = new PolishStemmer(dict);
    this.charUtils = CharacterUtils.getInstance(version);
    this.lemmaList = Collections.emptyList();
  } finally {
    me.setContextClassLoader(cl);
  }
}
private void init(Version version, int minGram, int maxGram, boolean edgesOnly) {
  if (!version.onOrAfter(Version.LUCENE_44)) {
    throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
  }
  charUtils = version.onOrAfter(Version.LUCENE_44)
      ? CharacterUtils.getInstance(version)
      : CharacterUtils.getJava4Instance();
  if (minGram < 1) {
    throw new IllegalArgumentException("minGram must be greater than zero");
  }
  if (minGram > maxGram) {
    throw new IllegalArgumentException("minGram must not be greater than maxGram");
  }
  this.minGram = minGram;
  this.maxGram = maxGram;
  this.edgesOnly = edgesOnly;
  // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
  charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024);
  buffer = new int[charBuffer.getBuffer().length];
  // Make the term att large enough
  termAtt.resizeBuffer(2 * maxGram);
}
/**
 * Creates MorfologikFilter.
 *
 * @param in input token stream
 * @param version Lucene version compatibility for lowercasing.
 */
public MorfologikFilter(final TokenStream in, final Version version) {
  super(in);
  this.input = in;
  // SOLR-4007: temporarily substitute context class loader to allow finding dictionary resources.
  Thread me = Thread.currentThread();
  ClassLoader cl = me.getContextClassLoader();
  try {
    me.setContextClassLoader(PolishStemmer.class.getClassLoader());
    this.stemmer = new PolishStemmer();
    this.charUtils = CharacterUtils.getInstance(version);
    this.lemmaList = Collections.emptyList();
  } finally {
    me.setContextClassLoader(cl);
  }
}
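All three MorfologikFilter constructors above repeat the same save/swap/restore dance around the thread context class loader (SOLR-4007). A minimal sketch of that pattern as a reusable helper; the ClassLoaderScope class and withContextClassLoader method are hypothetical, not part of Lucene or Morfologik:

import java.util.concurrent.Callable;

// Hypothetical helper illustrating the SOLR-4007 pattern: run a task with a
// substitute context class loader, always restoring the original afterwards.
public final class ClassLoaderScope {
  public static <T> T withContextClassLoader(ClassLoader substitute, Callable<T> task) throws Exception {
    Thread me = Thread.currentThread();
    ClassLoader saved = me.getContextClassLoader();
    try {
      me.setContextClassLoader(substitute);
      return task.call(); // e.g. load a Morfologik dictionary here
    } finally {
      me.setContextClassLoader(saved); // restore even if the task throws
    }
  }
}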
/**
 * Creates Lucene43NGramTokenFilter with given min and max n-grams.
 *
 * @param input {@link org.apache.lucene.analysis.TokenStream} holding the input to be tokenized
 * @param minGram the smallest n-gram to generate
 * @param maxGram the largest n-gram to generate
 */
public Lucene43NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
  super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
  this.charUtils = CharacterUtils.getJava4Instance();
  if (minGram < 1) {
    throw new IllegalArgumentException("minGram must be greater than zero");
  }
  if (minGram > maxGram) {
    throw new IllegalArgumentException("minGram must not be greater than maxGram");
  }
  this.minGram = minGram;
  this.maxGram = maxGram;

  // No-op attribute instances: the pre-4.4 filter did not adjust position
  // increments or lengths, so writes to these are silently discarded to
  // preserve that (broken) legacy behavior.
  posIncAtt = new PositionIncrementAttribute() {
    @Override
    public void setPositionIncrement(int positionIncrement) {}

    @Override
    public int getPositionIncrement() {
      return 0;
    }
  };
  posLenAtt = new PositionLengthAttribute() {
    @Override
    public void setPositionLength(int positionLength) {}

    @Override
    public int getPositionLength() {
      return 0;
    }
  };
}
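A minimal usage sketch for the filter above, assuming the Lucene 4.x analysis API (KeywordTokenizer taking a Reader); the printed grams follow the legacy pre-4.4 ordering:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.Lucene43NGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Sketch: run a short string through the legacy n-gram filter and print every
// gram. With the old behavior, grams come out grouped by size (1-grams first).
public class NGramDemo {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new Lucene43NGramTokenFilter(
        new KeywordTokenizer(new StringReader("abc")), 1, 2);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}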
/**
 * @deprecated For {@link Version#LUCENE_4_3_0} or below, use {@link Lucene43EdgeNGramTokenFilter}, otherwise use {@link #EdgeNGramTokenFilter(TokenStream, int, int)}
 */
@Deprecated
public EdgeNGramTokenFilter(Version version, TokenStream input, Side side, int minGram, int maxGram) {
  super(input);
  if (version.onOrAfter(Version.LUCENE_4_4) && side == Side.BACK) {
    throw new IllegalArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
  }
  if (side == null) {
    throw new IllegalArgumentException("sideLabel must be either front or back");
  }
  if (minGram < 1) {
    throw new IllegalArgumentException("minGram must be greater than zero");
  }
  if (minGram > maxGram) {
    throw new IllegalArgumentException("minGram must not be greater than maxGram");
  }
  this.version = version;
  this.charUtils = version.onOrAfter(Version.LUCENE_4_4)
      ? CharacterUtils.getInstance(version)
      : CharacterUtils.getJava4Instance();
  this.minGram = minGram;
  this.maxGram = maxGram;
  this.side = side;
}
/**
 * @deprecated Use {@link #CharArrayMap(int, boolean)}
 */
@Deprecated
@SuppressWarnings("unchecked")
public CharArrayMap(Version matchVersion, int startSize, boolean ignoreCase) {
  this.ignoreCase = ignoreCase;
  int size = INIT_SIZE;
  // Grow to the smallest power of two larger than startSize * 1.25
  // (computed as startSize + startSize/4), keeping the load factor below 0.8.
  while (startSize + (startSize >> 2) > size) {
    size <<= 1;
  }
  keys = new char[size][];
  values = (V[]) new Object[size];
  this.charUtils = CharacterUtils.getInstance(matchVersion);
  this.matchVersion = matchVersion;
}
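A standalone sketch of the table-sizing rule above; the TableSizeDemo class is illustrative only, and INIT_SIZE is assumed to be 8 as in Lucene's CharArrayMap:

// Sketch: the power-of-two table-sizing rule, extracted into a method.
public class TableSizeDemo {
  static final int INIT_SIZE = 8; // assumed starting capacity

  static int tableSize(int startSize) {
    int size = INIT_SIZE;
    while (startSize + (startSize >> 2) > size) {
      size <<= 1; // next power of two
    }
    return size;
  }

  public static void main(String[] args) {
    System.out.println(tableSize(5));   // 8   (5 + 1 = 6 fits in 8)
    System.out.println(tableSize(16));  // 32  (16 + 4 = 20 exceeds 16)
    System.out.println(tableSize(100)); // 128 (100 + 25 = 125 fits in 128)
  }
}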
/**
 * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range.
 *
 * @param version the <a href="#version">Lucene match version</a>
 * @param input {@link TokenStream} holding the input to be tokenized
 * @param side the {@link Side} from which to chop off an n-gram
 * @param minGram the smallest n-gram to generate
 * @param maxGram the largest n-gram to generate
 */
@Deprecated
public EdgeNGramTokenFilter(Version version, TokenStream input, Side side, int minGram, int maxGram) {
  super(input);
  if (version == null) {
    throw new IllegalArgumentException("version must not be null");
  }
  if (version.onOrAfter(Version.LUCENE_44) && side == Side.BACK) {
    throw new IllegalArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
  }
  if (side == null) {
    throw new IllegalArgumentException("sideLabel must be either front or back");
  }
  if (minGram < 1) {
    throw new IllegalArgumentException("minGram must be greater than zero");
  }
  if (minGram > maxGram) {
    throw new IllegalArgumentException("minGram must not be greater than maxGram");
  }
  this.version = version;
  this.charUtils = version.onOrAfter(Version.LUCENE_44)
      ? CharacterUtils.getInstance(version)
      : CharacterUtils.getJava4Instance();
  this.minGram = minGram;
  this.maxGram = maxGram;
  this.side = side;
}
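A minimal usage sketch for this deprecated constructor, assuming the Lucene 4.x API (WhitespaceTokenizer taking a Version and a Reader):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

// Sketch: chop front edge n-grams of sizes 1..3 off each token.
public class EdgeNGramDemo {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new EdgeNGramTokenFilter(Version.LUCENE_44,
        new WhitespaceTokenizer(Version.LUCENE_44, new StringReader("hello")),
        EdgeNGramTokenFilter.Side.FRONT, 1, 3);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // h, he, hel
    }
    ts.end();
    ts.close();
  }
}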
/**
 * @deprecated Use {@link #GreekLowerCaseFilter(TokenStream)}
 */
@Deprecated
public GreekLowerCaseFilter(Version matchVersion, TokenStream in) {
  super(in);
  this.charUtils = CharacterUtils.getInstance(matchVersion);
}
/**
 * @deprecated Use {@link #LowerCaseFilter(TokenStream)}
 */
@Deprecated
public LowerCaseFilter(Version matchVersion, TokenStream in) {
  super(in);
  charUtils = CharacterUtils.getInstance(matchVersion);
}
/**
 * @deprecated Use {@link #UpperCaseFilter(TokenStream)}
 */
@Deprecated
public UpperCaseFilter(Version matchVersion, TokenStream in) {
  super(in);
  charUtils = CharacterUtils.getInstance(matchVersion);
}
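The three deprecated case filters above are all used the same way; a minimal sketch with LowerCaseFilter, assuming the Lucene 4.x API:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

// Sketch: lowercase a whitespace-tokenized stream using the deprecated
// matchVersion constructor.
public class LowerCaseDemo {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new LowerCaseFilter(Version.LUCENE_44,
        new WhitespaceTokenizer(Version.LUCENE_44, new StringReader("Hello WORLD")));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // hello, world
    }
    ts.end();
    ts.close();
  }
}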
public TypeTokenizer(Reader input) {
  super(input);
  charUtils = CharacterUtils.getInstance();
}

public TypeTokenizer(AttributeSource source, Reader input) {
  super(source, input);
  charUtils = CharacterUtils.getInstance();
}

public TypeTokenizer(AttributeFactory factory, Reader input) {
  super(factory, input);
  charUtils = CharacterUtils.getInstance();
}
public WDSTokenizer(Reader in, boolean useSmart) {
  super(in); // the base Tokenizer needs the Reader
  offsetAtt = addAttribute(OffsetAttribute.class);
  termAtt = addAttribute(CharTermAttribute.class);
  charUtils = CharacterUtils.getInstance();
}
protected PinyinNGramTokenFilter(TokenStream input) {
  super(input);
  this.charUtils = CharacterUtils.getInstance();
  this.minGram = DEFAULT_MIN_GRAM_SIZE;
  this.maxGram = DEFAULT_MAX_GRAM_SIZE;
}
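Nearly every constructor in this section repeats the same minGram/maxGram validation. A small extracted helper making the invariant explicit; the GramRange class is hypothetical and not part of any of these filters:

// Hypothetical helper capturing the gram-range invariant repeated above.
public final class GramRange {
  public static void check(int minGram, int maxGram) {
    if (minGram < 1) {
      throw new IllegalArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram) {
      throw new IllegalArgumentException("minGram must not be greater than maxGram");
    }
  }

  public static void main(String[] args) {
    check(1, 3);   // ok
    try {
      check(4, 2); // invalid: min exceeds max
    } catch (IllegalArgumentException expected) {
      System.out.println(expected.getMessage());
    }
  }
}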