/**
 * Find the unique stem(s) of the provided word.
 *
 * <p>Delegates to {@code stem(word, length)}; when two or more stems come back, repeats are
 * dropped while first-seen order is kept. Membership is tracked in a {@code CharArraySet}
 * created with the dictionary's {@code ignoreCase} flag.
 *
 * @param word Word to find the stems for
 * @param length forwarded unchanged to {@code stem(char[], int)} together with {@code word}
 * @return List of stems for the word, without duplicates
 */
public List<CharsRef> uniqueStems(char word[], int length) {
  final List<CharsRef> candidates = stem(word, length);
  // Zero or one stem cannot contain a duplicate, so the list is returned untouched.
  if (candidates.size() <= 1) {
    return candidates;
  }

  final CharArraySet seen = new CharArraySet(8, dictionary.ignoreCase);
  final List<CharsRef> unique = new ArrayList<>();
  for (CharsRef candidate : candidates) {
    if (seen.contains(candidate)) {
      continue;
    }
    unique.add(candidate);
    seen.add(candidate);
  }
  return unique;
}
/**
 * Creates a CapitalizationFilter with the specified parameters.
 *
 * @param in input tokenstream
 * @param onlyFirstWord should each word be capitalized or all of the words?
 * @param keep a keep word list. Each word that should be kept separated by whitespace.
 * @param forceFirstLetter Force the first letter to be capitalized even if it is in the keep list.
 * @param okPrefix do not change word capitalization if a word begins with something in this list.
 * @param minWordLength how long the word needs to be to get capitalization applied. If the
 *                      minWordLength is 3, "and" &gt; "And" but "or" stays "or". Must not be
 *                      negative.
 * @param maxWordCount if the token contains more than maxWordCount words, the capitalization is
 *                     assumed to be correct. Must be at least 1.
 * @param maxTokenLength must be at least 1. NOTE(review): how this limit is applied is not
 *                       visible from this constructor; confirm against the filter logic.
 * @throws IllegalArgumentException if {@code minWordLength} is negative, or if
 *         {@code maxWordCount} or {@code maxTokenLength} is smaller than 1
 */
public CapitalizationFilter(TokenStream in, boolean onlyFirstWord, CharArraySet keep,
    boolean forceFirstLetter, Collection<char[]> okPrefix, int minWordLength,
    int maxWordCount, int maxTokenLength) {
  super(in);

  // Reject out-of-range limits up front; the checks run in the same order as before.
  if (minWordLength < 0) {
    throw new IllegalArgumentException("minWordLength must be greater than or equal to zero");
  }
  if (maxWordCount < 1) {
    throw new IllegalArgumentException("maxWordCount must be greater than zero");
  }
  if (maxTokenLength < 1) {
    throw new IllegalArgumentException("maxTokenLength must be greater than zero");
  }

  this.onlyFirstWord = onlyFirstWord;
  this.keep = keep;
  this.forceFirstLetter = forceFirstLetter;
  this.okPrefix = okPrefix;
  this.minWordLength = minWordLength;
  this.maxWordCount = maxWordCount;
  this.maxTokenLength = maxTokenLength;
}
/**
 * Creates the provider and builds its {@link PatternAnalyzer} from the analyzer settings.
 *
 * <p>Settings read here: {@code lowercase} (default {@code true}), {@code pattern}
 * (default {@code \W+}), {@code flags}, plus whatever {@code Analysis.parseStopWords} consumes.
 * The default stop word set depends on the version the index was created with: empty for
 * indices created on or after 1.0.0.RC1, the English stop set for older ones.
 *
 * @throws IllegalArgumentException if the resolved pattern is {@code null}
 */
@Inject
public PatternAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env,
    @Assisted String name, @Assisted Settings settings) {
  super(index, indexSettingsService.getSettings(), name, settings);

  Version createdVersion = Version.indexCreated(indexSettingsService.getSettings());
  final CharArraySet fallbackStopWords = createdVersion.onOrAfter(Version.V_1_0_0_RC1)
      ? CharArraySet.EMPTY_SET
      : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

  boolean toLowerCase = settings.getAsBoolean("lowercase", true);
  CharArraySet stopWords = Analysis.parseStopWords(env, settings, fallbackStopWords);

  String regex = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
  if (regex == null) {
    throw new IllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
  }

  Pattern compiled = Regex.compile(regex, settings.get("flags"));
  analyzer = new PatternAnalyzer(compiled, toLowerCase, stopWords);
}
/**
 * Initializes the shared state of a compound word token filter.
 *
 * @param input token stream handed to the superclass
 * @param dictionary dictionary stored on this filter
 * @param minWordSize stored as-is; must not be negative
 * @param minSubwordSize stored as-is; must not be negative
 * @param maxSubwordSize stored as-is; must not be negative
 * @param onlyLongestMatch stored as-is
 * @throws IllegalArgumentException if any of the three size arguments is negative
 */
protected Lucene43CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary,
    int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
  super(input);

  // Validate all sizes first, in the same order the original checked them.
  if (minWordSize < 0) {
    throw new IllegalArgumentException("minWordSize cannot be negative");
  }
  if (minSubwordSize < 0) {
    throw new IllegalArgumentException("minSubwordSize cannot be negative");
  }
  if (maxSubwordSize < 0) {
    throw new IllegalArgumentException("maxSubwordSize cannot be negative");
  }

  this.tokens = new LinkedList<>();
  this.minWordSize = minWordSize;
  this.minSubwordSize = minSubwordSize;
  this.maxSubwordSize = maxSubwordSize;
  this.onlyLongestMatch = onlyLongestMatch;
  this.dictionary = dictionary;
}
/**
 * Builds a word set from {@code words}, expanding every entry that is a key of
 * {@code namedWords} into the set registered under that key.
 *
 * @param words entries to resolve
 * @param namedWords named sets keyed by name; when {@code null}, {@code words} is used verbatim
 * @param ignoreCase case-insensitivity flag passed to the created {@code CharArraySet}
 * @return the resolved word set
 */
private static CharArraySet resolveNamedWords(Collection<String> words, Map<String, Set<?>> namedWords, boolean ignoreCase) {
  if (namedWords == null) {
    return new CharArraySet(words, ignoreCase);
  }

  final CharArraySet resolved = new CharArraySet(words.size(), ignoreCase);
  for (final String entry : words) {
    if (!namedWords.containsKey(entry)) {
      // Not a known name: keep the entry itself as a literal word.
      resolved.add(entry);
      continue;
    }
    resolved.addAll(namedWords.get(entry));
  }
  return resolved;
}
/** * @deprecated Use {@link #FinnishAnalyzer(CharArraySet,CharArraySet)} */ @Deprecated public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); }
@Inject public EnglishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new EnglishAnalyzer(Analysis.parseStopWords(env, settings, EnglishAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
@Inject public SwedishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new SwedishAnalyzer(Analysis.parseStopWords(env, settings, SwedishAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
@Inject public HungarianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new HungarianAnalyzer(Analysis.parseStopWords(env, settings, HungarianAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
/** * @deprecated Use {@link #EnglishAnalyzer(CharArraySet,CharArraySet)} */ @Deprecated public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); }
/** * @deprecated Use {@link #LatvianAnalyzer(CharArraySet,CharArraySet)} */ @Deprecated public LatvianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); }
@Inject public CzechAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new CzechAnalyzer(Analysis.parseStopWords(env, settings, CzechAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
/**
 * Wraps {@code stream} in a {@link StopFilter} when stop words are available for the language.
 *
 * <p>Stop words supplied through {@code metadataStopWords} take precedence and are matched
 * case-sensitively. Otherwise a resource named {@code <lang>.stopwords} (language code
 * lower-cased) is looked up through the system class loader. Each line contributes the text
 * before its first {@code |}, trimmed; empty entries are skipped. Words loaded from the
 * resource are matched case-insensitively.
 *
 * <p>This is best effort: when the resource is missing or cannot be read, the problem is
 * logged and the original stream is returned unfiltered.
 *
 * @param lang language code used to build the resource name
 * @param metadataStopWords explicit stop words; may be {@code null} or empty
 * @param stream the stream to filter
 * @return a {@link StopFilter} around {@code stream}, or {@code stream} itself when no stop
 *         words could be obtained
 */
private TokenStream getStopFilter(String lang, Set<String> metadataStopWords, TokenStream stream) {
  if (metadataStopWords != null && !metadataStopWords.isEmpty()) {
    return new StopFilter(stream, new CharArraySet(metadataStopWords, false));
  }

  try {
    // Locale.ROOT keeps the resource name independent of the JVM's default locale
    // (for example, "I" must not become a dotless i under a Turkish default locale).
    String resourceName = lang.toLowerCase(java.util.Locale.ROOT) + ".stopwords";
    InputStream in = ClassLoader.getSystemResourceAsStream(resourceName);
    if (in == null) {
      logger.warn("No stop words found for lang={}", lang);
      return stream;
    }

    logger.debug("Loading Stop words for lang={}", lang);
    CharArraySet stopWords = new CharArraySet(30, true);
    // Decode explicitly as UTF-8 rather than with the platform default charset, so the
    // bundled word lists read the same on every machine.
    try (BufferedReader bin = new BufferedReader(
        new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8))) {
      String line;
      while ((line = bin.readLine()) != null) {
        // Keep only the text before the first '|'. Using indexOf instead of split() also
        // copes with a line that is nothing but "|", for which split() yields an empty array.
        int bar = line.indexOf('|');
        String word = (bar < 0 ? line : line.substring(0, bar)).trim();
        if (word.length() > 0) {
          stopWords.add(word);
        }
      }
      return new StopFilter(stream, stopWords);
    }
  } catch (Exception e) {
    // Deliberately broad: a failure to load stop words must not break analysis.
    logger.error("Error creating stop filter for lang={}", lang, e);
  }
  return stream;
}
@Inject public RomanianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new RomanianAnalyzer(Analysis.parseStopWords(env, settings, RomanianAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
@Inject public BulgarianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new BulgarianAnalyzer(Analysis.parseStopWords(env, settings, BulgarianAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
/** * @deprecated Use {@link #DutchAnalyzer(CharArraySet,CharArraySet,CharArrayMap)} */ @Deprecated public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<String> stemOverrideDict) { setVersion(matchVersion); this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords)); this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable)); if (stemOverrideDict.isEmpty() || !matchVersion.onOrAfter(Version.LUCENE_3_1)) { this.stemdict = null; this.origStemdict = CharArrayMap.unmodifiableMap(CharArrayMap.copy(matchVersion, stemOverrideDict)); } else { this.origStemdict = null; // we don't need to ignore case here since we lowercase in this analyzer anyway StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false); CharArrayMap<String>.EntryIterator iter = stemOverrideDict.entrySet().iterator(); CharsRefBuilder spare = new CharsRefBuilder(); while (iter.hasNext()) { char[] nextKey = iter.nextKey(); spare.copyChars(nextKey, 0, nextKey.length); builder.add(spare.get(), iter.currentValue()); } try { this.stemdict = builder.build(); } catch (IOException ex) { throw new RuntimeException("can not build stem dict", ex); } } }
/**
 * Creates the provider and builds its {@link CJKAnalyzer}.
 *
 * <p>Stop words come from {@code Analysis.parseStopWords} with the analyzer's default stop set
 * as the default. The provider's {@code version} is then set on the analyzer.
 */
@Inject
public CjkAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env,
    @Assisted String name, @Assisted Settings settings) {
  super(index, indexSettingsService.getSettings(), name, settings);
  analyzer = new CJKAnalyzer(Analysis.parseStopWords(env, settings, CJKAnalyzer.getDefaultStopSet()));
  analyzer.setVersion(version);
}
/** * @deprecated Use {@link #TurkishAnalyzer(CharArraySet,CharArraySet)} */ @Deprecated public TurkishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); }
@Inject public LatvianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new LatvianAnalyzer(Analysis.parseStopWords(env, settings, LatvianAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
/** * @deprecated Use {@link #GalicianAnalyzer(CharArraySet,CharArraySet)} */ @Deprecated public GalicianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); }
@Inject public PortugueseAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new PortugueseAnalyzer(Analysis.parseStopWords(env, settings, PortugueseAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
@Inject public SoraniAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new SoraniAnalyzer(Analysis.parseStopWords(env, settings, SoraniAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
@Inject public RussianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new RussianAnalyzer(Analysis.parseStopWords(env, settings, RussianAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
/** * @deprecated Use {@link #ItalianAnalyzer(CharArraySet,CharArraySet)} */ @Deprecated public ItalianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) { super(matchVersion, stopwords); this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy( matchVersion, stemExclusionSet)); }
@Inject public HindiAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new HindiAnalyzer(Analysis.parseStopWords(env, settings, HindiAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
@Inject public BasqueAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new BasqueAnalyzer(Analysis.parseStopWords(env, settings, BasqueAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
/**
 * Creates a rule whose exceptions are kept in a case-sensitive {@code CharArraySet}.
 *
 * <p>Every exception must end with {@code suffix}; the first one that does not is reported
 * as an error.
 *
 * @param suffix passed to the superclass; also the ending every exception must have
 * @param min passed to the superclass
 * @param replacement passed to the superclass
 * @param exceptions words stored as this rule's exception set
 * @throws RuntimeException if an exception does not end with {@code suffix}
 */
public RuleWithSetExceptions(String suffix, int min, String replacement, String[] exceptions) {
  super(suffix, min, replacement);
  for (String candidate : exceptions) {
    if (candidate.endsWith(suffix)) {
      continue;
    }
    throw new RuntimeException("useless exception '" + candidate + "' does not end with '" + suffix + "'");
  }
  this.exceptions = new CharArraySet(Arrays.asList(exceptions), false);
}
@Inject public DanishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new DanishAnalyzer(Analysis.parseStopWords(env, settings, DanishAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
/**
 * Creates the analyzer by passing {@code matchVersion} and {@code stopwords} straight to the
 * superclass constructor.
 *
 * @param matchVersion version handed to the superclass
 * @param stopwords stop word set handed to the superclass
 * @deprecated Use {@link #CJKAnalyzer(CharArraySet)}
 */
@Deprecated
public CJKAnalyzer(Version matchVersion, CharArraySet stopwords) {
  super(matchVersion, stopwords);
}
/**
 * Creates the analyzer by passing {@code matchVersion} and {@code stopwords} straight to the
 * superclass constructor.
 *
 * @param matchVersion version handed to the superclass
 * @param stopwords stop word set handed to the superclass
 * @deprecated Use {@link #GreekAnalyzer(CharArraySet)}
 */
@Deprecated
public GreekAnalyzer(Version matchVersion, CharArraySet stopwords) {
  super(matchVersion, stopwords);
}
/**
 * Creates the analyzer with the given stop words and an empty stem exclusion set, by
 * delegating to the three-argument constructor.
 *
 * @param matchVersion version forwarded to the three-argument constructor
 * @param stopwords stop word set forwarded to the three-argument constructor
 * @deprecated Use {@link #FinnishAnalyzer(CharArraySet)}
 */
@Deprecated
public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords) {
  this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
/**
 * Creates the analyzer by passing {@code matchVersion} and {@code stopWords} straight to the
 * superclass constructor.
 *
 * @param matchVersion version handed to the superclass
 * @param stopWords stop word set handed to the superclass
 * @deprecated Use {@link #UAX29URLEmailAnalyzer(CharArraySet)}
 */
@Deprecated
public UAX29URLEmailAnalyzer(Version matchVersion, CharArraySet stopWords) {
  super(matchVersion, stopWords);
}
/**
 * Creates the analyzer by passing {@code matchVersion} and {@code stopWords} straight to the
 * superclass constructor.
 *
 * @param matchVersion version handed to the superclass
 * @param stopWords stop word set handed to the superclass
 * @deprecated Use {@link #StandardAnalyzer(CharArraySet)}
 */
@Deprecated
public StandardAnalyzer(Version matchVersion, CharArraySet stopWords) {
  super(matchVersion, stopWords);
}
/**
 * Creates the analyzer with the given stop words, delegating to the
 * {@code (Version, CharArraySet)} constructor with {@code Version.LATEST}.
 *
 * @param stopwords stop word set forwarded to the delegate constructor
 */
public DutchAnalyzer(CharArraySet stopwords) {
  this(Version.LATEST, stopwords);
}
/**
 * Creates the analyzer with the given stop words and an empty stem exclusion set, by
 * delegating to the three-argument constructor.
 *
 * @param matchVersion version forwarded to the three-argument constructor
 * @param stopwords stop word set forwarded to the three-argument constructor
 * @deprecated Use {@link #SpanishAnalyzer(CharArraySet)}
 */
@Deprecated
public SpanishAnalyzer(Version matchVersion, CharArraySet stopwords) {
  this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
/**
 * Creates the filter using {@code WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE} as the
 * delimiter table, delegating to the constructor that takes an explicit table.
 *
 * @param matchVersion version forwarded to the delegate constructor
 * @param in token stream forwarded to the delegate constructor
 * @param configurationFlags flags forwarded to the delegate constructor
 * @param protWords word set forwarded to the delegate constructor
 * @deprecated Use {@link #WordDelimiterFilter(TokenStream, int, CharArraySet)}
 */
@Deprecated
public WordDelimiterFilter(Version matchVersion, TokenStream in, int configurationFlags,
    CharArraySet protWords) {
  this(matchVersion, in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
}
/**
 * Creates the analyzer by passing {@code matchVersion} and {@code stopwords} straight to the
 * superclass constructor.
 *
 * @param matchVersion version handed to the superclass
 * @param stopwords stop word set handed to the superclass
 * @deprecated Use {@link #ThaiAnalyzer(CharArraySet)}
 */
@Deprecated
public ThaiAnalyzer(Version matchVersion, CharArraySet stopwords) {
  super(matchVersion, stopwords);
}
/**
 * Creates the analyzer with the given stop words and stem exclusion set. The exclusion set is
 * copied and the copy is stored as an unmodifiable set.
 *
 * @param matchVersion version handed to the superclass and used when copying the exclusion set
 * @param stopwords stop word set handed to the superclass
 * @param stemExclusionSet set that is copied into this analyzer's stem exclusion set
 * @deprecated Use {@link #RussianAnalyzer(CharArraySet,CharArraySet)}
 */
@Deprecated
public RussianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
  super(matchVersion, stopwords);
  this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
}
/**
 * Parses the {@code articles} word list from the settings via {@code parseWords}, with no
 * default words and no named word sets. Case-insensitivity is taken from the
 * {@code articles_case} setting (default {@code false}).
 *
 * @param env environment forwarded to {@code parseWords}
 * @param settings settings to read from
 * @return whatever {@code parseWords} returns for the {@code articles} key
 */
public static CharArraySet parseArticles(Environment env, Settings settings) {
  final boolean ignoreCase = settings.getAsBoolean("articles_case", false);
  return parseWords(env, settings, "articles", null, null, ignoreCase);
}
/**
 * Parses the {@code stopwords} word list from the settings via {@code parseWords}, using
 * {@code namedStopWords} as the named word sets.
 *
 * @param env environment forwarded to {@code parseWords}
 * @param settings settings to read from
 * @param defaultStopWords default word set forwarded to {@code parseWords}
 * @param ignoreCase case-insensitivity flag forwarded to {@code parseWords}
 * @return whatever {@code parseWords} returns for the {@code stopwords} key
 */
public static CharArraySet parseStopWords(Environment env, Settings settings,
    CharArraySet defaultStopWords, boolean ignoreCase) {
  return parseWords(env, settings, "stopwords", defaultStopWords, namedStopWords, ignoreCase);
}