Java 类org.apache.lucene.analysis.util.CharArraySet 实例源码

项目:lams    文件:Stemmer.java   
/**
 * Stems the given word and returns the stems with duplicates removed.
 *
 * <p>A result with fewer than two stems is returned unchanged. Otherwise the stems
 * are filtered in their original order, keeping the first occurrence of each. The
 * set used for duplicate detection is created with the dictionary's
 * {@code ignoreCase} flag.
 *
 * @param word buffer holding the word to find the stems for
 * @param length length value passed on to {@code stem} together with {@code word}
 * @return list of stems for the word, without duplicates
 */
public List<CharsRef> uniqueStems(char word[], int length) {
  final List<CharsRef> candidates = stem(word, length);
  if (candidates.size() <= 1) {
    // Zero or one stem cannot contain a duplicate.
    return candidates;
  }
  final List<CharsRef> unique = new ArrayList<>();
  final CharArraySet seen = new CharArraySet(8, dictionary.ignoreCase);
  for (CharsRef candidate : candidates) {
    if (seen.contains(candidate)) {
      continue;
    }
    unique.add(candidate);
    seen.add(candidate);
  }
  return unique;
}
项目:lams    文件:CapitalizationFilter.java   
/**
 * Creates a CapitalizationFilter with the specified parameters.
 *
 * @param in input tokenstream
 * @param onlyFirstWord should each word be capitalized or all of the words?
 * @param keep a keep word list.  Each word that should be kept separated by whitespace.
 * @param forceFirstLetter force the first letter to be capitalized even if it is in the keep list.
 * @param okPrefix do not change word capitalization if a word begins with something in this list.
 * @param minWordLength how long the word needs to be to get capitalization applied.  If the
 *                      minWordLength is 3, "and" &gt; "And" but "or" stays "or". Must not be negative.
 * @param maxWordCount if the token contains more than maxWordCount words, the capitalization is
 *                     assumed to be correct. Must be at least 1.
 * @param maxTokenLength must be at least 1; NOTE(review): how this limit is applied is not
 *                       visible in this constructor, confirm against the filter logic.
 * @throws IllegalArgumentException if {@code minWordLength < 0}, {@code maxWordCount < 1}
 *                                  or {@code maxTokenLength < 1}
 */
public CapitalizationFilter(TokenStream in, boolean onlyFirstWord, CharArraySet keep, 
    boolean forceFirstLetter, Collection<char[]> okPrefix, int minWordLength, 
    int maxWordCount, int maxTokenLength) {
  super(in);

  // Validate the numeric limits up front, in the same order as they appear in the signature.
  if (minWordLength < 0) {
    throw new IllegalArgumentException("minWordLength must be greater than or equal to zero");
  }
  if (maxWordCount < 1) {
    throw new IllegalArgumentException("maxWordCount must be greater than zero");
  }
  if (maxTokenLength < 1) {
    throw new IllegalArgumentException("maxTokenLength must be greater than zero");
  }

  this.onlyFirstWord = onlyFirstWord;
  this.forceFirstLetter = forceFirstLetter;
  this.keep = keep;
  this.okPrefix = okPrefix;
  this.minWordLength = minWordLength;
  this.maxWordCount = maxWordCount;
  this.maxTokenLength = maxTokenLength;
}
项目:Elasticsearch    文件:PatternAnalyzerProvider.java   
/**
 * Creates the provider and builds its {@code PatternAnalyzer} from the analyzer settings.
 *
 * <p>Reads the {@code lowercase} (default {@code true}), {@code pattern} (default
 * {@code \W+}) and {@code flags} settings, plus the stop words resolved by
 * {@code Analysis.parseStopWords}. The default stop word set depends on the version
 * the index was created with.
 *
 * @throws IllegalArgumentException if no pattern could be resolved from the settings
 */
@Inject
public PatternAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    final Version createdVersion = Version.indexCreated(indexSettingsService.getSettings());
    // Indices created on or after 1.0.0.RC1 default to no stop words; older ones keep the English set.
    final CharArraySet fallbackStopWords = createdVersion.onOrAfter(Version.V_1_0_0_RC1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    final boolean lowercase = settings.getAsBoolean("lowercase", true);
    final CharArraySet stopWords = Analysis.parseStopWords(env, settings, fallbackStopWords);

    final String regex = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
    if (regex == null) {
        throw new IllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
    }
    final Pattern compiled = Regex.compile(regex, settings.get("flags"));

    analyzer = new PatternAnalyzer(compiled, lowercase, stopWords);
}
项目:lams    文件:Lucene43CompoundWordTokenFilterBase.java   
/**
 * Initializes the shared state of a compound-word filter: an empty token list, the
 * dictionary, the three size limits and the longest-match flag.
 *
 * @param input token stream passed to the superclass constructor
 * @param dictionary word set stored for later use by the filter
 * @param minWordSize stored word-size limit; must not be negative
 * @param minSubwordSize stored lower subword-size limit; must not be negative
 * @param maxSubwordSize stored upper subword-size limit; must not be negative
 * @param onlyLongestMatch flag stored as given
 * @throws IllegalArgumentException if any of the three size limits is negative
 */
protected Lucene43CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
  super(input);

  // Check the limits in declaration order so the first offending argument is reported.
  if (minWordSize < 0) {
    throw new IllegalArgumentException("minWordSize cannot be negative");
  }
  if (minSubwordSize < 0) {
    throw new IllegalArgumentException("minSubwordSize cannot be negative");
  }
  if (maxSubwordSize < 0) {
    throw new IllegalArgumentException("maxSubwordSize cannot be negative");
  }

  this.tokens = new LinkedList<>();
  this.dictionary = dictionary;
  this.minWordSize = minWordSize;
  this.minSubwordSize = minSubwordSize;
  this.maxSubwordSize = maxSubwordSize;
  this.onlyLongestMatch = onlyLongestMatch;
}
项目:Elasticsearch    文件:Analysis.java   
/**
 * Builds a word set from {@code words}, expanding entries that name a predefined set.
 *
 * <p>When {@code namedWords} is {@code null} the words are used as-is. Otherwise every
 * entry that is a key of {@code namedWords} is replaced by the members of the mapped
 * set, and every other entry is added literally.
 *
 * @param words configured words and/or names of predefined sets
 * @param namedWords predefined sets by name, or {@code null} to disable expansion
 * @param ignoreCase case flag passed to the created {@code CharArraySet}
 * @return the resolved word set
 */
private static CharArraySet resolveNamedWords(Collection<String> words, Map<String, Set<?>> namedWords, boolean ignoreCase) {
    if (namedWords == null) {
        return new CharArraySet(words, ignoreCase);
    }
    final CharArraySet resolved = new CharArraySet(words.size(), ignoreCase);
    for (final String entry : words) {
        if (!namedWords.containsKey(entry)) {
            // Not a known set name: treat the entry as a literal word.
            resolved.add(entry);
            continue;
        }
        resolved.addAll(namedWords.get(entry));
    }
    return resolved;
}
项目:lams    文件:FinnishAnalyzer.java   
/**
 * @deprecated Use {@link #FinnishAnalyzer(CharArraySet,CharArraySet)}
 */
@Deprecated
public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
  super(matchVersion, stopwords);
  this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
      matchVersion, stemExclusionSet));
}
项目:Elasticsearch    文件:EnglishAnalyzerProvider.java   
@Inject
public EnglishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new EnglishAnalyzer(Analysis.parseStopWords(env, settings, EnglishAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
项目:Elasticsearch    文件:SwedishAnalyzerProvider.java   
@Inject
public SwedishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new SwedishAnalyzer(Analysis.parseStopWords(env, settings, SwedishAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
项目:Elasticsearch    文件:HungarianAnalyzerProvider.java   
@Inject
public HungarianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new HungarianAnalyzer(Analysis.parseStopWords(env, settings, HungarianAnalyzer.getDefaultStopSet()),
                                     Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
项目:lams    文件:EnglishAnalyzer.java   
/**
 * @deprecated Use {@link #EnglishAnalyzer(CharArraySet,CharArraySet)}
 */
@Deprecated
public EnglishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
  super(matchVersion, stopwords);
  this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
      matchVersion, stemExclusionSet));
}
项目:lams    文件:LatvianAnalyzer.java   
/**
 * @deprecated Use {@link #LatvianAnalyzer(CharArraySet,CharArraySet)}
 */
@Deprecated
public LatvianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
  super(matchVersion, stopwords);
  this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
      matchVersion, stemExclusionSet));
}
项目:Elasticsearch    文件:CzechAnalyzerProvider.java   
@Inject
public CzechAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new CzechAnalyzer(Analysis.parseStopWords(env, settings, CzechAnalyzer.getDefaultStopSet()),
                                 Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
项目:Indra    文件:IndraAnalyzer.java   
/**
 * Wraps {@code stream} in a {@code StopFilter} when stop words are available.
 *
 * <p>Explicit {@code metadataStopWords} take precedence and are matched case-sensitively.
 * Otherwise the classpath resource {@code <lang>.stopwords} (language lower-cased) is
 * read line by line; everything from the first {@code |} on a line is ignored, the rest
 * is trimmed, and non-empty results become case-insensitive stop words. If the resource
 * is missing or cannot be read, the problem is logged and the stream is returned
 * unfiltered.
 *
 * @param lang language code used to build the resource name
 * @param metadataStopWords explicit stop words; may be {@code null} or empty
 * @param stream token stream to filter
 * @return the filtered stream, or {@code stream} itself when no stop words could be loaded
 */
private TokenStream getStopFilter(String lang, Set<String> metadataStopWords, TokenStream stream) {
    if (metadataStopWords != null && !metadataStopWords.isEmpty()) {
        return new StopFilter(stream, new CharArraySet(metadataStopWords, false));
    }

    try {
        // Locale.ROOT keeps the resource name independent of the JVM default locale
        // (e.g. "I" must not become a dotless i under a Turkish locale).
        String resource = lang.toLowerCase(java.util.Locale.ROOT) + ".stopwords";
        InputStream in = ClassLoader.getSystemResourceAsStream(resource);
        if (in == null) {
            logger.warn("No stop words found for lang={}", lang);
            return stream;
        }

        logger.debug("Loading Stop words for lang={}", lang);
        CharArraySet stopWords = new CharArraySet(30, true);
        // Decode explicitly instead of relying on the platform charset.
        // NOTE(review): assumes the bundled .stopwords resources are UTF-8 encoded.
        try (BufferedReader bin = new BufferedReader(
                new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = bin.readLine()) != null) {
                // Keep only the text before the first '|'. Using indexOf instead of
                // split(...)[0] avoids an ArrayIndexOutOfBoundsException on lines made
                // up solely of '|' characters, for which split returns an empty array.
                int bar = line.indexOf('|');
                String word = (bar >= 0 ? line.substring(0, bar) : line).trim();
                if (!word.isEmpty()) {
                    stopWords.add(word);
                }
            }
        }
        return new StopFilter(stream, stopWords);
    } catch (Exception e) {
        // Best effort: a broken stop word resource must not prevent analysis.
        logger.error("Error creating stop filter for lang={}", lang, e);
    }

    return stream;
}
项目:Elasticsearch    文件:RomanianAnalyzerProvider.java   
@Inject
public RomanianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new RomanianAnalyzer(Analysis.parseStopWords(env, settings, RomanianAnalyzer.getDefaultStopSet()),
                                    Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
项目:Elasticsearch    文件:BulgarianAnalyzerProvider.java   
@Inject
public BulgarianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new BulgarianAnalyzer(Analysis.parseStopWords(env, settings, BulgarianAnalyzer.getDefaultStopSet()),
                                     Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
项目:lams    文件:DutchAnalyzer.java   
/**
 * Creates a DutchAnalyzer for an explicit version with stop words, a stem-exclusion
 * table and a stem override dictionary.
 *
 * <p>The stop and exclusion sets are copied and stored as unmodifiable sets. When the
 * override dictionary is empty or the version is older than {@code LUCENE_3_1}, an
 * unmodifiable copy of the dictionary is kept and no compiled stem dictionary is built.
 * Otherwise every entry is fed to a {@code StemmerOverrideFilter.Builder} and the built
 * result is stored instead.
 *
 * @param matchVersion version to apply
 * @param stopwords stop word set
 * @param stemExclusionTable stem-exclusion set
 * @param stemOverrideDict stem override entries; must not be {@code null}
 * @throws RuntimeException if building the stem dictionary fails with an {@code IOException}
 * @deprecated Use {@link #DutchAnalyzer(CharArraySet,CharArraySet,CharArrayMap)}
 */
@Deprecated
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<String> stemOverrideDict) {
  setVersion(matchVersion);

  final CharArraySet stopCopy = CharArraySet.copy(matchVersion, stopwords);
  this.stoptable = CharArraySet.unmodifiableSet(stopCopy);
  final CharArraySet exclusionCopy = CharArraySet.copy(matchVersion, stemExclusionTable);
  this.excltable = CharArraySet.unmodifiableSet(exclusionCopy);

  final boolean keepRawOverrides =
      stemOverrideDict.isEmpty() || !matchVersion.onOrAfter(Version.LUCENE_3_1);
  if (keepRawOverrides) {
    this.stemdict = null;
    this.origStemdict = CharArrayMap.unmodifiableMap(CharArrayMap.copy(matchVersion, stemOverrideDict));
  } else {
    this.origStemdict = null;
    // we don't need to ignore case here since we lowercase in this analyzer anyway
    final StemmerOverrideFilter.Builder overrides = new StemmerOverrideFilter.Builder(false);
    final CharsRefBuilder scratch = new CharsRefBuilder();
    for (CharArrayMap<String>.EntryIterator entries = stemOverrideDict.entrySet().iterator();
         entries.hasNext(); ) {
      final char[] key = entries.nextKey();
      scratch.copyChars(key, 0, key.length);
      overrides.add(scratch.get(), entries.currentValue());
    }
    try {
      this.stemdict = overrides.build();
    } catch (IOException ex) {
      throw new RuntimeException("can not build stem dict", ex);
    }
  }
}
项目:Elasticsearch    文件:CjkAnalyzerProvider.java   
/**
 * Creates the provider and builds its {@code CJKAnalyzer} from the analyzer settings.
 *
 * <p>Stop words are resolved with {@code CJKAnalyzer.getDefaultStopSet()} as the default.
 */
@Inject
public CjkAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    final CharArraySet defaults = CJKAnalyzer.getDefaultStopSet();
    analyzer = new CJKAnalyzer(Analysis.parseStopWords(env, settings, defaults));
    analyzer.setVersion(version);
}
项目:lams    文件:TurkishAnalyzer.java   
/**
 * @deprecated Use {@link #TurkishAnalyzer(CharArraySet,CharArraySet)}
 */
@Deprecated
public TurkishAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
  super(matchVersion, stopwords);
  this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
      matchVersion, stemExclusionSet));
}
项目:Elasticsearch    文件:LatvianAnalyzerProvider.java   
@Inject
public LatvianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new LatvianAnalyzer(Analysis.parseStopWords(env, settings, LatvianAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
项目:lams    文件:GalicianAnalyzer.java   
/**
 * @deprecated Use {@link #GalicianAnalyzer(CharArraySet,CharArraySet)}
 */
@Deprecated
public GalicianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
  super(matchVersion, stopwords);
  this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
      matchVersion, stemExclusionSet));
}
项目:Elasticsearch    文件:PortugueseAnalyzerProvider.java   
@Inject
public PortugueseAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new PortugueseAnalyzer(Analysis.parseStopWords(env, settings, PortugueseAnalyzer.getDefaultStopSet()),
                                      Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
项目:Elasticsearch    文件:SoraniAnalyzerProvider.java   
@Inject
public SoraniAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new SoraniAnalyzer(Analysis.parseStopWords(env, settings, SoraniAnalyzer.getDefaultStopSet()),
                                  Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
项目:Elasticsearch    文件:RussianAnalyzerProvider.java   
@Inject
public RussianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new RussianAnalyzer(Analysis.parseStopWords(env, settings, RussianAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
项目:lams    文件:ItalianAnalyzer.java   
/**
 * @deprecated Use {@link #ItalianAnalyzer(CharArraySet,CharArraySet)}
 */
@Deprecated
public ItalianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) {
  super(matchVersion, stopwords);
  this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(
      matchVersion, stemExclusionSet));
}
项目:Elasticsearch    文件:HindiAnalyzerProvider.java   
@Inject
public HindiAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new HindiAnalyzer(Analysis.parseStopWords(env, settings, HindiAnalyzer.getDefaultStopSet()),
                                 Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
项目:Elasticsearch    文件:BasqueAnalyzerProvider.java   
@Inject
public BasqueAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new BasqueAnalyzer(Analysis.parseStopWords(env, settings, BasqueAnalyzer.getDefaultStopSet()),
                                  Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
项目:lams    文件:RSLPStemmerBase.java   
/**
 * Creates a rule whose exceptions are held in a case-sensitive {@code CharArraySet}.
 *
 * <p>Every exception must end with {@code suffix}; the first one that does not is
 * reported as a configuration error.
 *
 * @param suffix suffix passed to the superclass and used to validate the exceptions
 * @param min value passed to the superclass constructor
 * @param replacement replacement passed to the superclass constructor
 * @param exceptions words exempt from the rule; each must end with {@code suffix}
 * @throws RuntimeException if an exception word does not end with {@code suffix}
 */
public RuleWithSetExceptions(String suffix, int min, String replacement,
    String[] exceptions) {
  super(suffix, min, replacement);
  for (String candidate : exceptions) {
    if (candidate.endsWith(suffix)) {
      continue;
    }
    throw new RuntimeException("useless exception '" + candidate + "' does not end with '" + suffix + "'");
  }
  this.exceptions = new CharArraySet(Arrays.asList(exceptions), false);
}
项目:Elasticsearch    文件:DanishAnalyzerProvider.java   
@Inject
public DanishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new DanishAnalyzer(Analysis.parseStopWords(env, settings, DanishAnalyzer.getDefaultStopSet()),
                                  Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}
项目:lams    文件:CJKAnalyzer.java   
/**
 * Creates a CJKAnalyzer for an explicit version by forwarding both arguments to the
 * superclass constructor.
 *
 * @param matchVersion version forwarded to the superclass
 * @param stopwords stop word set forwarded to the superclass
 * @deprecated Use {@link #CJKAnalyzer(CharArraySet)}
 */
@Deprecated
public CJKAnalyzer(Version matchVersion, CharArraySet stopwords){
  super(matchVersion, stopwords);
}
项目:lams    文件:GreekAnalyzer.java   
/**
 * Creates a GreekAnalyzer for an explicit version by forwarding both arguments to the
 * superclass constructor.
 *
 * @param matchVersion version forwarded to the superclass
 * @param stopwords stop word set forwarded to the superclass
 * @deprecated Use {@link #GreekAnalyzer(CharArraySet)}
 */
@Deprecated
public GreekAnalyzer(Version matchVersion, CharArraySet stopwords) {
  super(matchVersion, stopwords);
}
项目:lams    文件:FinnishAnalyzer.java   
/**
 * Creates a FinnishAnalyzer for an explicit version with the given stop words and an
 * empty stem-exclusion set, by delegating to the three-argument constructor.
 *
 * @param matchVersion version forwarded to the three-argument constructor
 * @param stopwords stop word set forwarded to the three-argument constructor
 * @deprecated Use {@link #FinnishAnalyzer(CharArraySet)}
 */
@Deprecated
public FinnishAnalyzer(Version matchVersion, CharArraySet stopwords) {
  // CharArraySet.EMPTY_SET: no words are excluded from stemming
  this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
项目:lams    文件:UAX29URLEmailAnalyzer.java   
/**
 * Creates a UAX29URLEmailAnalyzer for an explicit version by forwarding both arguments
 * to the superclass constructor.
 *
 * @param matchVersion version forwarded to the superclass
 * @param stopWords stop word set forwarded to the superclass
 * @deprecated Use {@link #UAX29URLEmailAnalyzer(CharArraySet)}
 */
@Deprecated
public UAX29URLEmailAnalyzer(Version matchVersion, CharArraySet stopWords) {
  super(matchVersion, stopWords);
}
项目:lams    文件:StandardAnalyzer.java   
/**
 * Creates a StandardAnalyzer for an explicit version by forwarding both arguments to
 * the superclass constructor.
 *
 * @param matchVersion version forwarded to the superclass
 * @param stopWords stop word set forwarded to the superclass
 * @deprecated Use {@link #StandardAnalyzer(CharArraySet)}
 */
@Deprecated
public StandardAnalyzer(Version matchVersion, CharArraySet stopWords) {
  super(matchVersion, stopWords);
}
项目:lams    文件:DutchAnalyzer.java   
/**
 * Creates a DutchAnalyzer with the given stop words by delegating to another
 * constructor of this class, passing {@code Version.LATEST} as the version.
 *
 * @param stopwords stop word set forwarded to the delegated constructor
 */
public DutchAnalyzer(CharArraySet stopwords){
  this(Version.LATEST, stopwords);
}
项目:lams    文件:SpanishAnalyzer.java   
/**
 * Creates a SpanishAnalyzer for an explicit version with the given stop words and an
 * empty stem-exclusion set, by delegating to the three-argument constructor.
 *
 * @param matchVersion version forwarded to the three-argument constructor
 * @param stopwords stop word set forwarded to the three-argument constructor
 * @deprecated Use {@link #SpanishAnalyzer(CharArraySet)}
 */
@Deprecated
public SpanishAnalyzer(Version matchVersion, CharArraySet stopwords) {
  // CharArraySet.EMPTY_SET: no words are excluded from stemming
  this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
项目:lams    文件:WordDelimiterFilter.java   
/**
 * Creates a WordDelimiterFilter for an explicit version by delegating to the
 * five-argument constructor with {@code WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE}
 * as the third argument.
 *
 * @param matchVersion version forwarded to the delegated constructor
 * @param in token stream to filter
 * @param configurationFlags flags forwarded unchanged
 * @param protWords word set forwarded unchanged
 * @deprecated Use {@link #WordDelimiterFilter(TokenStream, int, CharArraySet)}
 */
@Deprecated
public WordDelimiterFilter(Version matchVersion, TokenStream in, int configurationFlags, CharArraySet protWords) {
  this(matchVersion, in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords);
}
项目:lams    文件:ThaiAnalyzer.java   
/**
 * Creates a ThaiAnalyzer for an explicit version by forwarding both arguments to the
 * superclass constructor.
 *
 * @param matchVersion version forwarded to the superclass
 * @param stopwords stop word set forwarded to the superclass
 * @deprecated Use {@link #ThaiAnalyzer(CharArraySet)}
 */
@Deprecated
public ThaiAnalyzer(Version matchVersion, CharArraySet stopwords) {
  super(matchVersion, stopwords);
}
项目:lams    文件:RussianAnalyzer.java   
/**
 * Creates a RussianAnalyzer for an explicit version with the given stop words and
 * stem-exclusion set.
 *
 * @param matchVersion version passed to the superclass and to {@code CharArraySet.copy}
 * @param stopwords stop word set passed to the superclass constructor
 * @param stemExclusionSet set that is copied and stored as an unmodifiable set
 * @deprecated Use {@link #RussianAnalyzer(CharArraySet,CharArraySet)}
 */
@Deprecated
public RussianAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet){
  super(matchVersion, stopwords);
  this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
}
项目:Elasticsearch    文件:Analysis.java   
/**
 * Resolves the {@code articles} word list from the settings.
 *
 * <p>No default words and no named word sets are supplied. The case flag passed to
 * {@code parseWords} comes from the {@code articles_case} setting (default {@code false}).
 *
 * @param env environment forwarded to {@code parseWords}
 * @param settings settings to read {@code articles} and {@code articles_case} from
 * @return whatever {@code parseWords} returns for the {@code articles} setting
 */
public static CharArraySet parseArticles(Environment env, Settings settings) {
    final boolean articlesCase = settings.getAsBoolean("articles_case", false);
    return parseWords(env, settings, "articles", null, null, articlesCase);
}
项目:Elasticsearch    文件:Analysis.java   
/**
 * Resolves the {@code stopwords} word list from the settings by delegating to
 * {@code parseWords} with the {@code namedStopWords} sets.
 *
 * @param env environment forwarded to {@code parseWords}
 * @param settings settings to read the {@code stopwords} entry from
 * @param defaultStopWords default set forwarded to {@code parseWords}
 * @param ignoreCase case flag forwarded to {@code parseWords}
 * @return whatever {@code parseWords} returns for the {@code stopwords} setting
 */
public static CharArraySet parseStopWords(Environment env, Settings settings, CharArraySet defaultStopWords, boolean ignoreCase) {
    return parseWords(env, settings, "stopwords", defaultStopWords, namedStopWords, ignoreCase);
}