Java 类org.apache.lucene.analysis.core.StopAnalyzer 实例源码

项目:Elasticsearch    文件:StandardAnalyzerProvider.java   
public StandardAnalyzerProvider(Index index, Settings indexSettings, Environment env, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.esVersion = Version.indexCreated(indexSettings);
    // Indices created on 1.0.0.Beta1 or later default to no stop words; older indices keep
    // the historical English stop-word default for backwards compatibility.
    final CharArraySet fallbackStopwords = esVersion.onOrAfter(Version.V_1_0_0_Beta1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    // Build the analyzer from the configured stop words and max token length.
    CharArraySet configuredStopwords = Analysis.parseStopWords(env, settings, fallbackStopwords);
    standardAnalyzer = new StandardAnalyzer(configuredStopwords);
    standardAnalyzer.setVersion(version);
    standardAnalyzer.setMaxTokenLength(
            settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH));
}
项目:Elasticsearch    文件:PatternAnalyzerProvider.java   
@Inject
public PatternAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    // Indices created on 1.0.0.RC1 or later default to no stop words; older indices keep
    // the historical English stop-word default for backwards compatibility.
    Version esVersion = Version.indexCreated(indexSettingsService.getSettings());
    final CharArraySet defaultStopwords = esVersion.onOrAfter(Version.V_1_0_0_RC1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    boolean lowercase = settings.getAsBoolean("lowercase", true);
    CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords);

    // A non-null default ("\\W+") is always supplied here, so the former
    // "must have a `pattern` set" null check was unreachable and has been removed.
    String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
    Pattern pattern = Regex.compile(sPattern, settings.get("flags"));

    analyzer = new PatternAnalyzer(pattern, lowercase, stopWords);
}
项目:search    文件:PatternAnalyzerTest.java   
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  Analyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

  // dodge jre bug http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=7104012
  final UncaughtExceptionHandler savedHandler = Thread.getDefaultUncaughtExceptionHandler();
  Thread.setDefaultUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() {
    @Override
    public void uncaughtException(Thread thread, Throwable throwable) {
      assumeTrue("not failing due to jre bug ", !isJREBug7104012(throwable));
      // otherwise its some other bug, pass to default handler
      savedHandler.uncaughtException(thread, throwable);
    }
  });

  try {
    // (removed a stray no-op Thread.getDefaultUncaughtExceptionHandler() call
    // whose result was discarded)
    checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
  } catch (ArrayIndexOutOfBoundsException ex) {
    assumeTrue("not failing due to jre bug ", !isJREBug7104012(ex));
    throw ex; // otherwise rethrow
  } finally {
    // always restore the original handler, even when the assumption trips
    Thread.setDefaultUncaughtExceptionHandler(savedHandler);
  }
}
项目:NYBC    文件:StopFilterFactory.java   
/** Resolves the stop-word set: from the configured files when present, otherwise the default English set. */
@Override
public void inform(ResourceLoader loader) throws IOException {
  String stopWordFiles = args.get("words");
  ignoreCase = getBoolean("ignoreCase", false);
  enablePositionIncrements = getBoolean("enablePositionIncrements", false);

  if (stopWordFiles == null) {
    // No explicit word files: copy the built-in English stop words, honouring ignoreCase.
    stopWords = new CharArraySet(luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
    return;
  }
  // "snowball"-format files need the snowball parser; everything else is a plain word list.
  stopWords = "snowball".equalsIgnoreCase(args.get("format"))
      ? getSnowballWordSet(loader, stopWordFiles, ignoreCase)
      : getWordSet(loader, stopWordFiles, ignoreCase);
}
项目:NYBC    文件:PatternAnalyzerTest.java   
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  Analyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

  // dodge jre bug http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=7104012
  final UncaughtExceptionHandler savedHandler = Thread.getDefaultUncaughtExceptionHandler();
  Thread.setDefaultUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() {
    @Override
    public void uncaughtException(Thread thread, Throwable throwable) {
      assumeTrue("not failing due to jre bug ", !isJREBug7104012(throwable));
      // otherwise its some other bug, pass to default handler
      savedHandler.uncaughtException(thread, throwable);
    }
  });

  try {
    // (removed a stray no-op Thread.getDefaultUncaughtExceptionHandler() call
    // whose result was discarded)
    checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
  } catch (ArrayIndexOutOfBoundsException ex) {
    assumeTrue("not failing due to jre bug ", !isJREBug7104012(ex));
    throw ex; // otherwise rethrow
  } finally {
    // always restore the original handler, even when the assumption trips
    Thread.setDefaultUncaughtExceptionHandler(savedHandler);
  }
}
项目:information-retrieval-adventure    文件:SkippingNumbersPreservingChemicals.java   
public static void main(String[] args) throws IOException {

    String sentence =
        "this is the scientific article about chemicals like H20 C2H50H with concentration "
            + "of 3.99 kilograms and 0,123 micrograms also i have some CO2 gas n=3 x=45";
    // Whitespace-tokenize, drop English stop words, then apply the scientific filter.
    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(sentence));
    TokenStream stream = new ScientificFiltering(
        new StopFilter(tokenizer, StopAnalyzer.ENGLISH_STOP_WORDS_SET));

    final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    // Print each surviving token on its own line.
    while (stream.incrementToken()) {
      System.out.println(term.toString());
    }

    stream.end();
    stream.close();
}
项目:Maskana-Gestor-de-Conocimiento    文件:PatternAnalyzerTest.java   
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  Analyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

  // dodge jre bug http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=7104012
  final UncaughtExceptionHandler savedHandler = Thread.getDefaultUncaughtExceptionHandler();
  Thread.setDefaultUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() {
    @Override
    public void uncaughtException(Thread thread, Throwable throwable) {
      assumeTrue("not failing due to jre bug ", !isJREBug7104012(throwable));
      // otherwise its some other bug, pass to default handler
      savedHandler.uncaughtException(thread, throwable);
    }
  });

  try {
    // (removed a stray no-op Thread.getDefaultUncaughtExceptionHandler() call
    // whose result was discarded)
    checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
  } catch (ArrayIndexOutOfBoundsException ex) {
    assumeTrue("not failing due to jre bug ", !isJREBug7104012(ex));
    throw ex; // otherwise rethrow
  } finally {
    // always restore the original handler, even when the assumption trips
    Thread.setDefaultUncaughtExceptionHandler(savedHandler);
  }
}
项目:elasticsearch_my    文件:StopAnalyzerProvider.java   
public StopAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    // Resolve configured stop words, defaulting to Lucene's English stop-word set.
    CharArraySet words = Analysis.parseStopWords(
        env, indexSettings.getIndexVersionCreated(), settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    this.stopAnalyzer = new StopAnalyzer(words);
    this.stopAnalyzer.setVersion(version);
}
项目:elasticsearch_my    文件:StopTokenFilterFactory.java   
// Builds a "stop" token filter factory from index settings: reads ignore_case and
// remove_trailing (leniently for pre-ES6 indices), and the stop-word list, defaulting
// to Lucene's English stop words when none is configured.
public StopTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.ignoreCase =
        settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), "ignore_case", false, deprecationLogger);
    this.removeTrailing = settings.getAsBooleanLenientForPreEs6Indices(
        indexSettings.getIndexVersionCreated(), "remove_trailing", true, deprecationLogger);
    // Fall back to the English stop-word set when the settings do not define a list.
    this.stopWords = Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
    // The setting was removed entirely; fail fast rather than silently ignoring it.
    if (settings.get("enable_position_increments") != null) {
        throw new IllegalArgumentException("enable_position_increments is not supported anymore. Please fix your analysis chain");
    }
}
项目:elasticsearch_my    文件:PatternAnalyzerTests.java   
/**
 * Test PatternAnalyzer when it is configured with a non-word pattern.
 */
public void testNonWordPattern() throws IOException {
  // Split on non-letters, keep original case, no stop words.
  PatternAnalyzer keepCase = new PatternAnalyzer(Pattern.compile("\\W+"), false, null);
  assertAnalyzesTo(keepCase, "The quick brown Fox,the abcd1234 (56.78) dc.",
      new String[] { "The", "quick", "brown", "Fox", "the", "abcd1234", "56", "78", "dc" });

  // Split on non-letters, lower-case, drop English stop words.
  PatternAnalyzer lowered =
      new PatternAnalyzer(Pattern.compile("\\W+"), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  assertAnalyzesTo(lowered, "The quick brown Fox,the abcd1234 (56.78) dc.",
      new String[] { "quick", "brown", "fox", "abcd1234", "56", "78", "dc" });
}
项目:elasticsearch_my    文件:PatternAnalyzerTests.java   
/**
 * Test PatternAnalyzer when it is configured with a whitespace pattern.
 * Behavior can be similar to WhitespaceAnalyzer (depending upon options)
 */
public void testWhitespacePattern() throws IOException {
  // Split on whitespace, keep original case, no stop words.
  PatternAnalyzer keepCase = new PatternAnalyzer(Pattern.compile("\\s+"), false, null);
  assertAnalyzesTo(keepCase, "The quick brown Fox,the abcd1234 (56.78) dc.",
      new String[] { "The", "quick", "brown", "Fox,the", "abcd1234", "(56.78)", "dc." });

  // Split on whitespace, lower-case, drop English stop words.
  PatternAnalyzer lowered =
      new PatternAnalyzer(Pattern.compile("\\s+"), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  assertAnalyzesTo(lowered, "The quick brown Fox,the abcd1234 (56.78) dc.",
      new String[] { "quick", "brown", "fox,the", "abcd1234", "(56.78)", "dc." });
}
项目:elasticsearch_my    文件:PatternAnalyzerTests.java   
/**
 * Test PatternAnalyzer when it is configured with a custom pattern. In this
 * case, text is tokenized on the comma ","
 */
public void testCustomPattern() throws IOException {
  // Split on comma, keep original case, no stop words.
  PatternAnalyzer keepCase = new PatternAnalyzer(Pattern.compile(","), false, null);
  assertAnalyzesTo(keepCase, "Here,Are,some,Comma,separated,words,",
      new String[] { "Here", "Are", "some", "Comma", "separated", "words" });

  // Split on comma, lower-case, drop English stop words ("are" is removed).
  PatternAnalyzer lowered =
      new PatternAnalyzer(Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  assertAnalyzesTo(lowered, "Here,Are,some,Comma,separated,words,",
      new String[] { "here", "some", "comma", "separated", "words" });
}
项目:lams    文件:CommonGramsFilterFactory.java   
/** Resolves the common-words set: from the configured files if any, otherwise the default English stop words. */
@Override
public void inform(ResourceLoader loader) throws IOException {
  if (commonWordFiles == null) {
    // Nothing configured: fall back to the built-in English stop words.
    commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    return;
  }
  // "snowball"-format files need the snowball parser; everything else is a plain word list.
  commonWords = "snowball".equalsIgnoreCase(format)
      ? getSnowballWordSet(loader, commonWordFiles, ignoreCase)
      : getWordSet(loader, commonWordFiles, ignoreCase);
}
项目:Elasticsearch    文件:StandardHtmlStripAnalyzerProvider.java   
@Inject
public StandardHtmlStripAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env,  @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    this.esVersion = Version.indexCreated(indexSettingsService.getSettings());
    // Indices created on 1.0.0.RC1 or later default to no stop words; older indices keep
    // the historical English stop-word default for backwards compatibility.
    final CharArraySet fallbackStopwords = esVersion.onOrAfter(Version.V_1_0_0_RC1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    analyzer = new StandardHtmlStripAnalyzer(Analysis.parseStopWords(env, settings, fallbackStopwords));
    analyzer.setVersion(version);
}
项目:Elasticsearch    文件:StopAnalyzerProvider.java   
@Inject
public StopAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    // Resolve configured stop words, defaulting to Lucene's English stop-word set.
    CharArraySet words = Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    this.stopAnalyzer = new StopAnalyzer(words);
    this.stopAnalyzer.setVersion(version);
}
项目:Elasticsearch    文件:StopTokenFilterFactory.java   
// Builds a "stop" token filter factory: reads ignore_case / remove_trailing and the
// stop-word list (defaulting to Lucene's English stop words).
@Inject
public StopTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    this.ignoreCase = settings.getAsBoolean("ignore_case", false);
    this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
    this.stopWords = Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
    // As of Lucene 4.4 the setting is rejected outright; older compatibility versions
    // may still set it, so the default below is only reachable pre-4.4.
    if (version.onOrAfter(Version.LUCENE_4_4) && settings.get("enable_position_increments") != null) {
        throw new IllegalArgumentException("enable_position_increments is not supported anymore as of Lucene 4.4 as it can create broken token streams."
                + " Please fix your analysis chain or use an older compatibility version (<= 4.3).");
    }
    this.enablePositionIncrements = settings.getAsBoolean("enable_position_increments", true);
}
项目:search    文件:CommonGramsFilterFactory.java   
/** Loads the common-words set from the configured files, or falls back to the English stop words. */
@Override
public void inform(ResourceLoader loader) throws IOException {
  if (commonWordFiles == null) {
    // No word files configured: use the built-in English stop words.
    commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    return;
  }
  commonWords = "snowball".equalsIgnoreCase(format)
      ? getSnowballWordSet(loader, commonWordFiles, ignoreCase)
      : getWordSet(loader, commonWordFiles, ignoreCase);
}
项目:search    文件:TestThaiAnalyzer.java   
// Verifies that removing the English stop word "the" leaves a position-increment gap:
// the token following the removed word carries increment 2 instead of 1.
public void testPositionIncrements() throws Exception {
  final ThaiAnalyzer analyzer = new ThaiAnalyzer(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  // args: expected tokens, start offsets, end offsets, position increments
  assertAnalyzesTo(analyzer, "การที่ได้ต้อง the แสดงว่างานดี", 
      new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
      new int[] { 0, 3, 6, 9, 18, 22, 25, 28 },
      new int[] { 3, 6, 9, 13, 22, 25, 28, 30 },
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });

  // case that a stopword is adjacent to thai text, with no whitespace
  assertAnalyzesTo(analyzer, "การที่ได้ต้องthe แสดงว่างานดี", 
      new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
      new int[] { 0, 3, 6, 9, 17, 21, 24, 27 },
      new int[] { 3, 6, 9, 13, 21, 24, 27, 29 },
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
}
项目:search    文件:PatternAnalyzerTest.java   
/**
 * Test PatternAnalyzer when it is configured with a non-word pattern.
 * Behavior can be similar to SimpleAnalyzer (depending upon options)
 */
public void testNonWordPattern() throws IOException {
  // Split on non-letters, keep original case, no stop words.
  PatternAnalyzer keepCase =
      new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN, false, null);
  check(keepCase, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
      "The", "quick", "brown", "Fox", "the", "abcd", "dc" });

  // Split on non-letters, lower-case, drop English stop words.
  PatternAnalyzer lowered = new PatternAnalyzer(
      TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  check(lowered, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
      "quick", "brown", "fox", "abcd", "dc" });
}
项目:search    文件:PatternAnalyzerTest.java   
/**
 * Test PatternAnalyzer when it is configured with a whitespace pattern.
 * Behavior can be similar to WhitespaceAnalyzer (depending upon options)
 */
public void testWhitespacePattern() throws IOException {
  // Split on whitespace, keep original case, no stop words.
  PatternAnalyzer keepCase =
      new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, false, null);
  check(keepCase, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
      "The", "quick", "brown", "Fox,the", "abcd1234", "(56.78)", "dc." });

  // Split on whitespace, lower-case, drop English stop words.
  PatternAnalyzer lowered = new PatternAnalyzer(
      TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  check(lowered, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
      "quick", "brown", "fox,the", "abcd1234", "(56.78)", "dc." });
}
项目:search    文件:PatternAnalyzerTest.java   
/**
 * Test PatternAnalyzer when it is configured with a custom pattern. In this
 * case, text is tokenized on the comma ","
 */
public void testCustomPattern() throws IOException {
  // Split on comma, keep original case, no stop words.
  PatternAnalyzer keepCase = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), false, null);
  check(keepCase, "Here,Are,some,Comma,separated,words,", new String[] { "Here",
      "Are", "some", "Comma", "separated", "words" });

  // Split on comma, lower-case, drop English stop words ("are" is removed).
  PatternAnalyzer lowered = new PatternAnalyzer(
      TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  check(lowered, "Here,Are,some,Comma,separated,words,", new String[] { "here",
      "some", "comma", "separated", "words" });
}
项目:SciGraph    文件:GraphUtil.java   
/***
 * TODO: This and every spot that uses it is a bit of a hack.
 * This should ideally be handled by the index.
 * @param value the property value to inspect
 * @return true if the value is a whitespace-only string or an English stop word
 */
public static boolean ignoreProperty(Object value) {
  if (!(value instanceof String)) {
    return false;
  }
  String text = (String) value;
  // Locale.ROOT keeps lower-casing stable regardless of the JVM default locale
  // (e.g. the Turkish locale maps "I" to dotless "ı", which would miss stop words like "it").
  return CharMatcher.WHITESPACE.matchesAllOf(text)
      || StopAnalyzer.ENGLISH_STOP_WORDS_SET.contains(text.toLowerCase(java.util.Locale.ROOT));
}
项目:SciGraph    文件:LuceneUtils.java   
/** Returns true if {@code word} case-insensitively matches one of Lucene's English stop words. */
public static boolean isStopword(String word) {
  // The stop-word set stores its entries as char[]; convert each one for comparison.
  for (Object entry : StopAnalyzer.ENGLISH_STOP_WORDS_SET) {
    if (new String((char[]) entry).equalsIgnoreCase(word)) {
      return true;
    }
  }
  return false;
}
项目:NYBC    文件:CommonGramsFilterFactory.java   
/** Resolves the common-words set from the "words" argument, or falls back to the English stop words. */
@Override
public void inform(ResourceLoader loader) throws IOException {
  String commonWordFiles = args.get("words");
  ignoreCase = getBoolean("ignoreCase", false);

  if (commonWordFiles == null) {
    // Nothing configured: use the built-in English stop words.
    commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    return;
  }
  // "snowball"-format files need the snowball parser; everything else is a plain word list.
  commonWords = "snowball".equalsIgnoreCase(args.get("format"))
      ? getSnowballWordSet(loader, commonWordFiles, ignoreCase)
      : getWordSet(loader, commonWordFiles, ignoreCase);
}
项目:NYBC    文件:CommonGramsQueryFilterFactory.java   
/** Resolves the common-words set for the query filter, defaulting to the English stop words. */
@Override
public void inform(ResourceLoader loader) throws IOException {
  String commonWordFiles = args.get("words");
  ignoreCase = getBoolean("ignoreCase", false);

  if (commonWordFiles == null) {
    // Nothing configured: use the built-in English stop words.
    commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    return;
  }
  commonWords = "snowball".equalsIgnoreCase(args.get("format"))
      ? getSnowballWordSet(loader, commonWordFiles, ignoreCase)
      : getWordSet(loader, commonWordFiles, ignoreCase);
}
项目:NYBC    文件:TestThaiAnalyzer.java   
// Verifies that removing the English stop word "the" leaves a position-increment gap:
// the token following the removed word carries increment 2 instead of 1.
public void testPositionIncrements() throws Exception {
  final ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  // args: expected tokens, start offsets, end offsets, position increments
  assertAnalyzesTo(analyzer, "การที่ได้ต้อง the แสดงว่างานดี", 
      new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
      new int[] { 0, 3, 6, 9, 18, 22, 25, 28 },
      new int[] { 3, 6, 9, 13, 22, 25, 28, 30 },
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });

  // case that a stopword is adjacent to thai text, with no whitespace
  assertAnalyzesTo(analyzer, "การที่ได้ต้องthe แสดงว่างานดี", 
      new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
      new int[] { 0, 3, 6, 9, 17, 21, 24, 27 },
      new int[] { 3, 6, 9, 13, 21, 24, 27, 29 },
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
}
项目:NYBC    文件:PatternAnalyzerTest.java   
/**
 * Test PatternAnalyzer when it is configured with a non-word pattern.
 * Behavior can be similar to SimpleAnalyzer (depending upon options)
 */
public void testNonWordPattern() throws IOException {
  // Split on non-letters, keep original case, no stop words.
  PatternAnalyzer keepCase =
      new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN, false, null);
  check(keepCase, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
      "The", "quick", "brown", "Fox", "the", "abcd", "dc" });

  // Split on non-letters, lower-case, drop English stop words.
  PatternAnalyzer lowered = new PatternAnalyzer(
      TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  check(lowered, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
      "quick", "brown", "fox", "abcd", "dc" });
}
项目:NYBC    文件:PatternAnalyzerTest.java   
/**
 * Test PatternAnalyzer when it is configured with a whitespace pattern.
 * Behavior can be similar to WhitespaceAnalyzer (depending upon options)
 */
public void testWhitespacePattern() throws IOException {
  // Split on whitespace, keep original case, no stop words.
  PatternAnalyzer keepCase =
      new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, false, null);
  check(keepCase, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
      "The", "quick", "brown", "Fox,the", "abcd1234", "(56.78)", "dc." });

  // Split on whitespace, lower-case, drop English stop words.
  PatternAnalyzer lowered = new PatternAnalyzer(
      TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  check(lowered, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
      "quick", "brown", "fox,the", "abcd1234", "(56.78)", "dc." });
}
项目:NYBC    文件:PatternAnalyzerTest.java   
/**
 * Test PatternAnalyzer when it is configured with a custom pattern. In this
 * case, text is tokenized on the comma ","
 */
public void testCustomPattern() throws IOException {
  // Split on comma, keep original case, no stop words.
  PatternAnalyzer keepCase = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), false, null);
  check(keepCase, "Here,Are,some,Comma,separated,words,", new String[] { "Here",
      "Are", "some", "Comma", "separated", "words" });

  // Split on comma, lower-case, drop English stop words ("are" is removed).
  PatternAnalyzer lowered = new PatternAnalyzer(
      TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  check(lowered, "Here,Are,some,Comma,separated,words,", new String[] { "here",
      "some", "comma", "separated", "words" });
}
项目:read-open-source-code    文件:CommonGramsFilterFactory.java   
/** Loads the common-words set from the configured files, or falls back to the English stop words. */
@Override
public void inform(ResourceLoader loader) throws IOException {
  if (commonWordFiles == null) {
    // No word files configured: use the built-in English stop words.
    commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    return;
  }
  commonWords = "snowball".equalsIgnoreCase(format)
      ? getSnowballWordSet(loader, commonWordFiles, ignoreCase)
      : getWordSet(loader, commonWordFiles, ignoreCase);
}
项目:read-open-source-code    文件:CommonGramsFilterFactory.java   
/** Chooses the common-words source: configured files (plain or snowball format) or the default English stop words. */
@Override
public void inform(ResourceLoader loader) throws IOException {
  commonWords =
      (commonWordFiles == null)
          ? StopAnalyzer.ENGLISH_STOP_WORDS_SET
          : ("snowball".equalsIgnoreCase(format)
              ? getSnowballWordSet(loader, commonWordFiles, ignoreCase)
              : getWordSet(loader, commonWordFiles, ignoreCase));
}
项目:read-open-source-code    文件:CommonGramsFilterFactory.java   
// Loads the common-words set once resources are available: from the configured word
// files ("snowball" format uses a dedicated parser), otherwise from Lucene's default
// English stop-word set.
@Override
public void inform(ResourceLoader loader) throws IOException {
  if (commonWordFiles != null) {
    if ("snowball".equalsIgnoreCase(format)) {
      commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
    } else {
      commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
    }
  } else {
    // No word files configured: fall back to the built-in English stop words.
    commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
  }
}
项目:dli-downloader    文件:LuceneSearcher.java   
// Builds a searcher directly on a live IndexWriter (near-real-time view of the index),
// with a StandardAnalyzer using the default English stop words, pinned to Lucene 4.6.
public LuceneSearcher(IndexWriter writer) throws IOException, ParseException {
    searcherManager = new SearcherManager(writer, true, null);
    analyzer = new StandardAnalyzer(Version.LUCENE_46, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    parser = new QueryParser(Version.LUCENE_46, field, analyzer);
    // Permit queries like "*foo" and analyze range-query endpoints with the analyzer above.
    parser.setAllowLeadingWildcard(true);
    parser.setAnalyzeRangeTerms(true);
}
项目:dli-downloader    文件:LuceneSearcher.java   
// Opens a searcher over the on-disk DLI index under the app's configured index location,
// with a StandardAnalyzer using the default English stop words, pinned to Lucene 4.6.
public LuceneSearcher(AppContext appContext) throws IOException, ParseException {
    directory = NIOFSDirectory.open(new File(appContext.getIndexLocation(), AppConstants.DLI_INDEX));
    searcherManager = new SearcherManager(directory, null);
    analyzer = new StandardAnalyzer(Version.LUCENE_46, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    parser = new QueryParser(Version.LUCENE_46, field, analyzer);
    // Permit queries like "*foo" and analyze range-query endpoints with the analyzer above.
    parser.setAllowLeadingWildcard(true);
    parser.setAnalyzeRangeTerms(true);
}
项目:Maskana-Gestor-de-Conocimiento    文件:CommonGramsFilterFactory.java   
/** Loads the common-words set from the configured files, or falls back to the English stop words. */
@Override
public void inform(ResourceLoader loader) throws IOException {
  if (commonWordFiles == null) {
    // No word files configured: use the built-in English stop words.
    commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    return;
  }
  commonWords = "snowball".equalsIgnoreCase(format)
      ? getSnowballWordSet(loader, commonWordFiles, ignoreCase)
      : getWordSet(loader, commonWordFiles, ignoreCase);
}
项目:Maskana-Gestor-de-Conocimiento    文件:TestThaiAnalyzer.java   
// Verifies that removing the English stop word "the" leaves a position-increment gap:
// the token following the removed word carries increment 2 instead of 1.
public void testPositionIncrements() throws Exception {
  final ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  // args: expected tokens, start offsets, end offsets, position increments
  assertAnalyzesTo(analyzer, "การที่ได้ต้อง the แสดงว่างานดี", 
      new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
      new int[] { 0, 3, 6, 9, 18, 22, 25, 28 },
      new int[] { 3, 6, 9, 13, 22, 25, 28, 30 },
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });

  // case that a stopword is adjacent to thai text, with no whitespace
  assertAnalyzesTo(analyzer, "การที่ได้ต้องthe แสดงว่างานดี", 
      new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
      new int[] { 0, 3, 6, 9, 17, 21, 24, 27 },
      new int[] { 3, 6, 9, 13, 21, 24, 27, 29 },
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
}
项目:Maskana-Gestor-de-Conocimiento    文件:PatternAnalyzerTest.java   
/**
 * Test PatternAnalyzer when it is configured with a non-word pattern.
 * Behavior can be similar to SimpleAnalyzer (depending upon options)
 */
public void testNonWordPattern() throws IOException {
  // Split on non-letters, keep original case, no stop words.
  PatternAnalyzer keepCase =
      new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN, false, null);
  check(keepCase, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
      "The", "quick", "brown", "Fox", "the", "abcd", "dc" });

  // Split on non-letters, lower-case, drop English stop words.
  PatternAnalyzer lowered = new PatternAnalyzer(
      TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  check(lowered, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
      "quick", "brown", "fox", "abcd", "dc" });
}
项目:Maskana-Gestor-de-Conocimiento    文件:PatternAnalyzerTest.java   
/**
 * Test PatternAnalyzer when it is configured with a whitespace pattern.
 * Behavior can be similar to WhitespaceAnalyzer (depending upon options)
 */
public void testWhitespacePattern() throws IOException {
  // Split on whitespace, keep original case, no stop words.
  PatternAnalyzer keepCase =
      new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, false, null);
  check(keepCase, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
      "The", "quick", "brown", "Fox,the", "abcd1234", "(56.78)", "dc." });

  // Split on whitespace, lower-case, drop English stop words.
  PatternAnalyzer lowered = new PatternAnalyzer(
      TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  check(lowered, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
      "quick", "brown", "fox,the", "abcd1234", "(56.78)", "dc." });
}
项目:Maskana-Gestor-de-Conocimiento    文件:PatternAnalyzerTest.java   
/**
 * Test PatternAnalyzer when it is configured with a custom pattern. In this
 * case, text is tokenized on the comma ","
 */
public void testCustomPattern() throws IOException {
  // Split on comma, keep original case, no stop words.
  PatternAnalyzer keepCase = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), false, null);
  check(keepCase, "Here,Are,some,Comma,separated,words,", new String[] { "Here",
      "Are", "some", "Comma", "separated", "words" });

  // Split on comma, lower-case, drop English stop words ("are" is removed).
  PatternAnalyzer lowered = new PatternAnalyzer(
      TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  check(lowered, "Here,Are,some,Comma,separated,words,", new String[] { "here",
      "some", "comma", "separated", "words" });
}
项目:elasticsearch_my    文件:StopAnalyzerProvider.java   
/** Returns the {@link StopAnalyzer} instance created by this provider. */
@Override
public StopAnalyzer get() {
    return this.stopAnalyzer;
}