public StandardAnalyzerProvider(Index index, Settings indexSettings, Environment env, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.esVersion = Version.indexCreated(indexSettings);
    final CharArraySet defaultStopwords;
    if (esVersion.onOrAfter(Version.V_1_0_0_Beta1)) {
        defaultStopwords = CharArraySet.EMPTY_SET;
    } else {
        defaultStopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }
    CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords);
    int maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
    standardAnalyzer = new StandardAnalyzer(stopWords);
    standardAnalyzer.setVersion(version);
    standardAnalyzer.setMaxTokenLength(maxTokenLength);
}
@Inject
public PatternAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env,
                               @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    Version esVersion = Version.indexCreated(indexSettingsService.getSettings());
    final CharArraySet defaultStopwords;
    if (esVersion.onOrAfter(Version.V_1_0_0_RC1)) {
        defaultStopwords = CharArraySet.EMPTY_SET;
    } else {
        defaultStopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }
    boolean lowercase = settings.getAsBoolean("lowercase", true);
    CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords);
    String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
    if (sPattern == null) {
        throw new IllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
    }
    Pattern pattern = Regex.compile(sPattern, settings.get("flags"));
    analyzer = new PatternAnalyzer(pattern, lowercase, stopWords);
}
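A minimal sketch of the settings this provider consumes; the keys (`pattern`, `lowercase`, `flags`) come from the constructor above, while the builder call assumes an Elasticsearch version where `Settings.builder()` exists (older 1.x releases used `ImmutableSettings.settingsBuilder()`).

    // Hypothetical example only: builds the Settings read by PatternAnalyzerProvider above.
    Settings analyzerSettings = Settings.builder()
            .put("pattern", ",")                 // tokenize on commas
            .put("lowercase", false)             // keep original case
            .put("flags", "CASE_INSENSITIVE")    // parsed by Regex.compile
            .build();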
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
    Analyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

    // dodge jre bug http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=7104012
    final UncaughtExceptionHandler savedHandler = Thread.getDefaultUncaughtExceptionHandler();
    Thread.setDefaultUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() {
        @Override
        public void uncaughtException(Thread thread, Throwable throwable) {
            assumeTrue("not failing due to jre bug ", !isJREBug7104012(throwable));
            // otherwise it's some other bug, pass to the default handler
            savedHandler.uncaughtException(thread, throwable);
        }
    });

    try {
        checkRandomData(random(), a, 10000 * RANDOM_MULTIPLIER);
    } catch (ArrayIndexOutOfBoundsException ex) {
        assumeTrue("not failing due to jre bug ", !isJREBug7104012(ex));
        throw ex; // otherwise rethrow
    } finally {
        Thread.setDefaultUncaughtExceptionHandler(savedHandler);
    }
}
@Override
public void inform(ResourceLoader loader) throws IOException {
    String stopWordFiles = args.get("words");
    ignoreCase = getBoolean("ignoreCase", false);
    enablePositionIncrements = getBoolean("enablePositionIncrements", false);

    if (stopWordFiles != null) {
        if ("snowball".equalsIgnoreCase(args.get("format"))) {
            stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
        } else {
            stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
        }
    } else {
        stopWords = new CharArraySet(luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
    }
}
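The `format` check above distinguishes two word-list layouts. A hedged sketch of loading each directly with Lucene's `WordlistLoader` (the overloads taking a result `CharArraySet` and the Version-less `CharArraySet` constructor are assumed to exist in your Lucene release; "stopwords.txt" is a placeholder path).

    // Hypothetical sketch, not the factory's internals.
    // Plain format: one entry per line.
    // Snowball format: whitespace-separated entries per line; '|' starts a comment.
    try (Reader r = new InputStreamReader(new FileInputStream("stopwords.txt"), StandardCharsets.UTF_8)) {
        CharArraySet words = WordlistLoader.getSnowballWordSet(r, new CharArraySet(16, true));
    }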
public static void main(String[] args) throws IOException {
    String theSentence = "this is the scientific article about chemicals like H20 C2H50H with concentration "
            + "of 3.99 kilograms and 0,123 micrograms also i have some CO2 gas n=3 x=45";
    StringReader reader = new StringReader(theSentence);

    Tokenizer whitespaceTokenizer = new WhitespaceTokenizer(reader);
    TokenStream tokenStream = new StopFilter(whitespaceTokenizer, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    tokenStream = new ScientificFiltering(tokenStream);

    final CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        System.out.println(charTermAttribute.toString());
    }
    tokenStream.end();
    tokenStream.close();
}
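The same consume loop, sketched with try-with-resources so `close()` runs even if `incrementToken()` throws; `TokenStream` implements `Closeable`, and this assumes the same Lucene version and custom `ScientificFiltering` filter as the snippet above.

    // Hypothetical variant of the loop above; the enclosing method is assumed to throw IOException.
    try (TokenStream ts = new ScientificFiltering(
            new StopFilter(new WhitespaceTokenizer(new StringReader("the CO2 gas")),
                    StopAnalyzer.ENGLISH_STOP_WORDS_SET))) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term);
        }
        ts.end();
    }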
public StopAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    CharArraySet stopWords = Analysis.parseStopWords(
            env, indexSettings.getIndexVersionCreated(), settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    this.stopAnalyzer = new StopAnalyzer(stopWords);
    this.stopAnalyzer.setVersion(version);
}
public StopTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.ignoreCase = settings.getAsBooleanLenientForPreEs6Indices(
            indexSettings.getIndexVersionCreated(), "ignore_case", false, deprecationLogger);
    this.removeTrailing = settings.getAsBooleanLenientForPreEs6Indices(
            indexSettings.getIndexVersionCreated(), "remove_trailing", true, deprecationLogger);
    this.stopWords = Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
    if (settings.get("enable_position_increments") != null) {
        throw new IllegalArgumentException("enable_position_increments is not supported anymore. Please fix your analysis chain");
    }
}
/**
 * Test PatternAnalyzer when it is configured with a non-word pattern.
 */
public void testNonWordPattern() throws IOException {
    // Split on non-letter pattern, do not lowercase, no stopwords
    PatternAnalyzer a = new PatternAnalyzer(Pattern.compile("\\W+"), false, null);
    assertAnalyzesTo(a, "The quick brown Fox,the abcd1234 (56.78) dc.",
            new String[] { "The", "quick", "brown", "Fox", "the", "abcd1234", "56", "78", "dc" });

    // Split on non-letter pattern, lowercase, english stopwords
    PatternAnalyzer b = new PatternAnalyzer(Pattern.compile("\\W+"), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    assertAnalyzesTo(b, "The quick brown Fox,the abcd1234 (56.78) dc.",
            new String[] { "quick", "brown", "fox", "abcd1234", "56", "78", "dc" });
}
/**
 * Test PatternAnalyzer when it is configured with a whitespace pattern.
 * Behavior can be similar to WhitespaceAnalyzer (depending upon options)
 */
public void testWhitespacePattern() throws IOException {
    // Split on whitespace patterns, do not lowercase, no stopwords
    PatternAnalyzer a = new PatternAnalyzer(Pattern.compile("\\s+"), false, null);
    assertAnalyzesTo(a, "The quick brown Fox,the abcd1234 (56.78) dc.",
            new String[] { "The", "quick", "brown", "Fox,the", "abcd1234", "(56.78)", "dc." });

    // Split on whitespace patterns, lowercase, english stopwords
    PatternAnalyzer b = new PatternAnalyzer(Pattern.compile("\\s+"), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    assertAnalyzesTo(b, "The quick brown Fox,the abcd1234 (56.78) dc.",
            new String[] { "quick", "brown", "fox,the", "abcd1234", "(56.78)", "dc." });
}
/** * Test PatternAnalyzer when it is configured with a custom pattern. In this * case, text is tokenized on the comma "," */ public void testCustomPattern() throws IOException { // Split on comma, do not lowercase, no stopwords PatternAnalyzer a = new PatternAnalyzer(Pattern.compile(","), false, null); assertAnalyzesTo(a, "Here,Are,some,Comma,separated,words,", new String[] { "Here", "Are", "some", "Comma", "separated", "words" }); // split on comma, lowercase, english stopwords PatternAnalyzer b = new PatternAnalyzer(Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET); assertAnalyzesTo(b, "Here,Are,some,Comma,separated,words,", new String[] { "here", "some", "comma", "separated", "words" }); }
@Override
public void inform(ResourceLoader loader) throws IOException {
    if (commonWordFiles != null) {
        if ("snowball".equalsIgnoreCase(format)) {
            commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
        } else {
            commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
        }
    } else {
        commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }
}
@Inject
public StandardHtmlStripAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env,
                                         @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    this.esVersion = Version.indexCreated(indexSettingsService.getSettings());
    final CharArraySet defaultStopwords;
    if (esVersion.onOrAfter(Version.V_1_0_0_RC1)) {
        defaultStopwords = CharArraySet.EMPTY_SET;
    } else {
        defaultStopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }
    CharArraySet stopWords = Analysis.parseStopWords(env, settings, defaultStopwords);
    analyzer = new StandardHtmlStripAnalyzer(stopWords);
    analyzer.setVersion(version);
}
@Inject
public StopAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env,
                            @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    CharArraySet stopWords = Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    this.stopAnalyzer = new StopAnalyzer(stopWords);
    this.stopAnalyzer.setVersion(version);
}
@Inject
public StopTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env,
                              @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    this.ignoreCase = settings.getAsBoolean("ignore_case", false);
    this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
    this.stopWords = Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
    if (version.onOrAfter(Version.LUCENE_4_4) && settings.get("enable_position_increments") != null) {
        throw new IllegalArgumentException("enable_position_increments is not supported anymore as of Lucene 4.4 as it can create broken token streams."
                + " Please fix your analysis chain or use an older compatibility version (<= 4.3).");
    }
    this.enablePositionIncrements = settings.getAsBoolean("enable_position_increments", true);
}
public void testPositionIncrements() throws Exception {
    final ThaiAnalyzer analyzer = new ThaiAnalyzer(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    assertAnalyzesTo(analyzer, "การที่ได้ต้อง the แสดงว่างานดี",
            new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
            new int[] { 0, 3, 6, 9, 18, 22, 25, 28 },
            new int[] { 3, 6, 9, 13, 22, 25, 28, 30 },
            new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });

    // case that a stopword is adjacent to thai text, with no whitespace
    assertAnalyzesTo(analyzer, "การที่ได้ต้องthe แสดงว่างานดี",
            new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
            new int[] { 0, 3, 6, 9, 17, 21, 24, 27 },
            new int[] { 3, 6, 9, 13, 21, 24, 27, 29 },
            new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
}
/**
 * Test PatternAnalyzer when it is configured with a non-word pattern.
 * Behavior can be similar to SimpleAnalyzer (depending upon options)
 */
public void testNonWordPattern() throws IOException {
    // Split on non-letter pattern, do not lowercase, no stopwords
    PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN, false, null);
    check(a, "The quick brown Fox,the abcd1234 (56.78) dc.",
            new String[] { "The", "quick", "brown", "Fox", "the", "abcd", "dc" });

    // Split on non-letter pattern, lowercase, english stopwords
    PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    check(b, "The quick brown Fox,the abcd1234 (56.78) dc.",
            new String[] { "quick", "brown", "fox", "abcd", "dc" });
}
/**
 * Test PatternAnalyzer when it is configured with a whitespace pattern.
 * Behavior can be similar to WhitespaceAnalyzer (depending upon options)
 */
public void testWhitespacePattern() throws IOException {
    // Split on whitespace patterns, do not lowercase, no stopwords
    PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, false, null);
    check(a, "The quick brown Fox,the abcd1234 (56.78) dc.",
            new String[] { "The", "quick", "brown", "Fox,the", "abcd1234", "(56.78)", "dc." });

    // Split on whitespace patterns, lowercase, english stopwords
    PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    check(b, "The quick brown Fox,the abcd1234 (56.78) dc.",
            new String[] { "quick", "brown", "fox,the", "abcd1234", "(56.78)", "dc." });
}
/** * Test PatternAnalyzer when it is configured with a custom pattern. In this * case, text is tokenized on the comma "," */ public void testCustomPattern() throws IOException { // Split on comma, do not lowercase, no stopwords PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), false, null); check(a, "Here,Are,some,Comma,separated,words,", new String[] { "Here", "Are", "some", "Comma", "separated", "words" }); // split on comma, lowercase, english stopwords PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET); check(b, "Here,Are,some,Comma,separated,words,", new String[] { "here", "some", "comma", "separated", "words" }); }
/**
 * TODO: This and every spot that uses it is a bit of a hack.
 * This should ideally be handled by the index.
 *
 * @param value the property value to test
 * @return true if the value is a String that is all whitespace or a common English stopword
 */
public static boolean ignoreProperty(Object value) {
    return value instanceof String
            && (CharMatcher.WHITESPACE.matchesAllOf((String) value)
                || StopAnalyzer.ENGLISH_STOP_WORDS_SET.contains(((String) value).toLowerCase()));
}
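A hedged usage sketch of the helper above; the sample values are made up for illustration.

    // Hypothetical call sites, with the results the code above would produce.
    ignoreProperty("   ");   // true: all whitespace
    ignoreProperty("the");   // true: English stopword
    ignoreProperty("Fox");   // false: indexed normally
    ignoreProperty(42);      // false: not a String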
public static boolean isStopword(String word) {
    // CharArraySet's iterator yields char[] entries, so each one is wrapped in a String for comparison.
    for (Iterator<?> it = StopAnalyzer.ENGLISH_STOP_WORDS_SET.iterator(); it.hasNext();) {
        String stopword = new String((char[]) it.next());
        if (stopword.equalsIgnoreCase(word)) {
            return true;
        }
    }
    return false;
}
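Since `CharArraySet` supports direct membership tests, a hedged alternative that avoids the linear scan; it assumes the default set's entries are all lowercase (which holds for `ENGLISH_STOP_WORDS_SET`) and needs `import java.util.Locale;`.

    // Hypothetical constant-time variant of isStopword above.
    public static boolean isStopwordFast(String word) {
        return StopAnalyzer.ENGLISH_STOP_WORDS_SET.contains(word.toLowerCase(Locale.ROOT));
    }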
@Override
public void inform(ResourceLoader loader) throws IOException {
    String commonWordFiles = args.get("words");
    ignoreCase = getBoolean("ignoreCase", false);

    if (commonWordFiles != null) {
        if ("snowball".equalsIgnoreCase(args.get("format"))) {
            commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
        } else {
            commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
        }
    } else {
        commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }
}
public void testPositionIncrements() throws Exception {
    final ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    assertAnalyzesTo(analyzer, "การที่ได้ต้อง the แสดงว่างานดี",
            new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
            new int[] { 0, 3, 6, 9, 18, 22, 25, 28 },
            new int[] { 3, 6, 9, 13, 22, 25, 28, 30 },
            new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });

    // case that a stopword is adjacent to thai text, with no whitespace
    assertAnalyzesTo(analyzer, "การที่ได้ต้องthe แสดงว่างานดี",
            new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
            new int[] { 0, 3, 6, 9, 17, 21, 24, 27 },
            new int[] { 3, 6, 9, 13, 21, 24, 27, 29 },
            new int[] { 1, 1, 1, 1, 2, 1, 1, 1 });
}
public LuceneSearcher(IndexWriter writer) throws IOException, ParseException {
    searcherManager = new SearcherManager(writer, true, null);
    analyzer = new StandardAnalyzer(Version.LUCENE_46, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    parser = new QueryParser(Version.LUCENE_46, field, analyzer);
    parser.setAllowLeadingWildcard(true);
    parser.setAnalyzeRangeTerms(true);
}
public LuceneSearcher(AppContext appContext) throws IOException, ParseException {
    directory = NIOFSDirectory.open(new File(appContext.getIndexLocation(), AppConstants.DLI_INDEX));
    searcherManager = new SearcherManager(directory, null);
    analyzer = new StandardAnalyzer(Version.LUCENE_46, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    parser = new QueryParser(Version.LUCENE_46, field, analyzer);
    parser.setAllowLeadingWildcard(true);
    parser.setAnalyzeRangeTerms(true);
}
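A hedged sketch of how the `searcherManager` and `parser` built above would typically be used; the query string is illustrative, and the acquire/release pairing is the standard SearcherManager contract.

    // Hypothetical search call; the enclosing method is assumed to declare IOException and ParseException.
    IndexSearcher searcher = searcherManager.acquire();
    try {
        TopDocs hits = searcher.search(parser.parse("chem*"), 10);
        System.out.println("total hits: " + hits.totalHits);
    } finally {
        searcherManager.release(searcher);
    }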
@Override
public StopAnalyzer get() {
    return this.stopAnalyzer;
}
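Finally, a hedged sketch of consuming the analyzer that `get()` returns; `provider` stands for any of the StopAnalyzerProvider instances above, and the field name and sample text are made up.

    // Hypothetical usage; StopAnalyzer lowercases and removes stopwords such as "the".
    // The enclosing method is assumed to throw IOException.
    StopAnalyzer analyzer = provider.get();
    try (TokenStream ts = analyzer.tokenStream("body", "the quick brown fox")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term); // prints: quick, brown, fox
        }
        ts.end();
    }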