/**
 * Returns a {@link CharArraySet} from wordFiles, which
 * can be a comma-separated list of filenames
 */
protected final CharArraySet getWordSet(ResourceLoader loader,
    String wordFiles, boolean ignoreCase) throws IOException {
  assureMatchVersion();
  List<String> files = splitFileNames(wordFiles);
  CharArraySet words = null;
  if (files.size() > 0) {
    // default stopwords list has 35 or so words, but maybe don't make it that
    // big to start
    words = new CharArraySet(luceneMatchVersion, files.size() * 10, ignoreCase);
    for (String file : files) {
      List<String> wlist = getLines(loader, file.trim());
      words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist, ignoreCase));
    }
  }
  return words;
}
/**
 * Creates a token stream that tokenizes the given string into token terms
 * (aka words).
 *
 * @param fieldName
 *          the name of the field to tokenize (currently ignored).
 * @param reader
 *          reader (e.g. charfilter) of the original text. can be null.
 * @param text
 *          the string to tokenize
 * @return a new token stream
 */
public TokenStreamComponents createComponents(String fieldName, Reader reader, String text) {
  // Ideally the Analyzer superclass should have a method with the same signature,
  // with a default impl that simply delegates to the StringReader flavour.
  if (reader == null)
    reader = new FastStringReader(text);

  if (pattern == NON_WORD_PATTERN) { // fast path
    return new TokenStreamComponents(new FastStringTokenizer(reader, true, toLowerCase, stopWords));
  } else if (pattern == WHITESPACE_PATTERN) { // fast path
    return new TokenStreamComponents(new FastStringTokenizer(reader, false, toLowerCase, stopWords));
  }

  Tokenizer tokenizer = new PatternTokenizer(reader, pattern, toLowerCase);
  TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
  return new TokenStreamComponents(tokenizer, result);
}
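For context, a minimal sketch of how components like the above are consumed in practice. Callers never touch TokenStreamComponents directly; they pull a TokenStream from the Analyzer and iterate it. StandardAnalyzer here is only a stand-in for the pattern analyzer, and the no-argument constructor, field name, and sample text are assumptions for illustration (the no-arg constructor requires a Lucene version that no longer takes a Version argument):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamUsage {
  public static void main(String[] args) throws IOException {
    try (Analyzer analyzer = new StandardAnalyzer();
         TokenStream ts = analyzer.tokenStream("body", "The quick brown fox")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();                       // required before the first incrementToken()
      while (ts.incrementToken()) {
        System.out.println(term.toString());
      }
      ts.end();                         // records end-of-stream offset state
    }                                   // close() happens via try-with-resources
  }
}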
public void testEndingHole() throws Exception {
  // Just deletes "of"
  Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader);
        CharArraySet stopSet = StopFilter.makeStopSet("of");
        return new TokenStreamComponents(tokenizer, new StopFilter(tokenizer, stopSet));
      }
    };

  Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(
      new Input("wizard of oz", 50));
  FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
  sug.build(new InputArrayIterator(keys));
  assertEquals("wizard _ oz/1.00",
               toString(sug.lookup("wizard of", 10)));

  // Falls back to unigram model, with backoff 0.4 times
  // prob 0.5:
  assertEquals("oz/0.20",
               toString(sug.lookup("wizard o", 10)));
}
public void testTwoEndingHoles() throws Exception {
  // Just deletes "of"
  Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader);
        CharArraySet stopSet = StopFilter.makeStopSet("of");
        return new TokenStreamComponents(tokenizer, new StopFilter(tokenizer, stopSet));
      }
    };

  Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(
      new Input("wizard of of oz", 50));
  FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
  sug.build(new InputArrayIterator(keys));
  assertEquals("", toString(sug.lookup("wizard of of", 10)));
}
public void testEndIsStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  TokenStream stream = new MockTokenizer(new StringReader("go to "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            6,
                            new boolean[] {false},
                            true);
}
public void testMidStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  TokenStream stream = new MockTokenizer(new StringReader("go to school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "school"},
                            new int[] {0, 6},
                            new int[] {2, 12},
                            null,
                            new int[] {1, 2},
                            null,
                            12,
                            new boolean[] {false, false},
                            true);
}
public void testMultipleStopWords() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a");
  TokenStream stream = new MockTokenizer(new StringReader("go to a the school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "school"},
                            new int[] {0, 12},
                            new int[] {2, 18},
                            null,
                            new int[] {1, 4},
                            null,
                            18,
                            new boolean[] {false, false},
                            true);
}
public void testMultipleStopWordsEnd() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a");
  TokenStream stream = new MockTokenizer(new StringReader("go to a the"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "the"},
                            new int[] {0, 8},
                            new int[] {2, 11},
                            null,
                            new int[] {1, 3},
                            null,
                            11,
                            new boolean[] {false, true},
                            true);
}
public void testMultipleStopWordsEnd2() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a");
  TokenStream stream = new MockTokenizer(new StringReader("go to a the "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            12,
                            new boolean[] {false},
                            true);
}
private CharArraySet getWordSet(ResourceLoader loader,
    String wordFiles, boolean ignoreCase) throws IOException {
  List<String> files = splitFileNames(wordFiles);
  CharArraySet words = null;
  if (files.size() > 0) {
    // default stopwords list has 35 or so words, but maybe don't make it that
    // big to start
    words = new CharArraySet(files.size() * 10, ignoreCase);
    for (String file : files) {
      List<String> wlist = getLines(loader, file.trim());
      words.addAll(StopFilter.makeStopSet(wlist, ignoreCase));
    }
  }
  return words;
}
/**
 * Returns a {@link CharArraySet} from wordFiles, which
 * can be a comma-separated list of filenames
 */
protected CharArraySet getWordSet(ResourceLoader loader,
    String wordFiles, boolean ignoreCase) throws IOException {
  assureMatchVersion();
  List<String> files = splitFileNames(wordFiles);
  CharArraySet words = null;
  if (files.size() > 0) {
    // default stopwords list has 35 or so words, but maybe don't make it that
    // big to start
    words = new CharArraySet(luceneMatchVersion, files.size() * 10, ignoreCase);
    for (String file : files) {
      List<String> wlist = getLines(loader, file.trim());
      words.addAll(StopFilter.makeStopSet(luceneMatchVersion, wlist, ignoreCase));
    }
  }
  return words;
}
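A sketch of where getWordSet() is typically called from: a ResourceLoaderAware factory receives the ResourceLoader in inform() and builds its stop set there. The class name, the argument names ("words", "ignoreCase"), and the Lucene 5.x/6.x package layout below are assumptions for illustration, not part of the original source:

import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;

public class CustomStopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
  private final String wordFiles;   // comma-separated list, e.g. "stop1.txt,stop2.txt"
  private final boolean ignoreCase;
  private CharArraySet stopWords;

  public CustomStopFilterFactory(Map<String, String> args) {
    super(args);
    wordFiles = get(args, "words");
    ignoreCase = getBoolean(args, "ignoreCase", false);
  }

  @Override
  public void inform(ResourceLoader loader) throws IOException {
    // resource files only become reachable once the loader is handed in here
    stopWords = getWordSet(loader, wordFiles, ignoreCase);
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new StopFilter(input, stopWords);
  }
}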
public static void main(String[] args) throws IOException {
  String theSentence = "this is the scientific article about chemicals like H20 C2H50H with concentration "
      + "of 3.99 kilograms and 0,123 micrograms also i have some CO2 gas n=3 x=45";
  StringReader reader = new StringReader(theSentence);

  Tokenizer whitespaceTokenizer = new WhitespaceTokenizer(reader);
  TokenStream tokenStream = new StopFilter(whitespaceTokenizer, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
  tokenStream = new ScientificFiltering(tokenStream);

  final CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
  tokenStream.reset();
  while (tokenStream.incrementToken()) {
    System.out.println(charTermAttribute.toString());
  }
  tokenStream.end();
  tokenStream.close();
}
public void testEndingHole() throws Exception {
  // Just deletes "of"
  Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader);
        CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "of");
        return new TokenStreamComponents(tokenizer, new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet));
      }
    };

  Iterable<Input> keys = shuffle(
      new Input("wizard of oz", 50));
  FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
  sug.build(new InputArrayIterator(keys));
  assertEquals("wizard _ oz/1.00",
               toString(sug.lookup("wizard of", 10)));

  // Falls back to unigram model, with backoff 0.4 times
  // prob 0.5:
  assertEquals("oz/0.20",
               toString(sug.lookup("wizard o", 10)));
}
public void testTwoEndingHoles() throws Exception {
  // Just deletes "of"
  Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String field, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader);
        CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "of");
        return new TokenStreamComponents(tokenizer, new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet));
      }
    };

  Iterable<Input> keys = shuffle(
      new Input("wizard of of oz", 50));
  FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
  sug.build(new InputArrayIterator(keys));
  assertEquals("", toString(sug.lookup("wizard of of", 10)));
}
public void testEndIsStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to");
  TokenStream stream = new MockTokenizer(new StringReader("go to "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            6,
                            new boolean[] {false},
                            true);
}
public void testMidStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to");
  TokenStream stream = new MockTokenizer(new StringReader("go to school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "school"},
                            new int[] {0, 6},
                            new int[] {2, 12},
                            null,
                            new int[] {1, 2},
                            null,
                            12,
                            new boolean[] {false, false},
                            true);
}
public void testMultipleStopWords() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to", "the", "a");
  TokenStream stream = new MockTokenizer(new StringReader("go to a the school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "school"},
                            new int[] {0, 12},
                            new int[] {2, 18},
                            null,
                            new int[] {1, 4},
                            null,
                            18,
                            new boolean[] {false, false},
                            true);
}
public void testMultipleStopWordsEnd() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to", "the", "a");
  TokenStream stream = new MockTokenizer(new StringReader("go to a the"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "the"},
                            new int[] {0, 8},
                            new int[] {2, 11},
                            null,
                            new int[] {1, 3},
                            null,
                            11,
                            new boolean[] {false, true},
                            true);
}
public void testMultipleStopWordsEnd2() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "to", "the", "a");
  TokenStream stream = new MockTokenizer(new StringReader("go to a the "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            12,
                            new boolean[] {false},
                            true);
}
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
  Set<String> stopWords = stopWordsPerField.get(fieldName);
  if (stopWords == null) {
    return components;
  }
  StopFilter stopFilter = new StopFilter(components.getTokenStream(), new CharArraySet(stopWords, false));
  return new TokenStreamComponents(components.getTokenizer(), stopFilter);
}
@Override
public TokenStream create(TokenStream tokenStream) {
  if (removeTrailing) {
    if (version.onOrAfter(Version.LUCENE_4_4)) {
      return new StopFilter(tokenStream, stopWords);
    } else {
      return new Lucene43StopFilter(enablePositionIncrements, tokenStream, stopWords);
    }
  } else {
    return new SuggestStopFilter(tokenStream, stopWords);
  }
}
private TokenStream getStopFilter(String lang, Set<String> metadataStopWords, TokenStream stream) {
  if (metadataStopWords != null && !metadataStopWords.isEmpty()) {
    return new StopFilter(stream, new CharArraySet(metadataStopWords, false));
  } else {
    try {
      InputStream in = ClassLoader.getSystemResourceAsStream(lang.toLowerCase() + ".stopwords");
      if (in != null) {
        logger.debug("Loading Stop words for lang={}", lang);
        CharArraySet stopWords = new CharArraySet(30, true);
        try (BufferedReader bin = new BufferedReader(new InputStreamReader(in))) {
          String line;
          String[] parts;
          while ((line = bin.readLine()) != null) {
            parts = line.split(Pattern.quote("|"));
            line = parts[0].trim();
            if (line.length() > 0) {
              stopWords.add(line);
            }
          }
          return new StopFilter(stream, stopWords);
        }
      } else {
        logger.warn("No stop words found for lang={}", lang);
      }
    } catch (Exception e) {
      logger.error("Error creating stop filter for lang={}", lang, e);
    }
  }
  return stream;
}
/**
 * Creates an analyzer instance based on user-provided stop words. If {@code addToDefault}
 * is set to true, the user-provided stop words are added to the Lucene default stop set.
 */
public LuceneAnalyzerUtil(StemFilterType stemFilterType, List<String> stopWords, boolean addToDefault) {
  LuceneAnalyzerUtil.stemFilterType = stemFilterType;
  if (addToDefault) {
    // stopSet is assumed to be pre-initialized with the Lucene default stop set
    stopSet.addAll(stopWords);
  } else {
    stopSet = StopFilter.makeStopSet(stopWords);
  }
}
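To make the addToDefault flag concrete, a hypothetical call-site fragment (the StemFilterType.PORTER_STEM constant and the word list are illustrative assumptions, not part of the original source; assumes java.util.Arrays and java.util.List are imported):

// Hypothetical usage of the constructor above.
List<String> domainStopWords = Arrays.asList("lorem", "ipsum");

// addToDefault = true: Lucene's default stop set is kept and extended
LuceneAnalyzerUtil extended = new LuceneAnalyzerUtil(StemFilterType.PORTER_STEM, domainStopWords, true);

// addToDefault = false: the custom words replace the default set entirely
LuceneAnalyzerUtil replaced = new LuceneAnalyzerUtil(StemFilterType.PORTER_STEM, domainStopWords, false);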
@Override
public List<String> parseQuery(String queryStr) {
  // tokenize queryStr, remove stop words, apply stemming
  List<String> tokens = new ArrayList<String>();
  AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
  Tokenizer tokenizer = new StandardTokenizer(factory);
  tokenizer.setReader(new StringReader(queryStr));
  CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
  TokenStream tokenStream = new StopFilter(tokenizer, stopWords);
  // the filter shares the tokenizer's AttributeSource, so reading the term
  // attribute from the tokenizer is safe here
  CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
  try {
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      String term = charTermAttribute.toString();
      tokens.add(term);
    }
    tokenStream.end();
    tokenStream.close();
    tokenizer.close();
  } catch (IOException e) {
    e.printStackTrace();
  }
  return tokens;
}
public static void main(String[] args) throws IOException {
  System.out.println(NumberUtils.isDigits("12345"));
  System.out.println(NumberUtils.isDigits("12345.1"));
  System.out.println(NumberUtils.isDigits("12345,2"));
  System.out.println(NumberUtils.isNumber("12345"));
  System.out.println(NumberUtils.isNumber("12345.1"));
  System.out.println(NumberUtils.isNumber("12345,2".replace(",", ".")));
  System.out.println(NumberUtils.isNumber("12345,2"));

  StringReader input = new StringReader(
      "Правя тест на класификатор и после др.Дулитъл, пада.br2n ще се оправя с данните! които,са много зле. Но това е по-добре. Но24"
          .replaceAll("br2n", ""));

  LetterTokenizer tokenizer = new LetterTokenizer();
  tokenizer.setReader(input);

  TokenFilter stopFilter = new StopFilter(tokenizer, BULGARIAN_STOP_WORDS_SET);
  TokenFilter length = new LengthFilter(stopFilter, 3, 1000);
  TokenFilter stemmer = new BulgarianStemFilter(length);
  TokenFilter ngrams = new ShingleFilter(stemmer, 2, 2);

  try (TokenFilter filter = ngrams) {
    CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
    filter.reset();
    while (filter.incrementToken()) {
      String word = termAtt.toString().replaceAll(",", "\\.").replaceAll("\n|\r", "");
      System.out.println(word);
    }
  }
}
public void testEndNotStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  TokenStream stream = new MockTokenizer(new StringReader("go to"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go", "to"},
                            new int[] {0, 3},
                            new int[] {2, 5},
                            null,
                            new int[] {1, 1},
                            null,
                            5,
                            new boolean[] {false, true},
                            true);
}
/**
 * Returns a StopFilter based on our managed stop word set.
 */
@Override
public TokenStream create(TokenStream input) {
  if (stopWords == null) {
    throw new IllegalStateException("Managed stopwords not initialized correctly!");
  }
  return new StopFilter(input, stopWords);
}
@Test
public void returns_tokens_when_underlying_stream_skips_over_tokens() throws IOException {
  try (Tokenizer tok = new WhitespaceTokenizer();
       TokenFilter stop = new StopFilter(tok, new CharArraySet(ImmutableList.of(ONE), false));
       TokenFilter f = new AnnotatorTokenFilter(stop, annotator)) {
    stubAnnotator(TWO);
    tok.setReader(new StringReader(ONE_TWO));
    assertTokenInfos(f, new TokenInfo(TWO, 1));
  }
}