public void testCorrectPositionIncrementSetting() throws IOException {
    Builder builder = Settings.builder().put("index.analysis.filter.my_stop.type", "stop");
    if (random().nextBoolean()) {
        builder.put("index.analysis.filter.my_stop.version", Version.LATEST);
    } else {
        // don't specify
    }
    builder.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString());
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(builder.build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_stop");
    assertThat(tokenFilter, instanceOf(StopTokenFilterFactory.class));
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("foo bar"));
    TokenStream create = tokenFilter.create(tokenizer);
    assertThat(create, instanceOf(StopFilter.class));
}
/**
 * Creates a new {@link TibetanAnalyzer}
 *
 * @param segmentInWords if the segmentation is on words instead of syllables
 * @param lemmatize      if the analyzer should remove affixed particles, and normalize words in words mode
 * @param filterChars    if the text should be converted to NFD (necessary for texts containing NFC strings)
 * @param inputMethod    if the text should be converted from EWTS to Unicode
 * @param stopFilename   a file name with a stop word list
 * @throws IOException if the file containing stopwords can't be opened
 */
public TibetanAnalyzer(boolean segmentInWords, boolean lemmatize, boolean filterChars, String inputMethod, String stopFilename) throws IOException {
    this.segmentInWords = segmentInWords;
    this.lemmatize = lemmatize;
    this.filterChars = filterChars;
    this.inputMethod = inputMethod;
    if (stopFilename != null) {
        if (stopFilename.isEmpty()) {
            InputStream stream = TibetanAnalyzer.class.getResourceAsStream("/bo-stopwords.txt");
            if (stream == null) {
                // we're not using the jar, there is no resource, assuming we're running the code
                this.tibStopSet = null;
            } else {
                this.tibStopSet = StopFilter.makeStopSet(getWordList(stream, "#"));
            }
        } else {
            this.tibStopSet = StopFilter.makeStopSet(getWordList(new FileInputStream(stopFilename), "#"));
        }
    } else {
        this.tibStopSet = null;
    }
}
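// Hedged usage sketch for the constructor above (not part of the original sources):
// the "ewts" input-method string and the flag combination are assumptions; passing an
// empty stopFilename selects the bundled /bo-stopwords.txt resource, as the constructor shows.
TibetanAnalyzer analyzer = new TibetanAnalyzer(
        true,   // segmentInWords: segment on words rather than syllables
        true,   // lemmatize: strip affixed particles / normalize words
        true,   // filterChars: convert input to NFD
        "ewts", // inputMethod: assumed value requesting EWTS-to-Unicode conversion
        "");    // empty string -> use the bundled stop word list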
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
    return streams.filteredTokenStream;
}
/**
 * Read a Lucene index and make a spelling dictionary from it. A minimal token
 * analyzer will be used, which is usually just what is needed for the
 * dictionary. The default set of English stop words will be used (see
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS}).
 *
 * @param indexDir directory containing the Lucene index
 * @param dictDir  directory to receive the spelling dictionary
 * @param prog     tracker called periodically to display progress
 */
public static void createDict(Directory indexDir, File dictDir, ProgressTracker prog) throws IOException {
    // Open and clear the dictionary (since we're going to totally rebuild it)
    SpellWriter spellWriter = SpellWriter.open(dictDir);
    spellWriter.clearDictionary();
    spellWriter.setStopwords(StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));

    // Now re-tokenize all the fields and queue the words for the dictionary.
    IndexReader indexReader = IndexReader.open(indexDir);
    createDict(indexReader, new MinimalAnalyzer(), spellWriter, prog);

    // All done.
    spellWriter.close();
}
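// Hedged calling sketch for createDict above (assumes Lucene's FSDirectory; the paths and
// the NoOpProgressTracker name are illustrative only, not taken from the original code base):
Directory indexDir = FSDirectory.open(new File("/path/to/lucene-index")); // assumed index location
File dictDir = new File("/path/to/spell-dict");                           // assumed dictionary output dir
createDict(indexDir, dictDir, new NoOpProgressTracker()); // hypothetical tracker that ignores progress callbacks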
/**
 * @param fieldName the field name
 * @param reader    the reader providing the text to analyze
 * @return a reused token stream for the field
 * @throws IOException if the tokenizer cannot be reset
 */
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        streams.source = new WhitespaceTokenizer(reader);
        streams.result = new StopFilter(true, streams.source, stopwords, true);
        // stem the stop-filtered stream (wrap streams.result, not streams.source,
        // so the stop filter is not bypassed)
        streams.result = new PorterStemFilter(streams.result);
        setPreviousTokenStream(streams);
    } else {
        streams.source.reset(reader);
    }
    return streams.result;
}
/**
 * @param fieldName the field name
 * @param reader    the reader providing the text to analyze
 * @return a reused token stream for the field
 * @throws IOException if the tokenizer cannot be reset
 */
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        streams.source = new LowerCaseTokenizer(reader);
        streams.result = new StopFilter(true, streams.source, stopWords, true);
        // streams.result = new PorterStemFilter(streams.source);
        setPreviousTokenStream(streams);
    } else {
        streams.source.reset(reader);
    }
    return streams.result;
}
/**
 * @param fieldName the field name
 * @param reader    the reader providing the text to analyze
 * @return a reused token stream for the field
 * @throws IOException if the tokenizer cannot be reset
 */
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        streams.source = new WhitespaceTokenizer(reader);
        streams.result = new StopFilter(true, streams.source, stopWords, true);
        // streams.result = new PorterStemFilter(streams.source);
        setPreviousTokenStream(streams);
    } else {
        streams.source.reset(reader);
    }
    return streams.result;
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Analysis in;
    try {
        in = analysis.getConstructor(Reader.class).newInstance(reader);
    } catch (Exception e) {
        throw new RuntimeException("Ansj analysis can't be instantiated!", e);
    }
    final Tokenizer source = new AnsjTokenizer(reader, in);
    TokenStreamComponents result;
    if (stopwords.isEmpty()) {
        result = new TokenStreamComponents(source);
    } else {
        result = new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
    }
    return result;
}
@Override
public TokenStream create(TokenStream tokenStream) {
    if (removeTrailing) {
        return new StopFilter(tokenStream, stopWords);
    } else {
        // SuggestStopFilter keeps a trailing stop word that is not followed by a
        // separator, which suits suggest/completion use cases.
        return new SuggestStopFilter(tokenStream, stopWords);
    }
}
public void testFillerToken() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("shingle_filler");
    String source = "simon the sorcerer";
    String[] expected = new String[]{"simon FILLER", "simon FILLER sorcerer", "FILLER sorcerer"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    TokenStream stream = new StopFilter(tokenizer, StopFilter.makeStopSet("the"));
    assertTokenStreamContents(tokenFilter.create(stream), expected);
}
@Test
public void stopwordFilterTest() throws IOException {
    System.out.println("Testing TibetanAnalyzer.tibStopWords");
    String input = "ཧ་ཏུ་གི་ཀྱི་གིས་ཀྱིས་ཡིས་ཀྱང་སྟེ་ཏེ་མམ་རམ་སམ་ཏམ་ནོ་བོ་ཏོ་གིན་ཀྱིན་གྱིན་ཅིང་ཅིག་ཅེས་ཞེས་ཧ།";
    Reader reader = new StringReader(input);
    List<String> expected = Arrays.asList("ཧ", "ཧ");
    System.out.print(input + " => ");
    TokenStream syllables = tokenize(reader, new TibSyllableTokenizer());
    CharArraySet stopSet = StopFilter.makeStopSet(
            TibetanAnalyzer.getWordList(new FileInputStream("src/main/resources/bo-stopwords.txt"), "#"));
    StopFilter res = new StopFilter(syllables, stopSet);
    assertTokenStream(res, expected);
}
@Override
public Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
    return Collections.singletonMap("benz_cjk",
            (indexSettings, environment, name, settings) -> new AbstractTokenFilterFactory(indexSettings, name, settings) {
                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new StopFilter(tokenStream, config.getStopWords());
                }
            });
}
@Override
public TokenStream create(final TokenStream tokenStream) {
    if (removeTrailing) {
        return new StopFilter(tokenStream, stopWords);
    } else {
        return new SuggestStopFilter(tokenStream, stopWords);
    }
}
@Override
public final TokenStream tokenStream(String fieldName, Reader reader) {
    // Apply stop words and Porter stemmer using a lower-case tokenizer.
    TokenStream stream = new StopFilter(new LowerCaseTokenizer(reader), StandardAnalyzer.STOP_WORDS);
    return new PorterStemFilter(stream);
}
/**
 * Takes a gloss-like string (text) and returns it tokenized,
 * with:
 * - stop words removed
 * - lower case
 * - Porter stemmer
 */
protected Set<String> tokenizeGloss(String s) throws IOException {
    Set<String> result = new HashSet<String>();
    // I am afraid that I am reimplementing the StandardAnalyzer...
    TokenStream ts = new PorterStemFilter(
            new StopFilter(true,
                    new LowerCaseTokenizer(new StringReader(s)),
                    stopWords, true));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        result.add(termAtt.term());
    }
    return result;
}
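// Hedged illustration of the method above, assuming "the" is present in stopWords:
// the input is lowercased, the stop word is dropped, and the remaining terms are
// Porter-stemmed, so the call below would yield the set {"fox", "jump"}.
Set<String> stems = tokenizeGloss("The Foxes Jumping");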
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new AnsjTokenizer(reader, new IndexAnalysis(reader));
    TokenStreamComponents result;
    if (stopwords.isEmpty()) {
        result = new TokenStreamComponents(source);
    } else {
        result = new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
    }
    return result;
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer source = new AnsjTokenizer(reader, new ToAnalysis(reader));
    TokenStreamComponents result;
    if (stopwords.isEmpty()) {
        result = new TokenStreamComponents(source);
    } else {
        result = new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
    }
    return result;
}
public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new SynonymFilter(
            new StopFilter(true,
                    new LowerCaseFilter(
                            new StandardFilter(
                                    new StandardTokenizer(Version.LUCENE_41, reader))),
                    StopAnalyzer.ENGLISH_STOP_WORDS_SET),
            engine);
    return result;
}
@Override
public TokenStream create(TokenStream tokenStream) {
    return new StopFilter(tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
/**
 * Builds an analyzer with the given stop words.
 */
public GermanAnalyzer(String[] stopwords) {
    stopSet = StopFilter.makeStopSet(stopwords);
}
/**
 * Builds an exclusion list from an array of Strings.
 */
public void setStemExclusionTable(String[] exclusionlist) {
    exclusionSet = StopFilter.makeStopSet(exclusionlist);
}
/** Builds an analyzer with the given stop words. */
public AlfrescoStandardAnalyser(String[] stopWords) {
    stopSet = StopFilter.makeStopSet(stopWords);
}
/**
 * Builds an analyzer which writes to the given spelling dictionary, using the
 * given stop words.
 */
public SpellWritingAnalyzer(String[] stopWords, SpellWriter spellWriter) {
    this(StopFilter.makeStopSet(stopWords), spellWriter);
}
/**
 * Builds an analyzer which removes words in {@link #STOP_WORDS}.
 */
public CJKAnalyzer() {
    stopTable = StopFilter.makeStopSet(STOP_WORDS);
}
public StopAnalyzer2(String[] stopWords) {
    this.stopWords = StopFilter.makeStopSet(stopWords);
}
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new StopFilter(true,
            new LowerCaseFilter(new LetterTokenizer(reader)),
            stopWords);
}
public StopAnalyzer1(String[] stopWords) {
    this.stopWords = StopFilter.makeStopSet(stopWords);
}
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new StopFilter(true, new LowerCaseTokenizer(reader), stopWords);
}
public StopAnalyzerFlawed(String[] stopWords) {
    this.stopWords = StopFilter.makeStopSet(stopWords);
}
/**
 * Ordering mistake here
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    // Stop filtering runs before lower-casing, so capitalized stop words
    // (e.g. "The") slip past the lowercase stop set.
    return new LowerCaseFilter(
            new StopFilter(true, new LetterTokenizer(reader), stopWords));
}
/**
 * Builds an analyzer with the default stop words
 * (<code>GERMAN_STOP_WORDS</code>).
 */
public GermanAnalyzer() {
    stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS);
}
/**
 * Builds an analyzer which removes words in the provided array.
 *
 * @param stopWords stop word array
 */
public CJKAnalyzer(String[] stopWords) {
    stopTable = StopFilter.makeStopSet(stopWords);
}