@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    return streams.filteredTokenStream;
}
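The reusableTokenStream/SavedStreams pattern above is the pre-Lucene-4 reuse API. A minimal sketch of the same chain on the modern Analyzer API is shown below, assuming Lucene 7+ (where StandardFilter became a no-op and was later removed, and LowerCaseFilter/StopFilter live in lucene-core); EnglishAnalyzer's default stop set stands in for the STOP_WORDS_SET constant of the original class.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class FoldingAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // Analyzer handles per-thread reuse itself, so no SavedStreams bookkeeping is needed.
        StandardTokenizer source = new StandardTokenizer();
        source.setMaxTokenLength(StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
        TokenStream sink = new LowerCaseFilter(source);
        sink = new StopFilter(sink, EnglishAnalyzer.getDefaultStopSet());
        sink = new ASCIIFoldingFilter(sink);
        return new TokenStreamComponents(source, sink);
    }
}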
public void testHanOnly() throws Exception {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(reader);
            return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
        }
    };
    assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
        new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" },
        new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
        new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
        new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>",
            "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
public void testUnigramsAndBigramsHanOnly() throws Exception {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(reader);
            return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
        }
    };
    assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
        new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" },
        new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
        new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
        new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>",
            "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>",
            "<HIRAGANA>", "<SINGLE>" },
        new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
        new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
}
/**
 * Test Position increments applied by TypeTokenFilter with and without enabling this option.
 */
public void testStopPositons() throws IOException {
    StringBuilder sb = new StringBuilder();
    for (int i = 10; i < 20; i++) {
        if (i % 3 != 0) {
            sb.append(i).append(" ");
        } else {
            String w = English.intToEnglish(i).trim();
            sb.append(w).append(" ");
        }
    }
    log(sb.toString());
    String stopTypes[] = new String[]{"<NUM>"};
    Set<String> stopSet = asSet(stopTypes);

    // with increments
    StringReader reader = new StringReader(sb.toString());
    TypeTokenFilter typeTokenFilter = new TypeTokenFilter(Version.LATEST, new StandardTokenizer(reader), stopSet);
    testPositons(typeTokenFilter);

    // without increments
    reader = new StringReader(sb.toString());
    typeTokenFilter = new TypeTokenFilter(Version.LUCENE_4_3, false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
    testPositons(typeTokenFilter);
}
/**
 * Stem a list of words with a configured stemmer.
 *
 * @param words
 *            The list of words to stem.
 * @param stemming
 *            The stemmer to be used.
 * @return The stemmed list of words.
 */
@SuppressWarnings("resource")
public static String[] stemWords(String[] words, Stemming stemming) {
    Set<String> stemmedStopWords = Sets.newHashSet();
    for (String word : words) {
        TokenStream tokenStream = new StandardTokenizer(LUCENE_VERSION, new StringReader(word));
        tokenStream = Stemming.wrapStemmingFilter(tokenStream, stemming);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                String term = charTermAttribute.toString();
                stemmedStopWords.add(term);
            }
        } catch (IOException e) {
            logger.error("Failed to stem a list of words", e);
        }
    }
    return stemmedStopWords.toArray(new String[] {});
}
public void testHanOnly() throws Exception {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
            return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
        }
    };
    assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
        new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" },
        new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
        new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
        new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>",
            "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
public void testUnigramsAndBigramsHanOnly() throws Exception {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
            return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
        }
    };
    assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
        new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" },
        new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
        new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
        new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>",
            "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>",
            "<HIRAGANA>", "<SINGLE>" },
        new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
        new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
}
/**
 * Test Position increments applied by TypeTokenFilter with and without enabling this option.
 */
public void testStopPositons() throws IOException {
    StringBuilder sb = new StringBuilder();
    for (int i = 10; i < 20; i++) {
        if (i % 3 != 0) {
            sb.append(i).append(" ");
        } else {
            String w = English.intToEnglish(i).trim();
            sb.append(w).append(" ");
        }
    }
    log(sb.toString());
    String stopTypes[] = new String[]{"<NUM>"};
    Set<String> stopSet = asSet(stopTypes);

    // with increments
    StringReader reader = new StringReader(sb.toString());
    TypeTokenFilter typeTokenFilter = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
    testPositons(typeTokenFilter);

    // without increments
    reader = new StringReader(sb.toString());
    typeTokenFilter = new TypeTokenFilter(false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
    testPositons(typeTokenFilter);
}
@Test
public void testTwo() throws IOException {
    AnalysisService analysisService = createAnalysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("baseform");
    String source = "Das sind Autos, die Nudeln transportieren.";
    String[] expected = {
        "Das", "Das",
        "sind", "sind",
        "Autos", "Auto",
        "die", "der",
        "Nudeln", "Nudel",
        "transportieren", "transportieren"
    };
    Tokenizer tokenizer = new StandardTokenizer(Version.LATEST, new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
/**
 * Test Position increments applied by TypeTokenFilter with and without enabling this option.
 */
public void testStopPositons() throws IOException {
    StringBuilder sb = new StringBuilder();
    for (int i = 10; i < 20; i++) {
        if (i % 3 != 0) {
            sb.append(i).append(" ");
        } else {
            String w = English.intToEnglish(i).trim();
            sb.append(w).append(" ");
        }
    }
    log(sb.toString());
    String stopTypes[] = new String[]{"<NUM>"};
    Set<String> stopSet = asSet(stopTypes);

    // with increments
    StringReader reader = new StringReader(sb.toString());
    TypeTokenFilter typeTokenFilter = new TypeTokenFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
    testPositons(typeTokenFilter);

    // without increments
    reader = new StringReader(sb.toString());
    typeTokenFilter = new TypeTokenFilter(Version.LUCENE_43, false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
    testPositons(typeTokenFilter);
}
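The testStopPositons variants above all delegate to a testPositons helper that is not shown here. A minimal sketch of what such a check could do is given below; the helper name, its explicit enablePositionIncrements parameter, and the use of the usual org.apache.lucene.analysis.tokenattributes attribute classes are assumptions, not the original helper.

// Hypothetical stand-in for the testPositons helper referenced above: each kept English word
// ("twelve", "fifteen", "eighteen") follows two removed <NUM> tokens, so its position increment
// should be 3 when increments are preserved and 1 when they are not.
private void checkStopPositions(TokenStream stream, boolean enablePositionIncrements) throws IOException {
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        int expected = enablePositionIncrements ? 3 : 1;
        assertEquals("unexpected position increment for token " + termAtt,
            expected, posIncrAtt.getPositionIncrement());
    }
    stream.end();
    stream.close();
}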
public void testKeepTypes() throws IOException {
    Settings settings = Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.keep_numbers.type", "keep_types")
        .putArray("index.analysis.filter.keep_numbers.types", new String[] {"<NUM>", "<SOMETHINGELSE>"})
        .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
    assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));
    String source = "Hello 123 world";
    String[] expected = new String[]{"123"};
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{2});
}
public void testDefault() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_bigram");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[]{"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた"};
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testNoFlags() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_no_flags");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[]{"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた"};
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testHanOnly() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_only");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[]{"多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た"};
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testHanUnigramOnly() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_unigram_only");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[]{"多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た"};
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
private String analyzeQuery(String query) throws IOException {
    StringBuilder result = new StringBuilder();
    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(new StandardTokenizer(LUCENE_VERSION, new StringReader(query)));
    TermAttribute termAttribute = filter.getAttribute(TermAttribute.class);
    while (filter.incrementToken()) {
        result.append(termAttribute.term()).append("* ");
    }
    return result.toString();
}
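The method above relies on the old TermAttribute API and on not having to call reset(), both of which are gone after Lucene 3.x. A minimal sketch of the same prefix-query expansion on the current API (an assumption, not the original project's code; CharTermAttribute replaces TermAttribute, and reset()/end() are now mandatory) might look like this:

private String analyzeQuery(String query) throws IOException {
    StringBuilder result = new StringBuilder();
    StandardTokenizer tokenizer = new StandardTokenizer(); // no Version/Reader constructor arguments anymore
    tokenizer.setReader(new StringReader(query));
    try (TokenStream filter = new ASCIIFoldingFilter(tokenizer)) {
        CharTermAttribute termAttribute = filter.addAttribute(CharTermAttribute.class);
        filter.reset(); // required before the first incrementToken() call
        while (filter.incrementToken()) {
            result.append(termAttribute.toString()).append("* ");
        }
        filter.end();
    }
    return result.toString();
}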
public IndraAnalyzer(String lang, ModelMetadata metadata) {
    if (lang == null || metadata == null) {
        throw new IllegalArgumentException("all parameters are mandatory.");
    }
    logger.debug("Creating analyzer, lang={}, preprocessing={}", lang, metadata);
    tokenizer = new StandardTokenizer();
    tokenStream = createStream(lang, metadata, tokenizer);
}
@Override
public boolean incrementToken() throws IOException {
    clearAttributes();
    int posIncr = 1;

    while (true) {
        int tokenType = scanner.getNextToken();

        if (tokenType == StandardTokenizerInterface.YYEOF) {
            return false;
        }

        if (scanner.yylength() <= maxTokenLength) {
            posIncrAtt.setPositionIncrement(posIncr);
            scanner.getText(termAtt);
            final int start = scanner.yychar();
            offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
            if (tokenType == StandardTokenizer.ACRONYM_DEP) {
                // remove the trailing '.' and report deprecated acronyms as HOST
                typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]);
                termAtt.setLength(termAtt.length() - 1);
            } else {
                typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
            }
            return true;
        } else {
            // the term exceeds maxTokenLength: skip it, but keep counting the skipped position
            posIncr++;
        }
    }
}
@Override
public List<String> parseQuery(String queryStr) {
    // tokenize queryStr and remove stop words (no stemming is applied here)
    List<String> tokens = new ArrayList<String>();
    AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
    Tokenizer tokenizer = new StandardTokenizer(factory);
    tokenizer.setReader(new StringReader(queryStr));
    CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
    TokenStream tokenStream = new StopFilter(tokenizer, stopWords);
    // the filter shares its attribute source with the tokenizer, so the term attribute can be read from either
    CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            tokens.add(term);
        }
        tokenStream.end();
        tokenStream.close();
        tokenizer.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return tokens;
}
@Override
public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
        typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]);
        return true;
    } else {
        return false;
    }
}
public void testAllScripts() throws Exception {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new StandardTokenizer(reader);
            return new TokenStreamComponents(t, new CJKBigramFilter(t, 0xff, false));
        }
    };
    assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
        new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" });
}
public void testElision() throws Exception {
    String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
    Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory(), new StringReader(test));
    CharArraySet articles = new CharArraySet(asSet("l", "M"), false);
    TokenFilter filter = new ElisionFilter(tokenizer, articles);
    List<String> tas = filter(filter);
    assertEquals("embrouille", tas.get(4));
    assertEquals("O'brian", tas.get(6));
    assertEquals("enfin", tas.get(7));
}
@SuppressWarnings("unused") public void _testStandardConstants() { int x = StandardTokenizer.ALPHANUM; x = StandardTokenizer.APOSTROPHE; x = StandardTokenizer.ACRONYM; x = StandardTokenizer.COMPANY; x = StandardTokenizer.EMAIL; x = StandardTokenizer.HOST; x = StandardTokenizer.NUM; x = StandardTokenizer.CJ; String[] y = StandardTokenizer.TOKEN_TYPES; }
@Override
public Object create(Random random) {
    // TypeTokenFilter
    Set<String> set = new HashSet<>();
    int num = random.nextInt(5);
    for (int i = 0; i < num; i++) {
        set.add(StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]);
    }
    return set;
}
@Override
public Object create(Random random) {
    // TODO: make nastier
    if (random.nextBoolean()) {
        // a token type
        return StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)];
    } else {
        return TestUtil.randomSimpleString(random);
    }
}
public void testHugeDoc() throws IOException {
    StringBuilder sb = new StringBuilder();
    char whitespace[] = new char[4094];
    Arrays.fill(whitespace, ' ');
    sb.append(whitespace);
    sb.append("testing 1234");
    String input = sb.toString();
    StandardTokenizer tokenizer = new StandardTokenizer(new StringReader(input));
    BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory(), reader);
    return new TokenStreamComponents(tokenizer);
}
public void testRandomHugeStringsGraphAfter() throws Exception {
    Random random = random();
    checkRandomData(random, new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory(), reader);
            TokenStream tokenStream = new MockGraphTokenFilter(random(), tokenizer);
            return new TokenStreamComponents(tokenizer, tokenStream);
        }
    }, 100 * RANDOM_MULTIPLIER, 8192);
}
/**
 * @param string the text to tokenize
 * @return arrayList of tokens of string converted to lowercase
 * @throws IOException
 */
public static ArrayList<String> tokenize(String string) throws IOException {
    ArrayList<String> retList = new ArrayList<String>();
    StringReader reader = new StringReader(string);
    StandardTokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(reader); // the no-arg StandardTokenizer needs its reader attached before use
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset(); // mandatory before the first incrementToken() call
    while (tokenizer.incrementToken()) {
        // lowercase here to match the documented behavior; StandardTokenizer itself does not lowercase
        retList.add(termAtt.toString().toLowerCase());
    }
    tokenizer.end();
    tokenizer.close();
    reader.close();
    return retList;
}
@Override
public StandardTokenizer create(Reader input) {
    StandardTokenizer tokenizer = new StandardTokenizer(luceneMatchVersion, input);
    tokenizer.setMaxTokenLength(maxTokenLength);
    return tokenizer;
}
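On Lucene 5+, TokenizerFactory.create no longer receives a Reader and the Version argument is gone. A minimal sketch of the equivalent factory method, assuming the surrounding class still keeps a maxTokenLength field as above:

@Override
public StandardTokenizer create(AttributeFactory factory) {
    // the reader is attached later via Tokenizer.setReader(...), not passed to the factory
    StandardTokenizer tokenizer = new StandardTokenizer(factory);
    tokenizer.setMaxTokenLength(maxTokenLength);
    return tokenizer;
}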