Example source code for the Java class org.apache.lucene.analysis.core.WhitespaceTokenizer

Project: lucenelab    File: SynonymFilterExample.java
@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("dark sea green sea green"));

    final SynonymMap.Builder builder = new SynonymMap.Builder(true);
    addSynonym("dark sea green", "color", builder);
    addSynonym("green", "color", builder);
    addSynonym("dark sea", "color", builder);
    addSynonym("sea green", "color", builder);
    final SynonymMap synMap = builder.build();
    final TokenStream ts = new SynonymFilter(tok, synMap, true);

    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = ts.addAttribute(PositionLengthAttribute.class);

    ts.reset();
    int pos = -1;
    while (ts.incrementToken()) {
        pos += posIncrAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + pos + ", posLen=" + posLengthAtt.getPositionLength());
    }
    ts.end();
    ts.close();
}
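
The addSynonym helper called above is not included in this snippet. A minimal sketch of what it could look like, assuming it only joins multi-word entries and registers them on the SynonymMap.Builder (CharsRef and CharsRefBuilder are org.apache.lucene.util classes; the project's actual helper may differ):

private static void addSynonym(String input, String output, SynonymMap.Builder builder) {
    // multi-word entries are joined with the SynonymMap word separator before being added
    final CharsRef inputRef = SynonymMap.Builder.join(input.split("\\s+"), new CharsRefBuilder());
    final CharsRef outputRef = SynonymMap.Builder.join(output.split("\\s+"), new CharsRefBuilder());
    builder.add(inputRef, outputRef, true); // true: keep the original tokens as well
}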
Project: elasticsearch_my    File: StemmerTokenFilterFactoryTests.java
public void testEnglishFilterFactory() throws IOException {
    int iters = scaledRandomIntBetween(20, 100);
    for (int i = 0; i < iters; i++) {
        Version v = VersionUtils.randomVersion(random());
        Settings settings = Settings.builder()
                .put("index.analysis.filter.my_english.type", "stemmer")
                .put("index.analysis.filter.my_english.language", "english")
                .put("index.analysis.analyzer.my_english.tokenizer","whitespace")
                .put("index.analysis.analyzer.my_english.filter","my_english")
                .put(SETTING_VERSION_CREATED,v)
                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                .build();

        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_english");
        assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        TokenStream create = tokenFilter.create(tokenizer);
        IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
        NamedAnalyzer analyzer = indexAnalyzers.get("my_english");
        assertThat(create, instanceOf(PorterStemFilter.class));
        assertAnalyzesTo(analyzer, "consolingly", new String[]{"consolingli"});
    }

}
Project: elasticsearch_my    File: MinHashFilterFactoryTests.java
public void testDefault() throws IOException {
    int default_hash_count = 1;
    int default_bucket_size = 512;
    int default_hash_set_size = 1;
    Settings settings = Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash");
    String source = "the quick brown fox";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));

    // with_rotation is true by default, and hash_set_size is 1, so even though the source doesn't
    // have enough tokens to fill all the buckets, we still expect 512 tokens.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer),
        default_hash_count * default_bucket_size * default_hash_set_size);
}
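
The assertStreamHasNumberOfTokens helper used by these MinHash tests is not shown in this listing. A minimal sketch, assuming it simply exhausts the stream and asserts on the token count:

private static void assertStreamHasNumberOfTokens(TokenStream ts, int expected) throws IOException {
    ts.reset();
    int count = 0;
    while (ts.incrementToken()) {
        count++;
    }
    ts.end();
    ts.close();
    assertEquals(expected, count);
}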
Project: elasticsearch_my    File: MinHashFilterFactoryTests.java
public void testSettings() throws IOException {
    Settings settings = Settings.builder()
        .put("index.analysis.filter.test_min_hash.type", "min_hash")
        .put("index.analysis.filter.test_min_hash.hash_count", "1")
        .put("index.analysis.filter.test_min_hash.bucket_count", "2")
        .put("index.analysis.filter.test_min_hash.hash_set_size", "1")
        .put("index.analysis.filter.test_min_hash.with_rotation", false)
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("test_min_hash");
    String source = "sushi";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));

    // despite the fact that bucket_count is 2 and hash_set_size is 1,
    // because with_rotation is false, we only expect 1 token here.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer), 1);
}
Project: elasticsearch_my    File: StopTokenFilterTests.java
public void testCorrectPositionIncrementSetting() throws IOException {
    Builder builder = Settings.builder().put("index.analysis.filter.my_stop.type", "stop");
    if (random().nextBoolean()) {
        builder.put("index.analysis.filter.my_stop.version", Version.LATEST);
    } else {
        // don't specify
    }
    builder.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString());
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(builder.build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_stop");
    assertThat(tokenFilter, instanceOf(StopTokenFilterFactory.class));
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("foo bar"));
    TokenStream create = tokenFilter.create(tokenizer);
    assertThat(create, instanceOf(StopFilter.class));
}
Project: elasticsearch_my    File: WordDelimiterGraphTokenFilterFactoryTests.java
public void testMultiTerms() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
        .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
        .build());

    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42",
        "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se",
        "ONeil", "O'Neil's", "O", "Neil" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1};
    int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1};
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
            expectedIncr, expectedPosLen, null);
}
Project: elasticsearch_my    File: WordDelimiterGraphTokenFilterFactoryTests.java
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
public void testPartsAndCatenate() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
        .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
        .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot";
    int[] expectedIncr = new int[]{1, 0, 1};
    int[] expectedPosLen = new int[]{2, 1, 1};
    String[] expected = new String[]{"PowerShot", "Power", "Shot" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
        expectedIncr, expectedPosLen, null);
}
Project: lams    File: FSTSynonymFilterFactory.java
@Override
public void inform(ResourceLoader loader) throws IOException {
  final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);

  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader) : factory.create(reader);
      TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer) : tokenizer;
      return new TokenStreamComponents(tokenizer, stream);
    }
  };

  try {
    String formatClass = format;
    if (format == null || format.equals("solr")) {
      formatClass = SolrSynonymParser.class.getName();
    } else if (format.equals("wordnet")) {
      formatClass = WordnetSynonymParser.class.getName();
    }
    // TODO: expose dedup as a parameter?
    map = loadSynonyms(loader, formatClass, true, analyzer);
  } catch (ParseException e) {
    throw new IOException("Error parsing synonyms file:", e);
  }
}
Project: BENGAL    File: DatasetAnalyzer.java
private int countTokensInText(String text) {
    // this WhitespaceTokenizer constructor takes the Reader directly (pre-Lucene 5);
    // newer Lucene uses the no-arg constructor plus tokenizer.setReader(new StringReader(text)),
    // as in the gerbil variant further below
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(new StringReader(text));
    int tokens = 0;
    try {
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            ++tokens;
        }
    } catch (Exception e) {
        LOGGER.error("Error while tokenizing text. Returning.", e);
    } finally {
        IOUtils.closeQuietly(tokenizer);
    }
    return tokens;
}
Project: analyzers-ja    File: ReloadableStopFilterTest.java
@Test
public void testBasic() throws Exception {
    final Path dictPath = Files.createTempFile("rsf_", ".txt");
    final long reloadInterval = 500;
    writeFile(dictPath, "aaa");

    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new ReloadableStopFilter(tokenizer, dictPath, true, reloadInterval));
        }
    };

    String input = "aaa bbb";
    assertAnalyzesTo(analyzer, input, new String[] { "bbb" });

    Thread.sleep(1000L);
    writeFile(dictPath, "bbb");
    Thread.sleep(1000L);

    assertAnalyzesTo(analyzer, input, new String[] { "aaa" });

}
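
The writeFile helper used here (and by the keyword-marker test below) is not part of this snippet. A minimal sketch, assuming it simply overwrites the temporary dictionary file so the filter reloads it once reloadInterval has elapsed:

private static void writeFile(final Path path, final String content) throws IOException {
    // truncate and rewrite the dictionary file in place
    Files.write(path, content.getBytes(StandardCharsets.UTF_8),
            StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
}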
Project: analyzers-ja    File: ReloadableKeywordMarkerFilterTest.java
@Test
public void testBasic() throws Exception {
    final Path dictPath = Files.createTempFile("rkmf_", ".txt");
    final long reloadInterval = 500;
    writeFile(dictPath, "aaa");

    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new ReloadableKeywordMarkerFilter(tokenizer, dictPath, reloadInterval));
        }
    };

    String input = "aaa bbb";
    assertTokenStreamContents(analyzer.tokenStream("dummy", input), new String[] { "aaa", "bbb" }, new int[] { 0, 4 },
            new int[] { 3, 7 }, null, null, null, input.length(), new boolean[] { true, false }, true);

    Thread.sleep(1000L);
    writeFile(dictPath, "bbb");
    Thread.sleep(1000L);

    assertTokenStreamContents(analyzer.tokenStream("dummy", input), new String[] { "aaa", "bbb" }, new int[] { 0, 4 },
            new int[] { 3, 7 }, null, null, null, input.length(), new boolean[] { false, true }, true);

}
Project: analyzers-ja    File: PatternConcatenationFilterTest.java
@Test
public void testBasic() throws IOException {
    final Pattern pattern1 = Pattern.compile("平成|昭和");
    final Pattern pattern2 = Pattern.compile("[0-9]+年");
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new PatternConcatenationFilter(tokenizer, pattern1, pattern2));
        }
    };

    assertAnalyzesTo(analyzer, "平成 10年", //
            new String[] { "平成10年" }, //
            new int[] { 0 },//
            new int[] { 6 },//
            new int[] { 1 });
    assertAnalyzesTo(analyzer, "aaa 昭和 56年 bbb", //
            new String[] { "aaa", "昭和56年", "bbb" }, //
            new int[] { 1, 1, 1 });
    assertAnalyzesTo(analyzer, "大正 5年", //
            new String[] { "大正", "5年" }, //
            new int[] { 1, 1 });

}
Project: analyzers-ja    File: StopTokenPrefixFilterTest.java
@Test
public void testBasic() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new StopTokenPrefixFilter(tokenizer, new String[] { "b", "dd" }, false));
        }
    };

    assertAnalyzesTo(analyzer, "aaa bbb ccc ddd eee", //
            new String[] { "aaa", "ccc", "eee" }, //
            new int[] { 0, 8, 16 }, //
            new int[] { 3, 11, 19 }, //
            new int[] { 1, 2, 2 });
    assertAnalyzesTo(analyzer, "aaa", new String[] { "aaa" });
    assertAnalyzesTo(analyzer, "ddd", new String[0]);
    assertAnalyzesTo(analyzer, "add", new String[] { "add" });
    assertAnalyzesTo(analyzer, "aad", new String[] { "aad" });
    assertAnalyzesTo(analyzer, "dda", new String[0]);
    assertAnalyzesTo(analyzer, "daa", new String[] { "daa" });

}
Project: analyzers-ja    File: StopTokenSuffixFilterTest.java
@Test
public void testBasic() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new StopTokenSuffixFilter(tokenizer, new String[] { "b", "dd" }, false));
        }
    };

    assertAnalyzesTo(analyzer, "aaa bbb ccc ddd eee", //
            new String[] { "aaa", "ccc", "eee" }, //
            new int[] { 0, 8, 16 }, //
            new int[] { 3, 11, 19 }, //
            new int[] { 1, 2, 2 });
    assertAnalyzesTo(analyzer, "aaa", new String[] { "aaa" });
    assertAnalyzesTo(analyzer, "ddd", new String[0]);
    assertAnalyzesTo(analyzer, "add", new String[0]);
    assertAnalyzesTo(analyzer, "aad", new String[] { "aad" });
    assertAnalyzesTo(analyzer, "dda", new String[] { "dda" });
    assertAnalyzesTo(analyzer, "daa", new String[] { "daa" });

}
Project: search    File: FSTSynonymFilterFactory.java
@Override
public void inform(ResourceLoader loader) throws IOException {
  final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);

  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader) : factory.create(reader);
      TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer) : tokenizer;
      return new TokenStreamComponents(tokenizer, stream);
    }
  };

  try {
    String formatClass = format;
    if (format == null || format.equals("solr")) {
      formatClass = SolrSynonymParser.class.getName();
    } else if (format.equals("wordnet")) {
      formatClass = WordnetSynonymParser.class.getName();
    }
    // TODO: expose dedup as a parameter?
    map = loadSynonyms(loader, formatClass, true, analyzer);
  } catch (ParseException e) {
    throw new IOException("Error parsing synonyms file:", e);
  }
}
Project: search    File: TestCompoundWordTokenFilter.java
public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
  CharArraySet dict = makeDictionary("ab", "cd", "ef");

  Tokenizer tokenizer = new MockTokenizer(new StringReader("abcdef"), MockTokenizer.WHITESPACE, false);
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
    new WhitespaceTokenizer(
      new StringReader(
        "abcdef")
      ),
    dict,
    CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

  assertTokenStreamContents(tf,
    new String[] { "abcdef", "ab", "cd", "ef" },
    new int[] { 0, 0, 0, 0},
    new int[] { 6, 6, 6, 6},
    new int[] { 1, 0, 0, 0}
    );
}
Project: search    File: TestCompoundWordTokenFilter.java
public void testWordComponentWithLessThanMinimumLength() throws Exception {
  CharArraySet dict = makeDictionary("abc", "d", "efg");

  Tokenizer tokenizer = new MockTokenizer(new StringReader("abcdefg"), MockTokenizer.WHITESPACE, false);
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
    new WhitespaceTokenizer(
      new StringReader(
        "abcdefg")
      ),
    dict,
    CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

// since "d" is shorter than the minimum subword size, it should not be added to the token stream
  assertTokenStreamContents(tf,
    new String[] { "abcdefg", "abc", "efg" },
    new int[] { 0, 0, 0},
    new int[] { 7, 7, 7},
    new int[] { 1, 0, 0}
    );
}
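
The makeDictionary helper comes from the surrounding TestCompoundWordTokenFilter class and is not shown here. A plausible sketch, assuming it just wraps the given words in a case-insensitive CharArraySet:

private static CharArraySet makeDictionary(String... dictionary) {
    // ignoreCase = true so compound parts match regardless of case
    return new CharArraySet(Arrays.asList(dictionary), true);
}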
Project: search    File: CommonGramsFilterTest.java
public void testReset() throws Exception {
  final String input = "How the s a brown s cow d like A B thing?";
  WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);

  CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
  cgf.reset();
  assertTrue(cgf.incrementToken());
  assertEquals("How", term.toString());
  assertTrue(cgf.incrementToken());
  assertEquals("How_the", term.toString());
  assertTrue(cgf.incrementToken());
  assertEquals("the", term.toString());
  assertTrue(cgf.incrementToken());
  assertEquals("the_s", term.toString());
  cgf.close();

  wt.setReader(new StringReader(input));
  cgf.reset();
  assertTrue(cgf.incrementToken());
  assertEquals("How", term.toString());
}
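
The commonWords set referenced by this test and the query-reset variant below is defined elsewhere in CommonGramsFilterTest. A hedged sketch of a definition consistent with the bigrams the test expects ("How_the", "the_s"); the project's actual set may contain different entries:

private static final CharArraySet commonWords =
        new CharArraySet(Arrays.asList("s", "a", "b", "c", "d", "the", "of"), false);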
Project: search    File: CommonGramsFilterTest.java
public void testQueryReset() throws Exception {
  final String input = "How the s a brown s cow d like A B thing?";
  WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);

  CharTermAttribute term = wt.addAttribute(CharTermAttribute.class);
  nsf.reset();
  assertTrue(nsf.incrementToken());
  assertEquals("How_the", term.toString());
  assertTrue(nsf.incrementToken());
  assertEquals("the_s", term.toString());
  nsf.close();

  wt.setReader(new StringReader(input));
  nsf.reset();
  assertTrue(nsf.incrementToken());
  assertEquals("How_the", term.toString());
}
Project: search    File: ShingleFilterTest.java
public void testReset() throws Exception {
  Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
  TokenStream filter = new ShingleFilter(wsTokenizer, 2);
  assertTokenStreamContents(filter,
    new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
    new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
    new String[]{TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE},
    new int[]{1,0,1,0,1,0,1}
  );
  wsTokenizer.setReader(new StringReader("please divide this sentence"));
  assertTokenStreamContents(filter,
    new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
    new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
    new String[]{TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE},
    new int[]{1,0,1,0,1,0,1}
  );
}
Project: gerbil    File: DatasetAnalyzer.java
private int countTokensInText(String text) {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    int tokens = 0;
    try {
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            ++tokens;
        }
    } catch (Exception e) {
        LOGGER.error("Error while tokenizing text. Returning.", e);
    } finally {
        IOUtils.closeQuietly(tokenizer);
    }
    return tokens;
}
Project: lucenelab    File: AnnotatorTeeSinkFilterExample.java
@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text) throws IOException {
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    final TokenStream colorsStream = new AnnotatorTokenFilter(
            textStream.newSinkTokenStream(), ColorAnnotator.withDefaultColors());
    final TokenStream animalsStream = new AnnotatorTokenFilter(
            textStream.newSinkTokenStream(), AnimalAnnotator.withDefaultAnimals());

    final Document doc = new Document();
    doc.add(new StoredField(TEXT_FIELD, text));
    doc.add(new TextField(TEXT_FIELD, textStream));
    doc.add(new TextField(COLOR_FIELD, colorsStream));
    doc.add(new TextField(ANIMAL_FIELD, animalsStream));
    writer.addDocument(doc);
}
Project: lucenelab    File: AnnotatorAnyExample.java
@SuppressWarnings("resource")
private static void addDocument(IndexWriter writer, String text) throws IOException {
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    final TokenStream colorsStream = new AnyAnnotationTokenFilter(new AnnotatorTokenFilter(
            textStream.newSinkTokenStream(), ColorAnnotator.withDefaultColors()));
    final TokenStream animalsStream = new AnyAnnotationTokenFilter(new AnnotatorTokenFilter(
            textStream.newSinkTokenStream(), AnimalAnnotator.withDefaultAnimals()));

    final Document doc = new Document();
    doc.add(new StoredField(TEXT_FIELD, text));
    doc.add(new TextField(TEXT_FIELD, textStream));
    doc.add(new TextField(COLOR_FIELD, colorsStream));
    doc.add(new TextField(ANIMAL_FIELD, animalsStream));
    writer.addDocument(doc);
}
Project: auto-phrase-tokenfilter    File: TestAutoPhrasingTokenFilter.java
public void testOverlappingAtBeginning( ) throws Exception {
  final CharArraySet phraseSets = new CharArraySet( Arrays.asList(
          "new york", "new york city", "city of new york" ), false);

  final String input = "new york city is great";

  StringReader reader = new StringReader(input);
  final WhitespaceTokenizer in = new WhitespaceTokenizer( reader );

  AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter( in, phraseSets, false );
  aptf.setReplaceWhitespaceWith( new Character( '_' ) );
  CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
  aptf.reset();

  assertTrue(aptf.incrementToken());
  assertEquals( "new_york_city", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals( "is", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals( "great", term.toString());
}
Project: auto-phrase-tokenfilter    File: TestAutoPhrasingTokenFilter.java
public void testOverlappingAtEnd( ) throws Exception {
  final CharArraySet phraseSets = new CharArraySet( Arrays.asList(
      "new york", "new york city", "city of new york" ), false);

  final String input = "the great city of new york";

  StringReader reader = new StringReader(input);
  final WhitespaceTokenizer in = new WhitespaceTokenizer( reader );

  AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter( in, phraseSets, false );
  aptf.setReplaceWhitespaceWith( new Character( '_' ) );
  CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
  aptf.reset();

  assertTrue(aptf.incrementToken());
  assertEquals( "the", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals( "great", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals( "city_of_new_york", term.toString());
}
Project: auto-phrase-tokenfilter    File: TestAutoPhrasingTokenFilter.java
public void testIncompletePhrase( ) throws Exception {
  final CharArraySet phraseSets = new CharArraySet( Arrays.asList(
      "big apple", "new york city", "property tax", "three word phrase"), false);

  final String input = "some new york";

  StringReader reader = new StringReader(input);
  final WhitespaceTokenizer in = new WhitespaceTokenizer( reader );

  AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter( in, phraseSets, false );
  aptf.setReplaceWhitespaceWith( new Character( '_' ) );
  CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
  aptf.reset();

  assertTrue(aptf.incrementToken());
  assertEquals( "some", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals( "new", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals( "york", term.toString());
}
Project: NYBC    File: TestCompoundWordTokenFilter.java
public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
  CharArraySet dict = makeDictionary("ab", "cd", "ef");

  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
    new WhitespaceTokenizer(TEST_VERSION_CURRENT,
      new StringReader(
        "abcdef")
      ),
    dict,
    CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

  assertTokenStreamContents(tf,
    new String[] { "abcdef", "ab", "cd", "ef" },
    new int[] { 0, 0, 2, 4},
    new int[] { 6, 2, 4, 6},
    new int[] { 1, 0, 0, 0}
    );
}
Project: NYBC    File: TestCompoundWordTokenFilter.java
public void testWordComponentWithLessThanMinimumLength() throws Exception {
  CharArraySet dict = makeDictionary("abc", "d", "efg");

  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
    new WhitespaceTokenizer(TEST_VERSION_CURRENT,
      new StringReader(
        "abcdefg")
      ),
    dict,
    CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
    CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

// since "d" is shorter than the minimum subword size, it should not be added to the token stream
  assertTokenStreamContents(tf,
    new String[] { "abcdefg", "abc", "efg" },
    new int[] { 0, 0, 4},
    new int[] { 7, 3, 7},
    new int[] { 1, 0, 0}
    );
}
Project: NYBC    File: TestCompoundWordTokenFilter.java
public void testReset() throws Exception {
  CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz",
      "Aufgabe", "Überwachung");

  Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
      "Rindfleischüberwachungsgesetz"));
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT, 
      wsTokenizer, dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

  CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
  assertTrue(tf.incrementToken());
  assertEquals("Rind", termAtt.toString());
  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
}
Project: NYBC    File: TestCompoundWordTokenFilter.java
public void testRetainMockAttribute() throws Exception {
  CharArraySet dict = makeDictionary("abc", "d", "efg");
  Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
      new StringReader("abcdefg"));
  TokenStream stream = new MockRetainAttributeFilter(tokenizer);
  stream = new DictionaryCompoundWordTokenFilter(
      TEST_VERSION_CURRENT, stream, dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
  MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    assertTrue("Custom attribute value was lost", retAtt.getRetain());
  }

}
Project: NYBC    File: CommonGramsFilterTest.java
public void testReset() throws Exception {
  final String input = "How the s a brown s cow d like A B thing?";
  WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);

  CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
  cgf.reset();
  assertTrue(cgf.incrementToken());
  assertEquals("How", term.toString());
  assertTrue(cgf.incrementToken());
  assertEquals("How_the", term.toString());
  assertTrue(cgf.incrementToken());
  assertEquals("the", term.toString());
  assertTrue(cgf.incrementToken());
  assertEquals("the_s", term.toString());

  wt.setReader(new StringReader(input));
  cgf.reset();
  assertTrue(cgf.incrementToken());
  assertEquals("How", term.toString());
}
Project: NYBC    File: CommonGramsFilterTest.java
public void testQueryReset() throws Exception {
  final String input = "How the s a brown s cow d like A B thing?";
  WhitespaceTokenizer wt = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(TEST_VERSION_CURRENT, wt, commonWords);
  CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);

  CharTermAttribute term = wt.addAttribute(CharTermAttribute.class);
  nsf.reset();
  assertTrue(nsf.incrementToken());
  assertEquals("How_the", term.toString());
  assertTrue(nsf.incrementToken());
  assertEquals("the_s", term.toString());

  wt.setReader(new StringReader(input));
  nsf.reset();
  assertTrue(nsf.incrementToken());
  assertEquals("How_the", term.toString());
}
Project: NYBC    File: ShingleFilterTest.java
public void testReset() throws Exception {
  Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("please divide this sentence"));
  TokenStream filter = new ShingleFilter(wsTokenizer, 2);
  assertTokenStreamContents(filter,
    new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
    new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
    new String[]{TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE},
    new int[]{1,0,1,0,1,0,1}
  );
  wsTokenizer.setReader(new StringReader("please divide this sentence"));
  assertTokenStreamContents(filter,
    new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
    new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
    new String[]{TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE},
    new int[]{1,0,1,0,1,0,1}
  );
}
Project: taxonomy    File: IAViewTextCasPuncAnalyser.java
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    Tokenizer source = new WhitespaceTokenizer();

    TokenStream result = null;

    if (stopFilterFactory != null) {
        result = this.stopFilterFactory.create(source);
    } else {
        logger.warn(".createComponents: stopFilter disabled");
    }

    if (AnalyzerType.QUERY.equals(analyzerType)) {
        if (synonymFilterFactory != null) {
            result = this.synonymFilterFactory.create(result == null ? source : result);
        } else {
            logger.warn(".createComponents: synonymFilter disabled");
        }
    }
    return new TokenStreamComponents(source, result == null ? source : result);
}
Project: elasticsearch-plugin-bundle    File: AutoPhrasingTokenFilterTest.java
@Test
public void testOverlappingAtBeginning() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "new york", "new york city", "city of new york"), false);

    final String input = "new york city is great";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("new_york_city", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("is", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("great", term.toString());
}
Project: elasticsearch-plugin-bundle    File: AutoPhrasingTokenFilterTest.java
@Test
public void testOverlappingAtEnd() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "new york", "new york city", "city of new york"), false);

    final String input = "the great city of new york";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("the", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("great", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("city_of_new_york", term.toString());
}
Project: elasticsearch-plugin-bundle    File: AutoPhrasingTokenFilterTest.java
@Test
public void testIncompletePhrase() throws Exception {
    final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
            "big apple", "new york city", "property tax", "three word phrase"), false);

    final String input = "some new york";

    StringReader reader = new StringReader(input);
    final WhitespaceTokenizer in = new WhitespaceTokenizer();
    in.setReader(reader);

    AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
    aptf.setReplaceWhitespaceWith('_');
    CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
    aptf.reset();

    assertTrue(aptf.incrementToken());
    assertEquals("some", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("new", term.toString());
    assertTrue(aptf.incrementToken());
    assertEquals("york", term.toString());
}
Project: read-open-source-code    File: FSTSynonymFilterFactory.java
@Override
public void inform(ResourceLoader loader) throws IOException {
  final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);

  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader) : factory.create(reader);
      TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer) : tokenizer;
      return new TokenStreamComponents(tokenizer, stream);
    }
  };

  try {
    String formatClass = format;
    if (format == null || format.equals("solr")) {
      formatClass = SolrSynonymParser.class.getName();
    } else if (format.equals("wordnet")) {
      formatClass = WordnetSynonymParser.class.getName();
    }
    // TODO: expose dedup as a parameter?
    map = loadSynonyms(loader, formatClass, true, analyzer);
  } catch (ParseException e) {
    throw new IOException("Error parsing synonyms file:", e);
  }
}