Example source code for the Java class org.apache.lucene.analysis.Tokenizer
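
Every example below drives a Tokenizer through the same TokenStream contract: set a Reader, register the attributes to read, then call reset(), loop on incrementToken(), and finish with end() and close(). As a reference point, here is a minimal self-contained sketch of that contract; it uses WhitespaceTokenizer and a made-up input string purely for illustration and is not taken from any of the listed projects.

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenizerContractSketch {
    public static void main(String[] args) throws Exception {
        // Illustrative sketch only: the reset -> incrementToken -> end -> close contract.
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("hello tokenizer world"));
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
        try {
            tokenizer.reset();                     // required before the first incrementToken()
            while (tokenizer.incrementToken()) {   // false once the stream is exhausted
                System.out.println(termAtt.toString());
            }
            tokenizer.end();                       // records end-of-stream state (final offsets)
        } finally {
            tokenizer.close();                     // releases the underlying Reader
        }
    }
}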

Project: improved-journey    File: TestAnsj.java
public static void main(String[] args) throws IOException {
    // ansj segmentation: ToAnalysis for standard analysis, IndexAnalysis for index-oriented analysis.
    List<Term> parse = ToAnalysis.parse("中华人民 共和国 成立了 ");
    System.out.println(parse);
    List<Term> parse1 = IndexAnalysis.parse("你吃过饭了没有!!!!!吃过无妨论文");
    //System.out.println(parse1);

    String text11 = "ZW321282050000000325";

    // Drive the AnsjTokenizer directly, following the TokenStream contract:
    // reset -> incrementToken -> end -> close.
    Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text11), 0, true);
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);

    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        System.out.print(termAtt.toString() + " ");
        //System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
        //System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
    }
    tokenizer.end();
    tokenizer.close();
}
Project: elasticsearch_my    File: UniqueTokenFilterTests.java
public void testSimple() throws IOException {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(t, new UniqueTokenFilter(t));
        }
    };

    TokenStream test = analyzer.tokenStream("test", "this test with test");
    test.reset();
    CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("this"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("test"));

    assertThat(test.incrementToken(), equalTo(true));
    assertThat(termAttribute.toString(), equalTo("with"));

    assertThat(test.incrementToken(), equalTo(false));
}
Project: elasticsearch_my    File: StemmerTokenFilterFactoryTests.java
public void testEnglishFilterFactory() throws IOException {
    int iters = scaledRandomIntBetween(20, 100);
    for (int i = 0; i < iters; i++) {
        Version v = VersionUtils.randomVersion(random());
        Settings settings = Settings.builder()
                .put("index.analysis.filter.my_english.type", "stemmer")
                .put("index.analysis.filter.my_english.language", "english")
                .put("index.analysis.analyzer.my_english.tokenizer","whitespace")
                .put("index.analysis.analyzer.my_english.filter","my_english")
                .put(SETTING_VERSION_CREATED,v)
                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                .build();

        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_english");
        assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        TokenStream create = tokenFilter.create(tokenizer);
        IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
        NamedAnalyzer analyzer = indexAnalyzers.get("my_english");
        assertThat(create, instanceOf(PorterStemFilter.class));
        assertAnalyzesTo(analyzer, "consolingly", new String[]{"consolingli"});
    }

}
Project: elasticsearch_my    File: NGramTokenizerFactoryTests.java
public void testPreTokenization() throws IOException {
    // Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
    Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));
    assertTokenStreamContents(tokenizer,
            new String[] {"Åb", "Åbc", "bc", "dé", "déf", "éf", "g\uD801\uDC00", "g\uD801\uDC00f", "\uD801\uDC00f"});
    settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
    tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader(" a!$ 9"));
    assertTokenStreamContents(tokenizer,
        new String[] {" a", " a!", "a!", "a!$", "!$", "!$ ", "$ ", "$ 9", " 9"});
}
Project: elasticsearch_my    File: NGramTokenizerFactoryTests.java
public void testPreTokenizationEdge() throws IOException {
    // Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
    Tokenizer tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f "));
    assertTokenStreamContents(tokenizer,
            new String[] {"Åb", "Åbc", "dé", "déf", "g\uD801\uDC00", "g\uD801\uDC00f"});
    settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
    tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader(" a!$ 9"));
    assertTokenStreamContents(tokenizer,
            new String[] {" a", " a!"});
}
Project: elasticsearch_my    File: NGramTokenizerFactoryTests.java
public void testBackwardsCompatibilityEdgeNgramTokenFilter() throws Exception {
    int iters = scaledRandomIntBetween(20, 100);
    for (int i = 0; i < iters; i++) {
        final Index index = new Index("test", "_na_");
        final String name = "ngr";
        Version v = randomVersion(random());
        Builder builder = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3);
        boolean reverse = random().nextBoolean();
        if (reverse) {
            builder.put("side", "back");
        }
        Settings settings = builder.build();
        Settings indexSettings = newAnalysisSettingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, v.id).build();
        Tokenizer tokenizer = new MockTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        TokenStream edgeNGramTokenFilter = new EdgeNGramTokenFilterFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(tokenizer);
        if (reverse) {
            assertThat(edgeNGramTokenFilter, instanceOf(ReverseStringFilter.class));
        } else {
            assertThat(edgeNGramTokenFilter, instanceOf(EdgeNGramTokenFilter.class));
        }
    }
}
Project: elasticsearch_my    File: MinHashFilterFactoryTests.java
public void testDefault() throws IOException {
    int default_hash_count = 1;
    int default_bucket_size = 512;
    int default_hash_set_size = 1;
    Settings settings = Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash");
    String source = "the quick brown fox";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));

    // with_rotation is true by default, and hash_set_size is 1, so even though the source doesn't
    // have enough tokens to fill all the buckets, we still expect 512 tokens.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer),
        default_hash_count * default_bucket_size * default_hash_set_size);
}
Project: elasticsearch_my    File: MinHashFilterFactoryTests.java
public void testSettings() throws IOException {
    Settings settings = Settings.builder()
        .put("index.analysis.filter.test_min_hash.type", "min_hash")
        .put("index.analysis.filter.test_min_hash.hash_count", "1")
        .put("index.analysis.filter.test_min_hash.bucket_count", "2")
        .put("index.analysis.filter.test_min_hash.hash_set_size", "1")
        .put("index.analysis.filter.test_min_hash.with_rotation", false)
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("test_min_hash");
    String source = "sushi";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));

    // despite the fact that bucket_count is 2 and hash_set_size is 1,
    // because with_rotation is false, we only expect 1 token here.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer), 1);
}
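
As the in-code comments note, the MinHash filter's output size is the product hash_count × bucket_count × hash_set_size; with the defaults (1 × 512 × 1) and with_rotation enabled, empty buckets are back-filled, so testDefault expects 512 tokens even though the source text only has four, while testSettings turns rotation off and therefore sees just one token.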
Project: elasticsearch_my    File: StopTokenFilterTests.java
public void testCorrectPositionIncrementSetting() throws IOException {
    Builder builder = Settings.builder().put("index.analysis.filter.my_stop.type", "stop");
    if (random().nextBoolean()) {
        builder.put("index.analysis.filter.my_stop.version", Version.LATEST);
    } else {
        // don't specify
    }
    builder.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString());
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(builder.build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_stop");
    assertThat(tokenFilter, instanceOf(StopTokenFilterFactory.class));
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("foo bar"));
    TokenStream create = tokenFilter.create(tokenizer);
    assertThat(create, instanceOf(StopFilter.class));
}
Project: elasticsearch_my    File: WordDelimiterGraphTokenFilterFactoryTests.java
public void testMultiTerms() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
        .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
        .build());

    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42",
        "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se",
        "ONeil", "O'Neil's", "O", "Neil" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1};
    int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1};
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
            expectedIncr, expectedPosLen, null);
}
Project: elasticsearch_my    File: WordDelimiterGraphTokenFilterFactoryTests.java
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
public void testPartsAndCatenate() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
        .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
        .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot";
    int[] expectedIncr = new int[]{1, 0, 1};
    int[] expectedPosLen = new int[]{2, 1, 1};
    String[] expected = new String[]{"PowerShot", "Power", "Shot" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
        expectedIncr, expectedPosLen, null);
}
Project: elasticsearch_my    File: DocumentFieldMapperTests.java
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new Tokenizer() {
        boolean incremented = false;
        CharTermAttribute term = addAttribute(CharTermAttribute.class);

        @Override
        public boolean incrementToken() throws IOException {
            if (incremented) {
                return false;
            }
            term.setLength(0).append(output);
            incremented = true;
            return true;
        }
    };
    return new TokenStreamComponents(tokenizer);
}
Project: elasticsearch_my    File: SimplePolishTokenFilterTests.java
private void testToken(String source, String expected) throws IOException {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put("index.analysis.filter.myStemmer.type", "polish_stem")
            .build();
    TestAnalysis analysis = createTestAnalysis(index, settings, new AnalysisStempelPlugin());

    TokenFilterFactory filterFactory = analysis.tokenFilter.get("myStemmer");

    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(source));
    TokenStream ts = filterFactory.create(tokenizer);

    CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    assertThat(ts.incrementToken(), equalTo(true));

    assertThat(term1.toString(), equalTo(expected));
}
Project: lams    File: FSTSynonymFilterFactory.java
@Override
public void inform(ResourceLoader loader) throws IOException {
  final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);

  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader) : factory.create(reader);
      TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer) : tokenizer;
      return new TokenStreamComponents(tokenizer, stream);
    }
  };

  try {
    String formatClass = format;
    if (format == null || format.equals("solr")) {
      formatClass = SolrSynonymParser.class.getName();
    } else if (format.equals("wordnet")) {
      formatClass = WordnetSynonymParser.class.getName();
    }
    // TODO: expose dedup as a parameter?
    map = loadSynonyms(loader, formatClass, true, analyzer);
  } catch (ParseException e) {
    throw new IOException("Error parsing synonyms file:", e);
  }
}
Project: lams    File: PatternAnalyzer.java
/**
 * Creates a token stream that tokenizes the given string into token terms
 * (aka words).
 * 
 * @param fieldName
 *            the name of the field to tokenize (currently ignored).
 * @param reader
 *            reader (e.g. charfilter) of the original text. can be null.
 * @param text
 *            the string to tokenize
 * @return a new token stream
 */
public TokenStreamComponents createComponents(String fieldName, Reader reader, String text) {
  // Ideally the Analyzer superclass should have a method with the same signature, 
  // with a default impl that simply delegates to the StringReader flavour. 
  if (reader == null) 
    reader = new FastStringReader(text);

  if (pattern == NON_WORD_PATTERN) { // fast path
    return new TokenStreamComponents(new FastStringTokenizer(reader, true, toLowerCase, stopWords));
  } else if (pattern == WHITESPACE_PATTERN) { // fast path
    return new TokenStreamComponents(new FastStringTokenizer(reader, false, toLowerCase, stopWords));
  }

  Tokenizer tokenizer = new PatternTokenizer(reader, pattern, toLowerCase);
  TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
  return new TokenStreamComponents(tokenizer, result);
}
Project: Elasticsearch    File: CustomAnalyzer.java
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = tokenizerFactory.create();
    TokenStream tokenStream = tokenizer;
    for (TokenFilterFactory tokenFilter : tokenFilters) {
        tokenStream = tokenFilter.create(tokenStream);
    }
    return new TokenStreamComponents(tokenizer, tokenStream);
}
Project: elasticsearch-analysis-openkoreantext    File: OpenKoreanTextStemmerTest.java
@Test
public void testBasicUsage() throws IOException {
    String query = "한국어를 처리하는 예시입니다ㅋㅋ";
    String[] expectedCharTerms = new String[]{"한국어", "를", " ", "처리", "하다", " ", "예시", "이다", "ㅋㅋ"};
    String[] expectedTypes = new String[]{"Noun", "Josa", "Space", "Noun", "Verb", "Space", "Noun", "Adjective", "KoreanParticle"};
    int[] expectedStartOffsets = new int[]{0, 3, 4, 5, 7, 9, 10, 12, 15};
    int[] expectedEndOffsets = new int[]{3, 4, 5, 7, 9, 10, 12, 15, 17};

    Tokenizer tokenizer = new OpenKoreanTextTokenizer();
    tokenizer.setReader(new StringReader(query));

    OpenKoreanTextTokenFilter tokenFilter = new OpenKoreanTextStemmer(tokenizer);

    TokenStreamAssertions.assertTokenStream(tokenFilter, expectedCharTerms, expectedTypes, expectedStartOffsets, expectedEndOffsets);
}
Project: elasticsearch-analysis-openkoreantext    File: OpenKoreanTextPhraseExtractorTest.java
@Test
public void testBasicUsage() throws IOException {
    String query = "한국어를 처리하는 예시입니다ㅋㅋ #한국어";

    String[] expectedCharTerms = new String[]{"한국어", "처리", "처리하는 예시", "예시", "#한국어"};
    String[] expectedTypes = new String[]{"Noun", "Noun", "Noun", "Noun", "Hashtag"};
    int[] expectedStartOffsets = new int[]{0, 5, 5, 10, 18};
    int[] expectedEndOffsets = new int[]{3, 7, 12, 12, 22};

    Tokenizer tokenizer = new OpenKoreanTextTokenizer();
    tokenizer.setReader(new StringReader(query));

    OpenKoreanTextTokenFilter tokenFilter = new OpenKoreanTextPhraseExtractor(tokenizer);
    TokenStreamAssertions.assertTokenStream(tokenFilter, expectedCharTerms, expectedTypes, expectedStartOffsets, expectedEndOffsets);
}
Project: elasticsearch-analysis-metaphone_ptBR    File: MetaphoneTokenFilterTests.java
@Test
public void testMetaphoneWords() throws Exception {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.myStemmer.type", "br_metaphone")
            .build();

    AnalysisService analysisService = createAnalysisService(index, settings, new AnalysisMetaphonePlugin());

    TokenFilterFactory filterFactory = analysisService.tokenFilter("br_metaphone");

    Tokenizer tokenizer = new KeywordTokenizer();

    Map<String,String> words = buildWordList();

    Set<String> inputWords = words.keySet();
    for(String word : inputWords) {
        tokenizer.setReader(new StringReader(word));
        TokenStream ts = filterFactory.create(tokenizer);

        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(words.get(word)));
        ts.close();
    }
}
Project: Elasticsearch    File: PathHierarchyTokenizerFactory.java
@Override
public Tokenizer create() {
    if (reverse) {
        return new ReversePathHierarchyTokenizer(bufferSize, delimiter, replacement, skip);
    }
    return new PathHierarchyTokenizer(bufferSize, delimiter, replacement, skip);
}
Project: elasticsearch_my    File: EdgeNGramTokenizerFactory.java
@Override
public Tokenizer create() {
    if (matcher == null) {
        return new EdgeNGramTokenizer(minGram, maxGram);
    } else {
        return new EdgeNGramTokenizer(minGram, maxGram) {
            @Override
            protected boolean isTokenChar(int chr) {
                return matcher.isTokenChar(chr);
            }
        };
    }
}
Project: elasticsearch_my    File: NGramTokenizerFactory.java
@Override
public Tokenizer create() {
    if (matcher == null) {
        return new NGramTokenizer(minGram, maxGram);
    } else {
        return new NGramTokenizer(minGram, maxGram) {
            @Override
            protected boolean isTokenChar(int chr) {
                return matcher.isTokenChar(chr);
            }
        };
    }
}
Project: elasticsearch_my    File: CustomAnalyzer.java
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = tokenizerFactory.create();
    TokenStream tokenStream = tokenizer;
    for (TokenFilterFactory tokenFilter : tokenFilters) {
        tokenStream = tokenFilter.create(tokenStream);
    }
    return new TokenStreamComponents(tokenizer, tokenStream);
}
Project: elasticsearch_my    File: PathHierarchyTokenizerFactory.java
@Override
public Tokenizer create() {
    if (reverse) {
        return new ReversePathHierarchyTokenizer(bufferSize, delimiter, replacement, skip);
    }
    return new PathHierarchyTokenizer(bufferSize, delimiter, replacement, skip);
}
Project: elasticsearch_my    File: TransportAnalyzeAction.java
private static TokenStream createStackedTokenStream(String source, CharFilterFactory[] charFilterFactories, TokenizerFactory tokenizerFactory, TokenFilterFactory[] tokenFilterFactories, int current) {
    Reader reader = new FastStringReader(source);
    for (CharFilterFactory charFilterFactory : charFilterFactories) {
        reader = charFilterFactory.create(reader);
    }
    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(reader);
    TokenStream tokenStream = tokenizer;
    for (int i = 0; i < current; i++) {
        tokenStream = tokenFilterFactories[i].create(tokenStream);
    }
    return tokenStream;
}
Project: elasticsearch_my    File: AnalysisModuleTests.java
private void assertTokenFilter(String name, Class<?> clazz) throws IOException {
    Settings settings = Settings.builder()
                           .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                           .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get(name);
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("foo bar"));
    TokenStream stream = tokenFilter.create(tokenizer);
    assertThat(stream, instanceOf(clazz));
}
Project: elasticsearch_my    File: StemmerTokenFilterFactoryTests.java
public void testPorter2FilterFactory() throws IOException {
    int iters = scaledRandomIntBetween(20, 100);
    for (int i = 0; i < iters; i++) {

        Version v = VersionUtils.randomVersion(random());
        Settings settings = Settings.builder()
                .put("index.analysis.filter.my_porter2.type", "stemmer")
                .put("index.analysis.filter.my_porter2.language", "porter2")
                .put("index.analysis.analyzer.my_porter2.tokenizer","whitespace")
                .put("index.analysis.analyzer.my_porter2.filter","my_porter2")
                .put(SETTING_VERSION_CREATED,v)
                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                .build();

        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_porter2");
        assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class));
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        TokenStream create = tokenFilter.create(tokenizer);
        IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
        NamedAnalyzer analyzer = indexAnalyzers.get("my_porter2");
        assertThat(create, instanceOf(SnowballFilter.class));
        assertAnalyzesTo(analyzer, "possibly", new String[]{"possibl"});
    }

}
Project: elasticsearch_my    File: ASCIIFoldingTokenFilterFactoryTests.java
public void testDefault() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
    String source = "Ansprüche";
    String[] expected = new String[]{"Anspruche"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Project: elasticsearch_my    File: NGramTokenizerFactoryTests.java
public void testNoTokenChars() throws IOException {
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 4).putArray("token_chars", new String[0]).build();
    Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("1.34"));
    assertTokenStreamContents(tokenizer, new String[] {"1.", "1.3", "1.34", ".3", ".34", "34"});
}
Project: elasticsearch_my    File: KeepFilterFactoryTests.java
public void testCaseInsensitiveMapping() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keep_filter");
    assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
    String source = "hello small world";
    String[] expected = new String[]{"hello", "world"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{1, 2});
}
Project: elasticsearch_my    File: KeepFilterFactoryTests.java
public void testCaseSensitiveMapping() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_case_sensitive_keep_filter");
    assertThat(tokenFilter, instanceOf(KeepWordFilterFactory.class));
    String source = "Hello small world";
    String[] expected = new String[]{"Hello"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{1});
}
Project: elasticsearch_my    File: WordDelimiterTokenFilterFactoryTests.java
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
public void testPartsAndCatenate() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
        .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
        .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot";
    String[] expected = new String[]{"Power", "PowerShot", "Shot" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Project: elasticsearch_my    File: BaseWordDelimiterTokenFilterFactoryTestCase.java
public void testDefault() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi",
        "fi", "4000", "j", "2", "se", "O", "Neil"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Project: elasticsearch_my    File: BaseWordDelimiterTokenFilterFactoryTestCase.java
public void testCatenateWords() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Project: elasticsearch_my    File: BaseWordDelimiterTokenFilterFactoryTestCase.java
public void testCatenateNumbers() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
            .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2",
        "se", "O", "Neil"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Project: elasticsearch_my    File: BaseWordDelimiterTokenFilterFactoryTestCase.java
public void testCatenateAll() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
            .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
            .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Project: elasticsearch_my    File: BaseWordDelimiterTokenFilterFactoryTestCase.java
public void testSplitOnCaseChange() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot";
    String[] expected = new String[]{"PowerShot"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Project: elasticsearch_my    File: BaseWordDelimiterTokenFilterFactoryTestCase.java
public void testPreserveOriginal() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi",
        "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Project: elasticsearch_my    File: BaseWordDelimiterTokenFilterFactoryTestCase.java
public void testStemEnglishPossessive() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.my_word_delimiter.type", type)
            .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
            .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2",
        "se", "O", "Neil", "s"};
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}