private void testToken(String source, String expected) throws IOException {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put("index.analysis.filter.myStemmer.type", "polish_stem")
            .build();
    TestAnalysis analysis = createTestAnalysis(index, settings, new AnalysisStempelPlugin());
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("myStemmer");

    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(source));
    TokenStream ts = filterFactory.create(tokenizer);

    CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    assertThat(ts.incrementToken(), equalTo(true));
    assertThat(term1.toString(), equalTo(expected));
}
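// Hypothetical usage of the helper above (a sketch, not from the original
// suite): the input/expected pair is illustrative and has NOT been verified
// against the Stempel polish_stem filter's actual output.
public void testBasicUsage() throws Exception {
    testToken("studenci", "student"); // assumed stemmer output, for illustration only
}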
public void testIgnoreWhitespace() throws Exception {
    String withSpace = "foo bar";
    String withoutSpace = "foobar";
    String withPunctuation = "foo-bar";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
            "locale", "en",
            "strength", "primary",
            "alternate", "shifted",
            "variableTop", " ");
    TokenStream tsWithSpace = factory.create(
            new KeywordTokenizer(new StringReader(withSpace)));
    TokenStream tsWithoutSpace = factory.create(
            new KeywordTokenizer(new StringReader(withoutSpace)));
    assertCollatesToSame(tsWithSpace, tsWithoutSpace);
    // now assert that punctuation still matters: foo-bar < foo bar
    tsWithSpace = factory.create(
            new KeywordTokenizer(new StringReader(withSpace)));
    TokenStream tsWithPunctuation = factory.create(
            new KeywordTokenizer(new StringReader(withPunctuation)));
    assertCollation(tsWithPunctuation, tsWithSpace, -1);
}
public void testEmptyTerm() throws IOException {
    Random random = random();
    final int numIters = atLeast(10);
    for (int i = 0; i < numIters; i++) {
        b = new SynonymMap.Builder(random.nextBoolean());
        final int numEntries = atLeast(10);
        for (int j = 0; j < numEntries; j++) {
            add(randomNonEmptyString(), randomNonEmptyString(), random.nextBoolean());
        }
        final SynonymMap map = b.build();
        final boolean ignoreCase = random.nextBoolean();
        final Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer,
                        new SynonymFilter(tokenizer, map, ignoreCase));
            }
        };
        checkAnalysisConsistency(random, analyzer, random.nextBoolean(), "");
    }
}
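// Sketch of the helpers assumed by the test above, modeled on Lucene's synonym
// tests (not part of the original snippet): `b` is the shared builder, and
// add(...) encodes multi-word rules using the U+0000 word separator that
// SynonymMap expects.
private SynonymMap.Builder b;

private void add(String input, String output, boolean keepOrig) {
    b.add(new CharsRef(input.replaceAll(" +", "\u0000")),
          new CharsRef(output.replaceAll(" +", "\u0000")),
          keepOrig);
}

private String randomNonEmptyString() {
    while (true) {
        // reject empty strings and strings containing the word separator
        final String s = TestUtil.randomUnicodeString(random()).trim();
        if (s.length() != 0 && s.indexOf('\u0000') == -1) {
            return s;
        }
    }
}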
public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new EdgeNGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
        assertTrue(tk.incrementToken());
        assertEquals(0, offsetAtt.startOffset());
        assertEquals(s.length(), offsetAtt.endOffset());
        final int end = Character.offsetByCodePoints(s, 0, i);
        assertEquals(s.substring(0, end), termAtt.toString());
    }
    assertFalse(tk.incrementToken());
}
public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new NGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int start = 0; start < codePointCount; ++start) {
        for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
            assertTrue(tk.incrementToken());
            assertEquals(0, offsetAtt.startOffset());
            assertEquals(s.length(), offsetAtt.endOffset());
            final int startIndex = Character.offsetByCodePoints(s, 0, start);
            final int endIndex = Character.offsetByCodePoints(s, 0, end);
            assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
        }
    }
    assertFalse(tk.incrementToken());
}
public void testEmptyTerm() throws IOException {
    Random random = random();
    for (int i = 0; i < 512; i++) {
        final int flags = i;
        final CharArraySet protectedWords;
        if (random.nextBoolean()) {
            protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
        } else {
            protectedWords = null;
        }
        Analyzer a = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer,
                        new Lucene47WordDelimiterFilter(tokenizer, flags, protectedWords));
            }
        };
        // depending upon options, this thing may or may not preserve the empty term
        checkAnalysisConsistency(random, a, random.nextBoolean(), "");
    }
}
public void testEmptyTerm() throws IOException {
    Random random = random();
    for (int i = 0; i < 512; i++) {
        final int flags = i;
        final CharArraySet protectedWords;
        if (random.nextBoolean()) {
            protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
        } else {
            protectedWords = null;
        }
        Analyzer a = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer,
                        new WordDelimiterFilter(tokenizer, flags, protectedWords));
            }
        };
        // depending upon options, this thing may or may not preserve the empty term
        checkAnalysisConsistency(random, a, random.nextBoolean(), "");
    }
}
public void testRandomStrings() throws IOException {
    for (int i = 0; i < 10000; i++) {
        String text = TestUtil.randomUnicodeString(random(), 100);
        int min = TestUtil.nextInt(random(), 0, 100);
        int max = TestUtil.nextInt(random(), 0, 100);
        int count = text.codePointCount(0, text.length());
        if (min > max) {
            int temp = min;
            min = max;
            max = temp;
        }
        boolean expected = count >= min && count <= max;
        TokenStream stream = new KeywordTokenizer(new StringReader(text));
        stream = new CodepointCountFilter(stream, min, max);
        stream.reset();
        assertEquals(expected, stream.incrementToken());
        stream.end();
        stream.close();
    }
}
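// A deterministic companion to the randomized test above (a sketch, not part
// of the original suite): "a\uD801\uDC00b" is three code points (the surrogate
// pair counts once), so it should survive a [2,3] code-point window.
public void testKeepsTermInCodepointRange() throws IOException {
    TokenStream stream = new KeywordTokenizer(new StringReader("a\uD801\uDC00b"));
    stream = new CodepointCountFilter(stream, 2, 3);
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    assertTrue(stream.incrementToken());
    assertEquals("a\uD801\uDC00b", termAtt.toString());
    assertFalse(stream.incrementToken());
    stream.end();
    stream.close();
}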
/**
 * For the supplied language, run the stemmer against all strings in voc.txt.
 * The output should be the same as the string in output.txt.
 */
private void assertCorrectOutput(final String snowballLanguage, String dataDirectory) throws IOException {
    if (VERBOSE) {
        System.out.println("checking snowball language: " + snowballLanguage);
    }
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer t = new KeywordTokenizer(reader);
            return new TokenStreamComponents(t, new SnowballFilter(t, snowballLanguage));
        }
    };
    assertVocabulary(a, getDataFile("TestSnowballVocabData.zip"),
            dataDirectory + "/voc.txt", dataDirectory + "/output.txt");
}
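// Example invocation of the helper above (a sketch): the language name and the
// directory inside TestSnowballVocabData.zip are assumed, not taken from the
// original snippet.
public void testEnglishVocabulary() throws IOException {
    assertCorrectOutput("English", "english");
}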
public void testNormalization() throws IOException {
    String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
    String turkishLowerCase = "ı will use turkish casıng";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "tr");
    args.put("strength", "primary");
    args.put("decomposition", "canonical");
    factory.init(args);
    factory.inform(new StringMockResourceLoader(""));
    TokenStream tsUpper = factory.create(
            new KeywordTokenizer(new StringReader(turkishUpperCase)));
    TokenStream tsLower = factory.create(
            new KeywordTokenizer(new StringReader(turkishLowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
}
public void testSecondaryStrength() throws IOException {
    String upperCase = "TESTING";
    String lowerCase = "testing";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "en");
    args.put("strength", "secondary");
    args.put("decomposition", "no");
    factory.init(args);
    factory.inform(new StringMockResourceLoader(""));
    TokenStream tsUpper = factory.create(
            new KeywordTokenizer(new StringReader(upperCase)));
    TokenStream tsLower = factory.create(
            new KeywordTokenizer(new StringReader(lowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
}
public void testIgnorePunctuation() throws IOException {
    String withPunctuation = "foo-bar";
    String withoutPunctuation = "foo bar";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "en");
    args.put("strength", "primary");
    args.put("alternate", "shifted");
    factory.init(args);
    factory.inform(new StringMockResourceLoader(""));
    TokenStream tsPunctuation = factory.create(
            new KeywordTokenizer(new StringReader(withPunctuation)));
    TokenStream tsWithoutPunctuation = factory.create(
            new KeywordTokenizer(new StringReader(withoutPunctuation)));
    assertCollatesToSame(tsPunctuation, tsWithoutPunctuation);
}
public void testIgnoreWhitespace() throws IOException {
    String withSpace = "foo bar";
    String withoutSpace = "foobar";
    String withPunctuation = "foo-bar";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "en");
    args.put("strength", "primary");
    args.put("alternate", "shifted");
    args.put("variableTop", " ");
    factory.init(args);
    factory.inform(new StringMockResourceLoader(""));
    TokenStream tsWithSpace = factory.create(
            new KeywordTokenizer(new StringReader(withSpace)));
    TokenStream tsWithoutSpace = factory.create(
            new KeywordTokenizer(new StringReader(withoutSpace)));
    assertCollatesToSame(tsWithSpace, tsWithoutSpace);
    // now assert that punctuation still matters: foo-bar < foo bar
    tsWithSpace = factory.create(
            new KeywordTokenizer(new StringReader(withSpace)));
    TokenStream tsWithPunctuation = factory.create(
            new KeywordTokenizer(new StringReader(withPunctuation)));
    assertCollation(tsWithPunctuation, tsWithSpace, -1);
}
public void testUpperCaseFirst() throws IOException {
    String lower = "resume";
    String upper = "Resume";
    ICUCollationKeyFilterFactory factory = new ICUCollationKeyFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("locale", "en");
    args.put("strength", "tertiary");
    args.put("caseFirst", "upper");
    factory.init(args);
    factory.inform(new StringMockResourceLoader(""));
    TokenStream tsLower = factory.create(
            new KeywordTokenizer(new StringReader(lower)));
    TokenStream tsUpper = factory.create(
            new KeywordTokenizer(new StringReader(upper)));
    assertCollation(tsUpper, tsLower, -1);
}
public void testEmptyTerm() throws IOException {
    Random random = random();
    for (int i = 0; i < 512; i++) {
        final int flags = i;
        final CharArraySet protectedWords;
        if (random.nextBoolean()) {
            protectedWords = new CharArraySet(TEST_VERSION_CURRENT,
                    new HashSet<String>(Arrays.asList("a", "b", "cd")), false);
        } else {
            protectedWords = null;
        }
        Analyzer a = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer,
                        new WordDelimiterFilter(tokenizer, flags, protectedWords));
            }
        };
        // depending upon options, this thing may or may not preserve the empty term
        checkAnalysisConsistency(random, a, random.nextBoolean(), "");
    }
}
public void testEmptyTerm() throws IOException {
    Random random = random();
    for (int i = 0; i < 1024; i++) {
        final int flags = i;
        final CharArraySet protectedWords;
        if (random.nextBoolean()) {
            protectedWords = new CharArraySet(TEST_VERSION_CURRENT,
                    new HashSet<String>(Arrays.asList("a", "b", "cd")), false);
        } else {
            protectedWords = null;
        }
        Analyzer a = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
                return new TokenStreamComponents(tokenizer,
                        new WordDelimiterFilter2(tokenizer, flags, protectedWords));
            }
        };
        // depending upon options, this thing may or may not preserve the empty term
        checkAnalysisConsistency(random, a, random.nextBoolean(), "");
    }
}
public void testSupplementaryCharacters() throws IOException {
    final String s = _TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = _TestUtil.nextInt(random(), 1, 3);
    final int maxGram = _TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
        assertTrue(tk.incrementToken());
        assertEquals(0, offsetAtt.startOffset());
        assertEquals(s.length(), offsetAtt.endOffset());
        final int end = Character.offsetByCodePoints(s, 0, i);
        assertEquals(s.substring(0, end), termAtt.toString());
    }
    assertFalse(tk.incrementToken());
}
public void testSupplementaryCharacters() throws IOException {
    final String s = _TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = _TestUtil.nextInt(random(), 1, 3);
    final int maxGram = _TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int start = 0; start < codePointCount; ++start) {
        for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
            assertTrue(tk.incrementToken());
            assertEquals(0, offsetAtt.startOffset());
            assertEquals(s.length(), offsetAtt.endOffset());
            final int startIndex = Character.offsetByCodePoints(s, 0, start);
            final int endIndex = Character.offsetByCodePoints(s, 0, end);
            assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
        }
    }
    assertFalse(tk.incrementToken());
}
@Test
public void testMetaphoneWords() throws Exception {
    Index index = new Index("test", "_na_");
    Settings settings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("index.analysis.filter.myStemmer.type", "br_metaphone")
            .build();
    AnalysisService analysisService = createAnalysisService(index, settings, new AnalysisMetaphonePlugin());
    TokenFilterFactory filterFactory = analysisService.tokenFilter("br_metaphone");

    Tokenizer tokenizer = new KeywordTokenizer();

    Map<String,String> words = buildWordList();
    Set<String> inputWords = words.keySet();
    for (String word : inputWords) {
        tokenizer.setReader(new StringReader(word));
        TokenStream ts = filterFactory.create(tokenizer);

        CharTermAttribute term1 = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        assertThat(ts.incrementToken(), equalTo(true));
        assertThat(term1.toString(), equalTo(words.get(word)));
        ts.close();
    }
}
@Inject
public EudexTokenizerFactory(Index index, IndexSettingsService indexSettingsService,
                             @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.indexSettings(), name, settings);
    this.factory = new EudexAttributeFactory();
    this.bufferSize = settings.getAsInt("buffersize", KeywordTokenizer.DEFAULT_BUFFER_SIZE);
}
@Inject
public EudexAnalyzerProvider(Index index, IndexSettingsService indexSettingsService,
                             @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.indexSettings(), name, settings);
    this.bufferSize = settings.getAsInt("buffersize", KeywordTokenizer.DEFAULT_BUFFER_SIZE);
}
@Inject
public IcuCollationTokenizerFactory(Index index, @IndexSettings Settings indexSettings,
                                    @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettings, name, settings);
    Collator collator = IcuCollationKeyAnalyzerProvider.createCollator(settings);
    this.factory = new ICUCollationAttributeFactory(collator);
    this.bufferSize = settings.getAsInt("buffer_size", KeywordTokenizer.DEFAULT_BUFFER_SIZE);
}
public void testEmptyTerm() throws IOException {
    Random random = random();
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer, new WordTokenFilter(tokenizer));
        }
    };
    checkAnalysisConsistency(random, a, random.nextBoolean(), "");
}
public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer, new JapaneseBaseFormFilter(tokenizer));
        }
    };
    checkOneTerm(a, "", "");
}
public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(tokenizer));
        }
    };
    checkOneTerm(a, "", "");
}
public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer, new JapaneseKatakanaStemFilter(tokenizer));
        }
    };
    checkOneTerm(a, "", "");
}
public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer, new ICUNormalizer2Filter(tokenizer));
        }
    };
    checkOneTerm(a, "", "");
}
public void testOptimizer() throws Exception {
    String rules = "a > b; b > c;"; // convert a's to b's and b's to c's
    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
    assertTrue(custom.getFilter() == null);
    new ICUTransformFilter(new KeywordTokenizer(new StringReader("")), custom);
    assertTrue(custom.getFilter().equals(new UnicodeSet("[ab]")));
}
public void testOptimizerSurrogate() throws Exception {
    String rules = "\\U00020087 > x;"; // convert CJK UNIFIED IDEOGRAPH-20087 to an x
    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
    assertTrue(custom.getFilter() == null);
    new ICUTransformFilter(new KeywordTokenizer(new StringReader("")), custom);
    assertTrue(custom.getFilter().equals(new UnicodeSet("[\\U00020087]")));
}
public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer,
                    new ICUTransformFilter(tokenizer, Transliterator.getInstance("Any-Latin")));
        }
    };
    checkOneTerm(a, "", "");
}
public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer tokenizer = new KeywordTokenizer(reader);
            return new TokenStreamComponents(tokenizer, new ICUFoldingFilter(tokenizer));
        }
    };
    checkOneTerm(a, "", "");
}
public void testBasicUsage() throws Exception {
    String turkishUpperCase = "I WİLL USE TURKİSH CASING";
    String turkishLowerCase = "ı will use turkish casıng";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
            "locale", "tr",
            "strength", "primary");
    TokenStream tsUpper = factory.create(
            new KeywordTokenizer(new StringReader(turkishUpperCase)));
    TokenStream tsLower = factory.create(
            new KeywordTokenizer(new StringReader(turkishLowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
}
public void testNormalization() throws Exception {
    String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
    String turkishLowerCase = "ı will use turkish casıng";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
            "locale", "tr",
            "strength", "primary",
            "decomposition", "canonical");
    TokenStream tsUpper = factory.create(
            new KeywordTokenizer(new StringReader(turkishUpperCase)));
    TokenStream tsLower = factory.create(
            new KeywordTokenizer(new StringReader(turkishLowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
}
public void testSecondaryStrength() throws Exception {
    String upperCase = "TESTING";
    String lowerCase = "testing";
    TokenFilterFactory factory = tokenFilterFactory("ICUCollationKey",
            "locale", "en",
            "strength", "secondary",
            "decomposition", "no");
    TokenStream tsUpper = factory.create(
            new KeywordTokenizer(new StringReader(upperCase)));
    TokenStream tsLower = factory.create(
            new KeywordTokenizer(new StringReader(lowerCase)));
    assertCollatesToSame(tsUpper, tsLower);
}
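// Sketch of the collation assertion helpers referenced throughout the tests
// above, modeled on Lucene's collation test utilities (assumed, not taken from
// the original source): each stream must yield exactly one collation-key term,
// and the sign of the terms' comparison must match the expected sign.
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
    assertCollation(stream1, stream2, 0);
}

private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
    CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
    CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);
    stream1.reset();
    stream2.reset();
    assertTrue(stream1.incrementToken());
    assertTrue(stream2.incrementToken());
    assertEquals(Integer.signum(comparison),
            Integer.signum(term1.toString().compareTo(term2.toString())));
    assertFalse(stream1.incrementToken());
    assertFalse(stream2.incrementToken());
    stream1.end();
    stream2.end();
    stream1.close();
    stream2.close();
}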