Java 类org.apache.lucene.analysis.BaseTokenStreamTestCase 实例源码

项目:elasticsearch_my    文件:AnalysisPolishFactoryTests.java   
private void testThreadSafety(TokenFilterFactory factory) throws IOException {
    // Wrap the factory in a throwaway analyzer and run it through
    // checkRandomData, which exercises the chain with random text.
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new MockTokenizer();
            return new TokenStreamComponents(source, factory.create(source));
        }
    };
    BaseTokenStreamTestCase.checkRandomData(random(), analyzer, 100);
}
项目:elasticsearch-analysis-jieba    文件:JiebaAnalyzerTest.java   
public void testStandardAnalyzer() throws IOException {
    Analyzer analyzer = new JiebaAnalyzer();

    checkRandomData(new Random(0), analyzer, 1);

    // Print the tokenization of a few sample sentences, separated by a divider.
    String[] samples = {
        "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作",
        "hello  world,this is my first program",
        "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。",
    };
    for (int i = 0; i < samples.length; i++) {
        if (i > 0) {
            System.out.println("==============");
        }
        System.out.println(BaseTokenStreamTestCase.toString(analyzer, samples[i]));
    }
}
项目:search    文件:TestPerfTasksLogic.java   
public void testAnalyzerFactory() throws Exception {
  // End-to-end check of the benchmark AnalyzerFactory config syntax:
  // build an analysis chain from a config string and verify its tokens.
  String text = "Fortieth, Quarantième, Cuadragésimo";
  Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig
      ("ascii folded, pattern replaced, standard tokenized, downcased, bigrammed.'analyzer'",
       "positionIncrementGap:100,offsetGap:1111,"
       // Accent-folding mapping, then a regex replacement; the pattern is
       // double-escaped so it survives the config-file parser.
       +"MappingCharFilter(mapping:'test-mapping-ISOLatin1Accent-partial.txt'),"
       +"PatternReplaceCharFilterFactory(pattern:'e(\\\\\\\\S*)m',replacement:\"$1xxx$1\"),"
       +"StandardTokenizer,LowerCaseFilter,NGramTokenFilter(minGramSize:2,maxGramSize:2)"));
  // Expected character bigrams of the folded/replaced/lowercased words.
  BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
      new String[] { "fo", "or", "rt", "ti", "ie", "et", "th",
                     "qu", "ua", "ar", "ra", "an", "nt", "ti", "ix", "xx", "xx", "xe",
                     "cu", "ua", "ad", "dr", "ra", "ag", "gs", "si", "ix", "xx", "xx", "xs", "si", "io"});
}
项目:search    文件:TestUAX29URLEmailTokenizer.java   
public void testHugeDoc() throws IOException {
  // Prefix the content with ~4K of spaces to push the real tokens past
  // the tokenizer's internal buffer boundary.
  char[] padding = new char[4094];
  Arrays.fill(padding, ' ');
  String input = new String(padding) + "testing 1234";
  UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory(), new StringReader(input));
  // Leading whitespace yields no tokens; only the trailing words survive.
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
项目:search    文件:TestUAX29URLEmailTokenizer.java   
public void testLUCENE1545() throws Exception {
  /*
   * LUCENE-1545: the analyzer must not lose combining character
   * U+0364 COMBINING LATIN SMALL LETTER E.
   * The word "moͤchte" used to be incorrectly tokenized into "mo" "chte",
   * dropping the combining character; the expected result is the single
   * token "moͤchte".
   */
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" }); 
}
项目:search    文件:TestUAX29URLEmailTokenizer.java   
public void testApostrophesSA() throws Exception {
  // Words with internal apostrophes (O'Reilly, you're, O'Reilly's)
  // must come through unchanged as single tokens.
  String[] samples = { "O'Reilly", "you're", "she's", "Jim's", "don't", "O'Reilly's" };
  for (String sample : samples) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, sample, new String[] { sample });
  }
}
项目:search    文件:TestUAX29URLEmailTokenizer.java   
public void testVariousTextSA() throws Exception {
  // Assorted text: punctuation and symbols are dropped, words are kept verbatim.
  String[] inputs = {
      "C embedded developers wanted",
      "foo bar FOO BAR",
      "foo      bar .  FOO <> BAR",
      "\"QUOTED\" word",
  };
  String[][] expected = {
      { "C", "embedded", "developers", "wanted" },
      { "foo", "bar", "FOO", "BAR" },
      { "foo", "bar", "FOO", "BAR" },
      { "QUOTED", "word" },
  };
  for (int i = 0; i < inputs.length; i++) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, inputs[i], expected[i]);
  }
}
项目:search    文件:TestUAX29URLEmailAnalyzer.java   
public void testHugeDoc() throws IOException {
  // ~4K of leading spaces followed by real content exercises buffer handling.
  char[] padding = new char[4094];
  Arrays.fill(padding, ' ');
  String input = new String(padding) + "testing 1234";
  // The whitespace produces no tokens; only the trailing words remain.
  BaseTokenStreamTestCase.assertAnalyzesTo(a, input, new String[]{"testing", "1234"}) ;
}
项目:search    文件:TestUAX29URLEmailAnalyzer.java   
public void testLUCENE1545() throws Exception {
  /*
   * LUCENE-1545: the analyzer must not lose combining character
   * U+0364 COMBINING LATIN SMALL LETTER E.
   * The word "moͤchte" used to be incorrectly tokenized into "mo" "chte",
   * dropping the combining character; the expected result is the single
   * token "moͤchte".
   */
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" }); 
}
项目:search    文件:TestUAX29URLEmailAnalyzer.java   
public void testApostrophesSA() throws Exception {
  // Apostrophe-containing words stay single tokens; this analyzer lowercases.
  String[][] cases = {
      { "O'Reilly", "o'reilly" },
      { "you're", "you're" },
      { "she's", "she's" },
      { "Jim's", "jim's" },
      { "don't", "don't" },
      { "O'Reilly's", "o'reilly's" },
  };
  for (String[] c : cases) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, c[0], new String[] { c[1] });
  }
}
项目:search    文件:TestUAX29URLEmailAnalyzer.java   
public void testNumericSA() throws Exception {
  // Floating point, model numbers and IP addresses must remain single tokens;
  // alphanumeric model names are lowercased by this analyzer.
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[] { "21.35" });
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[] { "r2d2", "c3po" });
  // The IP address is asserted twice in the original test; kept for fidelity.
  for (int i = 0; i < 2; i++) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[] { "216.239.63.104" });
  }
}
项目:search    文件:TestUAX29URLEmailAnalyzer.java   
public void testVariousTextSA() throws Exception {
  // Assorted text: punctuation is dropped and words are lowercased.
  String[] inputs = {
      "C embedded developers wanted",
      "foo bar FOO BAR",
      "foo      bar .  FOO <> BAR",
      "\"QUOTED\" word",
  };
  String[][] expected = {
      { "c", "embedded", "developers", "wanted" },
      { "foo", "bar", "foo", "bar" },
      { "foo", "bar", "foo", "bar" },
      { "quoted", "word" },
  };
  for (int i = 0; i < inputs.length; i++) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, inputs[i], expected[i]);
  }
}
项目:search    文件:TestStandardAnalyzer.java   
public void testHugeDoc() throws IOException {
  // Prefix ~4K of spaces so the real tokens cross the tokenizer's buffer boundary.
  char[] padding = new char[4094];
  Arrays.fill(padding, ' ');
  String input = new String(padding) + "testing 1234";
  StandardTokenizer tokenizer = new StandardTokenizer(new StringReader(input));
  // Leading whitespace yields no tokens; only the trailing words survive.
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
项目:search    文件:TestStandardAnalyzer.java   
public void testLUCENE1545() throws Exception {
  /*
   * LUCENE-1545: the standard analyzer must not lose combining character
   * U+0364 COMBINING LATIN SMALL LETTER E.
   * The word "moͤchte" used to be incorrectly tokenized into "mo" "chte",
   * dropping the combining character; the expected result is the single
   * token "moͤchte".
   */
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" }); 
}
项目:search    文件:TestStandardAnalyzer.java   
public void testApostrophesSA() throws Exception {
  // Words with internal apostrophes (O'Reilly, you're, O'Reilly's)
  // must come through unchanged as single tokens.
  String[] samples = { "O'Reilly", "you're", "she's", "Jim's", "don't", "O'Reilly's" };
  for (String sample : samples) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, sample, new String[] { sample });
  }
}
项目:search    文件:TestStandardAnalyzer.java   
public void testNumericSA() throws Exception {
  // Floating point, model numbers and IP addresses must remain single tokens.
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[] { "21.35" });
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[] { "R2D2", "C3PO" });
  // The IP address is asserted twice in the original test; kept for fidelity.
  for (int i = 0; i < 2; i++) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[] { "216.239.63.104" });
  }
}
项目:search    文件:TestStandardAnalyzer.java   
public void testVariousTextSA() throws Exception {
  // Assorted text: punctuation and symbols are dropped, words kept verbatim.
  String[] inputs = {
      "C embedded developers wanted",
      "foo bar FOO BAR",
      "foo      bar .  FOO <> BAR",
      "\"QUOTED\" word",
  };
  String[][] expected = {
      { "C", "embedded", "developers", "wanted" },
      { "foo", "bar", "FOO", "BAR" },
      { "foo", "bar", "FOO", "BAR" },
      { "QUOTED", "word" },
  };
  for (int i = 0; i < inputs.length; i++) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, inputs[i], expected[i]);
  }
}
项目:search    文件:VocabularyAssert.java   
/** Run a vocabulary test against two data files. */
public static void assertVocabulary(Analyzer a, InputStream voc, InputStream out)
throws IOException {
  // Walk the vocabulary and the expected-output streams line by line in lockstep,
  // checking each word analyzes to the corresponding expected term.
  BufferedReader words = new BufferedReader(
      new InputStreamReader(voc, StandardCharsets.UTF_8));
  BufferedReader expectations = new BufferedReader(
      new InputStreamReader(out, StandardCharsets.UTF_8));
  for (String word = words.readLine(); word != null; word = words.readLine()) {
    String expected = expectations.readLine();
    // The expected file must supply a line for every vocabulary word.
    Assert.assertNotNull(expected);
    BaseTokenStreamTestCase.checkOneTerm(a, word, expected);
  }
}
项目:search    文件:VocabularyAssert.java   
/** Run a vocabulary test against one file: tab separated. */
public static void assertVocabulary(Analyzer a, InputStream vocOut)
throws IOException {
  // Single file format: one "input<TAB>expected" pair per line.
  BufferedReader reader = new BufferedReader(
      new InputStreamReader(vocOut, StandardCharsets.UTF_8));
  String line;
  while ((line = reader.readLine()) != null) {
    // Skip comment lines and blank lines.
    if (line.startsWith("#") || line.trim().isEmpty()) {
      continue;
    }
    String[] parts = line.split("\t");
    BaseTokenStreamTestCase.checkOneTerm(a, parts[0], parts[1]);
  }
}
项目:NYBC    文件:TestPerfTasksLogic.java   
public void testAnalyzerFactory() throws Exception {
  // End-to-end check of the benchmark AnalyzerFactory config syntax:
  // build an analysis chain from a config string and verify its tokens.
  String text = "Fortieth, Quarantième, Cuadragésimo";
  Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig
      ("ascii folded, pattern replaced, standard tokenized, downcased, bigrammed.'analyzer'",
       "positionIncrementGap:100,offsetGap:1111,"
       // Accent-folding mapping, then a regex replacement; the pattern is
       // double-escaped so it survives the config-file parser.
       +"MappingCharFilter(mapping:'test-mapping-ISOLatin1Accent-partial.txt'),"
       +"PatternReplaceCharFilterFactory(pattern:'e(\\\\\\\\S*)m',replacement:\"$1xxx$1\"),"
       +"StandardTokenizer,LowerCaseFilter,NGramTokenFilter(minGramSize:2,maxGramSize:2)"));
  // Expected character bigrams of the folded/replaced/lowercased words.
  BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
      new String[] { "fo", "or", "rt", "ti", "ie", "et", "th",
                     "qu", "ua", "ar", "ra", "an", "nt", "ti", "ix", "xx", "xx", "xe",
                     "cu", "ua", "ad", "dr", "ra", "ag", "gs", "si", "ix", "xx", "xx", "xs", "si", "io"});
}
项目:NYBC    文件:TestUAX29URLEmailTokenizer.java   
public void testHugeDoc() throws IOException {
  // Prefix ~4K of spaces so the real tokens cross the tokenizer's buffer boundary.
  char[] padding = new char[4094];
  Arrays.fill(padding, ' ');
  String input = new String(padding) + "testing 1234";
  UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
  // Leading whitespace yields no tokens; only the trailing words survive.
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
项目:NYBC    文件:TestUAX29URLEmailTokenizer.java   
public void testLUCENE1545() throws Exception {
  /*
   * LUCENE-1545: the analyzer must not lose combining character
   * U+0364 COMBINING LATIN SMALL LETTER E.
   * The word "moͤchte" used to be incorrectly tokenized into "mo" "chte",
   * dropping the combining character; the expected result is the single
   * token "moͤchte".
   */
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" }); 
}
项目:NYBC    文件:TestUAX29URLEmailTokenizer.java   
public void testApostrophesSA() throws Exception {
  // Words with internal apostrophes (O'Reilly, you're, O'Reilly's)
  // must come through unchanged as single tokens.
  String[] samples = { "O'Reilly", "you're", "she's", "Jim's", "don't", "O'Reilly's" };
  for (String sample : samples) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, sample, new String[] { sample });
  }
}
项目:NYBC    文件:TestUAX29URLEmailTokenizer.java   
public void testVariousTextSA() throws Exception {
  // Assorted text: punctuation and symbols are dropped, words kept verbatim.
  String[] inputs = {
      "C embedded developers wanted",
      "foo bar FOO BAR",
      "foo      bar .  FOO <> BAR",
      "\"QUOTED\" word",
  };
  String[][] expected = {
      { "C", "embedded", "developers", "wanted" },
      { "foo", "bar", "FOO", "BAR" },
      { "foo", "bar", "FOO", "BAR" },
      { "QUOTED", "word" },
  };
  for (int i = 0; i < inputs.length; i++) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, inputs[i], expected[i]);
  }
}
项目:NYBC    文件:TestUAX29URLEmailAnalyzer.java   
public void testHugeDoc() throws IOException {
  // ~4K of leading spaces followed by real content exercises buffer handling.
  char[] padding = new char[4094];
  Arrays.fill(padding, ' ');
  String input = new String(padding) + "testing 1234";
  // The whitespace produces no tokens; only the trailing words remain.
  BaseTokenStreamTestCase.assertAnalyzesTo(a, input, new String[]{"testing", "1234"}) ;
}
项目:NYBC    文件:TestUAX29URLEmailAnalyzer.java   
public void testLUCENE1545() throws Exception {
  /*
   * LUCENE-1545: the analyzer must not lose combining character
   * U+0364 COMBINING LATIN SMALL LETTER E.
   * The word "moͤchte" used to be incorrectly tokenized into "mo" "chte",
   * dropping the combining character; the expected result is the single
   * token "moͤchte".
   */
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" }); 
}
项目:NYBC    文件:TestUAX29URLEmailAnalyzer.java   
public void testApostrophesSA() throws Exception {
  // Apostrophe-containing words stay single tokens; this analyzer lowercases.
  String[][] cases = {
      { "O'Reilly", "o'reilly" },
      { "you're", "you're" },
      { "she's", "she's" },
      { "Jim's", "jim's" },
      { "don't", "don't" },
      { "O'Reilly's", "o'reilly's" },
  };
  for (String[] c : cases) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, c[0], new String[] { c[1] });
  }
}
项目:NYBC    文件:TestUAX29URLEmailAnalyzer.java   
public void testNumericSA() throws Exception {
  // Floating point, model numbers and IP addresses must remain single tokens;
  // alphanumeric model names are lowercased by this analyzer.
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[] { "21.35" });
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[] { "r2d2", "c3po" });
  // The IP address is asserted twice in the original test; kept for fidelity.
  for (int i = 0; i < 2; i++) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[] { "216.239.63.104" });
  }
}
项目:NYBC    文件:TestUAX29URLEmailAnalyzer.java   
public void testVariousTextSA() throws Exception {
  // Assorted text: punctuation is dropped and words are lowercased.
  String[] inputs = {
      "C embedded developers wanted",
      "foo bar FOO BAR",
      "foo      bar .  FOO <> BAR",
      "\"QUOTED\" word",
  };
  String[][] expected = {
      { "c", "embedded", "developers", "wanted" },
      { "foo", "bar", "foo", "bar" },
      { "foo", "bar", "foo", "bar" },
      { "quoted", "word" },
  };
  for (int i = 0; i < inputs.length; i++) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, inputs[i], expected[i]);
  }
}
项目:NYBC    文件:TestStandardAnalyzer.java   
public void testHugeDoc() throws IOException {
  // Prefix ~4K of spaces so the real tokens cross the tokenizer's buffer boundary.
  char[] padding = new char[4094];
  Arrays.fill(padding, ' ');
  String input = new String(padding) + "testing 1234";
  StandardTokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
  // Leading whitespace yields no tokens; only the trailing words survive.
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
项目:NYBC    文件:TestStandardAnalyzer.java   
public void testLUCENE1545() throws Exception {
  /*
   * LUCENE-1545: the standard analyzer must not lose combining character
   * U+0364 COMBINING LATIN SMALL LETTER E.
   * The word "moͤchte" used to be incorrectly tokenized into "mo" "chte",
   * dropping the combining character; the expected result is the single
   * token "moͤchte".
   */
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" }); 
}
项目:NYBC    文件:TestStandardAnalyzer.java   
public void testApostrophesSA() throws Exception {
  // Words with internal apostrophes (O'Reilly, you're, O'Reilly's)
  // must come through unchanged as single tokens.
  String[] samples = { "O'Reilly", "you're", "she's", "Jim's", "don't", "O'Reilly's" };
  for (String sample : samples) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, sample, new String[] { sample });
  }
}
项目:NYBC    文件:TestStandardAnalyzer.java   
public void testNumericSA() throws Exception {
  // Floating point, model numbers and IP addresses must remain single tokens.
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[] { "21.35" });
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[] { "R2D2", "C3PO" });
  // The IP address is asserted twice in the original test; kept for fidelity.
  for (int i = 0; i < 2; i++) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[] { "216.239.63.104" });
  }
}
项目:NYBC    文件:TestStandardAnalyzer.java   
public void testVariousTextSA() throws Exception {
  // Assorted text: punctuation and symbols are dropped, words kept verbatim.
  String[] inputs = {
      "C embedded developers wanted",
      "foo bar FOO BAR",
      "foo      bar .  FOO <> BAR",
      "\"QUOTED\" word",
  };
  String[][] expected = {
      { "C", "embedded", "developers", "wanted" },
      { "foo", "bar", "FOO", "BAR" },
      { "foo", "bar", "FOO", "BAR" },
      { "QUOTED", "word" },
  };
  for (int i = 0; i < inputs.length; i++) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, inputs[i], expected[i]);
  }
}