Example source snippets for the Java class org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer

项目:search    文件:TestUAX29URLEmailTokenizer.java   
/** @deprecated remove this and sophisticated backwards layer in 5.0 */
@Deprecated
public void testCombiningMarksBackwards() throws Exception {
  // The pre-3.1 backwards-compatibility grammar splits certain combining
  // marks from their base character; record which scripts are affected.
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      return new TokenStreamComponents(new UAX29URLEmailTokenizer(Version.LUCENE_3_1, reader));
    }
  };
  checkOneTerm(analyzer, "ざ", "さ"); // hiragana Bug
  checkOneTerm(analyzer, "ザ", "ザ"); // katakana Works
  checkOneTerm(analyzer, "壹゙", "壹"); // ideographic Bug
  checkOneTerm(analyzer, "아゙",  "아゙"); // hangul Works
}
项目:NYBC    文件:TestUAX29URLEmailTokenizer.java   
/** @deprecated remove this and sophisticated backwards layer in 5.0 */
@Deprecated
public void testCombiningMarksBackwards() throws Exception {
  // The pre-3.1 backwards-compatibility grammar splits certain combining
  // marks from their base character; record which scripts are affected.
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      return new TokenStreamComponents(new UAX29URLEmailTokenizer(Version.LUCENE_31, reader));
    }
  };
  checkOneTerm(analyzer, "ざ", "さ"); // hiragana Bug
  checkOneTerm(analyzer, "ザ", "ザ"); // katakana Works
  checkOneTerm(analyzer, "壹゙", "壹"); // ideographic Bug
  checkOneTerm(analyzer, "아゙",  "아゙"); // hangul Works
}
项目:Maskana-Gestor-de-Conocimiento    文件:TestUAX29URLEmailTokenizer.java   
/** @deprecated remove this and sophisticated backwards layer in 5.0 */
@Deprecated
public void testCombiningMarksBackwards() throws Exception {
  // The pre-3.1 backwards-compatibility grammar splits certain combining
  // marks from their base character; record which scripts are affected.
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      return new TokenStreamComponents(new UAX29URLEmailTokenizer(Version.LUCENE_31, reader));
    }
  };
  checkOneTerm(analyzer, "ざ", "さ"); // hiragana Bug
  checkOneTerm(analyzer, "ザ", "ザ"); // katakana Works
  checkOneTerm(analyzer, "壹゙", "壹"); // ideographic Bug
  checkOneTerm(analyzer, "아゙",  "아゙"); // hangul Works
}
项目:search    文件:TestUAX29URLEmailTokenizer.java   
public void testLongEMAILatomText() throws Exception {
  // Random text drawn entirely from the email atom character set:
  // EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
  // The tokenizer must emit at least one token, both with the default max
  // token length and again with a small randomly chosen buffer size.
  final char[] atomChars
      = "!#$%&'*+,-./0123456789=?ABCDEFGHIJKLMNOPQRSTUVWXYZ^_`abcdefghijklmnopqrstuvwxyz{|}~".toCharArray();
  final int length = TestUtil.nextInt(random(), 100 * 1024, 3 * 1024 * 1024);
  final StringBuilder sb = new StringBuilder(length);
  for (int i = 0; i < length; ++i) {
    sb.append(atomChars[random().nextInt(atomChars.length)]);
  }
  final String text = sb.toString();

  // First pass: default max token length.
  UAX29URLEmailTokenizer ts = new UAX29URLEmailTokenizer(new StringReader(text));
  int count = 0;
  ts.reset();
  while (ts.incrementToken()) {
    ++count;
  }
  ts.end();
  ts.close();
  assertTrue(count > 0);

  // Second pass: reuse the tokenizer with a small random max token length.
  ts.setMaxTokenLength(TestUtil.nextInt(random(), 200, 8192));
  ts.setReader(new StringReader(text));
  count = 0;
  ts.reset();
  while (ts.incrementToken()) {
    ++count;
  }
  ts.end();
  ts.close();
  assertTrue(count > 0);
}
项目:search    文件:TestUAX29URLEmailTokenizer.java   
public void testHugeDoc() throws IOException {
  // A long run of leading whitespace followed by two ordinary tokens; the
  // tokenizer must skip the padding and emit exactly the two words.
  char[] padding = new char[4094];
  Arrays.fill(padding, ' ');
  String input = new String(padding) + "testing 1234";
  UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory(), new StringReader(input));
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
项目:search    文件:TestUAX29URLEmailTokenizer.java   
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  // Bare tokenizer chain, no filters: exercises UAX29URLEmailTokenizer alone.
  return new TokenStreamComponents(new UAX29URLEmailTokenizer(newAttributeFactory(), reader));
}
项目:search    文件:TestUAX29URLEmailTokenizer.java   
@Override
public final boolean incrementToken() throws java.io.IOException {
  // Pass through only tokens whose type is <URL>; silently skip the rest.
  // Compare with equals() rather than reference identity (==): the type
  // attribute holds a String, and == only works when both sides happen to
  // be the same interned constant.
  while (input.incrementToken()) {
    if (UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.URL].equals(typeAtt.type())) {
      return true;
    }
  }
  return false; // upstream exhausted without finding a URL token
}
项目:search    文件:TestUAX29URLEmailTokenizer.java   
@Override
public final boolean incrementToken() throws java.io.IOException {
  // Pass through only tokens whose type is <EMAIL>; silently skip the rest.
  // Compare with equals() rather than reference identity (==): the type
  // attribute holds a String, and == only works when both sides happen to
  // be the same interned constant.
  while (input.incrementToken()) {
    if (UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.EMAIL].equals(typeAtt.type())) {
      return true;
    }
  }
  return false; // upstream exhausted without finding an email token
}
项目:search    文件:TestUAX29URLEmailTokenizer.java   
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  // Keep only <URL> tokens; an unbounded max token length lets arbitrarily
  // long URLs come through as single tokens.
  UAX29URLEmailTokenizer source = new UAX29URLEmailTokenizer(newAttributeFactory(), reader);
  source.setMaxTokenLength(Integer.MAX_VALUE);
  return new TokenStreamComponents(source, new URLFilter(source));
}
项目:NYBC    文件:TestUAX29URLEmailTokenizer.java   
public void testHugeDoc() throws IOException {
  // A long run of leading whitespace followed by two ordinary tokens; the
  // tokenizer must skip the padding and emit exactly the two words.
  char[] padding = new char[4094];
  Arrays.fill(padding, ' ');
  String input = new String(padding) + "testing 1234";
  UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
项目:NYBC    文件:TestUAX29URLEmailTokenizer.java   
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  // Bare tokenizer chain, no filters: exercises UAX29URLEmailTokenizer alone.
  return new TokenStreamComponents(new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader));
}
项目:NYBC    文件:TestUAX29URLEmailTokenizer.java   
@Override
public final boolean incrementToken() throws java.io.IOException {
  // Pass through only tokens whose type is <URL>; silently skip the rest.
  // Compare with equals() rather than reference identity (==): the type
  // attribute holds a String, and == only works when both sides happen to
  // be the same interned constant.
  while (input.incrementToken()) {
    if (UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.URL].equals(typeAtt.type())) {
      return true;
    }
  }
  return false; // upstream exhausted without finding a URL token
}
项目:NYBC    文件:TestUAX29URLEmailTokenizer.java   
@Override
public final boolean incrementToken() throws java.io.IOException {
  // Pass through only tokens whose type is <EMAIL>; silently skip the rest.
  // Compare with equals() rather than reference identity (==): the type
  // attribute holds a String, and == only works when both sides happen to
  // be the same interned constant.
  while (input.incrementToken()) {
    if (UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.EMAIL].equals(typeAtt.type())) {
      return true;
    }
  }
  return false; // upstream exhausted without finding an email token
}
项目:NYBC    文件:TestUAX29URLEmailTokenizer.java   
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  // Keep only <URL> tokens; an unbounded max token length lets arbitrarily
  // long URLs come through as single tokens.
  UAX29URLEmailTokenizer source = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader);
  source.setMaxTokenLength(Integer.MAX_VALUE);
  return new TokenStreamComponents(source, new URLFilter(source));
}
项目:elasticsearch-analysis-lemmagen    文件:LemmagenAnalysisTest.java   
public void testLemmagenFilterFactoryWithDefaultLexicon() throws IOException {
    // The default (English) lexicon should lemmatize inflected forms and the
    // tokenizer should strip the trailing punctuation.
    ESTestCase.TestAnalysis analysis = createAnalysis();

    TokenFilterFactory factory = analysis.tokenFilter.get("lemmagen_default_filter");
    assertThat(factory, instanceOf(LemmagenFilterFactory.class));

    Tokenizer source = new UAX29URLEmailTokenizer();
    source.setReader(new StringReader("I was late."));

    assertTokenStreamContents(factory.create(source), new String[]{"I", "be", "late"});
}
项目:elasticsearch-analysis-lemmagen    文件:LemmagenAnalysisTest.java   
public void testLemmagenFilterFactoryWithCustomLexicon() throws IOException {
    // A filter configured with the Czech lexicon should lemmatize Czech text.
    ESTestCase.TestAnalysis analysis = createAnalysis();

    TokenFilterFactory factory = analysis.tokenFilter.get("lemmagen_cs_filter");
    assertThat(factory, instanceOf(LemmagenFilterFactory.class));

    Tokenizer source = new UAX29URLEmailTokenizer();
    source.setReader(new StringReader("Děkuji, že jsi přišel."));

    assertTokenStreamContents(factory.create(source), new String[]{"Děkovat", "že", "být", "přijít"});
}
项目:elasticsearch-analysis-lemmagen    文件:LemmagenAnalysisTest.java   
public void testLemmagenFilterFactoryWithShortLexiconCode() throws IOException {
    // A lexicon referenced by its short language code ("fr") should resolve
    // and lemmatize French text.
    ESTestCase.TestAnalysis analysis = createAnalysis();

    TokenFilterFactory factory = analysis.tokenFilter.get("lemmagen_fr_filter");
    assertThat(factory, instanceOf(LemmagenFilterFactory.class));

    Tokenizer source = new UAX29URLEmailTokenizer();
    source.setReader(new StringReader("Il faut encore ajouter une pincée de sel."));

    assertTokenStreamContents(factory.create(source),
        new String[]{"Il", "falloir", "encore", "ajouter", "un", "pincer", "de", "sel"});
}
项目:elasticsearch-analysis-lemmagen    文件:LemmagenAnalysisTest.java   
public void testLemmagenFilterFactoryWithPath() throws IOException {
    // A lexicon loaded from an explicit file path should behave the same as
    // the named Czech lexicon.
    ESTestCase.TestAnalysis analysis = createAnalysis();

    TokenFilterFactory factory = analysis.tokenFilter.get("lemmagen_cs_path_filter");
    assertThat(factory, instanceOf(LemmagenFilterFactory.class));

    Tokenizer source = new UAX29URLEmailTokenizer();
    source.setReader(new StringReader("Děkuji, že jsi přišel."));

    assertTokenStreamContents(factory.create(source), new String[]{"Děkovat", "že", "být", "přijít"});
}
项目:Maskana-Gestor-de-Conocimiento    文件:TestUAX29URLEmailTokenizer.java   
public void testHugeDoc() throws IOException {
  // A long run of leading whitespace followed by two ordinary tokens; the
  // tokenizer must skip the padding and emit exactly the two words.
  char[] padding = new char[4094];
  Arrays.fill(padding, ' ');
  String input = new String(padding) + "testing 1234";
  UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
项目:Maskana-Gestor-de-Conocimiento    文件:TestUAX29URLEmailTokenizer.java   
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  // Bare tokenizer chain, no filters: exercises UAX29URLEmailTokenizer alone.
  return new TokenStreamComponents(new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader));
}
项目:Maskana-Gestor-de-Conocimiento    文件:TestUAX29URLEmailTokenizer.java   
@Override
public final boolean incrementToken() throws java.io.IOException {
  // Pass through only tokens whose type is <URL>; silently skip the rest.
  // Compare with equals() rather than reference identity (==): the type
  // attribute holds a String, and == only works when both sides happen to
  // be the same interned constant.
  while (input.incrementToken()) {
    if (UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.URL].equals(typeAtt.type())) {
      return true;
    }
  }
  return false; // upstream exhausted without finding a URL token
}
项目:Maskana-Gestor-de-Conocimiento    文件:TestUAX29URLEmailTokenizer.java   
@Override
public final boolean incrementToken() throws java.io.IOException {
  // Pass through only tokens whose type is <EMAIL>; silently skip the rest.
  // Compare with equals() rather than reference identity (==): the type
  // attribute holds a String, and == only works when both sides happen to
  // be the same interned constant.
  while (input.incrementToken()) {
    if (UAX29URLEmailTokenizer.TOKEN_TYPES[UAX29URLEmailTokenizer.EMAIL].equals(typeAtt.type())) {
      return true;
    }
  }
  return false; // upstream exhausted without finding an email token
}
项目:Maskana-Gestor-de-Conocimiento    文件:TestUAX29URLEmailTokenizer.java   
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  // Keep only <URL> tokens; an unbounded max token length lets arbitrarily
  // long URLs come through as single tokens.
  UAX29URLEmailTokenizer source = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader);
  source.setMaxTokenLength(Integer.MAX_VALUE);
  return new TokenStreamComponents(source, new URLFilter(source));
}
项目:elasticsearch_my    文件:UAX29URLEmailTokenizerFactory.java   
@Override
public Tokenizer create() {
    // Build a fresh tokenizer per call, honoring the configured max token length.
    final UAX29URLEmailTokenizer result = new UAX29URLEmailTokenizer();
    result.setMaxTokenLength(maxTokenLength);
    return result;
}
项目:search    文件:TestUAX29URLEmailTokenizer.java   
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  // Tokenize, then keep only <EMAIL> tokens.
  UAX29URLEmailTokenizer source = new UAX29URLEmailTokenizer(newAttributeFactory(), reader);
  return new TokenStreamComponents(source, new EmailFilter(source));
}
项目:NYBC    文件:UAX29URLEmailTokenizerFactory.java   
@Override
public UAX29URLEmailTokenizer create(Reader input) {
  // Build a fresh tokenizer over the given reader, honoring the factory's
  // configured Lucene match version and max token length.
  final UAX29URLEmailTokenizer result = new UAX29URLEmailTokenizer(luceneMatchVersion, input);
  result.setMaxTokenLength(maxTokenLength);
  return result;
}
项目:NYBC    文件:TestUAX29URLEmailTokenizer.java   
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  // Tokenize, then keep only <EMAIL> tokens.
  UAX29URLEmailTokenizer source = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader);
  return new TokenStreamComponents(source, new EmailFilter(source));
}
项目:Maskana-Gestor-de-Conocimiento    文件:TestUAX29URLEmailTokenizer.java   
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  // Tokenize, then keep only <EMAIL> tokens.
  UAX29URLEmailTokenizer source = new UAX29URLEmailTokenizer(TEST_VERSION_CURRENT, reader);
  return new TokenStreamComponents(source, new EmailFilter(source));
}