Example source code for the Java class org.apache.lucene.analysis.core.LetterTokenizer

Project: news-credibility    File: EgdeMain.java
public static void main(String[] args) throws IOException {
    System.out.println(NumberUtils.isDigits("12345"));
    System.out.println(NumberUtils.isDigits("12345.1"));
    System.out.println(NumberUtils.isDigits("12345,2"));

    System.out.println(NumberUtils.isNumber("12345"));
    System.out.println(NumberUtils.isNumber("12345.1"));
    System.out.println(NumberUtils.isNumber("12345,2".replace(",", ".")));
    System.out.println(NumberUtils.isNumber("12345,2"));
    StringReader input = new StringReader(
            "Правя тест на класификатор и после др.Дулитъл, пада.br2n ще се оправя с данните! които,са много зле. Но това е по-добре. Но24"
                    .replaceAll("br2n", ""));

    LetterTokenizer tokenizer = new LetterTokenizer();
    tokenizer.setReader(input);

    TokenFilter stopFilter = new StopFilter(tokenizer, BULGARIAN_STOP_WORDS_SET);
    TokenFilter length = new LengthFilter(stopFilter, 3, 1000);
    TokenFilter stemmer = new BulgarianStemFilter(length);
    TokenFilter ngrams = new ShingleFilter(stemmer, 2, 2);

    try (TokenFilter filter = ngrams) {

        CharTermAttribute termAtt = filter.addAttribute(CharTermAttribute.class);
        filter.reset();
        while (filter.incrementToken()) {
            String word = termAtt.toString().replace(',', '.').replaceAll("[\r\n]", "");
            System.out.println(word);
        }
        filter.end();  // finalize the stream per the TokenStream contract
    }
}
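
For reference, the consume cycle the snippet above relies on (reset, incrementToken, end, close) can be reduced to a minimal, self-contained sketch. This assumes a recent Lucene release in which LetterTokenizer has a no-argument constructor and receives its input via setReader; the class name and sample text are placeholders.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class LetterTokenizerDemo {
    public static void main(String[] args) throws IOException {
        // LetterTokenizer emits maximal runs of code points for which
        // Character.isLetter(...) is true; digits and punctuation end a token.
        try (LetterTokenizer tokenizer = new LetterTokenizer()) {
            tokenizer.setReader(new StringReader("Hello, world! No24"));
            CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();                  // required before the first incrementToken()
            while (tokenizer.incrementToken()) {
                System.out.println(termAtt.toString());  // prints: Hello, world, No
            }
            tokenizer.end();                    // finalize offsets per the TokenStream contract
        }                                       // try-with-resources closes the stream
    }
}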
Project: search    File: TestCharTokenizers.java
public void testCrossPlaneNormalization() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory(), reader) {
        @Override
        protected int normalize(int c) {
          if (c > 0xffff) {
            return 'δ';
          } else {
            return c;
          }
        }
      };
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };
  int num = 1000 * RANDOM_MULTIPLIER;
  for (int i = 0; i < num; i++) {
    String s = TestUtil.randomUnicodeString(random());
    TokenStream ts = analyzer.tokenStream("foo", s);
    try {
      ts.reset();
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      while (ts.incrementToken()) {
        String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
        for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
          cp = highlightedText.codePointAt(j);
          assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
        }
      }
      ts.end();
    } finally {
      IOUtils.closeWhileHandlingException(ts);
    }
  }
  // just for fun
  checkRandomData(random(), analyzer, num);
}
Project: search    File: TestCharTokenizers.java
public void testCrossPlaneNormalization2() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory(), reader) {
        @Override
        protected int normalize(int c) {
          if (c <= 0xffff) {
            return 0x1043C;
          } else {
            return c;
          }
        }
      };
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };
  int num = 1000 * RANDOM_MULTIPLIER;
  for (int i = 0; i < num; i++) {
    String s = TestUtil.randomUnicodeString(random());
    TokenStream ts = analyzer.tokenStream("foo", s);
    try {
      ts.reset();
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      while (ts.incrementToken()) {
        String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
        for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
          cp = highlightedText.codePointAt(j);
          assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
        }
      }
      ts.end();
    } finally {
      IOUtils.closeWhileHandlingException(ts);
    }
  }
  // just for fun
  checkRandomData(random(), analyzer, num);
}
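
Both tests above exercise the protected normalize(int) hook that CharTokenizer subclasses such as LetterTokenizer exposed in this Lucene line: every code point of a token is passed through normalize before it is appended to the term. The first test folds all supplementary code points (above 0xffff) to 'δ'; the second folds all BMP code points to U+1043C, a supplementary Deseret letter; together they check that offsets remain correct when normalization changes the UTF-16 length. A lowercasing variant built on the same hook, essentially what LowerCaseTokenizer does, might look like the sketch below; it assumes the same test scaffolding (newAttributeFactory(), reader) as the snippets above.

Tokenizer lowercasing = new LetterTokenizer(newAttributeFactory(), reader) {
    @Override
    protected int normalize(int c) {
        // Applied per code point, so supplementary characters are covered too.
        return Character.toLowerCase(c);
    }
};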
Project: NYBC    File: TestCharTokenizers.java
public void testCrossPlaneNormalization() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader) {
        @Override
        protected int normalize(int c) {
          if (c > 0xffff) {
            return 'δ';
          } else {
            return c;
          }
        }
      };
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };
  int num = 1000 * RANDOM_MULTIPLIER;
  for (int i = 0; i < num; i++) {
    String s = _TestUtil.randomUnicodeString(random());
    TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
    ts.reset();
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) {
      String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
      for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
        cp = highlightedText.codePointAt(j);
        assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
      }
    }
    ts.end();
    ts.close();
  }
  // just for fun
  checkRandomData(random(), analyzer, num);
}
Project: NYBC    File: TestCharTokenizers.java
public void testCrossPlaneNormalization2() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader) {
        @Override
        protected int normalize(int c) {
          if (c <= 0xffff) {
            return 0x1043C;
          } else {
            return c;
          }
        }
      };
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };
  int num = 1000 * RANDOM_MULTIPLIER;
  for (int i = 0; i < num; i++) {
    String s = _TestUtil.randomUnicodeString(random());
    TokenStream ts = analyzer.tokenStream("foo", new StringReader(s));
    ts.reset();
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) {
      String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
      for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
        cp = highlightedText.codePointAt(j);
        assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
      }
    }
    ts.end();
    ts.close();
  }
  // just for fun
  checkRandomData(random(), analyzer, num);
}
Project: Maskana-Gestor-de-Conocimiento    File: TestCharTokenizers.java
public void testCrossPlaneNormalization() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader) {
        @Override
        protected int normalize(int c) {
          if (c > 0xffff) {
            return 'δ';
          } else {
            return c;
          }
        }
      };
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };
  int num = 1000 * RANDOM_MULTIPLIER;
  for (int i = 0; i < num; i++) {
    String s = _TestUtil.randomUnicodeString(random());
    TokenStream ts = analyzer.tokenStream("foo", s);
    try {
      ts.reset();
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      while (ts.incrementToken()) {
        String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
        for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
          cp = highlightedText.codePointAt(j);
          assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
        }
      }
      ts.end();
    } finally {
      IOUtils.closeWhileHandlingException(ts);
    }
  }
  // just for fun
  checkRandomData(random(), analyzer, num);
}
Project: Maskana-Gestor-de-Conocimiento    File: TestCharTokenizers.java
public void testCrossPlaneNormalization2() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader) {
        @Override
        protected int normalize(int c) {
          if (c <= 0xffff) {
            return 0x1043C;
          } else {
            return c;
          }
        }
      };
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };
  int num = 1000 * RANDOM_MULTIPLIER;
  for (int i = 0; i < num; i++) {
    String s = _TestUtil.randomUnicodeString(random());
    TokenStream ts = analyzer.tokenStream("foo", s);
    try {
      ts.reset();
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      while (ts.incrementToken()) {
        String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
        for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
          cp = highlightedText.codePointAt(j);
          assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
        }
      }
      ts.end();
    } finally {
      IOUtils.closeWhileHandlingException(ts);
    }
  }
  // just for fun
  checkRandomData(random(), analyzer, num);
}
Project: elasticsearch_my    File: LetterTokenizerFactory.java
@Override
public Tokenizer create() {
    return new LetterTokenizer();
}
Project: Elasticsearch    File: LetterTokenizerFactory.java
@Override
public Tokenizer create() {
    return new LetterTokenizer();
}
Project: NYBC    File: LetterTokenizerFactory.java
@Override
public LetterTokenizer create(Reader input) {
  return new LetterTokenizer(luceneMatchVersion, input);
}
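
The three factory snippets above span two generations of the Lucene API. In the 4.x line (the NYBC factory) a tokenizer is bound to a Version and its Reader at construction time; in the line used by the two Elasticsearch factories, the no-argument constructor is used and input is attached afterwards through setReader. A side-by-side sketch follows; the two halves target different Lucene versions and would not compile against a single one.

// Lucene 4.x style: match version and Reader passed to the constructor.
Tokenizer legacy = new LetterTokenizer(luceneMatchVersion, new StringReader("some text"));

// Lucene 5+ style: construct first, attach the input later.
Tokenizer current = new LetterTokenizer();
current.setReader(new StringReader("some text"));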
Project: t4f-data    File: MetaphoneReplacementAnalyzer.java
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new MetaphoneReplacementFilter(new LetterTokenizer(reader));
}
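
MetaphoneReplacementFilter is defined elsewhere in the t4f-data project and is not shown here. For orientation, this is a minimal sketch of how such a filter is commonly written, assuming the Metaphone encoder from Apache Commons Codec (org.apache.commons.codec.language.Metaphone); the field names and constructor are illustrative, and the project's actual implementation may differ.

public final class MetaphoneReplacementFilter extends TokenFilter {
    private final Metaphone metaphoner = new Metaphone();
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    public MetaphoneReplacementFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;                       // upstream is exhausted
        }
        // Replace the surface form with its Metaphone encoding, e.g. "cool" -> "KL".
        String encoded = metaphoner.encode(termAtt.toString());
        termAtt.setEmpty().append(encoded);
        return true;
    }
}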