Example source code for the Java class org.apache.lucene.analysis.standard.StandardTokenizer

Project: airsonic    File: SearchService.java
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    return streams.filteredTokenStream;
}
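
The chain built above (StandardTokenizer → StandardFilter → LowerCaseFilter → StopFilter → ASCIIFoldingFilter) is returned as a single TokenStream. A minimal consumption sketch, assuming the same Lucene 3.x-era API the snippet uses; the helper name tokensOf and the field name "title" are illustrative only, not part of the project:

private String tokensOf(Analyzer analyzer, String text) throws IOException {
    // 'analyzer' is an instance of the analyzer whose reusableTokenStream() is shown above.
    TokenStream stream = analyzer.reusableTokenStream("title", new StringReader(text));
    TermAttribute term = stream.getAttribute(TermAttribute.class);
    StringBuilder result = new StringBuilder();
    while (stream.incrementToken()) {
        // LowerCaseFilter and ASCIIFoldingFilter have already run, so "Déjà Vu" yields "deja vu".
        result.append(term.term()).append(' ');
    }
    stream.end();
    return result.toString().trim();
}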
Project: subsonic    File: SearchService.java
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    return streams.filteredTokenStream;
}
Project: search    File: TestCJKBigramFilter.java
public void testHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(reader);
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の",  "学生", "が",  "試験", "に",  "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>", 
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
Project: search    File: TestCJKBigramFilter.java
public void testUnigramsAndBigramsHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(reader);
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の",  "学", "学生", "生", "が",  "試", "試験", "験", "に",  "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", 
                     "<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", 
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
      new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
}
Project: search    File: TestTypeTokenFilter.java
/**
 * Test position increments applied by TypeTokenFilter, with and without position increments enabled.
 */
public void testStopPositons() throws IOException {
  StringBuilder sb = new StringBuilder();
  for (int i = 10; i < 20; i++) {
    if (i % 3 != 0) {
      sb.append(i).append(" ");
    } else {
      String w = English.intToEnglish(i).trim();
      sb.append(w).append(" ");
    }
  }
  log(sb.toString());
  String stopTypes[] = new String[]{"<NUM>"};
  Set<String> stopSet = asSet(stopTypes);

  // with increments
  StringReader reader = new StringReader(sb.toString());
  TypeTokenFilter typeTokenFilter = new TypeTokenFilter(Version.LATEST, new StandardTokenizer(reader), stopSet);
  testPositons(typeTokenFilter);

  // without increments
  reader = new StringReader(sb.toString());
  typeTokenFilter = new TypeTokenFilter(Version.LUCENE_4_3, false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
  testPositons(typeTokenFilter);

}
Project: FutureSonic-Server    File: SearchService.java
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    return streams.filteredTokenStream;
}
Project: madsonic-server-5.1    File: SearchService.java
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    return streams.filteredTokenStream;
}
Project: SPLevo    File: LuceneCodeAnalyzer.java
/**
 * Stem a list of words with a configured stemmer.
 *
 * @param words
 *            The list of words to stem.
 * @param stemming
 *            The stemmer to be used.
 * @return The stemmed list of words.
 */
@SuppressWarnings("resource")
public static String[] stemWords(String[] words, Stemming stemming) {
    Set<String> stemmedStopWords = Sets.newHashSet();

    for (String word : words) {
        TokenStream tokenStream = new StandardTokenizer(LUCENE_VERSION, new StringReader(word));
        tokenStream = Stemming.wrapStemmingFilter(tokenStream, stemming);

        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                String term = charTermAttribute.toString();
                stemmedStopWords.add(term);
            }
        } catch (IOException e) {
            logger.error("Failed to stem a list of words", e);
        }
    }
    return stemmedStopWords.toArray(new String[] {});
}
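
SPLevo's Stemming.wrapStemmingFilter() is not shown on this page. Purely as an illustration of the same pattern with stock Lucene classes (not SPLevo code), the sketch below chains a PorterStemFilter onto the StandardTokenizer; LUCENE_VERSION is assumed to be the same constant used above.

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public static List<String> porterStem(String word) throws IOException {
    List<String> stems = new ArrayList<String>();
    TokenStream stream = new PorterStemFilter(new StandardTokenizer(LUCENE_VERSION, new StringReader(word)));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();                         // required before incrementToken() since Lucene 4.x
    while (stream.incrementToken()) {
        stems.add(term.toString());         // e.g. "services" -> "servic"
    }
    stream.end();
    stream.close();
    return stems;
}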
Project: madsonic-server-5.0    File: SearchService.java
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    return streams.filteredTokenStream;
}
Project: NYBC    File: TestCJKBigramFilter.java
public void testHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の",  "学生", "が",  "試験", "に",  "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>", 
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
Project: NYBC    File: TestCJKBigramFilter.java
public void testUnigramsAndBigramsHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の",  "学", "学生", "生", "が",  "試", "試験", "験", "に",  "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", 
                     "<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", 
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
      new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
}
Project: NYBC    File: TestTypeTokenFilter.java
/**
 * Test position increments applied by TypeTokenFilter, with and without position increments enabled.
 */
public void testStopPositons() throws IOException {
  StringBuilder sb = new StringBuilder();
  for (int i = 10; i < 20; i++) {
    if (i % 3 != 0) {
      sb.append(i).append(" ");
    } else {
      String w = English.intToEnglish(i).trim();
      sb.append(w).append(" ");
    }
  }
  log(sb.toString());
  String stopTypes[] = new String[]{"<NUM>"};
  Set<String> stopSet = asSet(stopTypes);

  // with increments
  StringReader reader = new StringReader(sb.toString());
  TypeTokenFilter typeTokenFilter = new TypeTokenFilter(true, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
  testPositons(typeTokenFilter);

  // without increments
  reader = new StringReader(sb.toString());
  typeTokenFilter = new TypeTokenFilter(false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
  testPositons(typeTokenFilter);

}
Project: elasticsearch-analysis-german    File: BaseformTokenFilterTests.java
@Test
public void testTwo() throws IOException {
    AnalysisService analysisService = createAnalysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("baseform");

    String source = "Das sind Autos, die Nudeln transportieren.";

    String[] expected = {
            "Das",
            "Das",
            "sind",
            "sind",
            "Autos",
            "Auto",
            "die",
            "der",
            "Nudeln",
            "Nudel",
            "transportieren",
            "transportieren"
    };

    Tokenizer tokenizer = new StandardTokenizer(Version.LATEST, new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
Project: Maskana-Gestor-de-Conocimiento    File: TestCJKBigramFilter.java
public void testHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の",  "学生", "が",  "試験", "に",  "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>", 
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
Project: Maskana-Gestor-de-Conocimiento    File: TestCJKBigramFilter.java
public void testUnigramsAndBigramsHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の",  "学", "学生", "生", "が",  "試", "試験", "験", "に",  "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", 
                     "<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", 
                     "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
      new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
}
Project: Maskana-Gestor-de-Conocimiento    File: TestTypeTokenFilter.java
/**
 * Test position increments applied by TypeTokenFilter, with and without position increments enabled.
 */
public void testStopPositons() throws IOException {
  StringBuilder sb = new StringBuilder();
  for (int i = 10; i < 20; i++) {
    if (i % 3 != 0) {
      sb.append(i).append(" ");
    } else {
      String w = English.intToEnglish(i).trim();
      sb.append(w).append(" ");
    }
  }
  log(sb.toString());
  String stopTypes[] = new String[]{"<NUM>"};
  Set<String> stopSet = asSet(stopTypes);

  // with increments
  StringReader reader = new StringReader(sb.toString());
  TypeTokenFilter typeTokenFilter = new TypeTokenFilter(TEST_VERSION_CURRENT, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
  testPositons(typeTokenFilter);

  // without increments
  reader = new StringReader(sb.toString());
  typeTokenFilter = new TypeTokenFilter(Version.LUCENE_43, false, new StandardTokenizer(TEST_VERSION_CURRENT, reader), stopSet);
  testPositons(typeTokenFilter);

}
Project: elasticsearch_my    File: KeepTypesFilterFactoryTests.java
public void testKeepTypes() throws IOException {
    Settings settings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put("index.analysis.filter.keep_numbers.type", "keep_types")
            .putArray("index.analysis.filter.keep_numbers.types", new String[] {"<NUM>", "<SOMETHINGELSE>"})
            .build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("keep_numbers");
    assertThat(tokenFilter, instanceOf(KeepTypesFilterFactory.class));
    String source = "Hello 123 world";
    String[] expected = new String[]{"123"};
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected, new int[]{2});
}
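
keep_types is backed by Lucene's TypeTokenFilter running in whitelist mode: a token survives only if its TypeAttribute value is in the configured set. A standalone sketch without the elasticsearch plumbing; the three-argument TypeTokenFilter constructor shown here is the post-5.0 signature and is an assumption to verify against the Lucene version in use.

import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.TypeTokenFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public static void printNumberTokens(String text) throws IOException {
    StandardTokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(text));
    // useWhiteList = true: keep only tokens typed "<NUM>", drop everything else.
    TokenStream stream = new TypeTokenFilter(tokenizer, Collections.singleton("<NUM>"), true);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        System.out.println(term.toString());   // for "Hello 123 world" this prints only "123"
    }
    stream.end();
    stream.close();
}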
Project: elasticsearch_my    File: CJKFilterFactoryTests.java
public void testDefault() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_bigram");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[]{"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Project: elasticsearch_my    File: CJKFilterFactoryTests.java
public void testNoFlags() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_no_flags");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[]{"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Project: elasticsearch_my    File: CJKFilterFactoryTests.java
public void testHanOnly() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_only");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[]{"多", "く", "の",  "学生", "が",  "試験", "に",  "落", "ち", "た"  };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Project: elasticsearch_my    File: CJKFilterFactoryTests.java
public void testHanUnigramOnly() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_unigram_only");
    String source = "多くの学生が試験に落ちた。";
    String[] expected = new String[]{"多", "く", "の",  "学", "学生", "生", "が",  "試", "試験", "験", "に",  "落", "ち", "た"  };
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Project: airsonic    File: SearchService.java
private String analyzeQuery(String query) throws IOException {
    StringBuilder result = new StringBuilder();
    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(new StandardTokenizer(LUCENE_VERSION, new StringReader(query)));
    TermAttribute termAttribute = filter.getAttribute(TermAttribute.class);
    while (filter.incrementToken()) {
        result.append(termAttribute.term()).append("* ");
    }
    return result.toString();
}
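
Note that this chain folds diacritics but never lower-cases (there is no LowerCaseFilter here). A hypothetical call, just to show the shape of the result:

// Hypothetical call, not part of the project source.
String expanded = analyzeQuery("Déjà Vu");
// ASCIIFoldingFilter strips the accents, so expanded is "Deja* Vu* ":
// each term becomes a prefix query, and case is preserved.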
Project: Indra    File: IndraAnalyzer.java
public IndraAnalyzer(String lang, ModelMetadata metadata) {
    if (lang == null || metadata == null) {
        throw new IllegalArgumentException("all parameters are mandatory.");
    }

    logger.debug("Creating analyzer, lang={}, preprocessing={}", lang, metadata);
    tokenizer = new StandardTokenizer();
    tokenStream = createStream(lang, metadata, tokenizer);
}
Project: fastcatsearch3    File: GeneralTokenizer.java
@Override
public boolean incrementToken() throws IOException {
    clearAttributes();
    int posIncr = 1;

    while (true) {
        int tokenType = scanner.getNextToken();

        if (tokenType == StandardTokenizerInterface.YYEOF) {
            return false;
        }

        if (scanner.yylength() <= maxTokenLength) {
            posIncrAtt.setPositionIncrement(posIncr);
            scanner.getText(termAtt);
            final int start = scanner.yychar();
            offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));

            if (tokenType == StandardTokenizer.ACRONYM_DEP) {
                typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]);
                termAtt.setLength(termAtt.length() - 1);
            } else
                typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
            return true;
        } else
            posIncr++;
    }
}
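
StandardTokenizer.TOKEN_TYPES maps the scanner's numeric token codes to the type strings ("<ALPHANUM>", "<NUM>", "<IDEOGRAPHIC>", ...) that type-based filters match against. A small sketch using the current no-arg StandardTokenizer API to print each term together with its type; the method name dumpTokenTypes is illustrative:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public static void dumpTokenTypes(String text) throws IOException {
    StandardTokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(text));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        // e.g. "Hello 123 多" -> Hello=<ALPHANUM>, 123=<NUM>, 多=<IDEOGRAPHIC>
        System.out.println(term.toString() + "=" + type.type());
    }
    tokenizer.end();
    tokenizer.close();
}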
Project: subsonic    File: SearchService.java
private String analyzeQuery(String query) throws IOException {
    StringBuilder result = new StringBuilder();
    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(new StandardTokenizer(LUCENE_VERSION, new StringReader(query)));
    TermAttribute termAttribute = filter.getAttribute(TermAttribute.class);
    while (filter.incrementToken()) {
        result.append(termAttribute.term()).append("* ");
    }
    return result.toString();
}
Project: AdSearch_Endpoints    File: QueryParserImpl.java
@Override
  public List<String> parseQuery(String queryStr) {
    // tokenize queryStr and remove stop words (no stemming filter is applied here)
    List<String> tokens = new ArrayList<String>();
    AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
    Tokenizer tokenizer = new StandardTokenizer(factory);
    tokenizer.setReader(new StringReader(queryStr));
    CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
    TokenStream tokenStream = new StopFilter(tokenizer, stopWords);
//    StringBuilder sb = new StringBuilder();
    CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();

            tokens.add(term);
//            sb.append(term + " ");
        }
        tokenStream.end();
        tokenStream.close();

        tokenizer.close();  
    } catch (IOException e) {
        e.printStackTrace();
    }
//  System.out.println("QU="+ sb.toString());
    return tokens;  
  }
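
A hypothetical call of the method above (the QueryParserImpl instantiation is assumed); stop words from EnglishAnalyzer's default set are removed, and since no stemming filter is attached the remaining tokens come back unchanged:

List<String> tokens = new QueryParserImpl().parseQuery("the quick brown foxes");
// tokens -> ["quick", "brown", "foxes"]   ("the" is dropped; "foxes" stays unstemmed)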
Project: search    File: TestCJKAnalyzer.java
@Override
public boolean incrementToken() throws IOException {
  if (input.incrementToken()) {
    typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]);
    return true;
  } else {
    return false;
  }
}
Project: search    File: TestCJKBigramFilter.java
public void testAllScripts() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer t = new StandardTokenizer(reader);
      return new TokenStreamComponents(t, 
          new CJKBigramFilter(t, 0xff, false));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" });
}
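
The 0xff flag value simply enables every script flag. When only some scripts should be bigrammed, the flags can be OR-ed together; a sketch of a createComponents body in the same style as the tests above (HAN and HIRAGANA bigrams, plus unigrams):

// Inside createComponents, as above: bigram only Han and Hiragana runs and also emit unigrams.
Tokenizer t = new StandardTokenizer(reader);
TokenStream ts = new CJKBigramFilter(t, CJKBigramFilter.HAN | CJKBigramFilter.HIRAGANA, true);
return new TokenStreamComponents(t, ts);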
Project: search    File: TestElision.java
public void testElision() throws Exception {
  String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
  Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory(), new StringReader(test));
  CharArraySet articles = new CharArraySet(asSet("l", "M"), false);
  TokenFilter filter = new ElisionFilter(tokenizer, articles);
  List<String> tas = filter(filter);
  assertEquals("embrouille", tas.get(4));
  assertEquals("O'brian", tas.get(6));
  assertEquals("enfin", tas.get(7));
}
Project: search    File: TestAnalyzers.java
@SuppressWarnings("unused")
public void _testStandardConstants() {
  int x = StandardTokenizer.ALPHANUM;
  x = StandardTokenizer.APOSTROPHE;
  x = StandardTokenizer.ACRONYM;
  x = StandardTokenizer.COMPANY;
  x = StandardTokenizer.EMAIL;
  x = StandardTokenizer.HOST;
  x = StandardTokenizer.NUM;
  x = StandardTokenizer.CJ;
  String[] y = StandardTokenizer.TOKEN_TYPES;
}
Project: search    File: TestRandomChains.java
@Override public Object create(Random random) {
  // TypeTokenFilter
  Set<String> set = new HashSet<>();
  int num = random.nextInt(5);
  for (int i = 0; i < num; i++) {
    set.add(StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)]);
  }
  return set;
}
Project: search    File: TestRandomChains.java
@Override public Object create(Random random) {
  // TODO: make nastier
  if (random.nextBoolean()) {
    // a token type
    return StandardTokenizer.TOKEN_TYPES[random.nextInt(StandardTokenizer.TOKEN_TYPES.length)];
  } else {
    return TestUtil.randomSimpleString(random);
  }
}
Project: search    File: TestStandardAnalyzer.java
public void testHugeDoc() throws IOException {
  StringBuilder sb = new StringBuilder();
  char whitespace[] = new char[4094];
  Arrays.fill(whitespace, ' ');
  sb.append(whitespace);
  sb.append("testing 1234");
  String input = sb.toString();
  StandardTokenizer tokenizer = new StandardTokenizer(new StringReader(input));
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
Project: search    File: TestStandardAnalyzer.java
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {

  Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory(), reader);
  return new TokenStreamComponents(tokenizer);
}
Project: search    File: TestStandardAnalyzer.java
public void testRandomHugeStringsGraphAfter() throws Exception {
  Random random = random();
  checkRandomData(random,
                  new Analyzer() {
                    @Override
                    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                      Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory(), reader);
                      TokenStream tokenStream = new MockGraphTokenFilter(random(), tokenizer);
                      return new TokenStreamComponents(tokenizer, tokenStream);
                    }
                  },
                  100*RANDOM_MULTIPLIER, 8192);
}
Project: semanticvectors    File: LuceneTokenizer.java
/**
 * Tokenizes a string with StandardTokenizer.
 *
 * @param string the text to tokenize
 * @return arrayList of the tokens of the string (StandardTokenizer does not lower-case)
 * @throws IOException
 */
public static ArrayList<String> tokenize(String string) throws IOException{
    ArrayList<String> retList = new ArrayList<String>();
    StringReader reader = new StringReader(string);
    StandardTokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(reader);                  // attach the reader before consuming the stream
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();                            // mandatory before incrementToken()
    while(tokenizer.incrementToken()){
        retList.add(termAtt.toString());
    }
    tokenizer.end();
    tokenizer.close();
    reader.close();
    return retList;
}
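
A brief usage sketch of the helper above, with an illustrative input:

// StandardTokenizer splits on whitespace and punctuation, so this returns
// [Lucene, in, Action, 2nd, Edition] (case is preserved).
ArrayList<String> tokens = LuceneTokenizer.tokenize("Lucene in Action, 2nd Edition");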
Project: NYBC    File: StandardTokenizerFactory.java
@Override
public StandardTokenizer create(Reader input) {
  StandardTokenizer tokenizer
    = new StandardTokenizer(luceneMatchVersion, input); 
  tokenizer.setMaxTokenLength(maxTokenLength);
  return tokenizer;
}
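
In Lucene 5 and later the factory is usually wired up by its registered name rather than instantiated directly. A minimal sketch, assuming the CustomAnalyzer builder API ("standard" resolves to StandardTokenizerFactory, and the maxTokenLength parameter mirrors setMaxTokenLength above):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

public static Analyzer standardAnalyzer() throws IOException {
    // Builds a StandardTokenizer via its factory, then lower-cases the tokens.
    return CustomAnalyzer.builder()
        .withTokenizer("standard", "maxTokenLength", "255")
        .addTokenFilter("lowercase")
        .build();
}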