Java 类org.apache.lucene.analysis.ja.JapaneseAnalyzer 实例源码

项目:incubator-hivemall    文件:KuromojiUDF.java   
/**
 * Validates the argument count and records the optional tokenizer settings.
 *
 * <p>Between one and five arguments are accepted. Arguments at index 1 to 4,
 * when present, supply the tokenization mode, the stop words, the stop tags
 * and the user dictionary respectively; absent ones fall back to
 * {@code Mode.NORMAL}, the {@link JapaneseAnalyzer} defaults, and no user
 * dictionary. The cached analyzer is cleared so that {@code evaluate} builds
 * a new one from these settings.
 *
 * @param arguments inspectors for the call arguments
 * @return a standard list inspector over writable strings
 * @throws UDFArgumentException if the number of arguments is not in 1..5, or
 *         if one of the option parsers rejects its argument
 */
@Override
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
    final int numArgs = arguments.length;
    if (numArgs < 1 || numArgs > 5) {
        throw new UDFArgumentException("Invalid number of arguments for `tokenize_ja`: "
                + numArgs);
    }

    // Options are resolved in positional order; each falls back to its default when omitted.
    if (numArgs >= 2) {
        this._mode = tokenizationMode(arguments[1]);
    } else {
        this._mode = Mode.NORMAL;
    }

    if (numArgs >= 3) {
        this._stopWords = stopWords(arguments[2]);
    } else {
        this._stopWords = JapaneseAnalyzer.getDefaultStopSet();
    }

    if (numArgs >= 4) {
        this._stopTags = stopTags(arguments[3]);
    } else {
        this._stopTags = JapaneseAnalyzer.getDefaultStopTags();
    }

    if (numArgs >= 5) {
        this._userDict = userDictionary(arguments[4]);
    } else {
        this._userDict = null;
    }

    // Drop any previously built analyzer; evaluate() recreates it on first use.
    this._analyzer = null;

    return ObjectInspectorFactory.getStandardListObjectInspector(
        PrimitiveObjectInspectorFactory.writableStringObjectInspector);
}
项目:incubator-hivemall    文件:KuromojiUDF.java   
/**
 * Builds the stop-word set from a constant string-array argument.
 *
 * <p>The {@link JapaneseAnalyzer} default stop set is returned when the
 * argument is void or yields no array. An empty array yields
 * {@code CharArraySet.EMPTY_SET}. Otherwise a case-insensitive set is built
 * from the non-null elements; null elements are skipped (as {@code stopTags}
 * does) because {@code CharArraySet} cannot store null and would throw a
 * {@code NullPointerException}.
 *
 * @param oi inspector of the stop-words argument
 * @return the stop-word set to use, never null
 * @throws UDFArgumentException if the constant array cannot be extracted
 */
@Nonnull
private static CharArraySet stopWords(@Nonnull final ObjectInspector oi)
        throws UDFArgumentException {
    if (HiveUtils.isVoidOI(oi)) {
        return JapaneseAnalyzer.getDefaultStopSet();
    }
    final String[] array = HiveUtils.getConstStringArray(oi);
    if (array == null) {
        return JapaneseAnalyzer.getDefaultStopSet();
    }
    if (array.length == 0) {
        return CharArraySet.EMPTY_SET;
    }
    final CharArraySet results = new CharArraySet(array.length, /* ignoreCase */true);
    for (final String word : array) {
        // A null entry would otherwise trigger an NPE inside CharArraySet.
        if (word != null) {
            results.add(word);
        }
    }
    return results;
}
项目:incubator-hivemall    文件:KuromojiUDF.java   
/**
 * Builds the set of stop tags from a constant string-array argument.
 *
 * <p>The {@link JapaneseAnalyzer} default stop tags are returned when the
 * argument is void or yields no array. An empty array yields an empty set.
 * Otherwise the non-null elements are collected into a new {@link HashSet}.
 *
 * @param oi inspector of the stop-tags argument
 * @return the stop tags to use, never null
 * @throws UDFArgumentException if the constant array cannot be extracted
 */
@Nonnull
private static Set<String> stopTags(@Nonnull final ObjectInspector oi)
        throws UDFArgumentException {
    // A void argument and a missing array both select the defaults.
    final String[] tags = HiveUtils.isVoidOI(oi) ? null : HiveUtils.getConstStringArray(oi);
    if (tags == null) {
        return JapaneseAnalyzer.getDefaultStopTags();
    }
    if (tags.length == 0) {
        return Collections.emptySet();
    }

    final Set<String> collected = new HashSet<String>(tags.length);
    for (final String tag : tags) {
        if (tag == null) {
            continue; // null entries are ignored
        }
        collected.add(tag);
    }
    return collected;
}
项目:langpi    文件:JaSWEliminator.java   
/**
 * Reads a stop-word list from a classpath resource.
 *
 * <p>The path of {@code f} is used as a resource name resolved against
 * {@link JapaneseAnalyzer}'s class. Every line that does not start with
 * {@code #} is added to the result verbatim (blank lines included).
 *
 * <p>The resource is decoded as UTF-8 rather than with the platform default
 * charset, so the Japanese text is read the same way on every machine.
 * NOTE(review): assumes the stop-word resources are UTF-8 encoded, as the
 * ones shipped with Lucene are; confirm for custom files.
 *
 * <p>If the resource is missing or cannot be read, the stack trace is
 * printed and the JVM exits with status 1.
 *
 * @param f file whose path names the classpath resource to load
 * @return the stop words in file order
 */
private static List<String> getStopList(File f) {
    List<String> stopwords = new ArrayList<String>();
    BufferedReader input = null;
    try {
        InputStream in = JapaneseAnalyzer.class.getResourceAsStream(f.getPath());
        if (in == null) {
            // getResourceAsStream signals "not found" with null; report it through
            // the regular I/O failure path instead of an opaque NullPointerException.
            throw new IOException("Stop word resource not found: " + f.getPath());
        }
        input = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        for (String line = input.readLine(); line != null; line = input.readLine()) {
            if (line.startsWith("#")) {
                continue; // comment line
            }
            stopwords.add(line);
        }
        return stopwords;
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(1);
        return null; // unreachable in practice; required by the compiler
    } finally {
        // Always release the reader, including when readLine() fails part-way.
        if (input != null) {
            try {
                input.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing a read-only stream fails.
            }
        }
    }
}
项目:elasticsearch_my    文件:KuromojiAnalyzerProvider.java   
/**
 * Creates the provider and builds the {@link JapaneseAnalyzer} it serves.
 *
 * <p>Stop words are obtained from {@code Analysis.parseStopWords}, which is
 * given the analyzer's default stop set as its default value. The tokenizer
 * mode and user dictionary are obtained from {@code KuromojiTokenizerFactory}.
 * The analyzer's default stop tags are always used.
 *
 * @param indexSettings settings of the index this analyzer belongs to
 * @param env           environment passed to the stop-word and dictionary lookups
 * @param name          analyzer name, forwarded to the superclass
 * @param settings      analyzer settings
 */
public KuromojiAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    final Set<?> configuredStopWords = Analysis.parseStopWords(
        env,
        indexSettings.getIndexVersionCreated(),
        settings,
        JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode tokenizerMode = KuromojiTokenizerFactory.getMode(settings);
    final UserDictionary dictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);

    // The analyzer receives its own copy of the parsed stop words.
    final CharArraySet stopSet = CharArraySet.copy(configuredStopWords);
    this.analyzer = new JapaneseAnalyzer(
        dictionary, tokenizerMode, stopSet, JapaneseAnalyzer.getDefaultStopTags());
}
项目:elasticsearch_my    文件:JapaneseStopTokenFilterFactory.java   
/**
 * Creates the factory from its settings.
 *
 * <p>Reads {@code ignore_case} (default {@code false}) and
 * {@code remove_trailing} (default {@code true}) through the lenient boolean
 * accessor for pre-ES6 indices, then parses the {@code stopwords} setting with
 * the {@link JapaneseAnalyzer} default stop set passed as the default value.
 *
 * @param indexSettings settings of the index this filter belongs to
 * @param env           environment passed to the word-list parser
 * @param name          filter name, forwarded to the superclass
 * @param settings      filter settings
 */
public JapaneseStopTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    ignoreCase = settings.getAsBooleanLenientForPreEs6Indices(
        indexSettings.getIndexVersionCreated(), "ignore_case", false, deprecationLogger);
    removeTrailing = settings.getAsBooleanLenientForPreEs6Indices(
        indexSettings.getIndexVersionCreated(), "remove_trailing", true, deprecationLogger);

    // ignoreCase must be assigned before this call because it is passed to the parser.
    stopWords = Analysis.parseWords(
        env, settings, "stopwords", JapaneseAnalyzer.getDefaultStopSet(), NAMED_STOP_WORDS, ignoreCase);
}
项目:elasticsearch_my    文件:KuromojiAnalysisTests.java   
/**
 * Checks that the test analysis setup exposes every Kuromoji component under
 * its expected name and with its expected implementation class: the
 * tokenizer, six token filters, the {@code kuromoji} and {@code my_analyzer}
 * analyzers, and the iteration-mark char filter.
 */
public void testDefaultsKuromojiAnalysis() throws IOException {
    final TestAnalysis testAnalysis = createTestAnalysis();

    // Tokenizer.
    final TokenizerFactory kuromojiTokenizer = testAnalysis.tokenizer.get("kuromoji_tokenizer");
    assertThat(kuromojiTokenizer, instanceOf(KuromojiTokenizerFactory.class));

    // Token filters, one lookup per registered name.
    final TokenFilterFactory partOfSpeech = testAnalysis.tokenFilter.get("kuromoji_part_of_speech");
    assertThat(partOfSpeech, instanceOf(KuromojiPartOfSpeechFilterFactory.class));

    final TokenFilterFactory readingForm = testAnalysis.tokenFilter.get("kuromoji_readingform");
    assertThat(readingForm, instanceOf(KuromojiReadingFormFilterFactory.class));

    final TokenFilterFactory baseForm = testAnalysis.tokenFilter.get("kuromoji_baseform");
    assertThat(baseForm, instanceOf(KuromojiBaseFormFilterFactory.class));

    final TokenFilterFactory stemmer = testAnalysis.tokenFilter.get("kuromoji_stemmer");
    assertThat(stemmer, instanceOf(KuromojiKatakanaStemmerFactory.class));

    final TokenFilterFactory stopFilter = testAnalysis.tokenFilter.get("ja_stop");
    assertThat(stopFilter, instanceOf(JapaneseStopTokenFilterFactory.class));

    final TokenFilterFactory numberFilter = testAnalysis.tokenFilter.get("kuromoji_number");
    assertThat(numberFilter, instanceOf(KuromojiNumberFilterFactory.class));

    // Analyzers.
    final IndexAnalyzers analyzers = testAnalysis.indexAnalyzers;

    final NamedAnalyzer kuromoji = analyzers.get("kuromoji");
    assertThat(kuromoji.analyzer(), instanceOf(JapaneseAnalyzer.class));

    final NamedAnalyzer custom = analyzers.get("my_analyzer");
    assertThat(custom.analyzer(), instanceOf(CustomAnalyzer.class));
    // The custom analyzer's token stream is expected to be the bare Japanese tokenizer.
    assertThat(custom.analyzer().tokenStream(null, new StringReader("")), instanceOf(JapaneseTokenizer.class));

    // Char filter.
    final CharFilterFactory iterationMark = testAnalysis.charFilter.get("kuromoji_iteration_mark");
    assertThat(iterationMark, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
}
项目:incubator-hivemall    文件:KuromojiUDF.java   
/**
 * Tokenizes the first argument with a lazily created {@link JapaneseAnalyzer}.
 *
 * <p>The analyzer is built on first use from the user dictionary, mode, stop
 * words and stop tags held by this instance, and is reused across calls. If
 * tokenization fails with an {@link IOException}, the analyzer is closed and
 * the cached reference is cleared, so a later call builds a fresh analyzer
 * instead of reusing a closed one.
 *
 * @param arguments call arguments; only index 0 (the text) is read here
 * @return the tokens collected by {@code analyzeTokens}, or {@code null} if
 *         the text is null
 * @throws HiveException wrapping any {@link IOException} raised while tokenizing
 */
@Override
public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
    if (_analyzer == null) {
        this._analyzer = new JapaneseAnalyzer(_userDict, _mode, _stopWords, _stopTags);
    }

    Object arg0 = arguments[0].get();
    if (arg0 == null) {
        return null;
    }
    String line = arg0.toString();

    final List<Text> results = new ArrayList<Text>(32);
    TokenStream stream = null;
    try {
        stream = _analyzer.tokenStream("", line);
        if (stream != null) {
            analyzeTokens(stream, results);
        }
    } catch (IOException e) {
        IOUtils.closeQuietly(_analyzer);
        // A closed analyzer must not be reused; force re-creation on the next call.
        this._analyzer = null;
        throw new HiveException(e);
    } finally {
        IOUtils.closeQuietly(stream);
    }
    return results;
}
项目:elasticsearch-analysis-ja    文件:KuromojiAnalyzerProvider.java   
/**
 * Creates the provider and builds the {@link JapaneseAnalyzer} it serves.
 *
 * <p>Stop words are obtained from {@code Analysis.parseStopWords}, which is
 * given the analyzer's default stop set as its default value. The tokenizer
 * mode and user dictionary are obtained from {@code KuromojiTokenizerFactory}.
 * The analyzer's default stop tags are always used.
 *
 * @param indexSettings settings of the index this analyzer belongs to
 * @param env           environment passed to the stop-word and dictionary lookups
 * @param name          analyzer name, forwarded to the superclass
 * @param settings      analyzer settings
 */
public KuromojiAnalyzerProvider(final IndexSettings indexSettings, final Environment env, final String name, final Settings settings) {
    super(indexSettings, name, settings);

    final Set<?> configuredStopWords = Analysis.parseStopWords(
            env,
            indexSettings.getIndexVersionCreated(),
            settings,
            JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode tokenizerMode = KuromojiTokenizerFactory.getMode(settings);
    final UserDictionary dictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);

    // The analyzer receives its own copy of the parsed stop words.
    final CharArraySet stopSet = CharArraySet.copy(configuredStopWords);
    this.analyzer = new JapaneseAnalyzer(
            dictionary, tokenizerMode, stopSet, JapaneseAnalyzer.getDefaultStopTags());
}
项目:easyjasub    文件:LuceneParser.java   
/**
 * Creates a parser backed by a {@link JapaneseAnalyzer} in
 * {@code JapaneseTokenizer.Mode.NORMAL} with no user dictionary.
 *
 * <p>When {@code ignoreDefaultWordSet} is {@code true}, the analyzer is given
 * Lucene's default stop set and stop tags; when {@code false}, it is given an
 * empty case-insensitive stop set and an empty stop-tag set.
 * NOTE(review): the flag presumably means "let the analyzer ignore the words
 * in the default set" - confirm the naming is not inverted.
 *
 * @param ignoreDefaultWordSet whether the default stop words and tags are applied
 * @throws IOException declared by the constructor signature
 */
public LuceneParser(boolean ignoreDefaultWordSet) throws IOException {
    final CharArraySet stopSet;
    final Set<String> stopTags;
    if (ignoreDefaultWordSet) {
        stopSet = JapaneseAnalyzer.getDefaultStopSet();
        stopTags = JapaneseAnalyzer.getDefaultStopTags();
    } else {
        // No filtering: empty stop set (case-insensitive) and no stop tags.
        stopSet = new CharArraySet(new ArrayList<String>(), true);
        stopTags = new HashSet<String>();
    }
    analyzer = new JapaneseAnalyzer(null, JapaneseTokenizer.Mode.NORMAL, stopSet, stopTags);
}
项目:elasticsearch_my    文件:KuromojiAnalyzerProvider.java   
/**
 * Returns the analyzer held by this provider.
 *
 * @return the {@link JapaneseAnalyzer} stored in the {@code analyzer} field
 */
@Override
public JapaneseAnalyzer get() {
    return analyzer;
}
项目:elasticsearch-analysis-ja    文件:KuromojiAnalyzerProvider.java   
/**
 * Returns the analyzer held by this provider.
 *
 * @return the {@link JapaneseAnalyzer} stored in the {@code analyzer} field
 */
@Override
public JapaneseAnalyzer get() {
    return analyzer;
}
项目:elasticsearch-analysis-ja    文件:JapaneseStopTokenFilterFactory.java   
/**
 * Creates the factory from its settings.
 *
 * <p>Reads {@code ignore_case} (default {@code false}) and
 * {@code remove_trailing} (default {@code true}), then parses the
 * {@code stopwords} setting with the {@link JapaneseAnalyzer} default stop
 * set passed as the default value.
 *
 * @param indexSettings settings of the index this filter belongs to
 * @param env           environment passed to the word-list parser
 * @param name          filter name, forwarded to the superclass
 * @param settings      filter settings
 */
public JapaneseStopTokenFilterFactory(final IndexSettings indexSettings, final Environment env, final String name, final Settings settings) {
    super(indexSettings, name, settings);

    ignoreCase = settings.getAsBoolean("ignore_case", false);
    removeTrailing = settings.getAsBoolean("remove_trailing", true);

    // ignoreCase must be assigned before this call because it is passed to the parser.
    stopWords = Analysis.parseWords(
            env, settings, "stopwords", JapaneseAnalyzer.getDefaultStopSet(), NAMED_STOP_WORDS, ignoreCase);
}