Java 类org.apache.lucene.analysis.ja.JapaneseTokenizer 实例源码

项目:incubator-hivemall    文件:KuromojiUDF.java   
/**
 * Resolves the tokenization mode from a constant string argument.
 *
 * <p>The name is matched case-insensitively against NORMAL, SEARCH, EXTENDED
 * and DEFAULT. A {@code null} constant yields {@link Mode#NORMAL}, while
 * DEFAULT maps to {@link JapaneseTokenizer#DEFAULT_MODE}.
 *
 * @param oi object inspector from which the constant string is read
 * @return the resolved mode, never {@code null}
 * @throws UDFArgumentException if the string is not one of the accepted names
 */
@Nonnull
private static Mode tokenizationMode(@Nonnull final ObjectInspector oi)
        throws UDFArgumentException {
    final String modeName = HiveUtils.getConstString(oi);
    if (modeName == null) {
        return Mode.NORMAL;
    }
    if ("NORMAL".equalsIgnoreCase(modeName)) {
        return Mode.NORMAL;
    }
    if ("SEARCH".equalsIgnoreCase(modeName)) {
        return Mode.SEARCH;
    }
    if ("EXTENDED".equalsIgnoreCase(modeName)) {
        return Mode.EXTENDED;
    }
    if ("DEFAULT".equalsIgnoreCase(modeName)) {
        return JapaneseTokenizer.DEFAULT_MODE;
    }
    throw new UDFArgumentException(
        "Expected NORMAL|SEARCH|EXTENDED|DEFAULT but got an unexpected mode: " + modeName);
}
项目:analyzers-ja    文件:KanjiNumberFilterTest.java   
/**
 * Manual, normally ignored test that analyzes {@code sample1.txt} twice:
 * once with a bare {@link JapaneseTokenizer} in SEARCH mode and once with
 * the {@code analyzer} of the enclosing test, writing each result to its
 * own output file.
 *
 * <p>The readers and writers are opened in try-with-resources blocks so they
 * are flushed and closed even when {@code analyze} throws. Closing a stream
 * that {@code analyze} may already have closed is harmless.
 *
 * @throws IOException if a file cannot be opened, read or written
 */
@Ignore("Used for detailed testing")
@Test
public void testLargeData() throws IOException {
    final String inputFilename = "sample1.txt";
    final String tokenizedOutput = "sample1.tok.txt";
    final String normalizedOutput = "sample1.tok.norm.txt";

    final Analyzer plainAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(
                final String fieldName) {
            final Tokenizer tokenizer = new JapaneseTokenizer(
                    newAttributeFactory(), null, false,
                    JapaneseTokenizer.Mode.SEARCH);
            return new TokenStreamComponents(tokenizer);
        }
    };

    // Pass 1: tokenizer output only.
    try (BufferedReader in = new BufferedReader(new FileReader(inputFilename));
            BufferedWriter out = new BufferedWriter(new FileWriter(tokenizedOutput))) {
        analyze(plainAnalyzer, in, out);
    }

    // Pass 2: output of the analyzer under test.
    try (BufferedReader in = new BufferedReader(new FileReader(inputFilename));
            BufferedWriter out = new BufferedWriter(new FileWriter(normalizedOutput))) {
        analyze(analyzer, in, out);
    }
}
项目:analyzers-ja    文件:PosConcatenationFilterTest.java   
/**
 * Runs {@code PosConcatenationFilter} with an empty set of part-of-speech
 * tags and checks that the sample sentence is emitted as six separate
 * tokens with the listed offsets and position increments.
 *
 * @throws IOException if analysis fails
 */
@Test
public void testNoPos() throws IOException {
    final Set<String> noTags = new HashSet<>();
    final Analyzer posAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer source = new JapaneseTokenizer(null, false, JapaneseTokenizer.Mode.SEARCH);
            final PartOfSpeechAttribute pos = source.addAttribute(PartOfSpeechAttribute.class);
            // The filter obtains the current token's part of speech through this supplier.
            final PosConcatenationFilter.PartOfSpeechSupplier supplier =
                    new PosConcatenationFilter.PartOfSpeechSupplier() {
                @Override
                public String get() {
                    return pos.getPartOfSpeech();
                }
            };
            return new TokenStreamComponents(source,
                    new PosConcatenationFilter(source, noTags, supplier));
        }
    };

    final String[] terms = { "明日", "は", "詳細", "設計", "です", "。" };
    final int[] startOffsets = { 0, 2, 3, 5, 7, 9 };
    final int[] endOffsets = { 2, 3, 5, 7, 9, 10 };
    final int[] posIncrements = { 1, 1, 1, 1, 1, 1 };
    assertAnalyzesTo(posAnalyzer, "明日は詳細設計です。",
            terms, startOffsets, endOffsets, posIncrements);
}
项目:elasticsearch-analysis-ja    文件:ReloadableKuromojiTokenizerFactory.java   
/**
 * Creates a wrapper around a freshly built {@link JapaneseTokenizer}.
 *
 * <p>After recording the current dictionary timestamp, the constructor uses
 * reflection to copy the tokenizer's {@code attributes},
 * {@code attributeImpls} and {@code currentState} fields (declared on
 * {@link AttributeSource}) into this instance, so both objects refer to the
 * same values.
 *
 * @throws IllegalStateException if any of the fields cannot be read or written
 */
TokenizerWrapper() {
    super();

    tokenizerTimestamp = dictionaryTimestamp;
    tokenizer = new JapaneseTokenizer(userDictionary,
            discartPunctuation, mode);

    // Processed in this order: attributes, attributeImpls, currentState.
    final String[] sharedFieldNames = { "attributes", "attributeImpls", "currentState" };
    try {
        for (final String fieldName : sharedFieldNames) {
            final Field sharedField = getAccessibleField(AttributeSource.class, fieldName);
            sharedField.set(this, sharedField.get(tokenizer));
        }
    } catch (final Exception e) {
        throw new IllegalStateException(
                "Failed to update the tokenizer.", e);
    }
}
项目:fess-solr-plugin    文件:TestJapaneseNumberFilter.java   
/**
 * Manual, normally ignored test that analyzes {@code sample1.txt} twice:
 * once with a bare {@link JapaneseTokenizer} in SEARCH mode and once with
 * the {@code analyzer} of the enclosing test, writing each result to its
 * own output file.
 *
 * <p>Every reader and writer is closed in a {@code finally} block so nothing
 * leaks when {@code analyze} throws. Closing a stream that {@code analyze}
 * may already have closed is harmless.
 *
 * @throws IOException if a file cannot be opened, read or written
 */
@Ignore("Used for detailed testing")
@Test
public void testLargeData() throws IOException {
    final String inputFilename = "sample1.txt";
    final String tokenizedOutput = "sample1.tok.txt";
    final String normalizedOutput = "sample1.tok.norm.txt";

    final Analyzer plainAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(
                final String fieldName, final Reader reader) {
            final Tokenizer tokenizer = new JapaneseTokenizer(
                    newAttributeFactory(), null, null, false,
                    JapaneseTokenizer.Mode.SEARCH);
            return new TokenStreamComponents(tokenizer);
        }
    };

    // Pass 1: tokenizer output only.
    final BufferedReader plainIn = new BufferedReader(new FileReader(inputFilename));
    try {
        final BufferedWriter plainOut = new BufferedWriter(new FileWriter(tokenizedOutput));
        try {
            analyze(plainAnalyzer, plainIn, plainOut);
        } finally {
            plainOut.close();
        }
    } finally {
        plainIn.close();
    }

    // Pass 2: output of the analyzer under test.
    final BufferedReader normIn = new BufferedReader(new FileReader(inputFilename));
    try {
        final BufferedWriter normOut = new BufferedWriter(new FileWriter(normalizedOutput));
        try {
            analyze(analyzer, normIn, normOut);
        } finally {
            normOut.close();
        }
    } finally {
        normIn.close();
    }
}
项目:elasticsearch_my    文件:KuromojiAnalyzerProvider.java   
/**
 * Builds the provider's {@link JapaneseAnalyzer} from index settings.
 *
 * <p>Stop words are parsed from the settings with
 * {@link JapaneseAnalyzer#getDefaultStopSet()} passed as the default set.
 * The tokenization mode and user dictionary come from
 * {@code KuromojiTokenizerFactory}, and the default stop tags are always used.
 *
 * @param indexSettings index-level settings
 * @param env           environment used to resolve resources
 * @param name          analyzer name
 * @param settings      analyzer settings
 */
public KuromojiAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    final Set<?> parsedStopWords = Analysis.parseStopWords(
        env, indexSettings.getIndexVersionCreated(), settings, JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode tokenizerMode = KuromojiTokenizerFactory.getMode(settings);
    final UserDictionary dictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);
    final CharArraySet stopSet = CharArraySet.copy(parsedStopWords);
    analyzer = new JapaneseAnalyzer(dictionary, tokenizerMode, stopSet, JapaneseAnalyzer.getDefaultStopTags());
}
项目:elasticsearch_my    文件:KuromojiTokenizerFactory.java   
/**
 * Reads the {@code "mode"} setting and maps it to a tokenizer mode.
 *
 * <p>The values {@code search}, {@code normal} and {@code extended} are
 * matched case-insensitively. A missing or unrecognized value yields
 * {@link JapaneseTokenizer#DEFAULT_MODE}.
 *
 * @param settings settings to read from
 * @return the selected mode
 */
public static JapaneseTokenizer.Mode getMode(Settings settings) {
    final String configured = settings.get("mode", null);
    if (configured == null) {
        return JapaneseTokenizer.DEFAULT_MODE;
    }
    if ("search".equalsIgnoreCase(configured)) {
        return JapaneseTokenizer.Mode.SEARCH;
    }
    if ("normal".equalsIgnoreCase(configured)) {
        return JapaneseTokenizer.Mode.NORMAL;
    }
    if ("extended".equalsIgnoreCase(configured)) {
        return JapaneseTokenizer.Mode.EXTENDED;
    }
    // Unrecognized values fall back to the default rather than failing.
    return JapaneseTokenizer.DEFAULT_MODE;
}
项目:elasticsearch_my    文件:KuromojiTokenizerFactory.java   
/**
 * Creates a {@link JapaneseTokenizer} configured with this factory's user
 * dictionary, punctuation flag and mode.
 *
 * <p>The n-best cost applied is the configured cost, or, when n-best
 * examples are set, the larger of that cost and the cost the tokenizer
 * calculates from the examples.
 *
 * @return the configured tokenizer
 */
@Override
public Tokenizer create() {
    final JapaneseTokenizer tokenizer = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
    final int effectiveCost = (nBestExamples == null)
            ? this.nBestCost
            : Math.max(this.nBestCost, tokenizer.calcNBestCost(nBestExamples));
    tokenizer.setNBestCost(effectiveCost);
    return tokenizer;
}
项目:elasticsearch_my    文件:KuromojiAnalysisTests.java   
/**
 * Checks that the test analysis registers the expected Kuromoji tokenizer,
 * token filter, analyzer and char filter implementations under their names.
 *
 * @throws IOException if the test analysis cannot be created
 */
public void testDefaultsKuromojiAnalysis() throws IOException {
    final TestAnalysis analysis = createTestAnalysis();

    // Tokenizer.
    assertThat(analysis.tokenizer.get("kuromoji_tokenizer"),
            instanceOf(KuromojiTokenizerFactory.class));

    // Token filters, checked in the same order as registered names below.
    assertThat(analysis.tokenFilter.get("kuromoji_part_of_speech"),
            instanceOf(KuromojiPartOfSpeechFilterFactory.class));
    assertThat(analysis.tokenFilter.get("kuromoji_readingform"),
            instanceOf(KuromojiReadingFormFilterFactory.class));
    assertThat(analysis.tokenFilter.get("kuromoji_baseform"),
            instanceOf(KuromojiBaseFormFilterFactory.class));
    assertThat(analysis.tokenFilter.get("kuromoji_stemmer"),
            instanceOf(KuromojiKatakanaStemmerFactory.class));
    assertThat(analysis.tokenFilter.get("ja_stop"),
            instanceOf(JapaneseStopTokenFilterFactory.class));
    assertThat(analysis.tokenFilter.get("kuromoji_number"),
            instanceOf(KuromojiNumberFilterFactory.class));

    // Analyzers.
    final IndexAnalyzers registered = analysis.indexAnalyzers;
    final NamedAnalyzer kuromoji = registered.get("kuromoji");
    assertThat(kuromoji.analyzer(), instanceOf(JapaneseAnalyzer.class));

    final NamedAnalyzer custom = registered.get("my_analyzer");
    assertThat(custom.analyzer(), instanceOf(CustomAnalyzer.class));
    assertThat(custom.analyzer().tokenStream(null, new StringReader("")),
            instanceOf(JapaneseTokenizer.class));

    // Char filter.
    assertThat(analysis.charFilter.get("kuromoji_iteration_mark"),
            instanceOf(KuromojiIterationMarkCharFilterFactory.class));
}
项目:elasticsearch_my    文件:KuromojiAnalysisTests.java   
/**
 * Applies the {@code kuromoji_pos} token filter to a SEARCH-mode tokenizer
 * and checks the surviving tokens of the sample sentence.
 *
 * <p>NOTE(review): despite the method name, the filter looked up and
 * asserted here is the part-of-speech filter factory.
 *
 * @throws IOException if the test analysis cannot be created or analysis fails
 */
public void testBaseFormFilterFactory() throws IOException {
    final TestAnalysis analysis = createTestAnalysis();
    final TokenFilterFactory posFilterFactory = analysis.tokenFilter.get("kuromoji_pos");
    assertThat(posFilterFactory, instanceOf(KuromojiPartOfSpeechFilterFactory.class));

    final Tokenizer source = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    source.setReader(new StringReader("私は制限スピードを超える。"));
    assertSimpleTSOutput(posFilterFactory.create(source),
            new String[]{"私", "は", "制限", "スピード", "を"});
}
项目:elasticsearch_my    文件:KuromojiAnalysisTests.java   
/**
 * Applies the {@code ja_stop} token filter to a SEARCH-mode tokenizer and
 * checks that only the expected three tokens of the sample sentence remain.
 *
 * @throws IOException if the test analysis cannot be created or analysis fails
 */
public void testJapaneseStopFilterFactory() throws IOException {
    final TestAnalysis analysis = createTestAnalysis();
    final TokenFilterFactory stopFilterFactory = analysis.tokenFilter.get("ja_stop");
    assertThat(stopFilterFactory, instanceOf(JapaneseStopTokenFilterFactory.class));

    final Tokenizer source = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    source.setReader(new StringReader("私は制限スピードを超える。"));
    assertSimpleTSOutput(stopFilterFactory.create(source),
            new String[]{"私", "制限", "超える"});
}
项目:elasticsearch_my    文件:KuromojiAnalysisTests.java   
/**
 * Applies the {@code kuromoji_number} token filter to a SEARCH-mode
 * tokenizer and checks that the kanji numeral in the sample sentence is
 * emitted as the single token {@code 102500}.
 *
 * @throws Exception if the test analysis cannot be created or analysis fails
 */
public void testNumberFilterFactory() throws Exception {
    final TestAnalysis analysis = createTestAnalysis();
    final TokenFilterFactory numberFilterFactory = analysis.tokenFilter.get("kuromoji_number");
    assertThat(numberFilterFactory, instanceOf(KuromojiNumberFilterFactory.class));

    final Tokenizer source = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    source.setReader(new StringReader("本日十万二千五百円のワインを買った"));
    assertSimpleTSOutput(numberFilterFactory.create(source),
            new String[]{"本日", "102500", "円", "の", "ワイン", "を", "買っ", "た"});
}
项目:analyzers-ja    文件:KanjiNumberFilterTest.java   
/**
 * Builds the analysis chain: a SEARCH-mode {@link JapaneseTokenizer}
 * (no user dictionary, punctuation kept) followed by a
 * {@code KanjiNumberFilter}.
 *
 * @param fieldName name of the field being analyzed (not used here)
 * @return the tokenizer and the filter wrapping it
 */
@Override
protected TokenStreamComponents createComponents(
        final String fieldName) {
    final Tokenizer source = new JapaneseTokenizer(null,
            false, JapaneseTokenizer.Mode.SEARCH);
    final KanjiNumberFilter numberFilter = new KanjiNumberFilter(source);
    return new TokenStreamComponents(source, numberFilter);
}
项目:analyzers-ja    文件:PosConcatenationFilterTest.java   
/**
 * Runs {@code PosConcatenationFilter} with five noun part-of-speech tags and
 * checks, for two sample sentences, the tokens, offsets and position
 * increments it emits (for example "詳細設計" as a single token).
 *
 * @throws IOException if analysis fails
 */
@Test
public void testBasic() throws IOException {
    final Set<String> nounTags = new HashSet<>();
    final String[] tagNames = {
        "名詞-副詞可能", "名詞-形容動詞語幹", "名詞-サ変接続", "名詞-一般", "名詞-接尾-一般"
    };
    for (final String tagName : tagNames) {
        nounTags.add(tagName);
    }

    final Analyzer posAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer source = new JapaneseTokenizer(null, false, JapaneseTokenizer.Mode.SEARCH);
            final PartOfSpeechAttribute pos = source.addAttribute(PartOfSpeechAttribute.class);
            // The filter obtains the current token's part of speech through this supplier.
            final PosConcatenationFilter.PartOfSpeechSupplier supplier =
                    new PosConcatenationFilter.PartOfSpeechSupplier() {
                @Override
                public String get() {
                    return pos.getPartOfSpeech();
                }
            };
            return new TokenStreamComponents(source,
                    new PosConcatenationFilter(source, nounTags, supplier));
        }
    };

    assertAnalyzesTo(posAnalyzer, "歯科医院の歯科衛生士",
            new String[] { "歯科医院", "の", "歯科衛生士" },
            new int[] { 0, 4, 5 },
            new int[] { 4, 5, 10 },
            new int[] { 1, 1, 1 });

    assertAnalyzesTo(posAnalyzer, "明日は詳細設計です。",
            new String[] { "明日", "は", "詳細設計", "です", "。" },
            new int[] { 0, 2, 3, 7, 9 },
            new int[] { 2, 3, 7, 9, 10 },
            new int[] { 1, 1, 1, 1, 1 });
}
项目:elasticsearch-analysis-ja    文件:KuromojiAnalyzerProvider.java   
/**
 * Builds the provider's {@link JapaneseAnalyzer} from index settings.
 *
 * <p>Stop words are parsed from the settings with
 * {@link JapaneseAnalyzer#getDefaultStopSet()} passed as the default set.
 * The tokenization mode and user dictionary come from
 * {@code KuromojiTokenizerFactory}, and the default stop tags are always used.
 *
 * @param indexSettings index-level settings
 * @param env           environment used to resolve resources
 * @param name          analyzer name
 * @param settings      analyzer settings
 */
public KuromojiAnalyzerProvider(final IndexSettings indexSettings, final Environment env, final String name, final Settings settings) {
    super(indexSettings, name, settings);
    final Set<?> parsedStopWords = Analysis.parseStopWords(
            env, indexSettings.getIndexVersionCreated(), settings, JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode tokenizerMode = KuromojiTokenizerFactory.getMode(settings);
    final UserDictionary dictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);
    final CharArraySet stopSet = CharArraySet.copy(parsedStopWords);
    analyzer = new JapaneseAnalyzer(dictionary, tokenizerMode, stopSet, JapaneseAnalyzer.getDefaultStopTags());
}
项目:elasticsearch-analysis-ja    文件:KuromojiTokenizerFactory.java   
/**
 * Reads the {@code "mode"} setting and maps it to a tokenizer mode.
 *
 * <p>The values {@code search}, {@code normal} and {@code extended} are
 * matched case-insensitively. A missing or unrecognized value yields
 * {@link JapaneseTokenizer#DEFAULT_MODE}.
 *
 * @param settings settings to read from
 * @return the selected mode
 */
public static JapaneseTokenizer.Mode getMode(final Settings settings) {
    final String configured = settings.get("mode", null);
    if (configured == null) {
        return JapaneseTokenizer.DEFAULT_MODE;
    }
    if ("search".equalsIgnoreCase(configured)) {
        return JapaneseTokenizer.Mode.SEARCH;
    }
    if ("normal".equalsIgnoreCase(configured)) {
        return JapaneseTokenizer.Mode.NORMAL;
    }
    if ("extended".equalsIgnoreCase(configured)) {
        return JapaneseTokenizer.Mode.EXTENDED;
    }
    // Unrecognized values fall back to the default rather than failing.
    return JapaneseTokenizer.DEFAULT_MODE;
}
项目:elasticsearch-analysis-ja    文件:KuromojiTokenizerFactory.java   
/**
 * Creates a {@link JapaneseTokenizer} configured with this factory's user
 * dictionary, punctuation flag and mode.
 *
 * <p>The n-best cost applied is the configured cost, or, when n-best
 * examples are set, the larger of that cost and the cost the tokenizer
 * calculates from the examples.
 *
 * @return the configured tokenizer
 */
@Override
public Tokenizer create() {
    final JapaneseTokenizer tokenizer = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
    final int effectiveCost = (nBestExamples == null)
            ? this.nBestCost
            : Math.max(this.nBestCost, tokenizer.calcNBestCost(nBestExamples));
    tokenizer.setNBestCost(effectiveCost);
    return tokenizer;
}
项目:easyjasub    文件:LuceneParser.java   
/**
 * Creates a parser backed by a {@link JapaneseAnalyzer} in NORMAL mode
 * without a user dictionary.
 *
 * <p>When {@code ignoreDefaultWordSet} is {@code true} the analyzer is given
 * {@link JapaneseAnalyzer#getDefaultStopSet()} and
 * {@link JapaneseAnalyzer#getDefaultStopTags()}; otherwise it is given an
 * empty stop set and an empty stop-tag set.
 *
 * <p>NOTE(review): the flag presumably means "drop words in the default
 * set" - confirm that this matches the callers' expectation.
 *
 * @param ignoreDefaultWordSet selects the default stop set/tags when true
 * @throws IOException declared by the constructor signature
 */
public LuceneParser(boolean ignoreDefaultWordSet) throws IOException {
    final CharArraySet stopSet;
    if (ignoreDefaultWordSet) {
        stopSet = JapaneseAnalyzer.getDefaultStopSet();
    } else {
        stopSet = new CharArraySet(new ArrayList<String>(), true);
    }

    final Set<String> stopTags;
    if (ignoreDefaultWordSet) {
        stopTags = JapaneseAnalyzer.getDefaultStopTags();
    } else {
        stopTags = new HashSet<String>();
    }

    analyzer = new JapaneseAnalyzer(null,
            JapaneseTokenizer.Mode.NORMAL, stopSet, stopTags);
}
项目:NYBC    文件:JapaneseTokenizerFactory.java   
/**
 * Resolves the tokenizer mode from the factory arguments.
 *
 * <p>The value stored under {@code MODE} is upper-cased with
 * {@link Locale#ROOT} and passed to {@code Mode.valueOf}. When the argument
 * is absent, {@link JapaneseTokenizer#DEFAULT_MODE} is returned.
 *
 * @param args factory arguments
 * @return the resolved mode
 */
private Mode getMode(Map<String, String> args) {
  final String configured = args.get(MODE);
  if (configured == null) {
    return JapaneseTokenizer.DEFAULT_MODE;
  }
  return Mode.valueOf(configured.toUpperCase(Locale.ROOT));
}
项目:langpi    文件:JaSegmenter.java   
/**
 * Splits Japanese text into surface-form tokens with a SEARCH-mode
 * {@link JapaneseTokenizer} that discards punctuation.
 *
 * <p>As a side effect, {@code JaStemmer.lemma} is cleared and then filled
 * with a surface-form to base-form entry for every token whose base form is
 * not {@code null}.
 *
 * <p>The tokenizer follows the TokenStream workflow
 * (reset, incrementToken, end, close) and is closed in a {@code finally}
 * block, so it is released even when tokenization fails part-way.
 *
 * @param text the text to segment
 * @return the tokens in order of appearance; on an I/O error, the tokens
 *         collected before the failure
 */
@Override
public List<String> segmentWords(String text) {

    List<String> ret = new ArrayList<String>();

    StringReader textreader = new StringReader(text);
    JapaneseTokenizer segmenter =
            new JapaneseTokenizer(textreader, null, true, JapaneseTokenizer.Mode.SEARCH);

    JaStemmer.lemma.clear();
    CharTermAttribute termAtt = segmenter.getAttribute(CharTermAttribute.class);
    BaseFormAttribute baseAtt = segmenter.getAttribute(BaseFormAttribute.class);
    try {
        try {
            segmenter.reset();
            while (segmenter.incrementToken()) {
                String term = termAtt.toString();
                ret.add(term);
                String baseForm = baseAtt.getBaseForm();
                if (baseForm != null) {
                    JaStemmer.lemma.put(term, baseForm);
                }
            }
            // Signal end-of-stream before closing, as the TokenStream contract requires.
            segmenter.end();
        } finally {
            segmenter.close();
        }
    } catch (IOException e) {
        // Best effort: report the failure and return what was collected so far.
        e.printStackTrace();
    }

    return ret;
}
项目:fess-solr-plugin    文件:TestJapaneseNumberFilter.java   
/**
 * Builds the analysis chain: a SEARCH-mode {@link JapaneseTokenizer} over
 * the given reader (no user dictionary, punctuation kept) followed by a
 * {@code JapaneseNumberFilter}.
 *
 * @param fieldName name of the field being analyzed (not used here)
 * @param reader    source of the text to tokenize
 * @return the tokenizer and the filter wrapping it
 */
@Override
protected TokenStreamComponents createComponents(
        final String fieldName, final Reader reader) {
    final Tokenizer source = new JapaneseTokenizer(reader, null,
            false, JapaneseTokenizer.Mode.SEARCH);
    final JapaneseNumberFilter numberFilter = new JapaneseNumberFilter(source);
    return new TokenStreamComponents(source, numberFilter);
}
项目:elasticsearch-analysis-ja    文件:ReloadableKuromojiTokenizerFactory.java   
/**
 * Creates a tokenizer factory whose user dictionary file can be monitored
 * for changes.
 *
 * <p>The constructor reads the mode, user dictionary and
 * {@code discard_punctuation} flag (default {@code true}) from the settings,
 * then looks up the reflective field handles it needs on {@link Tokenizer}
 * and {@link JapaneseTokenizer}. The dictionary timestamp starts at the
 * current time. If the {@code user_dictionary} setting names an existing
 * file under the config directory, that file is remembered, the timestamp
 * becomes its last-modified time, and {@code reload_interval} (default one
 * minute) is read.
 *
 * @param indexSettings index-level settings
 * @param env           environment used to resolve the config directory
 * @param name          tokenizer name
 * @param settings      tokenizer settings
 * @throws IllegalArgumentException if inspecting the dictionary file fails
 */
public ReloadableKuromojiTokenizerFactory(final IndexSettings indexSettings, final Environment env, final String name, final Settings settings) {
    super(indexSettings, name, settings);
    this.env = env;
    this.settings = settings;
    mode = KuromojiTokenizerFactory.getMode(settings);
    userDictionary = KuromojiTokenizerFactory.getUserDictionary(env,
            settings);
    discartPunctuation = settings.getAsBoolean("discard_punctuation", true);

    inputPendingField = getAccessibleField(Tokenizer.class, "inputPending");
    userDictionaryField = getAccessibleField(JapaneseTokenizer.class, "userDictionary");
    userFSTField = getAccessibleField(JapaneseTokenizer.class, "userFST");
    userFSTReaderField = getAccessibleField(JapaneseTokenizer.class, "userFSTReader");
    dictionaryMapField = getAccessibleField(JapaneseTokenizer.class, "dictionaryMap");

    // Default timestamp; replaced below when a dictionary file is found.
    dictionaryTimestamp = System.currentTimeMillis();

    final String dictionaryPath = settings.get("user_dictionary");
    if (dictionaryPath == null) {
        return;
    }

    final Path resolvedPath = env.configFile().resolve(dictionaryPath);
    try {
        final File candidate = resolvedPath.toFile();
        if (!candidate.exists()) {
            return;
        }

        reloadableFile = candidate;
        dictionaryTimestamp = reloadableFile.lastModified();

        final TimeValue defaultInterval = TimeValue.timeValueMinutes(1);
        reloadInterval = settings.getAsTime("reload_interval", defaultInterval).getMillis();

        if (VERBOSE) {
            System.out.println("Check "
                    + reloadableFile.getAbsolutePath()
                    + " (interval: " + reloadInterval + "ms)");
        }
    } catch (final Exception e) {
        throw new IllegalArgumentException(
                "Could not access " + dictionaryPath, e);
    }
}
项目:NYBC    文件:JapaneseTokenizerFactory.java   
/**
 * Creates a {@link JapaneseTokenizer} over the given reader using this
 * factory's user dictionary, punctuation flag and mode.
 *
 * @param input source of the text to tokenize
 * @return the new tokenizer
 */
@Override
public Tokenizer create(Reader input) {
  final JapaneseTokenizer tokenizer =
      new JapaneseTokenizer(input, userDictionary, discardPunctuation, mode);
  return tokenizer;
}