/**
 * Resolves the Kuromoji tokenization {@link Mode} from a constant string UDF argument.
 *
 * @param oi inspector of the constant mode argument
 * @return the requested mode; {@code Mode.NORMAL} when the argument is absent/null
 * @throws UDFArgumentException when the value is not NORMAL, SEARCH, EXTENDED or DEFAULT
 */
@Nonnull
private static Mode tokenizationMode(@Nonnull final ObjectInspector oi)
        throws UDFArgumentException {
    final String arg = HiveUtils.getConstString(oi);
    if (arg == null) {
        return Mode.NORMAL; // unspecified -> NORMAL, as before
    }
    // Guard-clause chain; comparisons are case-insensitive.
    if ("NORMAL".equalsIgnoreCase(arg)) {
        return Mode.NORMAL;
    }
    if ("SEARCH".equalsIgnoreCase(arg)) {
        return Mode.SEARCH;
    }
    if ("EXTENDED".equalsIgnoreCase(arg)) {
        return Mode.EXTENDED;
    }
    if ("DEFAULT".equalsIgnoreCase(arg)) {
        return JapaneseTokenizer.DEFAULT_MODE;
    }
    throw new UDFArgumentException(
        "Expected NORMAL|SEARCH|EXTENDED|DEFAULT but got an unexpected mode: " + arg);
}
/**
 * Tokenizes a large sample file twice (plain SEARCH-mode analyzer vs. the analyzer under
 * test) and writes both token streams to files for manual diffing.
 *
 * Fix: the original used FileReader/FileWriter, which rely on the platform default
 * charset and would corrupt Japanese text on non-UTF-8 systems (e.g. Windows).
 * Streams are now opened with an explicit UTF-8 charset.
 * NOTE(review): assumes sample1.txt is UTF-8 encoded — confirm against the fixture.
 */
@Ignore("Used for detailed testing")
@Test
public void testLargeData() throws IOException {
    final String inputFilename = "sample1.txt";
    final String tokenizedOutput = "sample1.tok.txt";
    final String normalizedOutput = "sample1.tok.norm.txt";
    final Analyzer plainAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName) {
            final Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null,
                false, JapaneseTokenizer.Mode.SEARCH);
            return new TokenStreamComponents(tokenizer);
        }
    };
    // Fully-qualified stream classes are used so no new imports are required.
    analyze(plainAnalyzer,
        new BufferedReader(new java.io.InputStreamReader(
            new java.io.FileInputStream(inputFilename),
            java.nio.charset.StandardCharsets.UTF_8)),
        new BufferedWriter(new java.io.OutputStreamWriter(
            new java.io.FileOutputStream(tokenizedOutput),
            java.nio.charset.StandardCharsets.UTF_8)));
    analyze(analyzer,
        new BufferedReader(new java.io.InputStreamReader(
            new java.io.FileInputStream(inputFilename),
            java.nio.charset.StandardCharsets.UTF_8)),
        new BufferedWriter(new java.io.OutputStreamWriter(
            new java.io.FileOutputStream(normalizedOutput),
            java.nio.charset.StandardCharsets.UTF_8)));
}
@Test public void testNoPos() throws IOException { final Set<String> posTags = new HashSet<>(); Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(final String fieldName) { final Tokenizer tokenizer = new JapaneseTokenizer(null, false, JapaneseTokenizer.Mode.SEARCH); final PartOfSpeechAttribute posAtt = tokenizer.addAttribute(PartOfSpeechAttribute.class); return new TokenStreamComponents(tokenizer, new PosConcatenationFilter(tokenizer, posTags, new PosConcatenationFilter.PartOfSpeechSupplier() { @Override public String get() { return posAtt.getPartOfSpeech(); } })); } }; assertAnalyzesTo(analyzer, "明日は詳細設計です。", // new String[] { "明日", "は", "詳細", "設計", "です", "。" }, // new int[] { 0, 2, 3, 5, 7, 9 }, // new int[] { 2, 3, 5, 7, 9, 10 }, // new int[] { 1, 1, 1, 1, 1, 1 }); }
/**
 * Builds a fresh JapaneseTokenizer and makes this wrapper share its AttributeSource
 * state, so that attributes added on either object are visible on both.
 */
TokenizerWrapper() {
    super();
    tokenizerTimestamp = dictionaryTimestamp;
    tokenizer = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
    try {
        // Copy each internal AttributeSource field from the wrapped tokenizer.
        shareAttributeField("attributes");
        shareAttributeField("attributeImpls");
        shareAttributeField("currentState");
    } catch (final Exception e) {
        throw new IllegalStateException("Failed to update the tokenizer.", e);
    }
}

/** Copies the named AttributeSource field from the wrapped tokenizer into this instance. */
private void shareAttributeField(final String fieldName) throws Exception {
    final Field field = getAccessibleField(AttributeSource.class, fieldName);
    field.set(this, field.get(tokenizer));
}
/**
 * Tokenizes a large sample file twice (plain SEARCH-mode analyzer vs. the analyzer under
 * test) and writes both token streams to files for manual diffing.
 *
 * Fix: the original used FileReader/FileWriter, which rely on the platform default
 * charset and would corrupt Japanese text on non-UTF-8 systems (e.g. Windows).
 * Streams are now opened with an explicit UTF-8 charset.
 * NOTE(review): assumes sample1.txt is UTF-8 encoded — confirm against the fixture.
 */
@Ignore("Used for detailed testing")
@Test
public void testLargeData() throws IOException {
    final String inputFilename = "sample1.txt";
    final String tokenizedOutput = "sample1.tok.txt";
    final String normalizedOutput = "sample1.tok.norm.txt";
    final Analyzer plainAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(final String fieldName,
                final Reader reader) {
            final Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null,
                null, false, JapaneseTokenizer.Mode.SEARCH);
            return new TokenStreamComponents(tokenizer);
        }
    };
    // Fully-qualified stream classes are used so no new imports are required.
    analyze(plainAnalyzer,
        new BufferedReader(new java.io.InputStreamReader(
            new java.io.FileInputStream(inputFilename),
            java.nio.charset.StandardCharsets.UTF_8)),
        new BufferedWriter(new java.io.OutputStreamWriter(
            new java.io.FileOutputStream(tokenizedOutput),
            java.nio.charset.StandardCharsets.UTF_8)));
    analyze(analyzer,
        new BufferedReader(new java.io.InputStreamReader(
            new java.io.FileInputStream(inputFilename),
            java.nio.charset.StandardCharsets.UTF_8)),
        new BufferedWriter(new java.io.OutputStreamWriter(
            new java.io.FileOutputStream(normalizedOutput),
            java.nio.charset.StandardCharsets.UTF_8)));
}
/**
 * Creates the Kuromoji analyzer provider: stop words come from index settings
 * (falling back to the bundled Japanese stop set), and the tokenization mode and
 * user dictionary are resolved via {@link KuromojiTokenizerFactory}.
 */
public KuromojiAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    final CharArraySet defaultStops = JapaneseAnalyzer.getDefaultStopSet();
    final Set<?> stopWords = Analysis.parseStopWords(env,
        indexSettings.getIndexVersionCreated(), settings, defaultStops);
    final JapaneseTokenizer.Mode mode = KuromojiTokenizerFactory.getMode(settings);
    final UserDictionary userDictionary =
            KuromojiTokenizerFactory.getUserDictionary(env, settings);
    // Defensive copy of the stop-word set; stop tags use the library defaults.
    analyzer = new JapaneseAnalyzer(userDictionary, mode, CharArraySet.copy(stopWords),
        JapaneseAnalyzer.getDefaultStopTags());
}
/**
 * Reads the "mode" setting (search|normal|extended, case-insensitive) and maps it to a
 * tokenizer mode. Missing or unrecognized values yield the tokenizer's default mode.
 */
public static JapaneseTokenizer.Mode getMode(Settings settings) {
    String modeSetting = settings.get("mode", null);
    if (modeSetting == null) {
        return JapaneseTokenizer.DEFAULT_MODE;
    }
    if ("search".equalsIgnoreCase(modeSetting)) {
        return JapaneseTokenizer.Mode.SEARCH;
    }
    if ("normal".equalsIgnoreCase(modeSetting)) {
        return JapaneseTokenizer.Mode.NORMAL;
    }
    if ("extended".equalsIgnoreCase(modeSetting)) {
        return JapaneseTokenizer.Mode.EXTENDED;
    }
    // Unknown value: silently fall back to the default, matching the original behavior.
    return JapaneseTokenizer.DEFAULT_MODE;
}
/**
 * Builds a tokenizer for this factory's dictionary/mode, raising the configured
 * n-best cost when example sentences demand a higher one.
 */
@Override
public Tokenizer create() {
    final JapaneseTokenizer tokenizer =
            new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
    int cost = this.nBestCost;
    if (nBestExamples != null) {
        // The examples may require a larger cost than the configured one.
        cost = Math.max(cost, tokenizer.calcNBestCost(nBestExamples));
    }
    tokenizer.setNBestCost(cost);
    return tokenizer;
}
/** Verifies that every default Kuromoji component resolves to its expected factory type. */
public void testDefaultsKuromojiAnalysis() throws IOException {
    TestAnalysis analysis = createTestAnalysis();

    // Tokenizer factory.
    TokenizerFactory tokenizerFactory = analysis.tokenizer.get("kuromoji_tokenizer");
    assertThat(tokenizerFactory, instanceOf(KuromojiTokenizerFactory.class));

    // Token filter factories — one distinct local per lookup instead of reuse.
    TokenFilterFactory posFilter = analysis.tokenFilter.get("kuromoji_part_of_speech");
    assertThat(posFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));
    TokenFilterFactory readingFilter = analysis.tokenFilter.get("kuromoji_readingform");
    assertThat(readingFilter, instanceOf(KuromojiReadingFormFilterFactory.class));
    TokenFilterFactory baseFormFilter = analysis.tokenFilter.get("kuromoji_baseform");
    assertThat(baseFormFilter, instanceOf(KuromojiBaseFormFilterFactory.class));
    TokenFilterFactory stemmerFilter = analysis.tokenFilter.get("kuromoji_stemmer");
    assertThat(stemmerFilter, instanceOf(KuromojiKatakanaStemmerFactory.class));
    TokenFilterFactory stopFilter = analysis.tokenFilter.get("ja_stop");
    assertThat(stopFilter, instanceOf(JapaneseStopTokenFilterFactory.class));
    TokenFilterFactory numberFilter = analysis.tokenFilter.get("kuromoji_number");
    assertThat(numberFilter, instanceOf(KuromojiNumberFilterFactory.class));

    // Analyzers.
    IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
    NamedAnalyzer kuromoji = indexAnalyzers.get("kuromoji");
    assertThat(kuromoji.analyzer(), instanceOf(JapaneseAnalyzer.class));
    NamedAnalyzer custom = indexAnalyzers.get("my_analyzer");
    assertThat(custom.analyzer(), instanceOf(CustomAnalyzer.class));
    // NOTE(review): this TokenStream is never closed; acceptable in a test but a leak.
    assertThat(custom.analyzer().tokenStream(null, new StringReader("")),
        instanceOf(JapaneseTokenizer.class));

    // Char filter factory.
    CharFilterFactory charFilterFactory = analysis.charFilter.get("kuromoji_iteration_mark");
    assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
}
/** The kuromoji_pos filter should drop tokens with configured POS tags (here the verb). */
public void testBaseFormFilterFactory() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_pos");
    assertThat(tokenFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));

    // SEARCH-mode tokenizer, punctuation discarded.
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    String source = "私は制限スピードを超える。";
    tokenizer.setReader(new StringReader(source));
    String[] expected = {"私", "は", "制限", "スピード", "を"};
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
/** The ja_stop filter should remove Japanese stop words (particles etc.). */
public void testJapaneseStopFilterFactory() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("ja_stop");
    assertThat(tokenFilter, instanceOf(JapaneseStopTokenFilterFactory.class));

    // SEARCH-mode tokenizer, punctuation discarded.
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    String source = "私は制限スピードを超える。";
    tokenizer.setReader(new StringReader(source));
    String[] expected = {"私", "制限", "超える"};
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
/** The kuromoji_number filter should normalize kanji numerals to Arabic digits. */
public void testNumberFilterFactory() throws Exception {
    TestAnalysis analysis = createTestAnalysis();
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_number");
    assertThat(tokenFilter, instanceOf(KuromojiNumberFilterFactory.class));

    // SEARCH-mode tokenizer, punctuation discarded; 十万二千五百 -> 102500.
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    String source = "本日十万二千五百円のワインを買った";
    tokenizer.setReader(new StringReader(source));
    String[] expected = {"本日", "102500", "円", "の", "ワイン", "を", "買っ", "た"};
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
/**
 * Builds a SEARCH-mode tokenizer (punctuation kept) wrapped by a KanjiNumberFilter.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer source =
            new JapaneseTokenizer(null, false, JapaneseTokenizer.Mode.SEARCH);
    final KanjiNumberFilter sink = new KanjiNumberFilter(source);
    return new TokenStreamComponents(source, sink);
}
@Test public void testBasic() throws IOException { final Set<String> posTags = new HashSet<>(); posTags.add("名詞-副詞可能"); posTags.add("名詞-形容動詞語幹"); posTags.add("名詞-サ変接続"); posTags.add("名詞-一般"); posTags.add("名詞-接尾-一般"); Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(final String fieldName) { final Tokenizer tokenizer = new JapaneseTokenizer(null, false, JapaneseTokenizer.Mode.SEARCH); final PartOfSpeechAttribute posAtt = tokenizer.addAttribute(PartOfSpeechAttribute.class); return new TokenStreamComponents(tokenizer, new PosConcatenationFilter(tokenizer, posTags, new PosConcatenationFilter.PartOfSpeechSupplier() { @Override public String get() { return posAtt.getPartOfSpeech(); } })); } }; assertAnalyzesTo(analyzer, "歯科医院の歯科衛生士", // new String[] { "歯科医院", "の", "歯科衛生士" }, // new int[] { 0, 4, 5 }, // new int[] { 4, 5, 10 }, // new int[] { 1, 1, 1 }); assertAnalyzesTo(analyzer, "明日は詳細設計です。", // new String[] { "明日", "は", "詳細設計", "です", "。" }, // new int[] { 0, 2, 3, 7, 9 }, // new int[] { 2, 3, 7, 9, 10 }, // new int[] { 1, 1, 1, 1, 1 }); }
/**
 * Creates the Kuromoji analyzer provider. Stop words are parsed from index settings
 * with the bundled Japanese stop set as the default; mode and user dictionary are
 * resolved by {@link KuromojiTokenizerFactory}.
 */
public KuromojiAnalyzerProvider(final IndexSettings indexSettings, final Environment env,
        final String name, final Settings settings) {
    super(indexSettings, name, settings);
    final Set<?> stopWords = Analysis.parseStopWords(env,
        indexSettings.getIndexVersionCreated(), settings,
        JapaneseAnalyzer.getDefaultStopSet());
    final UserDictionary userDictionary =
            KuromojiTokenizerFactory.getUserDictionary(env, settings);
    final JapaneseTokenizer.Mode mode = KuromojiTokenizerFactory.getMode(settings);
    // Copy the stop words defensively; stop tags are the library defaults.
    analyzer = new JapaneseAnalyzer(userDictionary, mode, CharArraySet.copy(stopWords),
        JapaneseAnalyzer.getDefaultStopTags());
}
/**
 * Maps the optional "mode" setting (search|normal|extended, case-insensitive) to a
 * tokenizer mode, defaulting to {@code JapaneseTokenizer.DEFAULT_MODE} when the
 * setting is absent or unrecognized.
 */
public static JapaneseTokenizer.Mode getMode(final Settings settings) {
    final String modeSetting = settings.get("mode", null);
    if (modeSetting == null) {
        return JapaneseTokenizer.DEFAULT_MODE;
    }
    JapaneseTokenizer.Mode resolved = JapaneseTokenizer.DEFAULT_MODE;
    if ("search".equalsIgnoreCase(modeSetting)) {
        resolved = JapaneseTokenizer.Mode.SEARCH;
    } else if ("normal".equalsIgnoreCase(modeSetting)) {
        resolved = JapaneseTokenizer.Mode.NORMAL;
    } else if ("extended".equalsIgnoreCase(modeSetting)) {
        resolved = JapaneseTokenizer.Mode.EXTENDED;
    }
    return resolved;
}
/**
 * Creates a tokenizer for this factory's dictionary/mode. When example sentences are
 * configured, the n-best cost is raised to whatever the examples require.
 */
@Override
public Tokenizer create() {
    final JapaneseTokenizer tokenizer =
            new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
    final int effectiveCost = (nBestExamples == null)
            ? this.nBestCost
            : Math.max(this.nBestCost, tokenizer.calcNBestCost(nBestExamples));
    tokenizer.setNBestCost(effectiveCost);
    return tokenizer;
}
/**
 * Builds a NORMAL-mode Japanese analyzer.
 *
 * @param ignoreDefaultWordSet when {@code true}, the analyzer filters with the default
 *        Japanese stop words and stop tags; otherwise both sets are empty
 * @throws IOException propagated from analyzer construction
 */
public LuceneParser(boolean ignoreDefaultWordSet) throws IOException {
    final CharArraySet stopSet;
    final Set<String> stopTags;
    if (ignoreDefaultWordSet) {
        stopSet = JapaneseAnalyzer.getDefaultStopSet();
        stopTags = JapaneseAnalyzer.getDefaultStopTags();
    } else {
        // Empty, case-insensitive stop set and no stop tags.
        stopSet = new CharArraySet(new ArrayList<String>(), true);
        stopTags = new HashSet<String>();
    }
    analyzer = new JapaneseAnalyzer(null, JapaneseTokenizer.Mode.NORMAL, stopSet, stopTags);
}
/**
 * Resolves the tokenization mode from the factory args, defaulting to the tokenizer's
 * default mode when the MODE key is absent.
 * Note: an unrecognized value makes {@code Mode.valueOf} throw IllegalArgumentException,
 * matching the original behavior.
 */
private Mode getMode(Map<String, String> args) {
    final String mode = args.get(MODE);
    return (mode == null)
            ? JapaneseTokenizer.DEFAULT_MODE
            : Mode.valueOf(mode.toUpperCase(Locale.ROOT));
}
@Override public List<String> segmentWords(String text) { List<String> ret = new ArrayList<String>(); StringReader textreader = new StringReader(text); JapaneseTokenizer segmenter = new JapaneseTokenizer(textreader, null, true, JapaneseTokenizer.Mode.SEARCH); JaStemmer.lemma.clear(); CharTermAttribute termAtt = segmenter.getAttribute(CharTermAttribute.class); BaseFormAttribute baseAtt = segmenter.getAttribute(BaseFormAttribute.class); try { segmenter.reset(); while (segmenter.incrementToken()){ //segmenter.clearAttributes(); ret.add(termAtt.toString()); if(baseAtt.getBaseForm()!=null) JaStemmer.lemma.put(termAtt.toString(), baseAtt.getBaseForm()); } segmenter.close(); } catch (IOException e) { // TODO Auto-generated catch block. e.printStackTrace(); } return ret; }
/**
 * Builds a SEARCH-mode tokenizer over {@code reader} (punctuation kept) and chains a
 * JapaneseNumberFilter after it.
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName,
        final Reader reader) {
    final Tokenizer source = new JapaneseTokenizer(reader, null, false,
        JapaneseTokenizer.Mode.SEARCH);
    final JapaneseNumberFilter sink = new JapaneseNumberFilter(source);
    return new TokenStreamComponents(source, sink);
}
/**
 * Creates a tokenizer factory whose user dictionary can be hot-reloaded from disk.
 *
 * Resolves mode/user-dictionary/punctuation settings, caches reflective handles to
 * JapaneseTokenizer's private dictionary fields (used later to swap the dictionary
 * without rebuilding consumers), and — when a "user_dictionary" path is configured and
 * the file exists — remembers the file plus its timestamp and the polling interval.
 *
 * @throws IllegalArgumentException when the configured dictionary path cannot be accessed
 */
public ReloadableKuromojiTokenizerFactory(final IndexSettings indexSettings,
        final Environment env, final String name, final Settings settings) {
    super(indexSettings, name, settings);
    this.env = env;
    this.settings = settings;
    mode = KuromojiTokenizerFactory.getMode(settings);
    userDictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);
    // NOTE: field name "discartPunctuation" (sic) is declared elsewhere in this class.
    discartPunctuation = settings.getAsBoolean("discard_punctuation", true);
    // Reflective handles into Lucene internals; these private fields are overwritten
    // on reload, so any Lucene upgrade that renames them will break this factory.
    inputPendingField = getAccessibleField(Tokenizer.class, "inputPending");
    userDictionaryField = getAccessibleField(JapaneseTokenizer.class, "userDictionary");
    userFSTField = getAccessibleField(JapaneseTokenizer.class, "userFST");
    userFSTReaderField = getAccessibleField(JapaneseTokenizer.class, "userFSTReader");
    dictionaryMapField = getAccessibleField(JapaneseTokenizer.class, "dictionaryMap");
    dictionaryTimestamp = System.currentTimeMillis();
    final String monitoringFilePath = settings.get("user_dictionary");
    if (monitoringFilePath != null) {
        // Dictionary path is relative to the node's config directory.
        final Path path = env.configFile().resolve(monitoringFilePath);
        try {
            final File file = path.toFile();
            if (file.exists()) {
                reloadableFile = file;
                // Baseline timestamp; a newer lastModified() later triggers a reload.
                dictionaryTimestamp = reloadableFile.lastModified();
                // How often to poll the file for changes (default: 1 minute).
                reloadInterval = settings.getAsTime("reload_interval",
                    TimeValue.timeValueMinutes(1)).getMillis();
                if (VERBOSE) {
                    System.out.println("Check " + reloadableFile.getAbsolutePath()
                        + " (interval: " + reloadInterval + "ms)");
                }
            }
        } catch (final Exception e) {
            throw new IllegalArgumentException(
                "Could not access " + monitoringFilePath, e);
        }
    }
}
/**
 * Creates a tokenizer over {@code input} using this factory's user dictionary,
 * punctuation policy, and tokenization mode.
 */
@Override
public Tokenizer create(Reader input) {
    final JapaneseTokenizer tokenizer =
            new JapaneseTokenizer(input, userDictionary, discardPunctuation, mode);
    return tokenizer;
}