/**
 * Validates the argument count and captures the optional tokenizer settings.
 *
 * <p>Argument layout, as read by this method: index 1 is the tokenization mode,
 * index 2 the stop words, index 3 the stop tags and index 4 the user dictionary.
 * Any option that is not supplied falls back to {@code Mode.NORMAL}, the
 * {@link JapaneseAnalyzer} default stop set / stop tags, or {@code null} respectively.
 * The {@code _analyzer} field is cleared at the end of initialization.
 *
 * @param arguments inspectors for the call arguments; between 1 and 5 are accepted
 * @return a standard list inspector over writable strings
 * @throws UDFArgumentException if the number of arguments is outside 1..5
 */
@Override
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
    final int numArgs = arguments.length;
    if (!(numArgs >= 1 && numArgs <= 5)) {
        throw new UDFArgumentException("Invalid number of arguments for `tokenize_ja`: " + numArgs);
    }

    // Options are resolved in positional order, same as the arguments appear.
    if (numArgs >= 2) {
        this._mode = tokenizationMode(arguments[1]);
    } else {
        this._mode = Mode.NORMAL;
    }

    if (numArgs >= 3) {
        this._stopWords = stopWords(arguments[2]);
    } else {
        this._stopWords = JapaneseAnalyzer.getDefaultStopSet();
    }

    if (numArgs >= 4) {
        this._stopTags = stopTags(arguments[3]);
    } else {
        this._stopTags = JapaneseAnalyzer.getDefaultStopTags();
    }

    if (numArgs >= 5) {
        this._userDict = userDictionary(arguments[4]);
    } else {
        this._userDict = null;
    }

    this._analyzer = null;

    return ObjectInspectorFactory.getStandardListObjectInspector(
        PrimitiveObjectInspectorFactory.writableStringObjectInspector);
}
/**
 * Builds the stop-word set from a constant string-array argument.
 *
 * <p>A void inspector or a {@code null} array selects the {@link JapaneseAnalyzer}
 * default stop set; an empty array selects {@code CharArraySet.EMPTY_SET}.
 * Otherwise a case-insensitive set is built from the non-null array elements.
 * {@code null} elements are skipped, mirroring how {@code stopTags} treats them,
 * instead of being handed to {@code CharArraySet}.
 *
 * @param oi inspector of the stop-words argument
 * @return the stop-word set to use; never {@code null}
 * @throws UDFArgumentException declared for the constant-array lookup
 */
@Nonnull
private static CharArraySet stopWords(@Nonnull final ObjectInspector oi)
        throws UDFArgumentException {
    if (HiveUtils.isVoidOI(oi)) {
        return JapaneseAnalyzer.getDefaultStopSet();
    }
    final String[] array = HiveUtils.getConstStringArray(oi);
    if (array == null) {
        return JapaneseAnalyzer.getDefaultStopSet();
    }
    if (array.length == 0) {
        return CharArraySet.EMPTY_SET;
    }
    final CharArraySet results = new CharArraySet(array.length, /* ignoreCase */true);
    for (String word : array) {
        // NOTE(review): the array may presumably carry null entries; adding them is
        // expected to fail inside CharArraySet, so they are dropped here.
        if (word != null) {
            results.add(word);
        }
    }
    return results;
}
/**
 * Builds the set of stop tags from a constant string-array argument.
 *
 * <p>A void inspector or a {@code null} array selects the {@link JapaneseAnalyzer}
 * default stop tags; an empty array yields an empty set. Otherwise the non-null
 * elements of the array are collected into a new {@link HashSet}.
 *
 * @param oi inspector of the stop-tags argument
 * @return the stop tags to use; never {@code null}
 * @throws UDFArgumentException declared for the constant-array lookup
 */
@Nonnull
private static Set<String> stopTags(@Nonnull final ObjectInspector oi)
        throws UDFArgumentException {
    if (HiveUtils.isVoidOI(oi)) {
        return JapaneseAnalyzer.getDefaultStopTags();
    }

    final String[] tags = HiveUtils.getConstStringArray(oi);
    if (tags == null) {
        return JapaneseAnalyzer.getDefaultStopTags();
    }
    if (tags.length == 0) {
        return Collections.emptySet();
    }

    final Set<String> tagSet = new HashSet<String>(tags.length);
    for (final String tag : tags) {
        if (tag == null) {
            continue; // null entries are ignored
        }
        tagSet.add(tag);
    }
    return tagSet;
}
/**
 * Reads a stop-word list from a classpath resource.
 *
 * <p>The resource is looked up through {@code JapaneseAnalyzer.class.getResourceAsStream}
 * using the path of {@code f}. Every line is added to the result except lines that
 * start with {@code '#'}.
 *
 * <p>On an {@link IOException} the stack trace is printed and the JVM is terminated
 * with exit status 1.
 *
 * @param f file whose path names the resource to load
 * @return the lines of the resource, excluding {@code '#'} comment lines
 * @throws NullPointerException if the resource cannot be found
 */
private static List<String> getStopList(File f) {
    List<String> stopwords = new ArrayList<String>();
    try {
        // Resource names always use '/', whereas File paths use the platform separator.
        final String resource = f.getPath().replace(File.separatorChar, '/');
        InputStream in = JapaneseAnalyzer.class.getResourceAsStream(resource);
        if (in == null) {
            // Same exception type as before (the reader constructor rejected null),
            // but now with a message that names the missing resource.
            throw new NullPointerException("Stop-word resource not found: " + resource);
        }
        // Decode explicitly instead of relying on the platform default charset.
        // NOTE(review): assumes the resource is UTF-8 encoded - confirm.
        BufferedReader input = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        try {
            for (String line = input.readLine(); line != null; line = input.readLine()) {
                if (line.startsWith("#")) {
                    continue;
                }
                stopwords.add(line);
            }
        } finally {
            // Close even when reading fails part-way through.
            input.close();
        }
        return stopwords;
    } catch (IOException e) {
        // NOTE(review): terminating the JVM from a helper is drastic; kept because
        // callers currently observe this behavior. Consider propagating instead.
        e.printStackTrace();
        System.exit(1);
        return null;
    }
}
/**
 * Creates the provider and builds the {@link JapaneseAnalyzer} it holds.
 *
 * <p>Stop words are parsed from the settings with the analyzer's default stop set
 * as fallback; tokenizer mode and user dictionary are obtained from
 * {@code KuromojiTokenizerFactory}; stop tags are always the analyzer defaults.
 *
 * @param indexSettings index-level settings, also passed to the superclass
 * @param env           environment used when parsing stop words and the user dictionary
 * @param name          name passed to the superclass
 * @param settings      analyzer settings
 */
public KuromojiAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    final Set<?> configuredStopWords = Analysis.parseStopWords(
        env,
        indexSettings.getIndexVersionCreated(),
        settings,
        JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode tokenizerMode = KuromojiTokenizerFactory.getMode(settings);
    final UserDictionary dictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);

    // The parsed stop words are copied before being handed to the analyzer.
    final CharArraySet stopSet = CharArraySet.copy(configuredStopWords);
    final Set<String> stopTags = JapaneseAnalyzer.getDefaultStopTags();

    this.analyzer = new JapaneseAnalyzer(dictionary, tokenizerMode, stopSet, stopTags);
}
/**
 * Creates the stop-token filter factory from its settings.
 *
 * <p>Reads {@code ignore_case} (default {@code false}) and {@code remove_trailing}
 * (default {@code true}) through the lenient pre-ES6 boolean accessor, then parses
 * {@code stopwords} with the {@link JapaneseAnalyzer} default stop set as fallback.
 *
 * @param indexSettings index-level settings, also passed to the superclass
 * @param env           environment used when parsing the stop words
 * @param name          name passed to the superclass
 * @param settings      filter settings
 */
public JapaneseStopTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    this.ignoreCase = settings.getAsBooleanLenientForPreEs6Indices(
        indexSettings.getIndexVersionCreated(),
        "ignore_case",
        false,
        deprecationLogger);

    this.removeTrailing = settings.getAsBooleanLenientForPreEs6Indices(
        indexSettings.getIndexVersionCreated(),
        "remove_trailing",
        true,
        deprecationLogger);

    // Must come last: the case-sensitivity flag read above is passed to the parser.
    this.stopWords = Analysis.parseWords(
        env,
        settings,
        "stopwords",
        JapaneseAnalyzer.getDefaultStopSet(),
        NAMED_STOP_WORDS,
        this.ignoreCase);
}
/**
 * Checks that the test analysis registers the expected Kuromoji components:
 * the tokenizer, each token filter, the {@code kuromoji} and {@code my_analyzer}
 * analyzers, and the iteration-mark char filter.
 *
 * @throws IOException declared by the test signature
 */
public void testDefaultsKuromojiAnalysis() throws IOException {
    final TestAnalysis testAnalysis = createTestAnalysis();

    // Tokenizer
    final TokenizerFactory tokenizer = testAnalysis.tokenizer.get("kuromoji_tokenizer");
    assertThat(tokenizer, instanceOf(KuromojiTokenizerFactory.class));

    // Token filters, looked up in the same order as before
    final TokenFilterFactory partOfSpeech = testAnalysis.tokenFilter.get("kuromoji_part_of_speech");
    assertThat(partOfSpeech, instanceOf(KuromojiPartOfSpeechFilterFactory.class));

    final TokenFilterFactory readingForm = testAnalysis.tokenFilter.get("kuromoji_readingform");
    assertThat(readingForm, instanceOf(KuromojiReadingFormFilterFactory.class));

    final TokenFilterFactory baseForm = testAnalysis.tokenFilter.get("kuromoji_baseform");
    assertThat(baseForm, instanceOf(KuromojiBaseFormFilterFactory.class));

    final TokenFilterFactory stemmer = testAnalysis.tokenFilter.get("kuromoji_stemmer");
    assertThat(stemmer, instanceOf(KuromojiKatakanaStemmerFactory.class));

    final TokenFilterFactory stop = testAnalysis.tokenFilter.get("ja_stop");
    assertThat(stop, instanceOf(JapaneseStopTokenFilterFactory.class));

    final TokenFilterFactory number = testAnalysis.tokenFilter.get("kuromoji_number");
    assertThat(number, instanceOf(KuromojiNumberFilterFactory.class));

    // Analyzers
    final IndexAnalyzers analyzers = testAnalysis.indexAnalyzers;

    final NamedAnalyzer kuromoji = analyzers.get("kuromoji");
    assertThat(kuromoji.analyzer(), instanceOf(JapaneseAnalyzer.class));

    final NamedAnalyzer custom = analyzers.get("my_analyzer");
    assertThat(custom.analyzer(), instanceOf(CustomAnalyzer.class));
    assertThat(custom.analyzer().tokenStream(null, new StringReader("")), instanceOf(JapaneseTokenizer.class));

    // Char filter
    final CharFilterFactory iterationMark = testAnalysis.charFilter.get("kuromoji_iteration_mark");
    assertThat(iterationMark, instanceOf(KuromojiIterationMarkCharFilterFactory.class));
}
/**
 * Tokenizes the first argument and returns the tokens produced for it.
 *
 * <p>The {@link JapaneseAnalyzer} is created lazily on first use from the
 * {@code _userDict}, {@code _mode}, {@code _stopWords} and {@code _stopTags} fields.
 * If tokenization fails with an {@link IOException}, the analyzer is closed and
 * the field is cleared, so a later call builds a fresh analyzer instead of
 * reusing the closed one.
 *
 * @param arguments deferred call arguments; only index 0 is read here
 * @return the tokens collected by {@code analyzeTokens}, or {@code null} when the
 *         first argument is {@code null}
 * @throws HiveException wrapping any {@link IOException} raised while tokenizing
 */
@Override
public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
    if (_analyzer == null) {
        this._analyzer = new JapaneseAnalyzer(_userDict, _mode, _stopWords, _stopTags);
    }

    Object arg0 = arguments[0].get();
    if (arg0 == null) {
        return null;
    }
    String line = arg0.toString();

    final List<Text> results = new ArrayList<Text>(32);
    TokenStream stream = null;
    try {
        stream = _analyzer.tokenStream("", line);
        if (stream != null) {
            analyzeTokens(stream, results);
        }
    } catch (IOException e) {
        IOUtils.closeQuietly(_analyzer);
        // Drop the reference to the closed analyzer so the lazy-init guard above
        // rebuilds it if this instance is used again.
        this._analyzer = null;
        throw new HiveException(e);
    } finally {
        IOUtils.closeQuietly(stream);
    }
    return results;
}
/**
 * Builds the {@link JapaneseAnalyzer} exposed by this provider.
 *
 * <p>The analyzer is assembled from: the user dictionary and tokenizer mode
 * resolved by {@code KuromojiTokenizerFactory}, a copy of the stop words parsed
 * from the settings (defaulting to the analyzer's own stop set), and the
 * analyzer's default stop tags.
 *
 * @param indexSettings index-level settings, also passed to the superclass
 * @param env           environment used when parsing stop words and the user dictionary
 * @param name          name passed to the superclass
 * @param settings      analyzer settings
 */
public KuromojiAnalyzerProvider(final IndexSettings indexSettings, final Environment env,
                                final String name, final Settings settings) {
    super(indexSettings, name, settings);

    final Set<?> parsedStopWords = Analysis.parseStopWords(
        env, indexSettings.getIndexVersionCreated(), settings, JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode segmentationMode = KuromojiTokenizerFactory.getMode(settings);
    final UserDictionary userDict = KuromojiTokenizerFactory.getUserDictionary(env, settings);

    this.analyzer = new JapaneseAnalyzer(
        userDict,
        segmentationMode,
        CharArraySet.copy(parsedStopWords),
        JapaneseAnalyzer.getDefaultStopTags());
}
/**
 * Creates a parser backed by a {@link JapaneseAnalyzer} in {@code NORMAL} mode
 * with no user dictionary.
 *
 * <p>When {@code ignoreDefaultWordSet} is {@code true} the analyzer uses the default
 * stop set and stop tags of {@link JapaneseAnalyzer}; when {@code false} it uses an
 * empty, case-insensitive stop set and an empty tag set.
 * NOTE(review): the flag name could be read either way - confirm the intended
 * meaning against callers.
 *
 * @param ignoreDefaultWordSet whether to use the analyzer's default stop words and tags
 * @throws IOException declared by the constructor signature
 */
public LuceneParser(boolean ignoreDefaultWordSet) throws IOException {
    final CharArraySet stopSet;
    final Set<String> stopTags;

    if (ignoreDefaultWordSet) {
        stopSet = JapaneseAnalyzer.getDefaultStopSet();
        stopTags = JapaneseAnalyzer.getDefaultStopTags();
    } else {
        stopSet = new CharArraySet(new ArrayList<String>(), true);
        stopTags = new HashSet<String>();
    }

    analyzer = new JapaneseAnalyzer(null, JapaneseTokenizer.Mode.NORMAL, stopSet, stopTags);
}
/**
 * Returns the analyzer held in the {@code analyzer} field.
 *
 * @return the {@link JapaneseAnalyzer} instance of this object
 */
@Override
public JapaneseAnalyzer get() {
    return analyzer;
}
/**
 * Creates the stop-token filter factory from its settings.
 *
 * <p>Reads {@code ignore_case} (default {@code false}) and {@code remove_trailing}
 * (default {@code true}), then parses {@code stopwords} using the
 * {@link JapaneseAnalyzer} default stop set as fallback.
 *
 * @param indexSettings index-level settings, passed to the superclass
 * @param env           environment used when parsing the stop words
 * @param name          name passed to the superclass
 * @param settings      filter settings
 */
public JapaneseStopTokenFilterFactory(final IndexSettings indexSettings, final Environment env,
                                      final String name, final Settings settings) {
    super(indexSettings, name, settings);

    this.ignoreCase = settings.getAsBoolean("ignore_case", false);
    this.removeTrailing = settings.getAsBoolean("remove_trailing", true);

    // Parsed after the flags because the case-sensitivity flag is passed along.
    this.stopWords = Analysis.parseWords(
        env,
        settings,
        "stopwords",
        JapaneseAnalyzer.getDefaultStopSet(),
        NAMED_STOP_WORDS,
        this.ignoreCase);
}