/**
 * Loads the Kuromoji user dictionary referenced by the analysis settings,
 * or returns {@code null} when no dictionary file is configured.
 *
 * @param env      environment used to resolve the dictionary file path
 * @param settings analysis settings that may name a user dictionary file
 * @return the parsed {@link UserDictionary}, or {@code null} if none is configured
 * @throws ElasticsearchException if reading or parsing the dictionary fails
 */
public static UserDictionary getUserDictionary(Environment env, Settings settings) {
    // try-with-resources closes the reader even when UserDictionary.open throws;
    // a null resource is permitted by the JLS and the close is simply skipped.
    try (Reader reader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION)) {
        if (reader == null) {
            return null;
        }
        return UserDictionary.open(reader);
    } catch (IOException e) {
        throw new ElasticsearchException("failed to load kuromoji user dictionary", e);
    }
}
@Nullable private static UserDictionary userDictionary(@Nullable final String[] userDictArray) throws UDFArgumentException { if (userDictArray == null) { return null; } final StringBuilder builder = new StringBuilder(); for (String row : userDictArray) { builder.append(row).append('\n'); } final Reader reader = new StringReader(builder.toString()); try { return UserDictionary.open(reader); // return null if empty } catch (Throwable e) { throw new UDFArgumentException( "Failed to create user dictionary based on the given array<string>: " + e); } }
/**
 * Loads the Kuromoji user dictionary referenced by the analysis settings,
 * or returns {@code null} when no dictionary file is configured.
 *
 * @param env      environment used to resolve the dictionary file path
 * @param settings analysis settings that may name a user dictionary file
 * @return the parsed {@link UserDictionary}, or {@code null} if none is configured
 * @throws ElasticsearchException if reading or parsing the dictionary fails
 */
public static UserDictionary getUserDictionary(final Environment env, final Settings settings) {
    // try-with-resources closes the reader even when UserDictionary.open throws;
    // a null resource is permitted by the JLS and the close is simply skipped.
    try (Reader reader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION)) {
        if (reader == null) {
            return null;
        }
        return UserDictionary.open(reader);
    } catch (final IOException e) {
        throw new ElasticsearchException("failed to load kuromoji user dictionary", e);
    }
}
/**
 * Loads the user dictionary from {@code userDictionaryPath} via the resource
 * loader, decoding it with {@code userDictionaryEncoding} (UTF-8 when unset).
 * Leaves {@code userDictionary} null when no path is configured.
 *
 * @param loader resource loader used to open the dictionary file
 * @throws IOException if the resource cannot be opened or parsed
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
    if (userDictionaryPath != null) {
        InputStream stream = loader.openResource(userDictionaryPath);
        String encoding = userDictionaryEncoding;
        if (encoding == null) {
            encoding = IOUtils.UTF_8;
        }
        // REPORT (not replace) so malformed dictionary bytes fail loudly
        // instead of being silently substituted.
        CharsetDecoder decoder = Charset.forName(encoding).newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);
        // Close the reader (and the underlying stream) once the dictionary is
        // built; the previous code never closed it and leaked the stream.
        try (Reader reader = new InputStreamReader(stream, decoder)) {
            userDictionary = new UserDictionary(reader);
        }
    } else {
        userDictionary = null;
    }
}
/**
 * Reads the test user dictionary {@code userdict.txt} from the test classpath.
 *
 * @return the parsed dictionary
 * @throws RuntimeException if the resource is missing or cannot be read
 */
public static UserDictionary readDict() {
    InputStream is = TestJapaneseTokenizer.class.getResourceAsStream("userdict.txt");
    if (is == null) {
        throw new RuntimeException("Cannot find userdict.txt in test classpath!");
    }
    // try-with-resources guarantees the stream is closed; IO failures are
    // rethrown unchecked since this is test-fixture loading.
    try (InputStream in = is) {
        return new UserDictionary(new InputStreamReader(in, StandardCharsets.UTF_8));
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
}
/**
 * Reads factory arguments and, when {@code USER_DICT_PATH} is set, loads the
 * user dictionary through the resource loader using the configured encoding
 * (UTF-8 when unset). Also initializes mode and punctuation handling.
 *
 * @param loader resource loader used to open the dictionary file
 * @throws IOException if the resource cannot be opened or parsed
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
    mode = getMode(args);
    String userDictionaryPath = args.get(USER_DICT_PATH);
    if (userDictionaryPath != null) {
        InputStream stream = loader.openResource(userDictionaryPath);
        String encoding = args.get(USER_DICT_ENCODING);
        if (encoding == null) {
            encoding = IOUtils.UTF_8;
        }
        // REPORT (not replace) so malformed dictionary bytes fail loudly
        // instead of being silently substituted.
        CharsetDecoder decoder = Charset.forName(encoding).newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);
        // Close the reader (and the underlying stream) once the dictionary is
        // built; the previous code never closed it and leaked the stream.
        try (Reader reader = new InputStreamReader(stream, decoder)) {
            userDictionary = new UserDictionary(reader);
        }
    } else {
        userDictionary = null;
    }
    discardPunctuation = getBoolean(DISCARD_PUNCTUATION, true);
}
/**
 * Reads the test user dictionary {@code userdict.txt} from the test classpath.
 *
 * @return the parsed dictionary
 * @throws RuntimeException if the resource is missing or cannot be read
 */
public static UserDictionary readDict() {
    InputStream is = TestJapaneseTokenizer.class.getResourceAsStream("userdict.txt");
    if (is == null) {
        throw new RuntimeException("Cannot find userdict.txt in test classpath!");
    }
    // try-with-resources guarantees the stream is closed; IO failures are
    // rethrown unchecked since this is test-fixture loading.
    try (InputStream in = is) {
        return new UserDictionary(new InputStreamReader(in, IOUtils.CHARSET_UTF_8));
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
}
/**
 * Builds the {@code kuromoji} analyzer provider: a {@link JapaneseAnalyzer}
 * configured from the index settings with stop words, tokenization mode, and
 * an optional user dictionary (shared config with KuromojiTokenizerFactory).
 */
public KuromojiAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    final Set<?> stops = Analysis.parseStopWords(
        env, indexSettings.getIndexVersionCreated(), settings,
        JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode tokenizationMode = KuromojiTokenizerFactory.getMode(settings);
    // May be null when no user dictionary is configured.
    final UserDictionary userDict = KuromojiTokenizerFactory.getUserDictionary(env, settings);
    analyzer = new JapaneseAnalyzer(
        userDict, tokenizationMode, CharArraySet.copy(stops),
        JapaneseAnalyzer.getDefaultStopTags());
}
/**
 * Resolves a user dictionary from a constant UDF argument: either a constant
 * array of strings (inline rows) or a constant string (URL).
 *
 * @param oi inspector for the dictionary argument
 * @return the parsed dictionary, possibly null (delegated overloads may return null)
 * @throws UDFArgumentException if the argument is neither a constant list nor a constant string
 */
@Nullable
private static UserDictionary userDictionary(@Nonnull final ObjectInspector oi)
        throws UDFArgumentException {
    // Guard-clause dispatch to the matching overload.
    if (HiveUtils.isConstListOI(oi)) {
        return userDictionary(HiveUtils.getConstStringArray(oi));
    }
    if (HiveUtils.isConstString(oi)) {
        return userDictionary(HiveUtils.getConstString(oi));
    }
    throw new UDFArgumentException(
        "User dictionary MUST be given as an array of constant string or constant string (URL)");
}
/**
 * Builds the {@code kuromoji} analyzer provider: a {@link JapaneseAnalyzer}
 * configured from the index settings with stop words, tokenization mode, and
 * an optional user dictionary (shared config with KuromojiTokenizerFactory).
 */
public KuromojiAnalyzerProvider(final IndexSettings indexSettings, final Environment env,
        final String name, final Settings settings) {
    super(indexSettings, name, settings);
    final Set<?> stops = Analysis.parseStopWords(
        env, indexSettings.getIndexVersionCreated(), settings,
        JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode tokenizationMode = KuromojiTokenizerFactory.getMode(settings);
    // May be null when no user dictionary is configured.
    final UserDictionary userDict = KuromojiTokenizerFactory.getUserDictionary(env, settings);
    analyzer = new JapaneseAnalyzer(
        userDict, tokenizationMode, CharArraySet.copy(stops),
        JapaneseAnalyzer.getDefaultStopTags());
}
/**
 * Creates a JapaneseAnalyzer.
 *
 * @param userDict  optional user dictionary; may be null
 * @param mode      tokenization mode, stored for the tokenizer built by this analyzer
 * @param stopwords stop word set (also passed to the superclass constructor)
 * @param stoptags  part-of-speech stop tags, stored for use elsewhere in this class
 */
public JapaneseAnalyzer(UserDictionary userDict, Mode mode, CharArraySet stopwords, Set<String> stoptags) {
    super(stopwords);
    this.userDict = userDict;
    this.mode = mode;
    this.stoptags = stoptags;
}
/**
 * Create a new JapaneseTokenizer.
 *
 * @param factory the AttributeFactory to use
 * @param input Reader containing text
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @param mode tokenization mode.
 */
public JapaneseTokenizer
    (AttributeFactory factory, Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
    super(factory, input);
    // System dictionaries and connection costs are obtained via getInstance();
    // presumably process-wide singletons — usage pattern, not visible here.
    dictionary = TokenInfoDictionary.getInstance();
    fst = dictionary.getFST();
    unkDictionary = UnknownDictionary.getInstance();
    characterDefinition = unkDictionary.getCharacterDefinition();
    this.userDictionary = userDictionary;
    costs = ConnectionCosts.getInstance();
    fstReader = fst.getBytesReader();
    if (userDictionary != null) {
        // Cache the user-dictionary FST and a reader for it up front.
        userFST = userDictionary.getFST();
        userFSTReader = userFST.getBytesReader();
    } else {
        userFST = null;
        userFSTReader = null;
    }
    this.discardPunctuation = discardPunctuation;
    // Mode selects three flags: SEARCH sets searchMode and emits compounds;
    // EXTENDED sets searchMode and extendedMode without compound output;
    // any other mode leaves all three flags false.
    switch(mode){
        case SEARCH:
            searchMode = true;
            extendedMode = false;
            outputCompounds = true;
            break;
        case EXTENDED:
            searchMode = true;
            extendedMode = true;
            outputCompounds = false;
            break;
        default:
            searchMode = false;
            extendedMode = false;
            outputCompounds = false;
            break;
    }
    // Attach the buffer to the tokenizer's input and reset parsing state
    // before registering the per-type dictionary lookup table.
    buffer.reset(this.input);
    resetState();
    dictionaryMap.put(Type.KNOWN, dictionary);
    dictionaryMap.put(Type.UNKNOWN, unkDictionary);
    dictionaryMap.put(Type.USER, userDictionary);
}
/**
 * Creates a JapaneseAnalyzer.
 *
 * @param matchVersion Lucene compatibility version, passed to the superclass
 * @param userDict     optional user dictionary; may be null
 * @param mode         tokenization mode, stored for the tokenizer built by this analyzer
 * @param stopwords    stop word set (also passed to the superclass constructor)
 * @param stoptags     part-of-speech stop tags, stored for use elsewhere in this class
 */
public JapaneseAnalyzer(Version matchVersion, UserDictionary userDict, Mode mode, CharArraySet stopwords, Set<String> stoptags) {
    super(matchVersion, stopwords);
    this.userDict = userDict;
    this.mode = mode;
    this.stoptags = stoptags;
}
/**
 * Create a new JapaneseTokenizer.
 *
 * @param input Reader containing text
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @param mode tokenization mode.
 */
public JapaneseTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
    super(input);
    // System dictionaries and connection costs are obtained via getInstance();
    // presumably process-wide singletons — usage pattern, not visible here.
    dictionary = TokenInfoDictionary.getInstance();
    fst = dictionary.getFST();
    unkDictionary = UnknownDictionary.getInstance();
    characterDefinition = unkDictionary.getCharacterDefinition();
    this.userDictionary = userDictionary;
    costs = ConnectionCosts.getInstance();
    fstReader = fst.getBytesReader();
    if (userDictionary != null) {
        // Cache the user-dictionary FST and a reader for it up front.
        userFST = userDictionary.getFST();
        userFSTReader = userFST.getBytesReader();
    } else {
        userFST = null;
        userFSTReader = null;
    }
    this.discardPunctuation = discardPunctuation;
    // Mode selects three flags: SEARCH sets searchMode and emits compounds;
    // EXTENDED sets searchMode and extendedMode without compound output;
    // any other mode leaves all three flags false.
    switch(mode){
        case SEARCH:
            searchMode = true;
            extendedMode = false;
            outputCompounds = true;
            break;
        case EXTENDED:
            searchMode = true;
            extendedMode = true;
            outputCompounds = false;
            break;
        default:
            searchMode = false;
            extendedMode = false;
            outputCompounds = false;
            break;
    }
    buffer.reset(null); // best effort: provoke an NPE for consumers that don't call reset()
    resetState();
    dictionaryMap.put(Type.KNOWN, dictionary);
    dictionaryMap.put(Type.UNKNOWN, unkDictionary);
    dictionaryMap.put(Type.USER, userDictionary);
}
/**
 * Create a new JapaneseTokenizer.
 * <p>
 * Uses the default AttributeFactory; delegates to the factory-taking constructor.
 *
 * @param input Reader containing text
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @param mode tokenization mode.
 */
public JapaneseTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, input, userDictionary, discardPunctuation, mode);
}
/**
 * Create a new JapaneseTokenizer.
 * <p>
 * Uses the default AttributeFactory; delegates to the factory-taking constructor.
 *
 * @param input Reader containing text
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @param mode tokenization mode.
 */
public JapaneseTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
    this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, userDictionary, discardPunctuation, mode);
}