Java class org.apache.lucene.analysis.ja.dict.UserDictionary: example source code
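All of the snippets on this page follow the same pattern: build a UserDictionary from a Reader over CSV-formatted entries, then hand it to JapaneseTokenizer or JapaneseAnalyzer. For orientation, here is a minimal self-contained sketch. It assumes a recent Lucene version (the static UserDictionary.open(Reader) factory rather than the constructor used in older snippets below, and a JapaneseAnalyzer constructor without a Version argument); the dictionary entry and field name are illustrative only.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class UserDictionaryDemo {
  public static void main(String[] args) throws Exception {
    // One entry in Kuromoji's user dictionary CSV format:
    // surface form, space-separated segmentation, space-separated readings, part-of-speech
    String entries = "関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞\n";
    UserDictionary userDict = UserDictionary.open(new StringReader(entries));

    JapaneseAnalyzer analyzer = new JapaneseAnalyzer(userDict, Mode.SEARCH,
        JapaneseAnalyzer.getDefaultStopSet(), JapaneseAnalyzer.getDefaultStopTags());
    try (TokenStream ts = analyzer.tokenStream("body", "関西国際空港に行った")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term.toString());
      }
      ts.end();
    }
  }
}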

Project: elasticsearch_my    File: KuromojiTokenizerFactory.java
public static UserDictionary getUserDictionary(Environment env, Settings settings) {
    try {
        final Reader reader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION);
        if (reader == null) {
            return null;
        } else {
            try {
                return UserDictionary.open(reader);
            } finally {
                reader.close();
            }
        }
    } catch (IOException e) {
        throw new ElasticsearchException("failed to load kuromoji user dictionary", e);
    }
}
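Outside Elasticsearch, the same load-then-close logic can be written with try-with-resources. A minimal sketch, assuming a plain UTF-8 file on disk instead of the Environment/Settings plumbing above; the class name and file path are hypothetical.

import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.lucene.analysis.ja.dict.UserDictionary;

final class UserDictionaryLoader {
  /** Loads a Kuromoji user dictionary from a UTF-8 text file; returns null if the file holds no entries. */
  static UserDictionary loadFromFile(Path path) throws IOException {
    try (Reader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
      return UserDictionary.open(reader);
    }
  }

  public static void main(String[] args) throws IOException {
    UserDictionary dict = loadFromFile(Paths.get("userdict_ja.txt")); // hypothetical path
    System.out.println(dict != null ? "user dictionary loaded" : "no entries found");
  }
}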
Project: incubator-hivemall    File: KuromojiUDF.java
@Nullable
private static UserDictionary userDictionary(@Nullable final String[] userDictArray)
        throws UDFArgumentException {
    if (userDictArray == null) {
        return null;
    }

    final StringBuilder builder = new StringBuilder();
    for (String row : userDictArray) {
        builder.append(row).append('\n');
    }
    final Reader reader = new StringReader(builder.toString());
    try {
        return UserDictionary.open(reader); // return null if empty
    } catch (Throwable e) {
        throw new UDFArgumentException(
            "Failed to create user dictionary based on the given array<string>: " + e);
    }
}
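Each element of the array passed to the helper above is expected to be one line in Kuromoji's user dictionary CSV format: surface form, space-separated segmentation, space-separated readings, part-of-speech. A small self-contained sketch of what that input looks like; the entries are illustrative.

import java.io.StringReader;

import org.apache.lucene.analysis.ja.dict.UserDictionary;

public class UserDictionaryFromArray {
  public static void main(String[] args) throws Exception {
    // Illustrative rows; same CSV format as the userdict.txt shipped with Kuromoji.
    String[] rows = {
        "日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞",
        "関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞"
    };
    StringBuilder builder = new StringBuilder();
    for (String row : rows) {
      builder.append(row).append('\n');
    }
    UserDictionary dict = UserDictionary.open(new StringReader(builder.toString()));
    System.out.println("entries parsed: " + (dict != null));
  }
}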
Project: elasticsearch-analysis-ja    File: KuromojiTokenizerFactory.java
public static UserDictionary getUserDictionary(final Environment env, final Settings settings) {
    try {
        final Reader reader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION);
        if (reader == null) {
            return null;
        } else {
            try {
                return UserDictionary.open(reader);
            } finally {
                reader.close();
            }
        }
    } catch (final IOException e) {
        throw new ElasticsearchException("failed to load kuromoji user dictionary", e);
    }
}
Project: search    File: JapaneseTokenizerFactory.java
@Override
public void inform(ResourceLoader loader) throws IOException {
  if (userDictionaryPath != null) {
    InputStream stream = loader.openResource(userDictionaryPath);
    String encoding = userDictionaryEncoding;
    if (encoding == null) {
      encoding = IOUtils.UTF_8;
    }
    CharsetDecoder decoder = Charset.forName(encoding).newDecoder()
        .onMalformedInput(CodingErrorAction.REPORT)
        .onUnmappableCharacter(CodingErrorAction.REPORT);
    Reader reader = new InputStreamReader(stream, decoder);
    userDictionary = new UserDictionary(reader);
  } else {
    userDictionary = null;
  }
}
Project: search    File: TestJapaneseTokenizer.java
public static UserDictionary readDict() {
  InputStream is = TestJapaneseTokenizer.class.getResourceAsStream("userdict.txt");
  if (is == null) {
    throw new RuntimeException("Cannot find userdict.txt in test classpath!");
  }
  try {
    try {
      Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
      return new UserDictionary(reader);
    } finally {
      is.close();
    }
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
}
Project: NYBC    File: JapaneseTokenizerFactory.java
@Override
public void inform(ResourceLoader loader) throws IOException {
  mode = getMode(args);
  String userDictionaryPath = args.get(USER_DICT_PATH);
  if (userDictionaryPath != null) {
    InputStream stream = loader.openResource(userDictionaryPath);
    String encoding = args.get(USER_DICT_ENCODING);
    if (encoding == null) {
      encoding = IOUtils.UTF_8;
    }
    CharsetDecoder decoder = Charset.forName(encoding).newDecoder()
        .onMalformedInput(CodingErrorAction.REPORT)
        .onUnmappableCharacter(CodingErrorAction.REPORT);
    Reader reader = new InputStreamReader(stream, decoder);
    userDictionary = new UserDictionary(reader);
  } else {
    userDictionary = null;
  }
  discardPunctuation = getBoolean(DISCARD_PUNCTUATION, true);
}
Project: NYBC    File: TestJapaneseTokenizer.java
public static UserDictionary readDict() {
  InputStream is = TestJapaneseTokenizer.class.getResourceAsStream("userdict.txt");
  if (is == null) {
    throw new RuntimeException("Cannot find userdict.txt in test classpath!");
  }
  try {
    try {
      Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
      return new UserDictionary(reader);
    } finally {
      is.close();
    }
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
}
Project: read-open-source-code    File: JapaneseTokenizerFactory.java
@Override
public void inform(ResourceLoader loader) throws IOException {
  if (userDictionaryPath != null) {
    InputStream stream = loader.openResource(userDictionaryPath);
    String encoding = userDictionaryEncoding;
    if (encoding == null) {
      encoding = IOUtils.UTF_8;
    }
    CharsetDecoder decoder = Charset.forName(encoding).newDecoder()
        .onMalformedInput(CodingErrorAction.REPORT)
        .onUnmappableCharacter(CodingErrorAction.REPORT);
    Reader reader = new InputStreamReader(stream, decoder);
    userDictionary = new UserDictionary(reader);
  } else {
    userDictionary = null;
  }
}
Project: Maskana-Gestor-de-Conocimiento    File: JapaneseTokenizerFactory.java
@Override
public void inform(ResourceLoader loader) throws IOException {
  if (userDictionaryPath != null) {
    InputStream stream = loader.openResource(userDictionaryPath);
    String encoding = userDictionaryEncoding;
    if (encoding == null) {
      encoding = IOUtils.UTF_8;
    }
    CharsetDecoder decoder = Charset.forName(encoding).newDecoder()
        .onMalformedInput(CodingErrorAction.REPORT)
        .onUnmappableCharacter(CodingErrorAction.REPORT);
    Reader reader = new InputStreamReader(stream, decoder);
    userDictionary = new UserDictionary(reader);
  } else {
    userDictionary = null;
  }
}
Project: Maskana-Gestor-de-Conocimiento    File: TestJapaneseTokenizer.java
public static UserDictionary readDict() {
  InputStream is = TestJapaneseTokenizer.class.getResourceAsStream("userdict.txt");
  if (is == null) {
    throw new RuntimeException("Cannot find userdict.txt in test classpath!");
  }
  try {
    try {
      Reader reader = new InputStreamReader(is, IOUtils.CHARSET_UTF_8);
      return new UserDictionary(reader);
    } finally {
      is.close();
    }
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
}
Project: elasticsearch_my    File: KuromojiAnalyzerProvider.java
public KuromojiAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    final Set<?> stopWords = Analysis.parseStopWords(
        env, indexSettings.getIndexVersionCreated(), settings, JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode mode = KuromojiTokenizerFactory.getMode(settings);
    final UserDictionary userDictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);
    analyzer = new JapaneseAnalyzer(userDictionary, mode, CharArraySet.copy(stopWords), JapaneseAnalyzer.getDefaultStopTags());
}
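For context, the user dictionary reaches getUserDictionary via index settings. A hedged sketch of what such settings might look like when built programmatically; the setting keys ("mode", "user_dictionary") follow the kuromoji analysis plugin's documented options, but treat the exact values and file name as assumptions rather than a verified contract.

import org.elasticsearch.common.settings.Settings;

public class KuromojiTokenizerSettingsSketch {
  public static void main(String[] args) {
    // Hypothetical tokenizer settings; "user_dictionary" names a file that
    // Analysis.getReaderFromFile resolves against the node's config directory.
    Settings settings = Settings.builder()
        .put("type", "kuromoji_tokenizer")
        .put("mode", "search")
        .put("user_dictionary", "userdict_ja.txt")
        .build();
    System.out.println(settings.get("user_dictionary"));
  }
}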
Project: incubator-hivemall    File: KuromojiUDF.java
@Nullable
private static UserDictionary userDictionary(@Nonnull final ObjectInspector oi)
        throws UDFArgumentException {
    if (HiveUtils.isConstListOI(oi)) {
        return userDictionary(HiveUtils.getConstStringArray(oi));
    } else if (HiveUtils.isConstString(oi)) {
        return userDictionary(HiveUtils.getConstString(oi));
    } else {
        throw new UDFArgumentException(
            "User dictionary MUST be given as an array of constant string or constant string (URL)");
    }
}
Project: elasticsearch-analysis-ja    File: KuromojiAnalyzerProvider.java
public KuromojiAnalyzerProvider(final IndexSettings indexSettings, final Environment env, final String name, final Settings settings) {
    super(indexSettings, name, settings);
    final Set<?> stopWords = Analysis.parseStopWords(
            env, indexSettings.getIndexVersionCreated(), settings, JapaneseAnalyzer.getDefaultStopSet());
    final JapaneseTokenizer.Mode mode = KuromojiTokenizerFactory.getMode(settings);
    final UserDictionary userDictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);
    analyzer = new JapaneseAnalyzer(userDictionary, mode, CharArraySet.copy(stopWords), JapaneseAnalyzer.getDefaultStopTags());
}
Project: search    File: JapaneseAnalyzer.java
public JapaneseAnalyzer(UserDictionary userDict, Mode mode, CharArraySet stopwords, Set<String> stoptags) {
  super(stopwords);
  this.userDict = userDict;
  this.mode = mode;
  this.stoptags = stoptags;
}
Project: search    File: JapaneseTokenizer.java
/**
 * Create a new JapaneseTokenizer.
 *
 * @param factory the AttributeFactory to use
 * @param input Reader containing text
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @param mode tokenization mode.
 */
public JapaneseTokenizer
    (AttributeFactory factory, Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
  super(factory, input);
  dictionary = TokenInfoDictionary.getInstance();
  fst = dictionary.getFST();
  unkDictionary = UnknownDictionary.getInstance();
  characterDefinition = unkDictionary.getCharacterDefinition();
  this.userDictionary = userDictionary;
  costs = ConnectionCosts.getInstance();
  fstReader = fst.getBytesReader();
  if (userDictionary != null) {
    userFST = userDictionary.getFST();
    userFSTReader = userFST.getBytesReader();
  } else {
    userFST = null;
    userFSTReader = null;
  }
  this.discardPunctuation = discardPunctuation;
  switch(mode){
    case SEARCH:
      searchMode = true;
      extendedMode = false;
      outputCompounds = true;
      break;
    case EXTENDED:
      searchMode = true;
      extendedMode = true;
      outputCompounds = false;
      break;
    default:
      searchMode = false;
      extendedMode = false;
      outputCompounds = false;
      break;
  }
  buffer.reset(this.input);

  resetState();

  dictionaryMap.put(Type.KNOWN, dictionary);
  dictionaryMap.put(Type.UNKNOWN, unkDictionary);
  dictionaryMap.put(Type.USER, userDictionary);
}
Project: NYBC    File: JapaneseAnalyzer.java
public JapaneseAnalyzer(Version matchVersion, UserDictionary userDict, Mode mode, CharArraySet stopwords, Set<String> stoptags) {
  super(matchVersion, stopwords);
  this.userDict = userDict;
  this.mode = mode;
  this.stoptags = stoptags;
}
Project: NYBC    File: JapaneseTokenizer.java
/**
 * Create a new JapaneseTokenizer.
 * 
 * @param input Reader containing text
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @param mode tokenization mode.
 */
public JapaneseTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
  super(input);
  dictionary = TokenInfoDictionary.getInstance();
  fst = dictionary.getFST();
  unkDictionary = UnknownDictionary.getInstance();
  characterDefinition = unkDictionary.getCharacterDefinition();
  this.userDictionary = userDictionary;
  costs = ConnectionCosts.getInstance();
  fstReader = fst.getBytesReader();
  if (userDictionary != null) {
    userFST = userDictionary.getFST();
    userFSTReader = userFST.getBytesReader();
  } else {
    userFST = null;
    userFSTReader = null;
  }
  this.discardPunctuation = discardPunctuation;
  switch(mode){
    case SEARCH:
      searchMode = true;
      extendedMode = false;
      outputCompounds = true;
      break;
    case EXTENDED:
      searchMode = true;
      extendedMode = true;
      outputCompounds = false;
      break;
    default:
      searchMode = false;
      extendedMode = false;
      outputCompounds = false;
      break;
  }
  buffer.reset(null); // best effort NPE for consumers that don't call reset()

  resetState();

  dictionaryMap.put(Type.KNOWN, dictionary);
  dictionaryMap.put(Type.UNKNOWN, unkDictionary);
  dictionaryMap.put(Type.USER, userDictionary);
}
Project: read-open-source-code    File: JapaneseAnalyzer.java
public JapaneseAnalyzer(Version matchVersion, UserDictionary userDict, Mode mode, CharArraySet stopwords, Set<String> stoptags) {
  super(matchVersion, stopwords);
  this.userDict = userDict;
  this.mode = mode;
  this.stoptags = stoptags;
}
Project: read-open-source-code    File: JapaneseTokenizer.java
/**
 * Create a new JapaneseTokenizer.
 *
 * @param factory the AttributeFactory to use
 * @param input Reader containing text
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @param mode tokenization mode.
 */
public JapaneseTokenizer
    (AttributeFactory factory, Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
  super(factory, input);
  dictionary = TokenInfoDictionary.getInstance();
  fst = dictionary.getFST();
  unkDictionary = UnknownDictionary.getInstance();
  characterDefinition = unkDictionary.getCharacterDefinition();
  this.userDictionary = userDictionary;
  costs = ConnectionCosts.getInstance();
  fstReader = fst.getBytesReader();
  if (userDictionary != null) {
    userFST = userDictionary.getFST();
    userFSTReader = userFST.getBytesReader();
  } else {
    userFST = null;
    userFSTReader = null;
  }
  this.discardPunctuation = discardPunctuation;
  switch(mode){
    case SEARCH:
      searchMode = true;
      extendedMode = false;
      outputCompounds = true;
      break;
    case EXTENDED:
      searchMode = true;
      extendedMode = true;
      outputCompounds = false;
      break;
    default:
      searchMode = false;
      extendedMode = false;
      outputCompounds = false;
      break;
  }
  buffer.reset(this.input);

  resetState();

  dictionaryMap.put(Type.KNOWN, dictionary);
  dictionaryMap.put(Type.UNKNOWN, unkDictionary);
  dictionaryMap.put(Type.USER, userDictionary);
}
Project: Maskana-Gestor-de-Conocimiento    File: JapaneseAnalyzer.java
public JapaneseAnalyzer(Version matchVersion, UserDictionary userDict, Mode mode, CharArraySet stopwords, Set<String> stoptags) {
  super(matchVersion, stopwords);
  this.userDict = userDict;
  this.mode = mode;
  this.stoptags = stoptags;
}
Project: Maskana-Gestor-de-Conocimiento    File: JapaneseTokenizer.java
/**
 * Create a new JapaneseTokenizer.
 *
 * @param factory the AttributeFactory to use
 * @param input Reader containing text
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @param mode tokenization mode.
 */
public JapaneseTokenizer
    (AttributeFactory factory, Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
  super(factory, input);
  dictionary = TokenInfoDictionary.getInstance();
  fst = dictionary.getFST();
  unkDictionary = UnknownDictionary.getInstance();
  characterDefinition = unkDictionary.getCharacterDefinition();
  this.userDictionary = userDictionary;
  costs = ConnectionCosts.getInstance();
  fstReader = fst.getBytesReader();
  if (userDictionary != null) {
    userFST = userDictionary.getFST();
    userFSTReader = userFST.getBytesReader();
  } else {
    userFST = null;
    userFSTReader = null;
  }
  this.discardPunctuation = discardPunctuation;
  switch(mode){
    case SEARCH:
      searchMode = true;
      extendedMode = false;
      outputCompounds = true;
      break;
    case EXTENDED:
      searchMode = true;
      extendedMode = true;
      outputCompounds = false;
      break;
    default:
      searchMode = false;
      extendedMode = false;
      outputCompounds = false;
      break;
  }
  buffer.reset(this.input);

  resetState();

  dictionaryMap.put(Type.KNOWN, dictionary);
  dictionaryMap.put(Type.UNKNOWN, unkDictionary);
  dictionaryMap.put(Type.USER, userDictionary);
}
Project: search    File: JapaneseTokenizer.java
/**
 * Create a new JapaneseTokenizer.
 * <p>
 * Uses the default AttributeFactory.
 * 
 * @param input Reader containing text
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @param mode tokenization mode.
 */
public JapaneseTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
  this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, input, userDictionary, discardPunctuation, mode);
}
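The Reader-based convenience constructor above can also be driven directly, without an Analyzer. A minimal sketch in the Lucene 4.x style used by this snippet (the Tokenizer is constructed with a Reader); the sample sentence is arbitrary and no user dictionary is supplied.

import java.io.StringReader;

import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class JapaneseTokenizerDemo {
  public static void main(String[] args) throws Exception {
    // null user dictionary, discard punctuation, search-mode segmentation.
    try (JapaneseTokenizer tokenizer = new JapaneseTokenizer(
        new StringReader("東京スカイツリーへ行く"), null, true, Mode.SEARCH)) {
      CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
      OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
      tokenizer.reset();
      while (tokenizer.incrementToken()) {
        System.out.println(term.toString() + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
      }
      tokenizer.end();
    }
  }
}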
Project: read-open-source-code    File: JapaneseTokenizer.java
/**
 * Create a new JapaneseTokenizer.
 * <p>
 * Uses the default AttributeFactory.
 * 
 * @param input Reader containing text
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @param mode tokenization mode.
 */
public JapaneseTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
  this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, userDictionary, discardPunctuation, mode);
}
Project: Maskana-Gestor-de-Conocimiento    File: JapaneseTokenizer.java
/**
 * Create a new JapaneseTokenizer.
 * <p>
 * Uses the default AttributeFactory.
 * 
 * @param input Reader containing text
 * @param userDictionary Optional: if non-null, user dictionary.
 * @param discardPunctuation true if punctuation tokens should be dropped from the output.
 * @param mode tokenization mode.
 */
public JapaneseTokenizer(Reader input, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) {
  this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, userDictionary, discardPunctuation, mode);
}