The following 15 code examples, extracted from open-source Python projects, illustrate how to use jieba.Tokenizer().
import collections

import jieba

# Note: any2unicode() and stat() are helper functions defined elsewhere in
# the source project; they are not part of jieba.

def __init__(self, entity_list):
    """
    [{"@id": "1", "name": "??"}, {"@id": "2", "name": "??"}]

    All input text is assumed to be (or will be converted into) unicode.
    """
    # init entity index: map each primary name to its entities
    self.entities = collections.defaultdict(list)
    entity_list_unicode = []
    for entity in entity_list:
        entity_list_unicode.append(any2unicode(entity))
    for entity in entity_list_unicode:
        name = entity["name"]
        self.entities[name].append(entity)
    # also index entities under each of their alternate names
    for entity in entity_list_unicode:
        for name in entity.get("alternateName", []):
            self.entities[name].append(entity)
    stat(entity_list_unicode, ["name"])

    # init jieba: a private Tokenizer instance, with every indexed name
    # registered as a custom word so it is segmented as a whole token
    self.tokenizer = jieba.Tokenizer()
    for name in self.entities:
        self.tokenizer.add_word(name)
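A minimal sketch of the effect this constructor relies on, assuming jieba is installed (the word "深度学习" is an illustrative stand-in for an entity name):

import jieba

tok = jieba.Tokenizer()         # private instance; module-level jieba state is untouched
tok.add_word("深度学习")         # register an entity name as a custom word
print(tok.lcut("我爱深度学习"))  # the name now survives as one token, e.g. ['我', '爱', '深度学习']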
import jieba

def __init__(self, tokenizer=None):
    # use the supplied tokenizer, or fall back to a fresh Tokenizer, then
    # load the word -> POS-tag mapping from that tokenizer's dictionary file
    self.tokenizer = tokenizer or jieba.Tokenizer()
    self.load_word_tag(self.tokenizer.get_dict_file())
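This constructor mirrors the pattern of jieba.posseg.POSTokenizer, which wraps a word tokenizer and attaches part-of-speech tags to its output. A usage sketch under that assumption:

import jieba
import jieba.posseg

tok = jieba.Tokenizer()                # custom word tokenizer
ptok = jieba.posseg.POSTokenizer(tok)  # POS tagger built on top of it
for word, flag in ptok.cut("我爱北京天安门"):
    print(word, flag)                  # e.g. 我 r / 爱 v / 北京 ns / 天安门 ns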
def lcut(self, *args, **kwargs):
    # materialize the generator returned by cut() into a list
    return list(self.cut(*args, **kwargs))

# default Tokenizer instance
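A quick sketch of the cut()/lcut() relationship on a Tokenizer instance, assuming jieba is installed:

import jieba

tok = jieba.Tokenizer()
gen = tok.cut("今天天气不错")   # cut() returns a generator of tokens
lst = tok.lcut("今天天气不错")  # lcut() is simply list(cut(...))
assert lst == list(gen)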