The following code examples, extracted from open-source Python projects, illustrate how to use jieba.tokenize().
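Before the project snippets, here is a minimal sketch of the API they all rely on: jieba.tokenize() takes a unicode string and yields (word, start, end) tuples, where start and end are character offsets into the input, and mode='search' additionally emits sub-words of longer tokens. The sample sentence is only illustrative.

import jieba

# jieba.tokenize() returns a generator of (word, start, end) tuples.
for word, start, end in jieba.tokenize(u'永和服装饰品有限公司'):
    print('%s\t start: %d \t end: %d' % (word, start, end))

# Search mode also yields the shorter sub-words contained in long tokens.
for word, start, end in jieba.tokenize(u'永和服装饰品有限公司', mode='search'):
    print('%s\t start: %d \t end: %d' % (word, start, end))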
import jieba
import nltk

def getChList(docStrByte):
    # Decode the GBK-encoded document bytes and lowercase everything.
    inputStr = str(docStrByte, encoding='gbk', errors='ignore').lower()
    # Join the lines into a single string before tokenizing.
    strList = ''.join(inputStr.split('\n'))
    rawTokens = list(jieba.tokenize(strList))
    # Build a stop-word lookup: each stop word becomes a dict key mapped to None.
    fSW = open('stopwords.txt', 'r', encoding='utf-8', errors='ignore').read()
    stopWord = {}.fromkeys(fSW.split('\n'))
    stopWord[''] = None
    final = []
    s = nltk.stem.SnowballStemmer('english')
    for seg in rawTokens:
        # seg is a (word, start, end) tuple; keep only the stripped word.
        rawWord = seg[0].strip()
        if rawWord.isalpha():
            # Stem alphabetic (English) tokens with the Snowball stemmer.
            word = s.stem(rawWord)
        else:
            word = rawWord
        if word not in stopWord:
            # Keep everything that is not a stop word.
            final.append(word)
    return final
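A brief usage sketch for getChList(); the file name is hypothetical, and it assumes a GBK-encoded document plus a UTF-8 stopwords.txt in the working directory:

# 'corpus/doc1.txt' is a made-up path to a GBK-encoded document.
with open('corpus/doc1.txt', 'rb') as f:
    words = getChList(f.read())
print(words[:20])   # first 20 stemmed, stop-word-filtered tokens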
def __call__(self, text, **kargs):
    words = jieba.tokenize(text, mode="search")
    token = Token()
    for (w, start_pos, stop_pos) in words:
        if not accepted_chars.match(w) and len(w) <= 1:
            continue
        token.original = token.text = w
        token.pos = start_pos
        token.startchar = start_pos
        token.endchar = stop_pos
        yield token
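The generator above is the core of a Whoosh-compatible tokenizer, matching the one jieba ships for Whoosh integration. Assuming the surrounding class is exposed as jieba.analyse.ChineseAnalyzer and the whoosh package is installed, it can be wired into an index schema roughly like this:

from whoosh.fields import Schema, TEXT
from jieba.analyse import ChineseAnalyzer

# Fields analyzed with ChineseAnalyzer are segmented via jieba.tokenize(mode="search").
schema = Schema(title=TEXT(stored=True),
                content=TEXT(stored=True, analyzer=ChineseAnalyzer()))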
def testTokenize(self):
    for content in test_contents:
        result = jieba.tokenize(content)
        assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Tokenize error on content: %s" % content
        for tk in result:
            print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]), file=sys.stderr)
    print("testTokenize", file=sys.stderr)
def testTokenize_NOHMM(self):
    for content in test_contents:
        result = jieba.tokenize(content, HMM=False)
        assert isinstance(result, types.GeneratorType), "Test Tokenize Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Tokenize error on content: %s" % content
        for tk in result:
            print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]), file=sys.stderr)
    print("testTokenize_NOHMM", file=sys.stderr)
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode, HMM=False)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
def cuttest(test_sent):
    global g_mode
    result = jieba.tokenize(test_sent, mode=g_mode)
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
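A small driver sketch for the cuttest() variants above; jieba.tokenize() accepts mode='default' or mode='search', so g_mode is assumed to hold one of those two strings, and the sentence is only illustrative:

g_mode = "default"

# Run the same sentence through both tokenize modes for comparison.
for mode in ("default", "search"):
    g_mode = mode            # cuttest() reads this module-level global
    cuttest(u'我来到北京清华大学')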
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUConfig, **Any) -> None
    if config['language'] != 'zh':
        raise Exception("tokenizer_jieba is only used for Chinese. Check your configure json file.")

    for example in training_data.training_examples:
        example.set("tokens", self.tokenize(example.text))
def process(self, message, **kwargs):
    # type: (Message, **Any) -> None
    message.set("tokens", self.tokenize(message.text))
def tokenize(self, text):
    # type: (Text) -> List[Token]
    tokenized = jieba.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
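The three methods above come from a Rasa NLU style tokenizer component. To see what tokenize() produces on its own, here is a standalone sketch; the Token class is a minimal stand-in with the (text, offset) constructor the snippet assumes, not the framework's real class, and the sentence is only illustrative:

import jieba

class Token(object):
    # Minimal stand-in: records the word and its start offset in the sentence.
    def __init__(self, text, offset):
        self.text = text
        self.offset = offset

def tokenize(text):
    # Same pattern as the method above: keep each word and its start position.
    return [Token(word, start) for (word, start, end) in jieba.tokenize(text)]

for t in tokenize(u'我想订一张去北京的机票'):
    print(t.text, t.offset)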