The following 9 code examples, collected from open-source Python projects, illustrate how to use jieba.posseg.lcut().
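As a quick orientation before the project examples, here is a minimal sketch of the API: jieba.posseg.lcut() segments a string and returns a list of pair objects, each exposing .word and .flag (the part-of-speech tag); each pair also unpacks directly as a (word, flag) tuple, which is the pattern every example below relies on.

import jieba.posseg as pseg

pairs = pseg.lcut(u"我爱北京天安门")   # list of pair objects
for p in pairs:
    print(p.word, p.flag)              # e.g. 我/r, 爱/v, 北京/ns, 天安门/ns
# each pair also unpacks directly into (word, flag)
for word, flag in pairs:
    print(word, flag)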
import codecs
import pandas as pd
import jieba.posseg as psg

def extract_dictionary_feature(file_name, col_tag=0, col_content=1):
    # Load the sentiment/degree vocabularies, one word per line
    adv = codecs.open('./data/vocabulary/adv.txt', 'rb', encoding='utf-8').read().split('\n')
    inverse = codecs.open('./data/vocabulary/inverse.txt', 'rb', encoding='utf-8').read().split('\n')
    negdict = codecs.open('./data/vocabulary/negdict.txt', 'rb', encoding='utf-8').read().split('\n')
    posdict = codecs.open('./data/vocabulary/posdict.txt', 'rb', encoding='utf-8').read().split('\n')

    contents = pd.read_excel(file_name, header=None)
    print('cut words...')
    # POS-tag each review and drop stopword tokens
    # (stopwords, reviews2matrix and matrix2vec are defined elsewhere in the project)
    cw = lambda x: [pair for pair in psg.lcut(x) if pair.word not in stopwords]
    contents['pairs'] = contents[col_content].apply(cw)
    matrix = reviews2matrix(list(contents['pairs']), posdict, negdict, inverse, adv)
    x = matrix2vec(matrix)
    y = list(contents[col_tag])
    return x, y
import os
import codecs
import jieba.posseg as pseg

def delNOTNeedWords(content, customstopwords=None):
    # Fall back to a stopword file if no stopword list was passed in
    if customstopwords is None:
        customstopwords = "stopwords.txt"
        if os.path.exists(customstopwords):
            stop_words = codecs.open(customstopwords, encoding='UTF-8').read().split(u'\n')
            customstopwords = stop_words
    result = ''
    return_words = []
    words = pseg.lcut(content)
    # Keep only tokens tagged as nouns, time words, verbs, adjectives, numerals, etc.
    for word, flag in words:
        tempword = word.strip(' ')
        if (word not in customstopwords and len(tempword) > 0 and
                flag in [u'n', u'nr', u'ns', u'nt', u'nz', u'ng', u't', u'tg', u'f',
                         u'v', u'vd', u'vn', u'vf', u'vx', u'vi', u'vl', u'vg',
                         u'a', u'an', u'ag', u'al', u'm', u'mq', u'o', u'x']):
            result += tempword
            return_words.append(tempword)
    return result, return_words
import jieba.posseg as pseg

def delNOTNeedWords(content, stopwords):
    result = ''
    words = pseg.lcut(content)
    # Skip stopwords and function-word tags (jieba flags carry no leading slash,
    # so compare against bare tags such as "x", not "/x")
    for word, flag in words:
        if word not in stopwords and flag not in ["x", "zg", "uj", "ul", "e", "d", "uz", "y"]:
            result += word
    return result
import jieba
import jieba.analyse
import jieba.posseg as pseg

def jieba_example():
    raw = u"????S5????,123,?,?"  # sample sentence mixing words, digits and punctuation
    raw_seq = jieba.cut(raw)        # generator of tokens
    raw_seq_list = jieba.lcut(raw)  # same segmentation, returned as a list
    raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
    raw_with_ictclas = pseg.cut(raw)  # tokens with ICTCLAS-style POS tags
    for word, flag in raw_with_ictclas:
        print(word, flag)
import jieba.posseg as pseg

def cut_with_flag(raw_str, filter_invalid_word_flag=True):
    """
    Segment raw_str with POS tagging.
    :param raw_str: str
    :return: list[(str, str)]
    """
    res = [(a, b) for a, b in pseg.lcut(raw_str)]
    if filter_invalid_word_flag:
        # filter_invalid_word is defined elsewhere in the project
        return filter_invalid_word(res)
    else:
        return res
import jieba
import jieba.posseg as pseg

def maxSimTxt(self, intxt, simCondision=0.1, simType='simple'):
    """
    Find the knowledge-base question most similar to the input text.
    simType: 'simple', 'simple_pos', or 'vec'
    """
    self.lastTxt.append(intxt)
    if simType not in ('simple', 'simple_pos', 'vec'):
        return 'error: invalid simType for maxSimTxt: {}'.format(simType)

    # Fall back to simple_pos matching when no word-vector model is loaded
    embedding = self.vecModel
    if simType == 'vec' and not embedding:
        simType = 'simple_pos'

    for t in self.zhishiku:
        questions = t.q_vec if simType == 'vec' else t.q_word
        in_vec = jieba.lcut(intxt) if simType == 'simple' else pseg.lcut(intxt)
        t.sim = max(
            similarity(in_vec, question, method=simType, embedding=embedding)
            for question in questions
        )
    maxSim = max(self.zhishiku, key=lambda x: x.sim)
    logger.info('maxSim=' + format(maxSim.sim, '.0%'))
    if maxSim.sim < simCondision:
        # Fallback reply when no question is similar enough
        return 'Sorry, no sufficiently similar question was found.'
    return maxSim.a
import codecs
import jieba.posseg as pseg

def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    :param rtepair: a ``RTEPair`` from which features should be extracted, (txt, hyp)
    :param stop: if ``True``, stopwords are thrown away.
    :type stop: bool
    """
    global stop_word_path
    self.stop = stop
    self.stopwords = codecs.open(stop_word_path + 'stopwords.txt', encoding='UTF-8').read()
    # Chinese negation words
    self.negwords = set([u"?", u"??", u"??", u"?", u"??", u"??", u"??", u"??", u"??"])
    # POS-tag both sides of the pair
    text_words = pseg.lcut(rtepair[0])
    hyp_words = pseg.lcut(rtepair[1])
    self.text_words = set()
    self.hyp_words = set()

    # Lemmatization (e.g. via wordnet) is not applied here
    if lemmatize:
        pass

    # Drop stopwords, keeping (word, flag) pairs
    for word, flag in text_words:
        if word not in self.stopwords:
            self.text_words.add((word, flag))
    for word, flag in hyp_words:
        if word not in self.stopwords:
            self.hyp_words.add((word, flag))

    # Set relations between hypothesis and text
    self._overlap = self.hyp_words & self.text_words    # in both hyp and text
    self._hyp_extra = self.hyp_words - self.text_words  # in hyp but not in text
    self._txt_extra = self.text_words - self.hyp_words  # in text but not in hyp
import jieba.posseg as pseg

def delstopwords(content):
    result = ''
    # Strip all whitespace, then POS-tag the remaining text
    words = pseg.lcut("".join(content.split()))
    for word, flag in words:
        # Compare against bare jieba tags ("x", "zg", ...), which carry no leading slash
        if word not in stopwords and flag not in ["x", "zg", "uj", "ul", "e", "d", "uz", "y"]:
            result += word
    return result
import jieba.posseg as pseg

# double_none_prefix, set_neg_prefix and set_very_prefix are
# module-level word sets defined elsewhere in the project

def prefix_process(curr_index, sentence, score):
    """
    Adjust a sentiment score according to the prefix words in front of it.
    :param curr_index: index of the scored word w within sentence
    :param score: the word's base sentiment score
    :param sentence: the sentence being scored
    :return: the adjusted score
    """
    num_cnt = 5
    # Take (up to) the five characters immediately before the current word
    if curr_index - num_cnt > 0:
        seg = sentence[curr_index - num_cnt:curr_index]
    else:
        seg = sentence[0:curr_index]

    # A double-negation prefix slightly weakens the score
    for curr_neg_prefix in double_none_prefix:
        if seg.endswith(curr_neg_prefix):
            return 0.8 * score

    # A negation prefix flips the score, unless the negation is itself negated
    for curr_neg_prefix in set_neg_prefix:
        if seg.endswith(curr_neg_prefix):
            temp_pair = pseg.lcut(sentence[0:curr_index])
            for i, (w, f) in enumerate(reversed(temp_pair)):
                if f.startswith(u"x"):
                    break
                elif f.startswith(u"r") or f.startswith(u"n") or f.startswith(u"m"):
                    if (len(temp_pair) - i - 2) > 0 and \
                            temp_pair[len(temp_pair) - i - 2].word in set_neg_prefix:
                        return 1.3 * score
            return -1.3 * score

    # A negation word further back in the window also flips (and dampens) the score
    temp_pair = pseg.lcut(seg)
    for i, (w, f) in enumerate(reversed(temp_pair)):
        if f.startswith(u"x"):
            break
        elif f.startswith(u"r") or f.startswith(u"n") or f.startswith(u"m"):
            if temp_pair[len(temp_pair) - i - 2].word in set_neg_prefix:
                return -0.6 * score

    # A degree-adverb prefix strengthens the score
    for curr_very_prefix in set_very_prefix:
        if seg.endswith(curr_very_prefix):
            return 1.3 * score
    return score