The following 50 code examples, extracted from open-source Python projects, show how to use jieba.posseg.cut().
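Before the project snippets, here is a minimal, self-contained sketch of the call pattern they all share: pseg.cut() yields pair objects whose .word and .flag attributes (also unpackable as (word, flag) tuples) hold the token and its part-of-speech tag. The sample sentence and expected tags come from jieba's own README.

# Minimal usage sketch of jieba.posseg.cut()
import jieba.posseg as pseg

for word, flag in pseg.cut("我来到北京清华大学"):
    print(word, flag)   # e.g. 我/r, 来到/v, 北京/ns, 清华大学/nt
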
def parse(self, in_file, out_file):
    output_file = open(out_file, 'w')
    with open(in_file, 'r') as file:
        line = file.readline()
        i = 0
        for line in file.readlines():
            sentence = ""
            line = line.strip().split('\t')
            for word, flag in pseg.cut(line[1].strip()):
                if flag == 'x':
                    continue
                else:
                    sentence = sentence + word + " "
            output_file.write(sentence.strip() + "\n")
            i += 1
            if i % 100 == 0:
                print('Handle lines %d' % i)

def cut_for_property(self, text):
    '''
    @summary: segment the text and keep the part-of-speech of each word
    ---------
    @param text: the text to segment
    ---------
    @result: [(text1, property1) ... (textN, propertyN)]
    '''
    words_list = []
    words = pseg.cut(text)
    for word in words:
        if word.word not in self._stop_words:
            words_list.append((word.word, word.flag))
    return words_list

def extract_keyword(self):
    sents = []
    comm_list = self.dao.get_hotel_comments()
    # split every hotel comment into sentences
    for comm in comm_list:
        sents.extend(normal.get_sentences(comm[2]))
    print "length of sentences:%d" % len(sents)
    # POS-tag every sentence
    pos_sents = []
    for sent in sents:
        pos_sents.append(pseg.cut(sent))
    print "length of pos_sents:%d" % len(pos_sents)
    # count the nouns
    print "counting"
    noun_dict = {}
    for pos_sent in pos_sents:
        for key, type in pos_sent:
            if type == "n":
                if key not in noun_dict:
                    noun_dict[key] = 1
                else:
                    noun_dict[key] = noun_dict[key] + 1
    a = sorted(noun_dict.iteritems(), key=lambda asd: asd[1], reverse=True)
    return a

def handel_weibo(filename):
    fp = open("f://emotion/mysite/Label_extract/weibo_corpus/" + filename, 'r')
    contents = []
    for line in fp.readlines():          # read the corpus line by line
        line = line.strip()
        line.decode('utf-8')
        seg_lines = pseg.cut(line)       # segment with POS tags
        for seg_line in seg_lines:       # keep only noun-like words
            if seg_line.flag == 'n' or seg_line.flag == 'nr' or seg_line.flag == 'ns' or seg_line.flag == 'nt' or seg_line.flag == 'nz':
                contents.append(seg_line.word)   # collect the nouns
    #print "length:", len(contents)
    fp.close()
    # write the extracted nouns to a new file, one per line
    fp_handel = open('f://emotion/mysite/Label_extract/weibo_corpus_handel/handel_' + filename, 'w+')
    for content in contents:
        fp_handel.write(content)
        fp_handel.write('\n')
    fp_handel.close()

def read_test_list():
    fp = open("f://emotion/mysite/weibo_crawler/chinese_weibo.txt", 'r')
    contents = []
    for line in fp.readlines():          # read the corpus line by line
        line = line.strip()
        line.decode('utf-8')
        seg_lines = pseg.cut(line)       # segment with POS tags
        for seg_line in seg_lines:       # keep only noun-like words
            if seg_line.flag == 'n' or seg_line.flag == 'nr' or seg_line.flag == 'ns' or seg_line.flag == 'nt' or seg_line.flag == 'nz':
                contents.append(seg_line.word)   # collect the nouns
    fp.close()
    #for w in contents:
    #    print w
    # join the nouns into one space-separated string
    str_test = ' '.join(contents)
    return str_test

def MatchItem(self, input, start, end, muststart, mode=None):
    self.LogIn(input, start, end)
    pos = start
    if end is None:
        end = len(input)
    seg_list = pseg.cut(input[start:end] if self.Len == -1 else input[start:start + self.Len])
    for word, flag in seg_list:
        if self.Pos is None:
            sword = word
            break
        else:
            if flag in self.Pos:
                sword = word
                break
        pos += len(word)
    if pos < 0 or (muststart == True and pos != start):
        self.LogOut(None)
        return start + self.Len if self.Len < 0 else tnpy.int_max
    self.LogOut(sword)
    m = tnpy.MatchResult(self, sword, pos)
    m.rstr = sword
    return m

def cut_Text(content, nomial=False):
    """
    :param content: string
    :param nomial: if nomial is True, only noun-like words will remain
    :return: a text whose format is 'a b c d'
    """
    if nomial:
        text = ''
        words = pseg.cut(content)
        for word in words:
            if contain(['n'], word.flag):
                text = text + ' ' + word.word
        return text.strip()
    else:
        text = ''
        words = jieba.cut(content)
        for word in words:
            text = text + ' ' + word
        return text.strip()

def cut_Dataset(data_set, parrel=False, nomial=False):
    """
    :param data_set: bunch of Dataset
    :param parrel: if True, cut the dataset in parallel (not available on Windows)
    :param nomial: if nomial is True, only noun-like words will remain
    :return: data_set after cutting
    """
    from tqdm import tqdm
    data_cut = []
    start = time.time()
    print('cuting dataset......')
    if parrel:
        p = ThreadPool(9)
        p.map(cut_Text, data_set.data)
        p.close()
        p.join()
    else:
        n = 0
        for doc_content in tqdm(data_set.data):
            data_cut.append(cut_Text(doc_content, nomial))
    end = time.time()
    print('cuting runs %0.2f seconds.' % (end - start))
    data_set.data = data_cut

def splitWord(self, content):
    segs = pseg.cut(str(content))
    result = []
    for word, type in segs:
        WORD = Word()
        if self.wordtypeDict.has_key(word):
            WORD.setword(word)
            WORD.settype(self.wordtypeDict[word])
            WORD.setfreq(self.wordfreqDict[word])
        else:
            WORD.setword(word)
            WORD.settype(type)
        # print "word ", word
        result.append(WORD)
    return result

def get_word_list(self, text, lower=True, strip_stop_words=True, use_tag_filter=False):
    text = util.as_text(text)
    jieba_result = pseg.cut(text)

    if use_tag_filter:
        jieba_result = [w for w in jieba_result if w.flag in self.default_tag_filter]
    else:
        jieba_result = [w for w in jieba_result]

    word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
    word_list = [word for word in word_list if len(word) > 0]

    if lower:
        word_list = [word.lower() for word in word_list]

    if strip_stop_words:
        word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]

    return word_list

def load(self):
    from gensim.models import Word2Vec
    # container for the loaded sentences
    self.link_database = []
    # load the trained word2vec model
    self.vecmodel = Word2Vec.load(self.model_file)
    log.info('???????')
    log.info('???????')
    with open(self.txt_file) as fp:
        senten_list = fp.readlines()
    log.debug("senten%s", senten_list)
    for senten_txt in senten_list:
        self.link_database.append(Senten2vec(senten_txt))
    log.info('???????????')
    for link in self.link_database:
        link.sentence_word = (set(jieba.cut(link.sentence)))
    for link in self.link_database:
        link.sentence_vec = {word for word in link.sentence_word if word in self.vecmodel.wv.index2word}
    log.info('???????')

def juziSim_vec(self, intxt, questionWordset, posWeight=None):
    # compare the input sentence against the question word set, weighting by POS
    if posWeight is None:
        log.warning('there is no posWeight')
        return 0
    intxtSet = set(list(pseg.cut(intxt)))
    if not len(intxtSet):
        return 0
    simWeight = 0
    totalWeight = 0
    for word, pos in intxtSet:
        if word in self.vecmodel.wv.index2word:
            wordPosWeight = posWeight.get(pos, 1)
            totalWeight += wordPosWeight
            wordMaxWeight = 0
            for t in questionWordset:
                # print(word, t)
                tmp = self.vecmodel.wv.similarity(word, t)
                if wordMaxWeight < tmp:
                    wordMaxWeight = tmp
            simWeight += wordPosWeight * wordMaxWeight
    if totalWeight == 0:
        return 0
    return simWeight / totalWeight

def __call__(self, question):
    # print(question.questionSentence)
    qSentence = question.questionSentence
    # question.wordsToken = list(jieba.cut(qSentence))
    question.wordsToken, question.posToken = getPosToken(qSentence)
    assert len(question.wordsToken) == len(question.posToken)
    # print 'Length words Token = %d' % (len(question.wordsToken))
    # print 'Length pos token = %d' % (len(question.posToken))
    question.keyWordToken = list(jieba.analyse.extract_tags(qSentence, topK=5))
    # print ' '.join(question.keyWordToken)
    # dependency = parser.parse(words).next()
    # print '/'.join(question.wordsToken)
    # for word, flag in question.posToken:
    #     print('%s %s' % (word, flag))
    question.questionType, question.answerType = getQuestionType(question.questionSentence)
    question.getAnswerTemp()
    # my_print(question.answerTemp)
    # print question.answerRe

def ansFind(wikiList, typeInfo, Ques, obj):
    wordList = convert.solve(Ques)
    keyList = convert.getKeyWords(wordList)
    for j in range(len(wordList)):
        if j >= len(wordList):
            break
        if wordList[j][1].startswith("u") or wordList[j][1].startswith("x") or wordList[j][1].startswith("p"):
            del wordList[j]
            j = j - 1
    sourceList = []
    for i in range(len(wikiList)):
        words = pseg.cut(wikiList[i])
        relevantList = []
        for w in words:
            wordsGroup = [w.word, w.flag]
            relevantList.append(wordsGroup)
        sourceList.append(relevantList)
    typeStr = ansExtract.getTypeStr(typeInfo)
    ansList = ansExtract.check(sourceList, wordList, typeStr, typeInfo, obj)
    return ansDecide.chooseAns(ansList, typeStr, typeInfo, obj)

def jiebafenci(all_the_text):
    re = ""
    relist = ""
    words = pseg.cut(all_the_text)
    count = 0
    for w in words:
        flag = w.flag   # part-of-speech tag
        tmp = w.word    # the word itself
        # print "org: " + tmp
        # \u4e00-\u9fa5 is the unicode range of Chinese characters, so keep only
        # tokens whose first character is Chinese and whose tag is not filtered
        if len(tmp) > 1 and len(flag) > 0 and flag[0] not in flag_list and tmp[0] >= u'\u4e00' and tmp[0] <= u'\u9fa5':
            re = re + " " + w.word
    re = re.replace("\n", " ").replace("\r", " ")
    if len(re) > 40:
        relist = re
        relist = relist + "\n"
    return relist

def getTrainData(inpath, outfile):
    i = 0
    for filename in os.listdir(inpath):
        fw = open(outfile + str(i) + ".cut", "w")   # output file for the segmented text
        i = i + 1
        file_object = open(inpath + "\\" + filename, 'r', encoding='UTF-8')
        try:
            all_the_text = file_object.read()
            # all_the_text = all_the_text.decode("gb2312").encode("utf-8")
            pre_text = jiebafenci(all_the_text)
            pre_text.encode('UTF-8')
            if len(pre_text) > 30:
                fw.write(pre_text)
        except:
            print('@' * 20)
            pass
        finally:
            file_object.close()
            fw.close()

# ['C000008', 'C000010', 'C000013', 'C000014', 'C000016', 'C000020', 'C000022', 'C000023', 'C000024']

def jieba_example():
    raw = "????S5????,123,?,?"
    raw_seq = jieba.cut(raw)
    raw_seq_list = jieba.lcut(raw)
    raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
    raw_with_ictclas = pseg.cut(raw)
    for word, flag in raw_with_ictclas:
        print word, flag

def jaccard_similarity_score(context1, context2, flag1, flag2):
    #print 'context1', context1
    try:
        if flag1 and len(context1) != 0:
            temp = context1[-1]
            context1.pop()
            context1 += list(pseg.cut(temp))
        if flag2 and len(context2) != 0:
            temp = context2[-1]
            context2.pop()
            context2 += list(pseg.cut(temp))
    except:
        pass
    mySet = set(context1 + context2)
    a1 = []
    a2 = []
    for item in mySet:
        if item in context1:
            a1.append(1)
        else:
            a1.append(0)
        if item in context2:
            a2.append(1)
        else:
            a2.append(0)
    #print sklearn.metrics.jaccard_similarity_score(a1, a2)
    return sklearn.metrics.jaccard_similarity_score(a1, a2)

# contextSim between element[i] and element[j]

def parse():
    """parse the comments"""
    import jieba
    import jieba.posseg as pseg

    # Load User's Dictionary
    path_list = os.getcwd().split('/')
    path_list.append("dict.txt")
    dict_path = '/'.join(path_list)
    jieba.load_userdict(dict_path)

    # Dismiss These Flags
    dismiss = ['b', 'c', 'r', 'uj', 'u', 'p', 'q', 'uz', 't', 'ul', 'k', 'f', 'ud', 'ug', 'uv']

    comments = Comment.query.all()
    for comment in comments:
        word_list = []
        pseg_cut = pseg.cut(comment.body)
        for word, flag in pseg_cut:
            if flag not in dismiss:
                word_list.append(word)
        comment.parsed = '/'.join(word_list)
        db.session.add(comment)
        print "Comment %04d Parsed!" % comment.id
    db.session.commit()
    print "ALL DONE!"

def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.

    If a word in this sentence was not found in the synonym dictionary,
    it will be marked with the default value of the word segmentation tool.

    Args:
        pattern: 'w'-plain word segmentation, 'k'-top-1 keyword, 't'-top-10 keywords,
            'wf'-(word, flag) pairs, 'tf'-top-10 keywords with flags.
    """
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        result = list(jieba.cut(sentence))
        synonym_vector = [item for item in result if item not in punctuation_all]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # synonym_vector = [(item.word, item.flag) for item in result \
        #                   if item.word not in punctuation_all]
        # Modify in 2017.4.27
        for item in result:
            if item.word not in punctuation_all:
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector

def segment(self, text, lower=True, use_stop_words=True, use_speech_tags_filter=False):
    """Segment the text and return the words as a list; punctuation is removed.

    Keyword arguments:
    lower                  -- whether to convert the words to lower case
    use_stop_words         -- if True, words in the stop-word list are removed from the result
    use_speech_tags_filter -- whether to filter by POS tag; if True, only words whose tag is
                              in self.default_speech_tag_filter are kept
    """
    text = util.as_text(text)
    jieba_result = pseg.cut(text)

    if use_speech_tags_filter == True:
        jieba_result = [w for w in jieba_result if w.flag in self.default_speech_tag_filter]
    else:
        jieba_result = [w for w in jieba_result]

    # remove special symbols
    word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
    word_list = [word for word in word_list if len(word) > 0]

    if lower:
        word_list = [word.lower() for word in word_list]

    if use_stop_words:
        word_list = [word.strip() for word in word_list if word.strip() not in self.stop_words]

    return word_list

def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
    """
    Extract keywords from sentence using TextRank algorithm.
    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
                      if False, return a list of words.
        - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                    if the POS of w is not in this list, it will be filtered.
        - withFlag: if True, return a list of pair(word, weight) like posseg.cut;
                    if False, return a list of words.
    """
    self.pos_filt = frozenset(allowPOS)
    g = UndirectWeightedGraph()
    cm = defaultdict(int)
    words = tuple(self.tokenizer.cut(sentence))
    for i, wp in enumerate(words):
        if self.pairfilter(wp):
            for j in xrange(i + 1, i + self.span):
                if j >= len(words):
                    break
                if not self.pairfilter(words[j]):
                    continue
                if allowPOS and withFlag:
                    cm[(wp, words[j])] += 1
                else:
                    cm[(wp.word, words[j].word)] += 1
    for terms, w in cm.items():
        g.addEdge(terms[0], terms[1], w)
    nodes_rank = g.rank()
    if withWeight:
        tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
    else:
        tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
    if topK:
        return tags[:topK]
    else:
        return tags

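The method above appears to be jieba's bundled TextRank keyword extractor. Assuming that is the case, user code normally reaches it through the module-level helper rather than calling the method directly; the sample text below is an arbitrary placeholder.

# Hedged usage sketch: calling jieba's TextRank extractor via its public helper.
import jieba.analyse

text = "线程是程序执行时的最小单位，它是进程的一个执行流。"   # arbitrary sample text
for word, weight in jieba.analyse.textrank(text, topK=5, withWeight=True,
                                           allowPOS=('ns', 'n', 'vn', 'v')):
    print(word, weight)
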
def cut(filename1, filename2):
    # seg and pos: segment each line of filename1 and write the words to filename2
    f = open(filename2, 'w')
    for line in open(filename1):
        res = pseg.cut(line.strip())
        split_line = ' '.join([w.word for w in res]) + '\n'
        f.write(split_line.encode('utf-8'))
    # print '%s split successful' % (filename1)

def main():
    source_path, target_path = sys.argv[1], sys.argv[2]
    source_files, target_files = getFileList(source_path, target_path)
    # print fileList
    for filename1, filename2 in zip(source_files, target_files):
        cut(filename1, filename2)

def cut(contents):
    # word segmentation
    split_contents = []
    for line in contents:
        res = pseg.cut(line.strip())
        split_line = ' '.join([w.word for w in res])
        split_contents.append(split_line)
    return split_contents

def main():
    source_file = 'law_text.txt'
    law_text_list = readFromFile(source_file)
    print len(law_text_list)
    split_contents = cut(law_text_list)
    # cPickle.dump(split_contents, open('split_law_text.pkl', 'wb'))
    print len(split_contents)
    # for item in law_text_list:
    #     print item
    print law_text_list[1].strip()
    print split_contents[1].strip()

def testDefaultCut(self):
    for content in test_contents:
        result = jieba.cut(content)
        assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
        result = list(result)
        assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
        print(" , ".join(result), file=sys.stderr)
    print("testDefaultCut", file=sys.stderr)

def testCutAll(self):
    for content in test_contents:
        result = jieba.cut(content, cut_all=True)
        assert isinstance(result, types.GeneratorType), "Test CutAll Generator error"
        result = list(result)
        assert isinstance(result, list), "Test CutAll error on content: %s" % content
        print(" , ".join(result), file=sys.stderr)
    print("testCutAll", file=sys.stderr)

def testSetDictionary(self):
    jieba.set_dictionary("foobar.txt")
    for content in test_contents:
        result = jieba.cut(content)
        assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
        result = list(result)
        assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
        print(" , ".join(result), file=sys.stderr)
    print("testSetDictionary", file=sys.stderr)

def testPosseg(self):
    import jieba.posseg as pseg
    for content in test_contents:
        result = pseg.cut(content)
        assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Posseg error on content: %s" % content
        print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
    print("testPosseg", file=sys.stderr)

def testDefaultCut_NOHMM(self):
    for content in test_contents:
        result = jieba.cut(content, HMM=False)
        assert isinstance(result, types.GeneratorType), "Test DefaultCut Generator error"
        result = list(result)
        assert isinstance(result, list), "Test DefaultCut error on content: %s" % content
        print(" , ".join(result), file=sys.stderr)
    print("testDefaultCut_NOHMM", file=sys.stderr)

def cuttest(test_sent):
    result = pseg.cut(test_sent, HMM=False)
    for word, flag in result:
        print(word, "/", flag, ", ", end=' ')
    print("")

def cuttest(test_sent):
    result = pseg.cut(test_sent)
    for word, flag in result:
        print(word, "/", flag, ", ", end=' ')
    print("")

def cuttest(test_sent):
    result = pseg.cut(test_sent)
    for w in result:
        print(w.word, "/", w.flag, ", ", end=' ')
    print("")

def get_hot_noun_counts(source_file):
    f = open(source_file, "r")
    data = f.read()
    # matches message headers such as '2016-06-24 15:42:52 ??(40**21)'
    re_pat = r'[\d-]{10}\s[\d:]{7,8}\s+[^\n]+\d{5,11}\)'
    # li = re.findall(re_pat, data)
    li_content = re.split(re_pat, data)
    s = ""
    for l in li_content:
        s = s + l
    seg_list = pseg.cut(s.strip())
    lists = []
    for w in seg_list:
        if w.flag == "ns":
            lists.append(w.word)
    seg_list_norepeat = set(lists)
    word_set = {}
    for seg in seg_list_norepeat:
        count = 0
        for ss in lists:
            if ss == seg:
                count += 1
        word_set[seg] = count
    word_tuple_sort = sorted(word_set.items(), key=lambda e: e[1], reverse=True)
    return word_tuple_sort

def cut(self, text, cut_all=False):
    '''
    @summary: word segmentation
    ---------
    @param text: the text to segment
    @param cut_all: True for full mode, False for accurate mode.
        Full mode lists every word that can possibly be formed from the sentence,
        while accurate mode tries to produce the most precise segmentation.
    ---------
    @result:
    '''
    result = list(jieba.cut(text, cut_all=cut_all))
    result = self.__del_stop_key(result)
    return result

def __is_clause_pattern3(self, the_clause, seg_result):
    for a_phrase in self.__phrase_dict:
        keys = a_phrase.keys()
        to_compile = a_phrase["key"].replace("……", "[\u4e00-\u9fa5]*")
        if "start" in keys:
            to_compile = to_compile.replace("*", "{" + a_phrase["start"] + "," + a_phrase["end"] + "}")
        if "head" in keys:
            to_compile = a_phrase["head"] + to_compile
        match = re.compile(to_compile).search(the_clause)
        if match is not None:
            can_continue = True
            pos = [flag for word, flag in posseg.cut(match.group())]
            if "between_tag" in keys:
                if a_phrase["between_tag"] not in pos and len(pos) > 2:
                    can_continue = False
            if can_continue:
                for i in range(len(seg_result)):
                    if seg_result[i].word in match.group():
                        try:
                            if seg_result[i + 1].word in match.group():
                                return self.__emotional_word_analysis(
                                    a_phrase["key"] + ":" + match.group(), a_phrase["value"],
                                    [x for x, y in seg_result], i)
                        except IndexError:
                            return self.__emotional_word_analysis(
                                a_phrase["key"] + ":" + match.group(), a_phrase["value"],
                                [x for x, y in seg_result], i)
    return ""

def extract_keyword_by_thulac(self):
    sents = []
    comm_list = self.dao.get_hotel_comments()
    # split every hotel comment into sentences
    for comm in comm_list:
        sents.extend(normal.get_sentences(comm[2]))
    print "length of sentences:%d" % len(sents)
    # POS-tag every sentence with thulac
    pos_sents = []
    for sent in sents:
        try:
            pos_sents.append(map(lambda x: x.split("_"), self.thu.cut(sent.encode("utf-8"))))
        except:
            print sent
            continue
    print "length of pos_sents:%d" % len(pos_sents)
    # count the nouns
    print "counting"
    noun_dict = {}
    for pos_sent in pos_sents:
        for word in pos_sent:
            if word[1] == "n":
                if word[0] not in noun_dict:
                    noun_dict[word[0]] = 1
                else:
                    noun_dict[word[0]] = noun_dict[word[0]] + 1
    a = sorted(noun_dict.iteritems(), key=lambda asd: asd[1], reverse=True)
    return a

def seg(self, sentence):
    words = list()
    tags = list()
    for item in pseg.cut(sentence):
        words.append(item.word)
        tags.append(item.flag)
    return words, tags

def jieba_cut():
    # segment the positive-word dictionary pos_all_dict
    fp_pos = open("hownet/pos_all_dict.txt", "r")                                # source dictionary
    fp_pos_cut = codecs.open('hownet/pos_all_cut.txt', "w+", encoding='UTF-8')   # segmented output
    contents = fp_pos.readlines()
    for content in contents:
        word = content.decode("utf-8")
        word_tag = pseg.cut(word)
        str_tag = ""
        for tag in word_tag:
            str_tag += str(tag.word) + '/' + str(tag.flag)
        p = re.compile(r'/x(.*)')
        str_tag = p.sub(r'\1', str_tag)   # strip the '/x' flag that tags the trailing newline
        fp_pos_cut.write(str_tag)
    fp_pos.close()
    fp_pos_cut.close()

    # segment the negative-word dictionary neg_all_dict
    fp_neg = open("hownet/neg_all_dict.txt", "r")                                # source dictionary
    fp_neg_cut = codecs.open('hownet/neg_all_cut.txt', "w+", encoding='UTF-8')   # segmented output
    contents = fp_neg.readlines()
    for content in contents:
        word = content.decode("utf-8")
        word_tag = pseg.cut(word)
        str_tag = ""
        for tag in word_tag:
            str_tag += str(tag.word) + '/' + str(tag.flag)
        p = re.compile(r'/x(.*)')
        str_tag = p.sub(r'\1', str_tag)
        fp_neg_cut.write(str_tag)
    fp_neg.close()
    fp_neg_cut.close()

def handel_weibo_data():
    # read the crawled weibo file and keep only the noun-like words of each post
    fp = open("f://emotion/mysite/weibo_crawler/chinese_weibo.txt", 'r')
    weibo_data = []   # result format: [[nouns of post 1], [nouns of post 2], ...]
    for line in fp.readlines():          # read the corpus line by line
        contents = []
        line = line.strip()
        line.decode('utf-8')
        seg_lines = pseg.cut(line)       # segment with POS tags
        for seg_line in seg_lines:       # keep only noun-like words
            if seg_line.flag == 'n' or seg_line.flag == 'nr' or seg_line.flag == 'ns' or seg_line.flag == 'nt' or seg_line.flag == 'nz':
                contents.append(seg_line.word)   # collect the nouns
        weibo_data.append(contents)
    fp.close()
    return weibo_data

def segmentation(sentence):
    seg_list = jieba.cut(sentence)
    seg_result = []
    for w in seg_list:
        seg_result.append(w)
    #print seg_result[:]
    return seg_result

def build_analyzer(self):
    def analyzer(doc):
        words = pseg.cut(doc)
        # drop non-word tokens (flag 'x'), then re-segment the cleaned text
        new_doc = ''.join(w.word for w in words if w.flag != 'x')
        words = jieba.cut(new_doc)
        return words
    return analyzer
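The snippet above overrides build_analyzer, which suggests it belongs to a scikit-learn vectorizer subclass used for TF-IDF features. A hedged sketch of how such an analyzer might be wired up follows; the class name JiebaTfidfVectorizer and the sample documents are illustrative, not taken from the original project.

# Sketch only: plug a jieba-based analyzer into scikit-learn's TfidfVectorizer
# by overriding build_analyzer in a subclass. Names below are hypothetical.
import jieba
import jieba.posseg as pseg
from sklearn.feature_extraction.text import TfidfVectorizer

class JiebaTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        def analyzer(doc):
            # keep only real words (drop flag 'x'), then segment the cleaned text
            new_doc = ''.join(w.word for w in pseg.cut(doc) if w.flag != 'x')
            return list(jieba.cut(new_doc))
        return analyzer

vectorizer = JiebaTfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(["这是一个测试文档", "另一个测试文档"])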