The following 35 code examples, extracted from open-source Python projects, illustrate how to use the jieba.analyse module.
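Before the project examples, here is a minimal usage sketch of the two keyword extractors the snippets below rely on: jieba.analyse.extract_tags (TF-IDF based) and jieba.analyse.textrank. The sample sentence and parameter values are illustrative only and do not come from any of the projects below.

# Minimal sketch: TF-IDF and TextRank keyword extraction with jieba.analyse.
# The sample text and parameter values are illustrative assumptions.
import jieba.analyse

text = "自然语言处理是人工智能领域的一个重要方向"

# TF-IDF keywords; withWeight=True returns (word, weight) pairs.
for word, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True):
    print(word, weight)

# TextRank keywords; allowPOS restricts the parts of speech considered.
for word, weight in jieba.analyse.textrank(text, topK=5, withWeight=True,
                                           allowPOS=('ns', 'n', 'vn', 'v')):
    print(word, weight)

Most of the examples that follow are variations on these two calls, combined with stop-word files, custom dictionaries, or custom IDF files.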
def get_tag(sentence, config):
    """Get semantic tag of sentence."""
    iquestion = sentence.format(**config)
    try:
        keywords = analyse.extract_tags(iquestion, topK=1)
        keyword = keywords[0]
    except IndexError:
        keyword = iquestion
    tags = synonym_cut(keyword, 'wf')  # tuple list
    if tags:
        tag = tags[0][1]
        if not tag:
            tag = keyword
    else:
        tag = keyword
    return tag
def set_stop_words(self, stop_words_path):
    '''
    @summary: load stop words from a file
    ---------
    @param stop_words_path: path of the stop-words file
    ---------
    @result:
    '''
    abs_path = _get_abs_path(stop_words_path)
    if not os.path.isfile(abs_path):
        raise Exception("jieba: file does not exist: " + abs_path)
    content = open(abs_path, 'rb').read().decode('utf-8')
    for line in content.splitlines():
        self._stop_words.add(line)
    jieba.analyse.set_stop_words(stop_words_path)  # also apply the stop words to jieba.analyse keyword extraction
def text_rank():
    db = query_DB()
    stop_words = load_stopwords()
    for sample in db.get_one():
        author = sample[3]
        title = sample[1]
        content = sample[2]
        reply_number = sample[-1]
        if author == 'mikki' or author == u'??':
            continue
        if reply_number >= 3:
            title_seg = jieba.analyse.textrank(title, topK=5, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v'))
            for word, weight in title_seg:
                weight *= 0.7 * (float(reply_number) / max_reply)
                db.write_textrank(word, weight)
            #content_seg = jieba.analyse.textrank(content, topK=8, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v'))
            #for word, weight in content_seg:
            #    weight *= 0.3 * (float(reply_number) / max_reply)
            #    db.write_textrank(word, weight)
def extract_tags(key_word, a_name):
    '''
    Build a short tag string for a product name: keep the first eight tokens
    from jieba.cut that also appear in jieba.analyse.extract_tags, put the
    key word first, and limit the result to five tags.
    '''
    cut_tags = [tag for tag in jieba.cut(a_name)][:8]
    analyse_tags = jieba.analyse.extract_tags(a_name)
    tags = [tag for tag in cut_tags if tag in analyse_tags]
    # make sure the key word appears as the first tag
    try:
        tags.remove(key_word)
    except ValueError:
        pass
    tags.insert(0, key_word)
    if len(tags) > 5:
        tags = tags[:5]
    return ' '.join(tags)
def loadDataFromCutFile(self, totalnum):
    doc = []
    cut = Cut()
    for i in range(1, totalnum):
        line = cut.getRow(i, Global.cutnews_dir, Global.filesize)
        if not line:
            break
        data = json.loads(line)
        keyword = analyse.extract_tags(data['content'], topK=20)
        seg = " ".join(keyword)
        print(seg)
        doc.append(seg)
    return doc

#calculate tf-idf
def __call__(self, question):
    # print(question.questionSentence)
    qSentence = question.questionSentence
    # question.wordsToken = list(jieba.cut(qSentence))
    question.wordsToken, question.posToken = getPosToken(qSentence)
    assert len(question.wordsToken) == len(question.posToken)
    # print 'Length words Token = %d' % (len(question.wordsToken))
    # print 'Length pos token = %d' % (len(question.posToken))
    question.keyWordToken = list(jieba.analyse.extract_tags(qSentence, topK=5))
    # print ' '.join(question.keyWordToken)
    # dependency = parser.parse(words).next()
    # print '/'.join(question.wordsToken)
    # for word, flag in question.posToken:
    #     print('%s %s' % (word, flag))
    question.questionType, question.answerType = getQuestionType(question.questionSentence)
    question.getAnswerTemp()
    # my_print(question.answerTemp)
    # print question.answerRe
def cut_with_weight(self, sentence):
    """
    Cut word string with weight
    @sentence: word string
    return list or None ["word1`weight1", "word2`weight2" ...]
    """
    try:
        top_k = 2147483647
        seg_list = jieba.analyse.extract_tags(sentence, topK=top_k, withWeight=True)
        return [item[0].encode('utf-8') + '`' + str(item[1]) for item in seg_list]
    except Exception as e:
        logger.error('cut sentence:[%s] exception:[%s]' % (sentence, str(e)))
        return None
def jieba_example():
    raw = "????S5????,123,?,?"
    raw_seq = jieba.cut(raw)
    raw_seq_list = jieba.lcut(raw)
    raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
    raw_with_ictclas = pseg.cut(raw)
    for word, flag in raw_with_ictclas:
        print(word, flag)
def get_hot_words(text):
    jieba.analyse.set_stop_words(STOPWORDS_PATH)
    jieba.load_userdict(USER_CORPUS)
    df = pd.DataFrame(jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=()))
    print(df)
    df.to_excel('./hotwords/DM.xlsx', 'DM')
def detail(info_hash):
    conn, curr = sphinx_conn()
    querysql = 'SELECT * FROM film WHERE info_hash=%s'
    curr.execute(querysql, info_hash)
    result = curr.fetchone()
    sphinx_close(curr, conn)
    #hash=Search_Hash.query.filter_by(id=id).first()
    if not result:
        return redirect(url_for('index'))
    fenci_list = jieba.analyse.extract_tags(result['name'], 8)
    tags = Search_Tags.query.order_by(Search_Tags.id.desc()).limit(20)
    form = SearchForm()
    return render_template('detail.html', form=form, tags=tags, hash=result, fenci_list=fenci_list, sitename=sitename)
def jieba_textrank(data, topK=20, withWeight=False, allowPOS=('nz', 'nt', 'ns', 'nr', 'n', 'vn')):
    '''
    Extract keywords with TextRank.
    topK: number of keywords to return (default 20).
    withWeight: whether to return the weight together with each keyword.
    allowPOS: part-of-speech tags that are allowed as keywords.
    '''
    keyword_list = []
    # withWeight=True is required here because the loop unpacks (word, weight) pairs
    for w in jieba.analyse.textrank(data, topK=topK, withWeight=True, allowPOS=allowPOS):
        keyword_list.append(w[0])
    keyword = '/'.join(keyword_list)
    return keyword
def jieba_tfidf(data, topK=20, withWeight=False, allowPOS=('nz', 'nt', 'ns', 'nr', 'n', 'vn')):
    '''
    Extract keywords with TF-IDF.
    topK: number of keywords to return (default 20).
    withWeight: whether to return the weight together with each keyword.
    allowPOS: part-of-speech tags that are allowed as keywords.
    '''
    temp_result = jieba.analyse.extract_tags(data, topK, withWeight, allowPOS)
    temp_result = '/'.join(temp_result)
    return temp_result
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.

    If a word in this sentence was not found in the synonym dictionary,
    it will be marked with the default value of the word segmentation tool.

    Args:
        pattern: 'w'-words, 'k'-top keyword, 't'-top-10 keywords,
            'wf'-words with POS flags, 'tf'-top-10 keywords with POS flags.
    """
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        result = list(jieba.cut(sentence))
        synonym_vector = [item for item in result if item not in punctuation_all]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # synonym_vector = [(item.word, item.flag) for item in result \
        #                   if item.word not in punctuation_all]
        # Modify in 2017.4.27
        for item in result:
            if item.word not in punctuation_all:
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector
def page_tags(request, pk):
    import jieba.analyse
    page = Page.objects.get(pk=pk)
    tags = jieba.analyse.extract_tags(page.content)
    return render(request, 'tags.html', {'title': 'Tags', 'page': page, 'tags': tags})
def extarctTextRankKeywords(self, doc_str, window=5):
    '''
    Extract keywords with TextRank.
    Reference: http://www.letiantian.me/2014-12-01-text-rank/
    '''
    keywords = jieba.analyse.textrank(doc_str, withWeight=True)
    return keywords
def initTfidfKeywords(self, idf_file=None):
    '''
    Initialise TF-IDF keyword extraction; optionally load a custom IDF file.
    '''
    self.words_idf = {}
    if idf_file is not None:
        jieba.analyse.set_idf_path(idf_file)
    '''
    for line in codecs.open(idf_file, 'r', 'utf-8'):
        word, idf_value = line.strip().split()
        self.words_idf[word] = float(idf_value)
    '''
def extractTfidfKeywords(self, doc_str):
    keywords = jieba.analyse.extract_tags(doc_str, withWeight=True)
    return keywords
def get_top_words(top, filename):
    topK = top
    content = open(filename, 'rb').read()
    tags = jieba.analyse.extract_tags(content, topK=topK)
    # items = str(tags).replace('u\'', '\'').decode("unicode-escape")
    return tags
def cut_for_keyword(self, text, with_weight=False, top_keyword_count=None):
    '''
    @summary: extract keywords from text
    ---------
    @param text: the text to analyse
    @param with_weight: whether to return weights; if True each result is a (keyword, word_weight) pair
    @param top_keyword_count: return the top N keywords; None means no limit
    ---------
    @result:
    '''
    result = jieba.analyse.extract_tags(text, topK=top_keyword_count, withWeight=with_weight)
    return result
def extractKeyWordByTFIDF(self, sentence):
    wordList = []
    if self.conf["threshold"]:
        threshold = self.conf["threshold"]
        tmpList = jieba.analyse.extract_tags(sentence, topK=self.conf["topK"], withWeight=True, allowPOS=self.conf["allowPOS"])
        for pair in tmpList:
            if pair[1] >= threshold:
                wordList.append(pair[0])
    else:
        wordList = list(jieba.analyse.extract_tags(sentence, topK=self.conf["topK"], withWeight=self.conf["withWeight"], allowPOS=self.conf["allowPOS"]))
    return wordList
def extractKeyWordByTextRank(self, sentence):
    wordList = []
    if self.conf["threshold"]:
        threshold = self.conf["threshold"]
        tmpList = jieba.analyse.textrank(sentence, topK=self.conf["topK"], withWeight=True, allowPOS=self.conf["allowPOS"])
        for pair in tmpList:
            if pair[1] >= threshold:
                wordList.append(pair[0])
    else:
        wordList = list(jieba.analyse.textrank(sentence, topK=self.conf["topK"], withWeight=self.conf["withWeight"], allowPOS=self.conf["allowPOS"]))
    return wordList
def __get_model_answer(self, question):
    tag1 = jieba.analyse.extract_tags(question, 3)
    tag2 = jieba.analyse.textrank(question, 3)
    keywords = []
    for tag in tag1:
        keywords.append(tag)
    for tag in tag2:
        if tag not in tag1:
            keywords.append(tag)
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=question, lower=True, window=2)
    for item in tr4w.get_keywords(20, word_min_len=1):
        if item.word not in keywords:
            keywords.append(item.word)
    kstr = ""
    for k in keywords:
        if len(k) != 1:
            kstr = kstr + "AND" + k
        else:
            if k not in kstr:
                kstr = kstr + "AND" + k
        # print(k)
    estr = kstr[3:]
    print(estr)
    q = self.__parser.parse(estr)
    results = self.__searcher.search(q)
    return results
def keywords_extract(question):
    jieba.analyse.set_stop_words(stopwords)
    rv = jieba.analyse.extract_tags(question, topK=10, withWeight=True)
    return rv
def participle(content):
    tags = jieba.analyse.extract_tags(content, topK=topK)
    print(tags)
    return '/'.join(tags)
def analyse_tfidf():
    text = request.values.get('text', "text")
    topK = request.values.get("topK", default="20")
    if topK in [str(x) for x in range(3, 41)]:
        topK = int(topK)
    else:
        topK = 20
    withWeight = request.values.get("withWeight", default="0")
    if withWeight in ['0', '1']:
        withWeight = bool(int(withWeight))
    else:
        withWeight = True
    result = list(jieba.analyse.extract_tags(text, topK=topK, withWeight=withWeight))
    return jsonify(text=text, topK=topK, withWeight=withWeight, result=result)
def analyse_textrank():
    text = request.values.get('text', "text")
    topK = request.values.get("topK", default="20")
    if topK in [str(x) for x in range(3, 41)]:
        topK = int(topK)
    else:
        topK = 20
    withWeight = request.values.get("withWeight", default="0")
    if withWeight in ['0', '1']:
        withWeight = bool(int(withWeight))
    else:
        withWeight = True
    result = list(jieba.analyse.textrank(text, topK=topK, withWeight=withWeight))
    return jsonify(text=text, topK=topK, withWeight=withWeight, result=result)
def get_keywords(self, all_text):
    kw_list = jieba.analyse.extract_tags(all_text, topK=10, withWeight=False, allowPOS=())
    # return set(kw_list)
    for kw in kw_list:
        print(kw)
def test_if_has_keyword(self, weibo_text):
    content = weibo_text
    tags = jieba.analyse.extract_tags(content, topK=self.topK)
    for tag in tags:
        if tag in self.mingan_list:
            print("6666666")
            print(content)
            print(tag)
            return True
        else:
            print("no")
    return False
def get_keywords(self, content):
    result = pseg.cut(content)
    tags = jieba.analyse.textrank(content, topK=50, withWeight=False, allowPOS=('n',))
    tags = [tag for tag in tags if len(tag) > 2]
    return tags
def insert_into_reverse_dict(self, hash_val, text):
    """
    Insert a text into the reverse index.
    @hash_val: hash value of the text
    @text: the text itself
    Words whose TF-IDF weight falls below self.rate times the average weight
    are dropped; the remaining words are combined into keys that map back to
    the hash value.
    """
    word_num = 0
    weight_avg = 0
    weight_total = 0
    word_list = []
    weight_list = []
    # extract keywords together with their TF-IDF weights
    word_with_weight = jieba.analyse.extract_tags(text, withWeight=True)
    for word, weight in word_with_weight:
        word_num += 1
        weight_total += float(weight)
    if word_num > 0:
        weight_avg = weight_total / word_num
    for word, weight in word_with_weight:
        if weight < (self.rate * weight_avg):
            break
        word_list.append(word)
        weight_list.append(weight)
    # build keys and write them into the reverse index
    list_len = len(word_list)
    key_list = self.gen_key_list(word_list, weight_list, list_len, self.word_max_len)
    for key in key_list:
        self.reverse_dict.add(key, 100, hash_val)  # insert mapping (key -> hash)
def key_word_extract(s):
    # for x, w in jieba.analyse.textrank(s, withWeight=True):
    #     print('%s %s' % (x, w))
    # for x, w in jieba.analyse.extract_tags(s, withWeight=True):
    #     print('%s %s' % (x, w))
    return jieba.analyse.textrank(s, withWeight=False)[:10]
def get_focus(num_of_post, pid_p_r):
    s = pid_p_r[num_of_post][0]
    for i in pid_p_r[num_of_post][1]:
        s += i
    tfidf_list = jieba.analyse.extract_tags(s, allowPOS=('ns', 'n', 'vn', 'v'), withWeight=True)
    text_rank_list = jieba.analyse.textrank(s, allowPOS=('ns', 'n', 'vn', 'v'), withWeight=True)
    focus_dic = {}
    for (i, j) in tfidf_list:
        focus_dic[i] = j
    for (i, j) in text_rank_list:
        if i in focus_dic:
            focus_dic[i] += j
        else:
            focus_dic[i] = j
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print num_of_post, " post : ", pid_p_r[num_of_post][0]
    print "response : "
    # print focus in response
    for i in pid_p_r[num_of_post][1]:
        word = (' '.join(jieba.cut(i))).split(' ')
        focus_c = []
        for j in word:
            if j in focus_dic:
                focus_c.append((j, focus_dic[j]))
        focus_c = sorted(focus_c, key=lambda x: x[-1], reverse=True)
        if focus_c != []:
            print i.decode('utf-8'), "--> focus is ", focus_c[0][0]
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"

#main
def segment_text(text):
    # load user dict
    jieba.load_userdict(user_dict)
    # set stop words
    jieba.analyse.set_stop_words(stop_words)
    tags = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=())
    for tag in tags:
        print(str(tag[0]) + "\t" + str(tag[1]))
def construct_dt_matrix(self, files, topK=200):
    jieba.analyse.set_stop_words(self.stop_words_path)
    jieba.analyse.set_idf_path(self.idf_path)
    M = len(files)
    N = 1
    terms = {}
    dt = []
    for i in files:
        root = ET.parse(self.doc_dir_path + i).getroot()
        title = root.find('title').text
        body = root.find('body').text
        docid = int(root.find('id').text)
        tags = jieba.analyse.extract_tags(title + '?' + body, topK=topK, withWeight=True)
        #tags = jieba.analyse.extract_tags(title, topK=topK, withWeight=True)
        cleaned_dict = {}
        for word, tfidf in tags:
            word = word.strip().lower()
            if word == '' or self.is_number(word):
                continue
            cleaned_dict[word] = tfidf
            if word not in terms:
                terms[word] = N
                N += 1
        dt.append([docid, cleaned_dict])
    dt_matrix = [[0 for i in range(N)] for j in range(M)]
    i = 0
    for docid, t_tfidf in dt:
        dt_matrix[i][0] = docid
        for term, tfidf in t_tfidf.items():
            dt_matrix[i][terms[term]] = tfidf
        i += 1
    dt_matrix = pd.DataFrame(dt_matrix)
    dt_matrix.index = dt_matrix[0]
    print('dt_matrix shape:(%d %d)' % (dt_matrix.shape))
    return dt_matrix
def comparekw():
    begin_id = 400155662
    for i in range(100):
        id = begin_id + i
        try:
            f = open('./text/%d.html/ask.txt' % id, 'r')
            qstr = f.read().decode('utf-8')
            qkw = jieba.analyse.extract_tags(qstr, 5)  # extract question keywords
            list1 = []
            for w in qkw:
                list1.append(str(w.encode('utf-8')))
            print(u'extracted question keywords of %d.html' % id)
            f.close()
        except:
            print(u'failed to read the question of %d.html' % id)
            continue
        try:
            f = open('./text/%d.html/bestanswer.txt' % id, 'r')
            astr = f.read().decode('utf-8')
            akw = jieba.analyse.extract_tags(astr, 5)  # extract best-answer keywords
            list2 = []
            for w in akw:
                list2.append(str(w.encode('utf-8')))
            print(u'extracted best-answer keywords of %d.html' % id)
            f.close()
        except:
            print(u'failed to read the best answer' + '\n')
            continue
        tmp = [val for val in list1 if val in list2]  # keywords appearing in both lists
        if len(tmp) == 0:
            print(u'no common keywords' + '\n')
        else:
            print(u'question and best answer share keywords, add 30 points' + '\n')
            try:
                f = open('./text/%d.html/keyscore.txt' % id, 'r')
                score = int(f.read())
                score = score + 30
                result = str(score)
                f = open('./text/%d.html/keyscore.txt' % id, 'w')
                f.write(result)
                f.close()
            except:
                with open('./text/%d.html/keyscore.txt' % id, 'w') as file_saved:
                    text = str(30)
                    file_saved.write(text)