The following 29 code examples, extracted from open-source Python projects, show how to use the jieba.posseg module.
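Before the extracted examples, a minimal sketch of the module's basic usage (the sample sentence is arbitrary): jieba.posseg.cut() yields pair objects that expose .word and .flag attributes.

import jieba.posseg as pseg

# Each yielded item is a pair carrying the token and its POS tag
for pair in pseg.cut("我爱北京天安门"):
    print(pair.word, pair.flag)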
def parse(): """parse the comments""" import jieba import jieba.posseg as pseg # Load User's Dictionary path_list = os.getcwd().split('/') path_list.append("dict.txt") dict_path = '/'.join(path_list) jieba.load_userdict(dict_path) # Disimss These Flags dismiss = ['b', 'c', 'r', 'uj', 'u', 'p', 'q', 'uz', 't', 'ul', 'k', 'f', 'ud', 'ug', 'uv'] comments = Comment.query.all() for comment in comments: word_list = [] pseg_cut = pseg.cut(comment.body) for word, flag in pseg_cut: if flag not in dismiss: word_list.append(word) comment.parsed = '/'.join(word_list) db.session.add(comment) print "Comment %04d Parsed!" % comment.id db.session.commit() print "ALL DONE!"
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.

    If a word in this sentence was not found in the synonym dictionary,
    it will be marked with the default flag of the word segmentation tool.

    Args:
        pattern: 'w' - words only, 'k' - top-1 keyword, 't' - top-10 keywords,
            'wf' - (word, flag) pairs, 'tf' - top-10 keywords with flags.
    """
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        result = list(jieba.cut(sentence))
        synonym_vector = [item for item in result if item not in punctuation_all]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # synonym_vector = [(item.word, item.flag) for item in result
        #                   if item.word not in punctuation_all]
        # Modified on 2017.4.27
        for item in result:
            if item.word not in punctuation_all:
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector
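For orientation, a hypothetical call to the function above; tone_words, punctuation_all, and the jieba/analyse/posseg imports live at module level in the original project and are not shown here.

# Hypothetical usage: returns a list of (word, flag) tuples for pattern="wf"
pairs = synonym_cut("今天天气怎么样", pattern="wf")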
def cutfunc(sentence, _, HMM=True):
    for w, f in jieba.posseg.cut(sentence, HMM):
        yield w + posdelim + f
def __init__(self, idf_path=None):
    self.tokenizer = jieba.dt
    self.postokenizer = jieba.posseg.dt
    self.stop_words = self.STOP_WORDS.copy()
    self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
    self.idf_freq, self.median_idf = self.idf_loader.get_idf()
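The idf_path hook wired up here is what jieba.analyse exposes publicly through set_idf_path(); a sketch of pointing the default extractor at a custom IDF corpus (the file name is a placeholder; the expected format is one "word idf-value" pair per line):

import jieba.analyse

jieba.analyse.set_idf_path("my_idf.txt")  # placeholder path to a custom IDF file
tags = jieba.analyse.extract_tags("这是一个测试句子", topK=5)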
def testPosseg(self):
    import jieba.posseg as pseg
    for content in test_contents:
        result = pseg.cut(content)
        assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Posseg error on content: %s" % content
        print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
    print("testPosseg", file=sys.stderr)
def testPosseg_NOHMM(self):
    import jieba.posseg as pseg
    for content in test_contents:
        result = pseg.cut(content, HMM=False)
        assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Posseg error on content: %s" % content
        print(" , ".join([w.word + " / " + w.flag for w in result]), file=sys.stderr)
    print("testPosseg_NOHMM", file=sys.stderr)
def testPosseg(self):
    # Python 2 variant of testPosseg above (print statement syntax)
    import jieba.posseg as pseg
    for content in test_contents:
        result = pseg.cut(content)
        assert isinstance(result, types.GeneratorType), "Test Posseg Generator error"
        result = list(result)
        assert isinstance(result, list), "Test Posseg error on content: %s" % content
        print >> sys.stderr, " , ".join([w.word + " / " + w.flag for w in result])
    print >> sys.stderr, "testPosseg"
def isJux(answer):
    """Heuristic check (presumably for juxtaposition, per the name): collect
    the word and flag of every '*'-flagged token; return 0 if the collected
    items outnumber half of the clauses, otherwise 1.

    :param answer:
    :return:
    """
    tag = []
    seg = jieba.posseg.cut(answer)
    for i in seg:
        if i.flag == '*':
            tag.extend((i.word, i.flag))
    # The clause delimiter below was mis-encoded in the source text
    if len(tag) / len(answer.split(u'?')) > 0.5:
        return 0
    else:
        return 1
def getLocations(self):
    seg_list = jieba.posseg.cut(self.content)
    ns = []
    lastNs = False
    for i in seg_list:
        if i.flag == 'nnl':
            print(i.word)
            if lastNs:
                ns[-1] += i.word
            else:
                ns.append(i.word)
            lastNs = True
        else:
            lastNs = False
    return ns
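'nnl' is not one of jieba's built-in POS tags, so it presumably comes from this project's user dictionary. A sketch of how a custom tag like this can be attached to a word at runtime (the example word is arbitrary):

import jieba

# Words registered with a custom tag carry that tag in posseg output
jieba.add_word("中关村", tag="nnl")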
def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
    """
    Extract keywords from sentence using TF-IDF algorithm.

    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
          if False, return a list of words.
        - allowPOS: the allowed POS list, e.g. ['ns', 'n', 'vn', 'v', 'nr'].
          Words whose POS is not in this list are filtered out.
        - withFlag: only works when allowPOS is not empty.
          If True, return a list of pair(word, weight) like posseg.cut;
          if False, return a list of words.

    Note: this modified variant ignores topK and withWeight and returns one
    (word, weight) entry per kept token occurrence, in order of appearance.
    """
    if allowPOS:
        allowPOS = frozenset(allowPOS)
        words = self.postokenizer.cut(sentence)
    else:
        words = self.tokenizer.cut(sentence)
    freq = {}
    word_list = []
    for w in words:
        if allowPOS:
            if w.flag not in allowPOS:
                continue
            elif not withFlag:
                w = w.word
        wc = w.word if allowPOS and withFlag else w
        if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
        word_list.append(w)
    total = sum(freq.values())
    for k in freq:
        kw = k.word if allowPOS and withFlag else k
        freq[k] *= self.idf_freq.get(kw, self.median_idf) / total
    res_list = []
    for word in word_list:
        weight = freq[word]
        res_list.append((word, weight))
    return res_list
def preprocess_WithSpeech(stopword_file, news):
    content = jieba.posseg.cut(news)  # segment and POS-tag the text
    content = filter(lambda x: hasMeaningfulWords(x), content)  # keep tokens with meaningful POS
    content = [i.word for i in content]
    content = filter(lambda x: len(x) > 1, content)  # drop single-character words
    stopw = [line.strip().decode('utf-8') for line in open(stopword_file).readlines()]
    parsed = set(content) - set(stopw)
    return ' '.join(parsed)
def preprocess_per_news(news):
    content = jieba.posseg.cut(news)  # segment and POS-tag the text
    content = filter(lambda x: hasMeaningfulWords(x), content)  # keep tokens with meaningful POS
    content = [i.word for i in content]
    content = filter(lambda x: len(x) > 1, content)  # drop single-character words
    stopword_file = "/Users/luoyi/Scala/OnlineRS/com/Recsys_engine/data/stop_word.txt"
    stopw = [line.strip().decode('utf-8') for line in open(stopword_file).readlines()]
    parsed = set(content) - set(stopw)
    return ' '.join(parsed)
def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
    """
    Extract keywords from sentence using TF-IDF algorithm.

    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
          if False, return a list of words.
        - allowPOS: the allowed POS list, e.g. ['ns', 'n', 'vn', 'v', 'nr'].
          Words whose POS is not in this list are filtered out.
        - withFlag: only works when allowPOS is not empty.
          If True, return a list of pair(word, weight) like posseg.cut;
          if False, return a list of words.
    """
    if allowPOS:
        allowPOS = frozenset(allowPOS)
        words = self.postokenizer.cut(sentence)
    else:
        words = self.tokenizer.cut(sentence)
    freq = {}
    for w in words:
        if allowPOS:
            if w.flag not in allowPOS:
                continue
            elif not withFlag:
                w = w.word
        wc = w.word if allowPOS and withFlag else w
        if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())
    for k in freq:
        kw = k.word if allowPOS and withFlag else k
        freq[k] *= self.idf_freq.get(kw, self.median_idf) / total

    if withWeight:
        tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
    else:
        tags = sorted(freq, key=freq.__getitem__, reverse=True)
    if topK:
        return tags[:topK]
    else:
        return tags
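This method backs the public helper jieba.analyse.extract_tags(); a minimal usage sketch with a POS filter (the sample text is the one used in jieba's README):

import jieba.analyse

text = "小明硕士毕业于中国科学院计算所，后在日本京都大学深造"
# Keep only place names, nouns, and verb-ish tags; return (word, weight) pairs
for word, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True,
                                               allowPOS=('ns', 'n', 'vn', 'v')):
    print(word, weight)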