Python jieba 模块,posseg() 实例源码


项目:sentiment-analysis    作者:kasheemlew    | 项目源码 | 文件源码
def parse():
    """Segment every stored comment and save its filtered word list.

    Each ``Comment`` returned by ``Comment.query.all()`` is POS-tagged with
    ``jieba.posseg``; words whose tag is in the dismissed set are dropped
    and the remaining words are joined with ``'/'`` into ``comment.parsed``.

    NOTE(review): the excerpt this was restored from had lost lines -- the
    body of the tag filter and the operand of the progress message. They
    are rebuilt here in the most direct way. No session add/commit is
    visible, so persisting ``comment.parsed`` is presumably done elsewhere
    -- confirm.
    """
    import jieba
    import jieba.posseg as pseg

    # Load User's Dictionary
    # NOTE(review): only the path is computed here (it equals os.getcwd());
    # the call that would actually load a user dictionary is not visible.
    path_list = os.getcwd().split('/')
    dict_path = '/'.join(path_list)

    # Dismiss these POS flags; a frozenset gives O(1) membership tests.
    dismiss = frozenset([
        'b', 'c', 'r', 'uj', 'u', 'p', 'q', 'uz', 't', 'ul', 'k', 'f',
        'ud', 'ug', 'uv',
    ])

    comments = Comment.query.all()
    for index, comment in enumerate(comments, 1):
        word_list = [
            word for word, flag in pseg.cut(comment.body)
            if flag not in dismiss
        ]
        comment.parsed = '/'.join(word_list)
        # The original operand was lost; a 1-based running counter is used.
        # NOTE(review): the comment's own id may have been intended.
        print("Comment %04d Parsed!" % index)

    print("ALL DONE!")
项目:chat    作者:Decalogue    | 项目源码 | 文件源码
def synonym_cut(sentence, pattern="wf"):
    """Cut the sentence into a synonym vector tag.

    If a word in this sentence was not found in the synonym dictionary,
    it will be marked with default value of the word segmentation tool.

    Args:
        sentence: Text to segment. Trailing characters contained in
            ``tone_words`` are stripped before segmentation.
        pattern: Output mode.
            'w'  - segmented words, with punctuation removed;
            'k'  - the single top keyword (``extract_tags`` with topK=1);
            't'  - up to ten keywords (``extract_tags`` with topK=10);
            'wf' - ``(word, flag)`` tuples, with punctuation removed;
            'tf' - ``(word, flag)`` tuples for words among the top ten
                   keywords.

    Returns:
        A list of words or ``(word, flag)`` tuples depending on ``pattern``.
        An unrecognised ``pattern`` yields an empty list.
    """
    sentence = sentence.rstrip(tone_words)
    synonym_vector = []
    if pattern == "w":
        result = list(jieba.cut(sentence))
        synonym_vector = [item for item in result if item not in punctuation_all]
    elif pattern == "k":
        synonym_vector = analyse.extract_tags(sentence, topK=1)
    elif pattern == "t":
        synonym_vector = analyse.extract_tags(sentence, topK=10)
    elif pattern == "wf":
        result = posseg.cut(sentence)
        # Modified 2017.4.27: flags shorter than four characters are
        # replaced by the flag obtained from segmenting the word on its own.
        # NOTE(review): presumably the longer flags are synonym-dictionary
        # codes that must be kept as-is -- confirm.
        for item in result:
            if item.word not in punctuation_all:
                if len(item.flag) < 4:
                    item.flag = list(posseg.cut(item.word))[0].flag
                synonym_vector.append((item.word, item.flag))
    elif pattern == "tf":
        result = posseg.cut(sentence)
        tags = analyse.extract_tags(sentence, topK=10)
        for item in result:
            if item.word in tags:
                synonym_vector.append((item.word, item.flag))
    return synonym_vector
项目:PTTChatBot_DL2017    作者:thisray    | 项目源码 | 文件源码
def cutfunc(sentence, _, HMM=True):
    """Lazily yield each POS-tagged token of ``sentence`` as one string.

    Every item is the word, then ``posdelim``, then the POS flag.
    The second positional argument is accepted but ignored.
    """
    tagged_tokens = jieba.posseg.cut(sentence, HMM)
    for token, tag in tagged_tokens:
        joined = token + posdelim + tag
        yield joined
项目:Malicious_Domain_Whois    作者:h-j-13    | 项目源码 | 文件源码
def testPosseg(self):
    """Check that ``pseg.cut`` returns a generator for every test content.

    Each content's tagged tokens are written to stderr as
    ``word / flag`` entries separated by `` , ``; the test name is written
    once all contents have been processed.
    """
    import jieba.posseg as pseg
    for content in test_contents:
        tagged = pseg.cut(content)
        assert isinstance(tagged, types.GeneratorType), "Test Posseg Generator error"
        tagged = list(tagged)
        assert isinstance(tagged, list), "Test Posseg error on content: %s" % content
        rendered = []
        for token in tagged:
            rendered.append(token.word + " / " + token.flag)
        print(" , ".join(rendered), file=sys.stderr)
    print("testPosseg", file=sys.stderr)
项目:Book_DeepLearning_Practice    作者:wac81    | 项目源码 | 文件源码
def isJux(answer):
    """Return 0 when '*'-flagged tokens dominate ``answer``, otherwise 1.

    :param answer: text to segment with ``jieba.posseg``.
    :return: 0 if the ratio computed below exceeds 0.5, else 1.
    """
    tag = []
    for token in jieba.posseg.cut(answer):
        if token.flag == '*':
            # Both the word and its flag are stored, so every match adds
            # two entries to ``tag``.
            tag.extend((token.word, token.flag))
    # NOTE(review): the separator literal looks mis-encoded in this copy
    # (it was probably a CJK punctuation mark) -- confirm against upstream.
    # NOTE(review): without ``from __future__ import division`` this is
    # integer division on Python 2 and true division on Python 3, which
    # changes when the 0.5 threshold is crossed.
    if len(tag) / len(answer.split(u'?')) > 0.5:
        return 0
    else:
        return 1
项目:GeoNews    作者:chunlaw    | 项目源码 | 文件源码
def getLocations(self):
    """Collect location strings from ``self.content``.

    The content is POS-tagged with ``jieba.posseg``; every token flagged
    ``'nnl'`` is printed and recorded. Consecutive ``'nnl'`` tokens are
    concatenated into a single entry, so a multi-token place name comes
    back as one string.

    Returns:
        list of location strings in order of appearance.
    """
    seg_list = jieba.posseg.cut(self.content)
    ns = []
    lastNs = False  # True while the previous token was also a location
    for i in seg_list:
        if i.flag == 'nnl':
            print(i.word)
            if lastNs:
                # Continuation of the previous location: extend that entry.
                ns[-1] += i.word
            else:
                ns.append(i.word)
            lastNs = True
        else:
            lastNs = False
    return ns
项目:http_server    作者:chenguolin    | 项目源码 | 文件源码
def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
    """
    Extract keywords from sentence using TF-IDF algorithm.

    Parameter:
        - topK: return how many top keywords. `None` for all possible words.
        - withWeight: if True, return a list of (word, weight);
                      if False, return a list of words.
        - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v','nr'].
                    if the POS of w is not in this list,it will be filtered.
        - withFlag: only work with allowPOS is not empty.
                    if True, return a list of pair(word, weight) like posseg.cut
                    if False, return a list of words

    Returns a list of ``(word, weight)`` tuples, one per kept token, in the
    order the tokens appear in ``sentence``.

    NOTE(review): as visible here, ``topK`` and ``withWeight`` are accepted
    but never consulted; the result is always weighted and untruncated.
    NOTE(review): the statement that fills ``word_list`` was missing from
    the excerpt this was restored from. Appending every kept token is the
    most direct reconstruction, so a repeated word appears once per
    occurrence -- confirm against upstream.
    """
    if allowPOS:
        allowPOS = frozenset(allowPOS)
        words = self.postokenizer.cut(sentence)
    else:
        words = self.tokenizer.cut(sentence)
    freq = {}
    word_list = []
    for w in words:
        if allowPOS:
            if w.flag not in allowPOS:
                continue
            elif not withFlag:
                # Without flags, key the statistics on the bare word.
                w = w.word
        wc = w.word if allowPOS and withFlag else w
        # Drop tokens shorter than two characters and stop words.
        if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
            continue
        word_list.append(w)
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())
    for k in freq:
        kw = k.word if allowPOS and withFlag else k
        # Term frequency (count / total) scaled by the word's IDF; words
        # without an IDF entry fall back to the median IDF.
        freq[k] *= self.idf_freq.get(kw, self.median_idf) / total

    return [(word, freq[word]) for word in word_list]
项目:Content-Based-News-Recommendation-System-in-Spark    作者:Labyrinth108    | 项目源码 | 文件源码
def preprocess_WithSpeech(stopword_file, news):
    """Reduce ``news`` to a space-joined set of its meaningful words.

    The text is POS-tagged with ``jieba.posseg``; tokens rejected by
    ``hasMeaningfulWords`` are dropped, then words of a single character,
    then any word listed in ``stopword_file``.

    Args:
        stopword_file: path to a UTF-8 text file with one stop word per line.
        news: text to process.

    Returns:
        The surviving distinct words joined by single spaces. They pass
        through a set, so duplicates are removed and word order is not
        preserved.
    """
    import io

    # Keep only the tokens the POS-based filter accepts, as plain words.
    words = [pair.word for pair in jieba.posseg.cut(news) if hasMeaningfulWords(pair)]
    # Discard single-character words.
    words = [word for word in words if len(word) > 1]

    # io.open decodes UTF-8 on both Python 2 and 3, and the with-block
    # closes the handle that was previously left open.
    with io.open(stopword_file, encoding='utf-8') as handle:
        stopw = set(line.strip() for line in handle)

    parsed = set(words) - stopw
    return ' '.join(parsed)
项目:Content-Based-News-Recommendation-System-in-Spark    作者:Labyrinth108    | 项目源码 | 文件源码
def preprocess_per_news(news, stopword_file="/Users/luoyi/Scala/OnlineRS/com/Recsys_engine/data/stop_word.txt"):
    """Reduce one news text to a space-joined set of its meaningful words.

    The text is POS-tagged with ``jieba.posseg``; tokens rejected by
    ``hasMeaningfulWords`` are dropped, then words of a single character,
    then any word listed in the stop-word file.

    Args:
        news: text to process.
        stopword_file: path to a UTF-8 text file with one stop word per
            line. Defaults to the path that used to be hard-coded, so
            existing single-argument callers behave as before.

    Returns:
        The surviving distinct words joined by single spaces. They pass
        through a set, so duplicates are removed and word order is not
        preserved.
    """
    import io

    # Keep only the tokens the POS-based filter accepts, as plain words.
    words = [pair.word for pair in jieba.posseg.cut(news) if hasMeaningfulWords(pair)]
    # Discard single-character words.
    words = [word for word in words if len(word) > 1]

    # io.open decodes UTF-8 on both Python 2 and 3, and the with-block
    # closes the handle that was previously left open.
    with io.open(stopword_file, encoding='utf-8') as handle:
        stopw = set(line.strip() for line in handle)

    parsed = set(words) - stopw
    return ' '.join(parsed)
