Python jieba 模块,enable_parallel() 实例源码

我们从Python开源项目中,提取了以下17个代码示例,用于说明如何使用jieba.enable_parallel()

项目:Text-Classifier    作者:daniellaah    | 项目源码 | 文件源码
def words_extract(news_folder):
    """Segment every news article under *news_folder* with jieba.

    Args:
        news_folder: directory whose immediate subdirectories are class
            folders, each containing plain-text news files.

    Returns:
        list of (word_list, class_name) tuples, one per article.
    """
    subfolder_list = [subfolder for subfolder in os.listdir(news_folder)
                      if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = []  # element: ([word1, word2, ...], class_name)

    jieba.enable_parallel(4)  # 4 worker processes for segmentation
    for subfolder in subfolder_list:
        news_class = subfolder  # the folder name doubles as the class label
        subfolder = os.path.join(news_folder, subfolder)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            # Explicit encoding: the articles are Chinese text, so relying on
            # the locale default encoding breaks on non-UTF-8 systems.
            with open(news, 'r', encoding='utf-8') as f:
                content = f.read()
            word_list = jieba.lcut(content)
            data_list.append((word_list, news_class))
    jieba.disable_parallel()
    return data_list
项目:Text-Classifier    作者:daniellaah    | 项目源码 | 文件源码
def words_extract(news_folder):
    """Run jieba word segmentation over every article in *news_folder*.

    news_folder is expected to contain one subdirectory per news class;
    each subdirectory holds plain-text article files.

    Returns:
        list of (word_list, class_name) tuples.
    """
    data_list = []
    jieba.enable_parallel(4)  # spin up 4 segmentation workers
    for entry in os.listdir(news_folder):
        class_dir = os.path.join(news_folder, entry)
        if not os.path.isdir(class_dir):
            continue
        for name in os.listdir(class_dir):
            article_path = os.path.join(class_dir, name)
            if not os.path.isfile(article_path):
                continue
            with open(article_path, 'r') as f:
                text = f.read()
            data_list.append((jieba.lcut(text), entry))
    jieba.disable_parallel()
    return data_list
项目:Text-Classifier    作者:daniellaah    | 项目源码 | 文件源码
def words_extract(news_folder):
    """Segment all news articles under news_folder with jieba.

    Args:
        news_folder: directory whose immediate subdirectories are class
            folders (one per news category), each holding text files.

    Returns:
        list of (word_list, class_name) tuples, one per article.
    """
    subfolder_list = [subfolder for subfolder in os.listdir(news_folder) \
                        if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = [] # element: ([word1, word2, ...], class_name)

    jieba.enable_parallel(4)
    # Walk every class folder; the folder name is used as the class label.
    for subfolder in subfolder_list:
        news_class = subfolder
        subfolder = os.path.join(news_folder, subfolder)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder) \
                        if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            # NOTE(review): opened with the locale default encoding — presumably
            # the article files are UTF-8; confirm on non-UTF-8 systems.
            with open(news, 'r') as f:
               content = f.read()
            word_list = jieba.lcut(content)
            data_list.append((word_list,news_class)) # element: ([word1, word2, ...], class_name)
    jieba.disable_parallel()
    return data_list
项目:Text-Classifier    作者:daniellaah    | 项目源码 | 文件源码
def words_extract(news_folder):
    """Tokenise every news article below *news_folder* using jieba.

    Each immediate subdirectory of news_folder is one news class; each
    regular file inside it is one article.

    Returns:
        list of (word_list, class_name) tuples.
    """
    class_names = [d for d in os.listdir(news_folder)
                   if os.path.isdir(os.path.join(news_folder, d))]
    data_list = []

    jieba.enable_parallel(4)  # parallel segmentation with 4 processes
    for cls in class_names:
        folder = os.path.join(news_folder, cls)
        paths = [os.path.join(folder, fn) for fn in os.listdir(folder)
                 if os.path.isfile(os.path.join(folder, fn))]
        for p in paths:
            with open(p, 'r') as fh:
                raw = fh.read()
            data_list.append((jieba.lcut(raw), cls))
    jieba.disable_parallel()
    return data_list
项目:Text-Classifier    作者:daniellaah    | 项目源码 | 文件源码
def words_extract(news_folder):
    """Segment every news article under *news_folder* with jieba.

    Args:
        news_folder: directory whose immediate subdirectories are class
            folders, each containing plain-text news files.

    Returns:
        list of (word_list, class_name) tuples, one per article.
    """
    subfolder_list = [subfolder for subfolder in os.listdir(news_folder)
                      if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = []  # element: ([word1, word2, ...], class_name)

    jieba.enable_parallel(4)  # 4 worker processes for segmentation
    for subfolder in subfolder_list:
        news_class = subfolder  # folder name is the class label
        subfolder = os.path.join(news_folder, subfolder)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            # Fix: read Chinese article files as UTF-8 explicitly instead of
            # depending on the platform/locale default encoding.
            with open(news, 'r', encoding='utf-8') as f:
                content = f.read()
            word_list = jieba.lcut(content)
            data_list.append((word_list, news_class))
    jieba.disable_parallel()
    return data_list
项目:Text-Classifier    作者:daniellaah    | 项目源码 | 文件源码
def get_feature_words(news_folder, size=1000, stopwords_file="stopwords.txt"):
    """Collect the most frequent candidate feature words from a news corpus.

    Args:
        news_folder: directory with one subdirectory per news class,
            each containing plain-text article files.
        size: maximum number of feature words to return.
        stopwords_file: path to a stop-word list (read by get_stopwords).

    Returns:
        set of at most `size` words, ranked by corpus-wide frequency.
    """
    news_classes = [subfolder for subfolder in os.listdir(news_folder)
                    if os.path.isdir(os.path.join(news_folder, subfolder))]
    stopwords = get_stopwords(stopwords_file)
    feature_words_dict = {}  # word -> occurrence count over the whole corpus
    jieba.enable_parallel(4)  # parallel segmentation, 4 processes
    for news_class in news_classes:
        subfolder = os.path.join(news_folder, news_class)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            # Explicit UTF-8: Chinese text must not depend on the locale default.
            with open(news, 'r', encoding='utf-8') as f:
                content = f.read()
            for word in jieba.lcut(content):
                # Keep multi-character tokens that are not stop words; re.match
                # anchors at the start, so any token beginning with an ASCII
                # letter or digit is dropped.
                if not re.match("[a-z0-9A-Z]", word) and len(word) > 1 and word not in stopwords:
                    feature_words_dict[word] = feature_words_dict.get(word, 0) + 1
    jieba.disable_parallel()
    if not feature_words_dict:
        # Fix: the original zip(*[])[0] raised IndexError on an empty corpus.
        return set()
    feature_words_tuple = sorted(feature_words_dict.items(), key=lambda x: x[1], reverse=True)
    feature_words = [word for word, _count in feature_words_tuple]
    return set(feature_words[:size]) if len(feature_words) > size else set(feature_words)
项目:Text-Classifier    作者:daniellaah    | 项目源码 | 文件源码
def get_probability(news_folder, feature_words):
    """Estimate Naive-Bayes statistics from a labelled news corpus.

    Args:
        news_folder: directory with one subdirectory per news class.
        feature_words: iterable of vocabulary words (matrix row index).

    Returns:
        (prob_matrix, prob_classes) where prob_matrix.loc[word, cls] is the
        Laplace-smoothed fraction of documents in cls containing word, and
        prob_classes[cls] is the class prior (share of all documents).
    """
    news_classes = [subfolder for subfolder in os.listdir(news_folder)
                    if os.path.isdir(os.path.join(news_folder, subfolder))]
    prob_matrix = pd.DataFrame(index=feature_words, columns=news_classes)
    num_of_all_news = 0
    prob_classes = {cls: 0 for cls in news_classes}
    jieba.enable_parallel(4)  # parallel segmentation, 4 processes
    for news_class in news_classes:
        # Laplace smoothing: every word starts with a pseudo-count of 1.
        prob_count = {word: 1 for word in feature_words}
        subfolder = os.path.join(news_folder, news_class)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            # Explicit UTF-8: Chinese text must not depend on the locale default.
            with open(news, 'r', encoding='utf-8') as f:
                content = f.read()
            # Set membership is O(1) per word vs O(len(word_list)) on a list.
            word_set = set(jieba.lcut(content))
            for word in prob_count:
                if word in word_set:
                    prob_count[word] += 1
        news_nums = len(news_list)
        num_of_all_news += news_nums
        prob_classes[news_class] = news_nums
        for word in prob_count:
            # Denominator + 2 matches the smoothing (document present/absent).
            prob_matrix.loc[word, news_class] = prob_count[word] / (news_nums + 2)
    jieba.disable_parallel()
    for cls in prob_classes:
        prob_classes[cls] = prob_classes[cls] / num_of_all_news
    return prob_matrix, prob_classes
项目:Text-Classifier    作者:daniellaah    | 项目源码 | 文件源码
def words_extract(news_folder):
    """Segment all articles below *news_folder*; return (tokens, label) pairs.

    Each immediate subdirectory of news_folder is treated as one class
    label, and every regular file inside it as one article.

    Returns:
        list of (word_list, label) tuples.
    """
    labels = [d for d in os.listdir(news_folder)
              if os.path.isdir(os.path.join(news_folder, d))]
    pairs = []

    jieba.enable_parallel(4)  # use 4 processes for segmentation
    for label in labels:
        base = os.path.join(news_folder, label)
        for name in os.listdir(base):
            article = os.path.join(base, name)
            if not os.path.isfile(article):
                continue
            with open(article, 'r') as src:
                tokens = jieba.lcut(src.read())
            pairs.append((tokens, label))
    jieba.disable_parallel()
    return pairs
项目:Text-Classifier    作者:daniellaah    | 项目源码 | 文件源码
def words_extract(news_folder):
    """Segment all news articles under news_folder with jieba.

    Args:
        news_folder: directory whose immediate subdirectories are class
            folders (one per news category), each holding text files.

    Returns:
        list of (word_list, class_name) tuples, one per article.
    """
    subfolder_list = [subfolder for subfolder in os.listdir(news_folder) \
                        if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = [] # element: ([word1, word2, ...], class_name)

    jieba.enable_parallel(4)
    # Walk every class folder; the folder name is used as the class label.
    for subfolder in subfolder_list:
        news_class = subfolder
        subfolder = os.path.join(news_folder, subfolder)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder) \
                        if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            # NOTE(review): opened with the locale default encoding — presumably
            # the article files are UTF-8; confirm on non-UTF-8 systems.
            with open(news, 'r') as f:
               content = f.read()
            word_list = jieba.lcut(content)
            data_list.append((word_list,news_class)) # element: ([word1, word2, ...], class_name)
    jieba.disable_parallel()
    return data_list
项目:Text-Classifier    作者:daniellaah    | 项目源码 | 文件源码
def get_feature_words(news_folder, size=1000, stopwords_file="stopwords.txt"):
    """Build the feature vocabulary: the most frequent corpus words.

    Args:
        news_folder: directory with one subdirectory per news class.
        size: cap on how many feature words to return.
        stopwords_file: stop-word list consumed by get_stopwords.

    Returns:
        set of at most `size` words ranked by frequency across all classes.
    """
    class_dirs = [d for d in os.listdir(news_folder)
                  if os.path.isdir(os.path.join(news_folder, d))]
    stopwords = get_stopwords(stopwords_file)
    feature_words_dict = {}  # word -> corpus-wide count
    jieba.enable_parallel(4)  # 4 segmentation workers
    for cls in class_dirs:
        base = os.path.join(news_folder, cls)
        articles = [os.path.join(base, fn) for fn in os.listdir(base)
                    if os.path.isfile(os.path.join(base, fn))]
        for article in articles:
            with open(article, 'r') as fh:
                text = fh.read()
            for token in jieba.lcut(text):
                # keep multi-char tokens not starting with ASCII alnum,
                # excluding stop words
                if re.match("[a-z0-9A-Z]", token) or len(token) <= 1 or token in stopwords:
                    continue
                feature_words_dict[token] = feature_words_dict.get(token, 0) + 1
    jieba.disable_parallel()
    feature_words_tuple = sorted(feature_words_dict.items(), key=lambda x:x[1], reverse=True)
    feature_words = list(list(zip(*feature_words_tuple))[0])
    return set(feature_words[:size]) if len(feature_words) > size else set(feature_words)
项目:warWolf    作者:wu-yy    | 项目源码 | 文件源码
def save_jieba_result(file_name):
    """Segment the text in *file_name* with jieba and save the result.

    Reads file_name (relative to this script's directory) as UTF-8, joins
    the segmented tokens with single spaces, and writes them to
    pjl_jieba.txt in the working directory.
    """
    # jieba.enable_parallel(4)  # parallel mode left disabled by the author
    source_path = path.join(path.dirname(__file__), file_name)
    print(source_path)
    with codecs.open(source_path, encoding='utf-8') as src:
        raw_text = src.read()
    segmented = " ".join(jieba.cut(raw_text))
    with codecs.open('pjl_jieba.txt', 'w', encoding='utf-8') as dst:
        dst.write(segmented)
项目:warWolf    作者:wu-yy    | 项目源码 | 文件源码
def save_jieba_result(file_name):
    """Segment file_name with jieba and write the tokens to pjl_jieba.txt.

    The input path is resolved relative to this script's directory, read as
    UTF-8, and the space-joined segmentation is written to the working dir.
    """
    # Parallel mode was deliberately left disabled by the original author.
    #jieba.enable_parallel(4)
    dirs=path.join(path.dirname(__file__),file_name)
    print(dirs)
    with codecs.open(dirs,encoding='utf-8') as f:
        comment_text=f.read()
    cut_text=" ".join(jieba.cut(comment_text))
    with codecs.open('pjl_jieba.txt','w',encoding='utf-8') as f:
        f.write(cut_text)
项目:Neural-Headline-Generator-CN    作者:QuantumLiu    | 项目源码 | 文件源码
def cut(text, custom_words=('FLOAT', 'TIME', 'DATE', 'EOS')):
    """Segment *text* with jieba after registering custom placeholder tokens.

    Args:
        text: the string to segment.
        custom_words: extra tokens added to jieba's dictionary so they stay
            intact during segmentation. Fix: a tuple default instead of the
            original mutable-list default argument.

    Returns:
        list of segmented words.
    """
    # NOTE(review): enable_parallel is re-invoked (spawning workers) on every
    # call and is not supported on Windows; consider enabling it once at
    # import time instead.
    jieba.enable_parallel(32)
    for word in custom_words:
        jieba.add_word(word)
    return jieba.lcut(text)
项目:zhanlang2_wordcloud    作者:ronghuaxu    | 项目源码 | 文件源码
def get_all_keywords(file_name):
    word_lists = []  # ?????
    jieba.enable_parallel(8)
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        Lists = f.readlines()  # ????
        for List in Lists:
            cut_list = list(jieba.cut(List))
            for word in cut_list:
                word_lists.append(word)
    word_lists_set = set(word_lists)  # ??????
    word_lists_set = list(word_lists_set)
    length = len(word_lists_set)
    print u"??%d????" % length
    information = pd.read_excel('/Users/huazi/Desktop/zhanlang2.xlsx')
    world_number_list = []
    word_copy=[]
    for w in word_lists_set:
        if (len(w) == 1):
            continue
        if (word_lists.count(w) > 3):
            world_number_list.append(word_lists.count(w))
            word_copy.append(w)
    information['key'] = word_copy
    information['count'] = world_number_list
    information.to_excel('sun_2.xlsx')


# (original Chinese comment lost to encoding corruption)
项目:zhanlang2_wordcloud    作者:ronghuaxu    | 项目源码 | 文件源码
def save_jieba_result():
    """Segment ../pjl_comment.txt with jieba and append tokens to pjl_jieba.txt.

    The comment corpus is read as UTF-8 relative to this script, segmented
    in parallel, joined with spaces, and appended ('a' mode) to the output.
    """
    # Enable parallel segmentation with 4 worker processes.
    jieba.enable_parallel(4)
    dirs = path.join(path.dirname(__file__), '../pjl_comment.txt')
    with codecs.open(dirs, encoding='utf-8') as f:
        comment_text = f.read()
    # Join jieba's tokens with single spaces (word-cloud friendly format).
    cut_text = " ".join(jieba.cut(comment_text))
    with codecs.open('pjl_jieba.txt', 'a', encoding='utf-8') as f:
        f.write(cut_text)
项目:CloudMusic-Crawler    作者:GreatV    | 项目源码 | 文件源码
def words_split(corpus_path):
    """Segment the corpus at *corpus_path* with jieba using a custom dictionary.

    Args:
        corpus_path: path to a plain-text corpus file.

    Returns:
        generator of segmented words (accurate mode, cut_all=False).
    """
    # Fix: read the Chinese corpus as UTF-8 explicitly instead of relying on
    # the platform/locale default encoding.
    with open(corpus_path, 'r', encoding='utf-8') as f:
        content = f.read()

    jieba.load_userdict('data/userdict.txt')  # domain-specific terms
    jieba.enable_parallel(4)  # 4-process parallel segmentation

    # NOTE(review): a lazy generator is returned while parallel mode stays
    # enabled; jieba.disable_parallel() is never called here.
    return jieba.cut(content, cut_all=False)


# (original Chinese comment lost to encoding corruption)
项目:emotion_analyse_py    作者:jeffmxh    | 项目源码 | 文件源码
def __init__(self,n_core = 16):
        """Initialise the analyser: load stop words and the emotion user dict.

        Args:
            n_core: number of CPU cores available; jieba parallel mode is
                enabled with n_core - 1 worker processes.
        """
        # Resources are resolved relative to the current working directory,
        # not this file's location — NOTE(review): confirm callers run from
        # the project root.
        self.rootdir = os.getcwd()
        # load_txt is a project helper; presumably returns a list of lines —
        # TODO confirm its return type.
        self.STOP_WORDS_LIST = self.load_txt(path.join(self.rootdir, 'resources', 'stopwords_utf8.txt'))
        # Strip newline characters and de-duplicate the stop words.
        self.STOP_WORDS_LIST = set([re.sub('\n', '', item) for item in self.STOP_WORDS_LIST])
        jieba.load_userdict(path.join(self.rootdir, 'resources', 'emotion_user_dict.txt'))
        self.n_CORE=n_core
        jieba.enable_parallel(self.n_CORE-1)