The following 17 code examples, extracted from open-source Python projects, illustrate how to use jieba.enable_parallel().
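Before the project examples, here is a minimal sketch of the typical call pattern, using a hypothetical sample sentence: enable parallel mode, segment, then disable it again. jieba's parallel mode splits the input text by lines across worker processes via the multiprocessing module, so it only pays off on large texts and is not supported on Windows.

import jieba

jieba.enable_parallel(4)   # spawn 4 worker processes (POSIX only, not supported on Windows)
words = jieba.lcut(u"并行分词对大段文本才有明显的加速效果")  # hypothetical sample text
jieba.disable_parallel()   # return to single-process segmentation
print(words)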
def words_extract(news_folder):
    """Segment every news file under news_folder with jieba.

    Args:
        news_folder/
            class_1/
            class_2/
            class_3/
    """
    subfolder_list = [subfolder for subfolder in os.listdir(news_folder)
                      if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = []  # element: ([word1, word2, ...], "class")
    jieba.enable_parallel(4)  # enable parallel segmentation with 4 processes
    for subfolder in subfolder_list:
        news_class = subfolder
        subfolder = os.path.join(news_folder, subfolder)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
            word_list = jieba.lcut(content)
            data_list.append((word_list, news_class))  # element: ([word1, word2, ...], "class")
    jieba.disable_parallel()
    return data_list
def get_feature_words(news_folder, size=1000, stopwords_file="stopwords.txt"):
    """Collect the most frequent candidate feature words from news_folder.

    Args:
        news_folder/
            class_1/
            class_2/
            class_3/
    """
    news_classes = [subfolder for subfolder in os.listdir(news_folder)
                    if os.path.isdir(os.path.join(news_folder, subfolder))]
    stopwords = get_stopwords(stopwords_file)
    feature_words_dict = {}
    jieba.enable_parallel(4)  # enable parallel segmentation
    for news_class in news_classes:
        subfolder = os.path.join(news_folder, news_class)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
            word_list = jieba.lcut(content)
            # keep words that are not alphanumeric, longer than one character, and not stopwords
            for word in word_list:
                if not re.match("[a-z0-9A-Z]", word) and len(word) > 1 and word not in stopwords:
                    if word in feature_words_dict:
                        feature_words_dict[word] += 1
                    else:
                        feature_words_dict[word] = 1
    jieba.disable_parallel()
    feature_words_tuple = sorted(feature_words_dict.items(), key=lambda x: x[1], reverse=True)
    feature_words = list(list(zip(*feature_words_tuple))[0])
    return set(feature_words[:size]) if len(feature_words) > size else set(feature_words)
def get_probability(news_folder, feature_words):
    """Estimate the conditional probabilities prob_matrix and class priors prob_classes.

    Args:
        news_folder/
            class_1/
            class_2/
            class_3/
    """
    news_classes = [subfolder for subfolder in os.listdir(news_folder)
                    if os.path.isdir(os.path.join(news_folder, subfolder))]
    prob_matrix = pd.DataFrame(index=feature_words, columns=news_classes)
    num_of_all_news = 0
    prob_classes = {}
    for cls in news_classes:
        prob_classes[cls] = 0
    jieba.enable_parallel(4)  # enable parallel segmentation
    for news_class in news_classes:
        prob_count = {}
        for word in feature_words:
            prob_count[word] = 1  # Laplace smoothing
        subfolder = os.path.join(news_folder, news_class)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
            word_list = jieba.lcut(content)
            for word in prob_count.keys():
                if word in word_list:
                    prob_count[word] += 1
        news_nums = len(news_list)
        num_of_all_news += news_nums
        prob_classes[news_class] = news_nums
        for word in prob_count.keys():
            prob_matrix.loc[word, news_class] = prob_count[word] / (news_nums + 2)  # Laplace smoothing
    jieba.disable_parallel()
    for cls in prob_classes.keys():
        prob_classes[cls] = prob_classes[cls] / num_of_all_news
    return prob_matrix, prob_classes
def save_jieba_result(file_name):
    # segment the comment text with jieba and save the result
    # jieba.enable_parallel(4)
    dirs = path.join(path.dirname(__file__), file_name)
    print(dirs)
    with codecs.open(dirs, encoding='utf-8') as f:
        comment_text = f.read()
    cut_text = " ".join(jieba.cut(comment_text))
    with codecs.open('pjl_jieba.txt', 'w', encoding='utf-8') as f:
        f.write(cut_text)
def cut(text, custom_words=['FLOAT', 'TIME', 'DATE', 'EOS']):
    jieba.enable_parallel(32)
    # register placeholder tokens so jieba keeps them as whole words
    for word in custom_words:
        jieba.add_word(word)
    words = jieba.lcut(text)
    return words
def get_all_keywords(file_name):
    word_lists = []  # all segmented words
    jieba.enable_parallel(8)
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        Lists = f.readlines()
    # segment line by line
    for List in Lists:
        cut_list = list(jieba.cut(List))
        for word in cut_list:
            word_lists.append(word)
    word_lists_set = set(word_lists)  # deduplicate words
    word_lists_set = list(word_lists_set)
    length = len(word_lists_set)
    print(u"%d distinct words in total" % length)
    information = pd.read_excel('/Users/huazi/Desktop/zhanlang2.xlsx')
    world_number_list = []
    word_copy = []
    for w in word_lists_set:
        if len(w) == 1:
            continue
        if word_lists.count(w) > 3:
            world_number_list.append(word_lists.count(w))
            word_copy.append(w)
    information['key'] = word_copy
    information['count'] = world_number_list
    information.to_excel('sun_2.xlsx')  # save the result
def save_jieba_result():
    # segment the comment text with jieba
    jieba.enable_parallel(4)
    dirs = path.join(path.dirname(__file__), '../pjl_comment.txt')
    with codecs.open(dirs, encoding='utf-8') as f:
        comment_text = f.read()
    # join the words produced by jieba with spaces before writing them out
    cut_text = " ".join(jieba.cut(comment_text))
    with codecs.open('pjl_jieba.txt', 'a', encoding='utf-8') as f:
        f.write(cut_text)
def words_split(corpus_path):
    with open(corpus_path, 'r') as f:
        content = f.read()
    jieba.load_userdict('data/userdict.txt')  # load a custom user dictionary
    jieba.enable_parallel(4)  # enable parallel segmentation
    seg_list = jieba.cut(content, cut_all=False)  # segment in accurate mode
    return seg_list  # generator of segmented words
def __init__(self, n_core=16):
    self.rootdir = os.getcwd()
    self.STOP_WORDS_LIST = self.load_txt(path.join(self.rootdir, 'resources', 'stopwords_utf8.txt'))
    self.STOP_WORDS_LIST = set([re.sub('\n', '', item) for item in self.STOP_WORDS_LIST])
    jieba.load_userdict(path.join(self.rootdir, 'resources', 'emotion_user_dict.txt'))
    self.n_CORE = n_core
    jieba.enable_parallel(self.n_CORE - 1)  # leave one core free for the main process