The following 48 code examples, extracted from open-source Python projects, illustrate how to use jieba.lcut().
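Before the project examples, here is a minimal sketch of the call itself (the sample sentence and variable names are illustrative): jieba.lcut() returns a plain Python list of tokens, while jieba.cut() returns a generator; cut_all=True switches to full mode, and jieba.lcut_for_search() produces the finer-grained tokens used for indexing.

import jieba

text = "结巴分词是一个常用的中文分词工具"  # illustrative sample sentence

print(jieba.lcut(text))                 # accurate mode (default): returns a list of tokens
print(jieba.lcut(text, cut_all=True))   # full mode: emit every word the dictionary can find
print(jieba.lcut_for_search(text))      # search-engine mode: extra fine-grained tokens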
def process_sent(sent, vocab_int, steps):
    """
    Tokenize a sentence and turn it into a fixed-length 2-D numpy array.
    :param sent:
    :param vocab_int:
    :param steps:
    :return:
    """
    sent_list = jieba.lcut(sent)
    # if a word is not in the vocab dict, fall back to a random index,
    # which may collide with another word's index
    index_list = [vocab_int[i] if i in vocab_int.keys() else np.random.randint(0, 90) for i in sent_list]
    if len(index_list) < steps:
        index_list = np.hstack((index_list, np.random.randint(0, 90, steps - len(index_list))))
    else:
        index_list = index_list[0: steps]
    return np.array([index_list])
def prepare_data(self):
    corpus_cut = np.array([jieba.lcut(s) for s in self.raw_corpus])
    vocabs = []
    for l in corpus_cut:
        for i in l:
            vocabs.append(i)
    # vocabs = reduce(lambda x, y: x+y, corpus_cut)
    # count every vocab's frequency,
    # but for now ignore which one is the 'most' frequent
    counter = collections.Counter(vocabs)
    counter = counter.most_common()
    vocabs_set, _ = zip(*counter)
    vocab_int_map = {vocab: index for index, vocab in enumerate(vocabs_set)}
    data_flatten = np.array([vocab_int_map[v] for v in vocabs])
    # step = 3
    data = np.array([data_flatten[i: i + self.n_steps + 1]
                     for i in range(0, data_flatten.shape[0] - self.n_steps - 1, 3)])
    # shuffle the data
    np.random.shuffle(data)
    return len(vocabs_set), vocab_int_map, data
def prepare_data(self):
    corpus_cut = np.array([jieba.lcut(s) for s in self.raw_corpus])
    vocabs = []
    for l in corpus_cut:
        for i in l:
            vocabs.append(i)
    # vocabs = reduce(lambda x, y: x+y, corpus_cut)
    # count every vocab's frequency,
    # but for now ignore which one is the 'most' frequent
    counter = collections.Counter(vocabs)
    counter = counter.most_common()
    vocabs_set, _ = zip(*counter)
    vocab_int_map = {vocab: index for index, vocab in enumerate(vocabs_set)}
    data_flatten = np.array([vocab_int_map[v] for v in vocabs])
    data = np.array([data_flatten[i: i + self.n_steps + 1]
                     for i in range(data_flatten.shape[0] // (self.n_steps + 1))])
    # shuffle the data
    np.random.shuffle(data)
    return len(vocabs_set), vocab_int_map, data
def delNOTNeedWords(content, customstopwords=None):
    # words = jieba.lcut(content)
    if customstopwords is None:
        customstopwords = "stopwords.txt"
        import os
        if os.path.exists(customstopwords):
            stop_words = codecs.open(customstopwords, encoding='UTF-8').read().split(u'\n')
            customstopwords = stop_words

    result = ''
    return_words = []
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "
    words = pseg.lcut(content)

    for word, flag in words:
        # print word.encode('utf-8')
        tempword = word.encode('utf-8').strip(' ')
        if (word not in customstopwords and len(tempword) > 0 and flag in
                [u'n', u'nr', u'ns', u'nt', u'nz', u'ng', u't', u'tg', u'f',
                 u'v', u'vd', u'vn', u'vf', u'vx', u'vi', u'vl', u'vg',
                 u'a', u'an', u'ag', u'al', u'm', u'mq', u'o', u'x']):
            # alternative filters tried previously:
            # and flag[0] in [u'n', u'f', u'a', u'z']
            # flag not in ["/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz", "/y"]
            result += tempword  # +"/"+str(w.flag)+" "
            return_words.append(tempword)
    return result, return_words
def result_by_time(self, sentence):
    seg_list = jieba.lcut(sentence, cut_all=False)
    n, cleaned_dict = self.clean_list(seg_list)
    time_scores = {}
    for term in cleaned_dict.keys():
        r = self.fetch_from_db(term)
        if r is None:
            continue
        docs = r[2].split('\n')
        for doc in docs:
            docid, date_time, tf, ld = doc.split('\t')
            if docid in time_scores:
                continue
            news_datetime = datetime.strptime(date_time, "%Y-%m-%d %H:%M:%S")
            now_datetime = datetime.now()
            td = now_datetime - news_datetime
            docid = int(docid)
            td = (timedelta.total_seconds(td) / 3600)  # hours
            time_scores[docid] = td
    time_scores = sorted(time_scores.items(), key=operator.itemgetter(1))
    if len(time_scores) == 0:
        return 0, []
    else:
        return 1, time_scores
def gen_idf_file(self):
    files = listdir(self.doc_dir_path)
    n = float(len(files))
    idf = {}
    for i in files:
        root = ET.parse(self.doc_dir_path + i).getroot()
        title = root.find('title').text
        body = root.find('body').text
        seg_list = jieba.lcut(title + '?' + body, cut_all=False)
        seg_list = set(seg_list) - self.stop_words
        for word in seg_list:
            word = word.strip().lower()
            if word == '' or self.is_number(word):
                continue
            if word not in idf:
                idf[word] = 1
            else:
                idf[word] = idf[word] + 1
    idf_file = open(self.idf_path, 'w', encoding='utf-8')
    for word, df in idf.items():
        idf_file.write('%s %.9f\n' % (word, math.log(n / df)))
    idf_file.close()
def delstopwords(content):
    result = ''
    words = jieba.lcut(content)
    return_words = []
    for w in words:
        if w not in app.config['stopwords']:
            result += w.encode('utf-8')  # +"/"+str(w.flag)+" "
            return_words.append(w.encode('utf-8'))
    # words = pseg.lcut(content)
    # with app.test_request_context():
    #     for word, flag in words:
    #         if (word not in app.config['stopwords'] and
    #                 flag not in ["/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz", "/y"]):
    #             result += word.encode('utf-8')  # +"/"+str(w.flag)+" "
    # print result
    return result, return_words
def delNOTNeedWords(content, stopwords):
    # words = jieba.lcut(content)
    result = ''
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "
    words = pseg.lcut(content)  # jieba.cut()
    text_list = []
    for word, flag in words:
        # print word.encode('utf-8')
        if (word not in stopwords and
                flag not in ["/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz", "/y"]):
            # text_list.append(word.encode('utf-8'))
            result += word.encode('utf-8')  # +"/"+str(w.flag)+" "
    # ''.join(text_list)
    return result
    # return ''.join(text_list)
def cut_with_stop_words(string):
    segs = jieba.lcut(string)
    final = []
    if True:
        for seg in segs:
            if seg not in stopwords:
                final.append(seg)
        return final
    else:
        return segs
def cut_with_stop_words(string):
    segs = jieba.lcut(string)
    final = []
    if False:
        for seg in segs:
            if seg not in stopwords:
                final.append(seg)
        return final
    else:
        return segs
def cut_with_stop_words(string):
    segs = jieba.lcut(string)
    final = ''
    for seg in segs:
        if seg not in stopwords:
            final = final + seg
    return final
def main(_): print("Loading vocabulary") cn_vocab_path = os.path.join(FLAGS.data_dir, "source_vocab.txt") en_vocab_path = os.path.join(FLAGS.data_dir, "target_vocab.txt") cn_vocab, _ = data_utils.initialize_vocabulary(cn_vocab_path) _, rev_en_vocab = data_utils.initialize_vocabulary(en_vocab_path) print("Building model...") config = tf.ConfigProto(allow_soft_placement=True) with tf.Session(config=config) as sess: model = create_model(sess, False) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: seg_list = jieba.lcut(sentence.strip()) #print(" ".join(seg_list)) token_ids = [cn_vocab.get(w.encode(encoding="utf-8"), data_utils.UNK_ID) for w in seg_list] #print(token_ids) outputs = model.test(sess, token_ids) outputs = outputs.tolist() if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] output = " ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs]) print(output.capitalize()) print("> ") sys.stdout.flush() sentence = sys.stdin.readline()
def jieba_example():
    raw = "????S5????,123,?,?"
    raw_seq = jieba.cut(raw)
    raw_seq_list = jieba.lcut(raw)
    raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
    raw_with_ictclas = pseg.cut(raw)
    for word, flag in raw_with_ictclas:
        print word, flag
def cut_with_flag(raw_str, filter_invalid_word_flag=True):
    """
    :param raw_str: str
    :return: list[(str, str)]
    """
    res = [(a, b) for a, b in pseg.lcut(raw_str)]
    if filter_invalid_word_flag:
        return filter_invalid_word(res)
    else:
        return res
def process_lyrics(file_name):
    lyrics = []
    content = clean_cn_corpus(file_name, clean_level='all', is_save=False)
    for l in content:
        if len(l) < 40:
            continue
        l = start_token + l + end_token
        lyrics.append(l)
    lyrics = sorted(lyrics, key=lambda line: len(line))
    print('all %d songs...' % len(lyrics))

    # if not os.path.exists(os.path.dirname(segment_list_file)):
    #     os.mkdir(os.path.dirname(segment_list_file))
    # if os.path.exists(segment_list_file):
    #     print('load segment file from %s' % segment_list_file)
    #     with open(segment_list_file, 'rb') as p:
    #         all_words = pickle.load(p)
    # else:
    all_words = []
    for lyric in lyrics:
        all_words += jieba.lcut(lyric, cut_all=False)
    # with open(segment_list_file, 'wb') as p:
    #     pickle.dump(all_words, p)
    # print('segment result has been saved into %s' % segment_list_file)

    # count how many times each word appears
    counter = collections.Counter(all_words)
    print(counter['E'])
    # sort by frequency
    counter_pairs = sorted(counter.items(), key=lambda x: -x[1])
    words, _ = zip(*counter_pairs)
    print('E' in words)
    words = words[:len(words)] + (' ',)
    word_int_map = dict(zip(words, range(len(words))))
    # translate all lyrics into int vectors
    lyrics_vector = [list(map(lambda word: word_int_map.get(word, len(words)), lyric)) for lyric in lyrics]
    return lyrics_vector, word_int_map, words
def segement(self, strs):
    return jieba.lcut(strs)
def word_tokenize(tokens):
    # return [token.replace("''", '"').replace("``", '"') for token in jieba.lcut(tokens, cut_all=False)]
    return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]

# from my.corenlp_interface import CoreNLPInterface
# url = 'vision-server2.corp.ai2'
# port = 8000
# interface = CoreNLPInterface(url, port)
# sent_tokenize = interface.split_doc
# word_tokenize = interface.split_sent
def get_train_data(language):
    # Load data from files
    path = "./data/" + language + "/"
    positive_examples = list(open(path + "rt-polarity.pos", "r").readlines())
    positive_examples = [s.strip() for s in positive_examples[:100]]  # -1000
    negative_examples = list(open(path + "rt-polarity.neg", "r").readlines())
    negative_examples = [s.strip() for s in negative_examples[:100]]
    x_text = positive_examples + negative_examples
    x_text = [sent for sent in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)

    # Build vocabulary
    max_length_of_sentence = max([len(jieba.lcut(x)) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_length_of_sentence)
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Randomly shuffle data
    np.random.seed(1234)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/cross-validation set
    cross_validation_indices = np.array(random.sample(np.arange(len(y)), int(len(y) * 0.1)))
    train_indices = np.array(list(set(np.arange(len(y))) - set(cross_validation_indices)))
    x_train, x_dev = x_shuffled[train_indices], x_shuffled[cross_validation_indices]
    y_train, y_dev = y_shuffled[train_indices], y_shuffled[cross_validation_indices]
    return [x_train, x_dev, y_train, y_dev, vocab_processor]
def cut(sentence):
    if not __init_seg:
        __init()
    return jieba.lcut(sentence)
def maxSimTxt(self, intxt, simCondision=0.1, simType='simple'):
    """
    Find the most similar question in the knowledge base for the input text.
    simType: simple, simple_pos, vec
    """
    self.lastTxt.append(intxt)
    if simType not in ('simple', 'simple_pos', 'vec'):
        return 'error: maxSimTxt got an unsupported simType: {}'.format(simType)

    # fall back to simple_pos when no word-vector model is loaded
    embedding = self.vecModel
    if simType == 'vec' and not embedding:
        simType = 'simple_pos'

    for t in self.zhishiku:
        questions = t.q_vec if simType == 'vec' else t.q_word
        in_vec = jieba.lcut(intxt) if simType == 'simple' else pseg.lcut(intxt)

        t.sim = max(
            similarity(in_vec, question, method=simType, embedding=embedding)
            for question in questions
        )
    maxSim = max(self.zhishiku, key=lambda x: x.sim)
    logger.info('maxSim=' + format(maxSim.sim, '.0%'))
    if maxSim.sim < simCondision:
        return 'Sorry, no sufficiently similar question was found in the knowledge base.'
    return maxSim.a
def tokenize(question, on='jieba'):
    """
    Tokenize a question and remove stopwords.
    :param question: the question text
    :return: list of tokens
    """
    if on == 'ltp':  # LTP segmentation
        words = segmentor.segment(question.encode('utf-8'))
        rv = _remove_stopwords([i.decode('utf-8') for i in words])
    else:  # jieba segmentation
        rv = _remove_stopwords(jieba.lcut(question))
    logging.debug("NLP:tokenize: {}".format(" ".join(rv)))
    return rv
def words_extract(news_folder):
    """Segment every news article under news_folder.

    Args:
        news_folder/
            <category>/
            <category>/
            ...
    """
    subfolder_list = [subfolder for subfolder in os.listdir(news_folder)
                      if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = []  # element: ([word1, word2, ...], "class")

    jieba.enable_parallel(4)  # parallel segmentation
    for subfolder in subfolder_list:
        news_class = subfolder
        subfolder = os.path.join(news_folder, subfolder)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
                word_list = jieba.lcut(content)
                data_list.append((word_list, news_class))  # element: ([word1, word2, ...], "class")
    jieba.disable_parallel()

    return data_list
def predict_with_content(classifier, news_content, feature_words):
    word_list = jieba.lcut(news_content)
    x = np.array([1 if word in word_list else 0 for word in feature_words]).reshape(1, -1)
    return classifier.predict(x)[0]
def get_feature_words(news_folder, size=1000, stopwords_file="stopwords.txt"): """???????????? Args: news_folder/ ??/ ??/ ??/ """ news_classes = [subfolder for subfolder in os.listdir(news_folder) \ if os.path.isdir(os.path.join(news_folder, subfolder))] stopwords = get_stopwords(stopwords_file) feature_words_dict = {} # ?????????? jieba.enable_parallel(4) for news_class in news_classes: subfolder = os.path.join(news_folder, news_class) news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder) \ if os.path.isfile(os.path.join(subfolder, news))] for news in news_list: with open(news, 'r') as f: content = f.read() word_list = jieba.lcut(content) for word in word_list: if not re.match("[a-z0-9A-Z]", word) and len(word) > 1 and word not in stopwords: if word in feature_words_dict: feature_words_dict[word] += 1 else: feature_words_dict[word] = 1 jieba.disable_parallel() feature_words_tuple = sorted(feature_words_dict.items(), key=lambda x:x[1], reverse=True) feature_words = list(list(zip(*feature_words_tuple))[0]) return set(feature_words[:size]) if len(feature_words) > size else set(feature_words)
def get_probability(news_folder, feature_words):
    """Compute the word probability matrix and class priors (prob_matrix, prob_classes).

    Args:
        news_folder/
            <category>/
            <category>/
            ...
    """
    news_classes = [subfolder for subfolder in os.listdir(news_folder)
                    if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = []  # element: ([word1, word2, ...], "class")
    prob_matrix = pd.DataFrame(index=feature_words, columns=news_classes)
    num_of_all_news = 0
    prob_classes = {}
    for cls in news_classes:
        prob_classes[cls] = 0

    jieba.enable_parallel(4)  # parallel segmentation
    for news_class in news_classes:
        prob_count = {}
        for word in feature_words:
            prob_count[word] = 1  # additive smoothing
        subfolder = os.path.join(news_folder, news_class)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
                word_list = jieba.lcut(content)
                for word in prob_count.keys():
                    if word in word_list:
                        prob_count[word] += 1
        news_nums = len(news_list)
        num_of_all_news += news_nums
        prob_classes[news_class] = news_nums
        for word in prob_count.keys():
            prob_matrix.loc[word, news_class] = prob_count[word] / (news_nums + 2)  # additive smoothing
    jieba.disable_parallel()

    for cls in prob_classes.keys():
        prob_classes[cls] = prob_classes[cls] / num_of_all_news
    return prob_matrix, prob_classes
def predict_with_content(prob_matrix, prob_classes, feature_words, content):
    word_list = set(jieba.lcut(content))
    result = {}
    for cls in prob_classes.keys():
        result[cls] = np.log(prob_classes[cls])
    for cls in result.keys():
        for word in feature_words:
            if word in word_list:
                result[cls] += np.log(prob_matrix.loc[word, cls])
            else:
                result[cls] += np.log(1 - prob_matrix.loc[word, cls])
    return max(result, key=result.get)
def predict_with_content(prob_matrix, prob_classes, feature_words, content):
    word_list = jieba.lcut(content)
    result = {}
    for cls in prob_classes.keys():
        result[cls] = np.log(prob_classes[cls])
    for cls in result.keys():
        for word in feature_words:
            if word in word_list:
                result[cls] += np.log(prob_matrix.loc[word, cls] * word_list.count(word))
            else:
                result[cls] += np.log(1 - prob_matrix.loc[word, cls])
    return max(result, key=result.get)
def cut(text, custom_words=['FLOAT', 'TIME', 'DATE', 'EOS']):
    jieba.enable_parallel(32)
    for word in custom_words:
        jieba.add_word(word)
    words = jieba.lcut(text)
    return words
def if_contains(self, one_page_des):
    kw_dict_high_ratio = {u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0,
                          u'??': 0, u'??': 0, u'???': 0, u'??': 0, u'??': 0}
    kw_dict_low_ratio = {u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0,
                         u'??': 0, u'??': 0, u'???': 0, u'??': 0, u'??': 0}
    # kw_dict = {u'??'}
    # kw_dict = {u'???'}
    # NOTE: kw_dict is expected to be defined elsewhere in the original project
    # (e.g. one of the keyword dicts above).
    seg_list = jieba.lcut(one_page_des, cut_all=False)
    for item in seg_list:
        if item in kw_dict:
            return 1
    return 0
def tokenize(sentence):
    cn_sent = get_cnstr(sentence)
    term_list = jieba.lcut(cn_sent, cut_all=False)
    final_term_list = [term for term in term_list if len(term) > 1 and is_cn_char(term)]
    return final_term_list
def process(file_name):
    content = read(file_name)
    words = jieba.lcut(content, cut_all=False)
    words = words + ['\n']
    vocab = set(words)
    word2int = {w: i for i, w in enumerate(vocab)}
    int2word = dict(enumerate(vocab))
    data = np.array([word2int[c] for c in words], dtype=np.int32)
    return data, word2int, int2word, vocab
def jieba_tokenizer(sentence):
    sentence = sentence.replace("^", " ")  # replace the "^" separator with a space before segmenting
    return jieba.lcut(sentence)
def jieba_tokenizer(self, sentence):
    return jieba.lcut(sentence)
def init(self):
    # cut
    self.img = []
    if os.path.exists(self.food_dir):
        self.imgs = json.loads(open(self.food_dir).read())
        for img in self.imgs:
            img['jieba'] = (jieba.lcut(img['title']))
        open(self.food_dir, "w").write(json.dumps(self.imgs))
    # build
    self.jieba_dic = {}
    for img in self.imgs:
        for jiba in img['jieba']:
            self.jieba_dic[jiba] = img
def wordSearch(self, text):
    textarr = jieba.lcut(text)
    self.colorPrint("Jieba cut", textarr)
    for t in textarr:
        if t in self.jieba_dic:
            return self.jieba_dic[t]
    raise ValueError("not found")
def imageAdd(self, img):
    self.colorPrint("Add Foods", img)
    img['jieba'] = (jieba.lcut(img['title']))
    for jiba in img['jieba']:
        self.jieba_dic[jiba] = img
    self.img.append(img)
    open(self.food_dir, "w").write(json.dumps(self.imgs))
def mycut(s):
    result = []
    j = 0
    s = re_replace.sub(' ', s)
    for i in not_cuts.finditer(s):
        result.extend(jieba.lcut(s[j:i.start()], HMM=False))
        if s[i.start()] in [u'?', u'“']:
            result.extend([s[i.start()], s[i.start()+1:i.end()-1], s[i.end()-1]])
        else:
            result.append(s[i.start():i.end()])
        j = i.end()
    result.extend(jieba.lcut(s[j:], HMM=False))
    return result