The following 46 code examples, extracted from open-source Python projects, illustrate how to use jieba.cut().
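For reference, jieba.cut() takes a string and returns a generator of tokens. Below is a minimal sketch of the common calling patterns; the sample sentence is only illustrative, and the space-joined form at the end is the pattern most examples on this page use.

import jieba

sentence = u"我来到北京清华大学"  # any Chinese text works here

# Default (accurate) mode: returns a generator of tokens.
print("/".join(jieba.cut(sentence)))

# Full mode: emits every dictionary word found, overlaps included.
print("/".join(jieba.cut(sentence, cut_all=True)))

# Most examples below simply join the tokens into a space-separated string.
segmented = " ".join(jieba.cut(sentence))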
def WordBeark():
    logger.info("running Word Beark in " + path + data)

    inputfile = path + data + ".zhs"
    outputfile = path + data + ".wordbreak"
    i = 0
    output = open(outputfile, 'w')
    input = open(inputfile, 'r')

    for line in input.readlines():
        seg_list = jieba.cut(line)
        output.write(u' '.join(seg_list))

        i = i + 1
        if (i % 10000 == 0):
            logger.info("Cut " + str(i) + " articles")

    output.close()
    logger.info("Finished Saved " + str(i) + " articles in " + outputfile)

def sentenceToIndex(sentence, word2idx, maxLen):
    """
    Convert a sentence to a fixed-length sequence of word indices for embedding lookup.
    :param sentence: input sentence
    :param word2idx: mapping from word to index
    :param maxLen: maximum sentence length
    :return: list of word indices of length maxLen
    """
    unknown = word2idx.get("UNKNOWN", 0)
    num = word2idx.get("NUM", len(word2idx))
    index = [unknown] * maxLen
    i = 0
    for word in jieba.cut(sentence):
        if word in word2idx:
            index[i] = word2idx[word]
        else:
            if re.match(r"\d+", word):
                index[i] = num
            else:
                index[i] = unknown
        if i >= maxLen - 1:
            break
        i += 1
    return index

def bm25(p, titles, answers, scores):
    original_titles = copy.deepcopy(titles)
    titles = [remove_punctuation_re(title) for title in titles]
    answers = [remove_punctuation_re(answer) for answer in answers]
    p = remove_punctuation_re(p)
    titles = [' '.join(jieba.cut(title)) for title in titles]
    p = ' '.join(jieba.cut(p))
    wordindoc, wordindata, doclen, sumlen = init(titles, False)
    global avglen
    avglen = 1.0 * sumlen / N
    res = search(p, zip(titles, original_titles, answers, scores),
                 wordindoc, wordindata, doclen)
    titles, answers, scores = [], [], []
    for key, _ in res:
        titles.append(key[0])
        answers.append(key[1])
        scores.append(key[2])
    return titles, answers, scores

def get_html_text(url):
    response = requests.get(url)
    origin_text = response.text
    origin_text = re.sub(r'<script.*?>.*?</script>', '', origin_text,
                         flags=re.I | re.M | re.DOTALL)
    origin_text = re.sub(r'<style.*?>.*?</style>', '', origin_text,
                         flags=re.I | re.M | re.DOTALL)

    doc = html.fromstring(origin_text)
    text = doc.xpath('//body//text()')
    text = [i.strip() for i in text if i.strip()]
    text = ' '.join(text)

    seg = jieba.cut(text)
    stopwords = read_stopwords('./utils/stopwords.txt')  # callable read_stopwords()
    seg = [i.strip() for i in seg
           if i.strip() and not i.strip().isdigit() and i.strip() not in stopwords]
    seg = ' '.join(seg)
    return seg

def overlap_index(question, answer, q_len, a_len, stopwords=[]):
    # `cut` here is the project's own segmentation helper, assumed to be a
    # list-returning wrapper around jieba.cut (its result is sliced below).
    qset = set(cut(question))
    aset = set(cut(answer))

    q_index = np.zeros(q_len)
    a_index = np.zeros(a_len)
    overlap = qset.intersection(aset)
    for i, q in enumerate(cut(question)[:q_len]):
        value = 1
        if q in overlap:
            value = 2
        q_index[i] = value
    for i, a in enumerate(cut(answer)[:a_len]):
        value = 1
        if a in overlap:
            value = 2
        a_index[i] = value
    return q_index, a_index

def ma_overlap_zi(row):
    question = cut(row["question"])
    answer = cut(row["answer"])
    di_question = []
    di_answer = []
    for w in question:
        for i in range(len(w)):
            di_question.append(w[i])
    for w in answer:
        for i in range(len(w)):
            di_answer.append(w[i])
    di_overlap = set(di_question).intersection(set(di_answer))
    di_weight_p = dict({})
    for k in range(len(di_question)):
        if di_question[k] in di_overlap:
            # print int(100 * ((k + 1) / (len(question) + 1)))
            di_weight_p[di_question[k]] = ((k + 1) / len(di_question)) ** 3.2
            # alternative weighting: zi_weight[int(100 * ((k + 1) / (len(di_question) + 1)))]
    di_weight_all = 0.0
    for k in di_overlap:
        di_weight_all += di_weight_p[k]
    return di_weight_all / (len(di_answer) + 40)

def get_word_count(filename):
    data_source = open(filename, 'r')
    data = data_source.read()
    if (data != ''):
        temp_result = jieba.cut(data, cut_all=True)
        temp_result = '/'.join(temp_result)
        word_result = temp_result.split('/')
        word_view = {}  # word_view[i] flags whether word i has already been counted for this document
        for i in word_result:
            word_view[i] = 0
            if (i not in word_doc):
                word_doc[i] = 0
        for i in word_result:
            if (word_view[i] == 0):
                word_view[i] = 1
                word_doc[i] = word_doc[i] + 1

def print2file(f, title, responses, marker='', separater=True):
    if marker != '':
        f.write(marker + ' ')
    title_cutted = jieba.cut(title.strip(), cut_all=False)
    for word in title_cutted:
        f.write(word + ' ')
    f.write('\n')
    for response in responses:
        #print(response['Content'])
        #if response['Content'] not in count_response.keys():
        #    count_response[response['Content']] = 0
        #count_response[response['Content']] += 1
        if marker != '':
            f.write(marker + ' ')
        response_cutted = jieba.cut(response['Content'].strip(), cut_all=False)
        for word in response_cutted:
            f.write(word + ' ')
        f.write('\n')
    if separater:
        f.write('===\n')

def word_tokenization(tick_blog_list):
    '''
    word tokenization by jieba to list
    return list : [[,], [,], ...]
    '''
    count = 0
    seg_list = []
    try:
        for blog in tick_blog_list:
            count += 1
            if blog != '':
                segments = jieba.cut(blog)
                tmp = []
                for seg in segments:
                    tmp.append(seg)
                seg_list.append(tmp)
            else:
                print('Line %d is empty!' % count)
    except IOError as e:
        logging.error('IOError %s' % e)
    finally:
        return seg_list

#-------------------------------------------------------------------------------

def word_tokenization(tick_blog_list):
    '''
    word tokenization by jieba to list
    return list : [[,], [,], ...]
    '''
    count = 0
    seg_list = []
    try:
        for blog in tick_blog_list:
            if blog != '':
                count += 1
                segments = jieba.cut(blog)
                tmp = []
                for seg in segments:
                    tmp.append(seg)
                seg_list.append(tmp)
    except IOError as e:
        logging.error('IOError %s' % e)
    finally:
        return seg_list

def word_segment(line, stop=False, remain_number=True):
    '''
    Segment a line of text; if `stop` is True, stopwords are removed.
    '''
    if STOP_WORDS is None:
        load_stopwords()
    seg_list = jieba.cut(line, HMM=True)
    sl = []
    for word in seg_list:
        word = word.strip()
        if len(word) > 0 and word not in PUNCT:
            if stop:
                if word in STOP_WORDS:
                    word = None
            if word is not None and not remain_number:
                if util_func.atof(word) is not None:
                    word = None
            if word is not None:
                sl.append(word)
    return sl

def cut_for_property(self, text):
    '''
    @summary: segment text and keep each word's part-of-speech flag
    ---------
    @param text: input text
    ---------
    @result: returns [(text1, property1) ... (textN, propertyN)]
    '''
    words_list = []
    words = pseg.cut(text)
    for word in words:
        if word.word not in self._stop_words:
            words_list.append((word.word, word.flag))
    return words_list

def get_seg_features(string):
    """
    Segment text with jieba.
    Features are represented in BIES format: 0 denotes a single-character word,
    and 1/2/3 mark the begin/inside/end positions of a multi-character word.
    """
    seg_feature = []
    for word in jieba.cut(string):
        if len(word) == 1:
            seg_feature.append(0)
        else:
            tmp = [2] * len(word)
            tmp[0] = 1
            tmp[-1] = 3
            seg_feature.extend(tmp)
    return seg_feature

def get_all_keywords(file_name):
    word_lists = []  # all words
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        Lists = f.readlines()
        for li in Lists:
            cut_list = list(jieba.cut(li))
            for word in cut_list:
                word_lists.append(word)
    word_lists_set = set(word_lists)  # deduplicated words
    sort_count = []
    word_lists_set = list(word_lists_set)
    length = len(word_lists_set)
    print(u'There are %d distinct words in total' % length)
    k = 1
    for w in word_lists_set:
        sort_count.append(w + u':' + str(word_lists.count(w)) + u' times\n')
        print(u"%d---" % k + w + u":" + str(word_lists.count(w)) + u" times")
        k += 1
    with codecs.open('count_word.txt', 'w', encoding='utf-8') as f:
        f.writelines(sort_count)

def Delete_stopwords():
    print 'Removing stopwords...'
    f_stop = open('emotion_file/stopwords.txt')  # stopword list
    f_stop_list = []
    for word in f_stop.readlines():
        f_stop_list.append(word.strip())
    f_stop.close()

    f_text = open("emotion_file/data_zhuguan.txt", "r")  # input corpus
    f_nostop = codecs.open('emotion_file/data_zhuguan_nostop.txt', 'w', encoding='UTF-8')
    for text in f_text.readlines():  # process the corpus line by line
        f_seg_list = list(jieba.cut(text, cut_all=False))  # word segmentation
        for word in f_seg_list:
            if word in f_stop_list:
                print word
            else:
                f_nostop.write(word)
    f_text.close()
    print "Finished removing stopwords..."

def get_seg_features(string):
    """
    Segment text with jieba.
    Features are represented in BIES format: 0 denotes a single-character word,
    and 1/2/3 mark the begin/inside/end positions of a multi-character word.
    """
    seg_feature = []
    for word in jieba.cut(string):
        if len(word) == 1:
            seg_feature.append(0)
        else:
            tmp = [2] * len(word)
            tmp[0] = 1
            tmp[-1] = 3
            # note: extend is used here, not append
            seg_feature.extend(tmp)
    return seg_feature

def jieba_contend_split(contend):
    # Punctuation marks used as split points; the original Chinese characters were
    # garbled in the source encoding, so common marks are assumed here.
    punctuation = [u'，', u'/', u'。', u'！', u'？', u' ', u'\'']
    # Output format: [[(id, word), ...], ...] -- the comment is split into word
    # sub-sequences at each punctuation mark, each word paired with its position index.
    wordSequenceList = []
    seg_list = jieba.cut(contend)
    segmentedComment = [item for item in seg_list]
    segmentedCommentTuple = list(enumerate(segmentedComment))
    subWordSequenceList = []
    for wordTuple in segmentedCommentTuple:
        if wordTuple[1] in punctuation:
            if subWordSequenceList:
                wordSequenceList.append(subWordSequenceList)
                subWordSequenceList = []
        else:
            subWordSequenceList.append(wordTuple)
    if subWordSequenceList:
        wordSequenceList.append(subWordSequenceList)
    return wordSequenceList

def segByPunc(self):
    # Punctuation marks used as split points; the original Chinese characters were
    # garbled in the source encoding, so common marks are assumed here.
    punctuation = [u'，', u'/', u'。', u'！', u'？', u' ', u'\'']
    # Output format: [[(id, word), ...], ...] -- the comment sentence is split into
    # word sub-sequences at each punctuation mark.
    wordSequenceList = []
    seg_list = jieba.cut(self.commentSentence)
    segmentedComment = [item for item in seg_list]
    segmentedCommentTuple = list(enumerate(segmentedComment))
    subWordSequenceList = []
    for wordTuple in segmentedCommentTuple:
        if (wordTuple[1] in punctuation):
            if (subWordSequenceList != []):
                wordSequenceList.append(subWordSequenceList)
                subWordSequenceList = []
        else:
            subWordSequenceList.append(wordTuple)
    if (subWordSequenceList != []):
        wordSequenceList.append(subWordSequenceList)
    return (wordSequenceList)

def _asian_tokenization(doc, entity_type, tag_type, tokenizer):
    sents = []
    for paragraph in doc.split('\n'):
        # Split on sentence-ending punctuation; the original Chinese characters were
        # garbled in the source encoding, so common sentence enders are assumed here.
        sent_splits = iter(re.split(r'(。|！|？|；)+', paragraph, flags=re.MULTILINE))
        for partial_sent in sent_splits:
            sent = partial_sent + next(sent_splits, '')
            if sent.strip() == '':
                continue
            toks = []
            # for tok in jieba.cut(sent, ):
            for tok in tokenizer(sent):
                pos = 'WORD'
                if tok.strip() == '':
                    pos = 'SPACE'
                elif punct_re.match(tok):
                    pos = 'PUNCT'
                toks.append(Tok(pos, tok[:2].lower(), tok.lower(), tok,
                                ent_type='' if entity_type is None else entity_type.get(tok, ''),
                                tag='' if tag_type is None else tag_type.get(tok, '')))
            sents.append(Sentence(toks, sent))
    return Doc(sents, doc)

def get_result(url_set):
    line_set = []
    for url in url_set:
        wb_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        a = soup.select('span.ctt')
        for i in range(len(a)):
            text = re.sub('<[^>]*>', '', a[i].text)
            # text = re.sub(u'□□', ' ', text)  # removed a two-character Chinese token; the
            #                                   # original pattern was garbled in the source
            text = re.sub('[\W]+', ' ', text)
            line_set.append(text)
            #print(text)
            #writer.writerow((i,text))
    word_list = [" ".join(jieba.cut(sentence)) for sentence in line_set]
    new_text = ' '.join(word_list)
    wordcloud = WordCloud(font_path="C:/Python34/Lib/site-packages/wordcloud/simhei.ttf",
                          background_color="black").generate(new_text)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

def load_utf8_data_and_labels(positive_data_file, negative_data_file):
    # Load data from files
    positive_data = list(codecs.open(positive_data_file, "r", encoding='utf-8').readlines())
    positive_examples = list()
    for s in positive_data:
        positive_examples.append(" ".join(jieba.cut(s)))
    negative_data = list(codecs.open(negative_data_file, "r", encoding='utf-8').readlines())
    negative_examples = list()
    for s in negative_data:
        negative_examples.append(" ".join(jieba.cut(s)))
    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]

def test(self, input_str):
    '''
    Step 4: segment the input string, build its feature vector, and classify it
    with the trained SVM model.
    '''
    test_input = input_str
    x_test = np.zeros(self.count + 1)  # feature vector
    after_split = " ".join(jieba.cut(test_input))  # word segmentation
    words = after_split.split(" ")
    for i in words:
        i = i.replace('\n', '')
        i = i.replace('\r', '')
        i = i.replace(' ', '')
        if self.dictionary.__contains__(i.encode('utf-8')):
            x_test[self.dictionary[i.encode('utf-8')]] = 1.
        # else:
        #     print 'Cannot find: ' + i
    # the SVM predicts 1 for the positive class, otherwise 0
    if self.mySVM.predict([x_test]) == 1.:
        return 1
    else:
        return 0

def post_desc_counter():
    """
    Count word frequencies in the job post descriptions.
    """
    # import thulac
    post = open(os.path.join("data", "post_require.txt"), "r", encoding="utf-8").read()

    # segmentation with thulac
    # thu = thulac.thulac(seg_only=True)
    # thu.cut(post, text=True)

    # segmentation with jieba
    file_path = os.path.join("data", "user_dict.txt")
    jieba.load_userdict(file_path)
    seg_list = jieba.cut(post, cut_all=False)
    counter = dict()
    for seg in seg_list:
        counter[seg] = counter.get(seg, 0) + 1
    counter_sort = sorted(
        counter.items(), key=lambda value: value[1], reverse=True)
    pprint(counter_sort)

    with open(os.path.join("data", "post_pre_desc_counter.csv"), "w+", encoding="utf-8") as f:
        f_csv = csv.writer(f)
        f_csv.writerows(counter_sort)

def calculate_similarity(text1, text2):
    raw1 = jieba.cut(text1)
    raw2 = jieba.cut(text2)
    raw1 = Counter(raw1)
    raw2 = Counter(raw2)
    same_words = set(raw1) & set(raw2)
    if (math.sqrt(len(raw1)) * math.sqrt(len(raw2))) != 0:
        dot_product = 0
        mod1 = 0
        mod2 = 0
        for word in same_words:
            dot_product += raw1[word] * raw2[word]
        for word in raw1:
            mod1 += math.pow(raw1[word], 2)
        for word in raw2:
            mod2 += math.pow(raw2[word], 2)
        cos = dot_product / math.sqrt(mod1 * mod2)
    else:
        cos = 0
    return cos

def extract_tags(sentence, topK=20):
    words = jieba.cut(sentence)
    freq = {}
    for w in words:
        if len(w.strip()) < 2:
            continue
        if w.lower() in stop_words:
            continue
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())
    freq = [(k, v / total) for k, v in freq.iteritems()]
    tf_idf_list = [(v * idf_freq.get(k, median_idf), k) for k, v in freq]
    st_list = sorted(tf_idf_list, reverse=True)
    top_tuples = st_list[:topK]
    tags = [a[1] for a in top_tuples]
    return tags

def cut_Text(content, nomial=False):
    """
    :param content: string
    :param nomial: if nomial is True, only noun-like words will remain
    :return: a text whose format is 'a b c d'
    """
    if nomial:
        text = ''
        words = pseg.cut(content)
        for word in words:
            if contain(['n'], word.flag):
                text = text + ' ' + word.word
        return text.strip()
    else:
        text = ''
        words = jieba.cut(content)
        for word in words:
            text = text + ' ' + word
        return text.strip()

def cut_Dataset(data_set, parrel=False, nomial=False):
    """
    :param data_set: bunch of Dataset
    :param parrel: if True, cut the dataset in parallel (not available on Windows)
    :param nomial: if nomial is True, only noun-like words will remain
    :return: data_set after cutting
    """
    from tqdm import tqdm
    data_cut = []
    start = time.time()
    print('cutting dataset......')
    if parrel:
        p = ThreadPool(9)
        p.map(cut_Text, data_set.data)
        p.close()
        p.join()
    else:
        for doc_content in tqdm(data_set.data):
            data_cut.append(cut_Text(doc_content, nomial))
    end = time.time()
    print('cutting runs %0.2f seconds.' % (end - start))
    data_set.data = data_cut

def fetch(self):
    # cut the text in semi-redundant sequences of maxlen characters
    #text = self.text
    text = self.next_text()
    chars = self.chars
    maxlen = self.maxlen
    step = self.step
    maxlen = 20
    step = 3
    sentences = []
    next_chars = []
    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i: i + maxlen])
        next_chars.append(text[i + maxlen])
    print('nb sequences:', len(sentences))

    print('Vectorization...')
    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
    y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            X[i, t, self.char_indices[char]] = 1
        y[i, self.char_indices[next_chars[i]]] = 1
    return text, X, y

def predict(text):
    words = jieba.cut(text)
    words = " ".join(words)
    index2label = {i: l.strip() for i, l in enumerate(tv_classfication.label_list)}
    word2vec_model = Word2Vec.load(tv_classfication.word2vec_path)
    text_converter = data_convert.SimpleTextConverter(word2vec_model, 80, None)
    x_test = []
    for doc, _ in text_converter.transform_to_ids([words]):
        x_test.append(doc)
    x_test = np.array(x_test)
    graph = tf.Graph()
    with graph.as_default(), tf.Session() as sess:
        model = bi_lstm_model.Bi_lstm()
        model.restore_model(sess)
        print(tv_classfication.index2label.get(model.predict(sess, x_test)[0]))

def lyrics():
    with open('lyrics.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    tokens = list()
    for v in data.values():
        # segment each lyric, drop whitespace-only tokens, and keep tokens of at least two characters
        tokens += [seg for seg in jieba.cut(v) if seg.split() and len(seg) > 1]
    # count token frequencies
    counter = Counter(tokens)
    print(counter.most_common(10))
    # generate the word cloud; the font must support Chinese characters
    wcloud = WordCloud(font_path='NotoSansMonoCJKtc-Regular.otf').generate(' '.join(tokens))
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()

def cut_words(input_file, output_file):
    count = 0
    with io.open(output_file, mode='w', encoding='utf-8') as outfile:
        with io.open(input_file, mode='r', encoding='utf-8') as infile:
            for line in infile:
                line = line.strip()
                if len(line) < 1:  # empty line
                    continue
                if line.startswith('doc'):  # start or end of a passage
                    if line == 'doc':  # end of a passage
                        outfile.write(u'\n')
                        count = count + 1
                        if (count % 1000 == 0):
                            print('%s articles were finished.......' % count)
                    continue
                for word in jieba.cut(line):
                    outfile.write(word + ' ')
    print('%s articles were finished.......' % count)

def extract_tags(key_word, a_name):
    '''
    Build the tag string for a product name: take the first 8 tokens of the name,
    keep only those that also appear in jieba.analyse.extract_tags(), put the search
    keyword at the front, and return at most 5 tags joined by spaces.
    '''
    cut_tags = [tag for tag in jieba.cut(a_name)][:8]
    analyse_tags = jieba.analyse.extract_tags(a_name)
    tags = [tag for tag in cut_tags if tag in analyse_tags]

    # make sure the search keyword is the first tag and not duplicated
    try:
        tags.remove(key_word)
    except:
        pass
    tags.insert(0, key_word)

    if len(tags) > 5:
        tags = tags[:5]

    return ' '.join(tags)

def handleLine(self, line):
    # strip irrelevant characters
    line = line.replace(' ', '')
    line = line.replace('\n', '')
    line = line.replace('em', '')
    # word segmentation
    words = jieba.cut(line)
    for word in words:
        if len(word) <= 1:
            continue
        if word in self.data:
            self.data[word] = self.data[word] + 1
        else:
            self.data[word] = 1

def process_data(line):
    """
    Word-break the line and keep only whitelisted characters.
    Returns the processed sentence, or "UNK" if too little remains.
    """
    # Word break
    seg_list = jieba.cut(line)
    line = u' '.join(seg_list)
    # Remove unwanted characters
    ss = re.findall('[\n\s*\r\u4e00-\u9fa5]|nmovie|nrcelebrity', line)
    line = u"".join(ss).strip()
    if (len(line) < 2):
        return "UNK"
    return line

def mainTestInteractive(self, sess):
    """ Try predicting the sentences that the user will enter in the console
    Args:
        sess: The current running session
    """
    # TODO: If verbose mode, also show similar sentences from the training set with the same words (include in mainTest also)
    # TODO: Also show the top 10 most likely predictions for each predicted output (when verbose mode)
    # TODO: Log the questions asked for later re-use (merge with test/samples.txt)

    print('Testing: Launch interactive mode:')
    print('')
    print('Welcome to the interactive mode, here you can ask to Deep Q&A the sentence you want. Don\'t have high '
          'expectation. Type \'exit\' or just press ENTER to quit the program. Have fun.')
    import jieba
    while True:
        question = input(self.SENTENCES_PREFIX[0])
        if question == '' or question == 'exit':
            break
        questionc = jieba.cut(question, cut_all=False)
        question = str(" ".join(questionc)).decode("GBK")
        print(question)

        questionSeq = []  # Will contain the question as seen by the encoder
        answer = self.singlePredict(question, questionSeq)
        if not answer:
            print('Warning: sentence too long, sorry. Maybe try a simpler sentence.')
            continue  # Back to the beginning, try again

        print('{}{}'.format(self.SENTENCES_PREFIX[1], self.textData.sequence2str(answer, clean=True)))

        if self.args.verbose:
            print(self.textData.batchSeq2str(questionSeq, clean=True, reverse=True))
            print(self.textData.sequence2str(answer))

        print()

def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    #jieba.set_dictionary('jieba_dict/dict.txt.big')

    # load stopwords set
    #stopwordset = set()
    #with open('jieba_dict/stopwords.txt','r',encoding='utf-8') as sw:
    #    for line in sw:
    #        stopwordset.add(line.strip('\n'))

    output = open('allbook-segment.txt', 'w')
    texts_num = 0

    with open("allbook.txt", "rb") as f:
        #if(f.readline() == ""):
        print("getting data")
        bookdata = f.read(190000000).decode('UTF-8')
        print("getting data OK")
        lineu = bookdata
        p = 0
        for p in range(0, len(bookdata), 100):
            line = bookdata[p:p + 100]
            #print(line)
            words = jieba.cut(line, cut_all=False)
            for word in words:
                output.write(word + ' ')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Segmented %d text chunks so far" % texts_num)
    output.close()

def word_seg_cn(docs):
    docs = [list(jieba.cut(sent)) for sent in docs]
    return docs

def cutandsplit(s):
    for ln in filterlist(splitsentence(stripblank(s))):
        l = RE_BRACKETS.sub(brcksub, ln.strip())
        if notchinese(l):
            continue
        # Normalize CJK corner-bracket quotes to curly quotes before cutting; the original
        # quote characters were garbled in the source encoding, so 「」『』 are assumed here.
        yield ' '.join(cut(l.replace('「', '“').replace('」', '”')
                            .replace('『', '‘').replace('』', '’')
                            .lstrip(tailpunct).rstrip(headpunct)))