The following 37 code examples, extracted from open-source Python projects, illustrate how to use jieba.load_userdict().
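Before the project examples, here is a minimal, self-contained sketch of the basic call. The dictionary path my_dict.txt and the sample sentence are placeholders for illustration; the line format ("word [frequency] [POS tag]") follows jieba's user-dictionary convention.

import jieba

# Each line of the user dictionary holds a word, an optional frequency and an
# optional POS tag, e.g. "云计算 5 n". my_dict.txt is a hypothetical path.
jieba.load_userdict("my_dict.txt")

# Words from the user dictionary are kept intact during segmentation.
print("/ ".join(jieba.lcut("在自然语言处理中自定义词典很有用")))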
def __init__(self, dict_path=''):
    super(Singleton, self).__init__()
    if not hasattr(self, '_stop_words'):
        if dict_path:
            jieba.load_userdict(dict_path)
        self._stop_words = set((
            '', ' ', '\n', "the", "of", "is", "and", "to", "in", "that", "we",
            "for", "an", "are", "by", "be", "as", "on", "with", "can", "if",
            "from", "which", "you", "it", "this", "then", "at", "have", "all",
            "not", "one", "has", "or", "that"
        ))
def post_desc_counter():
    """Count word frequencies in the post descriptions."""
    # import thulac
    post = open(os.path.join("data", "post_require.txt"), "r", encoding="utf-8").read()
    # segmentation with thulac
    # thu = thulac.thulac(seg_only=True)
    # thu.cut(post, text=True)
    # segmentation with jieba
    file_path = os.path.join("data", "user_dict.txt")
    jieba.load_userdict(file_path)
    seg_list = jieba.cut(post, cut_all=False)
    counter = dict()
    for seg in seg_list:
        counter[seg] = counter.get(seg, 1) + 1
    counter_sort = sorted(
        counter.items(), key=lambda value: value[1], reverse=True)
    pprint(counter_sort)
    with open(os.path.join("data", "post_pre_desc_counter.csv"), "w+",
              encoding="utf-8") as f:
        f_csv = csv.writer(f)
        f_csv.writerows(counter_sort)
def get_hot_words(text):
    jieba.analyse.set_stop_words(STOPWORDS_PATH)
    jieba.load_userdict(USER_CORPUS)
    df = pd.DataFrame(jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=()))
    print(df)
    df.to_excel('./hotwords/DM.xlsx', 'DM')
def parse():
    """parse the comments"""
    import jieba
    import jieba.posseg as pseg
    # Load User's Dictionary
    path_list = os.getcwd().split('/')
    path_list.append("dict.txt")
    dict_path = '/'.join(path_list)
    jieba.load_userdict(dict_path)
    # Dismiss These Flags
    dismiss = ['b', 'c', 'r', 'uj', 'u', 'p', 'q', 'uz', 't', 'ul',
               'k', 'f', 'ud', 'ug', 'uv']
    comments = Comment.query.all()
    for comment in comments:
        word_list = []
        pseg_cut = pseg.cut(comment.body)
        for word, flag in pseg_cut:
            if flag not in dismiss:
                word_list.append(word)
        comment.parsed = '/'.join(word_list)
        db.session.add(comment)
        print "Comment %04d Parsed!" % comment.id
    db.session.commit()
    print "ALL DONE!"
def __init__(self):
    self.encoderFile = "./question.txt"
    self.decoderFile = './answer.txt'
    self.dictFile = 'word_dict.txt'
    # load the user dictionary for segmentation
    jieba.load_userdict(self.dictFile)
    # stop-words file
    self.stopwordsFile = "./preprocessing/stopwords.dat"
def __init__(self):
    print("tensorflow version: ", tf.__version__)
    tf.reset_default_graph()

    self.encoder_vec_file = "./preprocessing/enc.vec"
    self.decoder_vec_file = "./preprocessing/dec.vec"
    self.encoder_vocabulary = "./preprocessing/enc.vocab"
    self.decoder_vocabulary = "./preprocessing/dec.vocab"
    self.dictFile = './word_dict.txt'
    self.batch_size = 1
    self.max_batches = 10000
    self.show_epoch = 100
    self.model_path = './model/'
    # load the jieba user dictionary
    jieba.load_userdict(self.dictFile)

    self.model = dynamicSeq2seq(encoder_cell=LSTMCell(20),
                                decoder_cell=LSTMCell(40),
                                encoder_vocab_size=540,
                                decoder_vocab_size=1600,
                                embedding_size=20,
                                attention=True,
                                bidirectional=True,
                                debug=False,
                                time_major=True)
    self.location = ["??", "??", "??", "??", "??"]
    self.user_info = {"__username__": "Stephen", "__location__": "??"}
    self.robot_info = {"__robotname__": "JiJi"}
    self.dec_vocab = {}
    self.enc_vocab = {}
    tag_location = ''
    with open(self.encoder_vocabulary, "r") as enc_vocab_file:
        for index, word in enumerate(enc_vocab_file.readlines()):
            self.enc_vocab[word.strip()] = index
    with open(self.decoder_vocabulary, "r") as dec_vocab_file:
        for index, word in enumerate(dec_vocab_file.readlines()):
            self.dec_vocab[index] = word.strip()
def main(argv):
    f = open('freeRiderData.txt')
    jieba.load_userdict('KeywordDictionary.txt')
    for line in f:
        # segment the line
        seg_list = jieba.cut(line, cut_all=False)
        print("Default Mode: " + "/ ".join(seg_list))
    return
def __init__(self):
    self.__root_filepath = "f_dict/"
    jieba.load_userdict("f_dict/user.dict")  # load the user dictionary
    # load the sentiment-analysis dictionaries
    self.__phrase_dict = self.__get_phrase_dict()
    self.__positive_dict = self.__get_dict(self.__root_filepath + "positive_dict.txt")
    self.__negative_dict = self.__get_dict(self.__root_filepath + "negative_dict.txt")
    self.__conjunction_dict = self.__get_dict(self.__root_filepath + "conjunction_dict.txt")
    self.__punctuation_dict = self.__get_dict(self.__root_filepath + "punctuation_dict.txt")
    self.__adverb_dict = self.__get_dict(self.__root_filepath + "adverb_dict.txt")
    self.__denial_dict = self.__get_dict(self.__root_filepath + "denial_dict.txt")
def __init():
    user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
    jieba.load_userdict(user_dict_path)
    jieba.add_word(u"??", 10000)
    jieba.suggest_freq((u"?", u"??"))
    jieba.suggest_freq((u"??", u"??"))
    jieba.suggest_freq((u"??", u"??"))
    jieba.suggest_freq((u"??", u"?"))
def __init():
    user_dict_path = os.path.join(root_filepath, "f_seg/user_dict.txt")
    jieba.load_userdict(user_dict_path)
    jieba.add_word("??", 10000)
    jieba.suggest_freq(("?", "??"))
    jieba.suggest_freq(("??", "??"))
    jieba.suggest_freq(("??", "??"))
    jieba.suggest_freq(("??", "?"))
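The two __init() variants above also call jieba.add_word() and jieba.suggest_freq(), but their Chinese arguments were lost to encoding damage. As a hedged sketch with placeholder words, those calls are typically used like this:

import jieba

# register a new word with an explicit frequency
jieba.add_word("自定义词", freq=10000)

# force "台中" to be kept as a single token (tune=True applies the change)
jieba.suggest_freq("台中", True)

# force the pair to be split into two separate tokens
jieba.suggest_freq(("中", "将"), True)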
def __init__(self, userDict=None, conf={}):
    self.userDict = userDict
    self.conf = {}
    self.configFromDict(conf)
    if self.userDict:
        jieba.load_userdict(userDict)
    self.configDefault()
def __init__(self, custom_dict_path=CUSTOM_DICTIONARY_PATH):
    super(JiebaClient, self).__init__()
    try:
        jieba.load_userdict(custom_dict_path)
        self.debug("init JiebaClient, with custom_dict_path=%s", custom_dict_path)
    except Exception, e:
        self.exception(e)
        self.error('@@@@@@@@@@@@@@@@@@@@@@@@@@@ loading custom_dictionary failed')
def cutwords_jieba(self, sentence, userdict='dict/userdict.txt', stopwords='dict/stopwords.txt'):
    stropw = []
    if userdict:
        jieba.load_userdict(userdict)
    stropw = [line.strip() for line in open(stopwords, 'r', encoding='utf-8').readlines()]
    frequency = defaultdict(int)
    l = list(jieba.cut(sentence))
    for t in l:
        frequency[t] += 1
    texts = [token for token in frequency if frequency[token] > 0]
    rtexts = list(set(texts) - set(stropw))
    return rtexts
def read(self, file_name, POS_tag):
    f = open(file_name, "r")
    tempLine = []
    #vocabulary = {}
    jieba.load_userdict("data/metadata/user_dict.txt")
    for lineNo, line in enumerate(f.readlines()):
        pattern = re.compile("^<d p=\"(.+)\">(.+)</d>")
        m = pattern.match(line)
        if m:
            info = m.group(1).split(',')
            temp = {"time": int(float(info[0])),
                    "text": [word for word, flag in pseg.cut(m.group(2))
                             if word not in self.stop_words and flag not in POS_tag],
                    "lineno": lineNo + 1,
                    "user": info[6]}
            # keep only words longer than one character, and only comments
            # that still contain at least three such words
            temp2 = []
            for index, text in enumerate(temp["text"]):
                if len(text) > 1:
                    temp2.append(text)
            if len(temp2) >= 3:
                print(temp2)
                temp["text"] = temp2
                tempLine.append(temp)
    lines = sorted(tempLine, key=lambda e: (e.__getitem__('time')))
    print len(lines)
    return lines  #,vocabulary
def __init__(self):
    self.ut_path = '../data/ut.data'
    self.vocab_path = '../data/vocab.data'
    self.ids_path = '../data/ids.data'
    self.train_path = '../data/train.data'
    self.dev_path = '../data/dev.data'
    self.test_path = '../data/test.data'
    self.dict_path = '../data/medical.txt'
    self.emd_path = '../data/emd/ylemd.bin'
    self.tag_path = '../data/tag.data'
    jieba.load_userdict(self.dict_path)
def __init__(self):
    self.ut_path = '../data/uterance.data'
    self.mark_path = '../data/mark.data'
    self.vocab_path = '../data/vocab.data'
    self.ids_path = '../data/ids.data'
    self.train_path = '../data/train.data'
    self.dev_path = '../data/dev.data'
    self.test_path = '../data/test.data'
    self.dict_path = '../data/medical.txt'
    self.emd_path = '../data/emd/ylemd.bin'
    jieba.load_userdict(self.dict_path)
def __init__(self, size):
    self.data_path = 'skin.data'
    self.train_size = int(size * 0.7)
    self.dev_size = int(size * 0.1)
    self.test_size = size - self.train_size - self.dev_size
    jieba.load_userdict('medical.txt')
    self.sentences = []
    self.orders = []
    self.stop_line = []
    for line in open('goodbye.data'):
        line = line.strip()
        self.stop_line.append(line)
    self.ac_dialogs = []
def __init__(self): jieba.load_userdict("keyword.txt") jieba.load_userdict("mingan_word.txt") self.topK = 12 self.mingan_list = [] self.get_mingan_list()
def __init__(self):
    self.encoderFile = "./question.txt"
    self.decoderFile = './answer.txt'
    self.dictFile = 'word_dict.txt'
    jieba.load_userdict(self.dictFile)
    self.stopwordsFile = "./preprocessing/stopwords.dat"
def __init__(self):
    print("tensorflow version: ", tf.__version__)
    tf.reset_default_graph()

    self.encoder_vec_file = "./preprocessing/enc.vec"
    self.decoder_vec_file = "./preprocessing/dec.vec"
    self.encoder_vocabulary = "./preprocessing/enc.vocab"
    self.decoder_vocabulary = "./preprocessing/dec.vocab"
    self.dictFile = './word_dict.txt'
    self.batch_size = 1
    self.max_batches = 100000
    self.show_epoch = 100
    self.model_path = './model/'
    # load the jieba user dictionary
    jieba.load_userdict(self.dictFile)

    self.model = dynamicSeq2seq(encoder_cell=LSTMCell(40),
                                decoder_cell=LSTMCell(40),
                                encoder_vocab_size=600,
                                decoder_vocab_size=1600,
                                embedding_size=20,
                                attention=False,
                                bidirectional=False,
                                debug=False,
                                time_major=True)
    self.location = ["??", "??", "??", "??"]
    self.user_info = {"__username__": "yw", "__location__": "??"}
    self.robot_info = {"__robotname__": "Rr"}
    self.dec_vocab = {}
    self.enc_vocab = {}
    self.dec_vecToSeg = {}
    tag_location = ''
    with open(self.encoder_vocabulary, "r") as enc_vocab_file:
        for index, word in enumerate(enc_vocab_file.readlines()):
            self.enc_vocab[word.strip()] = index
    with open(self.decoder_vocabulary, "r") as dec_vocab_file:
        for index, word in enumerate(dec_vocab_file.readlines()):
            self.dec_vecToSeg[index] = word.strip()
            self.dec_vocab[word.strip()] = index
def cut_main():
    jieba.set_dictionary('dict.txt.big')
    #jieba.load_userdict("userdict.txt")
    if len(sys.argv) == 3:
        inputfile = sys.argv[1]
        outputfile = sys.argv[2]
    else:
        print "Usage: python cut.py filetoCut.txt cuttedFile.txt"
        sys.exit()
    readNcut(inputfile, outputfile)
def cut_main(inputfile, outputfile):
    jieba.set_dictionary('dict.txt.big')
    #-----user define dict-----
    #jieba.load_userdict("userdict.txt")
    readNcut(inputfile, outputfile)
def load_userdict(): """ Load user dictionary """ # ???? jieba.load_userdict("./dict/name/amuse.txt"); jieba.load_userdict("./dict/name/sporter.txt"); jieba.load_userdict("./dict/name/politicians.txt"); # ???? jieba.load_userdict("./dict/sport.txt"); # ???? # ???? jieba.load_userdict("./dict/dict.txt");
def load_userdict():
    # person-name dictionaries
    jieba.load_userdict("./dict/name/amuse.txt");
    jieba.load_userdict("./dict/name/sporter.txt");
    jieba.load_userdict("./dict/name/politicians.txt");
    # sports vocabulary
    jieba.load_userdict("./dict/sport.txt");
    # general dictionary
    jieba.load_userdict("./dict/dict.txt");
def words_split(corpus_path):
    with open(corpus_path, 'r') as f:
        content = f.read()
    jieba.load_userdict('data/userdict.txt')  # load the custom dictionary
    jieba.enable_parallel(4)  # enable parallel segmentation
    seg_list = jieba.cut(content, cut_all=False)  # exact-mode segmentation
    return seg_list  # return the segmented words
def __init__(self):
    #self.encoderFile = "/home/yanwii/Python/NLP/seq2seq/seq2seq_no_buckets/preprocessing/MySeq2seq/Data/alldata_ask.txt"
    #self.decoderFile = '/home/yanwii/Python/NLP/seq2seq/seq2seq_no_buckets/preprocessing/MySeq2seq/Data/alldata_answer.txt'
    #self.savePath = '/home/yanwii/Python/NLP/seq2seq/seq2seq_pytorch/data/'
    self.encoderFile = "./data/question.txt"
    self.decoderFile = "./data/answer.txt"
    self.savePath = './data/'
    jieba.load_userdict("./data/supplementvocab.txt")
def __init__(self, diction=None, content=None):
    self.diction = diction or "assets/location.dict"
    self.content = content or ""
    jieba.load_userdict(self.diction)
def gen_dataset_from_baike():
    doc_path = os.path.join(rel_ext_dir, 'sample_baike_doc.json')
    out_path = os.path.join(rel_ext_dir, 'data/raw_dataset.txt')
    name2fb_path = os.path.join(cache_dir, 'DatasetFinder.name2fb.cache')
    fb_ttls_path = os.path.join(cache_dir, 'DatasetFinder.fb_ttls.cache')
    finder = DatasetFinder.load_from_cache(name2fb_path, fb_ttls_path)

    Print('load userdict')
    jieba.load_userdict(os.path.join(rel_ext_dir, 'trimmed_baike_dict.txt'))
    Print('gen dataset from [%s]' % doc_path)

    outf = file(out_path, 'w')
    for line in tqdm(file(doc_path), total=nb_lines_of(doc_path)):
        p = line.split('\t')
        baike_url = p[0].decode('utf-8')
        paragraphs = json.loads(p[1])
        for paragraph in paragraphs:
            sentences = split_sentences(paragraph)
            for sentence in sentences:
                cases, words = gen_dataset(sentence, finder)
                if len(cases) > 0:
                    out_obj = {
                        'words': "#".join(words),
                        'cases': map(str, cases),
                    }
                    outf.write("%s\t%s\n" % (baike_url, json.dumps(out_obj, ensure_ascii=False)))
    outf.close()
def segment_text(text):
    # load user dict
    jieba.load_userdict(user_dict)
    # set stop words
    jieba.analyse.set_stop_words(stop_words)
    tags = jieba.analyse.extract_tags(text, topK=20, withWeight=True, allowPOS=())
    for tag in tags:
        print(str(tag[0]) + "\t" + str(tag[1]))
def __init__(self, n_core=16):
    self.rootdir = os.getcwd()
    self.STOP_WORDS_LIST = self.load_txt(path.join(self.rootdir, 'resources', 'stopwords_utf8.txt'))
    self.STOP_WORDS_LIST = set([re.sub('\n', '', item) for item in self.STOP_WORDS_LIST])
    jieba.load_userdict(path.join(self.rootdir, 'resources', 'emotion_user_dict.txt'))
    self.n_CORE = n_core
    jieba.enable_parallel(self.n_CORE - 1)
def __init__(self):
    self.__root_path = "data/dict/"
    jieba.load_userdict("data/dict/user.dict")  # load the user dictionary
    # load the sentiment-analysis dictionaries
    self.__phrase_dict = self.__get_phrase_dict()
    self.__positive_dict = self.__get_dict(self.__root_path + "positive_dict.txt")
    self.__negative_dict = self.__get_dict(self.__root_path + "negative_dict.txt")
    self.__conjunction_dict = self.__get_dict(self.__root_path + "conjunction_dict.txt")
    self.__punctuation_dict = self.__get_dict(self.__root_path + "punctuation_dict.txt")
    self.__adverb_dict = self.__get_dict(self.__root_path + "adverb_dict.txt")
    self.__denial_dict = self.__get_dict(self.__root_path + "denial_dict.txt")
def read(self, file_name, timelength):
    #f = open("data/1993410.txt", "r")
    #timelength = 5640
    # f = open("data/5077534.txt", "r")
    # timelength = 4740
    f = open(file_name, "r")
    #timelength = 2582
    tempLine = []
    #vocabulary=set()
    vocabulary = {}
    jieba.load_userdict("data/metadata/user_dict.txt")
    for lineNo, line in enumerate(f.readlines()):
        pattern = re.compile("^<d p=\"(.+)\">(.+)</d>")
        m = pattern.match(line)
        if m:
            temp = {}
            temp = {"time": int(float(m.group(1).split(',')[0])),
                    "text": [word for word, flag in pseg.cut(m.group(2))
                             if word not in self.stop_words and flag not in
                             ["m", "w", "g", "c", "o", "p", "z", "q", "un", "e", "r", "x",
                              "d", "t", "h", "k", "y", "u", "s", "uj", "ul", "r", "eng"]],
                    "lineno": lineNo + 1}
            if len(temp["text"]) > 3:
                tempLine.append(temp)
                for item in temp["text"]:
                    if item not in vocabulary:
                        vocabulary[item] = 0
    #print(len(tempLine))
    lines = sorted(tempLine, key=lambda e: (e.__getitem__('time')))
    # print vocabulary
    # print "vocabulary size: %d " % len(vocabulary)
    # print "video comment size: %d " % len(lines)
    # print lines[12]
    self.store(lines, timelength)
    return lines, timelength, vocabulary
def __init__(self, user_dict=None):
    """
    Init WordSegment Client
    @user_dict: optional path to a user dictionary; when provided it is loaded into jieba
    """
    self.user_dict = user_dict
    if self.user_dict is not None:
        jieba.load_userdict(self.user_dict)
def clean():
    jieba.load_userdict("../data/segmention/unigram.txt")
    output = open("./train.data", "w")
    with open("../data/prepare_data", "r") as f:
        for line in f:
            line = unicode(line.strip())
            # lower-case the query
            line = line.lower()
            # drop queries that are too short
            if len(line) <= 2:
                continue
            # drop queries that are 18-digit IDs
            if re.match('[0-9]{18}', line) != None:
                continue
            # drop queries that contain no Chinese characters
            eng_flag = True
            for i in line:
                if i >= u'\u4e00' and i <= u'\u9fa5':
                    eng_flag = False
                    break
            if eng_flag == True:
                continue
            # segment the query
            ll = jieba.cut(line)
            line = []
            for i in ll:
                if i == u"\u2006" or i == u" " or i == " ":
                    continue
                line.append(i)
            # replace words with their synonyms
            for i in range(len(line)):
                if synonym_dict.has_key(line[i]):
                    line[i] = synonym_dict[line[i]]
            # drop duplicate queries
            if line in s_list:
                continue
            l = ",".join(line)
            s_list.append(line)
            output.write(l + "\n")
    output.close()
    return
def __init__(self, itemInfos):
    lastTime = time.time()
    # itemInfos : dict[(pid, description)]
    # train model
    jieba.load_userdict('./dict.txt.big.txt')
    stopWords = set([line.strip().decode("gbk").lower() for line in open("./stopWords.txt")])
    stopWords.add('\n')
    stopWords.add(' ')
    stopWords.add(u'\u2022')
    stopWords.add(u'\xa9')
    texts = []
    self.name2id = {}
    self.id2name = []
    for k, v in itemInfos.iteritems():
        seg_list = [w.lower() for w in jieba.cut(v, cut_all=False) if w.lower() not in stopWords]
        texts.append(list(seg_list))
        self.name2id[k] = len(self.id2name)
        self.id2name.append(k)
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    print "start cast :", (time.time() - lastTime)
    lastTime = time.time()
    dictionary = corpora.Dictionary(texts)
    print "dictionary cast :", (time.time() - lastTime)
    lastTime = time.time()
    corpus = [dictionary.doc2bow(text) for text in texts]
    print "doc2bow cast :", (time.time() - lastTime)
    lastTime = time.time()
    tfidf = models.TfidfModel(corpus)
    print "tfid model cast :", (time.time() - lastTime)
    lastTime = time.time()
    corpus_tfidf = tfidf[corpus]
    print "tfidf corpus cast :", (time.time() - lastTime)
    lastTime = time.time()
    self.lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
    print "lsi model cast :", (time.time() - lastTime)
    lastTime = time.time()
    #corpus_lsi = lsi[corpus_tfidf]
    self.index = similarities.MatrixSimilarity(self.lsi[corpus])
    self.corpus = corpus
    self.pidName = getPidName()
    print "init finish"