The following code examples, drawn from open-source Python projects, show how jieba.set_dictionary() is used in practice.
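All of the examples share the same basic pattern, sketched below as a minimal, self-contained snippet (the dictionary path 'dict.txt.big' and the sample sentence are placeholders, not taken from any particular project): point jieba at a custom main dictionary with jieba.set_dictionary() before the first segmentation, then segment text with jieba.cut().

import jieba

# Register a custom main dictionary before any segmentation happens;
# 'dict.txt.big' is a placeholder for whatever dictionary file a project ships.
jieba.set_dictionary('dict.txt.big')

# jieba.cut() returns a generator of tokens; cut_all=False selects precise mode.
words = jieba.cut('今天天氣很好', cut_all=False)
print(' '.join(words))

Note that set_dictionary() only records the path; the dictionary is loaded lazily on the first cut() call, or eagerly via an explicit jieba.initialize(), as one of the examples below does.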
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    # jieba.set_dictionary('jieba_dict/dict.txt.big')

    # load stopwords set
    # stopwordset = set()
    # with open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as sw:
    #     for line in sw:
    #         stopwordset.add(line.strip('\n'))

    output = open('allbook-segment.txt', 'w')
    texts_num = 0

    with open("allbook.txt", "rb") as f:
        print("getting data")
        bookdata = f.read(190000000).decode('UTF-8')
        print("getting data OK")

        # segment the text in 100-character chunks
        for p in range(0, len(bookdata), 100):
            line = bookdata[p:p + 100]
            # print(line)
            words = jieba.cut(line, cut_all=False)
            for word in words:
                output.write(word + ' ')

            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Segmented %d chunks so far" % texts_num)

    output.close()
def jiebaCustomSetting(self, dict_path, usr_dict_path):
    jieba.set_dictionary(dict_path)
    with open(usr_dict_path, 'r', encoding='utf-8') as dic:
        for word in dic:
            jieba.add_word(word.strip('\n'))
def __config_jieba(self):
    """Configure jieba with the custom dictionary."""
    jieba.set_dictionary(jieba_dictionary)
def tokenize_file(self, text_path, text_output_path='./tokenized_texts.txt'):
    """Tokenize a text file with jieba."""
    # jieba custom setting.
    jieba.set_dictionary(jieba_dictionary)

    # load stopwords set
    stopwordset = set()
    with open(jieba_stopwords, 'r', encoding='utf-8') as sw:
        for line in sw:
            stopwordset.add(line.strip('\n'))

    # line counter
    texts_num = 0

    # output file
    output = open(text_output_path, 'w')

    # read the input file line by line
    with open(text_path, 'r') as content:
        for line in content:
            line = line.strip('\n')

            # segment the line
            words = jieba.cut(line, cut_all=False)
            for word in words:
                if word not in stopwordset:
                    output.write(word + ' ')
            output.write('\n')

            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Segmented %d lines so far" % texts_num)

    output.close()
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    jieba.set_dictionary('jieba_dict/dict.txt.big')

    # load stopwords set
    stopwordset = set()
    with io.open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as sw:
        for line in sw:
            stopwordset.add(line.strip('\n'))

    texts_num = 0
    output = io.open('wiki_seg.txt', 'w', encoding='utf-8')

    with io.open('wiki_zh_tw.txt', 'r', encoding='utf-8') as content:
        for line in content:
            words = jieba.cut(line, cut_all=False)
            for word in words:
                if word not in stopwordset:
                    output.write(word + ' ')

            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Segmented %d lines so far" % texts_num)

    output.close()
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    jieba.set_dictionary('jieba_dict/dict.txt.big')

    # load stopwords set
    stopwordset = set()
    with open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as sw:
        for line in sw:
            stopwordset.add(line.strip('\n'))

    texts_num = 0
    output = open('wiki_seg.txt', 'w')

    with open('wiki_zh_tw.txt', 'r') as content:
        for line in content:
            line = line.strip('\n')
            words = jieba.cut(line, cut_all=False)
            for word in words:
                if word not in stopwordset:
                    output.write(word + ' ')

            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Segmented %d lines so far" % texts_num)

    output.close()
def testSetDictionary(self):
    jieba.set_dictionary("foobar.txt")
    for content in test_contents:
        result = jieba.cut(content)
        assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
        result = list(result)
        assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
        print(" , ".join(result), file=sys.stderr)
    print("testSetDictionary", file=sys.stderr)
def __init__(self, status):
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
    }
    self.headers2 = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36',
    }
    self.cookies = {
        '_ts_id': '999999999999999999',
    }
    self.cookies2 = {
        '_ts_id': '888888888888888888',
    }
    self.result_df = pd.DataFrame(columns=(
        'GID', 'price', 'discount', 'payment_CreditCard',
        'payment_Arrival', 'payment_ConvenienceStore', 'payment_ATM', 'payment_iBon',
        'preferential_count', 'img_height', 'is_warm', 'is_cold', 'is_bright', 'is_dark',
        '12H', 'shopcart', 'superstore', 'productFormatCount', 'attributesListArea',
        'haveVideo', 'Taiwan', 'EUandUS', 'Germany', 'UK', 'US', 'Japan', 'Malaysia', 'Australia', 'other',
        'supplementary', 'bottle', 'combination', 'look_times', 'label'))

    if status == 'c':
        self.with_header = False
    elif status == 'i':
        self.with_header = True
    else:
        raise SystemInputError("Invalid status argument: expected 'c' or 'i'")

    jieba.set_dictionary('dict.txt.big')  # use the traditional Chinese dictionary
def __init__(self, status):
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
    }
    self.headers2 = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36',
    }
    self.cookies = {
        '_ts_id': '999999999999999999',
    }
    self.cookies2 = {
        '_ts_id': '888888888888888888',
    }
    self.result_df = pd.DataFrame(columns=(
        'GID', 'price', 'discount', 'payment_CreditCard',
        'payment_Arrival', 'payment_ConvenienceStore', 'payment_ATM', 'payment_iBon',
        'preferential_count', 'img_height', 'is_warm', 'is_cold', 'is_bright', 'is_dark',
        '12H', 'shopcart', 'superstore', 'productFormatCount', 'attributesListArea',
        'haveVideo', 'Taiwan', 'EUandUS', 'Germany', 'UK', 'US', 'Japan', 'Malaysia', 'Australia', 'other',
        'look_times', 'label'))

    if status == 'c':
        self.with_header = False
    elif status == 'i':
        self.with_header = True
    else:
        raise SystemInputError("Invalid status argument: expected 'c' or 'i'")

    jieba.set_dictionary('dict.txt.big')  # use the traditional Chinese dictionary
def testSetDictionary(self):
    jieba.set_dictionary("foobar.txt")
    for content in test_contents:
        result = jieba.cut(content)
        assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
        result = list(result)
        assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
        print >> sys.stderr, " , ".join(result)
    print >> sys.stderr, "testSetDictionary"
def set_dic():
    _curpath = os.path.normpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))
    settings_path = os.environ.get('dict.txt')
    if settings_path and os.path.exists(settings_path):
        jieba.set_dictionary(settings_path)
    elif os.path.exists(os.path.join(_curpath, 'data/dict.txt.big')):
        jieba.set_dictionary('data/dict.txt.big')
    else:
        print "Using traditional dictionary!"
def __init__(self, slack, custom):
    self.slack = slack
    self.rundata = custom['data']
    self.colorPrint = custom['colorPrint']
    self.food_dir = "data/midnight.json"
    self.food_dic = "data/dict.txt.big"

    # find midnight channel
    self.nochannel = False
    rep = self.slack.api_call("channels.list")
    self.channel_id = ""
    for c in rep['channels']:
        if c['name'].lower() == custom['food_channelname']:
            self.channel_id = c['id']
            break
    if not self.channel_id:
        self.colorPrint(
            "No midnight channel",
            "Restart when midnight channel can use",
            color="FAIL")
        self.nochannel = True
        return

    jieba.set_dictionary(self.food_dic)
    jieba.initialize()

    # add and del words
    for word in self.rundata.get('FOOD_addword'):
        jieba.add_word(word)
    for word in self.rundata.get('FOOD_delword'):
        jieba.del_word(word)

    self.init()
def cut_main():
    jieba.set_dictionary('dict.txt.big')
    # jieba.load_userdict("userdict.txt")
    if len(sys.argv) == 3:
        inputfile = sys.argv[1]
        outputfile = sys.argv[2]
    else:
        print "Usage: python cut.py filetoCut.txt cuttedFile.txt"
        sys.exit()
    readNcut(inputfile, outputfile)
def cut_main(inputfile, outputfile):
    jieba.set_dictionary('dict.txt.big')
    # -----user define dict-----
    # jieba.load_userdict("userdict.txt")
    readNcut(inputfile, outputfile)