def parse_item(item):
    """
    Enrich a record in place with tokenised fields and return it.

    Keys added or overwritten on ``item``:
      * ``abstract_seq_tf`` / ``abstract_seq`` -- the two values returned by
        ``build_tf(item['abstract'])``, in that order.
      * ``school_seq`` -- ``jieba.lcut_for_search(item['school'])``.
      * ``title_seq`` -- element ``[1]`` of ``build_tf(item['title'])``.
      * ``_id`` -- replaced by its ``str()`` form.

    NOTE(review): judging by the key names, ``build_tf`` presumably returns a
    (term-frequency, token-sequence) pair -- confirm against its definition.

    :param item: dictionary with ``abstract``, ``school``, ``title`` and
                 ``_id`` keys
    :return: the same dictionary object, mutated
    """
    # Abstract first: both derived fields come from a single build_tf call.
    abstract_result = build_tf(item['abstract'])
    item['abstract_seq_tf'], item['abstract_seq'] = abstract_result

    item['school_seq'] = jieba.lcut_for_search(item['school'])

    # Only the second element of build_tf's result is kept for the title.
    title_result = build_tf(item['title'])
    item['title_seq'] = title_result[1]

    item['_id'] = str(item['_id'])
    return item
def __init__(self):
    """
    Bind the module-level DB handles, make sure the ``Competition`` table
    exists, and pre-tokenise every entry of ``competition_list``.

    For each name in ``competition_list`` the tokens produced by
    ``jieba.lcut_for_search`` are filtered against an exclusion pattern
    (tokens whose start matches the pattern are dropped) and the surviving
    tokens are appended, as one list per name, to the module-level
    ``competition_split``.
    """
    self.conn = conn
    self.cursor = cursor

    sql = ''' CREATE TABLE IF NOT EXISTS Competition( id INT PRIMARY KEY AUTO_INCREMENT, title VARCHAR(100), publishdate datetime, detail TEXT )ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;'''
    self.cursor.execute(sql)
    self.conn.commit()

    # The pattern is loop-invariant, so decode and compile it once instead
    # of once per token.
    # NOTE(review): the literal below looks mis-encoded in this copy of the
    # file ('?' runs); restore the intended words from the original source.
    excluded = re.compile('??|??|??|???|??|??|??|?|??|??|??'.decode('utf8'))

    for match in competition_list:
        # Search-mode segmentation of the competition name.
        splitword = jieba.lcut_for_search(match)
        # Keep only tokens that do NOT start with an excluded word.
        onesplit = [word for word in splitword if excluded.match(word) is None]
        competition_split.append(onesplit)
def search_cut(sentence):
    """
    Tokenise ``sentence`` with jieba's search-engine segmentation mode.

    Thin wrapper around ``jieba.lcut_for_search`` called with its default
    options.
    NOTE(review): the original docstring mentioned HMM; whether HMM is active
    depends on jieba's default for this call -- confirm if it matters.

    :param sentence: text to segment
    :return: whatever ``jieba.lcut_for_search`` returns for ``sentence``
    """
    tokens = jieba.lcut_for_search(sentence)
    return tokens
def make_inverted_index(filename,read_buff_size,output_file_record_size,web_record_numbers=100000):
    '''
    Build an inverted index for ``filename`` via intermediate block files.

    Steps:
      1. Read records through ``read_block(read_buff_size, filename)``,
         segment each record's content with ``jieba.lcut_for_search`` and
         push the doc id and the surviving tokens into numbered
         ``SPIMI_Invert`` files under a temporary ``<base>_buff`` directory.
      2. Merge the numbered files with ``merge_inverted_files.merge_file``.
      3. Run ``Dictionary.establish_ditionary`` on the merged file.
      4. Copy the merged index to ``<base>_inverted_index.txt`` and the
         dictionary to ``<base>_index_Dictionary.txt``, then remove the
         temporary directory.

    ``<base>`` is ``filename`` with its last four characters removed, so the
    name is expected to end in a 4-character extension such as ``.txt``.

    :param filename: path of the input record file
    :param read_buff_size: buffer size forwarded to the reader, the merger
                           and the dictionary builder
    :param output_file_record_size: number of records written to each
                                    intermediate file; must not be 0
    :param web_record_numbers: expected total number of records, used only
                               for the progress ratio that is printed
    :return: None
    :raises ValueError: if ``output_file_record_size`` is 0
    '''
    if output_file_record_size == 0:
        # A zero-sized block can never hold a record; the loop below would
        # otherwise create empty files forever.
        raise ValueError("output_file_record_size must not be 0")

    block_read=read_block(read_buff_size,filename)
    punct = set(u'''/+%#:!),.:;?]}¢'"???????????????? ????????????????????????????? ??•·???--?’”([{£¥'"?????????????????? ?????????“‘-—_…''')
    Letters_and_numbers=set('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')

    # Temporary working directory for the intermediate block files.
    base_name=filename[:-4]
    buff_dir=base_name+'_buff'
    if not os.path.exists(buff_dir):
        os.mkdir(buff_dir)

    file_numbers=1
    exhausted=False
    while not exhausted:
        progress=file_numbers*(output_file_record_size)*1.0/web_record_numbers
        # Joined by hand so the output is the same "text value" line under
        # both the Python 2 print statement and the print() function.
        print(" ".join(["process :cuting word +making inverted_index files---->>>>",str(progress)]))
        spimi=SPIMI_Invert(buff_dir+'/'+str(file_numbers)+'.txt')
        count=0
        # Test the per-file limit BEFORE popping: every pop_token() call
        # consumes a record, so popping first and then breaking on the
        # limit would silently drop one record at each file boundary.
        # ``!=`` (not ``<``) keeps the original behaviour for limits that
        # ``count`` never reaches exactly.
        while count!=output_file_record_size:
            doc_id,content=block_read.pop_token()
            if content=='':
                # An empty content string signals that the reader is done.
                exhausted=True
                break
            spimi.push_id(doc_id)
            for token in jieba.lcut_for_search(content):
                # Skip punctuation and single ASCII letters/digits.
                if token not in punct and token not in Letters_and_numbers:
                    spimi.push_word(token)
            # Drop the references before the next record is read.
            del doc_id,content
            count+=1
        # An empty word is pushed once after the last record of each file.
        spimi.push_word('')
        file_numbers+=1
    print("process :cuting word +making inverted_index files---->>>>Finish")

    # Merge the numbered block files "1" .. str(file_numbers-1).
    merged_filename=merge_inverted_files.merge_file([str(i) for i in range(1,file_numbers)],read_buff_size,buff_dir+'/')
    print("process:mergeing inverted index files----->Finish")

    # Build the dictionary file from the merged index.
    merged_path=buff_dir+'/'+merged_filename+'.txt'
    dictionary_path=buff_dir+'/'+"Dictionary.txt"
    Dictionary.establish_ditionary(merged_path,read_buff_size,dictionary_path)

    # Publish the results next to the input file, then clean up.
    shutil.copy(merged_path,base_name+'_inverted_index.txt')
    shutil.copy(dictionary_path,base_name+'_index_Dictionary.txt')
    shutil.rmtree(buff_dir)
def releventScore(self, text, ques, tfidf=None):
    """
    Score how relevant ``text`` is to the question ``ques``.

    Both strings are segmented with ``jieba.lcut_for_search`` and stripped of
    ``STOPWORDS``. Every (text token, question token) pair then contributes:

      * ``2.5 * weight`` when the two tokens are equal;
      * ``similarity * weight`` when their similarity exceeds 0.4;
      * nothing otherwise.

    ``weight`` is ``tfidf[text_token]`` when present, else 1. Similarity is
    the cosine of the two vectors in ``self.dic`` when both tokens have one;
    otherwise it is a normalised edit-distance ratio scaled by 0.7.

    The sum is divided by both token counts (so longer inputs get no
    advantage) and multiplied by 100.

    :param text: candidate text
    :param ques: question text
    :param tfidf: optional mapping of token -> weight
    :return: the score, or 0 when either input has no tokens left after
             stop-word filtering
    """
    # None instead of a shared mutable {} default.
    weights = {} if tfidf is None else tfidf

    def filtWord(li):
        # Drop stop words, keeping the original token order.
        return [tok for tok in li if tok not in STOPWORDS]

    def sims(t, q):
        vectors = self.dic
        if t in vectors and q in vectors:
            dot_product = 0.0
            normA = 0.0
            normB = 0.0
            for a, b in zip(vectors[t], vectors[q]):
                dot_product += a * b
                normA += a ** 2
                normB += b ** 2
            if normA == 0.0 or normB == 0.0:
                return 0
            return dot_product / ((normA * normB) ** 0.5)

        # No vector for at least one token: fall back to edit distance.
        longest = max(len(t), len(q))
        distance = Levenshtein.distance(t, q)
        if distance < longest:
            # float() keeps this a true division under Python 2 as well;
            # integer division would floor every partial match to 0.
            return (longest - distance) / float(longest) * 0.7
        return 0

    ttoks = filtWord(jieba.lcut_for_search(text))
    qtoks = filtWord(jieba.lcut_for_search(ques))

    # Either side being empty would make the normalisation below divide by
    # zero; there is nothing to match, so the score is 0.
    if not ttoks or not qtoks:
        return 0

    score = 0
    for tword in ttoks:
        # The weight depends only on the text token, so look it up once.
        rate = weights.get(tword, 1)
        for qword in qtoks:
            if tword == qword:
                # exact match
                score += rate * 2.5
                continue
            similarity = sims(tword, qword)
            if similarity > 0.4:
                # similar
                score += similarity * rate

    # remove advantage of length
    return score / len(ttoks) / len(qtoks) * 100