Below are 31 code examples, extracted from open-source Python projects, that illustrate how to use gensim.models.TfidfModel().
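Before the project snippets, here is a minimal, self-contained sketch of the typical TfidfModel workflow; the toy documents and variable names are illustrative only and are not taken from any of the projects below:

from gensim import corpora, models

# Toy corpus: each document is a list of tokens (illustrative data).
documents = [["human", "machine", "interface"],
             ["graph", "minors", "survey"],
             ["human", "graph", "interface", "survey"]]

dictionary = corpora.Dictionary(documents)                  # token -> id mapping
corpus = [dictionary.doc2bow(text) for text in documents]   # bag-of-words vectors

tfidf = models.TfidfModel(corpus)   # collect document-frequency statistics from the corpus
corpus_tfidf = tfidf[corpus]        # lazily transform the whole corpus to tf-idf

# Transform a single new document the same way.
query_bow = dictionary.doc2bow(["graph", "survey"])
print(tfidf[query_bow])             # list of (token_id, tfidf_weight) pairs

The same pattern (Dictionary -> doc2bow -> TfidfModel -> transformed corpus) recurs in most of the examples that follow, often as a preprocessing step before LSI or LDA.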
def get_tfidf(documents):  # compute tf-idf vectors with gensim
    documents = [[word for word in document.text.split()] for document in documents]
    dictionary = corpora.Dictionary(documents)
    n_items = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    ds = []
    for doc in corpus_tfidf:
        d = [0] * n_items
        for index, value in doc:
            d[index] = value
        ds.append(d)
    return ds
def fitAndPredict(self):
    corpus = self.trainingSet + self.testSet
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    model = models.TfidfModel(corpus)
    corpus = [text for text in model[corpus]]
    text_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary.token2id)).T
    if PCA_Applied:
        pca = PCA(n_components=PCA_nComponents)
        text_matrix = pca.fit_transform(text_matrix)
    classifier = LogisticRegression()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'Logistic:'
    print classification_report(self.testLabel, pred_labels)
    classifier = SVC()
    classifier.fit(text_matrix[0:len(self.trainingSet)], self.trainingLabel)
    pred_labels = classifier.predict(text_matrix[len(self.trainingSet):])
    print 'SVM:'
    print classification_report(self.testLabel, pred_labels)
def train_lda_model_gensim(corpus, total_topics=2):
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lda = models.LdaModel(corpus_tfidf, id2word=dictionary, iterations=1000, num_topics=total_topics)
    return lda
def get_similarity(query, ans_list):
    s_lenth = len(ans_list)
    Corp = ans_list  # the candidate answers, already tokenized
    # build the dictionary from the candidate answers
    dictionary = corpora.Dictionary(Corp)
    # convert each answer into a bag-of-words vector
    corpus = [dictionary.doc2bow(text) for text in Corp]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    vec_bow = dictionary.doc2bow(query)
    vec_tfidf = tfidf[vec_bow]
    index = similarities.MatrixSimilarity(corpus_tfidf)
    sims = index[vec_tfidf]
    similarity = list(sims)
    # print(similarity)
    end_lenth = len(similarity)
    if s_lenth != end_lenth:
        print('bug')
    return similarity
def test_miislita_high_level(self):
    # construct corpus from file
    miislita = CorpusMiislita(datapath('miIslita.cor'))

    # initialize tfidf transformation and similarity index
    tfidf = models.TfidfModel(miislita, miislita.dictionary, normalize=False)
    index = similarities.SparseMatrixSimilarity(tfidf[miislita], num_features=len(miislita.dictionary))

    # compare to query
    query = 'latent semantic indexing'
    vec_bow = miislita.dictionary.doc2bow(query.lower().split())
    vec_tfidf = tfidf[vec_bow]

    # perform a similarity query against the corpus
    sims_tfidf = index[vec_tfidf]

    # for the expected results see the article
    expected = [0.0, 0.2560, 0.7022, 0.1524, 0.3334]
    for i, value in enumerate(expected):
        self.assertAlmostEqual(sims_tfidf[i], value, 2)
def train_by_lsi(lib_texts):
    """
    Train an LSI model on the given texts.
    """
    from gensim import corpora, models, similarities

    # uncomment to see the training progress in the log
    # import logging
    # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    dictionary = corpora.Dictionary(lib_texts)
    # doc2bow(): converts a collection of words into a sparse list of (word_id, word_frequency) tuples
    corpus = [dictionary.doc2bow(text) for text in lib_texts]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # train an LSI model with 10 topics
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)

    # index is a gensim.similarities.docsim.MatrixSimilarity instance
    index = similarities.MatrixSimilarity(lsi[corpus])

    return (index, dictionary, lsi)
def load_model_and_dictionary(self):
    self.tfidf_model = models.TfidfModel.load('../../temp_results/tfidf_model')
    self.dictionary = corpora.Dictionary.load('../../temp_results/tfidf_dictionary')
    print ("Dictionary & Model Loaded Successfully")
def get_tfidf(documents):  # compute tf-idf vectors with gensim
    documents = [[word for word in document.split()] for document in documents]
    dictionary = corpora.Dictionary(documents)
    n_items = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    return corpus_tfidf
def load_tfidf(corpus, dictionary):
    if not os.path.isfile(TFIDF_MODEL_PATH):
        print('Creating TF-IDF')
        tfidf = models.TfidfModel(corpus)
        print('TF-IDF created')
        tfidf.save(TFIDF_MODEL_PATH)
    print('Loading TF-IDF model')
    tfidf = models.TfidfModel.load(TFIDF_MODEL_PATH)
    return tfidf

# doc_list = get_data()
# print(len(doc_list))
def do_calc_svd(self):
    print("dictionary size: %d" % (nlp_master.get_dict_len()))
    self.k_value = int(0.1 * (nlp_master.get_dict_len()))
    if self.k_value < 300:
        self.k_value = 300
    if self.k_value > 1000:
        self.k_value = 1000
    print("k value: %d" % (self.k_value))

    tfidf = models.TfidfModel(list(nlp_master._id_docs.values()))
    tfidf_corpus = tfidf[list(nlp_master._id_docs.values())]

    # the recommended num_topics is usually in the 200-500 range
    # train the LSI model
    self.lsi = models.LsiModel(tfidf_corpus, id2word=nlp_master.dictionary,
                               num_topics=self.k_value, chunksize=2000)

    # persist the results to disk
    today = datetime.date.today()
    self.dumpfile = "dumpdir/recsvd_dump.%d_%d" % (today.month, today.day)
    with open(self.dumpfile, 'wb', -1) as fp:
        dump_data = []
        dump_data.append(self._user_classifier)
        dump_data.append(self.k_value)
        dump_data.append(self.lsi)
        pickle.dump(dump_data, fp, -1)

    return
def build_tfidf_base(self, corpus, bow_matrix):
    ## Description: Build and save objects common to TFIDF and LSA
    ## Params: Corpus, BOW matrix
    ## Returns: TF-IDF corpus and matrix
    tfidf_model = models.TfidfModel(corpus)
    tfidf_corpus = tfidf_model[corpus]
    tfidf_matrix = bow_matrix.apply(lambda x: tfidf_model[x[0]], 1)
    return tfidf_corpus, tfidf_matrix

# MODEL OBJECTS
# A model object consists of gensim similarity index and matrix containing transformed data
def get_tfidf_weighted_keyphrases(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}', top_n=10):
    valid_chunks = get_chunks(sentences, grammar=grammar)
    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    weighted_phrases = {dictionary.get(id): round(value, 3)
                        for doc in corpus_tfidf
                        for id, value in doc}
    weighted_phrases = sorted(weighted_phrases.items(), key=itemgetter(1), reverse=True)
    return weighted_phrases[:top_n]
def train_lsi_model_gensim(corpus, total_topics=2):
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=total_topics)
    return lsi
def reduce_tfidf(dictionary, weibo_test):
    corpus_tfidf = None
    # # # # Stage 2: convert the documents into tf-idf vectors
    if not os.path.exists(path_tmp_tfidf):
        print('=== no tf-idf folder detected, generating tf-idf vectors ===')
        # if the dictionary stage was skipped, load the dictionary from disk
        if not dictionary:
            dictionary = corpora.Dictionary.load(path_dictionary)
        os.makedirs(path_tmp_tfidf)
        files = os_path.LoadFiles(path_doc_root)
        tfidf_model = models.TfidfModel(dictionary=dictionary)
        corpus_tfidf = {}
        for i, msg in enumerate(files):
            catg = msg[0]
            file = msg[1]
            word_list = convert_doc_to_wordlist(file, cut_all=False)
            file_bow = dictionary.doc2bow(word_list)
            file_tfidf = tfidf_model[file_bow]
            tmp = corpus_tfidf.get(catg, [])
            tmp.append(file_tfidf)
            if tmp.__len__() == 1:
                corpus_tfidf[catg] = tmp
        # persist the tf-idf vectors of each category to disk
        catgs = list(corpus_tfidf.keys())
        for catg in catgs:
            corpora.MmCorpus.serialize('{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg),
                                       corpus_tfidf.get(catg),
                                       id2word=dictionary)
            print('catg {c} has been transformed into tfidf vector'.format(c=catg))
        print('=== tf-idf vectors generated ===')
    else:
        print('=== tf-idf folder detected, skipping tf-idf generation ===')
    svm_lsi.reduce_lsi(dictionary, corpus_tfidf, weibo_test)
def reduce_result(dictionary, lsi_model, predictor, weibo_test):
    # # # # Final stage: classify the test text
    if not dictionary:
        dictionary = corpora.Dictionary.load(path_dictionary)
    if not lsi_model:
        lsi_file = open(path_tmp_lsimodel, 'rb')
        lsi_model = pkl.load(lsi_file)
        lsi_file.close()
    if not predictor:
        x = open(path_tmp_predictor, 'rb')
        predictor = pkl.load(x)
        x.close()
    files = os.listdir(path_tmp_lsi)
    catg_list = []
    for file in files:
        t = file.split('.')[0]
        if t not in catg_list:
            catg_list.append(t)

    demo_doc = weibo_test
    print(demo_doc)
    demo_doc = list(jieba.cut(demo_doc, cut_all=False))
    demo_bow = dictionary.doc2bow(demo_doc)
    tfidf_model = models.TfidfModel(dictionary=dictionary)
    demo_tfidf = tfidf_model[demo_bow]
    demo_lsi = lsi_model[demo_tfidf]
    data = []
    cols = []
    rows = []
    for item in demo_lsi:
        data.append(item[1])
        cols.append(item[0])
        rows.append(0)
    demo_matrix = csr_matrix((data, (rows, cols))).toarray()
    x = predictor.predict(demo_matrix)
    print('predicted category: {x}'.format(x=catg_list[x[0]]))
def save_tfidf():
    corpus_bow = corpora.MmCorpus(BowFile)
    tfidf_model = models.TfidfModel(corpus_bow)
    corpus_tfidf = tfidf_model[corpus_bow]
    corpora.MmCorpus.serialize(TfidfFile, corpus_tfidf)
    print "==================== TF-IDF data Generated and Saved ===================="
def tfidf():
    if not TFIDF:
        return
    doc1 = u'Andrew likes Diet Pepsi.'
    doc2 = u'Andrew knows the muffin man.'
    doc3 = u'Andrew lives near the muffin man on Shirley Lane.'
    corpus = map(sip.noun_phrases, [doc1, doc2, doc3])
    dictionary = corpora.Dictionary(corpus)
    bows = [dictionary.doc2bow(tokens) for tokens in corpus]
    return models.TfidfModel(bows, id2word=dictionary)
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
def buildCorpus(self):
    """
    Build the corpus from the documents:
      1. Remove words that only appeared once.
      2. Create the Dictionary object.
      3. Convert the documents to simple bag-of-words representation.
      4. Convert the bag-of-words vectors to tf-idf.
    """
    # Remove words that only appear once.
    self.documents = [[token for token in doc if self.frequency[token] > 1]
                      for doc in self.documents]

    # Build a dictionary from the text.
    self.dictionary = corpora.Dictionary(self.documents)

    # Map the documents to vectors.
    corpus = [self.dictionary.doc2bow(text) for text in self.documents]

    # Delete the tokenized representation of the documents--no need to
    # carry this around!
    del self.documents[:]

    # Convert the simple bag-of-words vectors to a tf-idf representation.
    self.tfidf_model = TfidfModel(corpus)
    self.corpus_tfidf = self.tfidf_model[corpus]
def load_tfidf_model(self):
    print "loading tfidf from", self.tfidf_filepath
    self.tfidf_model = models.TfidfModel.load(self.tfidf_filepath, mmap='r')
def make_tfidf_model(self, seqs):
    self.tfidf_model = models.TfidfModel((self.lexicon.doc2bow(tokenize(seq)) for seq in seqs))
    self.tfidf_model.save(self.tfidf_filepath)
    print "saved tfidf to", self.tfidf_filepath
def getLsiModel(lsipath='./lsi/', num_topics=300):
    # load the dictionary
    dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
    print 'dictionary loaded'

    # load the corpus
    corpus = corpora.MmCorpus(lsipath + 'viva.mm')
    print ('mm load')
    t31 = time.time()

    # tfidf
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    t32 = time.time()
    print "tfidf_corpus time = ", t32 - t31

    # baobao change 3 lines
    # corpus = MyCorpus()
    # lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=NUM_TOPIC,power_iters=2,chunksize=50000,onepass=True,distributed=False)
    # lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics,chunksize=20000)
    lsi = None
    try:
        lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics,
                                chunksize=60000, power_iters=2, onepass=True)
        # save the trained LSI model
        lsi.save(lsipath + 'viva.lsi')
        print('lsi model saved')
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception, e:
        logging.error('Failed to lsi train', exc_info=True)
    return lsi
def tfidf_model(self):
    if self._tfidf_model is None:
        doc_count = self.post_ids_query.count()
        if doc_count < 10:
            return None
        dictionary = self.dictionary
        tfidf_model = gmodels.TfidfModel(id2word=dictionary)
        tfidf_fname = join(self.dirname, "tfidf_%d.model" % (self.discussion.id,))
        subcorpus = self.subcorpus
        if exists(tfidf_fname):
            tfidf_model = tfidf_model.load(tfidf_fname)
            # assumption: count implies identity.
            # Wrong in corner cases: hidden, etc.
            if tfidf_model.num_docs != doc_count:
                unlink(tfidf_fname)
                tfidf_model = gmodels.TfidfModel(id2word=dictionary)
        if tfidf_model.num_docs != doc_count:
            tfidf_model.initialize(subcorpus)
            tfidf_model.save(tfidf_fname)
        self._tfidf_model = tfidf_model
    return self._tfidf_model
def tfidf_weight(self):
    self.corpus = models.TfidfModel(self.corpus, normalize=True)
def trainModel(self):
    '''
    Train a LDA model, inclusive of 4 steps:
    1. Parse the whole corpora into unigram token collections and document mapping (for later use)
    2. Filter tokens which are not common (no_below_this_number), and too common (no_above_fraction_of_doc)
    3. Indexing the token collections and do TF-IDF transformation
    4. Call gensim.models.LdaModel and generate topic distributions of the corpora
    '''
    print 'Start preparing unigram tokens....'
    ## Start of preparing list of documents and tokens [[words_in_1st_doc],[words_in_2nd_doc]....], which comprise Bag-Of-Words (BOW)

    # Get document_count, tokens, and document-index mapping from the corpora
    doc_count, train_set, doc_mapping, link_mapping = self.__tokenizeWholeCorpora(path_corpora)
    # Put the training data into gensim.corpora for later use
    dic = corpora.Dictionary(train_set)
    denominator = len(dic)
    # Filtering infrequent words & common stopwords, thus reducing the dimension of terms (which prevents curse of dimensionality)
    dic.filter_extremes(no_below=self.no_below_this_number, no_above=self.no_above_fraction_of_doc)
    nominator = len(dic)
    corpus = [dic.doc2bow(text) for text in train_set]  # transform every token into BOW
    print 'There are %i documents in the pool' % (doc_count)
    print "In the corpus there are ", denominator, " raw tokens"
    print "After filtering, in the corpus there are", nominator, "unique tokens, reduced ", (1-(nominator/denominator)), "%"
    print 'Finished preparing unigram tokens....'
    ## END

    print 'Start training LDA model....'
    ## Implementing TF-IDF as a vector for each document, and train LDA model on top of that
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=self.num_topics,
                          iterations=self.num_of_iterations, passes=self.passes)
    corpus_lda = lda[corpus_tfidf]

    # Once done training, print all the topics and related words
    print 'Finished training LDA model.......Here is the list of all topics & their most frequent words'
    for i in range(self.num_topics):
        print 'Topic %s : ' % (str(i)) + lda.print_topic(i)

    # Exhibit perplexity of current model under specific topic hyperparameter : k. The lower the better
    print '==============================='
    print 'Model perplexity : ', lda.bound(corpus_lda), ' when topic k =', str(self.num_topics)
    print '==============================='

    return lda, doc_mapping, link_mapping, corpus
def get_score_for_question(question_answer_word_dir, question_num, question_answer_score_label_file_dir):
    DCG_score_list = []
    for question_index in range(int(question_num)):
        if (question_index+1) % 1000 == 1:
            print 'Now for line : ' + str(question_index+1) + '\n'
        index = question_index + 1
        file_read_name = os.path.join(question_answer_word_dir, str(index))
        file_write_name = os.path.join(question_answer_score_label_file_dir, str(index))
        file_read = open(file_read_name, 'rb+')
        question_line = file_read.readline()
        question_line_list = question_line.strip().split('\t')
        question_line_list.remove('question')
        answer_index = 0
        answer_index_line_label_dict = {}
        answer_sentences_word_list = []
        for line in file_read.readlines():
            answer_temp_line_list = line.strip().split('\t')
            answer_label = answer_temp_line_list[1]
            answer_temp_line_list.remove('answer')
            answer_temp_line_list.remove(answer_label)
            answer_sentences_word_list.append(answer_temp_line_list)
            answer_list_temp = []
            answer_list_temp.append(answer_label)
            answer_index_line_label_dict[answer_index] = answer_list_temp
            answer_index += 1
        dic = corpora.Dictionary(answer_sentences_word_list)
        corpus = [dic.doc2bow(text) for text in answer_sentences_word_list]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=2)
        index = similarities.MatrixSimilarity(lda[corpus_tfidf])
        query_bow = dic.doc2bow(question_line_list)
        query_lda = lda[query_bow]
        sims = index[query_lda]
        list_simes = list(enumerate(sims))
        sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
        #answer_label_list = []
        for item in list_simes:
            answer_index_temp = item[0]
            answer_label = int(answer_index_line_label_dict[int(answer_index_temp)][0])
            answer_score = str(item[1])
            file_write = open(file_write_name, 'ab+')
            file_write.write(str(answer_label) + '\t' + str(answer_score) + '\n')
            file_write.close()
            #answer_label_list.append(answer_label)
        #DCG_score = calu_DCG(answer_label_list,k)
        #DCG_score_list.append(DCG_score)
    #DCG_avg = calu_avg_answer_length(DCG_score_list)
    #print 'DCG_avg : \t' + str(DCG_avg)
def get_score_for_question(question_answer_word_dir, question_num, k):
    DCG_score_list = []
    for question_index in range(int(question_num)):
        if (question_index+1) % 1000 == 1:
            print 'Now for line : ' + str(question_index+1) + '\n'
        index = question_index + 1
        file_read_name = os.path.join(question_answer_word_dir, str(index))
        file_read = open(file_read_name, 'rb+')
        question_line = file_read.readline()
        question_line_list = question_line.strip().split('\t')
        question_line_list.remove('question')
        answer_index = 0
        answer_index_line_label_dict = {}
        answer_sentences_word_list = []
        for line in file_read.readlines():
            answer_temp_line_list = line.strip().split('\t')
            answer_label = answer_temp_line_list[1]
            answer_temp_line_list.remove('answer')
            answer_temp_line_list.remove(answer_label)
            answer_sentences_word_list.append(answer_temp_line_list)
            answer_list_temp = []
            answer_list_temp.append(answer_label)
            answer_index_line_label_dict[answer_index] = answer_list_temp
            answer_index += 1
        dic = corpora.Dictionary(answer_sentences_word_list)
        corpus = [dic.doc2bow(text) for text in answer_sentences_word_list]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=2)
        index = similarities.MatrixSimilarity(lda[corpus_tfidf])
        query_bow = dic.doc2bow(question_line_list)
        query_lda = lda[query_bow]
        sims = index[query_lda]
        sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
        answer_label_list = []
        for item in sort_sims:
            answer_index_temp = item[0]
            answer_label = int(answer_index_line_label_dict[int(answer_index_temp)][0])
            answer_label_list.append(answer_label)
        DCG_score = calu_DCG(answer_label_list, k)
        DCG_score_list.append(DCG_score)
    DCG_avg = calu_avg_answer_length(DCG_score_list)
    print 'DCG_avg : \t' + str(DCG_avg)
def train_tfidf_model(self, file_path='../../temp_results/corpus.txt'):
    textfile = codecs.open(file_path, "r", "utf-8")
    print("Reading and Processing Text File")
    first_lines = []
    for line in textfile:
        first_lines.append(line.strip())

    print ("--------Building Corpora Dictionary---------------")
    dictionary = corpora.Dictionary(line.split('#|#')[1].split() for line in first_lines)
    # remove words that appear less than 2 times
    #twoids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq < 2]
    #dictionary.filter_tokens(fiveids)
    # Remove Gaps
    dictionary.compactify()
    dictionary.save_as_text('../../temp_results/tfidf_dictionary.txt', sort_by_word=False)
    dictionary.save('../../temp_results/tfidf_dictionary')
    print("Dictionary Saved")

    print ("--Now Transforming to Bag of Words Vectors on the Fly--")

    class MyCorpus(object):
        def __iter__(self):
            for line in first_lines:
                yield dictionary.doc2bow(line.split())

    news_corpus = MyCorpus()
    print("Corpus Built...Now Starting Model Training")
    tfidf_model = models.TfidfModel(news_corpus)
    tfidf_model.save('../../temp_results/tfidf_model')
    print("Model Trained & Saved")
def __init__(self, itemInfos):
    lastTime = time.time()
    # itemInfos : dict[(pid, description)]
    # train model
    jieba.load_userdict('./dict.txt.big.txt')
    stopWords = set([line.strip().decode("gbk").lower() for line in open("./stopWords.txt")])
    stopWords.add('\n')
    stopWords.add(' ')
    stopWords.add(u'\u2022')
    stopWords.add(u'\xa9')
    texts = []
    self.name2id = {}
    self.id2name = []
    for k, v in itemInfos.iteritems():
        seg_list = [w.lower() for w in jieba.cut(v, cut_all=False) if w.lower() not in stopWords]
        texts.append(list(seg_list))
        self.name2id[k] = len(self.id2name)
        self.id2name.append(k)
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    print "start cast :", (time.time() - lastTime)
    lastTime = time.time()
    dictionary = corpora.Dictionary(texts)
    print "dictionary cast :", (time.time() - lastTime)
    lastTime = time.time()
    corpus = [dictionary.doc2bow(text) for text in texts]
    print "doc2bow cast :", (time.time() - lastTime)
    lastTime = time.time()
    tfidf = models.TfidfModel(corpus)
    print "tfid model cast :", (time.time() - lastTime)
    lastTime = time.time()
    corpus_tfidf = tfidf[corpus]
    print "tfidf corpus cast :", (time.time() - lastTime)
    lastTime = time.time()
    self.lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
    print "lsi model cast :", (time.time() - lastTime)
    lastTime = time.time()
    #corpus_lsi = lsi[corpus_tfidf]
    self.index = similarities.MatrixSimilarity(self.lsi[corpus])
    self.corpus = corpus
    self.pidName = getPidName()
    print "init finish"