The following 22 code examples, extracted from open-source Python projects, illustrate how to use gensim.models.LsiModel().
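Before the extracted examples, here is a minimal sketch of the typical LsiModel workflow (dictionary → bag-of-words corpus → optional TF-IDF weighting → LSI → similarity query). The toy documents and variable names are illustrative only and do not come from any of the projects below.

from gensim import corpora, models, similarities

# toy corpus: each document is a list of tokens (illustrative only)
documents = [["human", "computer", "interaction"],
             ["graph", "minors", "survey"],
             ["graph", "trees", "computer"]]

dictionary = corpora.Dictionary(documents)                # token -> id mapping
corpus = [dictionary.doc2bow(doc) for doc in documents]   # bag-of-words vectors
tfidf = models.TfidfModel(corpus)                         # optional TF-IDF weighting
lsi = models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=2)

# fold a new query into LSI space and rank documents by cosine similarity
index = similarities.MatrixSimilarity(lsi[tfidf[corpus]])
query_bow = dictionary.doc2bow(["graph", "computer"])
sims = index[lsi[tfidf[query_bow]]]
print(sorted(enumerate(sims), key=lambda item: -item[1]))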
def getTextConfidence(self, text):
    if self.typeOfSim == 'jaccard':
        intend_confidenceList = []
        for i in self.know_words:
            intend_confidenceList.append(jaccard_compare(text, i))
        if len(self.know_words) > 0:
            return max(intend_confidenceList)
        else:
            return 0
    elif self.typeOfSim == 'gensim':
        try:
            from gensim import corpora, models, similarities
        except Exception as e:
            print(e)
        dictionary = corpora.Dictionary(self.know_words_remove_stopwords)
        corpus = [dictionary.doc2bow(text) for text in self.know_words_remove_stopwords]
        lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
        new_doc = text
        vec_bow = dictionary.doc2bow(new_doc.lower().split())
        vec_lsi = lsi[vec_bow]
        index = similarities.MatrixSimilarity(lsi[corpus])
        sims = index[vec_lsi]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        most_sim = sims[0]
        return most_sim[1]
def get_similarity(query, ans_list):
    s_lenth = len(ans_list)
    Corp = ans_list
    # build the dictionary from the answer corpus
    dictionary = corpora.Dictionary(Corp)
    # convert each answer into a bag-of-words vector
    corpus = [dictionary.doc2bow(text) for text in Corp]
    lsi = models.LsiModel(corpus)
    corpus_lsi = lsi[corpus]
    vec_bow = dictionary.doc2bow(query)
    vec_lsi = lsi[vec_bow]
    index = similarities.MatrixSimilarity(corpus_lsi)
    sims = index[vec_lsi]
    similarity = list(sims)
    # print(similarity)
    end_lenth = len(similarity)
    if s_lenth != end_lenth:
        print('bug')
    return similarity
def run_model(name):
    if name == 'lsi':
        lsi = models.LsiModel(corpus_gensim, id2word=vocab_gensim, num_topics=num_topics)
        print('Saving lsi_model...')
        lsi.save('exports/lsi.model')
        print('lsi_model saved!')
        # lsi_matrix = gensim.matutils.corpus2dense(lsi[corpus_gensim], len(lsi.projection.s)).T / lsi.projection.s
        # print('Saving lsi_matrix...')
        # pickle.dump(lsi_matrix, open('exports/lsi_matrix.p', 'wb'))
        # print('lsi_matrix saved!')
    elif name == 'lda':
        # lda = models.LdaModel(corpus_gensim, id2word=vocab_gensim, num_topics=num_topics, passes=5)
        lda = models.ldamulticore.LdaMulticore(corpus_gensim, id2word=vocab_gensim,
                                               num_topics=num_topics, passes=5)  # alpha='auto' needs the non-multicore LDA
        print('Saving lda_model...')
        lda.save('exports/lda.model')
        print('lda_model saved!')
        # lda_matrix = gensim.matutils.corpus2dense(lda[corpus_gensim], lda.num_topics)
        # print('Saving lda_matrix...')
        # pickle.dump(lda_matrix, open('exports/lda_matrix.p', 'wb'))
        # print('lda_matrix saved!')
    gc.collect()
def train_model(model_name, corpus, id2word, num_topics):
    """
    Train specified model
    """
    # LDA
    if model_name == 'lda':
        model = models.LdaModel(
            corpus,
            id2word=id2word,
            num_topics=num_topics,
            alpha='auto',
            eval_every=5,
        )
        return model
    # LSI
    elif model_name == 'lsi':
        model = models.LsiModel(
            corpus,
            id2word=id2word,
            num_topics=num_topics,
        )
        return model
    else:
        print('Invalid model name')
        return None
def gensim(self):
    # https://radimrehurek.com/gensim/dist_lsi.html
    # https://radimrehurek.com/gensim/models/lsimodel.html
    corpus = corpora.MmCorpus('../lda/lda_sources/documents_corpus.mm')
    id2word = corpora.Dictionary.load('../lda/lda_sources/documents_dictionary.dict')
    lsi = models.LsiModel(corpus, id2word=id2word, num_topics=self.dimensions)
    return lsi
def train_by_lsi(lib_texts):
    """
    Train an LSI model on the given texts.
    """
    from gensim import corpora, models, similarities

    # enable logging if needed:
    # import logging
    # logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    dictionary = corpora.Dictionary(lib_texts)
    # doc2bow(): converts a collection of words into a list of (word_id, word_frequency) tuples
    corpus = [dictionary.doc2bow(text) for text in lib_texts]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # train an LSI model with 10 topics
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
    # index is a gensim.similarities.docsim.MatrixSimilarity instance
    index = similarities.MatrixSimilarity(lsi[corpus])

    return (index, dictionary, lsi)
def generateTopic(self, method=TopicMethod.LDA, numTopics=10):
    corpus = [self.dictionary.doc2bow(article.wordList)
              for article in Article.objects.only("wordList")]
    if method == TopicMethod.LDA:
        instance = ldamodel.LdaModel(corpus, id2word=self.dictionary, num_topics=numTopics)
    elif method == TopicMethod.LSI:
        instance = models.LsiModel(corpus, id2word=self.dictionary, num_topics=numTopics)
    dstCorpus = instance[corpus]
    return dstCorpus
def do_calc_svd(self):
    print("dictionary size: %d" % (nlp_master.get_dict_len()))
    self.k_value = int(0.1 * (nlp_master.get_dict_len()))
    if self.k_value < 300:
        self.k_value = 300
    if self.k_value > 1000:
        self.k_value = 1000
    print("k value: %d" % (self.k_value))

    tfidf = models.TfidfModel(list(nlp_master._id_docs.values()))
    tfidf_corpus = tfidf[list(nlp_master._id_docs.values())]

    # num_topics is usually chosen in the 200-500 range
    # train the LSI model
    self.lsi = models.LsiModel(tfidf_corpus, id2word=nlp_master.dictionary,
                               num_topics=self.k_value, chunksize=2000)

    # dump the results to disk
    today = datetime.date.today()
    self.dumpfile = "dumpdir/recsvd_dump.%d_%d" % (today.month, today.day)
    with open(self.dumpfile, 'wb', -1) as fp:
        dump_data = []
        dump_data.append(self._user_classifier)
        dump_data.append(self.k_value)
        dump_data.append(self.lsi)
        pickle.dump(dump_data, fp, -1)
    return
def build_lsa(self, nt, dictionary, tfidf_corpus, tfidf_matrix):
    ## Description: Builds LSA model and performs document similarity
    ## Params: Number of topics, dict, TFIDF corpus, TFIDF matrix
    ## Returns: Similarity index and matrix
    lsa_model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=nt)
    index = similarities.MatrixSimilarity(lsa_model[tfidf_corpus])
    matrix = tfidf_matrix.apply(lambda x: lsa_model[x], 1)
    return (index, matrix)
def train_lsi_model_gensim(corpus, total_topics=2):
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=total_topics)
    return lsi
def lsi_model_topics():
    dictionary = corpora.Dictionary.load(DictionaryFile)
    corpus_tfidf = corpora.MmCorpus(TfidfFile)
    N_TOPICS = 300
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=N_TOPICS)
    print "================= LSI MODEL IS BUILT ================="
    lsi_model.save(LsiModelFile)
    save_topics(lsi_model, LsiTopicsFile)
def LSI_fit(self, data):
    '''
    Fits an LSI model and returns it with associated dictionary
    '''
    texts = [[tag for tag in sent] for sent in self.get_pos(data)]
    dictionary = corpora.Dictionary(texts)
    texts = map(dictionary.doc2bow, texts)
    lsi = models.LsiModel(texts, id2word=dictionary, num_topics=self.num_topics)
    return dictionary, lsi
def load_lsi_model(self):
    print "loading lsi model from", self.lsi_filepath
    self.lsi_model = models.LsiModel.load(self.lsi_filepath, mmap='r')
def make_lsi_model(self, seqs):
    if self.use_tfidf:
        seqs = (self.tfidf_model[self.lexicon.doc2bow(tokenize(seq))] for seq in seqs)
    else:
        seqs = (self.lexicon.doc2bow(tokenize(seq)) for seq in seqs)
    self.lsi_model = models.LsiModel(seqs, num_topics=self.n_lsi_dim, id2word=self.lexicon)
    self.lsi_model.save(self.lsi_filepath)
    print "saved lsi model to", self.lsi_filepath
def getLsiModel(lsipath='./lsi/', num_topics=300):
    # load the dictionary
    dictionary = corpora.Dictionary.load(lsipath + 'viva.dict')
    print 'dictionary loaded'
    # load the bag-of-words corpus
    corpus = corpora.MmCorpus(lsipath + 'viva.mm')
    print ('mm load')
    t31 = time.time()
    # tfidf
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    t32 = time.time()
    print "tfidf_corpus time = ", t32 - t31
    # baobao change 3 lines
    # corpus = MyCorpus()
    # lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=NUM_TOPIC, power_iters=2, chunksize=50000, onepass=True, distributed=False)
    # lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics, chunksize=20000)
    lsi = None
    try:
        lsi = lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics,
                                chunksize=60000, power_iters=2, onepass=True)
        # save the trained LSI model
        lsi.save(lsipath + 'viva.lsi')
        print('lsi model saved')
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception, e:
        logging.error('Failed to lsi train', exc_info=True)
    return lsi
def process(self, unused, site, config):
    try:
        num_related_posts = config.num_related_posts

        # Tokenize
        docs = []
        valid_posts = []  # exclude pages that are not posts
        for post in site.posts:
            if post.meta.microdata_type not in RelatedPosts.VALID_FORMAT:
                continue
            txt = post.md
            docs.append(gensim.utils.simple_preprocess(txt, deacc=True, min_len=3, max_len=15))
            valid_posts.append(post)
        # Fixme stemming

        # build model
        dictionary = corpora.Dictionary(docs)
        corpus = [dictionary.doc2bow(doc) for doc in docs]
        tfidf = models.tfidfmodel.TfidfModel(corpus=corpus)
        # Fixme: get correct number of topics
        num_topics = site.posts_by_tag.get_num_collections() * 2  # use collections as a proxy for the number of topics
        topic_model = models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=num_topics)
        index = similarities.MatrixSimilarity(topic_model[tfidf[corpus]],
                                              num_best=num_related_posts + 1)  # +1 because the best one is itself

        # find similar posts and store them
        log_details = ""
        for post, sims in zip(valid_posts, index):
            if post.meta.microdata_type not in RelatedPosts.VALID_FORMAT:
                continue
            post.meta.related_posts = []
            log_details += '<div class="subsection"><h3>%s</h3>Related posts:<ol>' % (post.meta.title)
            for idx, score in sims[1:]:  # 1: -> the first hit is the article itself
                p = valid_posts[idx]
                o = utils.create_objdict()
                o.meta = p.meta
                o.score = score
                o.html = p.score
                post.meta.related_posts.append(o)
                log_details += '<li>%s (%s)</li>' % (o.meta.title, round(score, 2))
            log_details += '<ol></div>'
        return (SiteFab.OK, "Related posts via LSI", log_details)
    except Exception as e:
        return (SiteFab.ERROR, "Related posts via LSI", e)
def test_lee(self):
    """correlation with human data > 0.6
    (this is the value which was achieved in the original paper)
    """
    global bg_corpus, corpus

    # create a dictionary and corpus (bag of words)
    dictionary = corpora.Dictionary(bg_corpus)
    bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus]
    corpus = [dictionary.doc2bow(text) for text in corpus]

    # transform the bag of words with log_entropy normalization
    log_ent = models.LogEntropyModel(bg_corpus)
    bg_corpus_ent = log_ent[bg_corpus]

    # initialize an LSI transformation from background corpus
    lsi = models.LsiModel(bg_corpus_ent, id2word=dictionary, num_topics=200)
    # transform small corpus to lsi bow->log_ent->fold-in-lsi
    corpus_lsi = lsi[log_ent[corpus]]

    # compute pairwise similarity matrix and extract upper triangular
    res = np.zeros((len(corpus), len(corpus)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            res[i, j] = matutils.cossim(par1, par2)
    flat = res[matutils.triu_indices(len(corpus), 1)]

    cor = np.corrcoef(flat, human_sim_vector)[0, 1]
    logging.info("LSI correlation coefficient is %s" % cor)
    self.assertTrue(cor > 0.6)

# def test_lee_mallet(self):
#     global bg_corpus, corpus, bg_corpus2, corpus2
#     # create a dictionary and corpus (bag of words)
#     dictionary = corpora.Dictionary(bg_corpus2)
#     bg_corpus = [dictionary.doc2bow(text) for text in bg_corpus2]
#     corpus = [dictionary.doc2bow(text) for text in corpus2]
#     # initialize an LDA transformation from background corpus
#     lda = models.LdaMallet('/Users/kofola/Downloads/mallet-2.0.7/bin/mallet',
#                            corpus=bg_corpus, id2word=dictionary, num_topics=200, optimize_interval=10)
#     corpus_lda = lda[corpus]
#     # compute pairwise similarity matrix and extract upper triangular
#     res = np.zeros((len(corpus), len(corpus)))
#     for i, par1 in enumerate(corpus_lda):
#         for j, par2 in enumerate(corpus_lda):
#             res[i, j] = matutils.cossim(par1, par2)
#     flat = res[matutils.triu_indices(len(corpus), 1)]
#     cor = np.corrcoef(flat, human_sim_vector)[0, 1]
#     logging.info("LDA correlation coefficient is %s" % cor)
#     self.assertTrue(cor > 0.35)
def __init__(self, itemInfos):
    lastTime = time.time()
    # itemInfos : dict[(pid, description)]
    # train model
    jieba.load_userdict('./dict.txt.big.txt')
    stopWords = set([line.strip().decode("gbk").lower() for line in open("./stopWords.txt")])
    stopWords.add('\n')
    stopWords.add(' ')
    stopWords.add(u'\u2022')
    stopWords.add(u'\xa9')
    texts = []
    self.name2id = {}
    self.id2name = []
    for k, v in itemInfos.iteritems():
        seg_list = [w.lower() for w in jieba.cut(v, cut_all=False) if w.lower() not in stopWords]
        texts.append(list(seg_list))
        self.name2id[k] = len(self.id2name)
        self.id2name.append(k)
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    print "start cast :", (time.time() - lastTime)
    lastTime = time.time()
    dictionary = corpora.Dictionary(texts)
    print "dictionary cast :", (time.time() - lastTime)
    lastTime = time.time()
    corpus = [dictionary.doc2bow(text) for text in texts]
    print "doc2bow cast :", (time.time() - lastTime)
    lastTime = time.time()
    tfidf = models.TfidfModel(corpus)
    print "tfid model cast :", (time.time() - lastTime)
    lastTime = time.time()
    corpus_tfidf = tfidf[corpus]
    print "tfidf corpus cast :", (time.time() - lastTime)
    lastTime = time.time()
    self.lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
    print "lsi model cast :", (time.time() - lastTime)
    lastTime = time.time()
    # corpus_lsi = lsi[corpus_tfidf]
    self.index = similarities.MatrixSimilarity(self.lsi[corpus])
    self.corpus = corpus
    self.pidName = getPidName()
    print "init finish"
def reduce_lsi(dictionary, corpus_tfidf, weibo_test):
    corpus_lsi = None
    lsi_model = None
    # if no LSI folder exists yet, generate the LSI corpus from the tfidf corpus
    if not os.path.exists(path_tmp_lsi):
        print('=== LSI folder not found, generating the LSI corpus ===')
        if not dictionary:
            dictionary = corpora.Dictionary.load(path_dictionary)
        if not corpus_tfidf:
            # the tfidf corpus is not in memory, reload it from disk
            print('--- tfidf corpus not in memory, loading it from disk ---')
            # collect the category names from the tfidf files
            files = os.listdir(path_tmp_tfidf)
            catg_list = []
            for file in files:
                t = file.split('.')[0]
                if t not in catg_list:
                    catg_list.append(t)
            # load the tfidf corpus for each category
            corpus_tfidf = {}
            for catg in catg_list:
                path = '{f}{s}{c}.mm'.format(f=path_tmp_tfidf, s=os.sep, c=catg)
                corpus = corpora.MmCorpus(path)
                corpus_tfidf[catg] = corpus
            print('--- tfidf corpus loaded, start generating the lsi corpus ---')

        # train the lsi model
        os.makedirs(path_tmp_lsi)
        corpus_tfidf_total = []
        catgs = list(corpus_tfidf.keys())
        for catg in catgs:
            tmp = corpus_tfidf.get(catg)
            corpus_tfidf_total += tmp
        lsi_model = models.LsiModel(corpus=corpus_tfidf_total, id2word=dictionary, num_topics=50)
        # persist the lsi model to disk
        lsi_file = open(path_tmp_lsimodel, 'wb')
        pkl.dump(lsi_model, lsi_file)
        lsi_file.close()
        del corpus_tfidf_total
        print('--- lsi model saved ---')

        # generate the corpus of lsi and release the corpus of tfidf
        corpus_lsi = {}
        for catg in catgs:
            corpu = [lsi_model[doc] for doc in corpus_tfidf.get(catg)]
            corpus_lsi[catg] = corpu
            corpus_tfidf.pop(catg)
            corpora.MmCorpus.serialize('{f}{s}{c}.mm'.format(f=path_tmp_lsi, s=os.sep, c=catg),
                                       corpu, id2word=dictionary)
        print('=== lsi corpus generated ===')
    else:
        print('=== LSI folder detected, skipping LSI corpus generation ===')
    svm_module.reduce_module(dictionary, corpus_lsi, lsi_model, weibo_test)
def __init__(self, filename):
    self.docs = loads(open(filename, "r").read())
    self.docmap = hoist_dict(self.docs, "id")

    if isfile("data.dict"):
        self.dictionary = Dictionary.load("data.dict")
    else:
        self.dictionary = Dictionary(iterate_summaries(self.docs))
        self.dictionary.save("data.dict")

    if isfile("data.mm"):
        self.corpus = MmCorpus("data.mm")
    else:
        corpus = (self.dictionary.doc2bow(text) for text in iterate_summaries(self.docs))
        MmCorpus.serialize("data.mm", corpus)
        self.corpus = MmCorpus("data.mm")

    self.lsi = LsiModel(self.corpus, id2word=self.dictionary, num_topics=3)

    if isfile("data.sim"):
        self.sim = MatrixSimilarity.load("data.sim")
    else:
        self.sim = MatrixSimilarity(self.lsi[self.corpus])
        self.sim.save("data.sim")

    # self.lda = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=100,
    #                     update_every=1, chunksize=10000, passes=1)

    self.sentiment_model = Doc2Vec.load("imdb.d2v")
    self.sentiment = LogisticRegression()
    self.sentiment.fit(
        [self.sentiment_model.docvecs["TEST_POS_" + str(i)] for i in range(12500)] +
        [self.sentiment_model.docvecs["TEST_NEG_" + str(i)] for i in range(12500)],
        asarray(list(chain(repeat(0, 12500), repeat(1, 12500)))))

    if isfile("arxiv.d2v"):
        self.doc_model = Doc2Vec.load("arxiv.d2v")
    else:
        tagged = [TaggedDocument(doc.get("summary").split(), [doc.get("id")]) for doc in self.docs]
        doc_model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
        doc_model.build_vocab(tagged)
        shuffle(tagged)  # Replace with functional stuff
        for epoch in range(10):
            doc_model.train(tagged, total_examples=doc_model.corpus_count, epochs=doc_model.iter)
        doc_model.save("arxiv.d2v")