The following 14 code examples, extracted from open-source Python projects, illustrate how to use gensim.models.LdaModel().
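For orientation, here is a minimal, self-contained sketch of the basic LdaModel workflow that the examples below build on. The toy documents and the choices num_topics=2 and passes=10 are illustrative assumptions, not values taken from any of the projects.

from gensim import corpora, models

# Toy corpus: each document is a list of tokens (illustrative only).
documents = [
    ["human", "machine", "interface", "computer"],
    ["graph", "trees", "minors", "survey"],
    ["user", "response", "time", "computer", "system"],
]

# Map tokens to integer ids and convert each document to bag-of-words.
dictionary = corpora.Dictionary(documents)
corpus_bow = [dictionary.doc2bow(doc) for doc in documents]

# Train a small LDA model; num_topics=2 is an arbitrary choice for the toy data.
lda = models.LdaModel(corpus_bow, id2word=dictionary, num_topics=2, passes=10)

# Inspect the learned topics and infer the topic mixture of the first document.
print(lda.print_topics(num_words=4))
print(lda[corpus_bow[0]])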
def run_model(name):
    if name == 'lsi':
        lsi = models.LsiModel(corpus_gensim, id2word=vocab_gensim, num_topics=num_topics)
        print('Saving lsi_model...')
        lsi.save('exports/lsi.model')
        print('lsi_model saved!')
        # lsi_matrix = gensim.matutils.corpus2dense(lsi[corpus_gensim], len(lsi.projection.s)).T / lsi.projection.s
        # print('Saving lsi_matrix...')
        # pickle.dump(lsi_matrix, open('exports/lsi_matrix.p', 'wb'))
        # print('lsi_matrix saved!')
    elif name == 'lda':
        # lda = models.LdaModel(corpus_gensim, id2word=vocab_gensim, num_topics=num_topics, passes=5)
        lda = models.ldamulticore.LdaMulticore(corpus_gensim, id2word=vocab_gensim,
                                               num_topics=num_topics, passes=5)  # alpha='auto' needs the non-multicore LdaModel
        print('Saving lda_model...')
        lda.save('exports/lda.model')
        print('lda_model saved!')
        # lda_matrix = gensim.matutils.corpus2dense(lda[corpus_gensim], lda.num_topics)
        # print('Saving lda_matrix...')
        # pickle.dump(lda_matrix, open('exports/lda_matrix.p', 'wb'))
        # print('lda_matrix saved!')
    gc.collect()
def train_lda_model_gensim(corpus, total_topics=2):
    # Normalize and tokenize the raw corpus.
    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    # Build the token dictionary and bag-of-words representation.
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
    # Apply TF-IDF weighting before topic modelling.
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    # Train the LDA model on the weighted corpus.
    lda = models.LdaModel(corpus_tfidf, id2word=dictionary,
                          iterations=1000, num_topics=total_topics)
    return lda
def train_model(model_name, corpus, id2word, num_topics):
    """ Train specified model """
    # LDA
    if model_name == 'lda':
        model = models.LdaModel(
            corpus,
            id2word=id2word,
            num_topics=num_topics,
            alpha='auto',
            eval_every=5,
        )
        return model
    # LSI
    elif model_name == 'lsi':
        model = models.LsiModel(
            corpus,
            id2word=id2word,
            num_topics=num_topics,
        )
        return model
    else:
        print('Invalid model name')
        return None
def run_lda(self):
    self.lda = models.LdaModel(corpus=self.corpus, id2word=self.dict,
                               num_topics=self.nt, iterations=500000)
def build_lda(self, nt, corpus, dictionary, bow_matrix):
    ## Description: Builds LDA and does document similarity
    ## Params: Number of topics, corpus, dict, BOW matrix
    ## Returns: Similarity index and matrix
    lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=nt)
    self.lda_model = lda_model
    index = similarities.MatrixSimilarity(lda_model[corpus])
    matrix = bow_matrix.apply(lambda x: [lda_model[x[0]]], 1)
    return (index, matrix)
def lda_model_topics():
    dictionary = corpora.Dictionary.load(DictionaryFile)
    corpus_bow = corpora.MmCorpus(BowFile)
    N_TOPICS = 100
    model = models.LdaModel(corpus_bow, id2word=dictionary, num_topics=N_TOPICS)
    print "================= LDA MODEL IS BUILT ================="
    model.save(LdaModelFile)
    save_topics(model, LdaTopicsFile)
def __init__(self, num_topics=NUM_TOPICS, dictionary_file=DICTIONARY_FILE, model_file=MODEL_FILE):
    """Initializes the ranker.

    Args:
        num_topics: int (default: NUM_TOPICS), number of LDA topics to use.
        dictionary_file: str, where to save / load the dictionary file
            (defaults to DICTIONARY_FILE).
        model_file: str, where to save / load the model
            (defaults to MODEL_FILE).
    """
    self.dictionary = None
    self.model = None
    self.num_topics = num_topics
    self.dictionary_file = dictionary_file
    self.model_file = model_file

    # Loads stopwords from the associated file.
    with open(STOPWORDS_FILE, 'r') as f:
        self.stoplist = set(f.read().strip().split())

    # Loads an existing dictionary file, if one exists.
    if os.path.exists(self.dictionary_file):
        with open(self.dictionary_file, 'rb') as f:
            self.dictionary = pickle.load(f)

    # Loads an existing model file, if one exists.
    if os.path.exists(self.model_file):
        self.model = models.LdaModel.load(self.model_file)
    else:
        logging.warn('No model found in "%s"', self.model_file)

    # Determines if the model needs to be trained.
    self._trained = self.dictionary and self.model
def train(self, corpus, passes=1):
    """Updates dictionary and model given a corpus.

    Args:
        corpus: list of str, the documents to tokenize.
    """
    if self.dictionary is not None or self.model is not None:
        x = raw_input('You are about to overwrite an existing '
                      'model file (%s). Are you sure? [y/N] ' % self.model_file)
        if x[0] != 'y':
            raise RuntimeError('You chose not to overwrite the '
                               'existing model and dictionary.')

    # Tokenizes the corpus.
    documents = [self.tokenize(document) for document in corpus]

    # Builds a dictionary from the existing documents.
    self.dictionary = corpora.Dictionary(documents)

    # Dumps the dictionary to a pickled file to use later.
    pickle.dump(self.dictionary, open(self.dictionary_file, 'wb'))

    # Converts the corpus to tokens.
    corpus_bow = [self.dictionary.doc2bow(doc) for doc in documents]

    # Trains the LDA model.
    self.model = models.LdaModel(corpus_bow, passes=passes,
                                 id2word=self.dictionary,
                                 num_topics=self.num_topics)

    # Saves the model to use later.
    self.model.save(self.model_file)

    # Flag to remember that training has taken place.
    self._trained = True
def train(self, **kargs):
    self.config.update(kargs)
    self.model = _LDA(self.database.corpus,
                      id2word=self.database.dictionary,
                      **self.config)
    delattr(self, "database")
def score(self, entities: list, context: str) -> list:
    # Keep only the entities that come with a non-empty context string.
    queries = [
        (i, q['context'])
        for i, q in enumerate(entities)
        if q['context']
    ]

    # Build a dictionary from the query context and turn each entity context into bag-of-words.
    context = tokenize(context)
    dictionary = Dictionary([context])
    vectors = [
        dictionary.doc2bow(tokenize(q))
        for _, q in queries
    ]

    # Build an LDA model over the context dictionary; remaining parameters come from self.model_kwargs.
    model = LdaModel(id2word=dictionary, **self.model_kwargs)

    # Score each entity by the probability of the last topic in its vector's topic distribution.
    ents = (entities[i] for i, _ in queries)
    scores = (model[vec][-1][1] for vec in vectors if model[vec])
    results = zip(ents, scores)

    def sort_by_score(item):
        return item[1]

    # Return entities ordered by descending score.
    return sorted(results, key=sort_by_score, reverse=True)
def trainModel(self):
    '''
    Train a LDA model, inclusive of 4 steps:
    1. Parse the whole corpora into unigram token collections and document mapping (for later use)
    2. Filter tokens which are not common (no_below_this_number), and too common (no_above_fraction_of_doc)
    3. Indexing the token collections and do TF-IDF transformation
    4. Call gensim.models.LdaModel and generate topic distributions of the corpora
    '''
    print 'Start preparing unigram tokens....'
    ## Start of preparing list of documents and tokens [[words_in_1st_doc],[words_in_2nd_doc]....], which comprise Bag-Of-Words (BOW)
    # Get document_count, tokens, and document-index mapping from the corpora
    doc_count, train_set, doc_mapping, link_mapping = self.__tokenizeWholeCorpora(path_corpora)
    # Put the training data into gensim.corpora for later use
    dic = corpora.Dictionary(train_set)
    denominator = len(dic)
    # Filtering infrequent words & common stopwords, thus reducing the dimension of terms (which prevents curse of dimensionality)
    dic.filter_extremes(no_below=self.no_below_this_number, no_above=self.no_above_fraction_of_doc)
    nominator = len(dic)
    corpus = [dic.doc2bow(text) for text in train_set]  # transform every token into BOW
    print 'There are %i documents in the pool' % (doc_count)
    print "In the corpus there are ", denominator, " raw tokens"
    print "After filtering, in the corpus there are", nominator, "unique tokens, reduced ", (1 - (nominator / denominator)), "%"
    print 'Finished preparing unigram tokens....'
    ## END

    print 'Start training LDA model....'
    ## Implementing TF-IDF as a vector for each document, and train LDA model on top of that
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=self.num_topics,
                          iterations=self.num_of_iterations, passes=self.passes)
    corpus_lda = lda[corpus_tfidf]

    # Once done training, print all the topics and related words
    print 'Finished training LDA model.......Here is the list of all topics & their most frequent words'
    for i in range(self.num_topics):
        print 'Topic %s : ' % (str(i)) + lda.print_topic(i)

    # Exhibit perplexity of current model under specific topic hyperparameter : k. The lower the better
    print '==============================='
    print 'Model perplexity : ', lda.bound(corpus_lda), ' when topic k =', str(self.num_topics)
    print '==============================='

    return lda, doc_mapping, link_mapping, corpus
def get_score_for_question(question_answer_word_dir, question_num, question_answer_score_label_file_dir):
    DCG_score_list = []
    for question_index in range(int(question_num)):
        if (question_index + 1) % 1000 == 1:
            print 'Now for line : ' + str(question_index + 1) + '\n'
        index = question_index + 1
        file_read_name = os.path.join(question_answer_word_dir, str(index))
        file_write_name = os.path.join(question_answer_score_label_file_dir, str(index))
        file_read = open(file_read_name, 'rb+')
        question_line = file_read.readline()
        question_line_list = question_line.strip().split('\t')
        question_line_list.remove('question')
        answer_index = 0
        answer_index_line_label_dict = {}
        answer_sentences_word_list = []
        for line in file_read.readlines():
            answer_temp_line_list = line.strip().split('\t')
            answer_label = answer_temp_line_list[1]
            answer_temp_line_list.remove('answer')
            answer_temp_line_list.remove(answer_label)
            answer_sentences_word_list.append(answer_temp_line_list)
            answer_list_temp = []
            answer_list_temp.append(answer_label)
            answer_index_line_label_dict[answer_index] = answer_list_temp
            answer_index += 1
        dic = corpora.Dictionary(answer_sentences_word_list)
        corpus = [dic.doc2bow(text) for text in answer_sentences_word_list]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=2)
        index = similarities.MatrixSimilarity(lda[corpus_tfidf])
        query_bow = dic.doc2bow(question_line_list)
        query_lda = lda[query_bow]
        sims = index[query_lda]
        list_simes = list(enumerate(sims))
        sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
        #answer_label_list = []
        for item in list_simes:
            answer_index_temp = item[0]
            answer_label = int(answer_index_line_label_dict[int(answer_index_temp)][0])
            answer_score = str(item[1])
            file_write = open(file_write_name, 'ab+')
            file_write.write(str(answer_label) + '\t' + str(answer_score) + '\n')
            file_write.close()
            #answer_label_list.append(answer_label)
        #DCG_score = calu_DCG(answer_label_list,k)
        #DCG_score_list.append(DCG_score)
    #DCG_avg = calu_avg_answer_length(DCG_score_list)
    #print 'DCG_avg : \t' + str(DCG_avg)
def get_score_for_question(question_answer_word_dir, question_num, k):
    DCG_score_list = []
    for question_index in range(int(question_num)):
        if (question_index + 1) % 1000 == 1:
            print 'Now for line : ' + str(question_index + 1) + '\n'
        index = question_index + 1
        file_read_name = os.path.join(question_answer_word_dir, str(index))
        file_read = open(file_read_name, 'rb+')
        question_line = file_read.readline()
        question_line_list = question_line.strip().split('\t')
        question_line_list.remove('question')
        answer_index = 0
        answer_index_line_label_dict = {}
        answer_sentences_word_list = []
        for line in file_read.readlines():
            answer_temp_line_list = line.strip().split('\t')
            answer_label = answer_temp_line_list[1]
            answer_temp_line_list.remove('answer')
            answer_temp_line_list.remove(answer_label)
            answer_sentences_word_list.append(answer_temp_line_list)
            answer_list_temp = []
            answer_list_temp.append(answer_label)
            answer_index_line_label_dict[answer_index] = answer_list_temp
            answer_index += 1
        dic = corpora.Dictionary(answer_sentences_word_list)
        corpus = [dic.doc2bow(text) for text in answer_sentences_word_list]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=2)
        index = similarities.MatrixSimilarity(lda[corpus_tfidf])
        query_bow = dic.doc2bow(question_line_list)
        query_lda = lda[query_bow]
        sims = index[query_lda]
        sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
        answer_label_list = []
        for item in sort_sims:
            answer_index_temp = item[0]
            answer_label = int(answer_index_line_label_dict[int(answer_index_temp)][0])
            answer_label_list.append(answer_label)
        DCG_score = calu_DCG(answer_label_list, k)
        DCG_score_list.append(DCG_score)
    DCG_avg = calu_avg_answer_length(DCG_score_list)
    print 'DCG_avg : \t' + str(DCG_avg)
def __init__(self, filename):
    self.docs = loads(open(filename, "r").read())
    self.docmap = hoist_dict(self.docs, "id")

    if isfile("data.dict"):
        self.dictionary = Dictionary.load("data.dict")
    else:
        self.dictionary = Dictionary(iterate_summaries(self.docs))
        self.dictionary.save("data.dict")

    if isfile("data.mm"):
        self.corpus = MmCorpus("data.mm")
    else:
        corpus = (self.dictionary.doc2bow(text) for text in iterate_summaries(self.docs))
        MmCorpus.serialize("data.mm", corpus)
        self.corpus = MmCorpus("data.mm")

    self.lsi = LsiModel(self.corpus, id2word=self.dictionary, num_topics=3)

    if isfile("data.sim"):
        self.sim = MatrixSimilarity.load("data.sim")
    else:
        self.sim = MatrixSimilarity(self.lsi[self.corpus])
        self.sim.save("data.sim")

    # self.lda = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=100, update_every=1, chunksize=10000, passes=1)

    self.sentiment_model = Doc2Vec.load("imdb.d2v")
    self.sentiment = LogisticRegression()
    self.sentiment.fit(
        [self.sentiment_model.docvecs["TEST_POS_" + str(i)] for i in range(12500)] +
        [self.sentiment_model.docvecs["TEST_NEG_" + str(i)] for i in range(12500)],
        asarray(list(chain(repeat(0, 12500), repeat(1, 12500)))))

    if isfile("arxiv.d2v"):
        self.doc_model = Doc2Vec.load("arxiv.d2v")
    else:
        tagged = [TaggedDocument(doc.get("summary").split(), [doc.get("id")]) for doc in self.docs]
        doc_model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
        doc_model.build_vocab(tagged)
        shuffle(tagged)  # Replace with functional stuff
        for epoch in range(10):
            doc_model.train(tagged, total_examples=doc_model.corpus_count, epochs=doc_model.iter)
        doc_model.save("arxiv.d2v")