Python gensim.models module, LdaModel() example source code

We have extracted the following 14 code examples from open-source Python projects to illustrate how to use gensim.models.LdaModel().
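
Most of the examples below follow the same pattern: build a Dictionary, convert each document to a bag-of-words vector, then fit LdaModel. A minimal, self-contained sketch of that pattern; the toy corpus and parameter values are illustrative only:

from gensim import corpora, models

# Toy corpus: each document is a list of tokens (illustrative only)
documents = [
    ["topic", "model", "lda", "gensim"],
    ["latent", "dirichlet", "allocation", "topic"],
    ["word", "vectors", "and", "topic", "models"],
]

dictionary = corpora.Dictionary(documents)                    # token -> id mapping
corpus_bow = [dictionary.doc2bow(doc) for doc in documents]   # bag-of-words vectors

lda = models.LdaModel(corpus_bow, id2word=dictionary, num_topics=2, passes=10)

# Show the learned topics as weighted word lists
for topic_id, topic in lda.print_topics(num_topics=2, num_words=4):
    print(topic_id, topic)

# Infer the topic mixture of a new document
new_bow = dictionary.doc2bow(["topic", "model"])
print(lda.get_document_topics(new_bow))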

Project: itunes    Author: kaminem64
def run_model(name):
    if name == 'lsi':
        lsi = models.LsiModel(corpus_gensim, id2word=vocab_gensim, num_topics=num_topics)
        print('Saving lsi_model...')
        lsi.save('exports/lsi.model')
        print('lsi_model saved!')
        # lsi_matrix = gensim.matutils.corpus2dense(lsi[corpus_gensim], len(lsi.projection.s)).T / lsi.projection.s
        # print('Saving lsi_matrix...')
        # pickle.dump(lsi_matrix, open('exports/lsi_matrix.p','wb'))
        # print('lsi_matrix saved!')

    elif name == 'lda':
        # lda = models.LdaModel(corpus_gensim, id2word=vocab_gensim, num_topics=num_topics, passes=5)
        # alpha='auto' is only supported by the single-core LdaModel, not by LdaMulticore
        lda = models.ldamulticore.LdaMulticore(corpus_gensim, id2word=vocab_gensim, num_topics=num_topics, passes=5)
        print('Saving lda_model...')
        lda.save('exports/lda.model')
        print('lda_model saved!')
        # lda_matrix = gensim.matutils.corpus2dense(lda[corpus_gensim], lda.num_topics)
        # print('Saving lda_matrix...')
        # pickle.dump(lda_matrix, open('exports/lda_matrix.p','wb'))
        # print('lda_matrix saved!')
    gc.collect()
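
run_model above relies on module-level names (models, corpus_gensim, vocab_gensim, num_topics, gc) defined elsewhere in the original project. A hedged sketch of how those globals might be prepared before calling it; the toy documents and topic count are illustrative, and an exports/ directory is assumed to exist:

from gensim import corpora, models   # 'models' is used inside run_model
import gc                            # run_model calls gc.collect()

# Illustrative stand-ins for the globals run_model expects
tokenized_docs = [["great", "app", "love", "it"], ["latest", "update", "keeps", "crashing"]]
vocab_gensim = corpora.Dictionary(tokenized_docs)
corpus_gensim = [vocab_gensim.doc2bow(doc) for doc in tokenized_docs]
num_topics = 10

run_model('lda')   # writes exports/lda.model (the exports/ directory must already exist)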

Project: text-analytics-with-python    Author: dipanjanS
def train_lda_model_gensim(corpus, total_topics=2):

    norm_tokenized_corpus = normalize_corpus(corpus, tokenize=True)
    dictionary = corpora.Dictionary(norm_tokenized_corpus)
    mapped_corpus = [dictionary.doc2bow(text) 
                     for text in norm_tokenized_corpus]
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lda = models.LdaModel(corpus_tfidf, 
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=total_topics)
    return lda
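
train_lda_model_gensim depends on a normalize_corpus helper (and the corpora/models imports) from the same project; assuming those are in place, usage might look like the following, with toy documents chosen purely for illustration:

toy_corpus = [
    "Topic models uncover latent themes in document collections.",
    "LDA represents each document as a mixture of topics.",
]

lda = train_lda_model_gensim(toy_corpus, total_topics=2)

# Inspect the learned topics (top 5 words each)
for topic_id, topic in lda.print_topics(num_topics=2, num_words=5):
    print('Topic', topic_id, ':', topic)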

Project: dish_ai    Author: melanietosik
def train_model(model_name, corpus, id2word, num_topics):
    """
    Train specified model
    """
    # LDA
    if model_name == 'lda':
        model = models.LdaModel(
            corpus,
            id2word=id2word,
            num_topics=num_topics,
            alpha='auto',
            eval_every=5,
        )
        return model
    # LSI
    elif model_name == 'lsi':
        model = models.LsiModel(
            corpus,
            id2word=id2word,
            num_topics=num_topics,
        )
        return model
    else:
        print('Invalid model name')
    return None
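
A possible way to exercise train_model above; enabling INFO logging makes gensim's training output, including the periodic perplexity estimates controlled by eval_every, visible. The texts and topic count are illustrative only:

import logging
from gensim import corpora

# Show gensim's training log, including eval_every perplexity estimates
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', level=logging.INFO)

# Illustrative inputs; the original project builds these from its own data
texts = [["pasta", "tomato", "basil"], ["rice", "soy", "ginger"]]
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

lda = train_model('lda', corpus, id2word, num_topics=2)
lsi = train_model('lsi', corpus, id2word, num_topics=2)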

Project: quoll    Author: LanguageMachines
def run_lda(self):
        self.lda = models.LdaModel(corpus=self.corpus, id2word=self.dict, num_topics = self.nt, iterations=500000)

Project: tRECS    Author: TeeOhh
def build_lda(self, nt, corpus, dictionary, bow_matrix):
        ## Description: Builds LDA and does document similarity
        ## Params: Number of topics, corpus, dict, BOW matrix
        ## Returns: Similarity index and matrix

        lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=nt)
        self.lda_model = lda_model
        index = similarities.MatrixSimilarity(lda_model[corpus])
        matrix = bow_matrix.apply(lambda x: [lda_model[x[0]]], 1)
        return (index, matrix)
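
The MatrixSimilarity index returned by build_lda can then be queried with any new document once it is mapped into LDA topic space; a sketch assuming lda_model, dictionary and index come from the method above (the query tokens are illustrative):

# Query the similarity index with a new document
query_tokens = ["machine", "learning", "course"]
query_bow = dictionary.doc2bow(query_tokens)
query_lda = lda_model[query_bow]            # topic distribution of the query

sims = index[query_lda]                     # cosine similarity to every indexed document
ranked = sorted(enumerate(sims), key=lambda item: -item[1])
print(ranked[:5])                           # top 5 most similar documents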

Project: OpinionMining728    Author: stasi009
def lda_model_topics():
    dictionary = corpora.Dictionary.load(DictionaryFile)
    corpus_bow = corpora.MmCorpus(BowFile)

    N_TOPICS = 100
    model = models.LdaModel(corpus_bow, id2word=dictionary, num_topics=N_TOPICS)
    print "================= LDA MODEL IS BUILT ================="

    model.save(LdaModelFile)
    save_topics(model,LdaTopicsFile)
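
save_topics is a project helper not shown here. Once persisted, the model can be reloaded elsewhere without retraining; a sketch assuming the same LdaModelFile path constant:

from gensim import models

# Reload the persisted model and inspect a sample of its topics
model = models.LdaModel.load(LdaModelFile)
for topic_id, topic in model.show_topics(num_topics=10, num_words=8, formatted=True):
    print(topic_id, topic)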

Project: liveqa2017    Author: codekansas
def __init__(self,
                 num_topics=NUM_TOPICS,
                 dictionary_file=DICTIONARY_FILE,
                 model_file=MODEL_FILE):
        """Initializes the ranker.

        Args:
            num_topics: int (default: NUM_TOPICS), number of LDA topics to use.
            dictionary_file: str, where to save / load the dictionary file
                (defaults to DICTIONARY_FILE).
            model_file: str, where to save / load the model (defaults to
                MODEL_FILE).
        """

        self.dictionary = None
        self.model = None
        self.num_topics = num_topics
        self.dictionary_file = dictionary_file
        self.model_file = model_file

        # Loads stopwords from the associated file.
        with open(STOPWORDS_FILE, 'r') as f:
            self.stoplist = set(f.read().strip().split())

        # Loads an existing dictionary file, if one exists.
        if os.path.exists(self.dictionary_file):
            with open(self.dictionary_file, 'rb') as f:
                self.dictionary = pickle.load(f)

        # Loads an existing model file, if one exists.
        if os.path.exists(self.model_file):
            self.model = models.LdaModel.load(self.model_file)
        else:
            logging.warning('No model found in "%s"', self.model_file)

        # Determines if the model needs to be trained.
        self._trained = self.dictionary and self.model

Project: liveqa2017    Author: codekansas
def train(self, corpus, passes=1):
        """Updates dictionary and model given a corpus.

        Args:
            corpus: list of str, the documents to tokenize.
            passes: int (default: 1), number of training passes over the corpus.
        """

        if self.dictionary is not None or self.model is not None:
            x = input('You are about to overwrite an existing '
                      'model file (%s). Are you sure? [y/N] '
                      % self.model_file)

            if not x.lower().startswith('y'):
                raise RuntimeError('You chose not to overwrite the '
                                   'existing model and dictionary.')

        # Tokenizes the corpus.
        documents = [self.tokenize(document) for document in corpus]

        # Builds a dictionary from the existing documents.
        self.dictionary = corpora.Dictionary(documents)

        # Dumps the dictionary to a pickled file to use later.
        pickle.dump(self.dictionary, open(self.dictionary_file, 'wb'))

        # Converts the corpus to tokens.
        corpus_bow = [self.dictionary.doc2bow(doc) for doc in documents]

        # Trains the LDA model.
        self.model = models.LdaModel(corpus_bow,
                                     passes=passes,
                                     id2word=self.dictionary,
                                     num_topics=self.num_topics)

        # Saves the model to use later.
        self.model.save(self.model_file)

        # Flag to remember that training has taken place.
        self._trained = True

Project: QA    Author: KiddoZhu
def train(self, **kwargs):
        self.config.update(kwargs)
        self.model = _LDA(self.database.corpus, id2word=self.database.dictionary, **self.config)
        delattr(self, "database")

Project: pinkerton    Author: bureaucratic-labs
def score(self, entities: list, context: str) -> list:

        queries = [
            (i, q['context']) for i, q in enumerate(entities) if q['context']
        ]

        context = tokenize(context)

        dictionary = Dictionary([context])

        vectors = [
            dictionary.doc2bow(
                tokenize(q)
            ) for _, q in queries
        ]

        model = LdaModel(id2word=dictionary, **self.model_kwargs)

        ents = (
            entities[i] for i, _ in queries
        )

        scores = (
            model[vec][-1][1] for vec in vectors if model[vec]
        )

        results = zip(ents, scores)

        def sort_by_score(item):
            return item[1]

        return sorted(results, key=sort_by_score, reverse=True)
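
score() relies on the fact that indexing a trained LdaModel with a bag-of-words vector yields (topic_id, probability) pairs (low-probability topics are filtered out, which is presumably why the `if model[vec]` guard is there), and it uses the probability of the last listed topic as the score. A small, self-contained illustration with toy data and an arbitrary topic count:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

context = ["court", "judge", "ruling", "appeal"]
dictionary = Dictionary([context])
model = LdaModel(corpus=[dictionary.doc2bow(context)], id2word=dictionary, num_topics=2)

vec = dictionary.doc2bow(["judge", "appeal"])
print(model[vec])         # e.g. [(0, 0.27), (1, 0.73)] -- (topic_id, probability) pairs
print(model[vec][-1][1])  # probability of the last listed topic, as used in score()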

Project: LDA_RecEngine    Author: easonchan1213
def trainModel(self):
        '''
        Train an LDA model, in 4 steps:
        1. Parse the whole corpus into unigram token collections and a document mapping (for later use)
        2. Filter out tokens that are too rare (no_below_this_number) or too common (no_above_fraction_of_doc)
        3. Index the token collections and apply a TF-IDF transformation
        4. Call gensim.models.LdaModel and generate topic distributions of the corpus
        '''
        print('Start preparing unigram tokens....')
        ## Prepare the list of documents and tokens [[words_in_1st_doc],[words_in_2nd_doc]....], which comprise the Bag-Of-Words (BOW)
        # Get document_count, tokens, and document-index mapping from the corpora
        doc_count, train_set, doc_mapping, link_mapping = self.__tokenizeWholeCorpora(path_corpora)
        # Put the training data into gensim.corpora for later use
        dic = corpora.Dictionary(train_set)
        raw_token_count = len(dic)
        # Filter out infrequent words and overly common words, reducing the dimensionality of the term space
        dic.filter_extremes(no_below=self.no_below_this_number, no_above=self.no_above_fraction_of_doc)
        filtered_token_count = len(dic)
        corpus = [dic.doc2bow(text) for text in train_set]  # transform every document into BOW
        print('There are %i documents in the pool' % doc_count)
        print('In the corpus there are', raw_token_count, 'raw tokens')
        print('After filtering, in the corpus there are', filtered_token_count, 'unique tokens, a reduction of',
              '%.1f%%' % (100 * (1 - filtered_token_count / raw_token_count)))
        print('Finished preparing unigram tokens....')
        ## END

        print('Start training LDA model....')
        ## Represent each document as a TF-IDF vector, and train the LDA model on top of that
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=self.num_topics,
                              iterations=self.num_of_iterations, passes=self.passes)
        corpus_lda = lda[corpus_tfidf]  # topic distribution of every document (step 4)
        # Once training is done, print all the topics and their most frequent words
        print('Finished training LDA model.......Here is the list of all topics & their most frequent words')
        for i in range(self.num_topics):
            print('Topic %s : ' % str(i) + lda.print_topic(i))
        # Report the per-word likelihood bound on the training corpus as a proxy for perplexity
        # (perplexity = 2 ** (-bound)) under the chosen number of topics k; a higher bound means lower perplexity
        print('===============================')
        print('Per-word likelihood bound :', lda.log_perplexity(corpus_tfidf), ' when topic k =', str(self.num_topics))
        print('===============================')

        return lda,doc_mapping,link_mapping,corpus
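
The perplexity-style bound printed above is one way to compare different values of k; topic coherence is another common criterion. A hedged sketch using gensim's CoherenceModel, reusing corpus_tfidf, dic and train_set as built inside trainModel and trying a few illustrative candidate topic counts:

from gensim import models
from gensim.models import CoherenceModel

# Compare a few candidate topic counts by c_v coherence (higher is better);
# train_set is the tokenized corpus and dic the filtered Dictionary from above
for k in (5, 10, 20):
    lda_k = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=k, passes=5)
    cm = CoherenceModel(model=lda_k, texts=train_set, dictionary=dic, coherence='c_v')
    print('k =', k, 'coherence =', cm.get_coherence())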

Project: CCIR    Author: xiaogang00
def get_score_for_question(question_answer_word_dir,question_num,question_answer_score_label_file_dir):
    DCG_score_list = []
    for question_index in range(int(question_num)):
        if (question_index+1)%1000 == 1:
            print('Now for line : ' + str(question_index+1) + '\n')
        index = question_index + 1
        file_read_name = os.path.join(question_answer_word_dir,str(index))
        file_write_name = os.path.join(question_answer_score_label_file_dir,str(index))
        file_read = open(file_read_name, 'r')  # text mode, so lines split as str
        question_line = file_read.readline()
        question_line_list = question_line.strip().split('\t')
        question_line_list.remove('question')
        answer_index = 0
        answer_index_line_label_dict = {}
        answer_sentences_word_list = []
        for line in file_read.readlines():
            answer_temp_line_list = line.strip().split('\t')
            answer_label = answer_temp_line_list[1]
            answer_temp_line_list.remove('answer')
            answer_temp_line_list.remove(answer_label)
            answer_sentences_word_list.append(answer_temp_line_list)
            answer_list_temp = []
            answer_list_temp.append(answer_label)
            answer_index_line_label_dict[answer_index] = answer_list_temp
            answer_index += 1
        dic = corpora.Dictionary(answer_sentences_word_list)
        corpus=[dic.doc2bow(text) for text in answer_sentences_word_list]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf,id2word=dic,num_topics=2)
        index = similarities.MatrixSimilarity(lda[corpus_tfidf])
        query_bow = dic.doc2bow(question_line_list)
        query_lda = lda[query_bow]
        sims = index[query_lda]
        list_simes = list(enumerate(sims))
        sort_sims = sorted(enumerate(sims),key=lambda item:-item[1])
        #answer_label_list = []
        for item in list_simes:
            answer_index_temp = item[0]
            answer_label = int(answer_index_line_label_dict[int(answer_index_temp)][0])
            answer_score = str(item[1])
            file_write = open(file_write_name, 'a')
            file_write.write(str(answer_label)+'\t'+str(answer_score)+'\n')
            file_write.close()
            #answer_label_list.append(answer_label)
        #DCG_score = calu_DCG(answer_label_list,k)
        #DCG_score_list.append(DCG_score)
    #DCG_avg = calu_avg_answer_length(DCG_score_list)
    #print 'DCG_avg : \t' + str(DCG_avg)

Project: CCIR    Author: xiaogang00
def get_score_for_question(question_answer_word_dir,question_num,k):
    DCG_score_list = []
    for question_index in range(int(question_num)):
        if (question_index+1)%1000 == 1:
            print('Now for line : ' + str(question_index+1) + '\n')
        index = question_index + 1
        file_read_name = os.path.join(question_answer_word_dir,str(index))
        file_read = open(file_read_name, 'r')  # text mode, so lines split as str
        question_line = file_read.readline()
        question_line_list = question_line.strip().split('\t')
        question_line_list.remove('question')
        answer_index = 0
        answer_index_line_label_dict = {}
        answer_sentences_word_list = []
        for line in file_read.readlines():
            answer_temp_line_list = line.strip().split('\t')
            answer_label = answer_temp_line_list[1]
            answer_temp_line_list.remove('answer')
            answer_temp_line_list.remove(answer_label)
            answer_sentences_word_list.append(answer_temp_line_list)
            answer_list_temp = []
            answer_list_temp.append(answer_label)
            answer_index_line_label_dict[answer_index] = answer_list_temp
            answer_index += 1
        dic = corpora.Dictionary(answer_sentences_word_list)
        corpus=[dic.doc2bow(text) for text in answer_sentences_word_list]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lda = models.LdaModel(corpus_tfidf,id2word=dic,num_topics=2)
        index = similarities.MatrixSimilarity(lda[corpus_tfidf])
        query_bow = dic.doc2bow(question_line_list)
        query_lda = lda[query_bow]
        sims = index[query_lda]
        sort_sims = sorted(enumerate(sims),key=lambda item:-item[1])
        answer_label_list = []
        for item in sort_sims:
            answer_index_temp = item[0]
            answer_label = int(answer_index_line_label_dict[int(answer_index_temp)][0])
            answer_label_list.append(answer_label)
        DCG_score = calu_DCG(answer_label_list,k)
        DCG_score_list.append(DCG_score)
    DCG_avg = calu_avg_answer_length(DCG_score_list)
    print('DCG_avg : \t' + str(DCG_avg))
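
calu_DCG and calu_avg_answer_length are project helpers that are not shown in this snippet; hypothetical implementations consistent with how they are called above (the project's actual formulas may differ):

import math

def calu_DCG(labels, k):
    # Hypothetical DCG@k over graded relevance labels, taken in ranked order
    return sum(label / math.log2(rank + 2)      # rank 0 -> log2(2)
               for rank, label in enumerate(labels[:int(k)]))

def calu_avg_answer_length(scores):
    # Hypothetical mean helper matching how it is called above
    return sum(scores) / len(scores) if scores else 0.0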

Project: hololens-dv-server    Author: AdamNiederer
def __init__(self, filename):
        self.docs = loads(open(filename, "r").read())
        self.docmap = hoist_dict(self.docs, "id")

        if isfile("data.dict"):
            self.dictionary = Dictionary.load("data.dict")
        else:
            self.dictionary = Dictionary(iterate_summaries(self.docs))
            self.dictionary.save("data.dict")

        if isfile("data.mm"):
            self.corpus = MmCorpus("data.mm")
        else:
            corpus = (self.dictionary.doc2bow(text) for text in iterate_summaries(self.docs))
            MmCorpus.serialize("data.mm", corpus)
            self.corpus = MmCorpus("data.mm")

        self.lsi = LsiModel(self.corpus, id2word=self.dictionary, num_topics=3)

        if isfile("data.sim"):
            self.sim = MatrixSimilarity.load("data.sim")
        else:
            self.sim = MatrixSimilarity(self.lsi[self.corpus])
            self.sim.save("data.sim")

        # self.lda = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=100, update_every=1, chunksize=10000, passes=1)

        self.sentiment_model = Doc2Vec.load("imdb.d2v")
        self.sentiment = LogisticRegression()
        self.sentiment.fit([self.sentiment_model.docvecs["TEST_POS_" + str(i)] for i in range(12500)] +
                           [self.sentiment_model.docvecs["TEST_NEG_" + str(i)] for i in range(12500)],
                           asarray(list(chain(repeat(0, 12500), repeat(1, 12500)))))

        if isfile("arxiv.d2v"):
            self.doc_model = Doc2Vec.load("arxiv.d2v")
        else:
            tagged = [TaggedDocument(doc.get("summary").split(), [doc.get("id")]) for doc in self.docs]
            doc_model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
            doc_model.build_vocab(tagged)
            shuffle(tagged) # Replace with functional stuff
            for epoch in range(10):
                doc_model.train(tagged, total_examples=doc_model.corpus_count, epochs=doc_model.iter)
            doc_model.save("arxiv.d2v")
            self.doc_model = doc_model  # keep the freshly trained model on the instance