Python sklearn.decomposition 模块，LatentDirichletAllocation() 实例源码

我们从 Python 开源项目中提取了以下 36 个代码示例，用于说明如何使用 sklearn.decomposition.LatentDirichletAllocation()。

项目:Trendster    作者:rawanhassunah    | 项目源码 | 文件源码
def lda(X, n_topics=None):
    """Fit an LDA model on ``X`` and return the transformed data and topics.

    :param X: input matrix, passed unchanged to ``fit_transform``.
    :param n_topics: topic count, forwarded positionally as the first
        constructor argument. When ``None`` the estimator is built without
        arguments so the library's own default topic count applies.
    :returns: tuple ``(X_new, components)`` -- the output of
        ``fit_transform`` and the fitted model's ``components_`` attribute.
    """
    if n_topics is None:
        # ``None`` is not a usable topic count for the estimator, so omit the
        # argument instead of forwarding the placeholder default.
        model = LatentDirichletAllocation()
    else:
        model = LatentDirichletAllocation(n_topics)
    X_new = model.fit_transform(X)
    return X_new, model.components_
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_lda_preplexity_mismatch():
    """Check that ``perplexity`` rejects a doc-topic matrix of the wrong shape.

    Two mismatches are exercised: one extra row (sample count) and one extra
    column (topic count). Each must raise ``ValueError`` with a matching
    message.
    """
    # test dimension mismatch in `perplexity` method
    rng = np.random.RandomState(0)
    n_topics = rng.randint(3, 6)
    n_samples = rng.randint(6, 10)
    # Draw X from the seeded generator rather than the global NumPy state so
    # the test data is identical on every run.
    X = rng.randint(4, size=(n_samples, 10))
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=5.,
                                    total_samples=20, random_state=rng)
    lda.fit(X)
    # invalid samples
    invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_topics))
    assert_raises_regexp(ValueError, r'Number of samples', lda.perplexity, X,
                         invalid_n_samples)
    # invalid topic number
    invalid_n_topics = rng.randint(4, size=(n_samples, n_topics + 1))
    assert_raises_regexp(ValueError, r'Number of topics', lda.perplexity, X,
                         invalid_n_topics)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_lda_perplexity():
    """Perplexity after 10 iterations must not exceed that after 1 iteration.

    Checked for both the 'online' and 'batch' learning methods, first without
    and then with sub-sampling.
    """
    n_topics, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        # Settings shared by both models; only ``max_iter`` differs.
        shared = dict(n_topics=n_topics, learning_method=method,
                      total_samples=100, random_state=0)
        lda_short = LatentDirichletAllocation(max_iter=1, **shared)
        lda_long = LatentDirichletAllocation(max_iter=10, **shared)

        distr_short = lda_short.fit_transform(X)
        perp_short = lda_short.perplexity(X, distr_short, sub_sampling=False)

        distr_long = lda_long.fit_transform(X)
        perp_long = lda_long.perplexity(X, distr_long, sub_sampling=False)
        assert_greater_equal(perp_short, perp_long)

        perp_short_sub = lda_short.perplexity(X, distr_short,
                                              sub_sampling=True)
        perp_long_sub = lda_long.perplexity(X, distr_long, sub_sampling=True)
        assert_greater_equal(perp_short_sub, perp_long_sub)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_lda_score():
    """Score after 10 iterations must be at least the score after 1 iteration.

    Checked for both the 'online' and 'batch' learning methods.
    """
    n_topics, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        # Settings shared by both models; only ``max_iter`` differs.
        shared = dict(n_topics=n_topics, learning_method=method,
                      total_samples=100, random_state=0)
        lda_short = LatentDirichletAllocation(max_iter=1, **shared)
        lda_long = LatentDirichletAllocation(max_iter=10, **shared)

        lda_short.fit_transform(X)
        score_short = lda_short.score(X)

        lda_long.fit_transform(X)
        score_long = lda_long.score(X)

        assert_greater_equal(score_long, score_short)
项目:hh-page-classifier    作者:TeamHG-Memex    | 项目源码 | 文件源码
def LDAPageVctorizer(*,
                     n_topics: int,
                     min_df: int,
                     max_features: int,
                     max_iter: int,
                     ngram_range: Tuple[int, int],
                     vocabulary=None,
                     batch_size: int=4096,
                     verbose=1):
    """Return a pipeline of a vectorizer followed by an online LDA model.

    All arguments are keyword-only. ``min_df``, ``max_features``,
    ``ngram_range`` and ``vocabulary`` configure the vectorizer built by
    ``_vectorizer``; ``n_topics``, ``max_iter``, ``batch_size`` and
    ``verbose`` configure the LDA step, which always uses online learning,
    ``evaluate_every=2`` and a single job.
    """
    vectorizer = _vectorizer(
        min_df=min_df,
        max_features=max_features,
        ngram_range=ngram_range,
        vocabulary=vocabulary,
    )
    topic_model = LatentDirichletAllocation(
        n_topics=n_topics,
        max_iter=max_iter,
        batch_size=batch_size,
        verbose=verbose,
        learning_method='online',
        evaluate_every=2,
        n_jobs=1,
    )
    return make_pipeline(vectorizer, topic_model)
项目:AND4NMF    作者:PrincetonML    | 项目源码 | 文件源码
def train(self):
        """Refit LDA on ``self.Y`` transposed 20 times, with ``max_iter`` 0..19.

        Each pass first calls ``self.show_error()``, then builds a fresh LDA
        model whose document-topic prior is ``self.sparsity`` divided by the
        second dimension of ``self.A_true``. The model is fitted, the
        wall-clock duration of that pass is stored in ``self.time``
        (overwritten every pass), and the transposed ``components_`` is stored
        as a matrix in ``self.A``.
        """
        D = self.A_true.shape[1]
        for i in range(20):
            self.show_error()

            start = time.time()
            # Builtin ``float`` replaces the ``np.float`` alias, which newer
            # NumPy releases no longer provide; the resulting value is the same.
            prior = self.sparsity / float(self.A_true.shape[1])
            lda = LDA(n_topics=D, random_state=0, doc_topic_prior=prior, max_iter=i)
            lda.fit(self.Y.transpose())
            end = time.time()
            self.time = end - start
            self.A = np.asmatrix(lda.components_.transpose())
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:sift-nlp    作者:ubclaunchpad    | 项目源码 | 文件源码
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Build an LDA estimator from the given settings, fit it, and return it.

    :param dataframe: training input handed to ``fit`` as ``X``.
    :param num_topics: forwarded as the estimator's ``n_topics``.
    :param rand_state: forwarded as the estimator's ``random_state``.
    :param learn_method: forwarded as the estimator's ``learning_method``.
    :returns: the fitted estimator.
    """
    lda_options = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    estimator = decomposition.LatentDirichletAllocation(**lda_options)
    estimator.fit(X=dataframe)
    return estimator
项目:Hanhan_NLP    作者:hanhanwu    | 项目源码 | 文件源码
def LDA_feature_extraction(text_lst, n_samples, n_features, n_topics, n_top_words):
    """Vectorize ``text_lst`` into term counts, fit LDA, and print top words.

    :param text_lst: raw documents handed to ``CountVectorizer.fit_transform``.
    :param n_samples: only used in the progress message.
    :param n_features: ``max_features`` cap for the vectorizer; also reported
        in the progress message.
    :param n_topics: number of LDA topics.
    :param n_top_words: forwarded to ``print_top_words``.
    :returns: ``None``; all results are printed.
    """
    # Single-argument print(...) calls produce the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(text_lst)
    print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..." % (n_samples, n_features))
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)
    print("\nTopics in LDA model:")
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)
    print("*************end LDA****************")
项目:HyPRec    作者:mostafa-mahmoud    | 项目源码 | 文件源码
def _train(self):
        """
        Fit an online LDA model on the abstracts' sparse term-frequency matrix
        and keep the resulting ``fit_transform`` output on the instance as
        ``document_distribution``. Progress messages are printed when
        ``self._verbose`` is set.
        """
        frequencies = self.abstracts_preprocessor.get_term_frequency_sparse_matrix()
        lda_settings = dict(
            n_topics=self.n_factors,
            max_iter=self.n_iter,
            learning_method='online',
            learning_offset=50.,
            random_state=0,
            verbose=0,
        )
        topic_model = LatentDirichletAllocation(**lda_settings)
        if self._verbose:
            print("Initialized LDA model..., Training LDA...")

        self.document_distribution = topic_model.fit_transform(frequencies)
        if self._verbose:
            print("LDA trained..")
项目:Trendster    作者:rawanhassunah    | 项目源码 | 文件源码
def lda(X, n_topics=None):
    """Fit an LDA model on ``X``; nothing is returned.

    ``n_topics`` is forwarded positionally as the first constructor argument.

    NOTE(review): both the fitted model and the transformed matrix are
    discarded, so callers receive ``None`` -- confirm this is intended.
    """
    estimator = LatentDirichletAllocation(n_topics)
    estimator.fit_transform(X)
    return None
项目:CheTo    作者:rdkit    | 项目源码 | 文件源码
def fitTopicModel(self, numTopics, max_iter=100, **kwargs):
        """Create ``self.lda`` with ``numTopics`` topics and fit it on ``self.fragM``.

        The model takes its learning method, seed and batch size from the
        instance; extra ``kwargs`` go straight to the constructor. When
        ``self.fragM`` has more rows than ``self.chunksize``, the learning
        method is switched to 'online' before fitting.
        """
        self.lda = LatentDirichletAllocation(
            n_topics=numTopics,
            learning_method=self.learningMethod,
            random_state=self.seed,
            n_jobs=1,
            max_iter=max_iter,
            batch_size=self.chunksize,
            **kwargs
        )
        needs_chunking = self.fragM.shape[0] > self.chunksize
        if needs_chunking:
            # fit the model in chunks
            self.lda.learning_method = 'online'
        self.lda.fit(self.fragM)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_lda_default_prior_params():
    """Explicit ``1 / n_topics`` priors must give the same result as defaults.

    One model gets both priors set to ``1 / n_topics``; the other leaves them
    unset. Their ``fit_transform`` outputs are compared for near-equality.
    """
    n_topics, X = _build_sparse_mtx()
    expected_prior = 1. / n_topics
    explicit_model = LatentDirichletAllocation(n_topics=n_topics,
                                               doc_topic_prior=expected_prior,
                                               topic_word_prior=expected_prior,
                                               random_state=0)
    default_model = LatentDirichletAllocation(n_topics=n_topics,
                                              random_state=0)

    explicit_distr = explicit_model.fit_transform(X)
    default_distr = default_model.fit_transform(X)
    assert_almost_equal(explicit_distr, default_distr)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_lda_fit_batch():
    """Check ``fit`` with 'batch' learning.

    For every fitted component, the three highest-weighted word indices must
    form one of the expected index groups.
    """
    rng = np.random.RandomState(0)
    n_topics, X = _build_sparse_mtx()
    model = LatentDirichletAllocation(n_topics=n_topics, evaluate_every=1,
                                      learning_method='batch',
                                      random_state=rng)
    model.fit(X)

    expected_groups = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for topic in model.components_:
        # Order within the top three is irrelevant, so compare sorted tuples.
        top_three = tuple(sorted(set(topic.argsort()[-3:])))
        assert_true(top_three in expected_groups)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_lda_fit_online():
    """Check ``fit`` with 'online' learning.

    For every fitted component, the three highest-weighted word indices must
    form one of the expected index groups.
    """
    rng = np.random.RandomState(0)
    n_topics, X = _build_sparse_mtx()
    model = LatentDirichletAllocation(n_topics=n_topics, learning_offset=10.,
                                      evaluate_every=1,
                                      learning_method='online',
                                      random_state=rng)
    model.fit(X)

    expected_groups = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for topic in model.components_:
        # Order within the top three is irrelevant, so compare sorted tuples.
        top_three = tuple(sorted(set(topic.argsort()[-3:])))
        assert_true(top_three in expected_groups)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_lda_partial_fit():
    """Check online learning through three successive ``partial_fit`` calls.

    For every resulting component, the three highest-weighted word indices
    must form one of the expected index groups.
    """
    rng = np.random.RandomState(0)
    n_topics, X = _build_sparse_mtx()
    model = LatentDirichletAllocation(n_topics=n_topics, learning_offset=10.,
                                      total_samples=100, random_state=rng)
    for _ in xrange(3):
        model.partial_fit(X)

    expected_groups = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for topic in model.components_:
        # Order within the top three is irrelevant, so compare sorted tuples.
        top_three = tuple(sorted(set(topic.argsort()[-3:])))
        assert_true(top_three in expected_groups)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_lda_dense_input():
    """Check batch ``fit`` when the matrix is passed as a dense array.

    For every fitted component, the three highest-weighted word indices must
    form one of the expected index groups.
    """
    rng = np.random.RandomState(0)
    n_topics, X = _build_sparse_mtx()
    model = LatentDirichletAllocation(n_topics=n_topics,
                                      learning_method='batch',
                                      random_state=rng)
    dense_X = X.toarray()
    model.fit(dense_X)

    expected_groups = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for topic in model.components_:
        # Order within the top three is irrelevant, so compare sorted tuples.
        top_three = tuple(sorted(set(topic.argsort()[-3:])))
        assert_true(top_three in expected_groups)