我们从Python开源项目中,提取了以下36个代码示例,用于说明如何使用sklearn.decomposition.LatentDirichletAllocation()。
def lda(X, n_topics=None):
    """Fit a Latent Dirichlet Allocation model on ``X`` and return its outputs.

    Parameters
    ----------
    X
        Document-term data handed unchanged to
        ``LatentDirichletAllocation.fit_transform``.
    n_topics : int or None
        Topic count, forwarded as the estimator's first positional argument.
        When ``None`` (the default) the estimator is built with no arguments
        so that its own default topic count applies; ``None`` itself is not a
        usable topic count.

    Returns
    -------
    tuple
        ``(X_new, components)``: the value returned by ``fit_transform`` and
        the fitted model's ``components_`` attribute.
    """
    if n_topics is None:
        # Do not forward None: let the estimator pick its default instead.
        model = LatentDirichletAllocation()
    else:
        model = LatentDirichletAllocation(n_topics)
    X_new = model.fit_transform(X)
    return X_new, model.components_
def test_lda_preplexity_mismatch():
    # `perplexity` must raise ValueError when the supplied doc-topic matrix
    # does not match X (row count) or the model (topic count).
    rng = np.random.RandomState(0)
    n_topics = rng.randint(3, 6)
    n_samples = rng.randint(6, 10)
    # Draw X from the seeded generator (not the global np.random state) so
    # the test data is reproducible from run to run.
    X = rng.randint(4, size=(n_samples, 10))
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=5.,
                                    total_samples=20, random_state=rng)
    lda.fit(X)

    # One row more than X has.
    invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_topics))
    assert_raises_regexp(ValueError, r'Number of samples',
                         lda.perplexity, X, invalid_n_samples)

    # One column more than the model has topics.
    invalid_n_topics = rng.randint(4, size=(n_samples, n_topics + 1))
    assert_raises_regexp(ValueError, r'Number of topics',
                         lda.perplexity, X, invalid_n_topics)
def test_lda_perplexity():
    # For both the 'online' and 'batch' learning methods, a model trained for
    # 10 iterations must not have a higher perplexity than one trained for a
    # single iteration -- checked with and without sub-sampling.
    n_topics, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        shared = dict(n_topics=n_topics, learning_method=method,
                      total_samples=100, random_state=0)
        short_run = LatentDirichletAllocation(max_iter=1, **shared)
        long_run = LatentDirichletAllocation(max_iter=10, **shared)

        short_distr = short_run.fit_transform(X)
        short_perp = short_run.perplexity(X, short_distr, sub_sampling=False)

        long_distr = long_run.fit_transform(X)
        long_perp = long_run.perplexity(X, long_distr, sub_sampling=False)
        assert_greater_equal(short_perp, long_perp)

        short_perp_sub = short_run.perplexity(X, short_distr,
                                              sub_sampling=True)
        long_perp_sub = long_run.perplexity(X, long_distr,
                                            sub_sampling=True)
        assert_greater_equal(short_perp_sub, long_perp_sub)
def test_lda_score():
    # For both the 'online' and 'batch' learning methods, a model trained for
    # 10 iterations must score at least as high on X as one trained for a
    # single iteration.
    n_topics, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        shared = dict(n_topics=n_topics, learning_method=method,
                      total_samples=100, random_state=0)
        short_run = LatentDirichletAllocation(max_iter=1, **shared)
        long_run = LatentDirichletAllocation(max_iter=10, **shared)

        short_run.fit_transform(X)
        short_score = short_run.score(X)

        long_run.fit_transform(X)
        long_score = long_run.score(X)

        assert_greater_equal(long_score, short_score)
def LDAPageVctorizer(*, n_topics: int, min_df: int, max_features: int,
                     max_iter: int, ngram_range: Tuple[int, int],
                     vocabulary=None, batch_size: int=4096, verbose=1):
    """Build a two-step pipeline: a vectorizer followed by an online LDA model.

    All arguments are keyword-only. ``min_df``, ``max_features``,
    ``ngram_range`` and ``vocabulary`` go to ``_vectorizer``; ``n_topics``,
    ``max_iter``, ``batch_size`` and ``verbose`` go to
    ``LatentDirichletAllocation``, which is always created with
    ``learning_method='online'``, ``evaluate_every=2`` and ``n_jobs=1``.

    Returns the object produced by ``make_pipeline(vectorizer, lda)``.
    """
    vectorizer = _vectorizer(
        min_df=min_df,
        max_features=max_features,
        ngram_range=ngram_range,
        vocabulary=vocabulary,
    )
    lda_options = {
        'learning_method': 'online',
        'n_topics': n_topics,
        'batch_size': batch_size,
        'evaluate_every': 2,
        'verbose': verbose,
        'max_iter': max_iter,
        'n_jobs': 1,
    }
    topic_model = LatentDirichletAllocation(**lda_options)
    return make_pipeline(vectorizer, topic_model)
def train(self):
    """Fit LDA 20 times with a growing iteration budget (``max_iter`` 0..19).

    Before each fit ``self.show_error()`` is called. After each fit the
    elapsed wall-clock time of that fit is stored in ``self.time`` and the
    transposed topic-word matrix in ``self.A`` (as a ``np.matrix``), so on
    return both attributes hold the values from the last round.

    Reads ``self.A_true`` (its second dimension gives the topic count),
    ``self.sparsity`` and ``self.Y`` (fitted transposed).
    """
    D = self.A_true.shape[1]
    for i in range(20):
        self.show_error()
        start = time.time()
        # Builtin float() instead of the np.float alias, which recent NumPy
        # releases no longer provide; the resulting value is the same.
        prior = self.sparsity / float(self.A_true.shape[1])
        lda = LDA(n_topics=D, random_state=0, doc_topic_prior=prior,
                  max_iter=i)
        lda.fit(self.Y.transpose())
        end = time.time()
        self.time = end - start
        self.A = np.asmatrix(lda.components_.transpose())
def init_and_fit_lda_(dataframe, num_topics=20, rand_state=1, learn_method='online'):
    """Create a ``LatentDirichletAllocation`` model, fit it, and return it.

    Parameters
    ----------
    dataframe
        Training data, passed to ``fit`` as ``X``.
    num_topics
        Forwarded as the estimator's ``n_topics``.
    rand_state
        Forwarded as the estimator's ``random_state``.
    learn_method
        Forwarded as the estimator's ``learning_method``.

    Returns
    -------
    The fitted estimator instance.
    """
    estimator_kwargs = {
        'n_topics': num_topics,
        'random_state': rand_state,
        'learning_method': learn_method,
    }
    fitted = decomposition.LatentDirichletAllocation(**estimator_kwargs)
    fitted.fit(X=dataframe)
    return fitted
def LDA_feature_extraction(text_lst, n_samples, n_features, n_topics, n_top_words):
    """Vectorize ``text_lst`` into term counts, fit LDA, and print the topics.

    Parameters
    ----------
    text_lst
        Raw documents handed to ``CountVectorizer.fit_transform``.
    n_samples
        Only used in the progress message.
    n_features
        Vocabulary cap (``max_features``) for the vectorizer; also printed.
    n_topics
        Topic count for the LDA model.
    n_top_words
        Forwarded to ``print_top_words``.

    Returns nothing; all output goes to stdout.
    """
    # Each message is a single pre-formatted string, so print(...) emits the
    # same text whether print is a statement (Python 2) or a function
    # (Python 3).
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=n_features,
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(text_lst)

    print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..." % (n_samples, n_features))
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50., random_state=0)
    lda.fit(tf)

    print("\nTopics in LDA model:")
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)
    print("*************end LDA****************")
def _train(self):
    """
    Fit an online LDA model on the term-frequency matrix supplied by
    ``self.abstracts_preprocessor`` and keep the result of ``fit_transform``
    in ``self.document_distribution``.

    The topic count comes from ``self.n_factors`` and the iteration count
    from ``self.n_iter``. Progress messages are printed when
    ``self._verbose`` is truthy.
    """
    term_counts = self.abstracts_preprocessor.get_term_frequency_sparse_matrix()
    model_options = {
        'n_topics': self.n_factors,
        'max_iter': self.n_iter,
        'learning_method': 'online',
        'learning_offset': 50.,
        'random_state': 0,
        'verbose': 0,
    }
    topic_model = LatentDirichletAllocation(**model_options)
    if self._verbose:
        print("Initialized LDA model..., Training LDA...")

    self.document_distribution = topic_model.fit_transform(term_counts)
    if self._verbose:
        print("LDA trained..")
def lda(X, n_topics=None):
    """Fit a Latent Dirichlet Allocation model on ``X``; returns ``None``.

    Parameters
    ----------
    X
        Document-term data handed unchanged to
        ``LatentDirichletAllocation.fit_transform``.
    n_topics : int or None
        Topic count, forwarded as the estimator's first positional argument.
        When ``None`` (the default) the estimator is built with no arguments
        so that its own default topic count applies; ``None`` itself is not a
        usable topic count.
    """
    if n_topics is None:
        # Do not forward None: let the estimator pick its default instead.
        model = LatentDirichletAllocation()
    else:
        model = LatentDirichletAllocation(n_topics)
    # NOTE(review): both the fitted model and the transformed data are
    # discarded here -- this looks like an unfinished stub; confirm whether
    # the results were meant to be returned.
    model.fit_transform(X)
def fitTopicModel(self, numTopics, max_iter=100, **kwargs):
    """Create ``self.lda`` and fit it on ``self.fragM``.

    The estimator is configured from ``numTopics``, ``max_iter``,
    ``self.learningMethod``, ``self.seed`` and ``self.chunksize`` (used as
    ``batch_size``), with ``n_jobs=1``; any extra ``kwargs`` are passed
    through to the constructor. When ``self.fragM`` has more rows than
    ``self.chunksize`` the learning method is switched to ``'online'``
    before fitting.
    """
    self.lda = LatentDirichletAllocation(
        n_topics=numTopics,
        learning_method=self.learningMethod,
        random_state=self.seed,
        n_jobs=1,
        max_iter=max_iter,
        batch_size=self.chunksize,
        **kwargs
    )
    exceeds_one_chunk = self.fragM.shape[0] > self.chunksize
    if exceeds_one_chunk:
        # More rows than one chunk holds: fit the model in chunks.
        self.lda.learning_method = 'online'
    self.lda.fit(self.fragM)
def test_lda_default_prior_params():
    # A model given explicit doc-topic and topic-word priors of
    # `1 / n_topics` must produce the same topic distribution as a model
    # left on its default priors.
    n_topics, X = _build_sparse_mtx()
    explicit_prior = 1. / n_topics
    with_explicit_priors = LatentDirichletAllocation(
        n_topics=n_topics,
        doc_topic_prior=explicit_prior,
        topic_word_prior=explicit_prior,
        random_state=0)
    with_default_priors = LatentDirichletAllocation(n_topics=n_topics,
                                                    random_state=0)
    distr_explicit = with_explicit_priors.fit_transform(X)
    distr_default = with_default_priors.fit_transform(X)
    assert_almost_equal(distr_explicit, distr_default)
def test_lda_fit_batch():
    # Fit with learning_method='batch' and check that the three
    # highest-weighted words of every topic form one of the expected
    # index groups.
    rng = np.random.RandomState(0)
    n_topics, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_topics=n_topics, evaluate_every=1,
                                    learning_method='batch',
                                    random_state=rng)
    lda.fit(X)

    expected_groups = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for topic_weights in lda.components_:
        # Indices of the 3 largest weights; order is irrelevant because the
        # indices are sorted before the membership check.
        strongest = set(topic_weights.argsort()[-3:])
        assert_true(tuple(sorted(strongest)) in expected_groups)
def test_lda_fit_online():
    # Fit with learning_method='online' and check that the three
    # highest-weighted words of every topic form one of the expected
    # index groups.
    rng = np.random.RandomState(0)
    n_topics, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=10.,
                                    evaluate_every=1,
                                    learning_method='online',
                                    random_state=rng)
    lda.fit(X)

    expected_groups = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for topic_weights in lda.components_:
        # Indices of the 3 largest weights; order is irrelevant because the
        # indices are sorted before the membership check.
        strongest = set(topic_weights.argsort()[-3:])
        assert_true(tuple(sorted(strongest)) in expected_groups)
def test_lda_partial_fit():
    # Train through three successive `partial_fit` calls on the same data
    # and check that the three highest-weighted words of every topic form
    # one of the expected index groups.
    rng = np.random.RandomState(0)
    n_topics, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=10.,
                                    total_samples=100, random_state=rng)
    # range() rather than the Python-2-only xrange: identical for three
    # iterations and available on every Python version.
    for _ in range(3):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        # Indices of the 3 largest weights in this component.
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
def test_lda_dense_input():
    # Fit on the dense (`toarray()`) form of the test matrix with
    # learning_method='batch' and check that the three highest-weighted
    # words of every topic form one of the expected index groups.
    rng = np.random.RandomState(0)
    n_topics, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_topics=n_topics,
                                    learning_method='batch',
                                    random_state=rng)
    dense_X = X.toarray()
    lda.fit(dense_X)

    expected_groups = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for topic_weights in lda.components_:
        # Indices of the 3 largest weights; order is irrelevant because the
        # indices are sorted before the membership check.
        strongest = set(topic_weights.argsort()[-3:])
        assert_true(tuple(sorted(strongest)) in expected_groups)