Python sklearn.decomposition module: RandomizedPCA() example source code

The following 8 code examples, extracted from open-source Python projects, illustrate how to use sklearn.decomposition.RandomizedPCA().
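Before the project examples, a minimal usage sketch (toy data, illustrative only). RandomizedPCA follows the standard scikit-learn estimator API (fit / transform / fit_transform) and approximates PCA with a randomized SVD, which is faster on large matrices. Note that the class was deprecated in scikit-learn 0.18 and removed in 0.20, so these examples require an older scikit-learn (see the migration note at the end).

# Minimal sketch: project toy data onto its first two principal components.
import numpy as np
from sklearn.decomposition import RandomizedPCA  # scikit-learn < 0.20 only

X = np.random.RandomState(42).rand(100, 50)  # toy data: 100 samples, 50 features
pca = RandomizedPCA(n_components=2, random_state=42)
X_reduced = pca.fit_transform(X)              # shape (100, 2)
print(pca.explained_variance_ratio_)          # variance captured per component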

Project: CKME136    Author: asterix135    | Project source | File source
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import RandomizedPCA


def visualize_data(data, labels):
    # Project the data onto its first two principal components.
    pca = RandomizedPCA(n_components=2)
    reshaped = pca.fit_transform(data)
    df = pd.DataFrame({'x': reshaped[:, 0], 'y': reshaped[:, 1],
                       'label': np.where(labels == 1, 'Positive',
                                         np.where(labels == 0, 'Neutral',
                                                  'Negative'))})
    colors = ['yellow', 'red', 'blue']
    for label, color in zip(df['label'].unique(), colors):
        mask = df['label'] == label
        plt.scatter(df[mask]['x'], df[mask]['y'], c=color, label=label)
    plt.legend()
    plt.title('PCA Decomposition of Image Data')
    plt.xlabel('PCA 1')
    plt.ylabel('PCA 2')
    plt.show()
    # plt.savefig('PCA_plot.png')
Project: mglex    Author: fungs    | Project source | File source
def plot_clusters_pca(responsibilities, color_groups):
    from sklearn.decomposition import RandomizedPCA
    from itertools import count
    import pylab as pl
    from random import shuffle

    # colors_dict is defined at module scope in the original project.
    colors = list(colors_dict.values())
    shuffle(colors)

    # Reduce the responsibility matrix to two dimensions for plotting.
    pca = RandomizedPCA(n_components=2)
    X = pca.fit_transform(responsibilities)
    # print >>stderr, pca.explained_variance_ratio_

    pl.figure()
    pl.scatter(X[:, 0], X[:, 1], c="grey", label="unknown")
    for c, sub, i in zip(colors, color_groups, count(0)):
        pl.scatter(X[sub, 0], X[sub, 1], c=c, label=str(i))
    pl.legend()
    pl.title("PCA responsibility matrix")
    pl.show()
Project: Parallel-SGD    Author: angadgill    | Project source | File source
from collections import defaultdict

import numpy as np
from sklearn.decomposition import PCA, IncrementalPCA, RandomizedPCA


def fixed_batch_size_comparison(data):
    # Sweep n_components from ~10% of the feature count up to the full count.
    all_features = [i.astype(int) for i in np.linspace(data.shape[1] // 10,
                                                       data.shape[1], num=5)]
    batch_size = 1000
    # Compare runtimes and error for fixed batch size
    all_times = defaultdict(list)
    all_errors = defaultdict(list)
    for n_components in all_features:
        pca = PCA(n_components=n_components)
        rpca = RandomizedPCA(n_components=n_components, random_state=1999)
        ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
        # benchmark() is a timing/error helper defined elsewhere in the project.
        results_dict = {k: benchmark(est, data) for k, est in [('pca', pca),
                                                               ('ipca', ipca),
                                                               ('rpca', rpca)]}

        for k in sorted(results_dict.keys()):
            all_times[k].append(results_dict[k]['time'])
            all_errors[k].append(results_dict[k]['error'])

    plot_feature_times(all_times, batch_size, all_features, data)
    plot_feature_errors(all_errors, batch_size, all_features, data)
Project: machine-learning-nanodegree-program-capstone    Author: harrylippy    | Project source | File source
from sys import stdout

import matplotlib.pyplot as plt
from sklearn.decomposition import RandomizedPCA


def pca_analysis(self):
    # Method extracted from a larger class; self._use_pca, self._pca, etc.
    # are attributes set elsewhere in the project.
    if not self._use_pca:
        return

    print("done.\n + Using PCA to analyze the data...", end='')
    stdout.flush()

    cols = self._get_columns()

    (X_train, _) = self._train_data
    if not self._pca:
        self._pca = RandomizedPCA(
                        n_components=self._pca_max_n,
                        whiten=True,
                        random_state=42)
        self._pca.fit(X_train)

    # NOTE:  plot code stolen from sklearn example: http://bit.ly/1X8ZsUw
    fig = plt.figure(self._fig_count, figsize=(4, 3))
    plt.clf()
    plt.axes([.2, .2, .7, .7])
    plt.plot(self._pca.explained_variance_ratio_)
    fig.suptitle('RandomizedPCA Analysis')
    plt.axis('tight')
    plt.xlabel('Component')
    plt.ylabel('Explained Variance Ratio')
    plt.show()
    self._fig_count += 1

    # Reset the PCA object, since we will need to set the exact number
    # of components we want to use if and when we use it again
    self._pca = None
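A plot like the one above is typically used to pick n_components: choose the smallest count whose cumulative explained variance crosses a threshold. A minimal sketch of that selection (the helper name and the 0.95 threshold are illustrative assumptions, not part of the original project):

import numpy as np

def choose_n_components(explained_variance_ratio, threshold=0.95):
    # Smallest number of components whose cumulative variance >= threshold.
    cumulative = np.cumsum(explained_variance_ratio)
    return int(np.searchsorted(cumulative, threshold) + 1)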
Project: Parallel-SGD    Author: angadgill    | Project source | File source
import matplotlib.pyplot as plt


def plot_feature_times(all_times, batch_size, all_components, data):
    plt.figure()
    # plot_results() is a plotting helper defined elsewhere in the project.
    plot_results(all_components, all_times['pca'], label="PCA")
    plot_results(all_components, all_times['ipca'],
                 label="IncrementalPCA, bsize=%i" % batch_size)
    plot_results(all_components, all_times['rpca'], label="RandomizedPCA")
    plt.legend(loc="upper left")
    plt.suptitle("Algorithm runtime vs. n_components\n"
                 "LFW, size %i x %i" % data.shape)
    plt.xlabel("Number of components (out of max %i)" % data.shape[1])
    plt.ylabel("Time (seconds)")
Project: Parallel-SGD    Author: angadgill    | Project source | File source
import matplotlib.pyplot as plt


def plot_feature_errors(all_errors, batch_size, all_components, data):
    plt.figure()
    plot_results(all_components, all_errors['pca'], label="PCA")
    plot_results(all_components, all_errors['ipca'],
                 label="IncrementalPCA, bsize=%i" % batch_size)
    plot_results(all_components, all_errors['rpca'], label="RandomizedPCA")
    plt.legend(loc="lower left")
    plt.suptitle("Algorithm error vs. n_components\n"
                 "LFW, size %i x %i" % data.shape)
    plt.xlabel("Number of components (out of max %i)" % data.shape[1])
    plt.ylabel("Mean absolute error")
Project: Parallel-SGD    Author: angadgill    | Project source | File source
import matplotlib.pyplot as plt


def plot_batch_times(all_times, n_features, all_batch_sizes, data):
    plt.figure()
    plot_results(all_batch_sizes, all_times['pca'], label="PCA")
    plot_results(all_batch_sizes, all_times['rpca'], label="RandomizedPCA")
    plot_results(all_batch_sizes, all_times['ipca'], label="IncrementalPCA")
    plt.legend(loc="lower left")
    plt.suptitle("Algorithm runtime vs. batch_size for n_components %i\n"
                 "LFW, size %i x %i"
                 % (n_features, data.shape[0], data.shape[1]))
    plt.xlabel("Batch size")
    plt.ylabel("Time (seconds)")
Project: Parallel-SGD    Author: angadgill    | Project source | File source
from collections import defaultdict

import numpy as np
from sklearn.decomposition import PCA, IncrementalPCA, RandomizedPCA


def variable_batch_size_comparison(data):
    # Sweep batch sizes from ~10% of the sample count up to the full count.
    batch_sizes = [i.astype(int) for i in np.linspace(data.shape[0] // 10,
                                                      data.shape[0], num=10)]

    for n_components in [i.astype(int) for i in
                         np.linspace(data.shape[1] // 10,
                                     data.shape[1], num=4)]:
        all_times = defaultdict(list)
        all_errors = defaultdict(list)
        pca = PCA(n_components=n_components)
        rpca = RandomizedPCA(n_components=n_components, random_state=1999)
        # benchmark() is a timing/error helper defined elsewhere in the project.
        results_dict = {k: benchmark(est, data) for k, est in [('pca', pca),
                                                               ('rpca', rpca)]}

        # Create flat baselines to compare the variation over batch size
        all_times['pca'].extend([results_dict['pca']['time']] *
                                len(batch_sizes))
        all_errors['pca'].extend([results_dict['pca']['error']] *
                                 len(batch_sizes))
        all_times['rpca'].extend([results_dict['rpca']['time']] *
                                 len(batch_sizes))
        all_errors['rpca'].extend([results_dict['rpca']['error']] *
                                  len(batch_sizes))
        for batch_size in batch_sizes:
            ipca = IncrementalPCA(n_components=n_components,
                                  batch_size=batch_size)
            results_dict = {k: benchmark(est, data)
                            for k, est in [('ipca', ipca)]}
            all_times['ipca'].append(results_dict['ipca']['time'])
            all_errors['ipca'].append(results_dict['ipca']['error'])

        plot_batch_times(all_times, n_components, batch_sizes, data)
        # RandomizedPCA error is always worse (approx 100x) than other PCA
        # tests
        plot_batch_errors(all_errors, n_components, batch_sizes, data)
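Version note: RandomizedPCA was deprecated in scikit-learn 0.18 and removed in 0.20; its randomized solver now lives inside the PCA class. A minimal migration sketch (toy data, illustrative only), equivalent to the RandomizedPCA calls above:

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(1999).rand(100, 50)  # toy data: 100 samples, 50 features

# PCA(svd_solver='randomized') is the modern replacement for RandomizedPCA.
pca = PCA(n_components=2, svd_solver='randomized', random_state=1999)
X_reduced = pca.fit_transform(X)
print(X_reduced.shape)  # (100, 2)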