Python sklearn.datasets 模块实例源码

我们从Python开源项目中,提取了以下9个代码示例,用于说明如何使用 sklearn.datasets 模块。

项目:scanpy    作者:theislab    | 项目源码 | 文件源码
def burczynski06():
    """Bulk data with conditions ulcerative colitis (UC) and Crohn's disease (CD).

    Transcriptional profiles of peripheral blood mononuclear cells from
    42 healthy individuals, 59 CD patients, and 26 UC patients, measured
    by hybridization to microarrays interrogating more than 22,000
    sequences.

    Reference
    ---------
    Burczynski et al., "Molecular classification of Crohn's disease and
    ulcerative colitis patients using transcriptional profiles in peripheral
    blood mononuclear cells"
    J Mol Diagn 8, 51 (2006). PMID:16436634.
    """
    # Read from the local cache; on a miss, download from GEO and cache it.
    path = 'data/burczynski06/GDS1615_full.soft.gz'
    mirror = 'ftp://ftp.ncbi.nlm.nih.gov/geo/datasets/GDS1nnn/GDS1615/soft/GDS1615_full.soft.gz'
    return sc.read(path, backup_url=mirror, cache=True)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_download():
    """Test that fetch_mldata is able to download and cache a data set."""
    saved_urlopen = datasets.mldata.urlopen
    payload = {
        'mock': {
            'label': sp.ones((150,)),
            'data': sp.ones((150, 4)),
        },
    }
    datasets.mldata.urlopen = mock_mldata_urlopen(payload)
    try:
        dset = fetch_mldata('mock', data_home=tmpdir)
        for field in ("COL_NAMES", "DESCR", "target", "data"):
            assert_in(field, dset)

        assert_equal(dset.target.shape, (150,))
        assert_equal(dset.data.shape, (150, 4))

        # a name the mock does not serve must surface as an HTTPError
        assert_raises(datasets.mldata.HTTPError,
                      fetch_mldata, 'not_existing_name')
    finally:
        # restore the real urlopen even when an assertion fails
        datasets.mldata.urlopen = saved_urlopen
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_fetch_one_column():
    """A single-column data set yields 'data' but no 'target'."""
    saved_urlopen = datasets.mldata.urlopen
    try:
        dataname = 'onecol'
        # plant a fake data set in the cache
        fake = sp.arange(6).reshape(2, 3)
        datasets.mldata.urlopen = mock_mldata_urlopen({dataname: {'x': fake}})

        dset = fetch_mldata(dataname, data_home=tmpdir)
        for field in ("COL_NAMES", "DESCR", "data"):
            assert_in(field, dset)
        assert_not_in("target", dset)

        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, fake)

        # with transpose_data=False the array comes back transposed
        dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdir)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        # restore the real urlopen even when an assertion fails
        datasets.mldata.urlopen = saved_urlopen
项目:amle    作者:elibol    | 项目源码 | 文件源码
def get_dataset(did):
    """Fetch the dataset with id *did* and return (X, y, categorical).

    The third element flags which columns are categorical.
    """
    ds = datasets.get_dataset(did)
    # get_data already returns the (X, y, categorical) triple
    return ds.get_data(target=ds.default_target_attribute,
                       return_categorical_indicator=True)
项目:amle    作者:elibol    | 项目源码 | 文件源码
def run_tests(automl, num_classes, X_train, y_train, X_test, y_test):
    """Evaluate a fitted AutoML model and print AUC scores on both splits.

    Prints our own ``roc_auc_score`` for the train and test sets, then —
    for comparison — the ``auc_metric`` from ``classification_metrics``,
    which reports a Gini index (2*AUC - 1) and expects one-hot targets.
    Exceptions from the comparison metric are printed, not raised, so a
    failure there does not abort the run.
    """
    print(model_stats(automl))

    # Fixed: this and the except-clause below were Python 2 print
    # statements, a syntax error under Python 3 and inconsistent with the
    # print() calls used everywhere else in this function.
    print("num_classes", num_classes)

    train_predictions = automl.predict(X_train)
    train_auc = roc_auc_score(y_train, train_predictions, num_classes=num_classes)
    print("train data auc score", train_auc)

    test_predictions = automl.predict(X_test)
    test_auc = roc_auc_score(y_test, test_predictions, num_classes=num_classes)
    print("test data auc score", test_auc)

    # they expect a one-hot encoding
    try:
        test_predictions = to_matrix(test_predictions, num_classes=num_classes)
        if num_classes == 2:
            their_auc = classification_metrics.auc_metric(y_test, test_predictions, task=BINARY_CLASSIFICATION)
        else:
            their_auc = classification_metrics.auc_metric(y_test, test_predictions, task=MULTICLASS_CLASSIFICATION)

        # they compute Gini index
        # 2*AUC-1
        # e.g. 2*0.8-1 = 0.6
        # verified for binary and multiclass datasets.
        print("their test data auc score (2*auc-1)", their_auc)
        print("their test data auc score (reverted from Gini index)", (their_auc + 1) / 2)
    except Exception as e:
        # best-effort: report the comparison failure without aborting
        print(e)


# use this for stats
项目:amle    作者:elibol    | 项目源码 | 文件源码
def test_dataset(self):
        """Run the pipeline on sklearn's digits dataset via run_dataset."""
        self.working_dir = "digits_test"
        dataset_name = "digits"
        bunch = sklearn.datasets.load_digits()
        return self.run_dataset(dataset_name, bunch.data, bunch.target)
项目:scanpy    作者:theislab    | 项目源码 | 文件源码
def blobs(n_centers=5, cluster_std=1.0, n_samples=640):
    """Gaussian Blobs.

    Parameters
    ----------
    n_centers : `int`, optional (default: 5)
        Number of cluster centers.
    cluster_std : `float`, optional (default: 1.0)
        Standard deviation of clusters.
    n_samples : `int`, optional (default: 640)
        Number of samples. By default, this is the same sample number as in
        ``sc.examples.krumsiek11()``.

    Returns
    -------
    adata : :class:`~scanpy.api.AnnData`
        Annotated data matrix containing a sample annotation 'blobs' that
        indicates cluster identity.
    """
    import sklearn.datasets
    # fixed random_state so repeated calls yield identical data
    data, labels = sklearn.datasets.make_blobs(
        n_samples=n_samples,
        n_features=11,
        centers=n_centers,
        cluster_std=cluster_std,
        random_state=0,
    )
    # store cluster identity as a string-valued sample annotation
    return sc.AnnData(data, smp={'blobs': labels.astype(str)})
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_mldata_filename():
    """mldata_filename should slugify dataset names (lowercase, no punctuation)."""
    expected = {
        'datasets-UCI iris': 'datasets-uci-iris',
        'news20.binary': 'news20binary',
        'book-crossing-ratings-1.0': 'book-crossing-ratings-10',
        'Nile Water Level': 'nile-water-level',
        'MNIST (original)': 'mnist-original',
    }
    for raw, slug in expected.items():
        assert_equal(mldata_filename(raw), slug)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def setup_module():
    """Install a mock urllib2 so the tests never download from mldata.org."""
    mock_datasets = {
        'mnist-original': {
            'data': np.empty((70000, 784)),
            # 7000 copies of each digit label 0.0 .. 9.0
            'label': np.repeat(np.arange(10, dtype='d'), 7000),
        },
        'iris': {
            'data': np.empty((150, 4)),
        },
        'datasets-uci-iris': {
            'double0': np.empty((150, 4)),
            'class': np.empty((150,)),
        },
    }
    install_mldata_mock(mock_datasets)