我们从Python开源项目中,提取了以下9个代码示例,用于说明如何使用sklearn.datasets()。
def burczynski06():
    """Bulk data with conditions ulcerative colitis (UC) and Crohn's disease (CD).

    The study assesses transcriptional profiles in peripheral blood mononuclear
    cells from 42 healthy individuals, 59 CD patients, and 26 UC patients by
    hybridization to microarrays interrogating more than 22,000 sequences.

    Reference
    ---------
    Burczynski et al., "Molecular classification of Crohn's disease and
    ulcerative colitis patients using transcriptional profiles in peripheral
    blood mononuclear cells"
    J Mol Diagn 8, 51 (2006). PMID:16436634.
    """
    # Read from the local cache; fall back to the NCBI GEO FTP mirror.
    path = 'data/burczynski06/GDS1615_full.soft.gz'
    backup = 'ftp://ftp.ncbi.nlm.nih.gov/geo/datasets/GDS1nnn/GDS1615/soft/GDS1615_full.soft.gz'
    return sc.read(path, backup_url=backup, cache=True)
def test_download():
    """Test that fetch_mldata is able to download and cache a data set."""
    saved_urlopen = datasets.mldata.urlopen
    fake_server = {
        'mock': {
            'label': sp.ones((150,)),
            'data': sp.ones((150, 4)),
        },
    }
    datasets.mldata.urlopen = mock_mldata_urlopen(fake_server)
    try:
        mock = fetch_mldata('mock', data_home=tmpdir)
        # The returned bunch must expose the standard mldata fields.
        for key in ["COL_NAMES", "DESCR", "target", "data"]:
            assert_in(key, mock)
        assert_equal(mock.target.shape, (150,))
        assert_equal(mock.data.shape, (150, 4))
        # An unknown dataset name must surface the HTTP error.
        assert_raises(datasets.mldata.HTTPError,
                      fetch_mldata, 'not_existing_name')
    finally:
        # Always restore the real urlopen, even if an assertion fails.
        datasets.mldata.urlopen = saved_urlopen
def test_fetch_one_column():
    """A single-column mldata set yields 'data' but no 'target'."""
    saved_urlopen = datasets.mldata.urlopen
    try:
        name = 'onecol'
        # Create a fake data set in the cache.
        fake = sp.arange(6).reshape(2, 3)
        datasets.mldata.urlopen = mock_mldata_urlopen({name: {'x': fake}})

        dset = fetch_mldata(name, data_home=tmpdir)
        for key in ["COL_NAMES", "DESCR", "data"]:
            assert_in(key, dset)
        assert_not_in("target", dset)
        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, fake)

        # With transpose_data=False the array keeps its server-side shape.
        dset = fetch_mldata(name, transpose_data=False, data_home=tmpdir)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        # Restore the real urlopen regardless of test outcome.
        datasets.mldata.urlopen = saved_urlopen
def get_dataset(did):
    """Fetch dataset `did` and return (X, y, categorical-indicator).

    The target column is the dataset's declared default target attribute.
    """
    ds = datasets.get_dataset(did)
    target = ds.default_target_attribute
    X, y, categorical = ds.get_data(target=target,
                                    return_categorical_indicator=True)
    return X, y, categorical
def run_tests(automl, num_classes, X_train, y_train, X_test, y_test):
    """Print train/test AUC for a fitted automl model and compare with
    auto-sklearn's own auc_metric.

    Parameters
    ----------
    automl : fitted model exposing ``predict``
    num_classes : int, number of target classes (2 means binary)
    X_train, y_train, X_test, y_test : train/test splits

    Side effects: prints scores only; returns None.
    """
    print(model_stats(automl))
    # Fix: the original used Python 2 print statements here, which are a
    # SyntaxError on Python 3; use the print() function consistently.
    print("num_classes", num_classes)

    train_predictions = automl.predict(X_train)
    train_auc = roc_auc_score(y_train, train_predictions,
                              num_classes=num_classes)
    print("train data auc score", train_auc)

    test_predictions = automl.predict(X_test)
    test_auc = roc_auc_score(y_test, test_predictions,
                             num_classes=num_classes)
    print("test data auc score", test_auc)

    # auto-sklearn's metric expects a one-hot encoding of the predictions.
    try:
        test_predictions = to_matrix(test_predictions,
                                     num_classes=num_classes)
        if num_classes == 2:
            their_auc = classification_metrics.auc_metric(
                y_test, test_predictions, task=BINARY_CLASSIFICATION)
        else:
            their_auc = classification_metrics.auc_metric(
                y_test, test_predictions, task=MULTICLASS_CLASSIFICATION)
        # They compute the Gini index: 2*AUC - 1 (e.g. 2*0.8 - 1 = 0.6).
        # Verified for binary and multiclass datasets.
        print("their test data auc score (2*auc-1)", their_auc)
        print("their test data auc score (reverted from Gini index)",
              (their_auc + 1) / 2)
    except Exception as e:
        # Best-effort comparison: report the failure but do not abort.
        print(e)
def test_dataset(self):
    """Run the standard pipeline on sklearn's bundled digits data set."""
    self.working_dir = "digits_test"
    name = "digits"
    digits = sklearn.datasets.load_digits()
    # Hand the feature matrix and labels straight to the shared runner.
    return self.run_dataset(name, digits.data, digits.target)
def blobs(n_centers=5, cluster_std=1.0, n_samples=640):
    """Gaussian Blobs.

    Parameters
    ----------
    n_centers : `int`, optional (default: 5)
        Number of cluster centers.
    cluster_std : `float`, optional (default: 1.0)
        Standard deviation of clusters.
    n_samples : `int`, optional (default: 640)
        Number of samples. By default, this is the same sample number
        as in ``sc.examples.krumsiek11()``.

    Returns
    -------
    adata : :class:`~scanpy.api.AnnData`
        Annotated data matrix containing a sample annotation 'blobs'
        that indicates cluster identity.
    """
    import sklearn.datasets
    # Fixed random_state keeps the synthetic data reproducible.
    data, labels = sklearn.datasets.make_blobs(n_samples=n_samples,
                                               n_features=11,
                                               centers=n_centers,
                                               cluster_std=cluster_std,
                                               random_state=0)
    return sc.AnnData(data, smp={'blobs': labels.astype(str)})
def test_mldata_filename():
    """mldata_filename must slugify dataset names as documented."""
    expected = {
        'datasets-UCI iris': 'datasets-uci-iris',
        'news20.binary': 'news20binary',
        'book-crossing-ratings-1.0': 'book-crossing-ratings-10',
        'Nile Water Level': 'nile-water-level',
        'MNIST (original)': 'mnist-original',
    }
    for raw, slug in expected.items():
        assert_equal(mldata_filename(raw), slug)
def setup_module():
    """Install a mock urllib2 module so tests never download from mldata.org."""
    mock_datasets = {
        'mnist-original': {
            'data': np.empty((70000, 784)),
            'label': np.repeat(np.arange(10, dtype='d'), 7000),
        },
        'iris': {
            'data': np.empty((150, 4)),
        },
        'datasets-uci-iris': {
            'double0': np.empty((150, 4)),
            'class': np.empty((150,)),
        },
    }
    install_mldata_mock(mock_datasets)