Python sklearn.datasets module: get_data_home() code examples

The following 10 code examples, extracted from open-source Python projects, show how to use sklearn.datasets.get_data_home().
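
Before the project examples, here is a minimal usage sketch of the function itself: get_data_home() returns the path of the scikit-learn data cache and creates the directory if it does not exist; the location defaults to ~/scikit_learn_data, can be overridden with the SCIKIT_LEARN_DATA environment variable, and can also be passed explicitly via the data_home argument. The custom path used below is only an illustrative assumption.

import os
from sklearn.datasets import clear_data_home, get_data_home

data_home = get_data_home()          # default cache: ~/scikit_learn_data (or $SCIKIT_LEARN_DATA)
print(os.path.exists(data_home))     # True: the directory is created on first use

# Override the cache location explicitly (the path below is a made-up example).
custom = get_data_home(data_home="/tmp/sklearn_cache_demo")
clear_data_home(data_home=custom)    # remove the cache directory and everything in it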

Project: dask-ml    Author: dask
import os
import string

import pandas as pd
import dask.dataframe as dd
from sklearn.datasets import get_data_home


def split(p):
    # Convert the raw KDD Cup CSV into a categorical-encoded Parquet dataset under the scikit-learn data home.
    output = os.path.join(get_data_home(), "kddcup.parq")
    if not os.path.exists(output):

        dtype = {
            1: 'category',
            2: 'category',
            3: 'category',
            41: 'category',
        }

        df = pd.read_csv(p, header=None, dtype=dtype)
        cat_cols = df.select_dtypes(include=['category']).columns
        df[cat_cols] = df[cat_cols].apply(lambda col: col.cat.codes)
        df.columns = list(string.ascii_letters[:len(df.columns)])

        ddf = dd.from_pandas(df, npartitions=16)
        ddf.to_parquet(output)

    return output
Project: pylmnn    Author: johny-c
import csv
import os

import numpy as np
from sklearn.datasets import get_data_home


def fetch_load_letters(data_dir=None):
    # Download the UCI letter-recognition data into the scikit-learn data home if needed, then load it as (X, y).
    path = os.path.join(get_data_home(data_dir), 'letter-recognition.data')

    if not os.path.exists(path):
        from urllib import request
        url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data'
        print('Downloading letter-recognition dataset from {}...'.format(url))
        request.urlretrieve(url=url, filename=path)
    else:
        print('Found letter-recognition in {}!'.format(path))

    X, y = [], []
    with open(path) as f:
        reader = csv.reader(f)
        for row in reader:
            y.append(row[0])
            X.append(row[1:])
    labels, label_idx = np.unique(y, return_inverse=True)
    return np.asarray(X, dtype=float), label_idx
Project: dask-ml    Author: dask
import os

import requests
from sklearn.datasets import get_data_home

# URL is a module-level constant in the original project (the archive's download location).


def download():
    # Stream the archive into the scikit-learn data home unless it is already cached there.
    p = os.path.join(get_data_home(), "kddcup.data.gz")
    if os.path.exists(p):
        return p
    r = requests.get(URL, stream=True)
    with open(p, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return p
Project: Parallel-SGD    Author: angadgill
from sklearn.datasets import clear_data_home, get_data_home

# DATA_HOME and the assert_* helpers come from the original test module.


def test_data_home():
    # get_data_home will point to a pre-existing folder
    data_home = get_data_home(data_home=DATA_HOME)
    assert_equal(data_home, DATA_HOME)
    assert_true(os.path.exists(data_home))

    # clear_data_home will delete both the content and the folder it-self
    clear_data_home(data_home=data_home)
    assert_false(os.path.exists(data_home))

    # if the folder is missing it will be created again
    data_home = get_data_home(data_home=DATA_HOME)
    assert_true(os.path.exists(data_home))
Project: Parallel-SGD    Author: angadgill
from os.path import exists, join
from sklearn.datasets import get_data_home


def setup_module(module):
    data_home = get_data_home()
    if not exists(join(data_home, '20news_home')):
        raise SkipTest("Skipping dataset loading doctests")
Project: Parallel-SGD    Author: angadgill
from os.path import exists, join
from sklearn.datasets import get_data_home


def setup_module(module):
    data_home = get_data_home()
    if not exists(join(data_home, 'lfw_home')):
        raise SkipTest("Skipping dataset loading doctests")
Project: Parallel-SGD    Author: angadgill
import os
from sklearn.datasets import get_data_home

# check_skip_network and SkipTest come from the original project's test utilities.


def setup_module():
    check_skip_network()

    # skip the test in rcv1.rst if the dataset is not already loaded
    rcv1_dir = os.path.join(get_data_home(), "RCV1")
    if not os.path.exists(rcv1_dir):
        raise SkipTest("Download RCV1 dataset to run this test.")
Project: ShallowLearn    Author: giacbrd
import os
import tarfile
import urllib.request
from glob import glob

from sklearn.datasets import get_data_home

# ReutersParser and _not_in_sphinx are defined elsewhere in the original script.


def stream_reuters_documents(data_path=None):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.

    """

    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        # Download the dataset.
        print("downloading dataset (once and for all) into %s" %
              data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                   reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc


Project: pylmnn    Author: johny-c
import csv
import os

import numpy as np
from sklearn.datasets import get_data_home

# decompress_z is a helper defined elsewhere in the original project.


def fetch_load_isolet(data_dir=None):
    # Download and load the UCI ISOLET train/test splits, caching them in the scikit-learn data home.
    train = 'isolet1+2+3+4.data.Z'
    test = 'isolet5.data.Z'
    path_train = os.path.join(get_data_home(data_dir), train)
    path_test  = os.path.join(get_data_home(data_dir), test)

    if not os.path.exists(path_train[:-2]) or not os.path.exists(path_test[:-2]):
        from urllib import request
        url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/isolet/'
        if not os.path.exists(path_train[:-2]):
            if not os.path.exists(path_train):
                print('Downloading Isolated Letter Speech Recognition data set from {}...'.format(
                    url))
                request.urlretrieve(url=url+train, filename=path_train)
            # os.system('gzip -d ' + path_train)
            decompress_z(path_train)
        if not os.path.exists(path_test[:-2]):
            if not os.path.exists(path_test):
                print('Downloading Isolated Letter Speech Recognition data set from {}...'.format(
                    url))
                request.urlretrieve(url=url+test, filename=path_test)
            # os.system('gzip -d ' + path_test)
            decompress_z(path_test)
    else:
        print('Found Isolated Letter Speech Recognition data set!')

    xtr, ytr = [], []
    with open(path_train[:-2]) as f:
        reader = csv.reader(f)
        for row in reader:
            xtr.append(row[:-1])
            ytr.append(int(float(row[-1])))
    labels, ytr = np.unique(ytr, return_inverse=True)
    xte, yte = [], []
    with open(path_test[:-2]) as f:
        reader = csv.reader(f)
        for row in reader:
            xte.append(row[:-1])
            yte.append(int(float(row[-1])))
    labels, yte = np.unique(yte, return_inverse=True)

    return np.asarray(xtr, dtype=float), np.asarray(xte, dtype=float), ytr, yte
Project: Parallel-SGD    Author: angadgill
import os
import tarfile
import urllib.request
from glob import glob

from sklearn.datasets import get_data_home

# ReutersParser and _not_in_sphinx are defined elsewhere in the original script.


def stream_reuters_documents(data_path=None):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.

    """

    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        # Download the dataset.
        print("downloading dataset (once and for all) into %s" %
              data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                   reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc

