Python sklearn.datasets module: load_files() example source code

We extracted the following 12 code examples from open-source Python projects to illustrate how to use sklearn.datasets.load_files().
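For reference, load_files() treats each subfolder of a container directory as one class and returns a Bunch whose fields include data, target, target_names, filenames and DESCR. A minimal sketch of a call (the corpus path is a placeholder):

from sklearn.datasets import load_files

bunch = load_files('path/to/corpus', encoding='utf-8')  # placeholder path
print(bunch.target_names)   # subfolder names, used as class labels
print(len(bunch.data))      # decoded file contents, one string per file
print(bunch.target[:5])     # integer label per document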

Project: Chinese_text_classifier    Author: swordLong    | Project source | File source
from sklearn import datasets


def load_files(path, encoding='gbk'):
    """
    Load a corpus whose directory structure looks like the following:
        container_folder/
            category_1_folder/
                file_1.txt
                file_2.txt
                ...
                file_42.txt
            category_2_folder/
                file_43.txt
                file_44.txt
    :param path: path of the container folder
    :param encoding: text encoding of the files (GBK by default)
    :return: Bunch object
    """
    return datasets.load_files(path, encoding=encoding, decode_error='ignore', shuffle=False)
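A hypothetical call against a GBK-encoded corpus (the path is illustrative); note that decode_error='ignore' silently drops any bytes that cannot be decoded:

bunch = load_files('./gbk_corpus')   # the wrapper above, not sklearn's load_files
print(bunch.target_names)            # one entry per category folder
print(bunch.data[0][:50])            # decoded text of the first (unshuffled) file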
Project: nlp-chinese_text_classification    Author: iamiamn    | Project source | File source
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


def getDatas(dataset_dir_name):
    movie_reviews = load_files(dataset_dir_name)

    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(
        movie_reviews.data, movie_reviews.target, test_size=0.2, random_state=0)

    # Build a binary CountVectorizer; its default tokenizer splits on word
    # boundaries, so the Chinese documents are segmented separately below.
    vectorizer = CountVectorizer(binary=True, decode_error='ignore')
    word_tokenizer = vectorizer.build_tokenizer()

    # Segment each document into a list of Chinese terms; getChList is the
    # project's own segmentation helper (not shown in this snippet).
    doc_terms_list_train = [getChList(doc_str) for doc_str in doc_str_list_train]
    doc_terms_list_test = [getChList(doc_str) for doc_str in doc_str_list_test]

    return (vectorizer, doc_str_list_train, doc_str_list_test,
            doc_class_list_train, doc_class_list_test,
            doc_terms_list_train, doc_terms_list_test)
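Downstream, the segmented term lists are typically rejoined with spaces and fed to the returned vectorizer; a sketch, assuming getChList yields a list of tokens per document:

train_docs = [' '.join(terms) for terms in doc_terms_list_train]
X_train = vectorizer.fit_transform(train_docs)  # shape: (n_train_docs, vocabulary_size)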
Project: text-classification    Author: cahya-wirawan    | Project source | File source
def get_datasets_localdata(container_path=None, categories=None, load_content=True,
                           encoding='utf-8', shuffle=True, random_state=42):
    """
    Load text files with categories as subfolder names.
    Individual samples are assumed to be files stored in a two-level folder structure.
    :param container_path: the path of the container folder
    :param categories: list of classes to load; all classes are loaded by default (if empty or omitted)
    :param load_content: whether to load the file contents into memory
    :param encoding: text encoding of the files
    :param shuffle: whether or not to shuffle the dataset
    :param random_state: integer seed used to shuffle the dataset
    :return: data and labels of the dataset
    """
    datasets = load_files(container_path=container_path, categories=categories,
                          load_content=load_content, shuffle=shuffle, encoding=encoding,
                          random_state=random_state)
    return datasets
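A hypothetical call (the path is a placeholder):

datasets = get_datasets_localdata(container_path='data/corpus')
print(datasets.target_names)             # category folder names
print(len(datasets.data), 'documents')   # loaded file contents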
Project: opentc    Author: cahya-wirawan    | Project source | File source
def get_datasets_localdata(container_path=None, categories=None, load_content=True,
                           encoding='utf-8', shuffle=True, random_state=42):
    """
    Load text files with categories as subfolder names.
    Individual samples are assumed to be files stored in a two-level folder structure.
    :param container_path: the path of the container folder
    :param categories: list of classes to load; all classes are loaded by default (if empty or omitted)
    :param load_content: whether to load the file contents into memory
    :param encoding: text encoding of the files
    :param shuffle: whether or not to shuffle the dataset
    :param random_state: integer seed used to shuffle the dataset
    :return: data and labels of the dataset
    """
    datasets = load_files(container_path=container_path, categories=categories,
                          load_content=load_content, shuffle=shuffle, encoding=encoding,
                          random_state=random_state)
    return datasets
Project: opentc    Author: cahya-wirawan    | Project source | File source
def __init__(self, cfg=None):
        """
        Load text files with categories as subfolder names.
        Individual samples are assumed to be files stored a two levels folder structure.
        :param container_path: The path of the container
        :param categories: List of classes to choose, all classes are chosen by default (if empty or omitted)
        :param shuffle: shuffle the list or not
        :param random_state: seed integer to shuffle the dataset
        :return: data and labels of the dataset
        """
        super().__init__()
        self.__dataset__ = load_files(container_path=cfg['container_path'], categories=cfg['categories'],
                                      load_content=cfg['load_content'], shuffle=cfg['shuffle'],
                                      encoding=cfg['encoding'], random_state=cfg['random_state'])
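An illustrative configuration for this constructor; the values are placeholders, and the class name below is hypothetical since the snippet omits it:

cfg = {
    'container_path': 'data/corpus',  # placeholder corpus path
    'categories': None,               # None loads every subfolder
    'load_content': True,
    'shuffle': True,
    'encoding': 'utf-8',
    'random_state': 42,
}
dataset = LocalDataset(cfg)           # hypothetical class name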
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_default_empty_load_files():
    res = load_files(LOAD_FILES_ROOT)
    assert_equal(len(res.filenames), 0)
    assert_equal(len(res.target_names), 0)
    assert_equal(res.DESCR, None)
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_default_load_files():
    res = load_files(LOAD_FILES_ROOT)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.data, [b("Hello World!\n")])
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_load_files_w_categories_desc_and_encoding():
    category = os.path.abspath(TEST_CATEGORY_DIR1).split('/').pop()
    res = load_files(LOAD_FILES_ROOT, description="test",
                     categories=category, encoding="utf-8")
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 1)
    assert_equal(res.DESCR, "test")
    assert_equal(res.data, [u("Hello World!\n")])
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_load_files_wo_load_content():
    res = load_files(LOAD_FILES_ROOT, load_content=False)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.get('data'), None)
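These tests come from scikit-learn's own test suite, where LOAD_FILES_ROOT is a temporary directory built by test fixtures. A rough reconstruction of the layout that test_default_load_files expects (directory and file names are illustrative):

import os
import tempfile
from sklearn.datasets import load_files

# Two category folders, only one of which contains a file: load_files then
# reports two target_names but a single filename, matching the assertions above.
root = tempfile.mkdtemp()
os.mkdir(os.path.join(root, 'category_1'))
os.mkdir(os.path.join(root, 'category_2'))
with open(os.path.join(root, 'category_1', 'sample.txt'), 'wb') as f:
    f.write(b'Hello World!\n')

res = load_files(root)
assert len(res.filenames) == 1 and len(res.target_names) == 2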
Project: text-classification    Author: cahya-wirawan    | Project source | File source
def __init__(self, cfg=None):
        """
        Load text files with categories as subfolder names.
        Individual samples are assumed to be files stored a two levels folder structure.
        :param container_path: The path of the container
        :param categories: List of classes to choose, all classes are chosen by default (if empty or omitted)
        :param shuffle: shuffle the list or not
        :param random_state: seed integer to shuffle the dataset
        :return: data and labels of the dataset
        """
        super().__init__()
        self.__dataset__ = load_files(container_path=cfg['container_path'], categories=cfg['categories'],
                                      load_content=cfg['load_content'], shuffle=cfg['shuffle'],
                                      encoding=cfg['encoding'], random_state=cfg['random_state'])
Project: faceNet_RealTime    Author: jack55436001    | Project source | File source
import os
import time

import numpy as np
import tensorflow as tf
from six.moves import xrange  # Python 2/3 compatibility
from sklearn.datasets import load_files

import facenet


def main(args):

    with tf.Graph().as_default():

        with tf.Session() as sess:

            # create output directory if it doesn't exist
            output_dir = os.path.expanduser(args.output_dir)
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)

            # load the model
            print("Loading trained model...\n")
            meta_file, ckpt_file = facenet.get_model_filenames(os.path.expanduser(args.trained_model_dir))
            facenet.load_model(args.trained_model_dir, meta_file, ckpt_file)

            # grab all image paths and labels
            print("Finding image paths and targets...\n")
            data = load_files(args.data_dir, load_content=False, shuffle=False)
            labels_array = data['target']
            paths = data['filenames']

            # Get input and output tensors
            images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
            embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")
            phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0")

            image_size = images_placeholder.get_shape()[1]
            embedding_size = embeddings.get_shape()[1]

            # Run forward pass to calculate embeddings
            print('Generating embeddings from images...\n')
            start_time = time.time()
            batch_size = args.batch_size
            nrof_images = len(paths)
            nrof_batches = int(np.ceil(1.0*nrof_images / batch_size))
            emb_array = np.zeros((nrof_images, embedding_size))
            for i in xrange(nrof_batches):
                start_index = i*batch_size
                end_index = min((i+1)*batch_size, nrof_images)
                paths_batch = paths[start_index:end_index]
                images = facenet.load_data(paths_batch, do_random_crop=False, do_random_flip=False, image_size=image_size, do_prewhiten=True)
                feed_dict = {images_placeholder: images, phase_train_placeholder: False}
                emb_array[start_index:end_index, :] = sess.run(embeddings, feed_dict=feed_dict)

            time_avg_forward_pass = (time.time() - start_time) / float(nrof_images)
            print("Forward pass took avg of %.3f[seconds/image] for %d images\n" % (time_avg_forward_pass, nrof_images))

            print("Finally saving embeddings and gallery to: %s" % (output_dir))
            # save the gallery and embeddings (signatures) as numpy arrays to disk
            np.save(os.path.join(output_dir, "gallery.npy"), labels_array)
            np.save(os.path.join(output_dir, "signatures.npy"), emb_array)
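The saved arrays can later be reloaded with np.load; a minimal sketch, assuming the working directory is the output_dir used above:

import numpy as np

labels = np.load('gallery.npy')          # integer class label per image
signatures = np.load('signatures.npy')   # (n_images, embedding_size) embeddings
print(labels.shape, signatures.shape)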
Project: facenet    Author: davidsandberg    | Project source | File source
import os
import time

import numpy as np
import tensorflow as tf
from six.moves import xrange  # Python 2/3 compatibility
from sklearn.datasets import load_files

import facenet


def main(args):

    with tf.Graph().as_default():

        with tf.Session() as sess:

            # create output directory if it doesn't exist
            output_dir = os.path.expanduser(args.output_dir)
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)

            # load the model
            print("Loading trained model...\n")
            meta_file, ckpt_file = facenet.get_model_filenames(os.path.expanduser(args.trained_model_dir))
            facenet.load_model(args.trained_model_dir, meta_file, ckpt_file)

            # grab all image paths and labels
            print("Finding image paths and targets...\n")
            data = load_files(args.data_dir, load_content=False, shuffle=False)
            labels_array = data['target']
            paths = data['filenames']

            # Get input and output tensors
            images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
            embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")
            phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0")

            image_size = images_placeholder.get_shape()[1]
            embedding_size = embeddings.get_shape()[1]

            # Run forward pass to calculate embeddings
            print('Generating embeddings from images...\n')
            start_time = time.time()
            batch_size = args.batch_size
            nrof_images = len(paths)
            nrof_batches = int(np.ceil(1.0*nrof_images / batch_size))
            emb_array = np.zeros((nrof_images, embedding_size))
            for i in xrange(nrof_batches):
                start_index = i*batch_size
                end_index = min((i+1)*batch_size, nrof_images)
                paths_batch = paths[start_index:end_index]
                images = facenet.load_data(paths_batch, do_random_crop=False, do_random_flip=False, image_size=image_size, do_prewhiten=True)
                feed_dict = {images_placeholder: images, phase_train_placeholder: False}
                emb_array[start_index:end_index, :] = sess.run(embeddings, feed_dict=feed_dict)

            time_avg_forward_pass = (time.time() - start_time) / float(nrof_images)
            print("Forward pass took avg of %.3f[seconds/image] for %d images\n" % (time_avg_forward_pass, nrof_images))

            print("Finally saving embeddings and gallery to: %s" % (output_dir))
            # save the gallery and embeddings (signatures) as numpy arrays to disk
            np.save(os.path.join(output_dir, "gallery.npy"), labels_array)
            np.save(os.path.join(output_dir, "signatures.npy"), emb_array)