Python sklearn.feature_extraction module: FeatureHasher() example source code

The following 17 code examples, extracted from open-source Python projects, illustrate how to use sklearn.feature_extraction.FeatureHasher().
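Before the project excerpts, here is a minimal, self-contained sketch of the hashing trick they all rely on; the feature names and the tiny n_features are made up for illustration:

from sklearn.feature_extraction import FeatureHasher

# hash dict-of-features input into a fixed-width sparse matrix
hasher = FeatureHasher(n_features=8)  # deliberately tiny, just for illustration
X = hasher.transform([{"dog": 1, "cat": 2}, {"dog": 2, "run": 5}])
print(X.shape)      # (2, 8) no matter how many distinct features appear
print(X.toarray())  # entries can be negative: FeatureHasher uses a signed hash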

Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_feature_hasher_strings():
    # mix byte and Unicode strings; note that "foo" is a duplicate in row 0
    raw_X = [["foo", "bar", "baz", "foo".encode("ascii")],
             ["bar".encode("ascii"), "baz", "quux"]]

    for lg_n_features in (7, 9, 11, 16, 22):
        n_features = 2 ** lg_n_features

        it = (x for x in raw_X)                 # iterable

        h = FeatureHasher(n_features, non_negative=True, input_type="string")
        X = h.transform(it)

        assert_equal(X.shape[0], len(raw_X))
        assert_equal(X.shape[1], n_features)

        assert_true(np.all(X.data > 0))
        assert_equal(X[0].sum(), 4)
        assert_equal(X[1].sum(), 3)

        assert_equal(X.nnz, 6)
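A compatibility note on the test above: FeatureHasher's non_negative parameter was deprecated in scikit-learn 0.19 and removed in 0.21. On a current install, a sketch of the equivalent check takes absolute values explicitly:

import numpy as np
from sklearn.feature_extraction import FeatureHasher

raw_X = [["foo", "bar", "baz", "foo"], ["bar", "baz", "quux"]]
h = FeatureHasher(n_features=2 ** 7, input_type="string")
X = h.transform(raw_X)
X.data = np.abs(X.data)  # emulates the removed non_negative=True behavior
assert np.all(X.data > 0)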
Project: fastxml    Author: Refefer    | Project source | File source
def __init__(self, verbose, min_label_count=1, inference=False):
        self.fh = FeatureHasher(dtype='float32')
        self.verbose = verbose
        self.inference = inference
        self.min_label_count = min_label_count
Project: gym-malware    Author: endgameinc    | Project source | File source
def __call__(self, binary):
        libraries = [l.lower() for l in binary.libraries]
        # we'll create a string like "kernel32.dll:CreateFileMappingA" for each entry
        imports = [lib.name.lower() + ':' +
                   e.name for lib in binary.imports for e in lib.entries]

        # two separate elements: libraries (alone) and fully-qualified names of imported functions
        return np.concatenate([
            FeatureHasher(256, input_type="string", dtype=self.dtype).transform(
                [libraries]).toarray(),
            FeatureHasher(1024, input_type="string", dtype=self.dtype).transform(
                [imports]).toarray()
        ], axis=-1).flatten().astype(self.dtype)
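The method above is the __call__ of a feature-extractor class whose definition is not shown in this excerpt. A hedged sketch of how it might be driven, assuming a hypothetical class name ImportsInfo (with self.dtype set, e.g. to np.float32) and a PE file parsed with lief:

import lief

extractor = ImportsInfo()          # hypothetical class name; the excerpt shows only __call__
binary = lief.parse("sample.exe")  # illustrative path
features = extractor(binary)       # 256 hashed library bins + 1024 hashed import bins
print(features.shape)              # (1280,)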
Project: gym-malware    Author: endgameinc    | Project source | File source
def __call__(self, binary):
        return (FeatureHasher(128, input_type="string", dtype=self.dtype)
                .transform([binary.exported_functions])
                .toarray().flatten().astype(self.dtype))
Project: gym-malware    Author: endgameinc    | Project source | File source
def __call__(self, binary):

        return np.concatenate([
            [[binary.header.time_date_stamps]],
            FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [[str(binary.header.machine)]]).toarray(),
            FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [[str(c) for c in binary.header.characteristics_list]]).toarray(),
            FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [[str(binary.optional_header.subsystem)]]).toarray(),
            FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [[str(c) for c in binary.optional_header.dll_characteristics_lists]]).toarray(),
            FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [[str(binary.optional_header.magic)]]).toarray(),
            [[binary.optional_header.major_image_version]],
            [[binary.optional_header.minor_image_version]],
            [[binary.optional_header.major_linker_version]],
            [[binary.optional_header.minor_linker_version]],
            [[binary.optional_header.major_operating_system_version]],
            [[binary.optional_header.minor_operating_system_version]],
            [[binary.optional_header.major_subsystem_version]],
            [[binary.optional_header.minor_subsystem_version]],
            [[binary.optional_header.sizeof_code]],
            [[binary.optional_header.sizeof_headers]],
            [[binary.optional_header.sizeof_heap_commit]],
        ], axis=-1).flatten().astype(self.dtype)
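In the header extractor above, each categorical field (machine type, characteristics, subsystem, DLL characteristics, magic) is hashed on its own into a small 10-bin vector, a fixed-size stand-in for one-hot encoding that needs no vocabulary. A minimal sketch of that pattern, with an invented field value:

from sklearn.feature_extraction import FeatureHasher

value = "MACHINE_TYPES.AMD64"  # illustrative; the code above uses str(binary.header.machine)
vec = FeatureHasher(10, input_type="string").transform([[value]]).toarray()
print(vec)  # a single +/-1 in one of 10 bins; collisions are possible but cheap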
Project: youarespecial    Author: endgameinc    | Project source | File source
def __call__(self, binary):
        libraries = [l.lower() for l in binary.libraries]
        # we'll create a string like "kernel32.dll:CreateFileMappingA" for each entry
        imports = [lib.name.lower() + ':' +
                   e.name for lib in binary.imports for e in lib.entries]

        # two separate elements: libraries (alone) and fully-qualified names of imported functions
        return np.concatenate([
            FeatureHasher(256, input_type="string", dtype=self.dtype).transform(
                [libraries]).toarray(),
            FeatureHasher(1024, input_type="string", dtype=self.dtype).transform(
                [imports]).toarray()
        ], axis=-1).flatten().astype(self.dtype)
Project: youarespecial    Author: endgameinc    | Project source | File source
def __call__(self, binary):
        return (FeatureHasher(128, input_type="string", dtype=self.dtype)
                .transform([binary.exported_functions])
                .toarray().flatten().astype(self.dtype))
Project: youarespecial    Author: endgameinc    | Project source | File source
def __call__(self, binary):

        return np.concatenate([
            [[binary.header.time_date_stamps]],
            FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [[str(binary.header.machine)]]).toarray(),
            FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [[str(c) for c in binary.header.characteristics_list]]).toarray(),
            FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [[str(binary.optional_header.subsystem)]]).toarray(),
            FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [[str(c) for c in binary.optional_header.dll_characteristics_lists]]).toarray(),
            FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
                [[str(binary.optional_header.magic)]]).toarray(),
            [[binary.optional_header.major_image_version]],
            [[binary.optional_header.minor_image_version]],
            [[binary.optional_header.major_linker_version]],
            [[binary.optional_header.minor_linker_version]],
            [[binary.optional_header.major_operating_system_version]],
            [[binary.optional_header.minor_operating_system_version]],
            [[binary.optional_header.major_subsystem_version]],
            [[binary.optional_header.minor_subsystem_version]],
            [[binary.optional_header.sizeof_code]],
            [[binary.optional_header.sizeof_headers]],
            [[binary.optional_header.sizeof_heap_commit]],
        ], axis=-1).flatten().astype(self.dtype)
Project: audit-log-detection    Author: twosixlabs    | Project source | File source
def __init__(self, *args, **kwargs):

        self.hasher = FeatureHasher(*args, **kwargs)
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_feature_hasher_dicts():
    h = FeatureHasher(n_features=16)
    assert_equal("dict", h.input_type)

    raw_X = [{"dada": 42, "tzara": 37}, {"gaga": 17}]
    X1 = FeatureHasher(n_features=16).transform(raw_X)
    gen = (iter(d.items()) for d in raw_X)
    X2 = FeatureHasher(n_features=16, input_type="pair").transform(gen)
    assert_array_equal(X1.toarray(), X2.toarray())
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_feature_hasher_pairs():
    raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": 2},
                                       {"baz": 3, "quux": 4, "foo": -1}])
    h = FeatureHasher(n_features=16, input_type="pair")
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = sorted(np.abs(x1[x1 != 0]))
    x2_nz = sorted(np.abs(x2[x2 != 0]))
    assert_equal([1, 2], x1_nz)
    assert_equal([1, 3, 4], x2_nz)
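The np.abs calls in this test are needed because FeatureHasher uses a signed hash: each value is multiplied by +1 or -1 depending on the feature's hash, which keeps collisions zero-mean. A small sketch of the effect:

import numpy as np
from sklearn.feature_extraction import FeatureHasher

h = FeatureHasher(n_features=16, input_type="pair")
x = h.transform([[("foo", 1), ("bar", 2)]]).toarray()[0]
print(x[x != 0])                  # e.g. [ 1. -2.]; the signs depend on the hash
print(sorted(np.abs(x[x != 0])))  # [1.0, 2.0]: magnitudes are preserved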
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_hash_empty_input():
    n_features = 16
    raw_X = [[], (), iter(range(0))]

    h = FeatureHasher(n_features=n_features, input_type="string")
    X = h.transform(raw_X)

    assert_array_equal(X.A, np.zeros((len(raw_X), n_features)))
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_hasher_invalid_input():
    assert_raises(ValueError, FeatureHasher, input_type="gobbledygook")
    assert_raises(ValueError, FeatureHasher, n_features=-1)
    assert_raises(ValueError, FeatureHasher, n_features=0)
    assert_raises(TypeError, FeatureHasher, n_features='ham')

    h = FeatureHasher(n_features=np.uint16(2 ** 6))
    assert_raises(ValueError, h.transform, [])
    assert_raises(Exception, h.transform, [[5.5]])
    assert_raises(Exception, h.transform, [[None]])
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_hasher_zeros():
    # Assert that no zeros are materialized in the output.
    X = FeatureHasher().transform([{'foo': 0}])
    assert_equal(X.data.shape, (0,))
Project: gym-malware    Author: endgameinc    | Project source | File source
def __call__(self, binary):
        # general statistics about sections
        general = [len(binary.sections),                         # total number of sections
                   # number of sections with zero size
                   sum(1 for s in binary.sections if s.size == 0),
                   # number of sections with an empty name
                   sum(1 for s in binary.sections if s.name == ""),
                   sum(1 for s in binary.sections if s.has_characteristic(lief.PE.SECTION_CHARACTERISTICS.MEM_READ)
                       and s.has_characteristic(lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE)),  # number of RX
                   sum(1 for s in binary.sections if s.has_characteristic(
                       lief.PE.SECTION_CHARACTERISTICS.MEM_WRITE)),  # number of W
                   ]

        # gross characteristics of each section
        section_sizes = [(s.name, len(s.content)) for s in binary.sections]
        section_entropy = [(s.name, s.entropy) for s in binary.sections]
        section_vsize = [(s.name, s.virtual_size) for s in binary.sections]

        # properties of entry point, or if invalid, the first executable section
        try:
            entry = binary.section_from_offset(binary.entrypoint)
        except lief.not_found:
            # bad entry point, let's find the first executable section
            entry = None
            for s in binary.sections:
                if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists:
                    entry = s
                    break
        if entry is not None:
            entry_name = [entry.name]
            entry_characteristics = [str(c)
                                     for c in entry.characteristics_lists]
            # ['SECTION_CHARACTERISTICS.CNT_CODE', 'SECTION_CHARACTERISTICS.MEM_EXECUTE','SECTION_CHARACTERISTICS.MEM_READ']
        else:
            entry_name = []
            entry_characteristics = []

        # let's dump all this info into a single vector
        return np.concatenate([
            np.atleast_2d(np.asarray(general, dtype=self.dtype)),
            FeatureHasher(50, input_type="pair", dtype=self.dtype).transform(
                [section_sizes]).toarray(),
            FeatureHasher(50, input_type="pair", dtype=self.dtype).transform(
                [section_entropy]).toarray(),
            FeatureHasher(50, input_type="pair", dtype=self.dtype).transform(
                [section_vsize]).toarray(),
            FeatureHasher(50, input_type="string", dtype=self.dtype).transform(
                [entry_name]).toarray(),
            FeatureHasher(50, input_type="string", dtype=self.dtype).transform([entry_characteristics]).toarray()
            ], axis=-1).flatten().astype(self.dtype)
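Note the split above between input_type="pair" and input_type="string": with "pair" input, each (section_name, value) tuple deposits its numeric value (size, entropy, virtual size) at the hashed position, while "string" input deposits a fixed +/-1 per token. A minimal sketch of the distinction, with made-up section data:

from sklearn.feature_extraction import FeatureHasher

pairs = [[(".text", 0.72), (".rdata", 0.31)]]  # e.g. (name, entropy) tuples
Xp = FeatureHasher(8, input_type="pair").transform(pairs)

names = [[".text", ".rdata"]]                  # tokens alone
Xs = FeatureHasher(8, input_type="string").transform(names)
print(Xp.toarray())  # buckets hold 0.72 / 0.31 (up to sign)
print(Xs.toarray())  # buckets hold +/-1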
Project: youarespecial    Author: endgameinc    | Project source | File source
def __call__(self, binary):
        # general statistics about sections
        general = [len(binary.sections),                         # total number of sections
                   # number of sections with zero size
                   sum(1 for s in binary.sections if s.size == 0),
                   # number of sections with an empty name
                   sum(1 for s in binary.sections if s.name == ""),
                   sum(1 for s in binary.sections if s.has_characteristic(lief.PE.SECTION_CHARACTERISTICS.MEM_READ)
                       and s.has_characteristic(lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE)),  # number of RX
                   sum(1 for s in binary.sections if s.has_characteristic(
                       lief.PE.SECTION_CHARACTERISTICS.MEM_WRITE)),  # number of W
                   ]

        # gross characteristics of each section
        section_sizes = [(s.name, len(s.content)) for s in binary.sections]
        section_entropy = [(s.name, s.entropy) for s in binary.sections]
        section_vsize = [(s.name, s.virtual_size) for s in binary.sections]

        # properties of entry point, or if invalid, the first executable section
        try:
            entry = binary.section_from_offset(binary.entrypoint)
        except lief.not_found:
            # bad entry point, let's find the first executable section
            entry = None
            for s in binary.sections:
                if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists:
                    entry = s
                    break
        if entry is not None:
            entry_name = [entry.name]
            entry_characteristics = [str(c)
                                     for c in entry.characteristics_lists]
            # ['SECTION_CHARACTERISTICS.CNT_CODE', 'SECTION_CHARACTERISTICS.MEM_EXECUTE','SECTION_CHARACTERISTICS.MEM_READ']
        else:
            entry_name = []
            entry_characteristics = []

        # let's dump all this info into a single vector
        return np.concatenate([
            np.atleast_2d(np.asarray(general, dtype=self.dtype)),
            FeatureHasher(50, input_type="pair", dtype=self.dtype).transform(
                [section_sizes]).toarray(),
            FeatureHasher(50, input_type="pair", dtype=self.dtype).transform(
                [section_entropy]).toarray(),
            FeatureHasher(50, input_type="pair", dtype=self.dtype).transform(
                [section_vsize]).toarray(),
            FeatureHasher(50, input_type="string", dtype=self.dtype).transform(
                [entry_name]).toarray(),
            FeatureHasher(50, input_type="string", dtype=self.dtype).transform([entry_characteristics]).toarray()
            ], axis=-1).flatten().astype(self.dtype)
Project: rec-sys-experiments    Author: rnowling    | Project source | File source
def train_and_score(max_movie_id, training, testset, model_sizes):
    extractors = dict()
    models = dict()

    print "Creating models"
    for model_size in model_sizes:
        extractors[model_size] = FeatureHasher(n_features=2**model_size)
        models[model_size] = SGDClassifier(loss="log", penalty="L2")

    print "Training"
    for i, (user_id, seen_movies) in enumerate(training):
        print "Training on user", i, user_id
        labels, (seen_pairs, unseen_pairs) = generate_features(max_movie_id, seen_movies)
        for model_size, extractor in extractors.iteritems():
            seen_features = extractor.transform(seen_pairs)
            unseen_features = extractor.transform(unseen_pairs)
            features = sp.vstack([seen_features, unseen_features])
            model = models[model_size]
            model.partial_fit(features, labels, classes=[0, 1])

    print "Testing"
    all_labels = []
    all_predicted_labels = defaultdict(list)
    all_predicted_prob = defaultdict(list)
    for i, (user_id, seen_movies) in enumerate(testset):
        print "Testing on user", i, user_id
        labels, (seen_pairs, unseen_pairs) = generate_features(max_movie_id, seen_movies)
        all_labels.extend(labels)

        for model_size, extractor in extractors.iteritems():
            seen_features = extractor.transform(seen_pairs)
            unseen_features = extractor.transform(unseen_pairs)
            features = sp.vstack([seen_features, unseen_features])

            model = models[model_size]
            predicted_labels = model.predict(features)
            predicted_prob = model.predict_proba(features)
            all_predicted_labels[model_size].extend(predicted_labels)
            # Probabilities for positive class
            all_predicted_prob[model_size].extend(predicted_prob[:, 1])

    print "Scoring"
    aucs = []
    nnz_features = []
    for model_size, model in models.iteritems():
        pred_log_prob = all_predicted_prob[model_size]
        auc = roc_auc_score(all_labels, pred_log_prob)
        cm = confusion_matrix(all_labels, all_predicted_labels[model_size])
        print "Model size", model_size, "auc", auc
        print cm
        print
        aucs.append(auc)
        nnz_features.append(np.count_nonzero(model.coef_))

    return aucs, nnz_features
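The sweep over model_sizes above is probing the standard hashing trade-off: fewer bits means more collisions (and a smaller model), more bits means fewer collisions at the cost of memory. A hedged sketch of measuring collisions directly, with synthetic feature dicts:

from sklearn.feature_extraction import FeatureHasher

raw = [{"movie_%d" % i: 1 for i in range(500)}]  # 500 synthetic features, one sample
for bits in (6, 10, 14, 18):
    X = FeatureHasher(n_features=2 ** bits).transform(raw)
    # colliding features either merge into one bucket or cancel (signed hash)
    print(bits, "bits ->", X.nnz, "nonzero buckets out of", 2 ** bits)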