The following 13 code examples, extracted from open source Python projects, illustrate how to use sklearn.feature_extraction.FeatureHasher().
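FeatureHasher implements the hashing trick: feature names are mapped to column indices by a hash function, so no vocabulary has to be built or kept in memory. Before the extracted examples, here is a minimal sketch of the default dict input mode (the feature names and values are illustrative only):

from sklearn.feature_extraction import FeatureHasher

# Hash dict features into a fixed-width sparse matrix; n_features sets the
# number of output columns, independent of how many distinct features occur.
hasher = FeatureHasher(n_features=8, input_type="dict")
X = hasher.transform([{"cat": 1, "dog": 2}, {"cat": 3}])
print(X.shape)      # (2, 8)
print(X.toarray())  # each feature lands in a hashed column, possibly sign-flipped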
def test_feature_hasher_strings():
    # mix byte and Unicode strings; note that "foo" is a duplicate in row 0
    raw_X = [["foo", "bar", "baz", "foo".encode("ascii")],
             ["bar".encode("ascii"), "baz", "quux"]]

    for lg_n_features in (7, 9, 11, 16, 22):
        n_features = 2 ** lg_n_features

        it = (x for x in raw_X)  # iterable

        h = FeatureHasher(n_features, non_negative=True, input_type="string")
        X = h.transform(it)

        assert_equal(X.shape[0], len(raw_X))
        assert_equal(X.shape[1], n_features)

        assert_true(np.all(X.data > 0))
        assert_equal(X[0].sum(), 4)
        assert_equal(X[1].sum(), 3)
        assert_equal(X.nnz, 6)
def __init__(self, verbose, min_label_count=1, inference=False):
    self.fh = FeatureHasher(dtype='float32')
    self.verbose = verbose
    self.inference = inference
    self.min_label_count = min_label_count
def __call__(self, binary):
    libraries = [l.lower() for l in binary.libraries]
    # we'll create a string like "kernel32.dll:CreateFileMappingA" for each entry
    imports = [lib.name.lower() + ':' + e.name
               for lib in binary.imports for e in lib.entries]
    # two separate elements: libraries (alone) and fully-qualified names of imported functions
    return np.concatenate([
        FeatureHasher(256, input_type="string", dtype=self.dtype).transform(
            [libraries]).toarray(),
        FeatureHasher(1024, input_type="string", dtype=self.dtype).transform(
            [imports]).toarray()
    ], axis=-1).flatten().astype(self.dtype)
def __call__(self, binary):
    return FeatureHasher(128, input_type="string", dtype=self.dtype).transform(
        [binary.exported_functions]).toarray().flatten().astype(self.dtype)
def __call__(self, binary):
    return np.concatenate([
        [[binary.header.time_date_stamps]],
        FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
            [[str(binary.header.machine)]]).toarray(),
        FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
            [[str(c) for c in binary.header.characteristics_list]]).toarray(),
        FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
            [[str(binary.optional_header.subsystem)]]).toarray(),
        FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
            [[str(c) for c in binary.optional_header.dll_characteristics_lists]]).toarray(),
        FeatureHasher(10, input_type="string", dtype=self.dtype).transform(
            [[str(binary.optional_header.magic)]]).toarray(),
        [[binary.optional_header.major_image_version]],
        [[binary.optional_header.minor_image_version]],
        [[binary.optional_header.major_linker_version]],
        [[binary.optional_header.minor_linker_version]],
        [[binary.optional_header.major_operating_system_version]],
        [[binary.optional_header.minor_operating_system_version]],
        [[binary.optional_header.major_subsystem_version]],
        [[binary.optional_header.minor_subsystem_version]],
        [[binary.optional_header.sizeof_code]],
        [[binary.optional_header.sizeof_headers]],
        [[binary.optional_header.sizeof_heap_commit]],
    ], axis=-1).flatten().astype(self.dtype)
def __init__(self, *args, **kwargs):
    self.hasher = FeatureHasher(*args, **kwargs)
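The snippet above is only the constructor of a wrapper class; the rest of the class was not extracted. A sketch of how such a wrapper is typically completed (the class name and transform method here are assumptions, not taken from the source project):

from sklearn.feature_extraction import FeatureHasher

class HasherWrapper:  # hypothetical name; the original class is not shown
    def __init__(self, *args, **kwargs):
        self.hasher = FeatureHasher(*args, **kwargs)

    def transform(self, raw_X):
        # FeatureHasher is stateless, so the wrapper can delegate directly
        # without any prior fit step
        return self.hasher.transform(raw_X)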
def test_feature_hasher_dicts():
    h = FeatureHasher(n_features=16)
    assert_equal("dict", h.input_type)

    raw_X = [{"dada": 42, "tzara": 37}, {"gaga": 17}]
    X1 = FeatureHasher(n_features=16).transform(raw_X)

    gen = (iter(d.items()) for d in raw_X)
    X2 = FeatureHasher(n_features=16, input_type="pair").transform(gen)
    assert_array_equal(X1.toarray(), X2.toarray())
def test_feature_hasher_pairs():
    raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": 2},
                                       {"baz": 3, "quux": 4, "foo": -1}])
    h = FeatureHasher(n_features=16, input_type="pair")
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = sorted(np.abs(x1[x1 != 0]))
    x2_nz = sorted(np.abs(x2[x2 != 0]))
    assert_equal([1, 2], x1_nz)
    assert_equal([1, 3, 4], x2_nz)
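The absolute values in this test matter: FeatureHasher multiplies each value by a sign (+1 or -1) derived from the hash so that colliding features tend to cancel rather than accumulate (toggled by non_negative in older scikit-learn releases and alternate_sign in newer ones). A small sketch of the effect:

from sklearn.feature_extraction import FeatureHasher

h = FeatureHasher(n_features=16, input_type="pair")
X = h.transform([[("foo", 1), ("bar", 2)]])
# Stored values may come out negated depending on the hash's sign bit,
# which is why the test above compares absolute values.
print(X.toarray())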
def test_hash_empty_input():
    n_features = 16
    raw_X = [[], (), iter(range(0))]

    h = FeatureHasher(n_features=n_features, input_type="string")
    X = h.transform(raw_X)

    assert_array_equal(X.A, np.zeros((len(raw_X), n_features)))
def test_hasher_invalid_input():
    assert_raises(ValueError, FeatureHasher, input_type="gobbledygook")
    assert_raises(ValueError, FeatureHasher, n_features=-1)
    assert_raises(ValueError, FeatureHasher, n_features=0)
    assert_raises(TypeError, FeatureHasher, n_features='ham')

    h = FeatureHasher(n_features=np.uint16(2 ** 6))
    assert_raises(ValueError, h.transform, [])
    assert_raises(Exception, h.transform, [[5.5]])
    assert_raises(Exception, h.transform, [[None]])
def test_hasher_zeros():
    # Assert that no zeros are materialized in the output.
    X = FeatureHasher().transform([{'foo': 0}])
    assert_equal(X.data.shape, (0,))
def __call__(self, binary):
    # general statistics about sections
    general = [
        len(binary.sections),  # total number of sections
        # number of sections with a zero size
        sum(1 for s in binary.sections if s.size == 0),
        # number of sections with an empty name
        sum(1 for s in binary.sections if s.name == ""),
        # number of readable and executable (RX) sections
        sum(1 for s in binary.sections
            if s.has_characteristic(lief.PE.SECTION_CHARACTERISTICS.MEM_READ)
            and s.has_characteristic(lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE)),
        # number of writable (W) sections
        sum(1 for s in binary.sections
            if s.has_characteristic(lief.PE.SECTION_CHARACTERISTICS.MEM_WRITE)),
    ]

    # gross characteristics of each section
    section_sizes = [(s.name, len(s.content)) for s in binary.sections]
    section_entropy = [(s.name, s.entropy) for s in binary.sections]
    section_vsize = [(s.name, s.virtual_size) for s in binary.sections]

    # properties of the entry point, or if invalid, the first executable section
    try:
        entry = binary.section_from_offset(binary.entrypoint)
    except lief.not_found:
        # bad entry point, let's find the first executable section
        entry = None
        for s in binary.sections:
            if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists:
                entry = s
                break
    if entry is not None:
        entry_name = [entry.name]
        # e.g. ['SECTION_CHARACTERISTICS.CNT_CODE', 'SECTION_CHARACTERISTICS.MEM_EXECUTE',
        #       'SECTION_CHARACTERISTICS.MEM_READ']
        entry_characteristics = [str(c) for c in entry.characteristics_lists]
    else:
        entry_name = []
        entry_characteristics = []

    # let's dump all this info into a single vector
    return np.concatenate([
        np.atleast_2d(np.asarray(general, dtype=self.dtype)),
        FeatureHasher(50, input_type="pair", dtype=self.dtype).transform(
            [section_sizes]).toarray(),
        FeatureHasher(50, input_type="pair", dtype=self.dtype).transform(
            [section_entropy]).toarray(),
        FeatureHasher(50, input_type="pair", dtype=self.dtype).transform(
            [section_vsize]).toarray(),
        FeatureHasher(50, input_type="string", dtype=self.dtype).transform(
            [entry_name]).toarray(),
        FeatureHasher(50, input_type="string", dtype=self.dtype).transform(
            [entry_characteristics]).toarray()
    ], axis=-1).flatten().astype(self.dtype)
def train_and_score(max_movie_id, training, testset, model_sizes):
    extractors = dict()
    models = dict()

    print("Creating models")
    for model_size in model_sizes:
        extractors[model_size] = FeatureHasher(n_features=2 ** model_size)
        models[model_size] = SGDClassifier(loss="log", penalty="L2")

    print("Training")
    for i, (user_id, seen_movies) in enumerate(training):
        print("Training on user", i, user_id)
        labels, (seen_pairs, unseen_pairs) = generate_features(max_movie_id, seen_movies)
        for model_size, extractor in extractors.items():
            seen_features = extractor.transform(seen_pairs)
            unseen_features = extractor.transform(unseen_pairs)
            features = sp.vstack([seen_features, unseen_features])
            model = models[model_size]
            model.partial_fit(features, labels, classes=[0, 1])

    print("Testing")
    all_labels = []
    all_predicted_labels = defaultdict(list)
    all_predicted_prob = defaultdict(list)
    for i, (user_id, seen_movies) in enumerate(testset):
        print("Testing on user", i, user_id)
        labels, (seen_pairs, unseen_pairs) = generate_features(max_movie_id, seen_movies)
        all_labels.extend(labels)
        for model_size, extractor in extractors.items():
            seen_features = extractor.transform(seen_pairs)
            unseen_features = extractor.transform(unseen_pairs)
            features = sp.vstack([seen_features, unseen_features])
            model = models[model_size]
            predicted_labels = model.predict(features)
            predicted_prob = model.predict_proba(features)
            all_predicted_labels[model_size].extend(predicted_labels)
            # Probabilities for the positive class
            all_predicted_prob[model_size].extend(predicted_prob[:, 1])

    print("Scoring")
    aucs = []
    nnz_features = []
    for model_size, model in models.items():
        pred_log_prob = all_predicted_prob[model_size]
        auc = roc_auc_score(all_labels, pred_log_prob)
        cm = confusion_matrix(all_labels, all_predicted_labels[model_size])
        print("Model size", model_size, "auc", auc)
        print(cm)
        print()
        aucs.append(auc)
        nnz_features.append(np.count_nonzero(model.coef_))
    return aucs, nnz_features
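train_and_score sweeps n_features = 2 ** model_size because a smaller hash space forces more feature names to collide into the same column, trading accuracy for memory. A rough, illustrative sketch of measuring that collision rate directly (the token names are made up):

import numpy as np
from sklearn.feature_extraction import FeatureHasher

tokens = ["movie_%d" % i for i in range(10000)]  # hypothetical feature names
for lg in (8, 12, 16, 20):
    h = FeatureHasher(n_features=2 ** lg, input_type="string")
    X = h.transform([tokens])
    # Without collisions, 10000 tokens would occupy 10000 distinct columns;
    # fewer nonzero columns means collisions (opposite-sign collisions can
    # cancel a column entirely).
    print(lg, X.nnz)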