The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.preprocessing.OneHotEncoder().
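Before the project examples, here is a minimal, self-contained sketch of the basic fit/transform cycle. It targets the current scikit-learn API (sparse_output requires scikit-learn >= 1.2), and the toy data is purely illustrative:

# Minimal OneHotEncoder round trip: fit on a toy categorical column,
# then map new rows to indicator vectors.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([['red'], ['green'], ['blue'], ['green']])
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
enc.fit(X)
print(enc.categories_)             # [array(['blue', 'green', 'red'], dtype=object)]
print(enc.transform([['green']]))  # [[0. 1. 0.]]

Many of the project examples below predate this API and use older parameters such as n_values, categorical_features, and sparse, noted again where relevant.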
def __do_one_hot_encodings(self):
    df_train, cv = self.res_data_dict[g_singletonDataFilePath.getTrainDir()]
    df_testset1 = self.res_data_dict[g_singletonDataFilePath.getTest1Dir()]
    df_testset2 = self.res_data_dict[g_singletonDataFilePath.getTest2Dir()]
    enc = OneHotEncoder(sparse=False)
    cross_feature_dict = self.__get_label_encode_dict()
    to_be_encoded = []
    for _, new_feature_name in cross_feature_dict.items():
        to_be_encoded.append(new_feature_name)
    # fit on all data sources combined
    to_be_stacked_df = pd.concat([df_train[to_be_encoded],
                                  df_testset1[to_be_encoded],
                                  df_testset2[to_be_encoded]], axis=0)
    enc.fit(to_be_stacked_df)
    enc, to_be_encoded = self.__filter_too_big_onehot_encoding(
        enc, to_be_encoded, df_train, df_testset1, df_testset2)
    # transform each data source separately
    self.res_data_dict[g_singletonDataFilePath.getTrainDir()] = (
        self.__do_one_hot_encoding(df_train, enc, to_be_encoded), cv)
    self.res_data_dict[g_singletonDataFilePath.getTest1Dir()] = \
        self.__do_one_hot_encoding(df_testset1, enc, to_be_encoded)
    self.res_data_dict[g_singletonDataFilePath.getTest2Dir()] = \
        self.__do_one_hot_encoding(df_testset2, enc, to_be_encoded)
    return
def vectors_from_txtfile(fname, codec, limit=-1, mutagen=None):
    f = open(fname)
    skipped = Counter()
    vecs = []
    for line in f:
        line = line.strip()
        try:
            vecs.append(codec.encode(line, mutagen=mutagen))
            if len(vecs) == limit:
                break
        except NonEncodableTextException as e:
            # Too long, or illegal characters
            skipped[e.reason] += 1

    logging.debug("Gathered {} vectors. Skipped {} ({})".format(
        len(vecs), sum(skipped.values()), dict(skipped)))
    vecs = np.asarray(vecs)
    # TODO: Why default to dtype=float? Seems wasteful? Maybe it doesn't really
    # matter. Actually, docs here seem inconsistent? Constructor docs say default
    # float. transform docs say int. Should file a bug on sklearn.
    return OneHotEncoder(len(codec.alphabet)).fit_transform(vecs)


# Adapted from sklearn.utils.extmath.softmax
def __init__(self, n_values, feature_indices):
    import warnings
    from sklearn.preprocessing import OneHotEncoder

    if not isinstance(n_values, np.ndarray):
        n_values = np.array(n_values)
    if not isinstance(feature_indices, np.ndarray):
        feature_indices = np.array(feature_indices)
    assert feature_indices.size > 0
    assert feature_indices.shape == n_values.shape
    for nv in n_values:
        if nv <= 2:
            raise Exception("Categorical features must have 3+ labels")

    self.feature_indices = feature_indices
    self.n_values = n_values
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        self.encoder = OneHotEncoder(n_values=n_values, sparse=False)
    self.columnlabels = None
    self.xform_start_indices = None
def test_boston_OHE_plus_trees(self):
    data = load_boston()

    pl = Pipeline([
        ("OHE", OneHotEncoder(categorical_features=[8], sparse=False)),
        ("Trees", GradientBoostingRegressor(random_state=1))])
    pl.fit(data.data, data.target)

    # Convert the model
    spec = convert(pl, data.feature_names, 'target')

    # Get predictions
    df = pd.DataFrame(data.data, columns=data.feature_names)
    df['prediction'] = pl.predict(data.data)

    # Evaluate it
    result = evaluate_regressor(spec, df, 'target', verbose=False)
    assert result["max_error"] < 0.0001
def test_boston_OHE(self):
    data = load_boston()

    for categorical_features in [[3], [8], [3, 8], [8, 3]]:
        model = OneHotEncoder(categorical_features=categorical_features, sparse=False)
        model.fit(data.data, data.target)

        # Convert the model
        spec = sklearn.convert(model, data.feature_names, 'out').get_spec()

        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        output_data = [{"out": row} for row in model.transform(data.data)]

        result = evaluate_transformer(spec, input_data, output_data)
        assert result["num_errors"] == 0


# This test still isn't working
def test_boston_OHE_pipeline(self):
    data = load_boston()

    for categorical_features in [[3], [8], [3, 8], [8, 3]]:
        # Put it in a pipeline so that we can test whether the output
        # dimension handling is correct.
        model = Pipeline([
            ("OHE", OneHotEncoder(categorical_features=categorical_features)),
            ("Normalizer", Normalizer())])
        model.fit(data.data.copy(), data.target)

        # Convert the model
        spec = sklearn.convert(model, data.feature_names, 'out').get_spec()

        input_data = [dict(zip(data.feature_names, row)) for row in data.data]
        output_data = [{"out": row} for row in model.transform(data.data.copy())]

        result = evaluate_transformer(spec, input_data, output_data)
        assert result["num_errors"] == 0
def __init__(self, X, y, multinomial, rounding=None):
    self.input_features = X.columns.values
    X = X.values
    cat_idx = [i for i in range(X.shape[1]) if min(X[:, i]) == 0]
    self.encoder = OneHotEncoder(categorical_features=cat_idx, sparse=False)
    X = self.encoder.fit_transform(X)
    self.features = range(X.shape[1])
    self.rounding = rounding

    # train a model on the whole dataset
    self.model = LogisticRegression()
    self.model.fit(X, y)
    self.w = self.model.coef_
    self.intercept = self.model.intercept_
    self.multinomial = multinomial
    assert not (multinomial and len(self.get_classes()) == 2)
    RegressionExtractor.__init__(self)
def fit_transform(self, X, y=None, sample_weight=None):
    X = check_array(X, accept_sparse=['csc'], ensure_2d=False)
    if sp.issparse(X):
        # Pre-sort indices to avoid that each individual tree of the
        # ensemble sorts the indices.
        X.sort_indices()

    X_, y_ = generate_discriminative_dataset(X)
    super(RandomForestEmbedding, self).fit(X_, y_, sample_weight=sample_weight)

    self.one_hot_encoder_ = OneHotEncoder(sparse=True)
    if self.sparse_output:
        return self.one_hot_encoder_.fit_transform(self.apply(X))
    return self.apply(X)
def test_OneHotEncoder():
    '''
    test the method

    :return: None
    '''
    X = [[1, 2, 3, 4, 5],
         [5, 4, 3, 2, 1],
         [3, 3, 3, 3, 3],
         [1, 1, 1, 1, 1]]
    print("before transform:", X)
    encoder = OneHotEncoder(sparse=False)
    encoder.fit(X)
    print("active_features_:", encoder.active_features_)
    print("feature_indices_:", encoder.feature_indices_)
    print("n_values_:", encoder.n_values_)
    print("after transform:", encoder.transform([[1, 2, 3, 4, 5]]))
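Note that the active_features_, feature_indices_, and n_values_ attributes inspected above were deprecated in scikit-learn 0.20 and removed in 0.22. A rough modern equivalent of this test (assuming scikit-learn >= 1.2, where sparse was renamed sparse_output) inspects categories_ instead:

from sklearn.preprocessing import OneHotEncoder

X = [[1, 2, 3, 4, 5],
     [5, 4, 3, 2, 1],
     [3, 3, 3, 3, 3],
     [1, 1, 1, 1, 1]]
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(X)
print("categories_:", encoder.categories_)  # one array of categories per column
print("after transform:", encoder.transform([[1, 2, 3, 4, 5]]))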
def get_one_hot_key():
    encoder = OneHotEncoder(n_values=[3, 3], sparse=False)
    encoder.fit([[0, 0]])

    intersection_id_map = dict(A=0, B=1, C=2)

    def one_hot_key(ix, **kargs):
        return encoder.transform([
            [intersection_id_map[ix[-2]], ix[-1] - 1]
        ])[0].tolist()

    return one_hot_key
def one_hot(hypno, n_categories):
    enc = OneHotEncoder(n_values=n_categories)
    hypno = enc.fit_transform(hypno).toarray()
    return np.array(hypno, 'int32')
def one_hot_encoder(df, estimated_var):
    df_class = df.copy()
    ohe = OneHotEncoder()
    label_classes = df_class[estimated_var].factorize()[1]
    new_one_hot_encoded_features = [''.join([estimated_var, '_', x]) for x in label_classes]
    mask = ~df[estimated_var].isnull()
    feature_var_values = ohe.fit_transform(
        np.reshape(np.array(df[''.join([estimated_var, 'Num'])][mask].values),
                   (df[mask].shape[0], 1))).toarray().astype(int)

    # Create new feature_var columns with one-hot encoded values
    for ite in new_one_hot_encoded_features:
        df[ite] = df[estimated_var]
    df.loc[mask, tuple(new_one_hot_encoded_features)] = feature_var_values
def gen_pclass(self, data):
    from sklearn.preprocessing import OneHotEncoder

    data_pclass = data['data_df'][['Pclass']]
    # set unknown as a class
    data_pclass.fillna(4, inplace=True)
    return {'pclass': OneHotEncoder(sparse=False)
            .fit_transform(data_pclass.values)}
def generate_train_random_batch(data, label, batch_size, is_train=True):
    indics = np.random.randint(0, len(data), size=batch_size)
    vector = np.zeros([len(data)])
    vector[indics] = 1
    #y_label = OneHotEncoder(len(data), indics, sparse=False)
    data_batch = data.iloc[indics]
    if is_train:
        label_batch = label.iloc[indics]
        return data_batch.as_matrix(), label_batch.as_matrix(), vector
    else:
        return data_batch.as_matrix(), vector
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = RandomForestClassifier()
        spec = skl_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion.
    from sklearn.preprocessing import OneHotEncoder
    with self.assertRaises(Exception):
        model = OneHotEncoder()
        spec = skl_converter.convert(model, 'data', 'out')
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = RandomForestClassifier()
        spec = skl_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion.
    with self.assertRaises(Exception):
        from sklearn.preprocessing import OneHotEncoder
        model = OneHotEncoder()
        spec = skl_converter.convert(model, 'data', 'out')
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(TypeError):
        model = NuSVR()
        spec = scikit_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion.
    with self.assertRaises(TypeError):
        model = OneHotEncoder()
        spec = scikit_converter.convert(model, 'data', 'out')
def test_conversion_bad_inputs(self):
    # Check the expected class during conversion.
    with self.assertRaises(TypeError):
        model = OneHotEncoder()
        spec = libsvm.convert(model, 'data', 'out')
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = DecisionTreeRegressor()
        spec = skl_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion.
    from sklearn.preprocessing import OneHotEncoder
    with self.assertRaises(Exception):
        model = OneHotEncoder()
        spec = skl_converter.convert(model, 'data', 'out')
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = GradientBoostingRegressor()
        spec = skl_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion.
    with self.assertRaises(Exception):
        model = OneHotEncoder()
        spec = skl_converter.convert(model, 'data', 'out')
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(TypeError):
        model = GradientBoostingRegressor()
        spec = xgb_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion
    with self.assertRaises(TypeError):
        model = OneHotEncoder()
        spec = xgb_converter.convert(model, 'data', 'out')
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(TypeError):
        model = SVR()
        spec = sklearn_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion.
    with self.assertRaises(TypeError):
        model = OneHotEncoder()
        spec = sklearn_converter.convert(model, 'data', 'out')
def test_conversion_one_column(self):
    # Fit a single OHE
    scikit_model = OneHotEncoder()
    scikit_model.fit(self.scikit_data)
    spec = sklearn.convert(scikit_model, 'single_feature', 'out').get_spec()

    test_data = [{'single_feature': row} for row in self.scikit_data]
    scikit_output = [{'out': row} for row in
                     scikit_model.transform(self.scikit_data).toarray()]
    metrics = evaluate_transformer(spec, test_data, scikit_output)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
    self.assertEquals(metrics['num_errors'], 0)
def test_conversion_many_columns(self):
    scikit_model = OneHotEncoder()
    scikit_model.fit(self.scikit_data_multiple_cols)
    spec = sklearn.convert(scikit_model, ['feature_1', 'feature_2'], 'out').get_spec()

    test_data = [{'feature_1': row[0], 'feature_2': row[1]}
                 for row in self.scikit_data_multiple_cols]
    scikit_output = [{'out': row} for row in
                     scikit_model.transform(self.scikit_data_multiple_cols).toarray()]
    metrics = evaluate_transformer(spec, test_data, scikit_output)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
    self.assertEquals(metrics['num_errors'], 0)
def test_conversion_one_column_of_several(self):
    scikit_model = OneHotEncoder(categorical_features=[0])
    scikit_model.fit(copy(self.scikit_data_multiple_cols))
    spec = sklearn.convert(scikit_model, ['feature_1', 'feature_2'], 'out').get_spec()

    test_data = [{'feature_1': row[0], 'feature_2': row[1]}
                 for row in self.scikit_data_multiple_cols]
    scikit_output = [{'out': row} for row in
                     scikit_model.transform(self.scikit_data_multiple_cols).toarray()]
    metrics = evaluate_transformer(spec, test_data, scikit_output)

    self.assertIsNotNone(spec)
    self.assertIsNotNone(spec.description)
    self.assertEquals(metrics['num_errors'], 0)
def test_conversion_bad_inputs(self):
    """
    Failure testing for bad conversion.
    """
    # Error on converting an untrained model
    with self.assertRaises(TypeError):
        model = OneHotEncoder()
        spec = converter.convert(model, 'data', 'out', 'regressor')
def test_conversion_bad_inputs(self):
    from sklearn.preprocessing import OneHotEncoder

    # Error on converting an untrained model
    with self.assertRaises(TypeError):
        model = SVC()
        spec = scikit_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion
    with self.assertRaises(TypeError):
        model = OneHotEncoder()
        spec = scikit_converter.convert(model, 'data', 'out')
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(TypeError):
        model = LinearRegression()
        spec = convert(model, 'data', 'out')

    # Check the expected class during conversion.
    from sklearn.preprocessing import OneHotEncoder
    with self.assertRaises(TypeError):
        model = OneHotEncoder()
        spec = convert(model, 'data', 'out')
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = DecisionTreeClassifier()
        spec = skl_converter(model, 'data', 'out')

    # Check the expected class during conversion.
    from sklearn.preprocessing import OneHotEncoder
    with self.assertRaises(Exception):
        model = OneHotEncoder()
        spec = skl_converter(model, 'data', 'out')
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = GradientBoostingClassifier()
        spec = skl_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion.
    from sklearn.preprocessing import OneHotEncoder
    with self.assertRaises(Exception):
        model = OneHotEncoder()
        spec = skl_converter.convert(model, 'data', 'out')
def test_conversion_bad_inputs(self):
    from sklearn.preprocessing import OneHotEncoder

    # Error on converting an untrained model
    with self.assertRaises(TypeError):
        model = NuSVC()
        spec = scikit_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion
    with self.assertRaises(TypeError):
        model = OneHotEncoder()
        spec = scikit_converter.convert(model, 'data', 'out')
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = RandomForestRegressor()
        spec = skl_converter.convert(model, 'data', 'out')

    # Check the expected class during conversion.
    from sklearn.preprocessing import OneHotEncoder
    with self.assertRaises(Exception):
        model = OneHotEncoder()
        spec = skl_converter.convert(model, 'data', 'out')
def encode_onehot(df, cols, vec=None):
    """
    One-hot encoding is applied to columns specified in a pandas DataFrame.

    Modified from: https://gist.github.com/kljensen/5452382

    Details:
    http://en.wikipedia.org/wiki/One-hot
    http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

    @param df pandas DataFrame
    @param cols a list of columns to encode
    @return a DataFrame with one-hot encoding
    """
    x_data = df[cols]
    if vec is None:
        vec = preprocessing.OneHotEncoder()
        results = vec.fit_transform(x_data).toarray()
    else:
        results = vec.transform(x_data).toarray()
    result_size = results.shape[1]

    # TODO bug in pca code, find and fix
    #after_pca_size = 4 * len(cols)
    #if (result_size > 5 and after_pca_size < result_size):
    #    results = results[:min(500000, results.shape[0])]
    #    results = doPCA(results, after_pca_size)
    #    result_size = after_pca_size

    vec_data = pd.DataFrame(results)
    vec_data.columns = ["f" + str(i) for i in range(0, result_size)]
    vec_data.index = df.index

    #df = df.drop(cols, axis=1)
    df = df.join(vec_data)
    return df, vec
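A hypothetical round trip through encode_onehot, fitting the encoder on a training frame and reusing it on held-out rows so both get identical columns; the DataFrames below are made up for illustration, and the function plus its pandas/sklearn imports are assumed to be in scope:

import pandas as pd

train = pd.DataFrame({'color': [0, 1, 2, 1], 'size': [1, 0, 1, 2]})
test = pd.DataFrame({'color': [2, 0], 'size': [0, 1]})

# Fit on train, then pass the returned encoder back in for test
# so the indicator columns line up between the two frames.
train_enc, vec = encode_onehot(train, cols=['color', 'size'])
test_enc, _ = encode_onehot(test, cols=['color', 'size'], vec=vec)
print(train_enc.columns.tolist())  # original columns plus f0..fN indicators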
def day_of_week(encode_flg=False):
    """Day-of-week feature."""
    train_df = data_loader.train()
    day_of_week_df = train_df.datetime.dt.dayofweek
    day_of_week_df.index = train_df.datetime

    if encode_flg:
        oh_encoder = OneHotEncoder(sparse=False)
        return pd.DataFrame(
            oh_encoder.fit_transform(train_df.datetime.dt.dayofweek.values.reshape(-1, 1)),
            index=train_df.datetime,
            columns=['dow0', 'dow1', 'dow2', 'dow3', 'dow4', 'dow5', 'dow6']
        )
    return day_of_week_df.to_frame('dow')
def getmAP(clf, X_base, y_base, X_query, y_query, id_label, y_label):
    y_base, y_query = y_base[:, 0], y_query[:, 0]
    oneh = OneHotEncoder(sparse=False)
    y_label_1h = oneh.fit_transform(y_label)

    activations = clf.predict_proba(X_base)
    activations[id_label] = y_label_1h
    if args.code == "onehot":
        argmax = activations.argmax(axis=1).reshape((-1, 1))
        activations = oneh.fit_transform(argmax)

    if args.code == "lsh":
        index = faiss.IndexLSH(y_label_1h.shape[1], args.nbits, True, True)
    else:
        index = faiss.IndexFlatIP(y_label_1h.shape[1])
    index.train(activations.astype(np.float32))
    index.add(activations.astype(np.float32))

    queryAct = clf.predict_proba(X_query).astype(np.float32)
    _, idc = index.search(queryAct, y_base.shape[0])
    predictions = y_base[idc]
    results = np.equal(predictions, np.expand_dims(y_query, axis=1))
    return computemAP(results)
def getdataset(datasetname, onehot_encode_strings=True):
    # load
    dataset = fetch_mldata(datasetname)
    # get X and y
    X = dshape(dataset.data)
    try:
        target = dshape(dataset.target)
    except:
        print("WARNING: No target found. Taking last column of data matrix as target")
        target = X[:, -1]
        X = X[:, :-1]
    if len(target.shape) > 1 and target.shape[1] > X.shape[1]:
        # some mldata sets are mixed up...
        X = target
        target = dshape(dataset.data)
    if len(X.shape) == 1 or X.shape[1] <= 1:
        for k in dataset.keys():
            if k != 'data' and k != 'target' and len(dataset[k]) == X.shape[1]:
                X = np.hstack((X, dshape(dataset[k])))
    # one-hot for categorical values
    if onehot_encode_strings:
        cat_ft = [i for i in range(X.shape[1])
                  if 'str' in str(type(unpack(X[0, i])))
                  or 'unicode' in str(type(unpack(X[0, i])))]
        if len(cat_ft):
            for i in cat_ft:
                X[:, i] = tonumeric(X[:, i])
            X = OneHotEncoder(categorical_features=cat_ft).fit_transform(X)
    # if sparse, make dense
    try:
        X = X.toarray()
    except:
        pass
    # convert y to monotonically increasing ints
    y = tonumeric(target).astype(int)
    return np.nan_to_num(X.astype(float)), y
def xgb_to_features(model, X_train, X_test):
    """Converts xgboost model into categorical features.

    Reference: "Practical Lessons from Predicting Clicks on Ads at Facebook"
    https://research.fb.com/publications/practical-lessons-from-predicting-clicks-on-ads-at-facebook/
    """
    import xgboost as xgb

    f_train = model.predict(xgb.DMatrix(X_train), pred_leaf=True)
    f_test = model.predict(xgb.DMatrix(X_test), pred_leaf=True)

    enc = OneHotEncoder()
    enc.fit(f_train)
    return enc.transform(f_train), enc.transform(f_test)
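For context, a hedged sketch of how these one-hot leaf features are typically consumed in the GBDT-plus-linear-model scheme the referenced paper describes; model, X_train, X_test, and y_train are assumed to already exist:

from sklearn.linear_model import LogisticRegression

# The per-tree leaf indicators become sparse inputs to a downstream
# linear classifier (the GBDT + LR setup from the Facebook paper).
f_train, f_test = xgb_to_features(model, X_train, X_test)
lr = LogisticRegression(max_iter=1000)
lr.fit(f_train, y_train)
test_scores = lr.predict_proba(f_test)[:, 1]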
def make_OHE(names):
    data = []
    for name in names:
        data.append([name])
    enc = preprocessing.OneHotEncoder()
    enc.fit(data)
    OHE_data = enc.transform(data).toarray()
    return OHE_data
def one_hot(x):
    return np.array(OneHotEncoder().fit_transform(x.reshape(-1, 1)).todense())
def load_train(base):
    driver_imgs_list = pd.read_csv('driver_imgs_list.csv')
    driver_imgs_grouped = driver_imgs_list.groupby('classname')

    X_train = []
    y_train = []
    driver_ids = []
    print('Reading train images...')
    for j in range(NUM_CLASSES):
        print('Loading folder c{}...'.format(j))
        driver_ids_group = driver_imgs_grouped.get_group('c{}'.format(j))
        paths = os.path.join(base, 'c{}/'.format(j)) + driver_ids_group.img
        if SUBSET:
            paths = paths[:100]
            driver_ids_group = driver_ids_group.iloc[:100]
        driver_ids += driver_ids_group['subject'].tolist()
        for i, path in tqdm(enumerate(paths), total=len(paths)):
            img = load_image(path)
            if i == 0:
                imsave('c{}.jpg'.format(j), img)
            img = img.swapaxes(2, 0)
            X_train.append(img)
            y_train.append(j)

    X_train = np.array(X_train)
    y_train = np.array(y_train)
    y_train = OneHotEncoder(n_values=NUM_CLASSES) \
        .fit_transform(y_train.reshape(-1, 1)) \
        .toarray()
    return X_train, y_train, driver_ids
def one_hot(col, **kwargs):
    z = col.reshape(-1, 1)
    enc = OneHotEncoder(sparse=False, **kwargs)
    return enc.fit_transform(z)
def mapper(df, ngram_range, lowercase, binary, min_df=2, max_df=1.0,
           caps_features=True, pos_features=False):
    numbers = set('%d' % i for i in range(-100, 100))
    feature_extractors = []
    # we use a separate CountVectorizer for each word position
    # (so that min_df works correctly)
    feature_extractors.extend([
        (col, vec(ngram_range, lowercase, binary, min_df, max_df))
        for col in df.columns if col in numbers
    ])
    if caps_features:
        feature_extractors.extend([
            ([col], OneHotEncoder(sparse=False, handle_unknown='ignore'))
            for col in df.columns if col.endswith('_cap')
        ])
    if pos_features:
        feature_extractors.extend([
            (col, LabelBinarizer())
            for col in df.columns if '_Extra' in col
        ])
    return DataFrameMapper(feature_extractors, sparse=True)
def getdataset(datasetname, onehot_encode_strings=True):
    # load
    dataset = fetch_mldata(datasetname)
    # get X and y
    X = dshape(dataset.data)
    try:
        target = dshape(dataset.target)
    except:
        print("WARNING: No target found. Taking last column of data matrix as target")
        target = X[:, -1]
        X = X[:, :-1]
    if len(target.shape) > 1 and target.shape[1] > X.shape[1]:
        # some mldata sets are mixed up...
        X = target
        target = dshape(dataset.data)
    if len(X.shape) == 1 or X.shape[1] <= 1:
        for k in dataset.keys():
            if k != 'data' and k != 'target' and len(dataset[k]) == X.shape[1]:
                X = np.hstack((X, dshape(dataset[k])))
    # one-hot for categorical values
    if onehot_encode_strings:
        cat_ft = [i for i in range(X.shape[1]) if 'str' in str(
            type(unpack(X[0, i]))) or 'unicode' in str(type(unpack(X[0, i])))]
        if len(cat_ft):
            for i in cat_ft:
                X[:, i] = tonumeric(X[:, i])
            X = OneHotEncoder(categorical_features=cat_ft).fit_transform(X)
    # if sparse, make dense
    try:
        X = X.toarray()
    except:
        pass
    # convert y to monotonically increasing ints
    y = tonumeric(target).astype(int)
    return np.nan_to_num(X.astype(float)), y
def encode(features, labels):
    """One-hot encode the values of each feature

    Keyword arguments:
    features -- feature matrix
    labels -- labels of samples
    """
    enc = OneHotEncoder()
    enc.fit(features)
    arr = enc.transform(features).toarray()
    result = np.array([[0 for j in range(len(arr[0]) + 1)] for k in range(len(arr))])
    for i in range(len(arr)):
        result[i] = np.append(arr[i], labels[i])
    return result
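A toy invocation of encode, just to show the shape contract; the matrix and labels below are made up:

import numpy as np

features = np.array([[0, 1],
                     [1, 0],
                     [2, 1]])
labels = np.array([0, 1, 1])
encoded = encode(features, labels)
# Each row is its one-hot expansion (3 + 2 = 5 indicator columns here)
# with the label appended as the final column.
print(encoded.shape)  # (3, 6)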
def get_one_hot_weekday():
    encoder = OneHotEncoder(n_values=[7], sparse=False)
    encoder.fit([[0]])

    def one_hot_weekday(ix, **kargs):
        return encoder.transform([
            [datetime.date(2016, *ix_to_date(ix)).weekday()]
        ])[0].tolist()

    return one_hot_weekday
def get_one_hot_key():
    encoder = OneHotEncoder(n_values=[3, 2], sparse=False)
    encoder.fit([[0, 0]])

    def one_hot_key(ix, **kargs):
        return encoder.transform([[ix[-2] - 1, ix[-1]]])[0].tolist()

    return one_hot_key