The following 37 code examples, extracted from open-source Python projects, illustrate how to use sklearn.preprocessing.RobustScaler().
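Before the extracted examples, here is a minimal, self-contained sketch of the typical fit/transform workflow; the arrays and values below are made up purely for illustration:

import numpy as np
from sklearn.preprocessing import RobustScaler

# toy feature matrix with an outlier in the second column (made-up values)
X_train = np.array([[1.0, 2.0], [2.0, 3.0], [3.0, 100.0]])
X_test = np.array([[1.5, 2.5]])

# RobustScaler centers on the median and scales by the interquartile range,
# so the single outlier barely affects the learned statistics
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit on training data only
X_test_scaled = scaler.transform(X_test)        # reuse the same statistics

print(scaler.center_, scaler.scale_)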
def scale_data(X, scaler=None):
    """Scale X with robust scaling.

    Args:
        X (np.array): feature matrix indexed by binID.
        scaler (RobustScaler): pre-trained scaler. Default is None.

    Returns:
        np.array: normalized feature matrix.
        RobustScaler: robust scaler fitted with training data,
            only returned when there is no pre-trained scaler.
    """
    if scaler is not None:
        return scaler.transform(X)
    else:
        scaler = RobustScaler(copy=False)
        scaler.fit(X)
        return scaler.transform(X), scaler
def choose_best_lag(seq, pre_period, lags=range(1, 30), Kmax=200):
    """Use the lazzy model to choose the best input lag."""
    models = []
    # standardize the sequence
    std_sca = StandardScaler().fit(np.array(seq).reshape(-1, 1))
    # rob_sca = RobustScaler().fit(np.array(seq).reshape(-1,1))
    seq = std_sca.transform(np.array(seq).reshape(-1, 1))
    # for each candidate lag, evaluate the prediction error on the last window
    for input_lag in lags:
        # window = input_lag + pre_period
        X, Y = create_dataset(seq.flatten(), input_lag, pre_period)
        lazzy_models = lazzy_loo(X[-1], X[0:-1], Y[:-1], Kmax)
        y_pred = lazzy_prediction(X[-1], X[0:-1], Y[:-1], lazzy_models)
        err = err_evaluation(y_pred, Y[-1])
        lazzy_models.sort()
        models.append((err, input_lag, lazzy_models[0][1]))
    models.sort()
    best_lag = models[0][1]
    best_k = models[0][2]
    return models, best_lag, best_k
def test_large_grid():
    """In this test, we purposely overfit a RandomForest to completely random
    data in order to assert that the test error will far supersede the train error.
    """
    if not SK18:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
    else:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }

    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy',
                              n_jobs=1, cv=custom_cv, random_state=42)

    # this will fail because we haven't fit yet
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

    # fit the grid
    grid.fit(X_train, y_train)

    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)

    # coverage:
    assert grid._estimator_type == 'classifier'

    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

    # evaluate score (SHOULD be better than random...)
    accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

    # grid score reports:
    # assert fails for bad percentile
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 0.0})
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 1.0})

    # assert fails for bad y_axis
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'y_axis': 'bad_axis'})

    # assert passes otherwise
    report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
def fit(self, X, y=None):
    self.rs = RobustScaler()
    self.rs.fit(X)
    self.center_ = pd.Series(self.rs.center_, index=X.columns)
    self.scale_ = pd.Series(self.rs.scale_, index=X.columns)
    return self
def keras_mlp1(train2, y, test2, v, z):
    from keras import layers
    from keras import models
    from keras import optimizers
    cname = sys._getframe().f_code.co_name
    num_splits = 9
    scaler = preprocessing.RobustScaler()
    train3 = scaler.fit_transform(train2)
    test3 = scaler.transform(test2)
    input_dims = train3.shape[1]
    def build_model():
        input_ = layers.Input(shape=(input_dims,))
        model = layers.Dense(256, kernel_initializer='Orthogonal')(input_)
        #model = layers.BatchNormalization()(model)
        #model = layers.advanced_activations.PReLU()(model)
        model = layers.Activation('selu')(model)
        #model = layers.Dropout(0.7)(model)

        model = layers.Dense(64, kernel_initializer='Orthogonal')(model)
        #model = layers.BatchNormalization()(model)
        model = layers.Activation('selu')(model)
        #model = layers.advanced_activations.PReLU()(model)
        #model = layers.Dropout(0.9)(model)

        model = layers.Dense(16, kernel_initializer='Orthogonal')(model)
        #model = layers.BatchNormalization()(model)
        model = layers.Activation('selu')(model)
        #model = layers.advanced_activations.PReLU()(model)

        model = layers.Dense(1, activation='sigmoid')(model)

        model = models.Model(input_, model)
        model.compile(loss='binary_crossentropy', optimizer=optimizers.Nadam())
        #print(model.summary(line_length=120))
        return model
    keras_common(train3, y, test3, v, z, num_splits, cname, build_model)
def keras_mlp2(train2, y, test2, v, z):
    from keras import layers
    from keras import models
    from keras import optimizers
    cname = sys._getframe().f_code.co_name
    num_splits = 9
    scaler = preprocessing.RobustScaler()
    train3 = scaler.fit_transform(train2)
    test3 = scaler.transform(test2)
    input_dims = train3.shape[1]
    def build_model():
        input_ = layers.Input(shape=(input_dims,))
        model = layers.Dense(1024, kernel_initializer='Orthogonal')(input_)
        model = layers.Activation('selu')(model)

        model = layers.Dense(128, kernel_initializer='Orthogonal')(model)
        model = layers.Activation('selu')(model)

        model = layers.Dense(16, kernel_initializer='Orthogonal')(model)
        model = layers.Activation('selu')(model)

        model = layers.Dense(1, activation='sigmoid')(model)

        model = models.Model(input_, model)
        model.compile(loss='binary_crossentropy', optimizer=optimizers.RMSprop())
        #print(model.summary(line_length=120))
        return model
    keras_common(train3, y, test3, v, z, num_splits, cname, build_model)
def keras_mlp3(train2, y, test2, v, z):
    from keras import layers
    from keras import models
    from keras import optimizers
    cname = sys._getframe().f_code.co_name
    num_splits = 9
    scaler = preprocessing.RobustScaler()
    train3 = scaler.fit_transform(train2)
    test3 = scaler.transform(test2)
    input_dims = train3.shape[1]
    def build_model():
        input_ = layers.Input(shape=(input_dims,))
        model = layers.Dense(256, kernel_initializer='Orthogonal')(input_)
        model = layers.Activation('selu')(model)

        model = layers.Dense(32, kernel_initializer='Orthogonal')(model)
        model = layers.Activation('selu')(model)

        model = layers.Dense(4, kernel_initializer='Orthogonal')(model)
        model = layers.Activation('selu')(model)

        model = layers.Dense(1, activation='sigmoid')(model)

        model = models.Model(input_, model)
        model.compile(loss='binary_crossentropy', optimizer=optimizers.SGD(nesterov=True))
        #print(model.summary(line_length=120))
        return model
    keras_common(train3, y, test3, v, z, num_splits, cname, build_model)
def keras_mlp3(train2, y, test2, v, z):
    from keras import layers
    from keras import models
    from keras import optimizers
    cname = sys._getframe().f_code.co_name
    num_splits = 9
    scaler = preprocessing.RobustScaler()
    train3 = scaler.fit_transform(train2)
    test3 = scaler.transform(test2)
    input_dims = train3.shape[1]
    def build_model():
        input_ = layers.Input(shape=(input_dims,))
        model = layers.Dense(512, kernel_initializer='Orthogonal')(input_)
        model = layers.Activation('selu')(model)

        model = layers.Dense(256, kernel_initializer='Orthogonal')(model)
        model = layers.Activation('selu')(model)

        model = layers.Dense(32, kernel_initializer='Orthogonal')(model)
        model = layers.Activation('selu')(model)

        model = layers.Dense(1, activation='sigmoid')(model)

        model = models.Model(input_, model)
        model.compile(loss='binary_crossentropy', optimizer=optimizers.SGD(nesterov=True))
        #print(model.summary(line_length=120))
        return model
    keras_common(train3, y, test3, v, z, num_splits, cname, build_model)
def keras_base(train2, y, test2, v, z, build_model, N_splits, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    scaler = preprocessing.RobustScaler()
    train3 = scaler.fit_transform(train2)
    test3 = scaler.transform(test2)
    model = build_model(train3.shape[1])
    model.summary(line_length=120)
    model_path = '../data/working/' + cname + base_data_name() + '_keras_model.h5'
    num_splits = N_splits
    ss = model_selection.StratifiedKFold(n_splits=num_splits, random_state=base_seed)
    for n, (itrain, ival) in enumerate(ss.split(train3, y)):
        model = build_model(train3.shape[1])
        xtrain, xval = train3[itrain], train3[ival]
        ytrain, yval = y[itrain], y[ival]
        model.fit(
                xtrain, ytrain,
                epochs=10000,
                batch_size=256,
                validation_data=(xval, yval),
                verbose=0,
                callbacks=keras_fit_callbacks(model_path),
                shuffle=True
            )
        model.load_weights(model_path)
        p = model.predict(xval)
        v.loc[ival, cname] += pconvert(p).ravel()
        score = metrics.log_loss(y[ival], p)
        print(cname, 'fold %d: ' % (n + 1), score, now())
        scores.append(score)
        z[cname] += pconvert(model.predict(test3).ravel())
        del model
        os.remove(model_path)
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= num_splits

#@tf_force_cpu
def keras_mlp2(train2, y, test2, v, z):
    from keras import layers
    from keras import models
    from keras import optimizers
    cname = sys._getframe().f_code.co_name
    num_splits = 9
    scaler = preprocessing.RobustScaler()
    train3 = scaler.fit_transform(train2)
    test3 = scaler.transform(test2)
    input_dims = train3.shape[1]
    def build_model():
        input_ = layers.Input(shape=(input_dims,))
        model = layers.Dense(1024, kernel_initializer='Orthogonal')(input_)
        model = layers.Activation('selu')(model)

        model = layers.Dense(128, kernel_initializer='Orthogonal')(model)
        model = layers.Activation('selu')(model)

        model = layers.Dense(16, kernel_initializer='Orthogonal')(model)
        model = layers.Activation('selu')(model)

        model = layers.Dense(1, activation='sigmoid')(model)

        model = models.Model(input_, model)
        model.compile(loss='binary_crossentropy', optimizer=optimizers.SGD())
        #print(model.summary(line_length=120))
        return model
    keras_common(train3, y, test3, v, z, num_splits, cname, build_model)
def keras_mlp3(train2, y, test2, v, z):
    from keras import layers
    from keras import models
    from keras import optimizers
    cname = sys._getframe().f_code.co_name
    num_splits = 9
    scaler = preprocessing.RobustScaler()
    train3 = scaler.fit_transform(train2)
    test3 = scaler.transform(test2)
    input_dims = train3.shape[1]
    def build_model():
        input_ = layers.Input(shape=(input_dims,))
        model = layers.Dense(512, kernel_initializer='Orthogonal')(input_)
        model = layers.Activation('selu')(model)

        model = layers.Dense(256, kernel_initializer='Orthogonal')(model)
        model = layers.Activation('selu')(model)

        model = layers.Dense(32, kernel_initializer='Orthogonal')(model)
        model = layers.Activation('selu')(model)

        model = layers.Dense(1, activation='sigmoid')(model)

        model = models.Model(input_, model)
        model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam())
        #print(model.summary(line_length=120))
        return model
    keras_common(train3, y, test3, v, z, num_splits, cname, build_model)
def choose_best_lag(seq, pre_period, lags=range(1, 30), Kmax=200):
    """Use the lazzy model to choose the best input lag.

    Improved variant: the error is averaged over a held-out split.
    """
    models = []
    # standardize the sequence
    std_sca = StandardScaler().fit(np.array(seq).reshape(-1, 1))
    # rob_sca = RobustScaler().fit(np.array(seq).reshape(-1,1))
    seq = std_sca.transform(np.array(seq).reshape(-1, 1))
    # for each candidate lag, evaluate the prediction error
    from sklearn.model_selection import train_test_split
    for input_lag in lags:
        # window = input_lag + pre_period
        X, Y = create_dataset(seq.flatten(), input_lag, pre_period)
        # lazzy_models = lazzy_loo(X[-1], X[0:-1], Y[:-1], Kmax)
        # y_pred = lazzy_prediction(X[-1], X[0:-1], Y[:-1], lazzy_models)
        # err = err_evaluation(y_pred.flatten(), Y[-1])
        #
        # lazzy_models.sort()
        # models.append((err, input_lag, lazzy_models[0][1]))

        # do more cv
        # for state in range(0,3):
        err = 0.0
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.01, random_state=0)
        for x_q, y_q in zip(X_test, y_test):
            lazzy_models = lazzy_loo(x_q, X_train, y_train, Kmax)
            y_pred = lazzy_prediction(x_q, X_train, y_train, lazzy_models)
            err += err_evaluation(y_pred.flatten(), y_q)
        lazzy_models.sort()
        models.append((err / len(X_test), input_lag, lazzy_models[0][1]))
    models.sort()
    best_lag = models[0][1]
    best_k = models[0][2]
    # fig, ax = plt.subplots()
    # ax.plot(y_pred.flatten(), label='prediction')
    # ax.plot(Y[-1], label='real')
    # ax.set_title('best cv lags')
    return models, best_lag, best_k
def test_random_grid():
    # build a pipeline
    pipe = Pipeline([
        ('retainer', FeatureRetainer()),  # will retain all
        ('dropper', FeatureDropper()),  # won't drop any
        ('mapper', FunctionMapper()),  # pass through
        ('encoder', OneHotCategoricalEncoder()),  # no object dtypes, so will pass through
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('imputer', SelectiveImputer()),  # pass through
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('nzv', NearZeroVarianceFilterer(threshold=1e-4)),
        ('pca', SelectivePCA(n_components=0.9)),
        ('model', RandomForestClassifier(n_jobs=1))
    ])

    # let's define a set of hyper-parameters over which to search
    hp = {
        'collinearity__threshold': uniform(loc=.8, scale=.15),
        'collinearity__method': ['pearson', 'kendall', 'spearman'],
        'scaler__scaler': [StandardScaler(), RobustScaler()],
        'pca__n_components': uniform(loc=.75, scale=.2),
        'pca__whiten': [True, False],
        'model__n_estimators': randint(5, 10),
        'model__max_depth': randint(2, 5),
        'model__min_samples_leaf': randint(1, 5),
        'model__max_features': uniform(loc=.5, scale=.5),
        'model__max_leaf_nodes': randint(10, 15)
    }

    # define the gridsearch
    search = RandomizedSearchCV(pipe, hp,
                                n_iter=2,  # just to test it even works
                                scoring='accuracy',
                                cv=2,
                                random_state=42)

    # fit the search
    search.fit(X_train, y_train)

    # test the report
    report_grid_score_detail(search, charts=False)
def normalize_padded(padded, means=None, stds=None):
    """Normalize by last dim of padded with means/stds or calculate them.

    .. TODO::
        * consider importing instead ex:
            from sklearn.preprocessing import StandardScaler, RobustScaler
            robust_scaler = RobustScaler()
            x_train = robust_scaler.fit_transform(x_train)
            x_test = robust_scaler.transform(x_test)
            ValueError: Found array with dim 3. RobustScaler expected <= 2.
        * Don't normalize binary features
        * If events are sparse then this may lead to huge values.
    """
    # TODO epsilon choice is random
    epsilon = 1e-6
    original_dtype = padded.dtype

    is_flat = len(padded.shape) == 2
    if is_flat:
        padded = np.expand_dims(padded, axis=-1)

    n_features = padded.shape[2]
    n_obs = padded.shape[0] * padded.shape[1]

    if means is None:
        means = np.nanmean(np.float128(
            padded.reshape(n_obs, n_features)), axis=0)

    means = means.reshape([1, 1, n_features])
    padded = padded - means

    if stds is None:
        stds = np.nanstd(np.float128(
            padded.reshape(n_obs, n_features)), axis=0)

    stds = stds.reshape([1, 1, n_features])
    if (stds < epsilon).any():
        print('warning. Constant cols: ', np.where((stds < epsilon).flatten()))
        stds[stds < epsilon] = 1.0
        # should be (small number)/1.0 as mean is subtracted.
        # Possible prob depending on machine err

    # 128 float cast otherwise
    padded = (padded / stds).astype(original_dtype)

    if is_flat:
        # Return to flat
        padded = np.squeeze(padded)
    return padded, means, stds
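The docstring above notes that RobustScaler rejects 3-D input ("Found array with dim 3. RobustScaler expected <= 2."). A minimal sketch of the usual workaround, assuming a padded (n_sequences, n_timesteps, n_features) array (the variable names and shapes here are hypothetical):

import numpy as np
from sklearn.preprocessing import RobustScaler

# hypothetical padded sequence batch: 4 sequences, 5 timesteps, 3 features
padded = np.random.rand(4, 5, 3)
n_seq, n_steps, n_feat = padded.shape

scaler = RobustScaler()
flat = padded.reshape(-1, n_feat)               # collapse to 2-D for sklearn
scaled = scaler.fit_transform(flat)             # per-feature median/IQR scaling
padded_scaled = scaled.reshape(n_seq, n_steps, n_feat)  # restore the 3-D shape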
def model(self):
    #cname = sys._getframe().f_code.co_name
    cname = 'keras'
    train, y, test = self.train_, self.y_, self.test_

    np.random.seed(1234)

    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)

    from sklearn import pipeline
    pipe = pipeline.make_pipeline(preprocessing.Imputer(), preprocessing.RobustScaler())

    train = pipe.fit_transform(train)
    test = pipe.transform(test)

    self.input_dims_ = train.shape[1]
    def build_model():
        return self.build_keras_model()
    batch_size = self.batch_size_
    build_model().summary(line_length=120)
    ss = model_selection.StratifiedKFold(n_splits=self.num_splits_, random_state=11, shuffle=True)
    scores = list()
    model_path = self.temp_name('keras_mlp_weights')
    v, z = self.v_, self.z_
    v[cname] = 0
    z[cname] = 0
    for n, (itrain, ival) in enumerate(ss.split(train, y)):
        xtrain, xval = train[itrain], train[ival]
        ytrain, yval = y[itrain], y[ival]
        model = build_model()
        model.fit(
                xtrain, ytrain,
                batch_size=batch_size,
                epochs=10000,
                validation_data=(xval, yval),
                verbose=0,
                callbacks=build_keras_fit_callbacks(model_path),
                shuffle=True
            )
        model.load_weights(model_path)
        p = model.predict(xval)
        v.loc[ival, cname] += p.ravel()
        score = metrics.log_loss(y[ival], p)
        if score != score:
            raise Exception('NaN score!!!')
        print(cname, 'fold %d: ' % (n + 1), score, self.now())
        scores.append(score)
        z[cname] += model.predict(test).ravel()
        del model
        for i in range(3):
            gc.collect(i)
    print('scores:', scores, np.mean(scores), np.std(scores))
    self.drop_temp(model_path)
    cv = np.mean(scores)
    z[cname] /= self.num_splits_
    z['y'] = z[cname]
    return cv, None