The following 50 code examples, extracted from Python open-source projects, illustrate how to use sklearn.preprocessing.StandardScaler().
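All of the examples below follow the same basic pattern, so as a quick orientation here is a minimal, self-contained sketch of typical StandardScaler usage (the array values are purely illustrative): fit the scaler on the training data only, then reuse the learned mean_ and scale_ to transform any other data.

import numpy as np
from sklearn.preprocessing import StandardScaler

# Illustrative data: two features on very different scales.
X_train = np.array([[1.0, 200.0], [2.0, 300.0], [3.0, 400.0]])
X_test = np.array([[1.5, 250.0]])

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)  # learns mean_ and scale_ from X_train
X_test_std = scaler.transform(X_test)        # reuses the training statistics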
def load_data(data_dir, num_files=30):
    files_list = os.listdir(data_dir)
    data = None
    ac_data = None
    for fname in files_list[:num_files]:
        print(fname)
        f = os.path.join(data_dir, fname)
        with netcdf.netcdf_file(f, 'r') as fid:
            m = fid.variables['outputMeans'][:].copy()
            s = fid.variables['outputStdevs'][:].copy()
            feats = fid.variables['targetPatterns'][:].copy()
            ac_feats = fid.variables['inputs'][:].copy()
        # Undo the normalisation using the mean and stdev stored in the file.
        scaler = preprocessing.StandardScaler()
        scaler.mean_ = m
        scaler.scale_ = s
        feats = scaler.inverse_transform(feats)
        assert feats.shape[0] == ac_feats.shape[0]
        # feats = np.concatenate((feats, ac_feats), axis=1)
        if data is None and ac_data is None:
            data = feats
            ac_data = ac_feats
        else:
            data = np.vstack((data, feats))
            ac_data = np.vstack((ac_data, ac_feats))
    return data, ac_data
def computeNeighboursScores(self):
    all_instances = self.iteration.datasets.instances
    # Connectivity matrix
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', NearestNeighbors(self.num_neighbours, n_jobs=-1))])
    pipeline.fit(all_instances.getFeatures())
    # Labels
    labels = np.array([generateLabel(x) for x in all_instances.getLabels()])
    # Compute neighbour scores
    scores = []
    all_neighbours = pipeline.named_steps['model'].kneighbors(return_distance=False)
    for i, label in enumerate(labels):
        if label != 0:
            continue
        else:
            neighbours = all_neighbours[i]
            score = sum(labels[neighbours] + 1) / (2.0 * self.num_neighbours)
            scores.append(score)
    return np.array(scores)
def scale_numeric_data(pandas_data):
    # Scaling is important because if the variables are too different from
    # one another, it can throw off the model.
    # EX: If one variable has an average of 1000, and another has an average
    # of .5, then the model won't be as accurate.
    for col in pandas_data.columns:
        if pandas_data[col].dtype == np.float64 or pandas_data[col].dtype == np.int64:
            pandas_data[col] = preprocessing.scale(pandas_data[col])
    return pandas_data


# Creates a standard scaler based on the training data and applies it to both train
# and test data.
# Input:
#   - Two Pandas DataFrames, same number of columns
# Output:
#   - Two Pandas DataFrames, both of which have been scaled based on StandardScaler
#     trained on training data.
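The trailing comment above describes a companion train/test scaling function whose body is not included in this example. A minimal sketch of what such a function might look like under the stated contract; the name scale_train_test and its internals are assumptions, not the original project code:

import pandas as pd
from sklearn import preprocessing

def scale_train_test(train_df, test_df):
    # Hypothetical helper: fit a StandardScaler on the training DataFrame only,
    # then apply the same transformation to both DataFrames.
    scaler = preprocessing.StandardScaler().fit(train_df)
    train_scaled = pd.DataFrame(scaler.transform(train_df),
                                columns=train_df.columns, index=train_df.index)
    test_scaled = pd.DataFrame(scaler.transform(test_df),
                               columns=test_df.columns, index=test_df.index)
    return train_scaled, test_scaled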
def build_ensemble(**kwargs):
    """Generate ensemble."""
    ens = SuperLearner(**kwargs)
    prep = {'Standard Scaling': [StandardScaler()],
            'Min Max Scaling': [MinMaxScaler()],
            'No Preprocessing': []}
    est = {'Standard Scaling': [ElasticNet(), Lasso(), KNeighborsRegressor()],
           'Min Max Scaling': [SVR()],
           'No Preprocessing': [RandomForestRegressor(random_state=SEED),
                                GradientBoostingRegressor()]}
    ens.add(est, prep)
    ens.add(GradientBoostingRegressor(), meta=True)
    return ens
def tf2npz(tf_path, export_folder=FAST):
    vid_ids = []
    labels = []
    mean_rgb = []
    mean_audio = []
    tf_basename = os.path.basename(tf_path)
    npz_basename = tf_basename[:-len('.tfrecord')] + '.npz'
    isTrain = '/test' not in tf_path

    for example in tf.python_io.tf_record_iterator(tf_path):
        tf_example = tf.train.Example.FromString(example).features
        vid_ids.append(tf_example.feature['video_id'].bytes_list.value[0].decode(encoding='UTF-8'))
        if isTrain:
            labels.append(np.array(tf_example.feature['labels'].int64_list.value))
        mean_rgb.append(np.array(tf_example.feature['mean_rgb'].float_list.value).astype(np.float32))
        mean_audio.append(np.array(tf_example.feature['mean_audio'].float_list.value).astype(np.float32))

    save_path = export_folder + '/' + npz_basename
    np.savez(save_path,
             rgb=StandardScaler().fit_transform(np.array(mean_rgb)),
             audio=StandardScaler().fit_transform(np.array(mean_audio)),
             ids=np.array(vid_ids),
             labels=labels)
def preprocess_data(train_data_matrix, valid_data_matrix, test_data_matrix):
    """ Function to preprocess the data with the standard scaler from scikit-learn.
        It takes in the training, validation, and testing matrices and returns the
        standardized versions of them.

        Input:      train_data_matrix               The data matrix with the training set data
                    valid_data_matrix               The data matrix with the validation set data
                    test_data_matrix                The data matrix with the testing set data

        Output:     transform_train_data_matrix     The data matrix with the standardized training set data
                    transform_valid_data_matrix     The data matrix with the standardized validation set data
                    transform_test_data_matrix      The data matrix with the standardized testing set data

        Usage:      preprocess_data(train_data_matrix, valid_data_matrix, test_data_matrix)
    """
    reg_scaler = prep.StandardScaler().fit(train_data_matrix)
    transform_train_data_matrix = reg_scaler.transform(train_data_matrix)
    transform_valid_data_matrix = reg_scaler.transform(valid_data_matrix)
    transform_test_data_matrix = reg_scaler.transform(test_data_matrix)

    return transform_train_data_matrix, transform_valid_data_matrix, transform_test_data_matrix
def load_norm_stats(stats_file, dim, method="MVN"):
    #### load norm stats ####
    io_funcs = BinaryIOCollection()
    norm_matrix, frame_number = io_funcs.load_binary_file_frame(stats_file, dim)
    assert frame_number == 2

    if method == "MVN":
        scaler = preprocessing.StandardScaler()
        scaler.mean_ = norm_matrix[0, :]
        scaler.scale_ = norm_matrix[1, :]
    elif method == "MINMAX":
        scaler = preprocessing.MinMaxScaler(feature_range=(0.01, 0.99))
        scaler.min_ = norm_matrix[0, :]
        scaler.scale_ = norm_matrix[1, :]

    return scaler
def test_group_lasso():
    """Group Lasso test."""
    n_samples, n_features = 100, 90

    # assign group ids
    groups = np.zeros(90)
    groups[0:29] = 1
    groups[30:59] = 2
    groups[60:] = 3

    # sample random coefficients
    beta0 = np.random.normal(0.0, 1.0, 1)
    beta = np.random.normal(0.0, 1.0, n_features)
    beta[groups == 2] = 0.

    # create an instance of the GLM class
    glm_group = GLM(distr='softplus', alpha=1.)

    # simulate training data
    Xr = np.random.normal(0.0, 1.0, [n_samples, n_features])
    yr = simulate_glm(glm_group.distr, beta0, beta, Xr)

    # scale and fit
    scaler = StandardScaler().fit(Xr)
    glm_group.fit(scaler.transform(Xr), yr)
def load_data(data_dir, num_files=30):
    files_list = os.listdir(data_dir)
    data = None
    for fname in files_list[:num_files]:
        print(fname)
        f = os.path.join(data_dir, fname)
        with netcdf.netcdf_file(f, 'r') as fid:
            m = fid.variables['outputMeans'][:].copy()
            s = fid.variables['outputStdevs'][:].copy()
            feats = fid.variables['targetPatterns'][:].copy()
        # Undo the normalisation using the mean and stdev stored in the file.
        scaler = preprocessing.StandardScaler()
        scaler.mean_ = m
        scaler.scale_ = s
        feats = scaler.inverse_transform(feats)
        if data is None:
            data = feats
        else:
            data = np.vstack((data, feats))
    return data
def load_data(data_dir, num_files=30):
    files_list = os.listdir(data_dir)
    dataset = []
    ac_dataset = []
    for fname in files_list[:num_files]:
        # print(fname)
        f = os.path.join(data_dir, fname)
        with netcdf.netcdf_file(f, 'r') as fid:
            m = fid.variables['outputMeans'][:].copy()
            s = fid.variables['outputStdevs'][:].copy()
            feats = fid.variables['targetPatterns'][:].copy()
            ac_feats = fid.variables['inputs'][:].copy()
        scaler = preprocessing.StandardScaler()
        scaler.mean_ = m
        scaler.scale_ = s
        feats = scaler.inverse_transform(feats)
        assert feats.shape[0] == ac_feats.shape[0]
        dataset.extend(feats)
        ac_dataset.extend(ac_feats)
    dataset = np.asarray(dataset)
    ac_dataset = np.asarray(ac_dataset)
    # print(dataset.shape, ac_dataset.shape)
    return dataset, ac_dataset
def init_state(indata, test=False):
    close = indata['close'].values
    diff = np.diff(close)
    diff = np.insert(diff, 0, 0)
    sma15 = SMA(indata, timeperiod=15)
    sma60 = SMA(indata, timeperiod=60)
    rsi = RSI(indata, timeperiod=14)
    atr = ATR(indata, timeperiod=14)

    #--- Preprocess data
    xdata = np.column_stack((close, diff, sma15, close - sma15, sma15 - sma60, rsi, atr))
    xdata = np.nan_to_num(xdata)
    if not test:
        scaler = preprocessing.StandardScaler()
        xdata = np.expand_dims(scaler.fit_transform(xdata), axis=1)
        joblib.dump(scaler, 'data/scaler.pkl')
    else:
        # reuse the scaler fitted on the training data
        scaler = joblib.load('data/scaler.pkl')
        xdata = np.expand_dims(scaler.transform(xdata), axis=1)
    state = xdata[0:1, 0:1, :]

    return state, xdata, close

#Take Action
def init_state(data):
    close = data
    diff = np.diff(data)
    diff = np.insert(diff, 0, 0)

    #--- Preprocess data
    xdata = np.column_stack((close, diff))
    xdata = np.nan_to_num(xdata)
    scaler = preprocessing.StandardScaler()
    xdata = scaler.fit_transform(xdata)

    state = xdata[0:1, :]
    return state, xdata

#Take Action
def sample_pipelines(pca_kernels=None, svr_kernels=None):
    """ Pipelines that can't be fit in a reasonable amount of time on the whole dataset """
    # Model instances
    model_steps = []
    if pca_kernels is None:
        pca_kernels = ['poly', 'rbf', 'sigmoid', 'cosine']
    for pca_kernel in pca_kernels:
        model_steps.append([
            KernelPCA(n_components=2, kernel=pca_kernel),
            LinearRegression(),
        ])
    if svr_kernels is None:
        svr_kernels = ['poly', 'rbf', 'sigmoid']
    for svr_kernel in svr_kernels:
        model_steps.append(SVR(kernel=svr_kernel, verbose=True, cache_size=1000))

    # Pipelines
    pipelines = []
    for m in model_steps:
        # Steps
        common_steps = [
            StandardScaler(),
        ]
        estimator_steps = m if isinstance(m, list) else [m]
        steps = common_steps + estimator_steps
        pipelines.append(make_pipeline(*steps))
    return pipelines
def PCA_analysis(data, mode, cell_stages=None):
    """Principal Component Analysis.
    """
    assert mode in {'pca', 'pca2'}
    mean_shifter = StandardScaler(with_std=False)
    if mode == 'pca':
        pca = PCA(min(data.shape))
        projected_data = pca.fit_transform(mean_shifter.fit_transform(data))
        components = pca.components_
    else:
        assert isinstance(cell_stages, np.ndarray)
        idx = np.where(cell_stages == np.max(cell_stages))[0]
        pca = PCA(min(idx.size, data.shape[1]))
        pca.fit(mean_shifter.fit_transform(data[idx]))
        components = pca.components_
        projected_data = np.dot(data, components.T)
    return components, projected_data
def mfccFeature_audio(filename_wav, index_keep, feature_type='mfcc'):
    audio = ess.MonoLoader(downmix='left', filename=filename_wav, sampleRate=fs)()
    if feature_type == 'mfcc':
        feature = getFeature(audio)
    elif feature_type == 'mfccBands1D':
        feature = getMFCCBands1D(audio)
    elif feature_type == 'mfccBands2D':
        feature = getMFCCBands2D(audio, nbf=True)

    if feature_type == 'mfccBands1D' or feature_type == 'mfccBands2D':
        feature = np.log(100000 * feature + 1)
        scaler = pickle.load(open(kerasScaler_path, 'rb'))
        feature = scaler.transform(feature)
        # feature = preprocessing.StandardScaler().fit_transform(feature)

    # index_keep = pitchProcessing_audio(filename_wav)
    feature_out = feature[index_keep[0], :]
    for index in index_keep[1:]:
        feature_out = np.vstack((feature_out, feature[index, :]))

    if feature_type == 'mfccBands2D':
        feature_out = featureReshape(feature_out)

    return feature_out
def trainValidationSplit(dic_pho_feature_train, validation_size=0.2):
    '''
    split the feature in dic_pho_feature_train into train and validation set
    :param dic_pho_feature_train: input dictionary, key: phoneme, value: feature vectors
    :return:
    '''
    feature_all = []
    label_all = []
    for key in dic_pho_feature_train:
        feature = dic_pho_feature_train[key]
        label = [dic_pho_label[key]] * len(feature)

        if len(feature):
            if not len(feature_all):
                feature_all = feature
            else:
                feature_all = np.vstack((feature_all, feature))
            label_all += label
    label_all = np.array(label_all, dtype='int64')

    feature_all = preprocessing.StandardScaler().fit_transform(feature_all)

    feature_train, feature_validation, label_train, label_validation = \
        train_test_split(feature_all, label_all, test_size=validation_size, stratify=label_all)

    return feature_train, feature_validation, label_train, label_validation
def test_boston(self):
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    scikit_model = StandardScaler().fit(scikit_data.data)

    spec = converter.convert(scikit_model, scikit_data.feature_names, 'out').get_spec()

    input_data = [dict(zip(scikit_data.feature_names, row)) for row in scikit_data.data]
    output_data = [{"out": row} for row in scikit_model.transform(scikit_data.data)]

    metrics = evaluate_transformer(spec, input_data, output_data)
    assert metrics["num_errors"] == 0
def test_boston_OHE_plus_normalizer(self):
    data = load_boston()

    pl = Pipeline([
        ("OHE", OneHotEncoder(categorical_features=[8], sparse=False)),
        ("Scaler", StandardScaler())])

    pl.fit(data.data, data.target)

    # Convert the model
    spec = convert(pl, data.feature_names, 'out')

    input_data = [dict(zip(data.feature_names, row)) for row in data.data]
    output_data = [{"out": row} for row in pl.transform(data.data)]

    result = evaluate_transformer(spec, input_data, output_data)
    assert result["num_errors"] == 0
def dataset_generator():
    """
    generate dataset for binary classification
    :return:
    """
    X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                               random_state=1, n_clusters_per_class=1)
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)

    datasets = [make_moons(noise=0.3, random_state=0),
                make_circles(noise=0.2, factor=0.5, random_state=1),
                linearly_separable]

    X, y = datasets[0]
    y[y == 0] = -1
    X = StandardScaler().fit_transform(X)
    return X, y
def make_x_y(self, data, code):
    data_x = []
    data_y = []
    data.loc[:, 'month'] = data.loc[:, '??'] % 10000 / 100
    data = data.drop(['??', '????'], axis=1)

    # normalization
    data = np.array(data)
    if len(data) <= 0:
        return np.array([]), np.array([])

    if code not in self.scaler:
        self.scaler[code] = StandardScaler()
        data = self.scaler[code].fit_transform(data)
    elif code not in self.scaler:
        return np.array([]), np.array([])
    else:
        data = self.scaler[code].transform(data)

    for i in range(self.frame_len, len(data) - self.predict_dist + 1):
        data_x.extend(np.array(data[i - self.frame_len:i, :]))
        data_y.append(data[i + self.predict_dist - 1][0])
    np_x = np.array(data_x).reshape(-1, 23 * 30)
    np_y = np.array(data_y)
    return np_x, np_y
def make_x_y(self, data, code):
    data_x = []
    data_y = []
    data.loc[:, 'month'] = data.loc[:, '??'] % 10000 / 100
    data = data.drop(['??', '????'], axis=1)

    # normalization
    data = np.array(data)
    if len(data) <= 0:
        return np.array([]), np.array([])

    if code not in self.scaler:
        self.scaler[code] = StandardScaler()
        data = self.scaler[code].fit_transform(data)
    elif code not in self.scaler:
        return np.array([]), np.array([])
    else:
        data = self.scaler[code].transform(data)

    for i in range(self.frame_len, len(data) - self.predict_dist + 1):
        data_x.extend(np.array(data[i - self.frame_len:i, :]))
        data_y.append(data[i + self.predict_dist - 1][0])
    np_x = np.array(data_x).reshape(-1, 23 * self.frame_len)
    np_y = np.array(data_y)
    return np_x, np_y
def fit_model(self, logging_uuid, model=None, epochs=1000, batch_size=10):
    if model is not None:
        self.model = model
    X, y, _ = self.get_formulation_training_data()
    scaler = StandardScaler().fit(X)
    lcb = LambdaCallback(
        on_epoch_end=lambda epoch, logs: r.set(
            logging_uuid,
            json.dumps({'model_state': 'training', 'epoch': epoch,
                        'epochs': epochs, 'loss': logs['loss']})),
        on_train_end=lambda logs: r.set(
            logging_uuid,
            json.dumps({'model_state': 'training', 'epoch': epochs, 'epochs': epochs})),
    )
    self.fit_history = self.model.fit(scaler.transform(X), y,
                                      epochs=epochs,
                                      batch_size=batch_size,
                                      verbose=0,
                                      callbacks=[lcb])
    return self.model, self.fit_history
def save_grid_to_db(self, model=None):
    if model is not None:
        self.model = model
    f_instance = Formulation.query.get(self.f_id)
    f_instance.formulation_data_grid.delete()
    # prepare data lines to plot
    X, y, data_traces = self.get_formulation_training_data()
    # train model to fit data lines
    scaler = StandardScaler().fit(X)
    # prepare mesh grid to plot
    max_t, max_f = np.amax(X, axis=0)
    min_t, min_f = np.amin(X, axis=0)
    xv, yv = np.meshgrid(np.arange(floor(min_t), ceil(max_t)),
                         np.arange(floor(min_f), ceil(max_f)),
                         indexing='ij')
    xv = xv.reshape((xv.shape[0], xv.shape[1], -1))
    yv = yv.reshape((yv.shape[0], yv.shape[1], -1))
    grid_xys = np.concatenate((xv, yv), axis=2).reshape((-1, 2))
    # predict z for grid
    grid_zs = self.model.predict(scaler.transform(grid_xys)).reshape((-1))
    for x, y, z in zip(grid_xys[:, 0], grid_xys[:, 1], grid_zs):
        f_instance.formulation_data_grid.append(
            FormulationDataGrid(x_value=x, y_value=y, z_value=z))
    db.session.commit()
def scale_features(features, train):
    """Scale features, using the training set to learn scaling parameters.

    Returns:
        Scaled copy of features.
    """
    if FLAGS.scaling is None:
        return features
    logging.info('Scaling features with %s', FLAGS.scaling)
    if FLAGS.scaling == 'max_abs':
        scaler = preprocessing.MaxAbsScaler()
    elif FLAGS.scaling == 'standard':
        scaler = preprocessing.StandardScaler()
    else:
        raise ValueError('Unrecognized scaling %s' % FLAGS.scaling)
    scaler.fit(features[train])
    return scaler.transform(features)
def test_multiprocessing():
    generator = check_random_state(0)
    data = genData(n_samples=200, n_features=4, n_redundant=2, strRel=2,
                   n_repeated=0, class_sep=1, flip_y=0, random_state=generator)
    X_orig, y = data
    X_orig = StandardScaler().fit(X_orig).transform(X_orig)

    X = np.c_[X_orig, generator.normal(size=(len(X_orig), 6))]
    y = list(y)   # regression test: list should be supported

    # Test using the score function
    fri = EnsembleFRI(FRIClassification(random_state=generator),
                      n_bootstraps=5, n_jobs=2, random_state=generator)
    fri.fit(X, y)

    # non-regression test for missing worst feature:
    assert len(fri.allrel_prediction_) == X.shape[1]
    assert len(fri.interval_) == X.shape[1]

    # All strongly relevant features have a lower bound > 0
    assert np.all(fri.interval_[0:2, 0] > 0)
    # All weakly relevant features should have a lower bound 0
    assert np.any(fri.interval_[2:4, 0] > 0) == False
def __init__(self, X=None, y=None, ax=None, scale=True, color=None, proj_dim=2,
             colormap=palettes.DEFAULT_SEQUENCE, **kwargs):
    super(PCADecomposition, self).__init__(ax=ax, **kwargs)

    # Data Parameters
    if proj_dim not in (2, 3):
        raise YellowbrickValueError("proj_dim object is not 2 or 3.")

    self.color = color
    self.pca_features_ = None
    self.scale = scale
    self.proj_dim = proj_dim
    self.pca_transformer = Pipeline([
        ('scale', StandardScaler(with_std=self.scale)),
        ('pca', PCA(self.proj_dim)),
    ])

    # Visual Parameters
    self.colormap = colormap
def get_standardized_wine_data():
    df = pd.read_csv(os.path.join('datasets', 'wine.data'), header=None)
    df.columns = [
        'Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
        'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
        'Proanthocyanins', 'Color intensity', 'Hue',
        'OD280/OD315 of diluted wines', 'Proline',
    ]
    X = df.iloc[:, 1:].values
    y = df.iloc[:, 0].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0,
    )

    sc = StandardScaler()
    X_train_std = sc.fit_transform(X_train)
    X_test_std = sc.transform(X_test)

    return X_train_std, X_test_std, y_train, y_test
def load_data_train(trainfile):
    print("Getting the training data")
    a = htk.open(trainfile)
    train_data = a.getall()
    print("Done with Loading the training data: ", train_data.shape)
    data = filter_data_train(train_data)
    # x_train = cnn_reshaper(data[:, :-2])  # Set to different column based on different model
    x_train = data[:, :-2]                  # Set to different column based on different model
    scaler = StandardScaler().fit(x_train)
    # x_train = scaler.transform(x_train)
    Y_train = data[:, -2]
    print(Y_train.shape)
    # print(np.where(Y_train == 2))
    Y_train = Y_train.reshape(Y_train.shape[0], 1)
    y_train = np_utils.to_categorical(Y_train, 2)
    print(y_train[0:5, :])
    gender_train = data[:, -1]
    del data
    return x_train, y_train, gender_train, scaler
def load_data_train(trainfile):
    print("Getting the training data")
    a = htk.open(trainfile)
    train_data = a.getall()
    print("Done with Loading the training data: ", train_data.shape)
    data = filter_data_train(train_data)
    # x_train = cnn_reshaper(data[:, :-2])  # Set to different column based on different model
    x_train = data[:, :-2]                  # Set to different column based on different model
    scaler = StandardScaler().fit(x_train)
    # x_train = scaler.transform(x_train)
    Y_train = data[:, -2]
    print(Y_train.shape)
    # print(np.where(Y_train == 2))
    Y_train = Y_train.reshape(Y_train.shape[0], 1)
    y_train = np_utils.to_categorical(Y_train, 2)
    print(y_train[0:5, :])
    gender_train = data[:, -1]
    del data
    # x_train has complete data, that is gammatone and also the pitch variance values.
    return x_train, y_train, gender_train, scaler
def load_data_train(trainfile):
    print("Getting the training data")
    a = htk.open(trainfile)
    train_data = a.getall()
    print("Done with Loading the training data: ", train_data.shape)
    data = filter_data_train(train_data)
    x_train = data[:, :-2]
    scaler = StandardScaler().fit(x_train)
    # x_train = scaler.transform(x_train)
    x_train = cnn_reshaper(data[:, :-2])  # Set to different column based on different model
    Y_train = data[:, -2]
    print(Y_train.shape)
    # print(np.where(Y_train == 2))
    Y_train = Y_train.reshape(Y_train.shape[0], 1)
    y_train = np_utils.to_categorical(Y_train, 2)
    gender_train = data[:, -1]
    del data
    return x_train, y_train, gender_train, scaler
def test_cross_val_predict():
    # Make sure it works in cross_val_predict for multiclass.
    X, y = load_iris(return_X_y=True)
    y = LabelBinarizer().fit_transform(y)
    X = StandardScaler().fit_transform(X)

    mlp = MLPClassifier(n_epochs=10,
                        solver_kwargs={'learning_rate': 0.05},
                        random_state=4567).fit(X, y)

    cv = KFold(n_splits=4, random_state=457, shuffle=True)
    y_oos = cross_val_predict(mlp, X, y, cv=cv, method='predict_proba')
    auc = roc_auc_score(y, y_oos, average=None)

    assert np.all(auc >= 0.96)
def create_model(self, training_articles):
    model = OneVsRestClassifier(svm.SVC(probability=True))
    features = []
    labels = []
    i = 0
    for article in training_articles:
        print("Generating features for article " + str(i) + "...")
        google_cloud_response = self.analyze_text_google_cloud(article["article"])
        relevant_entities = self.get_relevant_entities(google_cloud_response["entities"],
                                                       article["market"]["entities"],
                                                       article["market"]["wikipedia_urls"])
        # Only count this article if a relevant entity is present
        if relevant_entities:
            article_features = self.article_features(relevant_entities,
                                                     article["market"],
                                                     google_cloud_response,
                                                     article["article"])
            features.append(article_features)
            labels.append(article["label"])
        else:
            print("Skipping article " + str(i) + "...")
        i = i + 1

    print("Performing feature scaling...")
    scaler = preprocessing.StandardScaler().fit(features)
    features_scaled = scaler.transform(features)

    print("Fitting model...")
    model.fit(features_scaled, labels)

    print("Saving model...")
    joblib.dump(scaler, "data_analysis/scaler.pkl")
    joblib.dump(model, "data_analysis/model.pkl")

    print("Done!")

# For use in prod
def __load_chn_data(self, selectChan, file_name):
    spk_startswith = "spike_{0}".format(selectChan)
    with hp.File(file_name, "r") as f:
        times = list()
        waveforms = list()
        units = list()
        for chn_unit in f["spikes"].keys():
            if chn_unit.startswith(spk_startswith):
                tep_time = f["spikes"][chn_unit]["times"].value
                waveform = f["spikes"][chn_unit]["waveforms"].value
                unit = int(chn_unit.split("_")[-1])
                unit = np.ones(tep_time.shape, dtype=np.int32) * unit
                times.append(tep_time)
                waveforms.append(waveform)
                units.append(unit)
        if times:
            times = np.hstack(times)
            units = np.hstack(units)
            waveforms = np.vstack(waveforms)
            sort_index = np.argsort(times)
            units = units[sort_index]
            waveforms = waveforms[sort_index]
            times = times[sort_index]
            # calculate waveform_range
            waveforms_max = np.apply_along_axis(max, 1, waveforms)
            waveforms_min = np.apply_along_axis(min, 1, waveforms)
            waveforms_range = np.vstack([waveforms_min, waveforms_max]).T
            # calculate PCA of waveforms
            scaler = StandardScaler()
            scaler.fit(waveforms)
            waveforms_scaled = scaler.transform(waveforms)
            pca = PCA(n_components=self.pca_used_num)
            pca.fit(waveforms_scaled)
            wavePCAs = pca.transform(waveforms_scaled)
            return times, units, waveforms_range, wavePCAs
        else:
            return None, None, None, None
def pre_processData(train_data, file_path):
    train_data.loc[(train_data.Age.isnull()), 'Age'] = np.mean(train_data.Age)   # fill missing Age values with the mean age
    train_data.loc[(train_data.Cabin.notnull(), 'Cabin')] = 'yes'                # mark non-null Cabin entries as 'yes'
    train_data.loc[(train_data.Cabin.isnull(), 'Cabin')] = 'no'                  # mark missing Cabin entries as 'no'

    '''0/1 one-hot encoding'''
    dummies_cabin = pd.get_dummies(train_data['Cabin'], prefix='Cabin')          # get_dummies creates 0/1 indicator columns; prefix names them Cabin_*
    dummies_Embarked = pd.get_dummies(train_data['Embarked'], prefix='Embarked')
    dummies_Sex = pd.get_dummies(train_data['Sex'], prefix='Sex')
    dummies_Pclass = pd.get_dummies(train_data['Pclass'], prefix='Pclass')

    train_data = pd.concat([train_data, dummies_cabin, dummies_Embarked, dummies_Pclass, dummies_Sex], axis=1)   # concatenate the dataframes column-wise
    train_data.drop(['Pclass', 'Name', 'Sex', 'Embarked', 'Cabin', 'Ticket'], axis=1, inplace=True)              # drop the original categorical columns
    header_string = ','.join(train_data.columns.tolist())                        # join the column names into a header string
    np.savetxt(file_path + r'/pre_processData1.csv', train_data, delimiter=',', header=header_string)            # save the intermediate result

    '''standardize the numeric features (Age and Fare)'''
    scaler = StandardScaler()
    age_scaler = scaler.fit(train_data[['Age']])
    train_data['Age'] = age_scaler.transform(train_data[['Age']])
    if np.sum(train_data.Fare.isnull()):                                         # fill missing Fare values with the mean fare
        train_data.loc[(train_data.Fare.isnull(), 'Fare')] = np.mean(train_data.Fare)
    fare_scaler = scaler.fit(train_data[['Fare']])
    train_data['Fare'] = fare_scaler.transform(train_data[['Fare']])

    header_string = ','.join(train_data.columns.tolist())
    np.savetxt(file_path + r'/pre_processData_scaled.csv', train_data, delimiter=',', header=header_string)      # save the scaled result

    return train_data

## feature engineering
def feature_scaling(self, df):
    df = df.copy()
    # Standardization (centering and scaling) of dataset that removes mean and scales to unit variance
    standard_scaler = StandardScaler()
    numerical_feature_names_of_non_modified_df = TwoSigmaFinModTools._numerical_feature_names

    if any(tuple(df.columns == 'y')):
        if not TwoSigmaFinModTools._is_one_hot_encoder:
            numerical_feature_names_of_non_modified_df = np.concatenate(
                [TwoSigmaFinModTools._feature_names_num.values,
                 numerical_feature_names_of_non_modified_df.values])
        # Include scaling of y
        y = df['y'].values
        relevant_features = df[numerical_feature_names_of_non_modified_df].columns[
            (df[numerical_feature_names_of_non_modified_df].columns != 'y')
            & (df[numerical_feature_names_of_non_modified_df].columns != 'id')]
        mask = ~df[relevant_features].isnull()
        res = standard_scaler.fit_transform(X=df[relevant_features][mask].values, y=y)
        if (~mask).sum().sum() > 0:
            df = self.standardize_relevant_features(df, relevant_features, res)
        else:
            df.loc[:, tuple(relevant_features)] = res
    else:
        if not TwoSigmaFinModTools._is_one_hot_encoder:
            numerical_feature_names_of_non_modified_df = np.concatenate(
                [TwoSigmaFinModTools._feature_names_num.values,
                 numerical_feature_names_of_non_modified_df.values])
        relevant_features = df[numerical_feature_names_of_non_modified_df].columns[
            (df[numerical_feature_names_of_non_modified_df].columns != 'id')]
        mask = ~df[relevant_features].isnull()
        res = standard_scaler.fit_transform(df[relevant_features][mask].values)
        if mask.sum().sum() > 0:
            df = self.standardize_relevant_features(df, relevant_features, res)
        else:
            df.loc[:, tuple(relevant_features)] = res
    return df
def make_standard(X_train, X_test):
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    pickle.dump(scaler, open("scaler_model.sav", 'wb'))
    return X_train, X_test
def v_demo(dir, prefix, pre_prefix, file_name, _dir):
    _val = []
    _coords = []
    file_dir_fix = dir + "\\output_INFLO.csv"
    #f = "C:\Users\Abdullah Akmal\Documents\ifruitfly_temp\output_files\output_INFLO.csv"
    with open(file_dir_fix, 'rU') as inp:
        rd = csv.reader(inp)
        for row in rd:
            _val.append([row[1], row[2], row[0]])
            #print(_center)

    _val = np.asarray(_val)
    _val_original = _val
    _val_original = list(map(myFloat, _val_original))
    _val_original = list(map(myInt, _val_original))
    #_val_original = map(myTemp, _val_original)
    _val_original = np.asarray(_val_original)
    _val = preprocessing.StandardScaler().fit_transform(_val)
    #_center = preprocessing.MinMaxScaler()
    #_center.fit_transform(_val)
    #_arr = StandardScaler().inverse_transform(_center)
    #print(_arr)
    #print(_center)
    new_file = prefix + file_name + ".png"
    dbFun(_val, _val_original, new_file)
    #_len = len(_center)
    return
def v_demo(dir, prefix, pre_prefix, file_name, _dir):
    _val = []
    _coords = []
    file_dir_fix = dir + "\\output_INFLO.csv"
    #f = "C:\Users\Abdullah
    #Akmal\Documents\ifruitfly_temp\output_files\output_INFLO.csv"
    with open(file_dir_fix, 'rU') as inp:
        rd = csv.reader(inp)
        for row in rd:
            _val.append([row[1], row[2], row[0]])
            #print(_center)

    _val = np.asarray(_val)
    _val_original = _val
    _val_original = list(map(myFloat, _val_original))
    _val_original = list(map(myInt, _val_original))
    #_val_original = map(myTemp, _val_original)
    _val_original = np.asarray(_val_original)
    _val = preprocessing.StandardScaler().fit_transform(_val)
    #_center = preprocessing.MinMaxScaler()
    #_center.fit_transform(_val)
    #_arr = StandardScaler().inverse_transform(_center)
    #print(_arr)
    #print(_center)
    new_file = prefix + file_name + ".png"
    dbFun(_val, _val_original, new_file)
    #_len = len(_center)
    return

##############################################################################################
# Getting the clusters and printing in the most trivial way as asked by Dr Sheikh Faisal
def supervised_reduction(method=None, dataset=None):
    np.random.seed(1)
    sklearn.utils.check_random_state(1)
    train_data, train_labels, test_data, test_labels = dataset_loader(dataset, seed=1)

    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)

    if dataset == 'yale':
        regularizer_weight = 0.0001
    else:
        regularizer_weight = 1

    n_classes = len(np.unique(train_labels))

    if method == 'lda':
        proj = LinearDiscriminantAnalysis(n_components=n_classes - 1)
        proj.fit(train_data, train_labels)
    elif method == 's-lda':
        proj = LinearSEF(train_data.shape[1], output_dimensionality=(n_classes - 1))
        proj.cuda()
        loss = proj.fit(data=train_data, target_labels=train_labels, epochs=100,
                        target='supervised', batch_size=256,
                        regularizer_weight=regularizer_weight,
                        learning_rate=0.001, verbose=False)
    elif method == 's-lda-2x':
        # SEF output dimensions are not limited
        proj = LinearSEF(train_data.shape[1], output_dimensionality=2 * (n_classes - 1))
        proj.cuda()
        loss = proj.fit(data=train_data, target_labels=train_labels, epochs=100,
                        target='supervised', batch_size=256,
                        regularizer_weight=regularizer_weight,
                        learning_rate=0.001, verbose=False)

    acc = evaluate_svm(proj.transform(train_data), train_labels,
                       proj.transform(test_data), test_labels)

    print("Method: ", method, " Test accuracy: ", 100 * acc, " %")
def outofsample_extensions(method=None, dataset=None):
    np.random.seed(1)
    sklearn.utils.check_random_state(1)
    train_data, train_labels, test_data, test_labels = dataset_loader(dataset, seed=1)

    # Learn a new space using Isomap
    isomap = Isomap(n_components=10, n_neighbors=20)
    train_data_isomap = np.float32(isomap.fit_transform(train_data))

    if method == 'linear-regression':
        from sklearn.preprocessing import StandardScaler
        std = StandardScaler()
        train_data = std.fit_transform(train_data)
        test_data = std.transform(test_data)

        # Use linear regression to provide baseline out-of-sample extensions
        proj = LinearRegression()
        proj.fit(np.float64(train_data), np.float64(train_data_isomap))
        acc = evaluate_svm(proj.predict(train_data), train_labels,
                           proj.predict(test_data), test_labels)
    elif method == 'c-ISOMAP-10d' or method == 'c-ISOMAP-20d':
        # Use the SEF to provide out-of-sample extensions
        if method == 'c-ISOMAP-10d':
            proj = LinearSEF(train_data.shape[1], output_dimensionality=10)
            proj.cuda()
        else:
            proj = LinearSEF(train_data.shape[1], output_dimensionality=20)
            proj.cuda()
        loss = proj.fit(data=train_data, target_data=train_data_isomap, target='copy',
                        epochs=50, batch_size=1024, verbose=False,
                        learning_rate=0.001, regularizer_weight=1)
        acc = evaluate_svm(proj.transform(train_data), train_labels,
                           proj.transform(test_data), test_labels)

    print("Method: ", method, " Test accuracy: ", 100 * acc, " %")
def __init__(self, input_dimensionality, output_dimensionality, scaler='default'):
    """
    SEF_Base constructor
    :param input_dimensionality: dimensionality of the input space
    :param output_dimensionality: dimensionality of the target space
    :param scaler: the scaler used to scale the data
    """
    self.input_dimensionality = input_dimensionality
    self.output_dimensionality = output_dimensionality

    if scaler == 'default':
        self.scaler = StandardScaler()
    elif scaler is not None:
        self.scaler = scaler()
    else:
        self.scaler = None

    # Scaling factor for computing the similarity matrix of the projected data
    self.sigma_projection = np.float32(0.1)
    self.use_gpu = False

    # The parameters of the model that we want to learn
    self.trainable_params = []
    # Other non-trainable parameters
    self.non_trainable_params = []
def add_params(cs: ConfigurationSpace):
    '''
    adds parameters to ConfigurationSpace
    '''
    switch = CategoricalHyperparameter(
        "StandardScaler", choices=[True, False], default=True)
    cs.add_hyperparameter(switch)