We extracted the following 50 code examples from open-source Python projects to illustrate how to use sklearn.preprocessing.scale().
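Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of what preprocessing.scale() does by default: it standardizes each column of a feature matrix to zero mean and unit variance.

import numpy as np
from sklearn import preprocessing

# Toy feature matrix: two columns on very different ranges.
X = np.array([[1.0, 1000.0],
              [2.0, 2000.0],
              [3.0, 3000.0]])

X_scaled = preprocessing.scale(X)   # column-wise standardization
print(X_scaled.mean(axis=0))        # approximately [0. 0.]
print(X_scaled.std(axis=0))         # [1. 1.]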
def scale_numeric_data(pandas_data):
    # Scaling is important because if the variables are too different from
    # one another, it can throw off the model.
    # EX: If one variable has an average of 1000, and another has an average
    # of .5, then the model won't be as accurate.
    for col in pandas_data.columns:
        if pandas_data[col].dtype == np.float64 or pandas_data[col].dtype == np.int64:
            pandas_data[col] = preprocessing.scale(pandas_data[col])

    return pandas_data


# Creates a standard scaler based on the training data and applies it to both train
# and test data.
# Input:
#   - Two Pandas DataFrames, same number of columns
# Output:
#   - Two Pandas DataFrames, both of which have been scaled based on StandardScaler
#     trained on training data.
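The trailing comment above describes a companion function whose body is not included in this excerpt. Purely to illustrate what that comment describes, here is a hedged sketch; the name scale_train_test and its exact signature are assumptions, not the project's code:

from sklearn.preprocessing import StandardScaler
import pandas as pd

def scale_train_test(train_df, test_df):
    # Illustrative sketch only: fit the scaler on the training data, then
    # apply the same transformation to both train and test frames.
    scaler = StandardScaler()
    train_scaled = pd.DataFrame(scaler.fit_transform(train_df),
                                columns=train_df.columns, index=train_df.index)
    test_scaled = pd.DataFrame(scaler.transform(test_df),
                               columns=test_df.columns, index=test_df.index)
    return train_scaled, test_scaled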
def transform(self, X, STANDARDIZE=True):
    if not isinstance(X, np.ndarray):
        X = to_array(X)
    assert(X.ndim == 2), "Input array must have two dimensions."
    if not check_standardized(X):
        if STANDARDIZE:
            X = preprocessing.scale(X)
            print("Standardize input data for transform")
    if not self.model:
        print("Load or fit a model before performing transformation.")
    else:
        assert(X.shape[1] > self.model.n_components),\
            "Input data must have a dimension larger than model components %d."\
            % self.model.n_components
        xp = self.model.transform(X)
        return xp
def train(self, df, shuffle=True, preprocess=False, *args, **kwargs):
    """ Takes a dataframe of features + a 'label' column and trains the lobe """
    if self._trained:
        logger.warning('Overwriting an already trained brain!')
        self._trained = False
    # shuffle data for good luck
    if shuffle:
        df = shuffleDataFrame(df)
    # scale train data and fit lobe
    x = df.drop('label', axis=1).values
    y = df['label'].values
    del df
    if preprocess:
        x = preprocessing.scale(x)
    logger.info('Training with %d samples', len(x))
    self.lobe.fit(x, y)
    self._trained = True
def get_sample(self, N=600, scale=False):
    all_data = self.pre_process(self.file_name)
    # print('data_type: ' + str(all_data.dtypes))
    all_data = all_data.values
    xs = all_data[:, 2:]
    y = all_data[:, 1]
    if scale:
        xs = preprocessing.scale(xs)
    if N != -1:
        perm = np.random.permutation(xs.shape[0])
        xs = xs[perm]
        y = y[perm]
        xs_train, xs_test = np.split(xs, [N])
        y_train, y_test = np.split(y, [N])
        return xs_train, xs_test, y_train, y_test
    else:
        return xs, y
def get_X_y(self):
    """Builds an X, y feature/target pair from the data.

    :returns: a tuple of (feature matrix, labels)
    """
    # X
    X = np.array(self.data[self.features])
    X = scale(X)

    # y
    stock_change = np.array(self.data["stock_p_change"])
    sp500_change = np.array(self.data["sp500_p_change"])
    is_above_threshold = stock_change - sp500_change > self.threshold
    y = is_above_threshold.astype('i')

    return (X, y)
def scale_sets(x_train, x_test, classifier_name):
    """
    :param x_train: ndarray, required - The train data of the feature matrix
    :param x_test: ndarray, required - The test data of the feature matrix
    :param classifier_name: string, optional - The name of the selected classifier
    :return: ndarray
    """
    # scaling leads to poorer performance in the case of random forests, xgb, etc.
    if classifier_name not in ["random_forests", "XGB", "GBC"]:
        # x_train, x_test are expected to be numpy arrays.
        # Simple conditions such as "if x_train" will raise a ValueError.
        x_train = scale(x_train) if x_train is not None else x_train
        x_test = scale(x_test) if x_test is not None else x_test
    return x_train, x_test
def classify(self, M):
    """
    Classify a hyperspectral cube using the clusters defined from the ROIs
    by the fit method.

    Parameters:
        M: `numpy array`
            A HSI cube (m x n x p).

    Returns: `numpy array`
        A class map (m x n x 1).
    """
    img = self._convert2D(M)
    image_scaled = preprocessing.scale(img)
    cls = self.clf.predict(image_scaled)
    self.cmap = self._convert3d(cls, M.shape[0], M.shape[1])
    return self.cmap
def load_data(fname='transit_data.pkl', categorical=False, whiten=True, DIR='pickle_data/'):
    data = pickle.load(open(DIR + fname, 'rb'))

    # convert to numpy array of float type from object type
    pvals = arr(data['results'][:, 0])
    transits = arr(data['results'][:, 1])
    null = arr(data['results'][:, 2])

    X = np.vstack([transits, null])
    y = np.hstack([np.ones(transits.shape[0]), np.zeros(null.shape[0])])

    if categorical:
        y = np_utils.to_categorical(y, np.unique(y).shape[0])

    if whiten:
        X = preprocessing.scale(X, axis=1)

    return X, y, pvals, data['keys'], data['time']
def load_data(fname='transit_data_train.pkl', categorical=False, whiten=True, DIR='pickle_data/'):
    data = pickle.load(open(DIR + fname, 'rb'))

    # convert to numpy array of float type from object type
    pvals = arr(data['results'][:, 0])
    transits = arr(data['results'][:, 1])
    null = arr(data['results'][:, 2])

    X = np.vstack([transits, null])
    y = np.hstack([np.ones(transits.shape[0]), np.zeros(null.shape[0])])

    if categorical:
        y = np_utils.to_categorical(y, np.unique(y).shape[0])

    if whiten:
        X = preprocessing.scale(X, axis=1)

    return X, y, pvals, data['keys'], data['time']
def make_drop_duplicate(self, _df_csv_read_ori, _drop_duplicate, _label):
    """ Drop rows that are duplicated on every column except the label column.
    Args:
        params:
            * _preprocessing_type: ['scale', 'minmax_scale', 'robust_scale', 'normalize', 'maxabs_scale']
            * _df_csv_read_ori : pandas dataframe
            * _label
    Returns:
        Preprocessing Dataframe
    """
    if _drop_duplicate == None or _drop_duplicate == 'null' or _drop_duplicate == False:
        logging.info("No Duplicate")
        result_df = _df_csv_read_ori
    else:
        cell_features = _df_csv_read_ori.columns.tolist()
        cell_features.remove(_label)
        result_df = _df_csv_read_ori.drop_duplicates(cell_features, keep="first")
        logging.info("duplicated row delete {0}".format(len(_df_csv_read_ori.index) - len(result_df.index)))
        temp_duplicate_filename = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + "_dup.csvbk"
        result_df.to_csv(self.data_src_path + "/backup/" + temp_duplicate_filename)

    return result_df
def compute_preprocessor(self, method):
    self.data = {}
    if method == 'none':
        self.data = self.orig_data
    elif method == 'min_max':
        transform = preprocessing.MinMaxScaler()
        self.data['X_train'] = transform.fit_transform(self.orig_data['X_train'])
        self.data['X_val'] = transform.transform(self.orig_data['X_val'])
        self.data['X_test'] = transform.transform(self.orig_data['X_test'])
    elif method == 'scaled':
        self.data['X_train'] = preprocessing.scale(self.orig_data['X_train'])
        self.data['X_val'] = preprocessing.scale(self.orig_data['X_val'])
        self.data['X_test'] = preprocessing.scale(self.orig_data['X_test'])
    elif method == 'normalized':
        self.data['X_train'] = preprocessing.normalize(self.orig_data['X_train'])
        self.data['X_val'] = preprocessing.normalize(self.orig_data['X_val'])
        self.data['X_test'] = preprocessing.normalize(self.orig_data['X_test'])
    self.data['y_train'] = self.orig_data['y_train']
    self.data['y_val'] = self.orig_data['y_val']
    self.data['y_test'] = self.orig_data['y_test']
def compute_preprocessor(self, method):
    self.data = {}
    if method == 'min_max':
        transform = preprocessing.MinMaxScaler()
        self.data['X_train'] = transform.fit_transform(self.orig_data['X_train'])
        self.data['X_val'] = transform.transform(self.orig_data['X_val'])
        self.data['X_test'] = transform.transform(self.orig_data['X_test'])
    elif method == 'scaled':
        self.data['X_train'] = preprocessing.scale(self.orig_data['X_train'])
        self.data['X_val'] = preprocessing.scale(self.orig_data['X_val'])
        self.data['X_test'] = preprocessing.scale(self.orig_data['X_test'])
    elif method == 'normalized':
        self.data['X_train'] = preprocessing.normalize(self.orig_data['X_train'])
        self.data['X_val'] = preprocessing.normalize(self.orig_data['X_val'])
        self.data['X_test'] = preprocessing.normalize(self.orig_data['X_test'])
    self.data['y_train'] = self.orig_data['y_train']
    self.data['y_val'] = self.orig_data['y_val']
    self.data['y_test'] = self.orig_data['y_test']
def FeatureCombination(Df, s='', num_feature=2):
    feature_set = []
    for c in Df.columns:
        if c.startswith(s):
            feature_set.append(c)
    print('combining', len(feature_set), 'features')
    data = Df[feature_set].values

    for c in Df.columns:
        if Df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(Df[c].values))
            Df[c] = lbl.transform(list(Df[c].values))

    imp = preprocessing.Imputer()
    data = imp.fit_transform(data)
    data = preprocessing.scale(data)
    pca = PCA(num_feature)
    pca.fit(data)
    print('explained_variance_ratio_:', pca.explained_variance_ratio_)
    trans = pca.transform(data)
    for i in range(0, num_feature):
        Df[s + '_%d' % (i + 1)] = trans[:, i]
    Df.drop(feature_set, 1, inplace=True)
    return Df
def get_ind_return(data):
    '''
    Compute value-weighted monthly industry returns from the stock-level data
    read from the xlsx file, then standardize each industry column.
    :param [DataFrame] data: stock-level monthly data read from the xlsx file
    :return: [DataFrame] ind_ret: month x industry table of standardized industry returns
    '''
    # read stk_ind_pair.xlsx, which maps each stock code to its industry
    stk_ind = pd.read_excel('E:\\QuantProject2\\temp_data\\stk_ind_pair.xlsx')
    # keep only the first six digits of the stock code in stk_ind
    stk_ind.Stkcd = stk_ind.Stkcd.apply(lambda x: x[:6])
    # merge stk_ind into data so every row carries its industry label
    data = pd.merge(data, stk_ind, on='Stkcd')
    # group by trading month and industry
    groups = data.groupby(['Trdmnt', 'ind'])
    # total market value per month and industry
    total_Ms = groups['Msmvttl'].sum()
    # total market-value-weighted return per month and industry
    total_Mr = groups['total_Mr'].sum()
    # value-weighted industry return
    ind_ret = total_Mr / total_Ms
    # unstack ind_ret so industries become columns
    ind_ret = ind_ret.unstack()
    # standardize ind_ret
    ind_ret = pd.DataFrame(scale(ind_ret), columns=ind_ret.columns)
    return ind_ret
def create_model(context, data):
    # Get the relevant daily prices
    recent_prices = data.history(context.assets, 'price', context.history_range, '1d')
    context.ma_50 = recent_prices.values[-50:].mean()
    context.ma_200 = recent_prices.values[-200:].mean()
    # print context.ma_50
    # print context.ma_200

    time_lags = pd.DataFrame(index=recent_prices.index)
    time_lags['price'] = recent_prices.values
    time_lags['returns'] = (time_lags['price'].pct_change()).fillna(0.0001)
    time_lags['lag1'] = (time_lags['returns'].shift(1)).fillna(0.0001)
    time_lags['lag2'] = (time_lags['returns'].shift(2)).fillna(0.0001)
    time_lags['direction'] = np.sign(time_lags['returns'])

    X = time_lags[['returns', 'lag2']]  # Independent, or input variables
    Y = time_lags['direction']          # Dependent, or output variable
    X_scaled = preprocessing.scale(X)
    context.model.fit(X_scaled, Y)      # Generate our model
def __init__(self, data_set_parameters):
    OutputLog().write('Loading dataset: ' + data_set_parameters['name'])

    self.dataset_path = data_set_parameters['path']
    self.trainset = None
    self.testset = None
    self.tuning = None
    self.reduce_val = 0
    self.x_y_mapping = {'train': None, 'dev': None, 'test': None}
    self.x_reduce = {'train': None, 'dev': None, 'test': None}

    self.data_set_parameters = data_set_parameters
    self.scale = bool(int(data_set_parameters['scale']))
    self.scale_rows = bool(int(data_set_parameters['scale_samples']))
    self.whiten = bool(int(data_set_parameters['whiten']))
    self.pca = map(int, data_set_parameters['pca'].split())
    self.normalize_data = bool(int(data_set_parameters['normalize']))
    self.preprocessors = None
def rerun_task(job_id, task_id):
    """ Reruns a specific task from a job.

    Sets the task status to 'pending' and triggers an asynchronous function to process the task.

    Parameters
    ----------
    job_id: str
    task_id: int

    Returns
    -------
    None
    """
    job = mongo_no_context_get_job(job_id)
    task = mongo_no_context_get_task(job_id, task_id)
    k = task['k']
    covar_type = task['covar_type']
    covar_tied = task['covar_tied']
    n_init = task['n_init']
    s3_file_key = job['s3_file_key']
    columns = job['columns']
    scale = job.get('scale', False)
    response = mongo_no_context_update_task_status(job_id, task_id, 'pending')
    work_task.delay(job_id, task_id, k, covar_type, covar_tied, n_init, s3_file_key, columns, scale)
def train():
    os.chdir(dname)
    for selected_stock in onlyfiles:
        df = pd.read_csv(os.path.join('data_files', selected_stock))
        # preprocessing the data
        df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]
        # measure of volatility
        df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Low'] * 100.0
        df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0
        df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]
        forecast_col = 'Adj. Close'
        df.fillna(value=-99999, inplace=True)
        forecast_out = int(math.ceil(0.01 * len(df)))
        df['label'] = df[forecast_col].shift(-forecast_out)

        X = np.array(df.drop(['label'], 1))
        X = preprocessing.scale(X)
        X_lately = X[-forecast_out:]
        X = X[:-forecast_out]
        df.dropna(inplace=True)
        y = np.array(df['label'])
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)

        svr = SVR()
        pickle.dump(svr, open(join(dname + '/models/svr_unfit/', selected_stock + 'svr.sav'), 'wb'))
        svr.fit(X_train, y_train)

        lr = LinearRegression()
        pickle.dump(lr, open(join(dname + '/models/lr_unfit/', selected_stock + 'lr.sav'), 'wb'))
        lr.fit(X_train, y_train)

        mlp = MLPRegressor()
        pickle.dump(mlp, open(join(dname + '/models/mlp_unfit/', selected_stock + 'mlp.sav'), 'wb'))
        mlp.fit(X_train, y_train)

        pickle.dump(svr, open(join(dname + '/models/svr_fit/', selected_stock + 'svr.sav'), 'wb'))
        pickle.dump(lr, open(join(dname + '/models/lr_fit/', selected_stock + 'lr.sav'), 'wb'))
        pickle.dump(mlp, open(join(dname + '/models/mlp_fit/', selected_stock + 'mlp.sav'), 'wb'))

        print(selected_stock + " - trained")
def lession_7():
    X = np.array([[10, 12, 2],
                  [-1, -9, 99],
                  [22, 33, 11]])
    print(X)
    print(preprocessing.scale(X))
    #X,y=make
def normalise(csv_filepath):
    """
    load csv data and normalize it
    :param csv_filepath:
    :return:
    """
    df = pd.read_csv(csv_filepath)[[
        'companyScore', 'describeScore', 'comprehensiveScore',
        'interviewerScore', 'usefulCount', 'myScore', 'replyCount',
        'isAnonymous']][1:]
    senti_df = pd.read_csv(csv_filepath)['sentiment'][1:]
    labels_ = [1 if _ > 0.9 else 0 for _ in senti_df]
    df['isAnonymous'] = [int(_) for _ in df['isAnonymous']]
    df_scaled = pd.DataFrame(preprocessing.scale(df))
    return df_scaled, labels_
def get_name(self):
    return 'unit-scale'
def apply(self, data):
    return preprocessing.scale(data, axis=data.ndim - 1)
def get_name(self):
    return 'unit-scale-feat'
def apply(self, data):
    return preprocessing.scale(data, axis=1)
def make_feat(self):
    d = self.init_data()
    for i in range(1, 100):
        d['dif{}'.format(i)] = d.diff(i)
    for i in range(0, 9):
        d['hc{}'.format(i)] = d.HIGH.shift(i) - d.CLOSE.shift(i)
        d['lc{}'.format(i)] = d.LOW.shift(i) - d.CLOSE.shift(i)
        d['hl{}'.format(i)] = d.HIGH.shift(i) - d.LOW.shift(i)
        d['oc{}'.format(i)] = d.OPEN.shift(i) - d.CLOSE.shift(i)
        d['oh{}'.format(i)] = d.OPEN.shift(i) - d.HIGH.shift(i)
        d['ol{}'.format(i)] = d.OPEN.shift(i) - d.LOW.shift(i)
    d = d.fillna(0)
    # preprocessing.scale returns an ndarray, so wrap it back into a DataFrame
    # before writing it out with to_csv.
    d = pd.DataFrame(preprocessing.scale(d), index=d.index, columns=d.columns)
    filename = join(self.out_poath, 'f_{0}.csv'.format(self.struc))
    d.to_csv(path_or_buf=filename)
def train(self, data, labels):
    """
    Trains current classifier with matrix data and labels, where
    labels[i] describes data[:, i].

    :param data: Matrix of data, where each column is a separate sample.
    :param labels: List of labels, each corresponding to a column of data.
    """
    if self.use_pca:
        u, s, _ = scipy.sparse.linalg.svds(data)
        self.svc.fit(
            preprocessing.scale(u[:, :self.rank + 1].T.dot(data).T),
            labels)
    else:
        self.svc.fit(preprocessing.scale(data.T), labels)
def classify(self, data):
    """
    Classifies data based on current model.

    :param data: Matrix with each column a different sample.
    :returns: List of predictions, where return[i] describes data[:, i].
    """
    if self.use_pca:
        u, s, _ = scipy.sparse.linalg.svds(data)
        return self.svc.predict(
            preprocessing.scale(u[:, :self.rank + 1].T.dot(data).T))
    else:
        return self.svc.predict(preprocessing.scale(data.T))
def sk_min_max(X):
    min_max_scaler = MinMaxScaler()
    # X = scale(X, axis=0, with_mean=True, with_std=True, copy=True)
    return min_max_scaler.fit_transform(X)
def min_max(X):
    min_max_scaler = MinMaxScaler()
    X = scale(X, axis=0, with_mean=True, with_std=True, copy=True)
    X = min_max_scaler.fit_transform(X)
    return X
def sk_scale(X):
    return scale(X, axis=0, with_mean=True, with_std=True, copy=True)
def Standardization(self):
    # feature 10: minimum price so far; feature 11: maximum price so far
    # feature 12: current price
    scaled = preprocessing.scale(self.X_train[:, 10:13])
    self.X_train[:, 10:13] = scaled
    scaled = preprocessing.scale(self.X_test[:, 10:13])
    self.X_test[:, 10:13] = scaled
def Standardization(self):
    scaled = preprocessing.scale(self.X_train[:, 10:12])
    self.X_train[:, 10:12] = scaled
    scaled = preprocessing.scale(self.X_test[:, 10:12])
    self.X_test[:, 10:12] = scaled
def prepare(self):
    with open('%s' % self.cfg.pca_pkl, 'rb') as pklfile:
        self.pca = pickle.load(pklfile)
    try:
        self.df = self.df.query('face == 1')
    except:
        print('Face column not found in the dataframe', end=' ')
        print('Treated as not being processed by skin_filter.')
    x = self.df[self.ftcols].as_matrix()
    x = preprocessing.scale(x)
    xp = self.pca.transform(x)
    self.dfp = pd.DataFrame(xp)
    self.dfp[['number', 'time']] = self.df[['number', 'time']]
def fit(self, X, STANDARDIZE=True, n=10):
    if not isinstance(X, np.ndarray):
        X = to_array(X)
    assert(X.ndim == 2), "Input array must have two dimensions."
    if not check_standardized(X):
        if STANDARDIZE:
            X = preprocessing.scale(X)
            print("Standardize input data for fit.")
        else:
            print("WARNING: data is not standardized and you switched off the STANDARDIZE option.", end=' ')
            print("Make sure this is what you intended.")
    self.model = PCA(n_components=n)
    self.model.fit(X)
def svc_rbf_xyat(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"]
        df_new["accuracy"] = df["accuracy"].apply(np.log10)
        return preprocessing.scale(df_new.values)
    logging.info("train svc_rbf_xyat model")
    clf = SVC(kernel='rbf', probability=True, cache_size=3000)
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
def svc_lin_xyat(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"]
        df_new["accuracy"] = df["accuracy"].apply(np.log10)
        return preprocessing.scale(df_new.values)
    logging.info("train svc_lin_xyat model")
    clf = SVC(kernel='linear', probability=True, cache_size=3000)
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
def svc_rbf_xyatu(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"]
        df_new["accuracy"] = df["accuracy"]
        return preprocessing.scale(df_new.values)
    logging.info("train svc_rbf_xyatu model")
    clf = SVC(kernel='rbf', probability=True, cache_size=3000)
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
def svc_lin_xyatu(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"]
        df_new["accuracy"] = df["accuracy"]
        return preprocessing.scale(df_new.values)
    logging.info("train svc_lin_xyatu model")
    clf = SVC(kernel='linear', probability=True, cache_size=3000)
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
def scaleDataset(data):
    ''' Standardize the dataset (zero mean, unit variance per column) '''
    data = scale(data)
    return data
def get_recommendations():
    module_dir = os.path.dirname(__file__)
    train_df = build_training_set()
    if train_df is None:
        return []
    x_train = train_df.iloc[:, 5:]
    try:
        x_train = scale(x_train)
    except:
        print("First migrations")
    y_train = train_df.iloc[:, 3]
    x_train_labels = train_df.iloc[:, 0]

    target_df = pd.read_csv(os.path.join(module_dir, 'data.csv'))
    target_df = pd.DataFrame(target_df)
    target_df = target_df.append(train_df)
    target_df = target_df.append(train_df)
    target_df = target_df.drop_duplicates('SeriesName', keep=False)
    x_target = scale(target_df.iloc[:, 5:])
    x_target_labels = target_df.iloc[:, 0]

    clf = RandomForestClassifier()
    clf.fit(x_train, y_train)
    y_target = clf.predict(x_target)

    new_df = pd.DataFrame()
    new_df['seriesName'] = x_target_labels
    new_df['tvdbID'] = target_df.iloc[:, 1]
    new_df['PredictedRating'] = y_target
    new_df['indicator'] = (target_df.iloc[:, 4] / target_df.iloc[:, 3]) * new_df['PredictedRating']
    new_df = new_df.sort_values(['indicator'], ascending=False)
    initial_list = list(new_df.iloc[:4, 1])
    latter_list = list(new_df.iloc[5:15, 1])
    shuffle(latter_list)
    return list(initial_list + latter_list[:5])
def scale_feature(self, col=None, scaling=None, scaling_parms=None):
    '''
    Scales a given set of numerical columns. This only works for columns
    with numerical values.

    Parameters
    ----------
    col : a string of a column name, or a list of many columns names or
            None (default). If col is None, all numerical columns will
            be used.
    scaling : {'zscore', 'minmax_scale' (default), 'scale', 'maxabs_scale',
            'robust_scale'}
            User-defined scaling functions can also be used through
            self.transform_feature
    scaling_parms : dictionary
            any additional parameters to be used for sklearn's scaling
            functions.
    '''
    self._validate_params(params_list={'col': col, 'scaling': scaling},
                          expected_types={'col': [str, list, type(None)], 'scaling': [str, type(None)]})

    if scaling is None:
        scaling = 'minmax_scale'

    if scaling == 'zscore':
        scaling = 'lambda x: (x - x.mean()) / x.std()'
    elif scaling == 'minmax_scale' and scaling_parms is None:
        scaling_parms = {'feature_range': (0, 1), 'axis': 0}
    elif scaling == 'scale' and scaling_parms is None:
        scaling_parms = {'with_mean': True, 'with_std': True, 'axis': 0}
    elif scaling == 'maxabs_scale' and scaling_parms is None:
        scaling_parms = {'axis': 0}
    elif scaling == 'robust_scale' and scaling_parms is None:
        scaling_parms = {'with_centering': True, 'with_scaling': True, 'axis': 0}  # 'quantile_range':(25.0, 75.0),
    else:
        raise TypeError('UNSUPPORTED scaling TYPE')

    self.transform_feature(col=col, func_str=scaling, addtional_params=scaling_parms)
def gen_feature_imp_matrix(model_id_list, features_df):
    feature_imp_matrix = pd.DataFrame
    for model_id in model_id_list[:1]:
        feature_imp_matrix = features_df[features_df.model_id == model_id].sort_values("feature", inplace=False).importance.values
    for model_id in model_id_list[1:]:
        b = features_df[features_df.model_id == model_id].sort_values("feature", inplace=False).importance.values
        feature_imp_matrix = np.vstack((feature_imp_matrix, b))
    feature_imp_matrix_normd = scale(np.transpose(feature_imp_matrix), axis=0, with_mean=True, with_std=True, copy=True)
    return feature_imp_matrix_normd
def spectrogramPower(audio, window_size=0.02, window_stride=0.01):
    """ short time fourier transform

    Details:
        audio - This is the input time-domain signal you wish to find the spectrogram of.
                It can't get much simpler than that. In your case, the signal you want to
                find the spectrogram of is defined in the following code:
        win_length - If you recall, we decompose the image into chunks, and each chunk has
                a specified width. window defines the width of each chunk in terms of samples.
                As this is a discrete-time signal, you know that this signal was sampled with
                a particular sampling frequency and sampling period. You can determine how
                large the window is in terms of samples by:
                    window_samples = window_time/Ts
        hop_length - the same as stride in convolution network, overlapping width
    """
    samplingRate, samples = wav.read(audio)
    win_length = int(window_size * samplingRate)
    hop_length = int(window_stride * samplingRate)
    n_fft = win_length
    D = librosa.core.stft(samples, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    mag = np.abs(D)
    log_mag = np.log1p(mag)
    # normalization
    log_mag = preprocessing.scale(log_mag)
    # size: frequency_bins * time_len
    return log_mag
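As a quick sanity check on the window arithmetic in the docstring above (assuming, purely for illustration, a 16 kHz sampling rate): window_size=0.02 gives win_length = int(0.02 * 16000) = 320 samples, and window_stride=0.01 gives hop_length = int(0.01 * 16000) = 160 samples, so consecutive frames overlap by half a window.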
def standardize(data):
    numeric_list = ['BsmtFullBath', 'LotArea', 'YearRemodAdd', 'GrLivArea', 'BsmtHalfBath',
                    'MiscVal', 'YearBuilt', 'WoodDeckSF', 'KitchenAbvGr', 'TotalBsmtSF',
                    'GarageArea', 'GarageCars', 'OpenPorchSF', 'MoSold', 'LowQualFinSF',
                    'BedroomAbvGr', 'Fireplaces', '1stFlrSF', 'FullBath', 'BsmtFinSF1',
                    'BsmtFinSF2', 'HalfBath', 'Porch', '2ndFlrSF', 'MasVnrArea', 'YrSold',
                    'BsmtUnfSF', 'LotFrontage', 'TotRmsAbvGrd']
    data.loc[:, numeric_list] = preprocessing.scale(data.loc[:, numeric_list])
def get_concat_data(label_csv, label_col, other_csvs, is_rate, important_feats):
    print('important_feats : ', len(important_feats))
    rank_feats = [f for f in get_csv_header(dataset1_csv) if 'click' in f]
    rank_feats = [f for f in rank_feats if f in important_feats] if important_feats else rank_feats
    X = pd.read_csv(label_csv, usecols=rank_feats + [label_col]).apply(small_dtype)
    X = X[:1000000] if is_tiny else X
    print('concat csvs ......')
    X = pd.concat([X, get_need_feats(other_csvs, is_rate, is_tiny, important_feats)], axis=1)
    # if label_csv.split('/')[-1] == 'dataset2.csv':
    #     for c in X.columns:
    #         if c.endswith('_fset_total_cnt'):
    #             X = X.drop(X[X[c]==0].index, axis=0)
    feat_cols = [f for f in X.columns if f != label_col]
    if is_to_csv:
        save_file = label_csv.split('.csv')[0] + '_concat.csv'
        if os.path.exists(save_file):
            print(save_file + " already exists")
        else:
            print('to csv ........')
            X = X.replace(np.nan, -1)
            X = X.replace(np.inf, -2)
            X[feat_cols] = scale(X[feat_cols]).astype('float16')
            X.to_csv(save_file, index=False, chunksize=50000)
            print(X.shape)
    # TODO
    cate_feats = [f for f in X.columns if 'click' in f]
    # convert the click-count features (rank_feats above) to category dtype
    X, = change_to_category([X], cate_feats)
    y = X[label_col].values
    X = X[feat_cols]
    if label_col == 'label':
        print('positive percent ', y.mean())
    return X, y
def windowCharacter(x):
    tmp = np.zeros((x.shape[0]))
    n = 0
    for row in x.iterrows():
        tmp[n] = signalMag(row[1]['X'], row[1]['Y'], row[1]['Z'])
        n = n + 1
    # if np.std(tmp) > 5:
    #     return None
    # else:
    p_25 = np.percentile(tmp, 25)
    p_75 = np.percentile(tmp, 75)
    tmp_25 = [each for each in tmp if each < p_25]
    tmp_75 = [each for each in tmp if each < p_75]
    data_dm = scale(tmp, with_mean=True, with_std=False)  # demean data
    (freq_1, power_1) = butterFilter(data_dm, lowcut_1, highcut_1)
    idx_1 = np.argmax(power_1)
    freq_1_sec = np.delete(freq_1, idx_1)
    power_1_sec = np.delete(power_1, idx_1)
    idx_1_sec = np.argmax(power_1_sec)
    (freq_2, power_2) = butterFilter(data_dm, lowcut_2, highcut_2)
    idx_2 = np.argmax(power_2)
    return (np.mean(tmp), np.std(tmp), np.median(tmp), np.linalg.norm(tmp_25), np.linalg.norm(tmp_75),
            np.sum(power_1), freq_1[idx_1], power_1[idx_1], freq_1_sec[idx_1_sec], power_1_sec[idx_1_sec],
            freq_2[idx_2], power_2[idx_2], freq_1[idx_1] / np.sum(power_1))
def normalize(x, sf, logtrans=True, sfnorm=True, zeromean=True):
    if sfnorm:
        assert len(sf.shape) == 1
        x = x / (sf[:, None] + 1e-8)  # colwise div
    if logtrans:
        x = np.log1p(x)
    if zeromean:
        x = scale(x)
    return x
def test_scale():
    matrix = [[0, 30],
              [1, 27],
              [3, 24]]
    scaled = pre.scale(matrix)
    print(scaled)