The following code examples, extracted from open-source Python projects, illustrate how to use the sklearn.preprocessing module.
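For orientation, here is a minimal, self-contained sketch of the basic pattern the examples below share (illustrative only, not taken from any of the projects): construct a transformer from sklearn.preprocessing, fit it to data, and transform.

```python
# Minimal usage sketch: scale each feature column to [0, 1] with
# MinMaxScaler, one of the transformers in sklearn.preprocessing.
import numpy as np
from sklearn.preprocessing import MinMaxScaler

X = np.array([[1.0, -1.0], [2.0, 0.0], [3.0, 1.0]])
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)  # learn per-column min/max, then rescale
print(X_scaled)  # each column now spans [0, 1]
```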
```python
import sklearn.preprocessing as skpre  # module alias used below


def _validate_sklearn_preprocessing(self):
    '''Validate "sklearn_preprocessing" dict in config'''
    self.sklearn_preprocessing = self.config.get('sklearn_preprocessing') or {}
    self._validate_type(self.sklearn_preprocessing, 'sklearn_preprocessing', dict)
    for k, v in self.sklearn_preprocessing.items():
        self._validate_type(v, 'sklearn_preprocessing:{}'.format(k), dict)
        if v.get('method') in dir(skpre) or callable(v.get('method')):
            pass
        else:
            self._validate_custom_callable(
                v.get('method'), True,
                'sklearn_preprocessing:{} - method'.format(k))
        # FunctionTransformer entries additionally need a 'func' callable
        if v['method'].split(':')[-1] == 'FunctionTransformer':
            self._validate_custom_callable(
                v.get('func'), True,
                'sklearn_preprocessing:{} - func passed to FunctionTransformer'.format(k))
```
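Judging from the `v['method'].split(':')` check, the validator appears to accept methods named in `'module:Name'` form. The exact config schema belongs to the surrounding project and is assumed here; the fragment below is hypothetical, purely to show the shape being validated.

```python
# Hypothetical config fragment (assumed schema, for illustration only)
sklearn_preprocessing = {
    'standardize': {'method': 'sklearn.preprocessing:StandardScaler'},
    'log_transform': {
        # FunctionTransformer entries also carry a 'func' callable,
        # checked by _validate_custom_callable above
        'method': 'sklearn.preprocessing:FunctionTransformer',
        'func': 'numpy:log1p',
    },
}
```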
```python
def outofsample_extensions(method=None, dataset=None):
    np.random.seed(1)
    sklearn.utils.check_random_state(1)
    train_data, train_labels, test_data, test_labels = dataset_loader(dataset, seed=1)

    # Learn a new space using Isomap
    isomap = Isomap(n_components=10, n_neighbors=20)
    train_data_isomap = np.float32(isomap.fit_transform(train_data))

    if method == 'linear-regression':
        from sklearn.preprocessing import StandardScaler
        std = StandardScaler()
        train_data = std.fit_transform(train_data)
        test_data = std.transform(test_data)

        # Use linear regression to provide baseline out-of-sample extensions
        proj = LinearRegression()
        proj.fit(np.float64(train_data), np.float64(train_data_isomap))
        acc = evaluate_svm(proj.predict(train_data), train_labels,
                           proj.predict(test_data), test_labels)
    elif method == 'c-ISOMAP-10d' or method == 'c-ISOMAP-20d':
        # Use the SEF to provide out-of-sample extensions
        if method == 'c-ISOMAP-10d':
            proj = LinearSEF(train_data.shape[1], output_dimensionality=10)
            proj.cuda()
        else:
            proj = LinearSEF(train_data.shape[1], output_dimensionality=20)
            proj.cuda()
        loss = proj.fit(data=train_data, target_data=train_data_isomap, target='copy',
                        epochs=50, batch_size=1024, verbose=False,
                        learning_rate=0.001, regularizer_weight=1)
        acc = evaluate_svm(proj.transform(train_data), train_labels,
                           proj.transform(test_data), test_labels)

    print("Method: ", method, " Test accuracy: ", 100 * acc, " %")
```
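One detail worth noting in the `'linear-regression'` branch: the `StandardScaler` is fit on the training data only and then reused on the test data, which avoids leaking test-set statistics into the scaling. A minimal sketch of that pattern (synthetic data, for illustration):

```python
# Sketch: fit scaling parameters on train data only, reuse them on test data
import numpy as np
from sklearn.preprocessing import StandardScaler

train = np.random.randn(100, 5) * 3.0 + 1.0
test = np.random.randn(20, 5) * 3.0 + 1.0

std = StandardScaler()
train_scaled = std.fit_transform(train)  # learns per-feature mean/std from train
test_scaled = std.transform(test)        # applies the same mean/std to test
```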
```python
def vectorize_fold(fold, tags, meta_df, use_metafeats=True):
    with time_me('Loaded pdicts'):
        scoreses = [common.pdict_for_tag(tag, fold) for tag in tags]
    df = meta_df[meta_df['fold'] == fold]
    assert len(df)
    y = df['label']
    n_predictors = len(scoreses)
    with time_me('Munged scores for {} predictors'.format(n_predictors), mode='print'):
        # TODO: could use the logit loading fn added to user_wrapper module
        scores = munge_scoreses(scoreses, df)
    if not use_metafeats:
        X = scores
    else:
        meta_cols = metavectorize.metafeature_columns
        meta = df[meta_cols].values
        # Special f_0 dummy meta feature for learning vanilla weight term per predictor
        metafeats = np.hstack([np.ones((len(df), 1)), meta])
        # (Worth noting that sklearn.preprocessing has a 'PolynomialFeatures' utility
        # that might have been useful here. But this is fine.)
        n_metafeats = metafeats.shape[1]
        logging.info('{} predictors x {} metafeatures -> {} coefs'.format(
            n_predictors, n_metafeats, n_predictors * n_metafeats))
        # X is 'metafeat major', i.e. the first n_p values of each vector are the
        # raw scores for each predictor; they're followed by each predictor's score
        # multiplied by the first metafeature, and so on.
        X = np.tile(scores, n_metafeats) * np.repeat(metafeats, n_predictors, axis=1)
    return X, y
```
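The comment above mentions `sklearn.preprocessing.PolynomialFeatures` as an alternative to the hand-rolled `np.tile`/`np.repeat` cross-product. A rough sketch of how it generates interaction terms (illustrative only; unlike the code above, it also produces score-score and metafeature-metafeature products, and the column ordering differs):

```python
# Sketch: interaction terms via PolynomialFeatures instead of tile/repeat
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

scores = np.random.rand(4, 3)     # 4 rows, 3 predictor scores
metafeats = np.random.rand(4, 2)  # 4 rows, 2 metafeatures

# interaction_only=True keeps the inputs plus products of distinct inputs;
# include_bias=False drops the constant column.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X = poly.fit_transform(np.hstack([scores, metafeats]))
```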
```python
import numpy as np
import librosa
import sklearn.preprocessing


def mfcc_features(filename):
    """Preprocessing per CTC paper. (These are not the simpler linear
    spectrogram features alone as in Deep Speech).

    Properties:
    - 10ms frames with 5ms overlap
    - 12 MFCCs with 26 filter banks
    - replace first MFCC with energy (TODO: log-energy)
    - add first-order derivatives for all of the above
    - total: 26 coefficients
    """
    d, sr = librosa.load(filename)
    frame_length_seconds = 0.010
    frame_overlap_seconds = 0.005
    # 10ms analysis window (n_fft), 5ms hop, matching the docstring
    mfccs = librosa.feature.mfcc(d, sr, n_mfcc=1 + 12,
                                 n_fft=int(frame_length_seconds * sr),
                                 hop_length=int(frame_overlap_seconds * sr))
    # energy (TODO: log?)
    energy = librosa.feature.rmse(d,
                                  n_fft=int(frame_length_seconds * sr),
                                  hop_length=int(frame_overlap_seconds * sr))
    mfccs[0] = energy  # replace first MFCC with energy, per convention
    deltas = librosa.feature.delta(mfccs, order=1)
    mfccs_plus_deltas = np.vstack([mfccs, deltas])
    # standardize each coefficient track over time
    coeffs = sklearn.preprocessing.scale(mfccs_plus_deltas, axis=1)
    return coeffs
```
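The final step standardizes each coefficient track over time rather than each frame. A small sketch of what `sklearn.preprocessing.scale(..., axis=1)` does (zero mean, unit variance along each row):

```python
# Sketch: scale(X, axis=1) standardizes each row independently
import numpy as np
from sklearn.preprocessing import scale

X = np.array([[1.0, 2.0, 3.0],
              [10.0, 20.0, 30.0]])
Xs = scale(X, axis=1)
print(Xs.mean(axis=1))  # ~[0, 0]: each row now has zero mean
print(Xs.std(axis=1))   # ~[1, 1]: and unit variance
```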
```python
from sklearn import preprocessing


def _weightProcessing(weightDF):
    weightDF = weightDF.loc[1:, :]
    weightDF['coefficient'] = weightDF['coefficient'].abs()
    min_max_scaler = preprocessing.MinMaxScaler()
    weight_scaled = min_max_scaler.fit_transform(weightDF[['coefficient']])
    weightDF['coefficient'] = weight_scaled
    print(weightDF.sort_values('coefficient', ascending=False).to_string(index=False))
```
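Note that `fit_transform` is passed `weightDF[['coefficient']]` with double brackets, which yields a one-column DataFrame: scikit-learn transformers expect 2-D input. A minimal sketch of the same idea on synthetic data:

```python
# Sketch: scaling a single DataFrame column; [['...']] keeps the input 2-D
import pandas as pd
from sklearn import preprocessing

df = pd.DataFrame({'coefficient': [-2.0, 0.5, 3.0]})
df['coefficient'] = df['coefficient'].abs()
scaler = preprocessing.MinMaxScaler()
# ravel() flattens the (n, 1) output back to a 1-D column
df['coefficient'] = scaler.fit_transform(df[['coefficient']]).ravel()
```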
```python
def __init__(self, clf=None, le=None):
    # type: (sklearn.model_selection.GridSearchCV, sklearn.preprocessing.LabelEncoder) -> None
    """Construct a new intent classifier using the sklearn framework."""
    from sklearn.preprocessing import LabelEncoder

    if le is not None:
        self.le = le
    else:
        self.le = LabelEncoder()
    self.clf = clf
```
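For context, `sklearn.preprocessing.LabelEncoder` maps string labels to integer ids and back, which is why the classifier keeps one alongside its estimator. A minimal usage sketch:

```python
# Sketch: LabelEncoder round-trips string intent labels to integer ids
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(['greet', 'bye', 'greet'])  # -> [1, 0, 1]
labels = le.inverse_transform(y)                 # back to the strings
print(le.classes_)                               # ['bye' 'greet'], sorted
```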
```python
def cal_minute_bar_similarity(line_data):
    """Compare today's minute bars against one historical line.

    line_data format: (file_path, json_data)

    Return: square diff and var diff of the two lines, raw and normalized:
    [diff_square, diff_var, (line_path)]
    [diff_square_normalized, diff_var_normalized, (line_path)]
    """
    import sklearn.preprocessing

    tmp = pd.DataFrame()
    scaler = sklearn.preprocessing.MinMaxScaler()
    today_data = pd.DataFrame.from_dict(json.loads(df_today_share.value))
    today_data_length = today_length_share.value
    line_path, line_df = line_data
    line_df = pd.DataFrame.from_dict(json.loads(line_df))
    line_df.sort_values(by=['barTime'], ascending=True, inplace=True)
    tmp['first'] = list(today_data[:today_data_length]['ratio'])
    tmp['second'] = list(line_df[:today_data_length]['ratio'])
    _first, _second = list(tmp['first']), list(tmp['second'])
    # MinMaxScaler expects 2-D input: reshape to one column, then flatten back
    tmp['first_normalized'] = scaler.fit_transform(np.array(_first).reshape(-1, 1)).ravel()
    tmp['second_normalized'] = scaler.fit_transform(np.array(_second).reshape(-1, 1)).ravel()
    tmp['diff'] = tmp['first'] - tmp['second']
    tmp['diff_normalized'] = tmp['first_normalized'] - tmp['second_normalized']
    diff_square = sum(tmp['diff'] ** 2)
    diff_square_normalized = sum(tmp['diff_normalized'] ** 2)
    diff_var = float(tmp['diff'].var())
    diff_var_normalized = float(tmp['diff_normalized'].var())
    res_square = [round(diff_square, 5), round(diff_square_normalized, 5), (line_path)]
    res_var = [round(diff_var, 5), round(diff_var_normalized, 5), (line_path)]
    return res_square + res_var
```
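The reshape in the example above is worth calling out on its own: recent scikit-learn versions reject 1-D input to transformers, so a series must be turned into a single-column matrix before scaling. A minimal sketch of that pattern:

```python
# Sketch: reshape a 1-D series to (n, 1) before scaling, flatten afterwards
import numpy as np
from sklearn.preprocessing import MinMaxScaler

ratios = np.array([0.3, 0.7, 0.1, 0.9])
scaler = MinMaxScaler()
scaled = scaler.fit_transform(ratios.reshape(-1, 1)).ravel()
```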