# Python sklearn module, preprocessing(): example source code


def _validate_sklearn_preprocessing(self):
        '''Validate "sklearn_preprocessing" dict in config'''
        # Fall back to an empty dict when the key is absent or falsy, then
        # check that the section itself is a dict.
        self.sklearn_preprocessing = self.config.get('sklearn_preprocessing') or {}
        self._validate_type(self.sklearn_preprocessing, 'sklearn_preprocessing', dict)
        for k, v in self.sklearn_preprocessing.items():
            # Every entry of the section must itself be a dict.
            self._validate_type(v, 'sklearn_preprocessing:{}'.format(k), dict)
            # True when 'method' is a name exposed by `skpre` or is callable.
            # NOTE(review): `skpre` is presumably an alias of
            # sklearn.preprocessing imported elsewhere -- confirm.
            if v.get('method') in dir(skpre) or callable(v.get('method')):
                # NOTE(review): the statement this argument line belongs to is
                # missing from the file; only the trailing context string
                # survives. Restore the original call before relying on this.
                                               'sklearn_preprocessing:{} - method'.format(k))
            # Compares the part of 'method' after the last ':'.
            # NOTE(review): `.split` assumes 'method' is a str here, although
            # the check above also admits callables -- confirm.
            if v['method'].split(':')[-1] == 'FunctionTransformer':
                # NOTE(review): as above, the opening of this call is missing;
                # the context string suggests it validated the `func` given to
                # FunctionTransformer -- confirm against the upstream source.
                                               'sklearn_preprocessing:{} - func passed to FunctionTransformer'.format(k))
def outofsample_extensions(method=None, dataset=None):
    """Evaluate an out-of-sample extension of an Isomap embedding.

    An Isomap embedding (10 components, 20 neighbours) is learned on the
    training split returned by ``dataset_loader(dataset, seed=1)``. A
    projection from the input space onto that embedding is then fitted with
    the requested ``method``, both splits are projected, and the result of
    ``evaluate_svm`` on the projected data is printed as a percentage.

    Parameters
    ----------
    method : str
        ``'linear-regression'`` (baseline on standardized inputs),
        ``'c-ISOMAP-10d'`` or ``'c-ISOMAP-20d'`` (LinearSEF with a 10- or
        20-dimensional output).
    dataset
        Forwarded unchanged to ``dataset_loader``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. Without this check
        the function would do all the work and then fail on an unbound
        ``acc`` in the final print.
    """
    supported = ('linear-regression', 'c-ISOMAP-10d', 'c-ISOMAP-20d')
    if method not in supported:
        raise ValueError(
            'Unknown method {!r}; expected one of {}'.format(method, supported))

    train_data, train_labels, test_data, test_labels = dataset_loader(dataset, seed=1)

    # Learn a new space using Isomap
    isomap = Isomap(n_components=10, n_neighbors=20)
    train_data_isomap = np.float32(isomap.fit_transform(train_data))

    if method == 'linear-regression':
        from sklearn.preprocessing import StandardScaler
        std = StandardScaler()
        train_data = std.fit_transform(train_data)
        test_data = std.transform(test_data)

        # Use linear regression to provide baseline out-of-sample extensions.
        # NOTE(review): the first argument of fit() was lost in the source;
        # the standardized training data is used because predict() below is
        # called on the same array.
        proj = LinearRegression().fit(train_data, np.float64(train_data_isomap))
        acc = evaluate_svm(proj.predict(train_data), train_labels,
                           proj.predict(test_data), test_labels)
    else:
        # Use the SEF to provide out-of-sample extensions. The output size is
        # selected by the method name (previously the 10-d projection was
        # unconditionally replaced by the 20-d one).
        output_dim = 10 if method == 'c-ISOMAP-10d' else 20
        proj = LinearSEF(train_data.shape[1], output_dimensionality=output_dim)
        # NOTE(review): the receiver and first argument of this call were lost
        # in the source; `proj.fit(train_data, ...)` is the reconstruction.
        proj.fit(train_data, target_data=train_data_isomap, target='copy',
                 epochs=50, batch_size=1024, verbose=False, learning_rate=0.001, regularizer_weight=1)
        acc = evaluate_svm(proj.transform(train_data), train_labels,
                           proj.transform(test_data), test_labels)

    print("Method: ", method, " Test accuracy: ", 100 * acc, " %")
def vectorize_fold(fold, tags, meta_df, use_metafeats=True):
  """Build the feature matrix and labels for one fold.

  Loads one prediction dict per tag via ``common.pdict_for_tag``, selects the
  rows of ``meta_df`` whose 'fold' column equals ``fold`` and turns the
  predictions into a score matrix with ``munge_scoreses``.

  Args:
    fold: value matched against ``meta_df['fold']``.
    tags: iterable of predictor tags; one score column is produced per tag.
    meta_df: DataFrame with at least 'fold' and 'label' columns, plus the
      columns named in ``metavectorize.metafeature_columns`` when
      ``use_metafeats`` is true.
    use_metafeats: when false, X is the raw score matrix. When true, X holds
      every predictor score multiplied by every metafeature (including a
      constant-1 dummy metafeature).

  Returns:
    (X, y) where y is ``df['label']`` for the selected rows.
  """
  import logging

  with time_me('Loaded pdicts'):
    scoreses = [common.pdict_for_tag(tag, fold) for tag in tags]
  df = meta_df[meta_df['fold']==fold]
  assert len(df)
  y = df['label']
  n_predictors = len(scoreses)
  with time_me('Munged scores for {} predictors'.format(n_predictors), mode='print'):
    # TODO: could use the logit loading fn added to user_wrapper module
    scores = munge_scoreses(scoreses, df)
  if not use_metafeats:
    X = scores
  else:
    # This branch was missing its `else:`, which left X unbound whenever
    # use_metafeats was true.
    meta_cols = metavectorize.metafeature_columns
    meta = df[meta_cols].values
    # Special f_0 dummy meta feature for learning vanilla weight term per predictor
    metafeats = np.hstack([np.ones( (len(df), 1) ), meta])
    # The pairwise products are built with tile/repeat rather than np.einsum.
    # (sklearn.preprocessing.PolynomialFeatures might also have been usable.)
    n_metafeats = metafeats.shape[1]
    # NOTE(review): the call that consumed this message was lost in the
    # source; logging.info is a reconstruction -- confirm the intended sink.
    logging.info('{} predictors x {} metafeatures -> {} coefs'.format(
      n_predictors, n_metafeats, n_predictors*n_metafeats))
    # X is 'metafeat major'. i.e. the first n_p values for each vector are the
    # raw scores for each predictor, they're followed by each predictor's score
    # multiplied by the first metafeature and so on.
    # Assumes scores is 2-D with one column per predictor -- TODO confirm.
    X = np.tile(scores, n_metafeats) * np.repeat(metafeats, n_predictors, axis=1)
  return X, y
def mfcc_features(filename):
  """Preprocessing per CTC paper.

  Loads the audio file with librosa and returns MFCC-based features rather
  than a plain linear spectrogram:

  - 12 MFCCs (13 are computed; the first is replaced below)
  - replace first MFCC with energy (TODO: log-energy)
  - add first-order derivatives for all of the above
  - total: 26 coefficients

  The stated intent is 10ms frames with 5ms overlap.
  NOTE(review): the code passes the 5ms value as both ``n_fft`` and
  ``hop_length``, and ``frame_length_seconds`` is never used. Left unchanged
  because altering it would change the features -- confirm which is intended.

  Args:
    filename: path handed to ``librosa.load``.

  Returns:
    The stacked MFCC and delta rows after ``sklearn.preprocessing.scale``
    along axis 1.
  """
  d, sr = librosa.load(filename)

  frame_length_seconds = 0.010
  frame_overlap_seconds = 0.005

  # Window and hop are both derived from the overlap duration (see note above).
  samples_per_hop = int(frame_overlap_seconds*sr)

  mfccs = librosa.feature.mfcc(d, sr, n_mfcc=1+12, n_fft=samples_per_hop, hop_length=samples_per_hop)

  # energy (TODO: log?)
  energy = librosa.feature.rmse(d, n_fft=samples_per_hop, hop_length=samples_per_hop)

  mfccs[0] = energy # replace first MFCC with energy, per convention

  # NOTE(review): the callee of this statement was lost in the source;
  # librosa.feature.delta(mfccs, order=1) is the reconstruction.
  deltas = librosa.feature.delta(mfccs, order=1)
  mfccs_plus_deltas = np.vstack([mfccs, deltas])

  coeffs = sklearn.preprocessing.scale(mfccs_plus_deltas, axis=1)

  return coeffs
def _weightProcessing(weightDF):
    """Print the coefficients of ``weightDF`` ranked by scaled magnitude.

    Rows from index label 1 onward are kept (presumably dropping an intercept
    row at label 0 -- confirm against the caller). The 'coefficient' column is
    replaced by its absolute value, min-max scaled, and the frame is printed
    in descending order of that column without the index.

    The work is done on a copy, so the caller's frame is left untouched and
    pandas does not emit a SettingWithCopyWarning.

    Parameters
    ----------
    weightDF : pandas.DataFrame
        Must contain a 'coefficient' column.
    """
    ranked = weightDF.loc[1:, :].copy()
    ranked['coefficient'] = ranked['coefficient'].abs()
    min_max_scaler = preprocessing.MinMaxScaler()
    # The scaler needs 2-D input, hence the double-bracket column selection.
    ranked['coefficient'] = min_max_scaler.fit_transform(ranked[['coefficient']])
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(ranked.sort_values('coefficient', ascending=False).to_string(index=False))
def __init__(self, clf=None, le=None):
        # type: (sklearn.model_selection.GridSearchCV, sklearn.preprocessing.LabelEncoder) -> None
        """Construct a new intent classifier using the sklearn framework.

        :param clf: classifier to use; stored as ``self.clf`` (may be None).
        :param le: label encoder to reuse; when None a fresh
            ``sklearn.preprocessing.LabelEncoder`` is created.
        """
        if le is not None:
            self.le = le
        else:
            # Previously a supplied encoder was always overwritten and
            # ``self.le`` was never set when ``le`` was None.
            # Imported here so sklearn is only needed for the default encoder.
            from sklearn.preprocessing import LabelEncoder
            self.le = LabelEncoder()
        self.clf = clf
def __init__(self, clf=None, le=None):
        # type: (sklearn.model_selection.GridSearchCV, sklearn.preprocessing.LabelEncoder) -> None
        """Construct a new intent classifier using the sklearn framework.

        :param clf: classifier to use; stored as ``self.clf`` (may be None).
        :param le: label encoder to reuse; when None a fresh
            ``sklearn.preprocessing.LabelEncoder`` is created.
        """
        if le is None:
            # Only the default path needs sklearn, so import it here.
            # Previously ``self.le`` stayed unset on this path and a supplied
            # encoder was always replaced.
            from sklearn.preprocessing import LabelEncoder
            le = LabelEncoder()
        self.le = le
        self.clf = clf
def cal_minute_bar_similarity(line_data):
    """Compare today's 'ratio' series with one other minute-bar series.

    Today's data and its length are read from the module-level shared values
    ``df_today_share`` (a JSON string) and ``today_length_share``. The other
    series is sorted by 'barTime'. Both series are cut to the first
    ``today_length_share.value`` rows and compared twice: on the raw values
    and after min-max scaling each series independently.

    Parameters
    ----------
    line_data : tuple
        ``(line_path, json_data)``: an identifier for the series and its
        records as a JSON string with 'barTime' and 'ratio' fields.

    Returns
    -------
    list
        A flat list of six items, numeric values rounded to 5 decimals:
        ``[diff_square, diff_square_normalized, line_path,
        diff_var, diff_var_normalized, line_path]``, where ``diff_square`` is
        the sum of squared differences and ``diff_var`` the variance of the
        differences.
    """
    import sklearn.preprocessing
    scaler = sklearn.preprocessing.MinMaxScaler()

    today_data = pd.DataFrame.from_dict(json.loads(df_today_share.value))
    today_data_length = today_length_share.value
    line_path, line_df = line_data

    line_df = pd.DataFrame.from_dict(json.loads(line_df))
    # DataFrame.sort(columns=...) no longer exists in pandas; sort_values is
    # the equivalent call.
    line_df = line_df.sort_values(by=['barTime'], ascending=True)

    tmp = pd.DataFrame()
    tmp['first'] = list(today_data[: today_data_length]['ratio'])
    tmp['second'] = list(line_df[: today_data_length]['ratio'])

    # The scaler requires 2-D input: scale each series as a single column and
    # flatten the result back into a plain column of values.
    for name in ('first', 'second'):
        scaled = scaler.fit_transform(tmp[[name]].values)
        tmp[name + '_normalized'] = scaled.ravel()

    tmp['diff'] = tmp['first'] - tmp['second']
    tmp['diff_normalized'] = tmp['first_normalized'] - tmp['second_normalized']

    diff_square = sum(tmp['diff'] ** 2)
    diff_square_normalized = sum(tmp['diff_normalized'] ** 2)

    diff_var = float(tmp['diff'].var())
    diff_var_normalized = float(tmp['diff_normalized'].var())

    res_square = [round(diff_square, 5), round(diff_square_normalized, 5), line_path]
    res_var = [round(diff_var, 5), round(diff_var_normalized, 5), line_path]

    return res_square + res_var

