Python sklearn.preprocessing module: MinMaxScaler() example source code

We have extracted the following 50 code examples from open-source Python projects to illustrate how to use sklearn.preprocessing.MinMaxScaler().
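
For reference, MinMaxScaler rescales each feature column independently to a target range (default (0, 1)) via X_scaled = (X - X_min) / (X_max - X_min). A minimal, self-contained sketch of the usual fit/transform pattern (toy data):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

X_train = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
X_test = np.array([[2.5, 15.0]])

scaler = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler.fit_transform(X_train)  # learn per-column min/max, then scale
X_test_scaled = scaler.transform(X_test)        # reuse the training statistics

The examples below all follow some variant of this pattern: fit (or fit_transform) on training data, transform on held-out data.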

Project: sef    Author: passalis
def evaluate_svm(train_data, train_labels, test_data, test_labels, n_jobs=-1):
    """
    Evaluates a representation using a Linear SVM
    It uses 3-fold cross validation for selecting the C parameter
    :param train_data:
    :param train_labels:
    :param test_data:
    :param test_labels:
    :param n_jobs:
    :return: the test accuracy
    """

    # Scale data to 0-1
    scaler = MinMaxScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)

    parameters = {'kernel': ['linear'], 'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]}
    model = svm.SVC(max_iter=10000)
    clf = grid_search.GridSearchCV(model, parameters, n_jobs=n_jobs, cv=3)
    clf.fit(train_data, train_labels)
    lin_svm_test = clf.score(test_data, test_labels)
    return lin_svm_test
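
Note: the sklearn.grid_search module used above predates scikit-learn 0.18 and was removed in 0.20; a sketch of the equivalent call against the modern API, assuming the same variables:

from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(svm.SVC(max_iter=10000), parameters, n_jobs=n_jobs, cv=3)
clf.fit(train_data, train_labels)
lin_svm_test = clf.score(test_data, test_labels)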
Project: mlens    Author: flennerhag
def build_ensemble(**kwargs):
    """Generate ensemble."""

    ens = SuperLearner(**kwargs)
    prep = {'Standard Scaling': [StandardScaler()],
            'Min Max Scaling': [MinMaxScaler()],
            'No Preprocessing': []}

    est = {'Standard Scaling':
               [ElasticNet(), Lasso(), KNeighborsRegressor()],
           'Min Max Scaling':
               [SVR()],
           'No Preprocessing':
               [RandomForestRegressor(random_state=SEED),
                GradientBoostingRegressor()]}

    ens.add(est, prep)

    ens.add(GradientBoostingRegressor(), meta=True)

    return ens
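
A hypothetical usage sketch for the ensemble above (X_train, y_train, X_test are placeholder names; SuperLearner and SEED come from the surrounding project code):

ensemble = build_ensemble(folds=2, random_state=SEED)
ensemble.fit(X_train, y_train)
predictions = ensemble.predict(X_test)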
Project: muffnn    Author: civisanalytics
def test_replicability():
    """Make sure it can be seeded properly."""
    X = iris.data  # Use the iris features.
    X = MinMaxScaler().fit_transform(X)

    ae1 = Autoencoder(hidden_units=(1,),
                      n_epochs=1000,
                      random_state=4556,
                      learning_rate=1e-2,
                      keep_prob=1.0)
    Xenc1 = ae1.fit_transform(X)

    ae2 = Autoencoder(hidden_units=(1,),
                      n_epochs=1000,
                      random_state=4556,
                      learning_rate=1e-2,
                      keep_prob=1.0)
    Xenc2 = ae2.fit_transform(X)

    assert_array_almost_equal(Xenc1, Xenc2)
Project: golden_touch    Author: at553
def train_model(self):
        # scale
        scaler = MinMaxScaler(feature_range=(0, 1))
        dataset = scaler.fit_transform(self.data)

        # split into train and test sets
        train_size = int(len(dataset) * 0.95)
        train, test = dataset[0:train_size, :], dataset[train_size:len(dataset), :]

        look_back = 5
        trainX, trainY = self.create_dataset(train, look_back)

        # reshape input to be [samples, time steps, features]
        trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
        # create and fit the LSTM network
        model = Sequential()
        model.add(LSTM(6, input_dim=look_back))
        model.add(Dense(1))
        model.compile(loss='mean_squared_error', optimizer='adam')
        model.fit(trainX, trainY, nb_epoch=100, batch_size=1, verbose=2)
        return model
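
Note: nb_epoch and LSTM(6, input_dim=look_back) are Keras 1.x spellings; under Keras 2 the equivalents would be model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2) and LSTM(6, input_shape=(1, look_back)).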
Project: keras-timeseries-prediction    Author: gcarq
def load_dataset(datasource: str) -> (numpy.ndarray, MinMaxScaler):
    """
    Loads the dataset from the given file and scales it with MinMaxScaler.
    :param datasource: file name of the data source
    :return: tuple of the dataset and the fitted MinMaxScaler
    """
    # load the dataset
    dataframe = pandas.read_csv(datasource, usecols=[1])
    dataframe = dataframe.fillna(method='pad')
    dataset = dataframe.values
    dataset = dataset.astype('float32')

    plt.plot(dataset)
    plt.show()

    # normalize the dataset
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset = scaler.fit_transform(dataset)
    return dataset, scaler
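
The fitted scaler is returned alongside the data so that model outputs can later be mapped back to the original units, e.g. (sketch; 'data.csv' is a placeholder path):

dataset, scaler = load_dataset('data.csv')
# ... train a model on the scaled series, obtaining predictions_scaled ...
# predictions = scaler.inverse_transform(predictions_scaled)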
Project: probablyPOTUS    Author: jjardel
def train(self, train_size=0.8, k_folds=5):

        # retrieve data from DB and pre-process
        self._get_data()

        # perform train/test split
        self._get_train_test_split(train_size=train_size)

        # define text pre-processing pipeline
        text_pipeline = Pipeline([
            ('extract_text', DFColumnExtractor(TEXT_FEATURES)),
            ('vect', TfidfVectorizer(tokenizer=twitter_tokenizer))
        ])

        # define pipeline for pre-processing of numeric features
        numeric_pipeline = Pipeline([
            ('extract_nums', DFColumnExtractor(NON_TEXT_FEATURES)),
            ('scaler', MinMaxScaler())
        ])

        # combine both steps into a single pipeline
        pipeline = Pipeline([
            ('features', FeatureUnion([
                ('text_processing', text_pipeline),
                ('num_processing', numeric_pipeline)
            ])),
            ('clf', self._estimator)
        ])

        self.logger.info('Fitting model hyperparameters with {0}-fold CV'.format(k_folds))
        gs = GridSearchCV(pipeline, self.params, n_jobs=-1, cv=k_folds)

        X = self.data.iloc[self.train_inds_, :]
        y = self.data[LABEL].values[self.train_inds_]

        gs.fit(X, y)

        self.logger.info('Validation set accuracy is {0}'.format(gs.best_score_))

        self.gs_ = gs
        self.model_ = gs.best_estimator_
Project: triage    Author: dssg
def test_cutoff_inside_a_pipeline(data):
    minmax_scaler = preprocessing.MinMaxScaler()
    dsapp_cutoff = CutOff()

    pipeline = Pipeline([
        ('minmax_scaler', minmax_scaler),
        ('dsapp_cutoff', dsapp_cutoff)
    ])

    pipeline.fit(data['X_train'], data['y_train'])

    X_fake_new_data = data['X_test'][-1,:].reshape(1,-1) + 0.5

    mms = preprocessing.MinMaxScaler().fit(data['X_train'])

    assert np.all((mms.transform(X_fake_new_data) > 1) == (pipeline.transform(X_fake_new_data) == 1))
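
CutOff is defined by the project under test; a minimal sketch of such a transformer (hypothetical, for illustration only) that is consistent with the assertion above:

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class CutOff(BaseEstimator, TransformerMixin):
    """Clip scaled values into [0, 1] (illustrative sketch, not the project's code)."""
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = np.asarray(X, dtype=float).copy()
        X[X > 1] = 1  # values above the training max are clipped to 1
        X[X < 0] = 0  # values below the training min are clipped to 0
        return X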
Project: triage    Author: dssg
def test_dsapp_lr(data):
    dsapp_lr = ScaledLogisticRegression()
    dsapp_lr.fit(data['X_train'], data['y_train'])

    minmax_scaler = preprocessing.MinMaxScaler()
    dsapp_cutoff = CutOff()
    lr = linear_model.LogisticRegression()

    pipeline = Pipeline([
        ('minmax_scaler', minmax_scaler),
        ('dsapp_cutoff', dsapp_cutoff),
        ('lr', lr)
    ])

    pipeline.fit(data['X_train'], data['y_train'])

    assert np.all(dsapp_lr.predict(data['X_test']) == pipeline.predict(data['X_test']))
Project: Supply-demand-forecasting    Author: LevinJ
def get_input(self):
        # Input data.
        # Load the training, validation and test data into constants that are
        # attached to the graph.
        self.x_train, self.y_train, self.x_validation, self.y_validation = self.get_train_validationset()
        self.x_train, self.y_train, self.x_validation, self.y_validation = self.x_train.values, self.y_train.values.reshape((-1, 1)),\
                                                                           self.x_validation.values, self.y_validation.values.reshape((-1, 1))
#         self.x_train, self.y_train,self.x_validation,self.y_validation = self.x_train.astype(np.float32), self.y_train.astype(np.float32),\
#                                                                          self.x_validation.astype(np.float32),self.y_validation.astype(np.float32)
        sc = MinMaxScaler()
        sc.fit(self.x_train)
        self.x_train= sc.transform(self.x_train)
        self.x_validation= sc.transform(self.x_validation)

        self.inputlayer_num = len(self.get_used_features())
        self.outputlayer_num = 1

        # Input placeholders
        with tf.name_scope('input'):
            self.x = tf.placeholder(tf.float32, [None, self.inputlayer_num], name='x-input')
            self.y_true = tf.placeholder(tf.float32, [None, self.outputlayer_num ], name='y-input')
        self.keep_prob = tf.placeholder(tf.float32, name='drop_out')

        return
Project: dask-ml    Author: dask
def test_df_values(self):
        est1 = dpp.MinMaxScaler()
        est2 = dpp.MinMaxScaler()

        result_ar = est1.fit_transform(X)
        result_df = est2.fit_transform(df)

        for attr in ['data_min_', 'data_max_', 'data_range_',
                     'scale_', 'min_']:
            assert_eq_ar(getattr(est1, attr), getattr(est2, attr).values)

        assert_eq_ar(est1.transform(X), est2.transform(X))
        assert_eq_ar(est1.transform(df).values, est2.transform(X))
        assert_eq_ar(est1.transform(X), est2.transform(df).values)

        assert_eq_ar(result_ar, result_df.values)
Project: copper_price_forecast    Author: liyinwei
def _pp_min_max_scale(df):
    """
    Min-max scale the features of the given DataFrame.
    """
    print("  start minmax scaling...")
    # drop the id and price_date columns
    # df = df.drop(['id', 'price_date'], axis=1)
    # keep the original index and column names
    index = df.index
    columns = df.columns
    # min-max scale the feature columns
    feature_scaled = preprocessing.MinMaxScaler().fit_transform(df.iloc[:, :-1])

    target = np.array(df.iloc[:, -1])
    target.shape = (len(target), 1)

    # recombine the scaled features X and the target y into a pandas DataFrame (the scaler returns a numpy ndarray)
    df_scaled = pd.DataFrame(np.hstack((feature_scaled, target)))
    # restore the original index and column names
    df_scaled.index = index
    df_scaled.columns = columns

    print("  minmax scaling finished.")
    return df_scaled
Project: merlin    Author: CSTR-Edinburgh
def load_norm_stats(stats_file, dim, method="MVN"):
    #### load norm stats ####
    io_funcs = BinaryIOCollection()

    norm_matrix, frame_number = io_funcs.load_binary_file_frame(stats_file, dim)
    assert frame_number==2

    if method=="MVN":
        scaler = preprocessing.StandardScaler()
        scaler.mean_  = norm_matrix[0, :]
        scaler.scale_ = norm_matrix[1, :]
    elif method=="MINMAX":
        scaler = preprocessing.MinMaxScaler(feature_range=(0.01, 0.99))
        scaler.min_   = norm_matrix[0, :]
        scaler.scale_ = norm_matrix[1, :]

    return scaler
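
This works because MinMaxScaler.transform computes X * scale_ + min_ and StandardScaler.transform computes (X - mean_) / scale_, so populating those attributes from a precomputed stats file yields a working scaler without ever calling fit.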
Project: 2016CCF_BDCI_Sougou    Author: coderSkyChen
def get_term_topic(self, X):
        n_features = X.shape[1]
        id2word = self.vocabulary_
        word2topic = {}

        with open('word_topic.txt', 'r', encoding='utf-8') as f:
            for line in f:
                strs = line.strip('\n').split('\t')
                word2topic[strs[0]] = strs[2]

        topic = np.zeros((len(id2word),))

        for i, key in enumerate(id2word):
            if key in word2topic:
                topic[id2word[key]] = word2topic[key]
            else:
                print(key)

        topic = preprocessing.MinMaxScaler().fit_transform(topic.reshape(-1, 1)).ravel()
        # topic = sp.spdiags(topic, diags=0, m=n_features,
        #                    n=n_features, format='csr')
        return topic
Project: 2016CCF-sougou    Author: prozhuchen
def get_term_topic(self, X):
        n_features = X.shape[1]
        id2word = self.vocabulary_
        word2topic = {}

        with open('word_topic.txt', 'r', encoding='utf-8') as f:
            for line in f:
                strs = line.strip('\n').split('\t')
                word2topic[strs[0]] = strs[2]

        topic = np.zeros((len(id2word),))

        for i, key in enumerate(id2word):
            if key in word2topic:
                topic[id2word[key]] = word2topic[key]
            else:
                print(key)

        topic = preprocessing.MinMaxScaler().fit_transform(topic.reshape(-1, 1)).ravel()
        # topic = sp.spdiags(topic, diags=0, m=n_features,
        #                    n=n_features, format='csr')
        return topic
Project: ottertune    Author: cmu-db
def __init__(self, mins=None, maxs=None):
        from sklearn.preprocessing import MinMaxScaler

        self.scaler_ = MinMaxScaler()
        if mins is not None:
            assert isinstance(mins, np.ndarray)
            if mins.ndim == 1:
                mins = mins.reshape(1, -1)
            self.scaler_.partial_fit(mins)
            self.mins_ = mins
        else:
            self.mins_ = None
        if maxs is not None:
            assert isinstance(maxs, np.ndarray)
            if maxs.ndim == 1:
                maxs = maxs.reshape(1, -1)
            self.scaler_.partial_fit(maxs)
            self.maxs_ = maxs
        else:
            self.maxs_ = None
        if self.mins_ is not None and self.maxs_ is not None:
            self.fitted_ = True
        else:
            self.fitted_ = False
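
Assuming mins <= maxs elementwise, the two partial_fit calls leave the wrapped scaler with data_min_ == mins and data_max_ == maxs, so transform maps the interval [mins, maxs] onto [0, 1] per dimension without the scaler ever seeing real data.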
Project: stock-price-prediction    Author: chinuy
def applyFeatures(dataset, delta):
    """
    applies rolling mean and delayed returns to each dataframe in the list
    """
    columns = dataset.columns
    close = columns[-3]
    returns = columns[-1]
    for n in delta:
        addFeatures(dataset, close, returns, n)

    dataset = dataset.drop(dataset.index[0:max(delta)]) #drop NaN due to delta spanning

    # normalize columns
    scaler = preprocessing.MinMaxScaler()
    return pd.DataFrame(scaler.fit_transform(dataset),\
            columns=dataset.columns, index=dataset.index)
Project: Steal-ML    Author: ftramer
def prepare_faces():
    data = sklearn.datasets.fetch_olivetti_faces('../data', shuffle=False)
    X = data.data
    y = data.target

    X = np.split(X, 40)
    y = np.split(y, 40)

    X_train = [x[0:7, :] for x in X]
    X_test = [x[7:, :] for x in X]
    y_train = [a[0:7] for a in y]
    y_test = [a[7:] for a in y]
    X_train = np.concatenate(X_train)
    X_test = np.concatenate(X_test)
    y_train = pd.Series(np.concatenate(y_train))
    y_test = pd.Series(np.concatenate(y_test))

    scaler = MinMaxScaler(feature_range=(-1, 1))
    X_train = pd.DataFrame(scaler.fit_transform(X_train))
    X_test = pd.DataFrame(scaler.transform(X_test))

    return X_train, y_train, X_test, y_test, scaler
Project: Steal-ML    Author: ftramer
def prepare_faces():
    data = sklearn.datasets.fetch_olivetti_faces('../data', shuffle=False)
    X = data.data
    y = data.target

    X = np.split(X, 40)
    y = np.split(y, 40)

    X_train = [x[0:7, :] for x in X]
    X_test = [x[7:, :] for x in X]
    y_train = [a[0:7] for a in y]
    y_test = [a[7:] for a in y]
    X_train = np.concatenate(X_train)
    X_test = np.concatenate(X_test)
    y_train = np.concatenate(y_train)
    y_test = np.concatenate(y_test)

    scaler = MinMaxScaler(feature_range=(-1, 1))
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, y_train, X_test, y_test, scaler
Project: 2016_CCFsougou    Author: dhdsjy
def get_term_topic(self, X):
        n_features = X.shape[1]
        id2word = self.vocabulary_
        word2topic = {}

        with open('word_topic.txt', 'r', encoding='utf-8') as f:
            for line in f:
                strs = line.strip('\n').split('\t')
                word2topic[strs[0]] = strs[2]

        topic = np.zeros((len(id2word),))

        for i, key in enumerate(id2word):
            if key in word2topic:
                topic[id2word[key]] = word2topic[key]
            else:
                print(key)

        topic = preprocessing.MinMaxScaler().fit_transform(topic.reshape(-1, 1)).ravel()
        # topic = sp.spdiags(topic, diags=0, m=n_features,
        #                    n=n_features, format='csr')
        return topic
Project: 2016_CCFsougou2    Author: dhdsjy
def get_term_topic(self, X):
        n_features = X.shape[1]
        id2word = self.vocabulary_
        word2topic = {}

        with open('word_topic.txt', 'r', encoding='utf-8') as f:
            for line in f:
                strs = line.strip('\n').split('\t')
                word2topic[strs[0]] = strs[2]

        topic = np.zeros((len(id2word),))

        for i, key in enumerate(id2word):
            if key in word2topic:
                topic[id2word[key]] = word2topic[key]
            else:
                print(key)

        topic = preprocessing.MinMaxScaler().fit_transform(topic.reshape(-1, 1)).ravel()
        # topic = sp.spdiags(topic, diags=0, m=n_features,
        #                    n=n_features, format='csr')
        return topic
Project: muffnn    Author: civisanalytics
def test_persistence():
    """Make sure we can pickle it."""
    X = iris.data  # Use the iris features.
    X = MinMaxScaler().fit_transform(X)

    ae = Autoencoder(hidden_units=(1,),
                     n_epochs=1000,
                     random_state=4556,
                     learning_rate=1e-2,
                     keep_prob=1.0)
    Xenc = ae.fit_transform(X)

    b = BytesIO()
    pickle.dump(ae, b)
    ae_pickled = pickle.loads(b.getvalue())
    Xenc_pickled = ae_pickled.transform(X)
    assert_array_almost_equal(Xenc, Xenc_pickled)
Project: muffnn    Author: civisanalytics
def test_monitor_ae():
    """Test the monitor keyword."""
    # Use the iris features.
    X = iris.data
    X = MinMaxScaler().fit_transform(X)

    ae = Autoencoder(hidden_units=(3, 2,),
                     n_epochs=7500,
                     random_state=4556,
                     learning_rate=DEFAULT_LEARNING_RATE,
                     keep_prob=1.0,
                     hidden_activation=tf.nn.sigmoid,
                     encoding_activation=tf.nn.sigmoid,
                     output_activation=tf.nn.sigmoid)

    def _monitor(epoch, est, stats):
        assert epoch <= 1000, "The autoencoder has been running too long!"
        if stats['loss'] < 0.2:
            assert epoch > 10, "The autoencoder returned too soon!"
            return True
        else:
            return False
    ae.fit(X, monitor=_monitor)
Project: CNN-parallel    Author: harpribot
def extract_train_and_validation_data(self,num_labels):
        data = pd.read_csv(self.train_data_filename, header=0).values
        # convert to numpy arrays
        feature_vec = data[:, 1:]
        labels = data[:, 0]

        # min-max scale each sample (fit on the transpose so scaling is per sample, not per feature)
        min_max_scaler = preprocessing.MinMaxScaler()
        feature_vec = min_max_scaler.fit_transform(feature_vec.T).T

        # convert to one hot form for labels
        labels_onehot = (np.arange(num_labels) == labels[:, None]).astype(np.float32)

        # divide data into train and validation data
        self.train_X, self.val_X, self.train_y, self.val_y = train_test_split(\
                                            feature_vec, labels_onehot,
                                            test_size=0.2, random_state=42)
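
The labels_onehot line relies on numpy broadcasting: comparing np.arange(num_labels) against the column vector labels[:, None] yields a boolean matrix with exactly one True per row, which astype(np.float32) turns into a one-hot encoding.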
Project: Book_DeepLearning_Practice    Author: wac81
def get_today_data_for_MLP(code):
    '''
    :param code: stock code
    :return: today's feature vector X
    '''
    import numpy as np
    data_path = "./data/stock_data/"
    oneDayLine, date = load_data_from_tushare(data_path + str(code) + '.csv')
    volumn, volumn_dates = load_volume_from_tushare(data_path + str(code) + '.csv')
    daynum = 5
    X = []
    ef = Extract_Features()
    for i in range(daynum, len(date)):
        X_delta = [oneDayLine[k] - oneDayLine[k - 1] for k in range(i - daynum, i)] + \
                  [volumn[k] for k in range(i - daynum, i)] + \
                  [float(ef.parse_weekday(date[i]))] + \
                  [float(ef.lunar_month(date[i]))] + \
                  [ef.rrr(date[i])] + \
                  [ef.MoneySupply(date[i])]
        X.append(X_delta)

    X = preprocessing.MinMaxScaler().fit_transform(X)
    return np.array(X[-1])
Project: catwalk    Author: dssg
def test_cutoff_inside_a_pipeline(data):
    minmax_scaler = preprocessing.MinMaxScaler()
    dsapp_cutoff = CutOff()

    pipeline = Pipeline([
        ('minmax_scaler', minmax_scaler),
        ('dsapp_cutoff', dsapp_cutoff)
    ])

    pipeline.fit(data['X_train'], data['y_train'])

    X_fake_new_data = data['X_test'][-1,:].reshape(1,-1) + 0.5

    mms = preprocessing.MinMaxScaler().fit(data['X_train'])

    assert np.all((mms.transform(X_fake_new_data) > 1) == (pipeline.transform(X_fake_new_data) == 1))
Project: catwalk    Author: dssg
def test_dsapp_lr(data):
    dsapp_lr = ScaledLogisticRegression()
    dsapp_lr.fit(data['X_train'], data['y_train'])

    minmax_scaler = preprocessing.MinMaxScaler()
    dsapp_cutoff = CutOff()
    lr = linear_model.LogisticRegression()

    pipeline = Pipeline([
        ('minmax_scaler', minmax_scaler),
        ('dsapp_cutoff', dsapp_cutoff),
        ('lr', lr)
    ])

    pipeline.fit(data['X_train'], data['y_train'])

    assert np.all(dsapp_lr.predict(data['X_test']) == pipeline.predict(data['X_test']))
Project: hyperband_benchmarks    Author: lishal
def compute_preprocessor(self,method):
        self.data={}
        if method=='none':
            self.data=self.orig_data
        elif method=='min_max':
            transform=preprocessing.MinMaxScaler()
            self.data['X_train']=transform.fit_transform(self.orig_data['X_train'])
            self.data['X_val']=transform.transform(self.orig_data['X_val'])
            self.data['X_test']=transform.transform(self.orig_data['X_test'])
        elif method=='scaled':
            self.data['X_train']=preprocessing.scale(self.orig_data['X_train'])
            self.data['X_val']=preprocessing.scale(self.orig_data['X_val'])
            self.data['X_test']=preprocessing.scale(self.orig_data['X_test'])
        elif method=='normalized':
            self.data['X_train']=preprocessing.normalize(self.orig_data['X_train'])
            self.data['X_val']=preprocessing.normalize(self.orig_data['X_val'])
            self.data['X_test']=preprocessing.normalize(self.orig_data['X_test'])
        self.data['y_train']=self.orig_data['y_train']
        self.data['y_val']=self.orig_data['y_val']
        self.data['y_test']=self.orig_data['y_test']
Project: hyperband_benchmarks    Author: lishal
def compute_preprocessor(self,method):
        self.data={}
        if method=='min_max':
            transform=preprocessing.MinMaxScaler()
            self.data['X_train']=transform.fit_transform(self.orig_data['X_train'])
            self.data['X_val']=transform.transform(self.orig_data['X_val'])
            self.data['X_test']=transform.transform(self.orig_data['X_test'])
        elif method=='scaled':
            self.data['X_train']=preprocessing.scale(self.orig_data['X_train'])
            self.data['X_val']=preprocessing.scale(self.orig_data['X_val'])
            self.data['X_test']=preprocessing.scale(self.orig_data['X_test'])
        elif method=='normalized':
            self.data['X_train']=preprocessing.normalize(self.orig_data['X_train'])
            self.data['X_val']=preprocessing.normalize(self.orig_data['X_val'])
            self.data['X_test']=preprocessing.normalize(self.orig_data['X_test'])
        self.data['y_train']=self.orig_data['y_train']
        self.data['y_val']=self.orig_data['y_val']
        self.data['y_test']=self.orig_data['y_test']
Project: jamespy_py3    Author: jskDr
def get_train_test( X, pca_order = 10):
    X = X.astype('float32')

    scaler = MinMaxScaler(feature_range=(0, 1))
    X = scaler.fit_transform(X.reshape(-1, 1)).reshape(X.shape)

    if pca_order > 0:
        pca = PCA(n_components=pca_order)
        X = pca.fit_transform(X)
        X = pca.inverse_transform(X)

    n_samples = X.shape[0]
    train_size = int(n_samples * 0.67)
    test_size = n_samples - train_size
    train, test = X[0:train_size,:], X[train_size:n_samples,:]
    return train, test, scaler
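
Projecting onto the leading principal components and mapping back with inverse_transform acts as a PCA-based smoothing step: X is reconstructed from only the strongest components, discarding the variance (often noise) carried by the rest.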
Project: ML-note    Author: JasonK93
def test_MinMaxScaler():
    '''
    Test MinMaxScaler and inspect its fitted attributes.
    :return: None
    '''
    X = [[1, 5, 1, 2, 10],
         [2, 6, 3, 2, 7],
         [3, 7, 5, 6, 4],
         [4, 8, 7, 8, 1]]
    print("before transform:",X)
    scaler=MinMaxScaler(feature_range=(0,2))
    scaler.fit(X)
    print("min_ is :",scaler.min_)
    print("scale_ is :",scaler.scale_)
    print("data_max_ is :",scaler.data_max_)
    print("data_min_ is :",scaler.data_min_)
    print("data_range_ is :",scaler.data_range_)
    print("after transform:",scaler.transform(X))
Project: cartographer    Author: pablodecm
def test_graph_simple():
    data, labels = make_circles(n_samples=2000, noise=0.03, factor=0.3)
    params = {'coverer__intervals': 10,
              'coverer__overlap': 0.1,
              'clusterer__min_samples': 3,
              'clusterer__eps': 0.5}
    m = Mapper(params=params)
    scaled_data = MinMaxScaler().fit_transform(data)
    m.fit(data, scaled_data)
    categories = {"labels": labels}
    scales = {"y[0]": scaled_data[:, 0],
              "y[1]": scaled_data[:, 1]}

    json_graph_str = json_graph(m, categories, scales)
    # check if it can be loaded to validate html
    json_graph_dict = json.loads(json_graph_str)
    html_graph_str = html_graph(m, categories, scales)  # validate HTML?
Project: Parallel-SGD    Author: angadgill
def plot_on_dataset(X, y, ax, name):
    # for each dataset, plot learning for each learning strategy
    print("\nlearning on dataset %s" % name)
    ax.set_title(name)
    X = MinMaxScaler().fit_transform(X)
    mlps = []
    if name == "digits":
        # digits is larger but converges fairly quickly
        max_iter = 15
    else:
        max_iter = 400

    for label, param in zip(labels, params):
        print("training: %s" % label)
        mlp = MLPClassifier(verbose=0, random_state=0,
                            max_iter=max_iter, **param)
        mlp.fit(X, y)
        mlps.append(mlp)
        print("Training set score: %f" % mlp.score(X, y))
        print("Training set loss: %f" % mlp.loss_)
    for mlp, label, args in zip(mlps, labels, plot_args):
        ax.plot(mlp.loss_curve_, label=label, **args)
Project: saapy    Author: ashapochka
def normalized_usage_by_package(self, package_usage_frame: pd.DataFrame,
                                    drop_package_prefix: str = None):
        scaler = MinMaxScaler()
        df = package_usage_frame.drop('package', axis=1)
        df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
        if drop_package_prefix:
            df_scaled['package'] = package_usage_frame['package'].apply(
                lambda text: text[text.startswith(drop_package_prefix)
                                  and len(drop_package_prefix):])
        else:
            df_scaled['package'] = package_usage_frame['package']
        df_sorted = df_scaled.sort_values('user_count').reset_index()
        del df_sorted['index']
        return df_sorted
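
The slice in the lambda works because text.startswith(prefix) and len(prefix) evaluates to 0 when the prefix is absent and to len(prefix) when it is present, so text[...:] strips the prefix only when the string actually begins with it.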
Project: golden_touch    Author: at553
def predict_new(self, input):
        assert isinstance(input, list) and len(input) == 5
        model = self.train_model()
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaler.fit(self.data)
        inp = scaler.transform([input])
        print(scaler.inverse_transform(model.predict(numpy.array(inp).reshape(1, 1, 5))))


# x = Predict()
# x.predict_new([1243.068, 1298.713, 1336.560, 1299.175, 1288.913])
Project: iFruitFly    Author: AdnanMuhib
def v_demo(dir, prefix, pre_prefix, file_name, _dir):
    _val = []
    _coords = []
    file_dir_fix = dir + "\\output_INFLO.csv"
    #f = "C:\Users\Abdullah Akmal\Documents\ifruitfly_temp\output_files\output_INFLO.csv"
    with open(file_dir_fix, 'r', newline='') as inp:
        rd = csv.reader(inp)
        for row in rd:
            _val.append([row[1], row[2], row[0]])

    #print(_center)
    _val = np.asarray(_val)
    _val_original = _val
    _val_original = list(map(myFloat, _val_original))
    _val_original = list(map(myInt, _val_original))
    #_val_original = map(myTemp, _val_original)
    _val_original = np.asarray(_val_original)
    _val = preprocessing.StandardScaler().fit_transform(_val)
    #_center = preprocessing.MinMaxScaler()
    #_center.fit_transform(_val)
    #_arr = StandardScaler().inverse_transform(_center)
    #print(_arr)
    #print(_center)
    new_file = prefix + file_name + ".png"
    dbFun(_val, _val_original, new_file)
    #_len = len(_center)
    return
Project: website-fingerprinting    Author: AxelGoetz
def next_batch(self, batches, in_memory):
        """
        Returns the next batch in some fixed-length representation.
        Currently we use Panchenko et al.'s cumulative traces

        @param batches an iterator with all of the batches (
            if in_memory == True:
                in batch-major form without padding
            else:
                A list of paths to the files
        )
        @param in_memory is a boolean value

        @return if in_memory is False, returns a tuple of (dict, [paths]) where paths is a list of paths for each batch
            else it returns a dict for training
        """
        batch = next(batches)
        data_batch = batch

        if not in_memory:
            data_batch = [helpers.read_cell_file(path) for path in batch]

        data_batch = [self._process_trace(trace, self.layers[0]) for trace in data_batch]

        min_max_scaler = MinMaxScaler()
        data_batch = min_max_scaler.fit_transform(data_batch)

        encoder_inputs_ = data_batch
        decoder_targets_ = data_batch

        train_dict = {
            self.encoder_inputs: encoder_inputs_,
            self.decoder_targets: decoder_targets_,
        }

        if not in_memory:
            return (train_dict, batch)
        return train_dict
Project: rbm-ae-tf    Author: Cospel
def min_max_scale(X_train, X_test):
    preprocessor = prep.MinMaxScaler().fit(np.concatenate((X_train, X_test), axis=0))
    X_train = preprocessor.transform(X_train)
    X_test = preprocessor.transform(X_test)
    return X_train, X_test
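
Note that fitting the scaler on the concatenation of X_train and X_test bakes test-set minima and maxima into the preprocessing; fitting on X_train alone and then transforming both sets avoids this mild form of data leakage.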
Project: triage    Author: dssg
def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0,
                 fit_intercept=True, intercept_scaling=1, class_weight=None,
                 random_state=None, solver='liblinear', max_iter=100,
                 multi_class='ovr', verbose=0, warm_start=False, n_jobs=1):
        self.penalty = penalty
        self.dual = dual
        self.tol = tol
        self.C = C
        self.fit_intercept = fit_intercept
        self.intercept_scaling = intercept_scaling
        self.class_weight = class_weight
        self.random_state = random_state
        self.solver = solver
        self.max_iter = max_iter
        self.multi_class = multi_class
        self.verbose = verbose
        self.warm_start = warm_start
        self.n_jobs = n_jobs

        self.minmax_scaler = MinMaxScaler()
        self.dsapp_cutoff = CutOff()
        self.lr = LogisticRegression(penalty=penalty,
                                     dual=dual,
                                     tol=tol,
                                     C=C,
                                     fit_intercept=fit_intercept,
                                     intercept_scaling=intercept_scaling,
                                     class_weight=class_weight,
                                     random_state=random_state,
                                     solver=solver,
                                     max_iter=max_iter,
                                     multi_class=multi_class,
                                     verbose=verbose,
                                     warm_start=warm_start,
                                     n_jobs=n_jobs)

        self.pipeline = Pipeline([
            ('minmax_scaler', self.minmax_scaler),
            ('dsapp_cutoff', self.dsapp_cutoff),
            ('lr', self.lr)
        ])
Project: Supply-demand-forecasting    Author: LevinJ
def setClf(self):
        clf = KNeighborsClassifier(n_neighbors = 33)
        min_max_scaler = preprocessing.MinMaxScaler()
        self.clf = Pipeline([('scaler', min_max_scaler), ('estimator', clf)])
        return
Project: Supply-demand-forecasting    Author: LevinJ
def setClf(self):
#         self.clf = Ridge(alpha=0.0000001, tol=0.0000001)
        clf = LinearRegression()
        min_max_scaler = preprocessing.MinMaxScaler()
        self.clf = Pipeline([('scaler', min_max_scaler), ('estimator', clf)])
        return
Project: Supply-demand-forecasting    Author: LevinJ
def setClf(self):
        clf = SVR(C=100, epsilon=0.1, gamma=0.0001, cache_size=10240)
        min_max_scaler = preprocessing.MinMaxScaler()
        self.clf = Pipeline([('scaler', min_max_scaler), ('estimator', clf)])
        return
Project: AutoFolio    Author: mlindauer
def __init__(self, classifier_class):
        '''
            Constructor
        '''
        self.classifiers = []
        self.logger = logging.getLogger("PairwiseClassifier")
        self.classifier_class = classifier_class
        self.normalizer = MinMaxScaler()
Project: skutil    Author: tgsmith61591
def test_large_grid():
        """In this test, we purposely overfit a RandomForest to completely random data
        in order to assert that the test error will far supercede the train error.
        """

        if not SK18:
            custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
        else:
            custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

        # define the pipe
        pipe = Pipeline([
            ('scaler', SelectiveScaler()),
            ('pca', SelectivePCA(weight=True)),
            ('rf', RandomForestClassifier(random_state=42))
        ])

        # define hyper parameters
        hp = {
            'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
            'pca__whiten': [True, False],
            'pca__weight': [True, False],
            'pca__n_components': uniform(0.75, 0.15),
            'rf__n_estimators': randint(5, 10),
            'rf__max_depth': randint(5, 15)
        }

        # define the grid
        grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy', n_jobs=1, cv=custom_cv, random_state=42)

        # this will fail because we haven't fit yet
        assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

        # fit the grid
        grid.fit(X_train, y_train)

        # score for coverage -- this might warn...
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            grid.score(X_train, y_train)

        # coverage:
        assert grid._estimator_type == 'classifier'

        # get predictions
        tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

        # evaluate score (SHOULD be better than random...)
        accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

        # grid score reports:
        # assert fails for bad percentile
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 0.0})
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'percentile': 1.0})

        # assert fails for bad y_axis
        assert_fails(report_grid_score_detail, ValueError, **{'random_search': grid, 'y_axis': 'bad_axis'})

        # assert passes otherwise
        report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
Project: finance-ml    Author: Omarkhursheed
def scale(train, test):
    scale_f = MinMaxScaler(feature_range=(-1,1))
    scale_f = scale_f.fit(train)
    train = train.reshape(train.shape[0], train.shape[1])
    train_s = scale_f.transform(train)
    test = test.reshape(test.shape[0], test.shape[1])
    test_s = scale_f.transform(test)
    return scale_f, train_s, test_s
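
A usage sketch (assuming train and test are 2-D numpy arrays with the same number of columns):

scaler, train_scaled, test_scaled = scale(train, test)
# ... fit a model on train_scaled ...
# scaler.inverse_transform(predictions) maps outputs back to the original units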
Project: tianchi_power    Author: lvniqi
def get_scaled_user():
    dataset = get_dataset()
    new_df = pd.DataFrame(index=set(dataset.index))
    new_df = new_df.sort_index()
    for user_id in get_user_id_list():
        # print(user_id)
        if not check_empty(user_id):
            new_df[user_id] = dataset[dataset.user_id == user_id].power_consumption
    new_df_log = new_df.apply(np.log)
    new_df_log_scaled = preprocessing.MinMaxScaler().fit_transform(new_df_log.iloc[60:, :].dropna())
    return pd.DataFrame(new_df_log_scaled,columns = new_df_log.columns)
Project: Kutils    Author: ishank26
def scale_features(data):
    extract_features = theano.function([model.layers[0].input], model.layers[
                                       32].output, allow_input_downcast=True)
    features = extract_features(data)
    scale = MinMaxScaler()
    scale_feat = scale.fit_transform(features)
    return scale_feat
Project: rdocChallenge    Author: Elyne
def get_bootstrapped_trainset(trainSet, y_train, bootstrap_data, es, estimator, th_bs):
    new_train_set = list(trainSet)
    new_y_train = list(y_train)

    trainAndBSData = trainSet + bootstrap_data

    generateDataDrivenFeats(trainSet, trainAndBSData, es)

    featurized = featurize(trainAndBSData)

    train_feats = [featurized[idx] for idx in range(0, len(trainSet), 1)]
    test_feats = [featurized[idx] for idx in range(len(trainSet), len(trainAndBSData), 1)]

    #Do feature selection on train data
    train_feats = fs.runFeatureSelection(train_feats, y_train, es)
    train_feats, y_train, train_bucket = ss.runSampleSelection(train_feats, y_train,[i for i in range(0, len(trainSet), 1)], es)

    # calculate Inter-annotator weighting. 
    weights_train = getWeights(trainAndBSData, train_bucket, es.weighInterAnnot)

    vectorizer = DictVectorizer()   
    x_train = vectorizer.fit_transform(train_feats)
    x_test = vectorizer.transform(test_feats)

    if es.scaleData:
        min_max_scalar = MinMaxScaler()
        x_train = min_max_scalar.fit_transform(x_train.toarray())
        x_test = min_max_scalar.transform(x_test.toarray())

    model = train(estimator, x_train, y_train, weights_train, model=None)

    y_pred_prob = model.predict_proba(x_test)
    for i, cur_y in enumerate(y_pred_prob):
        if np.max(cur_y) > th_bs:
            new_train_set.append(bootstrap_data[i])
            new_y_train.append(np.argmax(cur_y))

    return (new_train_set, new_y_train) #update none to confidence vector