Python sklearn.preprocessing module: scale() example source code

The following code examples, extracted from open-source Python projects, illustrate how to use sklearn.preprocessing.scale().
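Before the project excerpts, a minimal sketch of what preprocessing.scale() does: it standardizes each column of an array to zero mean and unit variance (the sample values below are arbitrary):

import numpy as np
from sklearn import preprocessing

X = np.array([[1.0, -1.0,  2.0],
              [2.0,  0.0,  0.0],
              [0.0,  1.0, -1.0]])

# Column-wise standardization: each feature ends up with zero mean and unit variance
X_scaled = preprocessing.scale(X)
print(X_scaled.mean(axis=0))  # approximately [0. 0. 0.]
print(X_scaled.std(axis=0))   # [1. 1. 1.]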

Project: MENGEL    Author: CodeSpaceHQ
def scale_numeric_data(pandas_data):
    # Scaling is important because if the variables are too different from
    # one another, it can throw off the model.
    # EX: If one variable has an average of 1000, and another has an average
    # of .5, then the model won't be as accurate.
    for col in pandas_data.columns:
        if pandas_data[col].dtype == np.float64 or pandas_data[col].dtype == np.int64:
            pandas_data[col] = preprocessing.scale(pandas_data[col])

    return pandas_data


# Creates a standard scaler based on the training data and applies it to both train
# and test data.
# Input:
# - Two Pandas DataFrames, same number of columns
# Output:
# - Two Pandas DataFrames, both of which have been scaled based on StandardScaler
# trained on training data.
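The body of this function is not included in the excerpt; a minimal sketch consistent with the comment above, assuming pandas DataFrames and sklearn's StandardScaler (the name scale_train_test is hypothetical):

import pandas as pd
from sklearn import preprocessing

def scale_train_test(train_df, test_df):
    # Fit the scaler on the training data only, then apply it to both sets,
    # so no statistics from the test data leak into the scaling parameters
    scaler = preprocessing.StandardScaler().fit(train_df)
    train_scaled = pd.DataFrame(scaler.transform(train_df), columns=train_df.columns)
    test_scaled = pd.DataFrame(scaler.transform(test_df), columns=test_df.columns)
    return train_scaled, test_scaled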
Project: UVA    Author: chiachun
def transform(self, X, STANDARDIZE=True):
      if not isinstance(X, np.ndarray):
          X = to_array(X)
      assert(X.ndim == 2), "Input array must have two dimensions."
      if not check_standardized(X):
          if STANDARDIZE:
              X = preprocessing.scale(X)
              print("Standardized input data for transform.")
      if not self.model:
          print("Load or fit a model before performing transformation.")
      else:
          assert(X.shape[1] > self.model.n_components),\
              "Input data must have a dimension larger than model components %d."\
              % self.model.n_components
          xp = self.model.transform(X)
          return xp
Project: marconibot    Author: s4w3d0ff
def train(self, df, shuffle=True, preprocess=False, *args, **kwargs):
        """
        Takes a dataframe of features + a 'label' column and trains the lobe
        """
        if self._trained:
            logger.warning('Overwriting an already trained brain!')
            self._trained = False

        # shuffle data for good luck
        if shuffle:
            df = shuffleDataFrame(df)
        # scale train data and fit lobe
        x = df.drop('label', axis=1).values
        y = df['label'].values
        del df
        if preprocess:
            x = preprocessing.scale(x)
        logger.info('Training with %d samples', len(x))
        self.lobe.fit(x, y)
        self._trained = True
Project: stacked_generalization    Author: fukatani
def get_sample(self, N=600, scale=False):
        all_data = self.pre_process(self.file_name)
        #print('data_type: ' + str(all_data.dtypes))
        all_data = all_data.values
        xs = all_data[:, 2:]
        y = all_data[:, 1]
        if scale:
            xs = preprocessing.scale(xs)
        if N != -1:
            perm = np.random.permutation(xs.shape[0])
            xs = xs[perm]
            y = y[perm]
            xs_train, xs_test = np.split(xs, [N])
            y_train, y_test = np.split(y, [N])
            return xs_train, xs_test, y_train, y_test
        else:
            return xs, y
Project: Artificial-Intelligence    Author: Jaisonbabu
def get_X_y(self):
        """Builds an X, y feature/target pair from the data.

        :returns: a tuple of (feature matrix, labels)
        """

        # X
        X = np.array(self.data[self.features])
        X = scale(X)

        # y
        stock_change = np.array(self.data["stock_p_change"])
        sp500_change = np.array(self.data["sp500_p_change"])

        is_above_threshold = stock_change-sp500_change > self.threshold
        y = is_above_threshold.astype('i')

        return (X, y)
Project: nba-games    Author: ixarchakos
def scale_sets(x_train, x_test, classifier_name):
    """
    :param x_train: ndarray, required
            - The train data of the feature matrix
    :param x_test: ndarray, required
            - The test data of the feature matrix
    :param classifier_name: string, required
            - The name of the selected classifier
    :return: ndarray
    """
    # scaling leads to poorer performance in the case of random forests, xgb, etc.
    if classifier_name not in ["random_forests", "XGB", "GBC"]:
        # x_train, x_test are expected to be numpy arrays. Simple conditions such as if x_train will raise a ValueError.
        x_train = scale(x_train) if x_train is not None else x_train
        x_test = scale(x_test) if x_test is not None else x_test
    return x_train, x_test
Project: pysptools    Author: ctherien
def classify(self, M):
        """
        Classify a hyperspectral cube using the clusters defined from the ROIs by the fit method.

        Parameters:
            M: `numpy array`
              A HSI cube (m x n x p).

        Returns: `numpy array`
              A class map (m x n x 1).
        """
        img = self._convert2D(M)
        image_scaled = preprocessing.scale(img)
        cls = self.clf.predict(image_scaled)
        self.cmap = self._convert3d(cls, M.shape[0], M.shape[1])
        return self.cmap
Project: Exoplanet-Artificial-Intelligence    Author: pearsonkyle
def load_data(fname='transit_data.pkl',categorical=False,whiten=True,DIR='pickle_data/'):

    data = pickle.load(open(DIR+fname,'rb'))

    # convert to numpy array of float type from object type
    pvals = arr(data['results'][:,0])
    transits = arr(data['results'][:,1])
    null = arr(data['results'][:,2])

    X = np.vstack([transits,null])
    y = np.hstack([np.ones(transits.shape[0]), np.zeros(null.shape[0])] )

    if categorical: y = np_utils.to_categorical(y, np.unique(y).shape[0] )
    if whiten: X = preprocessing.scale(X,axis=1)

    return X,y,pvals,data['keys'],data['time']
Project: Exoplanet-Artificial-Intelligence    Author: pearsonkyle
def load_data(fname='transit_data_train.pkl',categorical=False,whiten=True,DIR='pickle_data/'):

    data = pickle.load(open(DIR+fname,'rb'))

    # convert to numpy array of float type from object type
    pvals = arr(data['results'][:,0])
    transits = arr(data['results'][:,1])
    null = arr(data['results'][:,2])

    X = np.vstack([transits,null])
    y = np.hstack([np.ones(transits.shape[0]), np.zeros(null.shape[0])] )

    if categorical: y = np_utils.to_categorical(y, np.unique(y).shape[0] )
    if whiten: X = preprocessing.scale(X,axis=1)

    return X,y,pvals,data['keys'],data['time']
Project: skp_edu_docker    Author: TensorMSA
def make_drop_duplicate(self, _df_csv_read_ori, _drop_duplicate , _label):
        """ Label? ??? ??? ??? ??? ??? Row ??? ????.
        Args:
          params:
            * _preprocessing_type: ['scale', 'minmax_scale', 'robust_scale', 'normalize', 'maxabs_scale']
            * _df_csv_read_ori : pandas dataframe
            * _label
        Returns:
          Preprocessing Dataframe
        """
        if _drop_duplicate == None or _drop_duplicate == 'null' or _drop_duplicate == False:
            logging.info("No Duplicate")
            result_df =  _df_csv_read_ori
        else :
            cell_features = _df_csv_read_ori.columns.tolist()
            cell_features.remove(_label)
            result_df = _df_csv_read_ori.drop_duplicates(cell_features, keep="first")
            logging.info("duplicated row delete {0}".format(len(_df_csv_read_ori.index)-len(result_df.index)))
            temp_duplicate_filename = strftime("%Y-%m-%d-%H:%M:%S", gmtime()) + "_dup.csvbk"
            result_df.to_csv(self.data_src_path + "/backup/" + temp_duplicate_filename)
        return result_df
Project: hyperband_benchmarks    Author: lishal
def compute_preprocessor(self,method):
        self.data={}
        if method=='none':
            self.data=self.orig_data
        elif method=='min_max':
            transform=preprocessing.MinMaxScaler()
            self.data['X_train']=transform.fit_transform(self.orig_data['X_train'])
            self.data['X_val']=transform.transform(self.orig_data['X_val'])
            self.data['X_test']=transform.transform(self.orig_data['X_test'])
        elif method=='scaled':
            self.data['X_train']=preprocessing.scale(self.orig_data['X_train'])
            self.data['X_val']=preprocessing.scale(self.orig_data['X_val'])
            self.data['X_test']=preprocessing.scale(self.orig_data['X_test'])
        elif method=='normalized':
            self.data['X_train']=preprocessing.normalize(self.orig_data['X_train'])
            self.data['X_val']=preprocessing.normalize(self.orig_data['X_val'])
            self.data['X_test']=preprocessing.normalize(self.orig_data['X_test'])
        self.data['y_train']=self.orig_data['y_train']
        self.data['y_val']=self.orig_data['y_val']
        self.data['y_test']=self.orig_data['y_test']
Project: hyperband_benchmarks    Author: lishal
def compute_preprocessor(self,method):
        self.data={}
        if method=='min_max':
            transform=preprocessing.MinMaxScaler()
            self.data['X_train']=transform.fit_transform(self.orig_data['X_train'])
            self.data['X_val']=transform.transform(self.orig_data['X_val'])
            self.data['X_test']=transform.transform(self.orig_data['X_test'])
        elif method=='scaled':
            self.data['X_train']=preprocessing.scale(self.orig_data['X_train'])
            self.data['X_val']=preprocessing.scale(self.orig_data['X_val'])
            self.data['X_test']=preprocessing.scale(self.orig_data['X_test'])
        elif method=='normalized':
            self.data['X_train']=preprocessing.normalize(self.orig_data['X_train'])
            self.data['X_val']=preprocessing.normalize(self.orig_data['X_val'])
            self.data['X_test']=preprocessing.normalize(self.orig_data['X_test'])
        self.data['y_train']=self.orig_data['y_train']
        self.data['y_val']=self.orig_data['y_val']
        self.data['y_test']=self.orig_data['y_test']
Project: Kaggle-Competition-Sberbank    Author: LenzDu
def FeatureCombination(Df,s='',num_feature=2): 
    feature_set = []
    for c in Df.columns:
        if c.startswith(s): feature_set.append(c)
    print('combining', len(feature_set), 'features')
    data = Df[feature_set].values

    for c in Df.columns:
        if Df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(Df[c].values))
            Df[c] = lbl.transform(list(Df[c].values))

    imp = preprocessing.Imputer()
    data = imp.fit_transform(data)
    data = preprocessing.scale(data)
    pca = PCA(num_feature)
    pca.fit(data)
    print('explained_variance_ratio_:', pca.explained_variance_ratio_)
    trans = pca.transform(data)
    for i in range(0,num_feature):
        Df[s+'_%d'%(i+1)] = trans[:,i]
    Df.drop(feature_set,1,inplace=True)
    return Df
Project: Multiple-factor-risk-model    Author: icezerowjj
def get_ind_return(data):
    '''
    Compute monthly value-weighted industry returns from the stock-level data
    read out of the xlsx files.
    :param [DataFrame] data: stock-month panel loaded from xlsx
    :return: [DataFrame] ind_ret: month x industry matrix of standardized industry returns
    '''
    # Load stk_ind_pair.xlsx, which maps each stock code to its industry
    stk_ind = pd.read_excel('E:\\QuantProject2\\temp_data\\stk_ind_pair.xlsx')
    # Keep only the first six characters of each stock code in stk_ind
    stk_ind.Stkcd = stk_ind.Stkcd.apply(lambda x: x[:6])
    # Merge the industry mapping into data on the stock code
    data = pd.merge(data, stk_ind, on='Stkcd')
    # Group by trading month and industry
    groups = data.groupby(['Trdmnt', 'ind'])
    # Total market value per (month, industry)
    total_Ms = groups['Msmvttl'].sum()
    # Total market return per (month, industry)
    total_Mr = groups['total_Mr'].sum()
    # Value-weighted industry return
    ind_ret = total_Mr / total_Ms
    # Unstack so that industries become the columns
    ind_ret = ind_ret.unstack()
    # Standardize each industry's return series
    ind_ret = pd.DataFrame(scale(ind_ret), columns=ind_ret.columns)
    return ind_ret
Project: quantopian-machinelearning    Author: arshpreetsingh
def create_model(context, data):
    # Get the relevant daily prices
    recent_prices = data.history(context.assets, 'price',context.history_range, '1d')

    context.ma_50 = recent_prices.values[-50:].mean()
    context.ma_200 = recent_prices.values[-200:].mean()
    # print(context.ma_50)
    # print(context.ma_200)
    time_lags = pd.DataFrame(index=recent_prices.index)
    time_lags['price']=recent_prices.values
    time_lags['returns']=(time_lags['price'].pct_change()).fillna(0.0001)
    time_lags['lag1'] = (time_lags['returns'].shift(1)).fillna(0.0001)
    time_lags['lag2'] = (time_lags['returns'].shift(2)).fillna(0.0001)
    time_lags['direction'] = np.sign(time_lags['returns'])


    X = time_lags[['returns','lag2']] # Independent, or input variables
    Y = time_lags['direction'] # Dependent, or output variable
    X_scaled = preprocessing.scale(X)
    context.model.fit(X_scaled, Y) # Generate our model
Project: 2WayNet    Author: aviveise
def __init__(self, data_set_parameters):

        OutputLog().write('Loading dataset: ' + data_set_parameters['name'])

        self.dataset_path = data_set_parameters['path']

        self.trainset = None
        self.testset = None
        self.tuning = None

        self.reduce_val = 0
        self.x_y_mapping = {'train': None, 'dev': None, 'test': None}
        self.x_reduce = {'train': None, 'dev': None, 'test': None}

        self.data_set_parameters = data_set_parameters
        self.scale = bool(int(data_set_parameters['scale']))
        self.scale_rows = bool(int(data_set_parameters['scale_samples']))
        self.whiten = bool(int(data_set_parameters['whiten']))
        self.pca = list(map(int, data_set_parameters['pca'].split()))
        self.normalize_data = bool(int(data_set_parameters['normalize']))
        self.preprocessors = None
Project: kmeans-service    Author: MAYHEM-Lab
def rerun_task(job_id, task_id):
    """
    Reruns a specific task from a job. Sets the task status to 'pending' and triggers an asynchronous function to
    process the task.

    Parameters
    ----------
    job_id: str
    task_id: int

    Returns
    -------
    None
    """
    job = mongo_no_context_get_job(job_id)
    task = mongo_no_context_get_task(job_id, task_id)
    k = task['k']
    covar_type = task['covar_type']
    covar_tied = task['covar_tied']
    n_init = task['n_init']
    s3_file_key = job['s3_file_key']
    columns = job['columns']
    scale = job.get('scale', False)
    response = mongo_no_context_update_task_status(job_id, task_id, 'pending')
    work_task.delay(job_id, task_id, k, covar_type, covar_tied, n_init, s3_file_key, columns, scale)
Project: finance-ml    Author: Omarkhursheed
def train():
    os.chdir(dname)
    for selected_stock in onlyfiles:
        df = pd.read_csv(os.path.join('data_files',selected_stock))
        #preprocessing the data
        df = df[['Adj. Open',  'Adj. High',  'Adj. Low',  'Adj. Close', 'Adj. Volume']]
        #measure of volatility
        df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Low'] * 100.0
        df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0
        df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]
        forecast_col = 'Adj. Close'
        df.fillna(value=-99999, inplace=True)
        forecast_out = int(math.ceil(0.01 * len(df)))
        df['label'] = df[forecast_col].shift(-forecast_out)

        X = np.array(df.drop(['label'],1))
        X = preprocessing.scale(X)
        X_lately = X[-forecast_out:]
        X = X[:-forecast_out]

        df.dropna(inplace=True)
        y = np.array(df['label'])
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)

        svr = SVR()
        pickle.dump(svr,open(join(dname+'/models/svr_unfit/', selected_stock+'svr.sav'),'wb'))
        svr.fit(X_train, y_train)

        lr = LinearRegression()
        pickle.dump(lr,open(join(dname+'/models/lr_unfit/', selected_stock+'lr.sav'),'wb'))
        lr.fit(X_train, y_train)

        mlp = MLPRegressor()
        pickle.dump(mlp,open(join(dname+'/models/mlp_unfit/', selected_stock+'mlp.sav'),'wb'))
        mlp.fit(X_train, y_train)

        pickle.dump(svr,open(join(dname+'/models/svr_fit/', selected_stock+'svr.sav'),'wb'))
        pickle.dump(lr,open(join(dname+'/models/lr_fit/', selected_stock+'lr.sav'),'wb'))
        pickle.dump(mlp,open(join(dname+'/models/mlp_fit/', selected_stock+'mlp.sav'),'wb'))

        print(selected_stock+" - trained")
Project: base_function    Author: Rockyzsu
def lession_7():
    X = np.array([[10, 12, 2],
                  [-1, -9, 99],
                  [22, 33, 11]])
    print(X)
    print(preprocessing.scale(X))
    #X,y=make
Project: LagouJob    Author: EclipseXuLu
def normalise(csv_filepath):
    """
    load csv data and normalize it
    :param csv_filepath:
    :return:
    """
    df = pd.read_csv(csv_filepath)[[
        'companyScore', 'describeScore', 'comprehensiveScore', 'interviewerScore', 'usefulCount', 'myScore',
        'replyCount', 'isAnonymous']][1:]
    senti_df = pd.read_csv(csv_filepath)['sentiment'][1:]
    labels_ = [1 if _ > 0.9 else 0 for _ in senti_df]
    df['isAnonymous'] = [int(_) for _ in df['isAnonymous']]
    df_scaled = pd.DataFrame(preprocessing.scale(df))

    return df_scaled, labels_
Project: kaggle-seizure-prediction    Author: sics-lm
def get_name(self):
        return 'unit-scale'
Project: kaggle-seizure-prediction    Author: sics-lm
def apply(self, data):
        return preprocessing.scale(data, axis=data.ndim-1)
Project: kaggle-seizure-prediction    Author: sics-lm
def get_name(self):
        return 'unit-scale-feat'
Project: kaggle-seizure-prediction    Author: sics-lm
def apply(self, data):
        return preprocessing.scale(data, axis=1)
Project: Quantrade    Author: quant-trade
def make_feat(self):
        d = self.init_data()
        for i in range(1, 100):
            d['dif{}'.format(i)] = d.diff(i)
        for i in range(0, 9):
            d['hc{}'.format(i)] = d.HIGH.shift(i) - d.CLOSE.shift(i)
            d['lc{}'.format(i)] = d.LOW.shift(i) - d.CLOSE.shift(i)
            d['hl{}'.format(i)] = d.HIGH.shift(i) - d.LOW.shift(i)
            d['oc{}'.format(i)] = d.OPEN.shift(i) - d.CLOSE.shift(i)
            d['oh{}'.format(i)] = d.OPEN.shift(i) - d.HIGH.shift(i)
            d['ol{}'.format(i)] = d.OPEN.shift(i) - d.LOW.shift(i)
        d = d.fillna(0)
        # preprocessing.scale returns a plain ndarray, so wrap it back into a
        # DataFrame to keep the to_csv call below working
        d = pd.DataFrame(preprocessing.scale(d), index=d.index, columns=d.columns)
        filename = join(self.out_poath, 'f_{0}.csv'.format(self.struc))
        d.to_csv(path_or_buf=filename)
Project: eigenfish    Author: sethdp
def train(self, data, labels):
        """
        Trains current classifier with matrix data and labels, where labels[i]
        describes data[:, i].

        :param data: Matrix of data, where each column is a separate sample.
        :param labels: List of labels, each corresponding to a column of data.
        """
        if self.use_pca:
            u, s, _ = scipy.sparse.linalg.svds(data)
            self.svc.fit(
                preprocessing.scale(u[:, :self.rank+1].T.dot(data).T), labels)
        else:
            self.svc.fit(preprocessing.scale(data.T), labels)
Project: eigenfish    Author: sethdp
def classify(self, data):
        """
        Classifies data based on current model.

        :param data: Matrix with each column a different sample.
        :returns: List of predictions, where return[i] describes data[:, i].
        """
        if self.use_pca:
            u, s, _ = scipy.sparse.linalg.svds(data)
            # the return was missing here, so PCA-based predictions were silently dropped
            return self.svc.predict(
                preprocessing.scale(u[:, :self.rank+1].T.dot(data).T))
        else:
            return self.svc.predict(preprocessing.scale(data.T))
Project: neural-finance    Author: Metnew
def sk_min_max(X):
    min_max_scaler = MinMaxScaler()
    # X = scale(X, axis=0, with_mean=True, with_std=True, copy=True)
    return min_max_scaler.fit_transform(X)
Project: neural-finance    Author: Metnew
def min_max(X):
    min_max_scaler = MinMaxScaler()
    X = scale(X, axis=0, with_mean=True, with_std=True, copy=True)
    X = min_max_scaler.fit_transform(X)
    return X
Project: neural-finance    Author: Metnew
def sk_scale(X):
    return scale(X, axis=0, with_mean=True, with_std=True, copy=True)
Project: AirTicketPredicting    Author: junlulocky
def Standardization(self):
        # feature 10: minimum price so far; feature 11: maximum price so far
        # feature 12: current price
        scaled = preprocessing.scale(self.X_train[:, 10:13])
        self.X_train[:, 10:13] = scaled

        scaled = preprocessing.scale(self.X_test[:, 10:13])
        self.X_test[:, 10:13] = scaled
Project: AirTicketPredicting    Author: junlulocky
def Standardization(self):
        scaled = preprocessing.scale(self.X_train[:, 10:12])
        self.X_train[:, 10:12] = scaled

        scaled = preprocessing.scale(self.X_test[:, 10:12])
        self.X_test[:, 10:12] = scaled
Project: UVA    Author: chiachun
def prepare(self):
        with open(self.cfg.pca_pkl, 'rb') as pklfile:
            self.pca = pickle.load(pklfile)
        try:
            self.df = self.df.query('face == 1')
        except Exception:
            print('Face column not found in the dataframe; '
                  'treated as not being processed by skin_filter.')

        x = self.df[self.ftcols].as_matrix()
        x = preprocessing.scale(x)
        xp = self.pca.transform(x)
        self.dfp = pd.DataFrame(xp)
        self.dfp[['number','time']] = self.df[['number','time']]
Project: UVA    Author: chiachun
def fit(self, X, STANDARDIZE=True, n=10):
      if not isinstance(X, np.ndarray):
          X = to_array(X)
      assert(X.ndim == 2), "Input array must have two dimensions."
      if not check_standardized(X):
          if STANDARDIZE:
              X = preprocessing.scale(X)
              print("Standardized input data for fit.")
          else:
              print("WARNING: data is not standardized and the STANDARDIZE "
                    "option is switched off. Make sure this is what you intended.")
      self.model = PCA(n_components=n)
      self.model.fit(X)
Project: 5th_place_solution_facebook_check_ins    Author: aikinogard
def svc_rbf_xyat(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"]
        df_new["accuracy"] = df["accuracy"].apply(np.log10)
        return preprocessing.scale(df_new.values)

    logging.info("train svc_rbf_xyat model")
    clf = SVC(kernel='rbf', probability=True, cache_size=3000)
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
Project: 5th_place_solution_facebook_check_ins    Author: aikinogard
def svc_lin_xyat(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"]
        df_new["accuracy"] = df["accuracy"].apply(np.log10)
        return preprocessing.scale(df_new.values)

    logging.info("train svc_lin_xyat model")
    clf = SVC(kernel='linear', probability=True, cache_size=3000)
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
Project: 5th_place_solution_facebook_check_ins    Author: aikinogard
def svc_rbf_xyatu(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"]
        df_new["accuracy"] = df["accuracy"]
        return preprocessing.scale(df_new.values)

    logging.info("train svc_rbf_xyatu model")
    clf = SVC(kernel='rbf', probability=True, cache_size=3000)
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
Project: 5th_place_solution_facebook_check_ins    Author: aikinogard
def svc_lin_xyatu(df_cell_train_feats, y_train, df_cell_test_feats):
    def prepare_feats(df):
        df_new = pd.DataFrame()
        df_new["x"] = df["x"]
        df_new["y"] = df["y"]
        df_new["hour"] = df["hour"]
        df_new["weekday"] = df["weekday"]
        df_new["accuracy"] = df["accuracy"]
        return preprocessing.scale(df_new.values)

    logging.info("train svc_lin_xyatu model")
    clf = SVC(kernel='linear', probability=True, cache_size=3000)
    clf.fit(prepare_feats(df_cell_train_feats), y_train)
    y_test_pred = clf.predict_proba(prepare_feats(df_cell_test_feats))
    return y_test_pred
Project: Clustering    Author: Ram81
def scaleDataset(data):
    '''
        Standardizes the dataset to zero mean and unit variance
    '''
    data = scale(data)

    return data
Project: Episodes    Author: guptachetan1997
def get_recommendations():
    module_dir = os.path.dirname(__file__)

    train_df = build_training_set()
    if train_df is None:
        return []
    x_train = train_df.iloc[:, 5:]
    try:
        x_train = scale(x_train)
    except Exception:
        print("First migrations")
    y_train = train_df.iloc[:, 3]
    x_train_labels = train_df.iloc[:, 0]

    target_df = pd.read_csv(os.path.join(module_dir,'data.csv'))
    target_df = pd.DataFrame(target_df)
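    # Appending the training rows twice and then dropping duplicates with
    # keep=False removes every series that already appears in the training set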
    target_df = target_df.append(train_df)
    target_df = target_df.append(train_df)
    target_df = target_df.drop_duplicates('SeriesName', keep=False)

    x_target = scale(target_df.iloc[:, 5:])
    x_target_labels = target_df.iloc[:, 0]

    clf = RandomForestClassifier()
    clf.fit(x_train,y_train)

    y_target = clf.predict(x_target)

    new_df = pd.DataFrame()
    new_df['seriesName'] = x_target_labels
    new_df['tvdbID'] = target_df.iloc[:, 1]
    new_df['PredictedRating'] = y_target
    new_df['indicator'] = (target_df.iloc[:, 4]/target_df.iloc[:, 3])*new_df['PredictedRating']

    new_df = new_df.sort_values(['indicator'], ascending=False)
    initial_list = list(new_df.iloc[:4, 1])
    latter_list =  list(new_df.iloc[5:15, 1])
    shuffle(latter_list)
    return list(initial_list + latter_list[:5])
Project: xplore    Author: fahd09
def scale_feature(self, col=None, scaling=None, scaling_parms=None):
        '''
        Scales a given set of numerical columns. This only works for columns
        with numerical values.

        Parameters
        ----------
        col : a string of a column name, or a list of many columns names or
                None (default). If col is None, all numerical columns will 
                be used.
        scaling  : {'zscore', 'minmax_scale' (default), 'scale', 'maxabs_scale', 
                    'robust_scale'}
            User-defined scaling functions can also be used through self.transform_feature
        scaling_parms : dictionary
            any additional parameters to be used for sklearn's scaling functions.

        '''            
        self._validate_params(params_list   = {'col':col,'scaling':scaling},
                              expected_types= {'col':[str,list,type(None)], 'scaling':[str,type(None)]})        

        if scaling is None: scaling = 'minmax_scale'

        if scaling == 'zscore':
            scaling = 'lambda x: (x - x.mean()) / x.std()'
        elif scaling == 'minmax_scale':
            if scaling_parms is None:
                scaling_parms = {'feature_range': (0, 1), 'axis': 0}
        elif scaling == 'scale':
            if scaling_parms is None:
                scaling_parms = {'with_mean': True, 'with_std': True, 'axis': 0}
        elif scaling == 'maxabs_scale':
            if scaling_parms is None:
                scaling_parms = {'axis': 0}
        elif scaling == 'robust_scale':
            if scaling_parms is None:
                scaling_parms = {'with_centering': True, 'with_scaling': True, 'axis': 0}  # 'quantile_range': (25.0, 75.0),
        else:
            raise TypeError('Unsupported scaling type')

        self.transform_feature(col=col, func_str=scaling, addtional_params=scaling_parms)
Project: syracuse_public    Author: dssg
def gen_feature_imp_matrix(model_id_list, features_df):

    # Seed the matrix with the first model's importances, then stack the rest
    feature_imp_matrix = features_df[features_df.model_id == model_id_list[0]].sort_values("feature", inplace=False).importance.values
    for model_id in model_id_list[1:]:
        b = features_df[features_df.model_id == model_id].sort_values("feature", inplace=False).importance.values
        feature_imp_matrix = np.vstack((feature_imp_matrix, b))
    feature_imp_matrix_normd = scale(np.transpose(feature_imp_matrix),axis=0,with_mean=True, with_std=True, copy=True)

    return feature_imp_matrix_normd
Project: Automatic_Speech_Recognition    Author: zzw922cn
def spectrogramPower(audio, window_size=0.02, window_stride=0.01):
    """ short time fourier transform

    Details:
        audio - This is the input time-domain signal you wish to find the spectrogram of. It can't get much simpler than that. In your case, the 
                signal you want to find the spectrogram of is defined in the following code:

        win_length - If you recall, we decompose the image into chunks, and each chunk has a specified width.  window defines the width of each 
                 chunkin terms of samples. As this is a discrete-time signal, you know that this signal was sampled with a particular sampling 
                 frequency and sampling period. You can determine how large the window is in terms of samples by:

                 window_samples = window_time/Ts
        hop_length - the same as stride in convolution network, overlapping width

    """
    samplingRate, samples = wav.read(audio)
    win_length = int(window_size * samplingRate)
    hop_length = int(window_stride * samplingRate)
    n_fft = win_length
    D = librosa.core.stft(samples, n_fft=n_fft,hop_length=hop_length,
                      win_length=win_length)
    mag = np.abs(D)
    log_mag = np.log1p(mag)
    # normalization
    log_mag = preprocessing.scale(log_mag)
    # size: frequency_bins*time_len
    return log_mag
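A hypothetical usage sketch (the WAV file name here is illustrative, not taken from the project):

# Hypothetical call; 'speech.wav' stands in for any mono WAV file on disk
log_spec = spectrogramPower('speech.wav', window_size=0.02, window_stride=0.01)
print(log_spec.shape)  # (frequency_bins, time_len)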
Project: House-Pricing    Author: playing-kaggle
def standardize(data):

    numeric_list = ['BsmtFullBath', 'LotArea', 'YearRemodAdd', 'GrLivArea', 'BsmtHalfBath', 'MiscVal', 'YearBuilt',
                    'WoodDeckSF', 'KitchenAbvGr', 'TotalBsmtSF', 'GarageArea', 'GarageCars', 'OpenPorchSF', 'MoSold',
                    'LowQualFinSF', 'BedroomAbvGr', 'Fireplaces', '1stFlrSF', 'FullBath', 'BsmtFinSF1', 'BsmtFinSF2',
                    'HalfBath',
                    'Porch', '2ndFlrSF', 'MasVnrArea', 'YrSold', 'BsmtUnfSF', 'LotFrontage', 'TotRmsAbvGrd']

    data.loc[:, numeric_list] = preprocessing.scale(data.loc[:, numeric_list])
Project: Tencent_Social_Advertising_Algorithm_Competition    Author: guicunbin
def get_concat_data(label_csv, label_col, other_csvs, is_rate, important_feats):
    print('important_feats :  ', len(important_feats))
    rank_feats  = [f for f in get_csv_header(dataset1_csv) if 'click' in f]
    rank_feats  = [f for f in rank_feats if f in important_feats] if important_feats else rank_feats
    X           = pd.read_csv(label_csv, usecols = rank_feats+[label_col]).apply(small_dtype)
    X           = X[:1000000] if is_tiny else X
    print('concat csvs ......')
    X           = pd.concat([X, get_need_feats(other_csvs, is_rate, is_tiny, important_feats)], axis=1)
    #if label_csv.split('/')[-1] == 'dataset2.csv':
    #    for c in X.columns:
    #        if c.endswith('_fset_total_cnt'):
    #            X = X.drop(X[X[c]==0].index, axis=0)
    feat_cols   = [f for f in X.columns if f != label_col]
    if is_to_csv:
        save_file = label_csv.split('.csv')[0]+'_concat.csv'
        if os.path.exists(save_file):
            print(save_file + " already exists")
        else:
            print('to csv ........')
            X            = X.replace(np.nan, -1)
            X            = X.replace(np.inf, -2)
            X[feat_cols] = scale(X[feat_cols]).astype('float16')
            X.to_csv(save_file, index=False, chunksize=50000)
    print(X.shape)
    # TODO cate_feats = [f for f in X.columns if 'click' in f] 
    X,      = change_to_category([X], cate_feats)
    y       = X[label_col].values
    X       = X[feat_cols]
    if label_col == 'label':
        print('positive percent ', y.mean())
    return X, y
Project: Data-Mining-Project    Author: mrsan22
def windowCharacter(x):
    tmp = np.zeros((x.shape[0]))
    n=0
    for row in x.iterrows():
        tmp[n] = signalMag(row[1]['X'],row[1]['Y'],row[1]['Z'])
        n=n+1

    # if np.std(tmp) > 5:
    #     return None
    # else:

    p_25 = np.percentile(tmp,25)
    p_75 = np.percentile(tmp,75)
    tmp_25 = [each for each in tmp if each < p_25]
    tmp_75 = [each for each in tmp if each < p_75]

    data_dm = scale(tmp,with_mean=True, with_std=False) # demean data

    (freq_1,power_1) = butterFilter(data_dm,lowcut_1,highcut_1)
    idx_1 = np.argmax(power_1)
    freq_1_sec = np.delete(freq_1,idx_1)
    power_1_sec = np.delete(power_1,idx_1)
    idx_1_sec = np.argmax(power_1_sec)

    (freq_2,power_2) = butterFilter(data_dm,lowcut_2,highcut_2)
    idx_2 = np.argmax(power_2)

    return np.mean(tmp), np.std(tmp), np.median(tmp), np.linalg.norm(tmp_25), np.linalg.norm(tmp_75),np.sum(power_1), freq_1[idx_1],power_1[idx_1], freq_1_sec[idx_1_sec], power_1_sec[idx_1_sec], freq_2[idx_2],power_2[idx_2],freq_1[idx_1]/np.sum(power_1)
Project: countae    Author: gokceneraslan
def normalize(x, sf, logtrans=True, sfnorm=True, zeromean=True):
    if sfnorm:
        assert len(sf.shape) == 1
        x = x / (sf[:, None]+1e-8)  # colwise div

    if logtrans:
        x = np.log1p(x)

    if zeromean:
        x = scale(x)

    return x
Project: betasqaud    Author: AJacobs15
def test_scale():
    matrix = [[0,30], [1, 27], [3, 24]]

    scaled = pre.scale(matrix)
    print(scaled)
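For reference, scale() standardizes each column of this matrix, so the printed result should be approximately:

# Expected output (approximate); each column has zero mean and unit variance:
# [[-1.069  1.225]
#  [-0.267  0.   ]
#  [ 1.336 -1.225]]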