The following 44 code examples, extracted from open-source Python projects, illustrate how to use sklearn.preprocessing.Imputer().
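Before the project examples, here is a minimal sketch of the typical fit/transform workflow on a toy matrix (the data below is made up purely for illustration). Note that Imputer was deprecated in scikit-learn 0.20 and removed in 0.22 in favour of sklearn.impute.SimpleImputer, so this snippet assumes an older scikit-learn release:

import numpy as np
from sklearn.preprocessing import Imputer

# Hypothetical toy matrix with one missing entry (illustration only).
X = np.array([[1.0, 2.0],
              [np.nan, 3.0],
              [7.0, 6.0]])

# Replace NaNs with the per-column mean; axis=0 imputes along columns.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_imputed = imp.fit_transform(X)

# The column means learned during fit are available as imp.statistics_,
# and imp.transform() can now be applied to new data with the same columns.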
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    scikit_model = Imputer(strategy='most_frequent', axis=0)
    scikit_data['data'][1, 8] = np.NaN

    input_data = scikit_data['data'][:, 8].reshape(-1, 1)
    scikit_model.fit(input_data, scikit_data['target'])

    # Save the data and the model
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model
def gen_features(train, y, test):
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo

    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)

    imp = preprocessing.Imputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)
    return train, y, test
def FeatureCombination(Df, s='', num_feature=2):
    feature_set = []
    for c in Df.columns:
        if c.startswith(s):
            feature_set.append(c)
    print('combining', len(feature_set), 'features')
    data = Df[feature_set].values

    for c in Df.columns:
        if Df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(Df[c].values))
            Df[c] = lbl.transform(list(Df[c].values))

    imp = preprocessing.Imputer()
    data = imp.fit_transform(data)
    data = preprocessing.scale(data)
    pca = PCA(num_feature)
    pca.fit(data)
    print('explained_variance_ratio_:', pca.explained_variance_ratio_)
    trans = pca.transform(data)
    for i in range(0, num_feature):
        Df[s + '_%d' % (i + 1)] = trans[:, i]
    Df.drop(feature_set, 1, inplace=True)
    return Df
def fit(self, scenario: ASlibScenario, config: Configuration):
    '''
    fit imputer to ASlib scenario data

    Arguments
    ---------
    scenario: data.aslib_scenario.ASlibScenario
        ASlib Scenario with all data in pandas
    config: ConfigSpace.Configuration
        configuration
    '''
    self.imputer = Imputer(strategy=config.get("imputer_strategy"))
    self.imputer.fit(scenario.feature_data.values)
    self.active = True
def __init__(self, max_iter=10, initial_strategy='mean', tol=1e-3, f_model="RandomForest"):
    self.max_iter = max_iter
    self.initial_strategy = initial_strategy
    self.initial_imputer = Imputer(strategy=initial_strategy)
    self.tol = tol
    self.f_model = f_model
def build_model_random_forest(df, features, categorical_features, target, split=0.70):
    print "using %d features (%d columns) on %d rows and target %s. Split %f." % (
        len(features), len(df.columns), len(df), target, split)

    df['is_train'] = np.random.uniform(0, 1, len(df)) <= split
    train, test = df[df['is_train'] == True], df[df['is_train'] == False]

    # one_hot_encoding because it doesn't work in pipeline for some reason
    # for f in categorical_features:
    #     dummies = pd.get_dummies(df[f], prefix=f)
    #     for dummy in dummies.columns:
    #         df[dummy] = dummies[dummy]
    #         features.append(dummy)
    #     df = df.drop(f, 1)
    #     features.remove(f)

    clf = Pipeline([
        ("imputer", Imputer(strategy="mean", axis=0)),
        ('feature_selection', SelectKBest(k=5)),
        ("forest", RandomForestClassifier())])

    clf.fit(train[features], train[target])
    score = clf.score(test[features], test[target])

    predicted = clf.predict(test[features])
    cm = confusion_matrix(test[target], predicted)
    print "Random Forest score: %f" % score
    print "confusion_matrix : \n%s" % cm
    return clf
def make_predictions_random_forest(df, features, target, split=0.70):
    print "using %d features (%d columns) on %d rows and target %s. Split %f." % (
        len(features), len(df.columns), len(df), target, split)
    # print "unused features: ", '\n\t\t'.join([f for f in df.columns if f not in features])
    # print "columns: ", '\n\t\t'.join(df.columns)

    df['is_train'] = np.random.uniform(0, 1, len(df)) <= split
    train, test = df[df['is_train'] == True], df[df['is_train'] == False]

    clf = Pipeline([
        ("imputer", Imputer(strategy="mean", axis=0)),
        ('feature_selection', SelectKBest(k=200)),
        ("forest", RandomForestClassifier(
            min_samples_leaf=1, min_samples_split=10, n_estimators=60,
            max_depth=None, criterion='gini'))])

    clf.fit(train[features], train[target])
    score = clf.score(test[features], test[target])

    predicted = clf.predict(test[features])
    cm = confusion_matrix(test[target], predicted)
    # print classification_report(test[target], predicted)
    return score, cm


# Utility function to report best scores
def preprocess_data(X_train, X_test):
    """
    Impute missing values.
    """
    # Impute using the mean of every column for now. However,
    # I would've liked to impute 'F5' using mode instead.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

    train_xform = imp.fit_transform(X_train)
    X_train = pd.DataFrame(train_xform, columns=X_train.columns)

    test_xform = imp.transform(X_test)
    X_test = pd.DataFrame(test_xform, columns=X_test.columns)

    return X_train, X_test
def imputerLabelEncoder_train(X, y):
    imputer = preprocessing.Imputer()
    X = imputer.fit_transform(X)

    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)
    return X, y, imputer, le
def build_classifier(base_clf=svm.SVC()):
    # The imputer is for "use_taxonomy", and shouldn't affect if it's False.
    # TODO: should also try with other imputer strategies
    return pipeline.make_pipeline(preprocessing.Imputer(strategy='most_frequent'),
                                  preprocessing.StandardScaler(),
                                  base_clf)


# noinspection PyPep8Naming
def fit(self, X, y=None):
    self.imp = Imputer(strategy=self.strategy)
    self.imp.fit(X)
    self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
    return self
def test_grid_search_allows_nans():
    # Test dcv.GridSearchCV with Imputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    dcv.GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = Imputer()
        spec = converter.convert(model, 'data', 'out')

    # Check the expected class during conversion.
    with self.assertRaises(Exception):
        from sklearn.linear_model import LinearRegression
        model = LinearRegression()
        spec = converter.convert(model, 'data', 'out')
def test_conversion_boston(self):
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    sh = scikit_data.data.shape

    rn.seed(0)
    missing_value_indices = [(rn.randint(sh[0]), rn.randint(sh[1]))
                             for k in range(sh[0])]

    for strategy in ["mean", "median", "most_frequent"]:
        for missing_value in [0, 'NaN', -999]:
            X = np.array(scikit_data.data).copy()
            for i, j in missing_value_indices:
                X[i, j] = missing_value

            model = Imputer(missing_values=missing_value, strategy=strategy)
            model = model.fit(X)
            tr_X = model.transform(X.copy())

            spec = converter.convert(model, scikit_data.feature_names, 'out')

            input_data = [dict(zip(scikit_data.feature_names, row)) for row in X]
            output_data = [{"out": row} for row in tr_X]

            result = evaluate_transformer(spec, input_data, output_data)
            assert result["num_errors"] == 0
def __init__(self, strategy_categorical="most_frequent", strategy_numerical="median", categorical=None):
    """
    An Imputer that can apply a different strategy for both categorical data and numerical data.

    :param strategy_categorical: "mean", "median" or "most_frequent"
    :param strategy_numerical: "mean", "median" or "most_frequent"
    :param categorical: A boolean mask for the categorical columns of a dataset
    """
    if categorical is None:
        categorical = []
    self.strategy_categorical = strategy_categorical
    self.strategy_numerical = strategy_numerical
    self.cat_imputer = Imputer(strategy=strategy_categorical)
    self.num_imputer = Imputer(strategy=strategy_numerical)
    self.categorical = categorical
    self._update_indices()
def remove_nan(x):
    """remove NaN values from data vectors"""
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    x_clean = imp.fit_transform(x)
    return x_clean
def mean_shift(location, location_callback, bandwidth=None):
    """Returns one or more clusters of a set of points, using a mean
    shift algorithm.
    The result is sorted with the first value being the largest cluster.

    Kwargs:
        bandwidth (float): If bandwidth is None, a value is detected
            automatically from the input using estimate_bandwidth.

    Returns:
        A list of NamedTuples (see get_cluster_named_tuple for a definition
        of the tuple).
    """
    pts = location._tuple_points()
    if not pts:
        return None

    X = np.array(pts).reshape((len(pts), len(pts[0])))
    if np.any(np.isnan(X)) or not np.all(np.isfinite(X)):
        return None
    X = Imputer().fit_transform(X)
    X = X.astype(np.float32)

    if not bandwidth:
        bandwidth = estimate_bandwidth(X, quantile=0.3)

    ms = MeanShift(bandwidth=bandwidth or None, bin_seeding=False).fit(X)

    clusters = []
    for cluster_id, cluster_centre in enumerate(ms.cluster_centers_):
        locations = []
        for j, label in enumerate(ms.labels_):
            if not label == cluster_id:
                continue
            locations.append(location.locations[j])
        if not locations:
            continue
        clusters.append(cluster_named_tuple()(label=cluster_id,
                                              centroid=Point(cluster_centre),
                                              location=location_callback(locations)))

    return clusters
def GetFeatures(frame):
    # convert data to float
    arr = np.array(frame, dtype=np.float)

    # fill missing values
    from sklearn.preprocessing import Imputer
    imputer = Imputer(strategy='mean')
    arr = imputer.fit_transform(arr)

    # normalize the entire data
    from sklearn.preprocessing import scale
    arr = scale(arr)
    return arr


# =================================================
def impute_and_scale(df, scaling=None):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """
    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)
    # print(mat.shape)

    if scaling is None:
        return pd.DataFrame(mat, columns=df.columns)

    # Scaling data
    if scaling == 'maxabs':
        # Normalizing -1 to 1
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        # Scaling to [0,1]
        scaler = MinMaxScaler()
    else:
        # Standard normalization
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)
    # print(mat.shape)
    df = pd.DataFrame(mat, columns=df.columns)

    return df
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """
    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)

    return df
def data_handlemissing(dataframe, pipeline):
    try:
        if pipeline['options']['type'] == "dropcolumns":
            thresh = pipeline['options']['thresh']
            if thresh == -1:
                dataframe.dropna(axis=1, how="all", inplace=True)
            elif thresh == 0:
                dataframe.dropna(axis=1, how="any", inplace=True)
            elif thresh > 0:
                dataframe.dropna(axis=1, thresh=thresh, inplace=True)
        elif pipeline['options']['type'] == "droprows":
            thresh = pipeline['options']['thresh']
            if thresh == -1:
                dataframe.dropna(axis=0, how="all", inplace=True)
            elif thresh == 0:
                dataframe.dropna(axis=0, how="any", inplace=True)
            elif thresh > 0:
                dataframe.dropna(axis=0, thresh=thresh)
        elif pipeline['options']['type'] == "fillmissing":
            strategy = pipeline['options']['strategy']
            imp = Imputer(missing_values='NaN', strategy=strategy, axis=0)
            array = imp.fit_transform(dataframe.values)
            dataframe = pandas.DataFrame(array, columns=dataframe.columns)

        return dataframe
    except Exception as e:
        raise Exception("data_handlemissing: " + str(e))
def imputer_transform(data):
    imputer = Imputer()
    imputer.fit(data)
    return imputer.transform(data)
def imputator(features):
    """Fill in missing values with mean of the remaining samples

    Keyword arguments:
    features -- feature matrix
    """
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(features)
    return imp.transform(features)
def impute_data(self, x):
    """Imputes data set containing Nan values"""
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    return imp.fit_transform(x)
def test_grid_search_allows_nans():
    # Test GridSearchCV with Imputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
def test_permutation_test_score_allow_nans():
    # Check that permutation_test_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] / 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    permutation_test_score(p, X, y, cv=5)
def test_cross_val_score_allow_nans():
    # Check that cross_val_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] / 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cross_val_score(p, X, y, cv=5)
def test_cross_val_score_allow_nans():
    # Check that cross_val_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] / 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cval.cross_val_score(p, X, y, cv=5)
def test_permutation_test_score_allow_nans():
    # Check that permutation_test_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] / 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cval.permutation_test_score(p, X, y, cv=5)
def convert(model, input_features, output_features):
    """Convert an Imputer model to the protobuf spec.

    Parameters
    ----------
    model: Imputer
        A fitted Imputer model.

    input_features: str
        Name of the input column.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not(_HAS_SKLEARN):
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    # Set the interface params.
    spec = _Model_pb2.Model()
    spec.specificationVersion = SPECIFICATION_VERSION

    assert len(input_features) == 1
    assert isinstance(input_features[0][1], datatypes.Array)

    # feature name in and out are the same here
    spec = set_transform_interface_params(spec, input_features, output_features)

    # Test the scikit-learn model
    _sklearn_util.check_expected_type(model, Imputer)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'statistics_'))

    if model.axis != 0:
        raise ValueError("Imputation is only supported along axis = 0.")

    # The imputer in our framework only works on single columns, so
    # we need to translate that over. The easiest way to do that is to
    # put it in a nested pipeline with a feature extractor and a

    tr_spec = spec.imputer

    for v in model.statistics_:
        tr_spec.imputedDoubleArray.vector.append(v)

    try:
        tr_spec.replaceDoubleValue = float(model.missing_values)
    except ValueError:
        raise ValueError("Only scalar values or NAN as missing_values "
                         "in _imputer are supported.")

    return _MLModel(spec)
def impute_values(features, dt, sentinel):
    r"""Impute values for a given data type. The *median* strategy is applied
    for floating point values, and the *most frequent* strategy is applied for
    integer or Boolean values.

    Parameters
    ----------
    features : pandas.DataFrame
        Dataframe containing the features for imputation.
    dt : str
        The values ``'float64'``, ``'int64'``, or ``'bool'``.
    sentinel : float
        The number to be imputed for NaN values.

    Returns
    -------
    imputed_features : numpy array
        The features after imputation.

    Raises
    ------
    TypeError
        Data type ``dt`` is invalid for imputation.

    References
    ----------
    You can find more information on feature imputation here [IMP]_.

    .. [IMP] http://scikit-learn.org/stable/modules/preprocessing.html#imputation

    """
    try:
        nfeatures = features.shape[1]
    except:
        features = features.values.reshape(-1, 1)

    if dt == 'float64':
        imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    elif dt == 'int64' or dt == 'bool':
        imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    else:
        raise TypeError("Data Type %s is invalid for imputation" % dt)

    imputed = imp.fit_transform(features)
    if imputed.shape[1] == 0:
        nans = np.isnan(features)
        features[nans] = sentinel
        imputed_features = features
    else:
        imputed_features = imputed
    return imputed_features


#
# Function get_numerical_features
#
def prepareDataForClassification(dataset):
    """
    generates categorical output column, attach to dataframe
    label the categories and split into train and test
    """
    le = preprocessing.LabelEncoder()

    dataset['UpDown'] = dataset['Return_Out']
    dataset.UpDown[dataset.UpDown >= 0] = 'Up'
    dataset.UpDown[dataset.UpDown < 0] = 'Down'
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)

    features = dataset.columns[1:-1]
    X = dataset[features]
    y = dataset.UpDown
    # print y
    print X.shape
    # print y.shape
    # for i in range(len(X.columns)):
    #     print X.columns[i]
    # X.to_csv("X.csv", sep='\t', encoding='utf-8')
    # y.to_csv("y.csv", sep='\t', encoding='utf-8')
    # print X.iloc[2:5, 78:84]
    # X = X.fillna(X.mean())
    # print X.iloc[2:5, 78:84]
    # print X.index

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    # print X_train.iloc[2:5, 78:84]
    X_train = X_train.fillna(X_train.mean())
    X_test = X_test.fillna(X_test.mean())
    # print X_train.iloc[2:5, 78:84]
    # X_train.to_csv("X_train.csv", sep='\t', encoding='utf-8')
    # y_train.to_csv("y_train.csv", sep='\t', encoding='utf-8')

    # imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    # imp.fit(X_train)
    # # imp.fit(y_train)
    # imp.fit(X_test)
    # # imp.fit(y_test)
    # X_train = imp.fit_transform(X_train)
    # # y_train = imp.fit_transform(y_train)
    # X_test = imp.fit_transform(X_test)
    # # y_test = imp.fit_transform(y_test)

    # imp = Imputer(missing_values=0, strategy='mean', axis=0)
    # imp.fit(X_train)
    # # imp.fit(y_train)
    # imp.fit(X_test)
    # # imp.fit(y_test)
    # X_train = imp.fit_transform(X_train)
    # # y_train = imp.fit_transform(y_train)
    # X_test = imp.fit_transform(X_test)
    # y

    return X_train, y_train, X_test, y_test
def prepareDataForClassification(dataset):
    """
    generates categorical output column, attach to dataframe
    label the categories and split into train and test
    """
    le = preprocessing.LabelEncoder()

    dataset['UpDown'] = dataset['Return_Out']
    dataset.UpDown[dataset.UpDown >= 0] = 'Up'
    dataset.UpDown[dataset.UpDown < 0] = 'Down'
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)

    features = dataset.columns[1:-1]
    X = dataset[features]
    y = dataset.UpDown
    # print X.shape
    # print y.shape
    # print X.index

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X_train)
    # imp.fit(y_train)
    imp.fit(X_test)
    # imp.fit(y_test)
    X_train = imp.fit_transform(X_train)
    # y_train = imp.fit_transform(y_train)
    X_test = imp.fit_transform(X_test)
    # y_test = imp.fit_transform(y_test)

    imp = Imputer(missing_values=0, strategy='mean', axis=0)
    imp.fit(X_train)
    # imp.fit(y_train)
    imp.fit(X_test)
    # imp.fit(y_test)
    X_train = imp.fit_transform(X_train)
    # y_train = imp.fit_transform(y_train)
    X_test = imp.fit_transform(X_test)
    # y

    return X_train, y_train, X_test, y_test
def affinity_propagation(location, location_callback):
    """Returns one or more clusters of a set of points, using an affinity
    propagation algorithm.
    The result is sorted with the first value being the largest cluster.

    Returns:
        A list of NamedTuples (see get_cluster_named_tuple for a definition
        of the tuple).
    """
    pts = location._tuple_points()
    if not pts:
        return None

    X = np.array(pts).reshape((len(pts), len(pts[0])))
    if np.any(np.isnan(X)) or not np.all(np.isfinite(X)):
        return None
    X = Imputer().fit_transform(X)
    X = X.astype(np.float32)

    afkwargs = {
        'damping': 0.5,
        'convergence_iter': 15,
        'max_iter': 200,
        'copy': True,
        'preference': None,
        'affinity': 'euclidean',
        'verbose': False,
    }
    af = AffinityPropagation(**afkwargs).fit(X)
    cluster_centers_indices = af.cluster_centers_indices_

    clusters = []
    for cluster_id, cluster_centre in enumerate(af.cluster_centers_):
        locations = []
        for j, label in enumerate(af.labels_):
            if not label == cluster_id:
                continue
            locations.append(location.locations[j])
        if not locations:
            continue
        clusters.append(cluster_named_tuple()(label=cluster_id,
                                              centroid=Point(cluster_centre),
                                              location=location_callback(locations)))

    return clusters
def model(self):
    # cname = sys._getframe().f_code.co_name
    cname = 'keras'
    train, y, test = self.train_, self.y_, self.test_

    np.random.seed(1234)

    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)

    from sklearn import pipeline
    pipe = pipeline.make_pipeline(preprocessing.Imputer(),
                                  preprocessing.RobustScaler())

    train = pipe.fit_transform(train)
    test = pipe.transform(test)

    self.input_dims_ = train.shape[1]

    def build_model():
        return self.build_keras_model()

    batch_size = self.batch_size_

    build_model().summary(line_length=120)

    ss = model_selection.StratifiedKFold(n_splits=self.num_splits_,
                                         random_state=11,
                                         shuffle=True)
    scores = list()
    model_path = self.temp_name('keras_mlp_weights')
    v, z = self.v_, self.z_
    v[cname] = 0
    z[cname] = 0
    for n, (itrain, ival) in enumerate(ss.split(train, y)):
        xtrain, xval = train[itrain], train[ival]
        ytrain, yval = y[itrain], y[ival]
        model = build_model()
        model.fit(
            xtrain, ytrain,
            batch_size=batch_size,
            epochs=10000,
            validation_data=(xval, yval),
            verbose=0,
            callbacks=build_keras_fit_callbacks(model_path),
            shuffle=True
        )
        model.load_weights(model_path)
        p = model.predict(xval)
        v.loc[ival, cname] += p.ravel()
        score = metrics.log_loss(y[ival], p)
        if score != score:
            raise Exception('NaN score!!!')
        print(cname, 'fold %d: ' % (n + 1), score, self.now())
        scores.append(score)
        z[cname] += model.predict(test).ravel()
        del model
        for i in range(3):
            gc.collect(i)

    print('scores:', scores, np.mean(scores), np.std(scores))
    self.drop_temp(model_path)
    cv = np.mean(scores)
    z[cname] /= self.num_splits_
    z['y'] = z[cname]
    return cv, None
def gensim_classifier():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # split all sentences to list of words
    sentences = []
    for tweet in tweet_list:
        temp_doc = tweet.split()
        sentences.append(temp_doc)

    # parameters for model
    num_features = 100
    min_word_count = 1
    num_workers = 4
    context = 2
    downsampling = 1e-3

    # Initialize and train the model
    w2v_model = Word2Vec(sentences, workers=num_workers,
                         size=num_features, min_count=min_word_count,
                         window=context, sample=downsampling, seed=1)

    index_value, train_set, test_set = train_test_split(0.80, sentences)
    train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
    test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)
    train_vector = Imputer().fit_transform(train_vector)
    test_vector = Imputer().fit_transform(test_vector)

    # train model and predict
    model = LinearSVC()
    classifier_fitted = OneVsRestClassifier(model).fit(train_vector, label_list[:index_value])
    result = classifier_fitted.predict(test_vector)

    # output result to csv
    create_directory('data')
    result.tofile("data/w2v_linsvc.csv", sep=',')

    # store the model to mmap-able files
    create_directory('model')
    joblib.dump(model, 'model/%s.pkl' % 'w2v_linsvc')

    # evaluation
    label_score = classifier_fitted.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(label_list, classes=class_list)

    evaluate(binarise_result, binarise_labels[index_value:], label_score, 'w2v_linsvc')