Python sklearn.preprocessing 模块,Imputer() 实例源码


项目:coremltools    作者:apple    | 项目源码 | 文件源码
def setUpClass(self):
        """Set up the unit test by loading the dataset and training a model.

        Loads the Boston housing data, writes one NaN into column 8, fits a
        ``most_frequent`` Imputer on that single column and stores both the
        data and the fitted model on ``self``.
        """
        from sklearn.datasets import load_boston

        scikit_data = load_boston()
        scikit_model = Imputer(strategy='most_frequent', axis=0)
        # Inject one missing value so the imputer has something to fill.
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        scikit_data['data'][1, 8] = np.nan

        # The column is reshaped to (n, 1) before being handed to fit().
        input_data = scikit_data['data'][:, 8].reshape(-1, 1)
        scikit_model.fit(input_data, scikit_data['target'])

        # Save the data and the model
        self.scikit_data = scikit_data
        self.scikit_model = scikit_model
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def gen_features(train, y, test):
    """Label-encode, derive and impute features for the train and test frames.

    The 'active', 'alco' and 'smoke' columns are label-encoded, the derived
    columns 'ap_dif' (ap_hi - ap_lo) and 'BWI' (weight / (height/100)^2) are
    added, and missing values are imputed with the default Imputer settings.
    Both input frames are modified in place before imputation.

    Returns the imputed train data, ``y`` unchanged, and the imputed test
    data (the latter two data sets are whatever the imputer returns).
    """
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        # Fit on train and test together so a value that only occurs in the
        # test set still receives a code.
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo

    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)

    # Imputation statistics come from the training data only.
    imp = preprocessing.Imputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)

    return train, y, test
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def gen_features(train, y, test):
    """Label-encode, derive and impute features for the train and test frames.

    The 'active', 'alco' and 'smoke' columns are label-encoded, the derived
    columns 'ap_dif' (ap_hi - ap_lo) and 'BWI' (weight / (height/100)^2) are
    added, and missing values are imputed with the default Imputer settings.
    Both input frames are modified in place before imputation.

    Returns the imputed train data, ``y`` unchanged, and the imputed test
    data (the latter two data sets are whatever the imputer returns).
    """
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        # Fit on train and test together so a value that only occurs in the
        # test set still receives a code.
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo

    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)

    # Imputation statistics come from the training data only.
    imp = preprocessing.Imputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)

    return train, y, test
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def gen_features(train, y, test):
    """Label-encode, derive and impute features for the train and test frames.

    The 'active', 'alco' and 'smoke' columns are label-encoded, the derived
    columns 'ap_dif' (ap_hi - ap_lo) and 'BWI' (weight / (height/100)^2) are
    added, and missing values are imputed with the default Imputer settings.
    Both input frames are modified in place before imputation.

    Returns the imputed train data, ``y`` unchanged, and the imputed test
    data (the latter two data sets are whatever the imputer returns).
    """
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        # Fit on train and test together so a value that only occurs in the
        # test set still receives a code.
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo

    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)

    # Imputation statistics come from the training data only.
    imp = preprocessing.Imputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)

    return train, y, test
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def gen_features(train, y, test):
    """Label-encode, derive and impute features for the train and test frames.

    The 'active', 'alco' and 'smoke' columns are label-encoded, the derived
    columns 'ap_dif' (ap_hi - ap_lo) and 'BWI' (weight / (height/100)^2) are
    added, and missing values are imputed with the default Imputer settings.
    Both input frames are modified in place before imputation.

    Returns the imputed train data, ``y`` unchanged, and the imputed test
    data (the latter two data sets are whatever the imputer returns).
    """
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        # Fit on train and test together so a value that only occurs in the
        # test set still receives a code.
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo

    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)

    # Imputation statistics come from the training data only.
    imp = preprocessing.Imputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)

    return train, y, test
项目:Kaggle-Competition-Sberbank    作者:LenzDu    | 项目源码 | 文件源码
def FeatureCombination(Df,s='',num_feature=2):
    """Replace a family of columns by their leading principal components.

    Every column of ``Df`` whose name starts with ``s`` is collected,
    imputed, standardised and projected onto ``num_feature`` principal
    components. The components are written back to ``Df`` as the columns
    ``s + '_1'`` ... ``s + '_<num_feature>'``.

    As a side effect every object-dtype column of ``Df`` is label-encoded
    in place. ``Df`` is modified and returned.
    """
    feature_set = [c for c in Df.columns if c.startswith(s)]
    print('combining', len(feature_set), 'features')
    # NOTE(review): the values are extracted before the label encoding below,
    # so an object-dtype column inside feature_set reaches the imputer
    # un-encoded - confirm that the selected columns are always numeric.
    data = Df[feature_set].values

    for c in Df.columns:
        if Df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            # The encoder has to be fitted before it can transform.
            Df[c] = lbl.fit_transform(list(Df[c].values))

    imp = preprocessing.Imputer()
    data = imp.fit_transform(data)
    data = preprocessing.scale(data)
    pca = PCA(num_feature)
    # Fit first: explained_variance_ratio_ and transform() need a fitted PCA.
    pca.fit(data)
    print('explained_variance_ratio_:', pca.explained_variance_ratio_)
    trans = pca.transform(data)
    for i in range(0,num_feature):
        Df[s+'_%d'%(i+1)] = trans[:,i]
    return Df
项目:AutoFolio    作者:mlindauer    | 项目源码 | 文件源码
def fit(self, scenario: ASlibScenario, config: Configuration):
        '''
            fit imputer object to ASlib scenario data

            Arguments
            ---------
            scenario: data.aslib_scenario.ASlibScenario
                ASlib Scenario with all data in pandas
            config: ConfigSpace.Configuration
                configuration; its "imputer_strategy" entry selects the
                imputation strategy
        '''

        self.imputer = Imputer(strategy=config.get("imputer_strategy"))
        # NOTE(review): the two statements below were fused into a single
        # invalid line in this copy of the source; they are reconstructed on
        # the assumption that the scenario exposes its features as the pandas
        # frame ``feature_data`` and that the wrapper tracks an ``active``
        # flag - confirm against the project.
        self.imputer.fit(scenario.feature_data.values)

        self.active = True
项目:predictive_imputer    作者:log0ymxm    | 项目源码 | 文件源码
def __init__(self, max_iter=10, initial_strategy='mean', tol=1e-3, f_model="RandomForest"):
        """Store the settings and create the imputer used for the initial fill.

        ``initial_strategy`` is kept as given and is also passed as the
        ``strategy`` of the ``Imputer`` stored in ``initial_imputer``; the
        remaining arguments are stored unchanged.
        """
        # Plain settings, stored as given.
        self.max_iter, self.tol = max_iter, tol
        self.f_model = f_model

        # The strategy is remembered and also used to build the imputer.
        self.initial_strategy = initial_strategy
        self.initial_imputer = Imputer(strategy=initial_strategy)
项目:train-occupancy    作者:datamindedbe    | 项目源码 | 文件源码
def build_model_random_forest(df, features, categorical_features, target, split=0.70):
    """Train an impute / select-5-best / random-forest pipeline and report on it.

    Rows of ``df`` are randomly assigned to the training set with
    probability ``split``; the assignment is stored in a new ``is_train``
    column on ``df``. The pipeline is fitted on the training rows, scored
    on the remaining rows, and the score and confusion matrix are printed.

    ``categorical_features`` is not used by this function.

    Returns the fitted pipeline.
    """
    # The parenthesised form of print works on both Python 2 and Python 3.
    print("using %d features (%d columns) on %d rows and target %s. Split %f." % (
        len(features), len(df.columns), len(df), target, split))
    df['is_train'] = np.random.uniform(0, 1, len(df)) <= split
    train, test = df[df['is_train']], df[~df['is_train']]

    clf = Pipeline([
        ("imputer", Imputer(strategy="mean", axis=0)),
        ('feature_selection', SelectKBest(k=5)),
        ("forest", RandomForestClassifier())])
    clf.fit(train[features], train[target])
    score = clf.score(test[features], test[target])
    predicted = clf.predict(test[features])

    cm = confusion_matrix(test[target], predicted)
    print("Random Forest score: %f" % score)
    print("confusion_matrix : \n%s" % cm)
    return clf
项目:train-occupancy    作者:datamindedbe    | 项目源码 | 文件源码
def make_predictions_random_forest(df, features, target, split=0.70):
    """Train an impute / select-200-best / random-forest pipeline and score it.

    Rows of ``df`` are randomly assigned to the training set with
    probability ``split``; the assignment is stored in a new ``is_train``
    column on ``df``. The pipeline is fitted on the training rows and
    evaluated on the remaining rows.

    Returns ``(score, confusion_matrix)`` for the held-out rows.
    """
    # The parenthesised form of print works on both Python 2 and Python 3.
    print("using %d features (%d columns) on %d rows and target %s. Split %f." % (
        len(features), len(df.columns), len(df), target, split))
    df['is_train'] = np.random.uniform(0, 1, len(df)) <= split
    train, test = df[df['is_train']], df[~df['is_train']]

    clf = Pipeline([
        ("imputer", Imputer(strategy="mean", axis=0)),
        ('feature_selection', SelectKBest(k=200)),
        ("forest", RandomForestClassifier(
            min_samples_leaf=1, min_samples_split=10, n_estimators=60, max_depth=None, criterion='gini'))])
    clf.fit(train[features], train[target])
    score = clf.score(test[features], test[target])
    predicted = clf.predict(test[features])

    cm = confusion_matrix(test[target], predicted)

    return score, cm

# Utility function to report best scores
项目:hyperparam-search-guides    作者:wenyangfu    | 项目源码 | 文件源码
def preprocess_data(X_train, X_test):
    """Fill missing values in both splits using training-set column means.

    Returns two new DataFrames that carry the column labels of the
    corresponding inputs.
    """
    # Every column is mean-imputed for now; the original author noted that
    # 'F5' would ideally be imputed with the mode instead.
    mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)

    # The statistics are learned on the training split and reused for test.
    filled_train = pd.DataFrame(mean_imputer.fit_transform(X_train),
                                columns=X_train.columns)
    filled_test = pd.DataFrame(mean_imputer.transform(X_test),
                               columns=X_test.columns)
    return filled_train, filled_test
项目:hyperparam-search-guides    作者:wenyangfu    | 项目源码 | 文件源码
def preprocess_data(X_train, X_test):
    """Fill missing values in both splits using training-set column means.

    Returns two new DataFrames that carry the column labels of the
    corresponding inputs.
    """
    # Every column is mean-imputed for now; the original author noted that
    # 'F5' would ideally be imputed with the mode instead.
    mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)

    # The statistics are learned on the training split and reused for test.
    filled_train = pd.DataFrame(mean_imputer.fit_transform(X_train),
                                columns=X_train.columns)
    filled_test = pd.DataFrame(mean_imputer.transform(X_test),
                               columns=X_test.columns)
    return filled_train, filled_test
项目:jingjuSingingPhraseMatching    作者:ronggong    | 项目源码 | 文件源码
def imputerLabelEncoder_train(X,y):
    """Fit an imputer on X and a label encoder on y.

    Returns the imputed X, the encoded y, and the two fitted objects
    (imputer, label encoder) so they can be reused later.
    """
    imputer = preprocessing.Imputer()
    le = preprocessing.LabelEncoder()
    # The tuple is evaluated left to right: X is imputed before y is encoded.
    return imputer.fit_transform(X), le.fit_transform(y), imputer, le
项目:false-friends    作者:pln-fing-udelar    | 项目源码 | 文件源码
def build_classifier(base_clf=None):
    """Build an impute / standardise / classify pipeline.

    ``base_clf`` is the final estimator of the pipeline. When it is omitted
    a fresh ``svm.SVC()`` is created per call, so separate pipelines never
    share one estimator instance (a default-argument instance would be
    created once and reused by every call).
    """
    if base_clf is None:
        base_clf = svm.SVC()
    # The imputer is for "use_taxonomy", and shouldn't affect if it's False.
    # TODO: should also try with other imputer strategies
    return pipeline.make_pipeline(preprocessing.Imputer(strategy='most_frequent'), preprocessing.StandardScaler(),
                                  base_clf)

# noinspection PyPep8Naming
项目:pandas-pipelines-custom-transformers    作者:jem1031    | 项目源码 | 文件源码
def fit(self, X, y=None):
        """Fit the wrapped Imputer on X and expose its statistics by column.

        ``statistics_`` becomes a pandas Series indexed by ``X.columns``.
        ``y`` is accepted but not used. Returns ``self``.
        """
        self.imp = Imputer(strategy=self.strategy)
        # statistics_ only exists on the imputer after fitting.
        self.imp.fit(X)
        self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
        return self
项目:dask-searchcv    作者:dask    | 项目源码 | 文件源码
def test_grid_search_allows_nans():
    """dcv.GridSearchCV must accept NaN input when the pipeline imputes it."""
    # Test dcv.GridSearchCV with Imputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    dcv.GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_bad_inputs(self):
        """The converter must raise for an unfitted Imputer and for a model of another class."""

        # An imputer that was never trained cannot be converted.
        with self.assertRaises(Exception):
            unfitted = Imputer()
            converter.convert(unfitted, 'data', 'out')

        # The converter checks the expected class during conversion.
        with self.assertRaises(Exception):
            from sklearn.linear_model import LinearRegression
            wrong_type = LinearRegression()
            converter.convert(wrong_type, 'data', 'out')
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def test_conversion_boston(self):
        """Converted imputers must reproduce scikit-learn's output on Boston data.

        For every combination of strategy and missing-value marker, random
        cells of the data are overwritten with the marker, an Imputer is
        fitted and converted, and the converted model is evaluated against
        the scikit-learn transform.
        """

        from sklearn.datasets import load_boston

        scikit_data = load_boston()

        # NOTE(review): three assignments in this test were truncated in this
        # copy of the source (`sh =`, `X = np.array(`, `model =`). Their
        # right-hand sides are reconstructed from how the names are used -
        # confirm against the project.
        sh = scikit_data.data.shape

        missing_value_indices = [(rn.randint(sh[0]), rn.randint(sh[1]))
                                    for k in range(sh[0])]

        for strategy in ["mean", "median", "most_frequent"]:
            for missing_value in [0, 'NaN', -999]:

                # np.array copies, so the shared dataset is never modified.
                X = np.array(scikit_data.data)

                for i, j in missing_value_indices:
                    X[i,j] = missing_value

                model = Imputer(missing_values = missing_value, strategy = strategy)
                model = model.fit(X)

                tr_X = model.transform(X.copy())

                spec = converter.convert(model, scikit_data.feature_names, 'out')

                input_data = [dict(zip(scikit_data.feature_names, row))
                                for row in X]

                output_data = [{"out" : row} for row in tr_X]

                result = evaluate_transformer(spec, input_data, output_data)

                assert result["num_errors"] == 0
项目:Optimus    作者:Yatoom    | 项目源码 | 文件源码
def __init__(self, strategy_categorical="most_frequent", strategy_numerical="median", categorical=None):
        """
        An Imputer that can apply a different strategy for both categorical data and numerical data.
        :param strategy_categorical: "mean", "median" or "most_frequent"
        :param strategy_numerical: "mean", "median" or "most_frequent"
        :param categorical: A boolean mask for the categorical columns of a dataset
        """
        # None stands for "no mask given"; an empty list is stored instead.
        if categorical is None:
            categorical = []
        self.strategy_categorical = strategy_categorical
        self.strategy_numerical = strategy_numerical
        # One underlying imputer per kind of column.
        self.cat_imputer = Imputer(strategy=strategy_categorical)
        self.num_imputer = Imputer(strategy=strategy_numerical)
        self.categorical = categorical
项目:ar-embeddings    作者:iamaziz    | 项目源码 | 文件源码
def remove_nan(x):
        """Return a copy of the data vectors with NaN values replaced by column means."""
        mean_filler = Imputer(missing_values='NaN', strategy='mean', axis=0)
        return mean_filler.fit_transform(x)
项目:errorgeopy    作者:alpha-beta-soup    | 项目源码 | 文件源码
def mean_shift(location, location_callback, bandwidth=None):
    """Returns one or more clusters of a set of points, using a mean shift
    algorithm.
    The result is sorted with the first value being the largest cluster.

    Args:
        bandwidth (float): If bandwidth is None, a value is detected
        automatically from the input using estimate_bandwidth.

    Returns:
        A list of NamedTuples (see get_cluster_named_tuple for a definition
        of the tuple). None is returned when there are no points, or when
        any coordinate is NaN or infinite.
    """
    # NOTE(review): the sorting promised above is not visible in this copy of
    # the function - confirm against the project source.
    pts = location._tuple_points()
    if not pts:
        return None
    X = np.array(pts).reshape((len(pts), len(pts[0])))
    # NaN or infinite coordinates abort the clustering altogether.
    if np.any(np.isnan(X)) or not np.all(np.isfinite(X)):
        return None
    # NOTE(review): the guard above already rejects NaN input, so with the
    # default Imputer settings there should be nothing left to impute here.
    X = Imputer().fit_transform(X)
    X = X.astype(np.float32)
    # Any falsy bandwidth (None or 0) is replaced by an estimate.
    if not bandwidth:
        bandwidth = estimate_bandwidth(X, quantile=0.3)
    ms = MeanShift(bandwidth=bandwidth or None, bin_seeding=False).fit(X)
    clusters = []
    for cluster_id, cluster_centre in enumerate(ms.cluster_centers_):
        locations = []
        for j, label in enumerate(ms.labels_):
            if not label == cluster_id:
            # NOTE(review): the body of the `if` above and the statement that
            # fills `locations` are missing from this copy of the source.
        if not locations:
        # NOTE(review): the body of the `if` above and the statement that
        # appends to `clusters` are missing; as shown, the function cannot run.
    return clusters
项目:iris-Clustering-python-PTVS    作者:mjbahmani    | 项目源码 | 文件源码
def GetFeatures(frame):
    """Convert ``frame`` to a float array, mean-impute it and standardise it.

    Returns the scaled array produced by ``sklearn.preprocessing.scale``.
    """
    #convert data to float
    # np.float was only an alias of the builtin float and has been removed
    # from NumPy; np.float64 gives the same dtype.
    arr = np.array(frame,dtype=np.float64)
    #fill missing values
    from sklearn.preprocessing import Imputer
    imputer = Imputer(strategy='mean')
    arr = imputer.fit_transform(arr)
    #normalize the entire data
    from sklearn.preprocessing import scale
    arr = scale(arr)
    return arr
项目:iris-Clustering-python-PTVS    作者:mjbahmani    | 项目源码 | 文件源码
def GetFeatures(frame):
    """Convert ``frame`` to a float array, mean-impute it and standardise it.

    Returns the scaled array produced by ``sklearn.preprocessing.scale``.
    """
    #convert data to float
    # np.float was only an alias of the builtin float and has been removed
    # from NumPy; np.float64 gives the same dtype.
    arr = np.array(frame,dtype=np.float64)
    #fill missing values
    from sklearn.preprocessing import Imputer
    imputer = Imputer(strategy='mean')
    arr = imputer.fit_transform(arr)
    #normalize the entire data
    from sklearn.preprocessing import scale
    arr = scale(arr)
    return arr
项目:Benchmarks    作者:ECP-CANDLE    | 项目源码 | 文件源码
def impute_and_scale(df, scaling=None):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default None)
        type of scaling to apply; None skips scaling, and any value other
        than 'maxabs' or 'minmax' selects standard normalization

    Returns
    -------
    pandas dataframe
        a new dataframe with the imputed (and optionally scaled) values,
        labelled with the columns that remain after all-NaN columns are
        dropped
    """
    # Columns that contain no data at all cannot be imputed; drop them first.
    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)

    if scaling is None:
        return pd.DataFrame(mat, columns=df.columns)

    # Scaling data
    if scaling == 'maxabs':
        # Normalizing -1 to 1
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        # Scaling to [0,1]
        scaler = MinMaxScaler()
    else:
        # Standard normalization
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)

    df = pd.DataFrame(mat, columns=df.columns)

    return df
项目:Benchmarks    作者:ECP-CANDLE    | 项目源码 | 文件源码
def impute_and_scale(df, scaling=None):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default None)
        type of scaling to apply; None skips scaling, and any value other
        than 'maxabs' or 'minmax' selects standard normalization

    Returns
    -------
    pandas dataframe
        a new dataframe with the imputed (and optionally scaled) values,
        labelled with the columns that remain after all-NaN columns are
        dropped
    """
    # Columns that contain no data at all cannot be imputed; drop them first.
    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)

    if scaling is None:
        return pd.DataFrame(mat, columns=df.columns)

    # Scaling data
    if scaling == 'maxabs':
        # Normalizing -1 to 1
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        # Scaling to [0,1]
        scaler = MinMaxScaler()
    else:
        # Standard normalization
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)

    df = pd.DataFrame(mat, columns=df.columns)

    return df
项目:Benchmarks    作者:ECP-CANDLE    | 项目源码 | 文件源码
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply; None (or the string 'none' in any letter
        case) skips scaling, and any value other than 'maxabs' or 'minmax'
        selects standard normalization

    Returns
    -------
    pandas dataframe
        a new dataframe with the imputed (and optionally scaled) values,
        labelled with the columns that remain after all-NaN columns are
        dropped
    """
    # Columns that contain no data at all cannot be imputed; drop them first.
    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)

    df = pd.DataFrame(mat, columns=df.columns)

    return df
项目:sia-cog    作者:deepakkumar1984    | 项目源码 | 文件源码
def data_handlemissing(dataframe, pipeline):
    """Drop or fill missing values as described by ``pipeline['options']``.

    ``pipeline['options']['type']`` selects the action:

    * ``"dropcolumns"`` / ``"droprows"`` - drop columns / rows in place.
      ``options['thresh']`` of -1 drops those that are entirely missing,
      0 drops those with any missing value, and a positive value is passed
      to ``dropna`` as its ``thresh`` argument.
    * ``"fillmissing"`` - impute with ``options['strategy']`` and return a
      new DataFrame with the same column labels.

    Any other type leaves the frame untouched. Returns the resulting
    DataFrame. Every failure is re-raised as ``Exception`` with the
    message prefixed by ``"data_handlemissing: "``.
    """
    try:
        if pipeline['options']['type'] == "dropcolumns":
            thresh = pipeline['options']['thresh']
            if thresh == -1:
                dataframe.dropna(axis=1, how="all", inplace=True)
            elif thresh == 0:
                dataframe.dropna(axis=1, how="any", inplace=True)
            elif thresh > 0:
                dataframe.dropna(axis=1, thresh=thresh, inplace=True)
        elif pipeline['options']['type'] == "droprows":
            thresh = pipeline['options']['thresh']
            if thresh == -1:
                dataframe.dropna(axis=0, how="all", inplace=True)
            elif thresh == 0:
                dataframe.dropna(axis=0, how="any", inplace=True)
            elif thresh > 0:
                # inplace=True is required here as in every other branch;
                # without it the filtered frame is thrown away.
                dataframe.dropna(axis=0, thresh=thresh, inplace=True)
        elif pipeline['options']['type'] == "fillmissing":
            strategy = pipeline['options']['strategy']
            imp = Imputer(missing_values='NaN', strategy=strategy, axis=0)
            array = imp.fit_transform(dataframe.values)
            dataframe = pandas.DataFrame(array, columns = dataframe.columns)

        return dataframe
    except Exception as e:
        raise Exception("data_handlemissing: " + str(e))
项目:CryptoCurrencyTrader    作者:llens    | 项目源码 | 文件源码
def imputer_transform(data):
    """Return ``data`` with missing values filled using the default Imputer.

    The imputer is fitted on ``data`` itself before transforming it; a
    freshly constructed imputer cannot transform anything.
    """
    imputer = Imputer()
    return imputer.fit_transform(data)
项目:Xserpy    作者:brmson    | 项目源码 | 文件源码
def imputator(features):
    """Fill in missing values with mean of the remaining samples

    Keyword arguments:
    features -- feature matrix

    """
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    # The column means must be learned before they can be applied, so the
    # imputer is fitted on the same matrix it transforms.
    return imp.fit_transform(features)
项目:few    作者:lacava    | 项目源码 | 文件源码
def impute_data(self,x):
        """Return ``x`` with NaN values replaced by the mean of their column."""
        mean_filler = Imputer(missing_values='NaN', strategy='mean', axis=0)
        filled = mean_filler.fit_transform(x)
        return filled
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_grid_search_allows_nans():
    """GridSearchCV must accept NaN input when the pipeline imputes it."""
    # Test GridSearchCV with Imputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_permutation_test_score_allow_nans():
    """permutation_test_score must accept NaN input when the pipeline imputes it."""
    # Check that permutation_test_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    # Floor division keeps the repeat count an integer on Python 3.
    y = np.repeat([0, 1], X.shape[0] // 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    permutation_test_score(p, X, y, cv=5)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_cross_val_score_allow_nans():
    """cross_val_score must accept NaN input when the pipeline imputes it."""
    # Check that cross_val_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    # Floor division keeps the repeat count an integer on Python 3.
    y = np.repeat([0, 1], X.shape[0] // 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cross_val_score(p, X, y, cv=5)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_cross_val_score_allow_nans():
    """cval.cross_val_score must accept NaN input when the pipeline imputes it."""
    # Check that cross_val_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    # Floor division keeps the repeat count an integer on Python 3.
    y = np.repeat([0, 1], X.shape[0] // 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cval.cross_val_score(p, X, y, cv=5)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_permutation_test_score_allow_nans():
    """cval.permutation_test_score must accept NaN input when the pipeline imputes it."""
    # Check that permutation_test_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    # Floor division keeps the repeat count an integer on Python 3.
    y = np.repeat([0, 1], X.shape[0] // 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cval.permutation_test_score(p, X, y, cv=5)
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def test_grid_search_allows_nans():
    """GridSearchCV must accept NaN input when the pipeline imputes it."""
    # Test GridSearchCV with Imputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
项目:coremltools    作者:gsabran    | 项目源码 | 文件源码
def convert(model, input_features, output_features):
    """Convert a fitted scikit-learn Imputer to the protobuf spec.

    Parameters
    ----------
    model: Imputer
        A fitted Imputer model (it must expose ``statistics_`` and impute
        along axis 0).

    input_features
        Description of the input column. Exactly one entry is accepted and
        its second element must be a ``datatypes.Array``.

    output_features
        Description of the output column; passed on to
        ``set_transform_interface_params``.

    Returns
    -------
    The ``_MLModel`` built from the populated protobuf spec.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available.
    ValueError
        If ``model.axis`` is not 0, or if ``model.missing_values`` cannot
        be converted to a float.
    """
    if not(_HAS_SKLEARN):
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    # Set the interface params.
    spec = _Model_pb2.Model()
    spec.specificationVersion = SPECIFICATION_VERSION

    assert len(input_features) == 1
    assert isinstance(input_features[0][1], datatypes.Array)

    # feature name in and out are the same here
    spec = set_transform_interface_params(spec, input_features, output_features)

    # Test the scikit-learn model
    _sklearn_util.check_expected_type(model, Imputer)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'statistics_'))

    if model.axis != 0:
        raise ValueError("Imputation is only supported along axis = 0.")

    # The fitted per-column statistics are copied into the imputer section
    # of the spec, one value per column.
    tr_spec = spec.imputer

    for v in model.statistics_:
        # NOTE(review): the loop body was missing from this copy of the
        # source; appending to imputedDoubleArray.vector follows the Core ML
        # Imputer message definition - confirm against the project.
        tr_spec.imputedDoubleArray.vector.append(v)

    try:
        # float('NaN') is valid, so the string marker 'NaN' converts cleanly.
        tr_spec.replaceDoubleValue = float(model.missing_values)
    except ValueError:
        raise ValueError("Only scalar values or NAN as missing_values "
                "in _imputer are supported.")

    return _MLModel(spec)
项目:AlphaPy    作者:ScottFreeLLC    | 项目源码 | 文件源码
def impute_values(features, dt, sentinel):
    r"""Impute values for a given data type. The *median* strategy
    is applied for floating point values, and the *most frequent*
    strategy is applied for integer or Boolean values.

    Parameters
    ----------
    features : pandas.DataFrame
        Dataframe containing the features for imputation.
    dt : str
        The values ``'float64'``, ``'int64'``, or ``'bool'``.
    sentinel : float
        The number to be imputed for NaN values.

    Returns
    -------
    imputed_features : numpy array
        The features after imputation.

    Raises
    ------
    TypeError
        Data type ``dt`` is invalid for imputation.

    References
    ----------
    You can find more information on feature imputation here [IMP]_.

    .. [IMP] http://scikit-learn.org/stable/modules/preprocessing.html#imputation

    """
    # One-dimensional input has no second axis; turn it into a single column.
    if len(features.shape) < 2:
        features = features.values.reshape(-1, 1)
    if dt == 'float64':
        imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    elif dt == 'int64' or dt == 'bool':
        imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    else:
        raise TypeError("Data Type %s is invalid for imputation" % dt)
    imputed = imp.fit_transform(features)
    if imputed.shape[1] == 0:
        # The imputer returned no columns at all; fall back to writing the
        # sentinel into every NaN cell of the input instead.
        nans = np.isnan(features)
        features[nans] = sentinel
        imputed_features = features
    else:
        imputed_features = imputed
    return imputed_features

# Function get_numerical_features
项目:stock_trend_prediction    作者:r12543    | 项目源码 | 文件源码
def prepareDataForClassification(dataset):
    """
    generates categorical output column, attach to dataframe
    label the categories and split into train and test

    An 'UpDown' column is derived from 'Return_Out' ('Up' for values >= 0,
    'Down' for values < 0) and label-encoded. The features are all columns
    except the first one and 'UpDown'. The data is split 67/33 with a fixed
    random state and missing feature values are filled with the training
    column means.

    Returns X_train, y_train, X_test, y_test.
    """
    le = preprocessing.LabelEncoder()

    # Both masks are computed from the numeric column up front and applied
    # with .loc, so the second comparison never runs against a column that
    # already contains strings and no chained assignment is involved.
    is_up = dataset['Return_Out'] >= 0
    is_down = dataset['Return_Out'] < 0
    dataset['UpDown'] = dataset['Return_Out'].astype(object)
    dataset.loc[is_up, 'UpDown'] = 'Up'
    dataset.loc[is_down, 'UpDown'] = 'Down'
    # NOTE(review): the right-hand side of this assignment was truncated in
    # this copy of the source; the otherwise unused encoder `le` indicates
    # that the labels were encoded here.
    dataset.UpDown = le.fit_transform(dataset.UpDown)

    features = dataset.columns[1:-1]
    X = dataset[features]
    y = dataset.UpDown
    print(X.shape)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    # Fill both splits with the training means so that no statistic of the
    # test split leaks into its own preprocessing.
    train_means = X_train.mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    return X_train, y_train, X_test, y_test
项目:stock_trend_prediction    作者:r12543    | 项目源码 | 文件源码
def prepareDataForClassification(dataset):
    """
    generates categorical output column, attach to dataframe
    label the categories and split into train and test

    An 'UpDown' column is derived from 'Return_Out' ('Up' for values >= 0,
    'Down' for values < 0) and label-encoded. The features are all columns
    except the first one and 'UpDown'. The data is split 67/33 with a fixed
    random state. NaN values, and afterwards zeros, are replaced by column
    means learned on the training split.

    Returns X_train, y_train, X_test, y_test.
    """
    le = preprocessing.LabelEncoder()

    # Both masks are computed from the numeric column up front and applied
    # with .loc, so the second comparison never runs against a column that
    # already contains strings and no chained assignment is involved.
    is_up = dataset['Return_Out'] >= 0
    is_down = dataset['Return_Out'] < 0
    dataset['UpDown'] = dataset['Return_Out'].astype(object)
    dataset.loc[is_up, 'UpDown'] = 'Up'
    dataset.loc[is_down, 'UpDown'] = 'Down'
    # NOTE(review): the right-hand side of this assignment was truncated in
    # this copy of the source; the otherwise unused encoder `le` indicates
    # that the labels were encoded here.
    dataset.UpDown = le.fit_transform(dataset.UpDown)

    features = dataset.columns[1:-1]
    X = dataset[features]
    y = dataset.UpDown

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    # First pass: NaN -> training column mean. The test split is only
    # transformed, never fitted on, so it is filled with training statistics.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    X_train = imp.fit_transform(X_train)
    X_test = imp.transform(X_test)

    # Second pass: zeros are treated as missing as well.
    imp = Imputer(missing_values=0, strategy='mean', axis=0)
    X_train = imp.fit_transform(X_train)
    X_test = imp.transform(X_test)

    return X_train, y_train, X_test, y_test
项目:coremltools    作者:apple    | 项目源码 | 文件源码
def convert(model, input_features, output_features):
    """Convert a fitted scikit-learn Imputer to the protobuf spec.

    Parameters
    ----------
    model: Imputer
        A fitted Imputer model (it must expose ``statistics_`` and impute
        along axis 0).

    input_features
        Description of the input column. Exactly one entry is accepted and
        its second element must be a ``datatypes.Array``.

    output_features
        Description of the output column; passed on to
        ``set_transform_interface_params``.

    Returns
    -------
    The ``_MLModel`` built from the populated protobuf spec.

    Raises
    ------
    RuntimeError
        If scikit-learn is not available.
    ValueError
        If ``model.axis`` is not 0, or if ``model.missing_values`` cannot
        be converted to a float.
    """
    if not(_HAS_SKLEARN):
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    # Set the interface params.
    spec = _Model_pb2.Model()
    spec.specificationVersion = SPECIFICATION_VERSION

    assert len(input_features) == 1
    assert isinstance(input_features[0][1], datatypes.Array)

    # feature name in and out are the same here
    spec = set_transform_interface_params(spec, input_features, output_features)

    # Test the scikit-learn model
    _sklearn_util.check_expected_type(model, Imputer)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'statistics_'))

    if model.axis != 0:
        raise ValueError("Imputation is only supported along axis = 0.")

    # The fitted per-column statistics are copied into the imputer section
    # of the spec, one value per column.
    tr_spec = spec.imputer

    for v in model.statistics_:
        # NOTE(review): the loop body was missing from this copy of the
        # source; appending to imputedDoubleArray.vector follows the Core ML
        # Imputer message definition - confirm against the project.
        tr_spec.imputedDoubleArray.vector.append(v)

    try:
        # float('NaN') is valid, so the string marker 'NaN' converts cleanly.
        tr_spec.replaceDoubleValue = float(model.missing_values)
    except ValueError:
        raise ValueError("Only scalar values or NAN as missing_values "
                "in _imputer are supported.")

    return _MLModel(spec)
项目:errorgeopy    作者:alpha-beta-soup    | 项目源码 | 文件源码
def affinity_propagation(location, location_callback):
    """Returns one or more clusters of a set of points, using an affinity
    propagation algorithm.
    The result is sorted with the first value being the largest cluster.

    Returns:
        A list of NamedTuples (see get_cluster_named_tuple for a definition
        of the tuple). None is returned when there are no points, or when
        any coordinate is NaN or infinite.
    """
    # NOTE(review): the sorting promised above is not visible in this copy of
    # the function - confirm against the project source.
    pts = location._tuple_points()
    if not pts:
        return None
    X = np.array(pts).reshape((len(pts), len(pts[0])))
    # NaN or infinite coordinates abort the clustering altogether.
    if np.any(np.isnan(X)) or not np.all(np.isfinite(X)):
        return None
    # NOTE(review): the guard above already rejects NaN input, so with the
    # default Imputer settings there should be nothing left to impute here.
    X = Imputer().fit_transform(X)
    X = X.astype(np.float32)
    # Keyword arguments handed to AffinityPropagation below.
    # NOTE(review): the closing brace of this dict is missing from this copy
    # of the source.
    afkwargs = {
        'damping': 0.5,
        'convergence_iter': 15,
        'max_iter': 200,
        'copy': True,
        'preference': None,
        'affinity': 'euclidean',
        'verbose': False
    af = AffinityPropagation(**afkwargs).fit(X)
    # Not referenced again in the visible part of the function.
    cluster_centers_indices = af.cluster_centers_indices_
    clusters = []
    for cluster_id, cluster_centre in enumerate(af.cluster_centers_):
        locations = []
        for j, label in enumerate(af.labels_):
            if not label == cluster_id:
            # NOTE(review): the body of the `if` above and the statement that
            # fills `locations` are missing from this copy of the source.
        if not locations:
        # NOTE(review): the body of the `if` above and the statement that
        # appends to `clusters` are missing; as shown, the function cannot run.
    return clusters
项目:mlbootcamp_5    作者:ivan-filonov    | 项目源码 | 文件源码
def model(self):
        """Cross-validate a Keras model on the stored train/test data.

        Reads ``self.train_``, ``self.y_`` and ``self.test_``, drops their
        'id' column in place, preprocesses them with a scikit-learn
        pipeline and runs a stratified K-fold loop. Out-of-fold predictions
        are accumulated in ``self.v_['keras']`` and averaged test
        predictions in ``self.z_['keras']`` / ``self.z_['y']``.

        NOTE(review): several statements are truncated in this copy of the
        source (marked below), so the function cannot run as shown.
        """
        #cname = sys._getframe().f_code.co_name
        cname = 'keras'
        train, y, test = self.train_, self.y_, self.test_

        # Mutates the frames stored on self.
        train.drop('id', axis=1, inplace=True)
        test.drop('id', axis=1, inplace=True)

        from sklearn import pipeline
        # NOTE(review): the remaining pipeline steps and the closing
        # parenthesis are missing from this copy of the source.
        pipe = pipeline.make_pipeline(preprocessing.Imputer(),

        train = pipe.fit_transform(train)
        test = pipe.transform(test)

        self.input_dims_ = train.shape[1]
        def build_model():
            return self.build_keras_model()
        batch_size = self.batch_size_
        ss = model_selection.StratifiedKFold(n_splits = self.num_splits_,
                                             random_state = 11,
                                             shuffle = True)
        scores = list()
        model_path = self.temp_name('keras_mlp_weights')
        v, z = self.v_, self.z_
        # Start the accumulators for this model's predictions at zero.
        v[cname] = 0
        z[cname] = 0
        for n, (itrain, ival) in enumerate(ss.split(train, y)):
            xtrain, xval = train[itrain], train[ival]
            ytrain, yval = y[itrain], y[ival]
            model = build_model()
            # NOTE(review): the call that receives the arguments below
            # (presumably the model's training call) and its closing
            # parenthesis are missing from this copy of the source.
                    xtrain, ytrain,
                    batch_size = batch_size,
                    epochs = 10000,
                    validation_data = (xval, yval),
                    verbose = 0,
                    callbacks = build_keras_fit_callbacks(model_path),
                    shuffle = True
            p = model.predict(xval)
            v.loc[ival, cname] += p.ravel()
            score = metrics.log_loss(y[ival], p)
            # NaN is the only value that differs from itself.
            if score != score:
                raise Exception('NaN score!!!')
            # NOTE(review): this print call is truncated, and nothing visible
            # appends `score` to `scores`.
            print(cname, 'fold %d: '%(n+1), score,
            z[cname] += model.predict(test).ravel()
            # Release the model and collect every garbage-collector generation.
            del model
            for i in range(3): gc.collect(i)
        print('scores:', scores, np.mean(scores), np.std(scores))
        # Average the test predictions accumulated over the folds.
        z[cname] /= self.num_splits_
        z['y'] = z[cname]

        # NOTE(review): `cv` is not defined anywhere in the visible code.
        return cv, None
项目:sport-news-retrieval    作者:Andyccs    | 项目源码 | 文件源码
def gensim_classifier():
  """Train and evaluate a word2vec + one-vs-rest LinearSVC tweet classifier.

  Labelled tweets are tokenised on whitespace, a Word2Vec model is trained
  on them, every tweet is turned into an averaged feature vector, and a
  one-vs-rest LinearSVC is fitted on the training part. Predictions are
  written to data/w2v_linsvc.csv, the fitted classifier is stored in
  model/w2v_linsvc.pkl, and the result is passed to ``evaluate``.
  """
  logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
  label_list = get_labels()
  tweet_list = get_labelled_tweets()

  # split all sentences to list of words
  sentences = [tweet.split() for tweet in tweet_list]

  # parameters for model
  num_features = 100
  min_word_count = 1
  num_workers = 4
  context = 2
  downsampling = 1e-3

  # Initialize and train the model
  w2v_model = Word2Vec(sentences, workers=num_workers,
              size=num_features, min_count = min_word_count,
              window = context, sample = downsampling, seed=1)

  index_value, train_set, test_set = train_test_split(0.80, sentences)
  train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
  test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)
  # Learn the fill values on the training vectors only and reuse them for
  # the test vectors, so both are imputed consistently.
  imputer = Imputer()
  train_vector = imputer.fit_transform(train_vector)
  test_vector = imputer.transform(test_vector)

  # train model and predict
  model = LinearSVC()
  classifier_fitted = OneVsRestClassifier(model).fit(train_vector, label_list[:index_value])
  result = classifier_fitted.predict(test_vector)

  # output result to csv
  result.tofile("data/w2v_linsvc.csv", sep=',')

  # store the model to mmap-able files
  # The fitted one-vs-rest classifier is persisted: the wrapper trains
  # clones, so the bare `model` instance itself is never fitted.
  joblib.dump(classifier_fitted, 'model/%s.pkl' % 'w2v_linsvc')

  # evaluation
  label_score = classifier_fitted.decision_function(test_vector)
  binarise_result = label_binarize(result, classes=class_list)
  binarise_labels = label_binarize(label_list, classes=class_list)

  evaluate(binarise_result, binarise_labels[index_value:], label_score, 'w2v_linsvc')