Python sklearn.feature_selection module: VarianceThreshold() usage examples

We extracted the following 17 code examples from open-source Python projects to illustrate how sklearn.feature_selection.VarianceThreshold() is used in practice.
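Before the project examples, here is a minimal self-contained sketch of the basic API on toy data: VarianceThreshold computes per-feature variances on fit and removes every feature whose variance does not exceed the threshold (default 0.0).

import numpy as np
from sklearn.feature_selection import VarianceThreshold

# Toy matrix: the first column is constant, so its variance is 0.
X = np.array([[0, 2, 1],
              [0, 1, 4],
              [0, 3, 1]])

selector = VarianceThreshold(threshold=0.0)   # keep features with variance > 0.0
X_reduced = selector.fit_transform(X)

print(selector.variances_)                 # [0. 0.667 2.] (approximately)
print(selector.get_support(indices=True))  # [1 2]
print(X_reduced)                           # X without the constant first column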

Project: MENGEL | Author: CodeSpaceHQ
def format_selector(selector, data, target):
    x_train, x_test, y_train, y_test = data_splitting.get_train_test(data, target)

    # Fit the model
    data.drop(target, axis=1, inplace=True)  # Remove target feature
    selector.fit(x_train, y_train)

    # Retain the feature names
    indices = selector.get_support(indices=True)  # Indices of the non-removed features
    features = [column for column in data.columns[indices] if column != target]

    # Transform, format, and return
    selected = pd.DataFrame(selector.transform(data))
    selected.columns = features
    return selected


# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html
Project: ML-note | Author: JasonK93
def test_VarianceThreshold():
    '''
    Test the VarianceThreshold method.
    :return: None
    '''
    X = [[100, 1, 2, 3],
         [100, 4, 5, 6],
         [100, 7, 8, 9],
         [101, 11, 12, 13]]
    selector = VarianceThreshold(1)
    selector.fit(X)
    print("Variances are %s" % selector.variances_)
    print("After transform is %s" % selector.transform(X))
    print("The support is %s" % selector.get_support(True))
    print("After reverse transform is %s" %
          selector.inverse_transform(selector.transform(X)))
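For reference, the per-column population variances of this X are [0.1875, 13.6875, 13.6875, 13.6875], so with a threshold of 1 only the first column is dropped: the support is [1 2 3], and inverse_transform restores the removed column as zeros.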
Project: kdd99-scikit | Author: PENGZhaoqing
def feature_selection(self, data_set):
    """
    Drop features whose variance is below .5 * (1 - .5), i.e. Boolean
    features that take the same value in more than half of the samples.
    :param data_set: feature matrix as a 2-D numpy array
    :return: data_set, unchanged (the retained attributes are only printed)
    """
    sel = VarianceThreshold(threshold=(.5 * (1 - .5)))
    feature_set = sel.fit_transform(data_set)

    # Recover the original index of each retained column by matching it
    # against the columns of the transformed matrix.
    fea_index = []
    for A_col in np.arange(data_set.shape[1]):
        for B_col in np.arange(feature_set.shape[1]):
            if (data_set[:, A_col] == feature_set[:, B_col]).all():
                fea_index.append(A_col)

    # attr_list (defined elsewhere in the project) maps indices to names.
    check = {}
    for i in fea_index:
        check[attr_list[i]] = data_set[0][i]
    print(check)

    return data_set
Project: kdd99-scikit | Author: PENGZhaoqing
def feature_selection(self, data_set, feature_names):
    """
    Drop features whose variance is below .8 * (1 - .8), i.e. Boolean
    features that take the same value in more than 80% of the samples.
    :param data_set: feature matrix as a 2-D numpy array
    :param feature_names: column names, parallel to data_set
    :return: the reduced feature matrix and the retained column indices
    """
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    feature_set = sel.fit_transform(data_set)

    # Recover the original index of each retained column by matching it
    # against the columns of the transformed matrix.
    fea_index = []
    for A_col in np.arange(data_set.shape[1]):
        for B_col in np.arange(feature_set.shape[1]):
            if (data_set[:, A_col] == feature_set[:, B_col]).all():
                fea_index.append(A_col)

    check = {}
    for i in fea_index:
        check[feature_names[i]] = data_set[0][i]
    print(check)

    return feature_set, fea_index
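Both kdd99-scikit snippets reconstruct the indices of the retained columns with a quadratic column-matching loop, which can also mis-attribute duplicated columns. The selector already records which columns it kept, so a shorter equivalent (a sketch reusing the same names) is:

from sklearn.feature_selection import VarianceThreshold

sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
feature_set = sel.fit_transform(data_set)   # data_set as in the snippets above
fea_index = sel.get_support(indices=True)   # original indices of the retained columns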
Project: rdocChallenge | Author: Elyne
def varianceFilter(train_data, train_classes, threshold):
    '''
    Variance filter: vectorize the dict features, drop those whose variance
    is below threshold * (1 - threshold), and map the result back to dicts.
    '''
    #if True:
    #    return frequencyFilter(train_data, train_classes, threshold)
    vectorizer = DictVectorizer()
    # Fit and transform the train data.
    x_train = vectorizer.fit_transform(train_data)
    #y_train = train_classes

    sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    x_new = sel.fit_transform(x_train)
    return vectorizer.inverse_transform(sel.inverse_transform(x_new))
Project: dmon-adp | Author: igabriel85
def varianceSelection(self, df, threshold=.8):
    if not isinstance(df, pandas.core.frame.DataFrame):
        logger.error('[%s] : [ERROR] Variance selection only possible on a DataFrame, not %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(df))
        sys.exit(1)
    sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    sel.fit(df)  # only the support mask is needed, not the transformed array
    return df[[c for (s, c) in zip(sel.get_support(), df.columns.values) if s]]
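A quick usage sketch with made-up data; `engine` stands in for whatever object carries this method. With threshold=0.1 the cut-off is .1 * (1 - .1) = 0.09, so only columns whose variance exceeds 0.09 survive:

import pandas as pd

df = pd.DataFrame({'const': [1, 1, 1, 1],          # variance 0.0     -> dropped
                   'load':  [0.1, 0.9, 0.2, 0.8],  # variance 0.125   -> kept
                   'mem':   [0.5, 0.5, 0.5, 0.6]}) # variance ~0.002  -> dropped
reduced = engine.varianceSelection(df, threshold=0.1)
print(reduced.columns.values)  # ['load']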
Project: SecuML | Author: ANSSI-FR
def __init__(self, conf):
    UnsupervisedFeatureSelection.__init__(self, conf)
    self.projection = VarianceThreshold()
Project: SecuML | Author: ANSSI-FR
def createPipeline(self):
    # Remove features with null variance
    self.var_filter = VarianceThreshold()
    self.pipeline = Pipeline([
        ('var_filter', self.var_filter),
        ('projection', self.projection)])
Project: MENGEL | Author: CodeSpaceHQ
def variance_threshold_selector(data, target):

    # Select model: the threshold defaults to 0.0, i.e. only remove features
    # with the same value in all samples
    selector = VarianceThreshold(0)

    # Fit, format, and return
    return format_selector(selector, data, target)


# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html
Project: hyperparam-search-guides | Author: wenyangfu
def drop_features(X_train, X_test):
    # Drop some features from the get-go. No idea how these were found.
    X_train = X_train.drop(['F6', 'F26'], axis=1)
    X_test = X_test.drop(['F6', 'F26'], axis=1)

    # Drop additional low-variance features. This *may* be overfitting to the
    # test data, since the hyperparameters are different for train/test.
    X_train = VarianceThreshold(1.3).fit_transform(X_train)
    X_test = VarianceThreshold(1.25).fit_transform(X_test)

    return X_train, X_test
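Fitting two separate selectors with different thresholds, as above, can leave train and test with different column sets. A more conventional variant (a sketch, not this repository's code) fits a single selector on the training data and applies the same support mask to both:

from sklearn.feature_selection import VarianceThreshold

def drop_features_consistent(X_train, X_test, threshold=1.3):
    # Drop the hand-picked columns first, as in the original.
    X_train = X_train.drop(['F6', 'F26'], axis=1)
    X_test = X_test.drop(['F6', 'F26'], axis=1)

    # Fit on the training data only, then apply the same mask to both sets.
    selector = VarianceThreshold(threshold).fit(X_train)
    return selector.transform(X_train), selector.transform(X_test)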
Project: Machine-learning-for-cybersecurity | Author: Logo252
def remove_unneeded_features(samples):
    """
    Removes features that have the same value in all given data samples.
    :param samples: data samples as a pandas DataFrame
    :return: samples with the constant features removed
    """
    selector = VarianceThreshold()
    transformed = selector.fit_transform(samples)
    # Indices of the retained (non-constant) features
    features = selector.get_support(indices=True)

    # Names of the retained features
    feature_names = [samples.columns[i] for i in features]

    return pd.DataFrame(transformed, columns=feature_names)
Project: scienceie17 | Author: OC-ScienceIE
def fit_classifier(feat_dicts=None, y_true=None, weights=None):
    # clf = MultinomialNB()
    clf = LogisticRegression(class_weight='balanced')

    pipeline = Pipeline([
        ('vectorizer', DictVectorizer()),
        ('selection', VarianceThreshold()),
        ('classifier', clf)
    ])

    # cf. http://stackoverflow.com/questions/36205850/sklearn-pipeline-applying-sample-weights-after-applying-a-polynomial-feature-t
    pipeline.fit(feat_dicts, y_true, classifier__sample_weight=weights)

    return pipeline
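A usage sketch with made-up feature dicts and labels; DictVectorizer turns the dicts into a sparse matrix before the variance filter and the classifier run:

feat_dicts = [{'len': 3, 'cap': 1}, {'len': 7, 'cap': 0},
              {'len': 2, 'cap': 1}, {'len': 9, 'cap': 0}]
y_true = ['key', 'other', 'key', 'other']
weights = [1.0, 1.0, 2.0, 1.0]

pipeline = fit_classifier(feat_dicts, y_true, weights)
print(pipeline.predict([{'len': 4, 'cap': 1}]))  # e.g. ['key']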
Project: clue-hackathon | Author: adrinjalali
def run():
    data = load_binary()

    # Extract features
    user_feat_matrix = process_level2(data)  # X

    del user_feat_matrix['X']['user_id']
    X = user_feat_matrix['X'].values
    X[np.isnan(X)] = 0
    Y = user_feat_matrix['Y']
    Y.fillna(0, inplace=True)
    del user_feat_matrix['X_all']['user_id']
    X_all = user_feat_matrix['X_all'].values
    X_all[np.isnan(X_all)] = 0

    cols = list(Y.columns.values)
    symptoms = ['happy', 'pms', 'sad', 'sensitive_emotion', 'energized', 'exhausted',
                'high_energy', 'low_energy', 'cramps', 'headache', 'ovulation_pain',
                'tender_breasts', 'acne_skin', 'good_skin', 'oily_skin', 'dry_skin']
    with open("result.txt", 'w') as f:
        f.write("user_id,day_in_cycle,symptom,probability\n")

    for symptom in symptoms:
        print(symptom)
        # Columns of Y that belong to the current symptom.
        s_Y = Y[[x for x in cols if x[1] == symptom]]

        pipeline = Pipeline([
            ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
            #('standard_scale', StandardScaler()),
            ('estimator', Lasso()),
        ])

        param_grid = {'estimator__alpha': [.1, .3, .5, .7, .8]}
        model = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=4,
                             verbose=2)
        model.fit(X, s_Y.values)

        print("dumping...")
        data_dir = 'data'
        cycles0 = pd.read_csv(join(data_dir, 'cycles0.csv'))
        c_length = {k: v for k, v in zip(cycles0.user_id.values, cycles0.expected_cycle_length)}
        dump(symptom, model, X_all, c_length, data['users'].user_id)
Project: Parallel-SGD | Author: angadgill
def test_zero_variance():
    # Test VarianceThreshold with default setting, zero variance.

    for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]:
        sel = VarianceThreshold().fit(X)
        assert_array_equal([0, 1, 3, 4], sel.get_support(indices=True))

    assert_raises(ValueError, VarianceThreshold().fit, [[0, 1, 2, 3]])
    assert_raises(ValueError, VarianceThreshold().fit, [[0, 1], [0, 1]])
Project: Parallel-SGD | Author: angadgill
def test_variance_threshold():
    # Test VarianceThreshold with custom variance.
    for X in [data, csr_matrix(data)]:
        X = VarianceThreshold(threshold=.4).fit_transform(X)
        assert_equal((len(data), 1), X.shape)
Project: AlphaPy | Author: ScottFreeLLC
def remove_lv_features(model, X):
    r"""Remove low-variance features.

    Parameters
    ----------
    model : alphapy.Model
        Model specifications for removing features.
    X : numpy array
        The feature matrix.

    Returns
    -------
    X_reduced : numpy array
        The reduced feature matrix.

    References
    ----------
    You can find more information on low-variance feature selection here [LV]_.

    .. [LV] http://scikit-learn.org/stable/modules/feature_selection.html#variance-threshold

    """

    logger.info("Removing Low-Variance Features")

    # Extract model parameters

    lv_remove = model.specs['lv_remove']
    lv_threshold = model.specs['lv_threshold']
    predict_mode = model.specs['predict_mode']

    # Remove low-variance features

    if lv_remove:
        logger.info("Low-Variance Threshold  : %.2f", lv_threshold)
        logger.info("Original Feature Count  : %d", X.shape[1])
        if not predict_mode:
            selector = VarianceThreshold(threshold=lv_threshold)
            selector.fit(X)
            support = selector.get_support()
            model.feature_map['lv_support'] = support
        else:
            support = model.feature_map['lv_support']
        X_reduced = X[:, support]
        logger.info("Reduced Feature Count   : %d", X_reduced.shape[1])
    else:
        X_reduced = X
        logger.info("Skipping Low-Variance Features")

    return X_reduced
Project: clue-hackathon | Author: adrinjalali
def run():
    data = load_binary()

    # Extract features
    user_feat_matrix = process_level2(data)  # X

    del user_feat_matrix['X']['user_id']
    X = user_feat_matrix['X'].values
    X[np.isnan(X)] = 0
    Y = user_feat_matrix['Y']
    Y.fillna(0, inplace=True)
    del user_feat_matrix['X_all']['user_id']
    X_all = user_feat_matrix['X_all'].values
    X_all[np.isnan(X_all)] = 0

    cols = list(Y.columns.values)
    symptoms = ['happy', 'pms', 'sad', 'sensitive_emotion', 'energized', 'exhausted',
                'high_energy', 'low_energy', 'cramps', 'headache', 'ovulation_pain',
                'tender_breasts', 'acne_skin', 'good_skin', 'oily_skin', 'dry_skin']
    with open("result.txt", 'w') as f:
        f.write("user_id,day_in_cycle,symptom,probability\n")

    # final_labels comes from the clustering step defined elsewhere in the project.
    labels = final_labels['labels']

    for symptom in symptoms:
        print(symptom)
        s_Y = Y[[x for x in cols if x[1] == symptom]]

        pipeline = Pipeline([
            ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
            #('standard_scale', StandardScaler()),
            ('estimator', Lasso()),
        ])

        for cluster in range(3):  # number of clusters
            print(cluster)

            param_grid = {'estimator__alpha': [.1, .3, .5, .7, .8]}
            model = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=4,
                                 verbose=2)
            model.fit(X[labels == cluster], s_Y.values[labels == cluster])

            print("dumping...")
            data_dir = 'data'
            cycles0 = pd.read_csv(join(data_dir, 'cycles0.csv'))
            c_length = {k: v for k, v in zip(cycles0.user_id.values[labels == cluster],
                                             cycles0.expected_cycle_length[labels == cluster])}
            dump(symptom, model, X_all[labels == cluster], c_length,
                 data['users'].user_id[labels == cluster])