The following 18 code examples, extracted from open-source Python projects, illustrate how to use sklearn.feature_selection.VarianceThreshold().
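Before the project excerpts, here is a minimal, self-contained sketch of the core VarianceThreshold workflow (the toy matrix X is invented for illustration): construct the selector with a threshold, fit it, then inspect variances_ and get_support() or call transform().

from sklearn.feature_selection import VarianceThreshold

# Toy data, invented for illustration: columns 0 and 3 are constant.
X = [[0, 2, 0, 3],
     [0, 1, 4, 3],
     [0, 1, 1, 3]]

selector = VarianceThreshold(threshold=0.0)  # drop features with variance <= 0.0
X_new = selector.fit_transform(X)

print(selector.variances_)                  # per-feature variances
print(selector.get_support(indices=True))   # indices of the kept features: [1 2]
print(X_new)                                # only the two non-constant columns remain

A note on the `threshold * (1 - threshold)` pattern that recurs in several examples below: it comes from the scikit-learn user guide. For a Bernoulli (0/1) feature the variance is p(1 - p), so a threshold of, say, .8 * (1 - .8) removes binary features that take the same value in more than 80% of the samples.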
def format_selector(selector, data, target):
    x_train, x_test, y_train, y_test = data_splitting.get_train_test(data, target)
    # Fit the model
    data.drop(target, axis=1, inplace=True)  # Remove target feature
    selector.fit(x_train, y_train)
    # Retain the feature names
    features = selector.get_support(indices=True)  # Returns array of indexes of non-removed features
    features = [column for column in data[features] if column != target]  # Gets feature names
    # Transform, Format, Return
    selector = pd.DataFrame(selector.transform(data))
    selector.columns = features
    return selector
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html
def test_VarianceThreshold():
    '''
    test the method of VarianceThreshold

    :return: None
    '''
    X = [[100, 1, 2, 3],
         [100, 4, 5, 6],
         [100, 7, 8, 9],
         [101, 11, 12, 13]]
    selector = VarianceThreshold(1)
    selector.fit(X)
    print("Variances is %s" % selector.variances_)
    print("After transform is %s" % selector.transform(X))
    print("The support is %s" % selector.get_support(True))
    print("After reverse transform is %s" %
          selector.inverse_transform(selector.transform(X)))
def feature_selection(self, data_set):
    """
    :param data_set:
    :return:
    """
    sel = VarianceThreshold(threshold=(.5 * (1 - .5)))
    feature_set = sel.fit_transform(data_set)
    # Recover the original column indices of the retained features
    fea_index = []
    for A_col in np.arange(data_set.shape[1]):
        for B_col in np.arange(feature_set.shape[1]):
            if (data_set[:, A_col] == feature_set[:, B_col]).all():
                fea_index.append(A_col)
    check = {}
    for i in fea_index:
        # attr_list is defined elsewhere in the source module
        check[attr_list[i]] = data_set[0][i]
    print(check)
    return data_set
def feature_selection(self, data_set, feature_names):
    """
    :param data_set:
    :return:
    """
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    feature_set = sel.fit_transform(data_set)
    # Recover the original column indices of the retained features
    fea_index = []
    for A_col in np.arange(data_set.shape[1]):
        for B_col in np.arange(feature_set.shape[1]):
            if (data_set[:, A_col] == feature_set[:, B_col]).all():
                fea_index.append(A_col)
    check = {}
    for i in fea_index:
        check[feature_names[i]] = data_set[0][i]
    print(np.array(check))
    return feature_set, fea_index
def varianceFilter(train_data, train_classes, threshold):
    #if True:
    #    return frequencyFilter(train_data, train_classes, threshold)
    '''
    Variance filter
    '''
    vectorizer = DictVectorizer()
    # Fit and transform the train data.
    x_train = vectorizer.fit_transform(train_data)
    #y_train = train_classes
    sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    x_new = sel.fit_transform(x_train)
    return vectorizer.inverse_transform(sel.inverse_transform(x_new))
def varianceSelection(self, df, threshold=.8):
    if not isinstance(df, pandas.core.frame.DataFrame):
        logger.error('[%s] : [ERROR] Variance selection only possible on Dataframe not %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(df))
        sys.exit(1)
    sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    sel.fit_transform(df)
    # Keep only the columns whose support mask is True
    return df[[c for (s, c) in zip(sel.get_support(), df.columns.values) if s]]
def __init__(self, conf):
    UnsupervisedFeatureSelection.__init__(self, conf)
    self.projection = VarianceThreshold()
def createPipeline(self):
    # Remove features with null variance
    self.var_filter = VarianceThreshold()
    self.pipeline = Pipeline([
        ('var_filter', self.var_filter),
        ('projection', self.projection)])
def variance_threshold_selector(data, target):
    # Select Model
    selector = VarianceThreshold(0)  # Defaults to 0.0, i.e. only remove features with the same value in all samples
    # Fit, Format, and Return
    return format_selector(selector, data, target)
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html
def drop_features(X_train, X_test):
    # Drop some features from the get-go. No idea how these were found.
    X_train = X_train.drop(['F6', 'F26'], axis=1)
    X_test = X_test.drop(['F6', 'F26'], axis=1)
    # Drop additional low-variance features. This *may* be overfitting to the
    # test data, since the hyperparameters are different for train/test.
    X_train = VarianceThreshold(1.3).fit_transform(X_train)
    X_test = VarianceThreshold(1.25).fit_transform(X_test)
    return X_train, X_test
def remove_unneeded_features(samples):
    """
    Removes features that have the same value in given data samples.

    :param samples: data samples
    :return: samples with updated features
    """
    selector = VarianceThreshold()
    selector.fit(samples)
    # Array of integers corresponding to non-removed features
    features = selector.get_support(indices=True)
    # Array of all non-removed feature names
    feature_names = [column for column in samples[features]]
    return pd.DataFrame(selector.transform(samples), columns=feature_names)
def fit_classifier(feat_dicts=None, y_true=None, weights=None):
    # clf = MultinomialNB()
    clf = LogisticRegression(class_weight='balanced')
    pipeline = Pipeline([
        ('vectorizer', DictVectorizer()),
        ('selection', VarianceThreshold()),
        ('classifier', clf)
    ])
    # cf. http://stackoverflow.com/questions/36205850/sklearn-pipeline-applying-sample-weights-after-applying-a-polynomial-feature-t
    pipeline.fit(feat_dicts, y_true, **{'classifier__sample_weight': weights})
    return pipeline
def run():
    data = load_binary()

    # Extract features
    user_feat_matrix = process_level2(data)  # X

    del user_feat_matrix['X']['user_id']
    X = user_feat_matrix['X'].values
    X[np.isnan(X)] = 0

    Y = user_feat_matrix['Y']
    Y.fillna(0, inplace=True)

    del user_feat_matrix['X_all']['user_id']
    X_all = user_feat_matrix['X_all'].values
    X_all[np.isnan(X_all)] = 0

    cols = list(Y.columns.values)
    symptoms = ['happy', 'pms', 'sad', 'sensitive_emotion', 'energized',
                'exhausted', 'high_energy', 'low_energy', 'cramps', 'headache',
                'ovulation_pain', 'tender_breasts', 'acne_skin', 'good_skin',
                'oily_skin', 'dry_skin']

    with open("result.txt", 'w') as f:
        f.write("user_id,day_in_cycle,symptom,probability\n")

    for symptom in symptoms:
        print(symptom)
        s_Y = Y[[x for x in cols if x[1] == symptom]]  # columns for this symptom

        pipeline = Pipeline([
            ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
            #('standard_scale', StandardScaler()),
            ('estimator', Lasso()),
        ])

        param_grid = {'estimator__alpha': [.1, .3, .5, .7, .8]}
        model = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=4, verbose=2)
        model.fit(X, s_Y.values)

        print("dumping...")
        data_dir = 'data'
        cycles0 = pd.read_csv(join(data_dir, 'cycles0.csv'))
        c_length = {k: v for k, v in zip(cycles0.user_id.values,
                                         cycles0.expected_cycle_length)}
        dump(symptom, model, X_all, c_length, data['users'].user_id)
def test_zero_variance():
    # Test VarianceThreshold with default setting, zero variance.
    for X in [data, csr_matrix(data), csc_matrix(data), bsr_matrix(data)]:
        sel = VarianceThreshold().fit(X)
        assert_array_equal([0, 1, 3, 4], sel.get_support(indices=True))

    assert_raises(ValueError, VarianceThreshold().fit, [[0, 1, 2, 3]])
    assert_raises(ValueError, VarianceThreshold().fit, [[0, 1], [0, 1]])
def test_variance_threshold():
    # Test VarianceThreshold with custom variance.
    for X in [data, csr_matrix(data)]:
        X = VarianceThreshold(threshold=.4).fit_transform(X)
        assert_equal((len(data), 1), X.shape)
def remove_lv_features(model, X):
    r"""Remove low-variance features.

    Parameters
    ----------
    model : alphapy.Model
        Model specifications for removing features.
    X : numpy array
        The feature matrix.

    Returns
    -------
    X_reduced : numpy array
        The reduced feature matrix.

    References
    ----------
    You can find more information on low-variance feature selection here [LV]_.

    .. [LV] http://scikit-learn.org/stable/modules/feature_selection.html#variance-threshold

    """
    logger.info("Removing Low-Variance Features")

    # Extract model parameters
    lv_remove = model.specs['lv_remove']
    lv_threshold = model.specs['lv_threshold']
    predict_mode = model.specs['predict_mode']

    # Remove low-variance features
    if lv_remove:
        logger.info("Low-Variance Threshold : %.2f", lv_threshold)
        logger.info("Original Feature Count : %d", X.shape[1])
        if not predict_mode:
            selector = VarianceThreshold(threshold=lv_threshold)
            selector.fit(X)
            support = selector.get_support()
            model.feature_map['lv_support'] = support
        else:
            support = model.feature_map['lv_support']
        X_reduced = X[:, support]
        logger.info("Reduced Feature Count : %d", X_reduced.shape[1])
    else:
        X_reduced = X
        logger.info("Skipping Low-Variance Features")

    return X_reduced
def run():
    data = load_binary()

    # Extract features
    user_feat_matrix = process_level2(data)  # X

    del user_feat_matrix['X']['user_id']
    X = user_feat_matrix['X'].values
    X[np.isnan(X)] = 0

    Y = user_feat_matrix['Y']
    Y.fillna(0, inplace=True)

    del user_feat_matrix['X_all']['user_id']
    X_all = user_feat_matrix['X_all'].values
    X_all[np.isnan(X_all)] = 0

    cols = list(Y.columns.values)
    symptoms = ['happy', 'pms', 'sad', 'sensitive_emotion', 'energized',
                'exhausted', 'high_energy', 'low_energy', 'cramps', 'headache',
                'ovulation_pain', 'tender_breasts', 'acne_skin', 'good_skin',
                'oily_skin', 'dry_skin']

    with open("result.txt", 'w') as f:
        f.write("user_id,day_in_cycle,symptom,probability\n")

    labels = final_labels['labels']
    for symptom in symptoms:
        print(symptom)
        s_Y = Y[[x for x in cols if x[1] == symptom]]

        pipeline = Pipeline([
            ('remove_low_variance_features', VarianceThreshold(threshold=0.0)),
            #('standard_scale', StandardScaler()),
            ('estimator', Lasso()),
        ])

        for cluster in range(3):  # number of clusters
            print(cluster)
            param_grid = {'estimator__alpha': [.1, .3, .5, .7, .8]}
            model = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=4, verbose=2)
            model.fit(X[labels == cluster], s_Y.values[labels == cluster])

            print("dumping...")
            data_dir = 'data'
            cycles0 = pd.read_csv(join(data_dir, 'cycles0.csv'))
            c_length = {k: v for k, v in zip(cycles0.user_id.values[labels == cluster],
                                             cycles0.expected_cycle_length[labels == cluster])}
            dump(symptom, model, X_all[labels == cluster], c_length,
                 data['users'].user_id[labels == cluster])