The following 10 code examples, extracted from open-source Python projects, illustrate how to use sklearn.feature_selection.RFE.
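Before the project examples, a minimal self-contained sketch of the basic RFE workflow may help: fit a selector, then inspect support_ and ranking_. The estimator and the n_features_to_select value here are illustrative choices, not taken from any of the projects below.

from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# illustrative setup: rank the iris features with a linear model and keep the top 2
X, y = load_iris(return_X_y=True)
selector = RFE(estimator=LogisticRegression(max_iter=1000),
               n_features_to_select=2, step=1)
selector.fit(X, y)
print(selector.support_)   # boolean mask over the input features
print(selector.ranking_)   # rank 1 marks a selected feature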
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFE

def greedy_elim(df):
    # do feature selection using RFE (the original comment said Boruta, but RFE is what is used)
    X = df[[x for x in df.columns if x != 'SalePrice']]
    y = df['SalePrice']
    # model = RandomForestRegressor(n_estimators=50)
    model = GradientBoostingRegressor(n_estimators=50, learning_rate=0.05)
    # 150 features seems to be the best at the moment; why that is remains unclear
    feat_selector = RFE(estimator=model, step=1, n_features_to_select=150)
    # find all relevant features; .values replaces the deprecated DataFrame.as_matrix()
    feat_selector.fit_transform(X.values, y.values)
    # check selected features
    features_bool = np.array(feat_selector.support_)
    features = np.array(X.columns)
    result = features[features_bool]
    # print(result)
    # check ranking of features
    features_rank = feat_selector.ranking_
    # print(features_rank)
    rank = features_rank[features_bool]
    # print(rank)
    return result
from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE
# train_test_split now lives in sklearn.model_selection; sklearn.cross_validation was removed
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

def test_compare_with_no_feature_selection():
    '''
    Compare classifier performance before and after feature selection.
    :return: None
    '''
    iris = load_iris()
    X, y = iris.data, iris.target
    estimator = LinearSVC()
    selector = RFE(estimator=estimator, n_features_to_select=2)
    X_t = selector.fit_transform(X, y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0, stratify=y)
    X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(
        X_t, y, test_size=0.25, random_state=0, stratify=y)
    clf = LinearSVC()
    clf_t = LinearSVC()
    clf.fit(X_train, y_train)
    clf_t.fit(X_train_t, y_train_t)
    print("Original DataSet: test score=%s" % (clf.score(X_test, y_test)))
    print("Selected DataSet: test score=%s" % (clf_t.score(X_test_t, y_test_t)))
def recurvise_index(self, clf):
    # rank all features, i.e. continue the elimination until only one is left
    rfe = RFE(clf, n_features_to_select=1)
    rfe.fit(self.features, self.labels)
    # map each feature name to its recursive-elimination rank
    rfedict = {k: v for k, v in zip(self.features.columns.tolist(),
                                    map(lambda x: round(x, 4), rfe.ranking_))}
    return rfedict
from sklearn.feature_selection import RFE
# RandomForestClassifierWithCoef and the `verbose` flag come from the surrounding project

def rfe_selection(X, y, n_features):
    """
    Performs the Recursive Feature Elimination method and selects the
    top-ranked features.

    Keyword arguments:
    X -- the feature vectors
    y -- the target vector
    n_features -- number of best-ranked features to keep
    """
    if verbose:
        print('\nPerforming Feature Selection based on the Recursive Feature Elimination method ...')
    clf = RandomForestClassifierWithCoef(n_estimators=10, n_jobs=-1)
    # n_features_to_select must be passed by keyword in current scikit-learn
    fs = RFE(clf, n_features_to_select=n_features, step=1)
    fs = fs.fit(X, y)
    ranks = fs.ranking_
    feature_indexes = []
    for i in range(len(ranks)):  # range replaces the Python 2-only xrange
        if ranks[i] == 1:
            feature_indexes += [i]
    # return the selected features and their original column indexes
    return X[:, feature_indexes[0:n_features]], feature_indexes[0:n_features]
def __init__(self, conf):
    SemiSupervisedFeatureSelection.__init__(self, conf)
    self.projection = RFE(estimator=conf.model,
                          n_features_to_select=conf.num_components,
                          step=conf.step)
import pickle
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle
# ml_feature_name is a module from the surrounding project

def featureRank(useFeature, trueSet, falseSet):
    # load data and split
    X_true = []
    for dn in trueSet:
        fin = open("./learn/data/" + useFeature + "_" + dn + ".pkl", "rb")
        X_true.append(pickle.load(fin))
        fin.close()
    X_true = np.vstack(X_true)
    print(X_true.shape)
    X_false = []
    for dn in falseSet:
        fin = open("./learn/data/" + useFeature + "_" + dn + ".pkl", "rb")
        X_false.append(pickle.load(fin))
        fin.close()
    X_false = np.vstack(X_false)
    print(X_false.shape)
    test_size = 0.3
    X_true_train, X_true_test = train_test_split(X_true, test_size=test_size)
    X_false_train, X_false_test = train_test_split(
        X_false, train_size=len(X_true_train), test_size=len(X_true_test))
    X = np.vstack([X_true_train, X_false_train])
    X_ = np.vstack([X_true_test, X_false_test])
    Y = [1] * len(X_true_train) + [0] * len(X_false_train)
    Y_ = [1] * len(X_true_test) + [0] * len(X_false_test)
    X, Y = shuffle(X, Y)
    X_, Y_ = shuffle(X_, Y_)
    featNames = ml_feature_name.getFeatureName(useFeature)
    clf = LinearSVC(C=0.1)
    # rank every feature by eliminating down to a single one
    rfe = RFE(estimator=clf, n_features_to_select=1, step=1)
    rfe.fit(X, Y)
    ranks = rfe.ranking_
    if useFeature == "rp":
        fout = open("./learn/feature/rp_feature_rank.txt", "w")
        for i, r in enumerate(ranks):
            fout.write("{0} {1}\n".format(i, r))
        fout.close()
    rankFeat = list(zip(ranks, featNames))
    rankFeat.sort()
    for rf in rankFeat:
        if useFeature in ["tfidf_1gram", "tfidf_2gram", "tfidf_3gram", "tfidf_4gram"]:
            if ml_feature_name.isDiatonic(rf[1]):
                print(rf)
        else:
            print(rf)
from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC

def test_RFE():
    '''
    Test RFE, reducing the number of features to 2.
    :return: None
    '''
    iris = load_iris()
    X = iris.data
    y = iris.target
    estimator = LinearSVC()
    selector = RFE(estimator=estimator, n_features_to_select=2)
    selector.fit(X, y)
    print("N_features %s" % selector.n_features_)
    print("Support is %s" % selector.support_)
    print("Ranking %s" % selector.ranking_)
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

def sk_feature_ref():
    # load the iris dataset
    dataset = datasets.load_iris()
    # create a base classifier used to evaluate a subset of attributes
    model_lr = LogisticRegression()
    # create the RFE model and select 3 attributes
    rfe = RFE(model_lr, n_features_to_select=3)
    rfe = rfe.fit(dataset.data, dataset.target)
    # summarize the selection of the attributes
    print(rfe.support_)   # [False True True True]
    print(rfe.ranking_)   # [2 1 1 1]
    print(sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), dataset.feature_names)))
    # [(1.0, 'petal length (cm)'), (1.0, 'petal width (cm)'),
    #  (1.0, 'sepal width (cm)'), (2.0, 'sepal length (cm)')]
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# get_dummy_data comes from the surrounding project

def sk_feature_ref_v2():
    X, Y = get_dummy_data()
    names = ['f1', 'f2', 'f3']
    model_lr = LogisticRegression()
    rfe = RFE(model_lr, n_features_to_select=2)
    rfe = rfe.fit(X, Y)
    print(rfe.support_)
    print(rfe.ranking_)
    print(sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names)))
def recursive_feature_elimination(self, nfeat=None, step=1, inplace=False):
    """A method to implement recursive feature elimination on the model.

    Note that no cross-validation is performed in this function. Features
    are eliminated (the number removed per iteration is set by the step
    parameter) until the specified number of features remains.

    Parameters
    ----------
    nfeat : int or None, default=None
        The number of top features to select. If None, half of the
        features are selected.
    step : int or float, default=1
        If int, step is the number of features to remove at each
        iteration. If float and within (0.0, 1.0), step is the
        percentage (rounded down) of features to remove at each
        iteration. If float and greater than one, only its integer
        part is used.
    inplace : bool, default=False
        If True, the predictors of the class are replaced by those
        selected by the RFE procedure.

    Returns
    -------
    selected : pandas Series with the selected features as index and
        their selection rank as values.
    """
    rfe = RFE(self.alg, n_features_to_select=nfeat, step=step)
    rfe.fit(
        self.datablock.train[self.predictors],
        self.datablock.train[self.datablock.target]
    )
    # keep only the features RFE retained, with their elimination rank
    ranks = pd.Series(rfe.ranking_, index=self.predictors)
    selected = ranks.loc[rfe.support_]
    if inplace:
        self.set_predictors(selected.index.tolist())
    return selected