The following 34 code examples, drawn from open-source Python projects, show how sklearn.feature_selection.SelectFromModel() is used in practice.
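Before the project code, here is a minimal sketch of the typical call pattern; the toy dataset, estimator choice, and threshold below are illustrative assumptions, not taken from any of the projects listed:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Toy data: 10 features, only 3 of them informative.
X, y = make_classification(n_samples=200, n_features=10,
                           n_informative=3, random_state=0)

# Fit the wrapped estimator and keep the features whose importance
# exceeds the mean importance.
selector = SelectFromModel(
    RandomForestClassifier(n_estimators=50, random_state=0),
    threshold='mean')
selector.fit(X, y)
X_reduced = selector.transform(X)          # columns above the threshold
print(selector.get_support(indices=True))  # indices of the kept features

SelectFromModel accepts any estimator that exposes coef_ or feature_importances_ after fitting, and the threshold may be a number or a string such as 'mean', 'median', or a scaled variant like '0.5*mean', as several of the examples below demonstrate.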
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15), threshold='20*mean'),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15), threshold='0.7*mean'),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'KeepAll': 'KeepAll'
        }
    }
    return model_map[type_of_estimator][model_name]
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15), threshold='20*mean'),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLogisticRegression(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15), threshold='0.7*mean'),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLasso(),
            'KeepAll': 'KeepAll'
        }
    }
    return model_map[type_of_estimator][model_name]
def fit(self, X, Y):
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.feature_selection import SelectFromModel

    num_features = X.shape[1]
    max_features = int(
        float(self.max_features) * (np.log(num_features) + 1))
    # Use at most half of the features
    max_features = max(1, min(int(X.shape[1] / 2), max_features))
    preprocessor = ExtraTreesRegressor(
        n_estimators=self.n_estimators, criterion=self.criterion,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        bootstrap=self.bootstrap,
        max_features=max_features,
        max_leaf_nodes=self.max_leaf_nodes,
        oob_score=self.oob_score,
        n_jobs=self.n_jobs,
        verbose=self.verbose,
        random_state=self.random_state)
    preprocessor.fit(X, Y)
    self.preprocessor = SelectFromModel(preprocessor, prefit=True)
    return self
def fit(self, X, Y, sample_weight=None):
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.feature_selection import SelectFromModel

    num_features = X.shape[1]
    max_features = int(
        float(self.max_features) * (np.log(num_features) + 1))
    # Use at most half of the features
    max_features = max(1, min(int(X.shape[1] / 2), max_features))
    preprocessor = ExtraTreesClassifier(
        n_estimators=self.n_estimators, criterion=self.criterion,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        bootstrap=self.bootstrap,
        max_features=max_features,
        max_leaf_nodes=self.max_leaf_nodes,
        oob_score=self.oob_score,
        n_jobs=self.n_jobs,
        verbose=self.verbose,
        random_state=self.random_state,
        class_weight=self.class_weight)
    preprocessor.fit(X, Y, sample_weight=sample_weight)
    self.preprocessor = SelectFromModel(preprocessor, prefit=True)
    return self
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print(name[i])
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print(name[i])
def analyseReasonWithTreeBaesd(anamolySample, normalSample):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)

    name = []
    for i in data.columns:
        name.append(i)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print(name[i])
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    return warnstr
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample, normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    return warnstr
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample, normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    print(warnstr)
    return warnstr
def __init__(self, type_of_estimator, column_descriptions, feature_selection_model='SelectFromModel'):
    self.column_descriptions = column_descriptions
    self.type_of_estimator = type_of_estimator
    self.feature_selection_model = feature_selection_model
def test():
    # Build random test data.
    np.random.seed(13)
    X = pd.DataFrame(np.random.randn(20, 10))
    X.columns = ['x%d' % i for i in range(10)]
    y = pd.Series(np.random.choice([0, 1], 20))

    # Wrap sklearn's univariate and model-based selectors in SklearnSelector.
    clf_sklearn = feature_selection.SelectKBest(feature_selection.f_classif, k=4)
    clf = SklearnSelector(estimator=clf_sklearn)
    clf.fit(X, y)
    clf.transform(X)
    print(clf.feature_selected)

    clf_sklearn = SelectFromModel(LogisticRegression())
    clf = SklearnSelector(estimator=clf_sklearn)
    clf.fit(X, y)
    clf.transform(X)
    print(clf.feature_selected)

    # Combine several selectors by voting.
    clf_selectkbest = feature_selection.SelectKBest(feature_selection.f_classif, k=4)
    clf_selectfrommodel = SelectFromModel(LogisticRegression())
    clf_baseselector = SklearnSelector(clf_selectkbest)
    clf = VotingSelector(selectors=[('clf_selectkbest', clf_selectkbest),
                                    ('clf_selectfrommodel', clf_selectfrommodel),
                                    ('clf_baseselector', clf_baseselector)],
                         threshold=0.5)
    clf.fit(X, y)
    clf.transform(X)
    print(clf.feature_selected)
    print(clf.df_voting)
    print(clf.score)
def fit(self, X, Y):
    import sklearn.svm
    from sklearn.feature_selection import SelectFromModel

    self.C = float(self.C)
    self.tol = float(self.tol)
    self.dual = self.dual == 'True'
    self.fit_intercept = self.fit_intercept == 'True'
    self.intercept_scaling = float(self.intercept_scaling)
    if self.class_weight == "None":
        self.class_weight = None

    preprocessor = sklearn.svm.LinearSVC(penalty=self.penalty,
                                         loss=self.loss,
                                         dual=self.dual,
                                         tol=self.tol,
                                         C=self.C,
                                         class_weight=self.class_weight,
                                         fit_intercept=self.fit_intercept,
                                         intercept_scaling=self.intercept_scaling,
                                         multi_class=self.multi_class,
                                         random_state=self.random_state)
    preprocessor.fit(X, Y)
    self.preprocessor = SelectFromModel(preprocessor, prefit=True)
    return self
def lgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # SelectFromModel
    clf = LGBMClassifier(n_estimators=400)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # Collect the importance score of every feature.
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of not-zero features:' + str(len(feature_score_dict) - m))

    # Sort features by importance, descending.
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1])
    print('\n')

    f = open('../eda/lgb_feature_importance.txt', 'w')
    f.write(str(th))
    f.write('\nRank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' +
                str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()

    # Keep the names of the features that survived the threshold.
    how_long = matrix_x.shape[1]  # number of columns kept in matrix_x
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    print('feature_chooesed:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    print('\n')

    f = open('../eda/lgb_feature_chose.txt', 'w')
    f.write('Feature Chose Name :\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()

    # Collect the features that were dropped.
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])

    return matrix_x, feature_not_used_name[:], len(feature_used_name)
def fit():
    X, y = generate()
    dX = dd.from_pandas(X, npartitions=10)
    y = dd.from_pandas(y, npartitions=10)
    pre_pipe = make_pipeline(
        CategoricalEncoder(),
        DummyEncoder(),
        Imputer(),
        SGDRegressor(),
    )
    pipe = make_pipeline(
        SelectFromModel(pre_pipe),
        GradientBoostingRegressor(),
    )
    # `clf` is undefined in the original snippet; an incremental
    # SGDRegressor is assumed here so that partial_fit below is valid.
    clf = SGDRegressor()
    X_ = pre_pipe.fit_transform(dX)
    for i in range(X_.npartitions):
        for j in range(5):
            print(i, j)
            X_sub = X_.get_partition(i).compute()
            y_sub = y.get_partition(i).compute()
            clf.partial_fit(X_sub, y_sub)
    sfm = SelectFromModel(clf, prefit=True)
    return pipe, clf, sfm
def featuresByInformationGain(features, labels):
    treeCL = tree.DecisionTreeClassifier(criterion="entropy")
    treeCL = treeCL.fit(features, labels)
    transformed_features = SelectFromModel(treeCL, prefit=True).transform(features)
    return transformed_features
def plot_feature_importances(columns, X_train, y_train):
    feat_labels = columns[1:]
    forest = RandomForestClassifier(n_estimators=10000, random_state=0)
    forest.fit(X_train, y_train)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]
    for f in range(X_train.shape[1]):
        print("%2d) %-*s %f" % (
            f + 1,
            30,
            feat_labels[indices[f]],
            importances[indices[f]],
        ))
    print()

    plt.title('Feature Importances')
    plt.bar(
        range(X_train.shape[1]),
        importances[indices],
        color='lightblue',
        align='center',
    )
    plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
    plt.xlim([-1, X_train.shape[1]])
    plt.show()

    feature_selector = SelectFromModel(forest, threshold=0.15, prefit=True)
    X_selected = feature_selector.transform(X_train)
    print(X_selected.shape)
def tree_based_selection(self, data_set, data_target, feature_names):
    """
    :param data_set:
    :return:
    """
    clf = ExtraTreesClassifier()
    clf = clf.fit(data_set, data_target)
    print(clf.feature_importances_)

    model = SelectFromModel(clf, prefit=True)
    feature_set = model.transform(data_set)

    fea_index = []
    for A_col in np.arange(data_set.shape[1]):
        for B_col in np.arange(feature_set.shape[1]):
            if (data_set[:, A_col] == feature_set[:, B_col]).all():
                fea_index.append(A_col)

    check = {}
    for i in fea_index:
        check[feature_names[i]] = data_set[0][i]
    print(np.array(check))

    return feature_set, fea_index
def test_SelectFromModel():
    '''
    test the method of SelectFromModel

    :return: None
    '''
    digits = load_digits()
    X = digits.data
    y = digits.target
    estimator = LinearSVC(penalty='l1', dual=False)
    selector = SelectFromModel(estimator=estimator, threshold='mean')
    selector.fit(X, y)
    selector.transform(X)
    print("Threshold %s" % selector.threshold_)
    print("Support is %s" % selector.get_support(indices=True))
def test_invalid_input():
    clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=None)
    for threshold in ["gobbledigook", ".5 * gobbledigook"]:
        model = SelectFromModel(clf, threshold=threshold)
        model.fit(data, y)
        assert_raises(ValueError, model.transform, data)
def test_input_estimator_unchanged():
    """
    Test that SelectFromModel fits on a clone of the estimator.
    """
    est = RandomForestClassifier()
    transformer = SelectFromModel(estimator=est)
    transformer.fit(data, y)
    assert_true(transformer.estimator is est)
def test_feature_importances():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    est = RandomForestClassifier(n_estimators=50, random_state=0)
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        transformer = SelectFromModel(estimator=est, threshold=threshold)
        transformer.fit(X, y)
        assert_true(hasattr(transformer.estimator_, 'feature_importances_'))

        X_new = transformer.transform(X)
        assert_less(X_new.shape[1], X.shape[1])
        importances = transformer.estimator_.feature_importances_

        feature_mask = np.abs(importances) > func(importances)
        assert_array_almost_equal(X_new, X[:, feature_mask])

    # Check with sample weights
    sample_weight = np.ones(y.shape)
    sample_weight[y == 1] *= 100

    est = RandomForestClassifier(n_estimators=50, random_state=0)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(X, y, sample_weight=sample_weight)
    importances = transformer.estimator_.feature_importances_
    transformer.fit(X, y, sample_weight=3 * sample_weight)
    importances_bis = transformer.estimator_.feature_importances_
    assert_almost_equal(importances, importances_bis)

    # For the Lasso and related models, the threshold defaults to 1e-5
    transformer = SelectFromModel(estimator=Lasso(alpha=0.1))
    transformer.fit(X, y)
    X_new = transformer.transform(X)
    mask = np.abs(transformer.estimator_.coef_) > 1e-5
    assert_array_equal(X_new, X[:, mask])
def test_partial_fit():
    est = PassiveAggressiveClassifier(random_state=0, shuffle=False)
    transformer = SelectFromModel(estimator=est)
    transformer.partial_fit(data, y, classes=np.unique(y))
    old_model = transformer.estimator_
    transformer.partial_fit(data, y, classes=np.unique(y))
    new_model = transformer.estimator_
    assert_true(old_model is new_model)

    X_transform = transformer.transform(data)
    transformer.fit(np.vstack((data, data)), np.concatenate((y, y)))
    assert_array_equal(X_transform, transformer.transform(data))
def test_warm_start():
    est = PassiveAggressiveClassifier(warm_start=True, random_state=0)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(data, y)
    old_model = transformer.estimator_
    transformer.fit(data, y)
    new_model = transformer.estimator_
    assert_true(old_model is new_model)
def test_threshold_string():
    est = RandomForestClassifier(n_estimators=50, random_state=0)
    model = SelectFromModel(est, threshold="0.5*mean")
    model.fit(data, y)
    X_transform = model.transform(data)

    # Calculate the threshold from the estimator directly.
    est.fit(data, y)
    threshold = 0.5 * np.mean(est.feature_importances_)
    mask = est.feature_importances_ > threshold
    assert_array_equal(X_transform, data[:, mask])
def test_threshold_without_refitting():
    """Test that the threshold can be set without refitting the model."""
    clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0)
    model = SelectFromModel(clf, threshold=0.1)
    model.fit(data, y)
    X_transform = model.transform(data)

    # Set a higher threshold to filter out more features.
    model.threshold = 1.0
    assert_greater(X_transform.shape[1], model.transform(data).shape[1])
def feature_selection(self):
    # Use .csv files instead of .mat:
    # vector = mat.loadmat('model\\vector.mat')
    # vector = vector['data']
    with open('model\\happy_other.csv', 'r') as f:
        reader = csv.reader(f)
        vector_happy = []
        for line in reader:
            for i in range(len(line) - 1):
                line[i] = float(line[i])
            vector_happy.append(line)
        vector_happy = np.array(vector_happy)
        print(vector_happy)
    with open('model\\normal_sad.csv', 'r') as f:
        reader = csv.reader(f)
        vector_sad = []
        for line in reader:
            for i in range(len(line) - 1):
                line[i] = float(line[i])
            vector_sad.append(line)
        vector_sad = np.array(vector_sad)

    self.train_vector_happy = vector_happy[:, 0:28]
    self.target_vector_happy = vector_happy[:, 28:29]
    self.train_vector_sad = vector_sad[:, 0:28]
    self.target_vector_sad = vector_sad[:, 28:29]

    clf = ExtraTreesClassifier()
    clf = clf.fit(self.train_vector_happy, self.target_vector_happy.ravel())
    model = SelectFromModel(clf, threshold='1.25*mean', prefit=True)
    joblib.dump(model, 'model\\vector_select.m')
    self.ex_vector_happy = model.transform(self.train_vector_happy)  # after extract
    print(self.ex_vector_happy)
    self.ex_vector_sad = model.transform(self.train_vector_sad)  # after extract
def fit(self, X, y=None):
    self.selector = get_feature_selection_model_from_name(self.type_of_estimator, self.feature_selection_model)

    if self.selector == 'KeepAll':
        if scipy.sparse.issparse(X):
            num_cols = X.shape[1]
        else:
            num_cols = len(X[0])
        self.support_mask = [True for col_idx in range(num_cols)]
    else:
        if self.feature_selection_model == 'SelectFromModel':
            num_cols = X.shape[1]
            num_rows = X.shape[0]
            if self.type_of_estimator == 'regressor':
                self.estimator = RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15)
            else:
                self.estimator = RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15)

            self.estimator.fit(X, y)
            feature_importances = self.estimator.feature_importances_

            # Two ways of doing feature selection
            # 1. Any feature with a feature importance of at least 1/100th of our max feature
            max_feature_importance = max(feature_importances)
            threshold_by_relative_importance = 0.01 * max_feature_importance

            # 2. 1/4 the number of rows (so 100 rows means 25 columns)
            sorted_importances = sorted(feature_importances, reverse=True)
            max_cols = int(num_rows * 0.25)
            try:
                threshold_by_max_cols = sorted_importances[max_cols]
            except IndexError:
                threshold_by_max_cols = sorted_importances[-1]

            threshold = max(threshold_by_relative_importance, threshold_by_max_cols)
            self.support_mask = [x > threshold for x in feature_importances]
        else:
            self.selector.fit(X, y)
            self.support_mask = self.selector.get_support()

    # Get a mask of which indices it is we want to keep
    self.index_mask = [idx for idx, val in enumerate(self.support_mask) if val]
    return self
def fit(self, X, y=None):
    print('Performing feature selection')
    self.selector = get_feature_selection_model_from_name(self.type_of_estimator, self.feature_selection_model)

    if self.selector == 'KeepAll':
        if scipy.sparse.issparse(X):
            num_cols = X.shape[1]
        else:
            num_cols = len(X[0])
        self.support_mask = [True for col_idx in range(num_cols)]
    else:
        if self.feature_selection_model == 'SelectFromModel':
            num_cols = X.shape[1]
            num_rows = X.shape[0]
            if self.type_of_estimator == 'regressor':
                self.estimator = RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15)
            else:
                self.estimator = RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15)

            self.estimator.fit(X, y)
            feature_importances = self.estimator.feature_importances_

            # Two ways of doing feature selection
            # 1. Any feature with a feature importance of at least 1/100th of our max feature
            max_feature_importance = max(feature_importances)
            threshold_by_relative_importance = 0.01 * max_feature_importance

            # 2. 1/4 the number of rows (so 100 rows means 25 columns)
            sorted_importances = sorted(feature_importances, reverse=True)
            max_cols = int(num_rows * 0.25)
            try:
                threshold_by_max_cols = sorted_importances[max_cols]
            except IndexError:
                threshold_by_max_cols = sorted_importances[-1]

            threshold = max(threshold_by_relative_importance, threshold_by_max_cols)
            self.support_mask = [x > threshold for x in feature_importances]
        else:
            self.selector.fit(X, y)
            self.support_mask = self.selector.get_support()

    # Get a mask of which indices it is we want to keep
    self.index_mask = [idx for idx, val in enumerate(self.support_mask) if val]
    return self
def train_sentence_classifier(self, pairtype):
    self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(7,20), min_df=0.2, max_df=0.5)),
                              #('vect', CountVectorizer(analyzer='word', ngram_range=(1,5), stop_words="english", min_df=0.1)),
                              #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                              #('tfidf', TfidfVectorizer(analyzer='char_wb', ngram_range=(6,20))),
                              #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.01, n_iter=5, random_state=42)),
                              #('clf', SGDClassifier())
                              #('clf', svm.SVC(kernel='rbf', C=10, verbose=True, tol=1e-5))
                              #('clf', RandomForestClassifier(n_estimators=10))
                              #('feature_selection', feature_selection.SelectFromModel(LinearSVC(penalty="l1"))),
                              ('clf', MultinomialNB(alpha=0.1, fit_prior=False))
                              #('clf', DummyClassifier(strategy="constant", constant=True))
                             ])
    f, labels, sids = self.get_features(pairtype)
    half_point = int(len(f) * 0.5)
    self.train_sentences = sids[:half_point]
    """ch2 = SelectKBest(chi2, k=20)
    X_train = text_clf.named_steps["vect"].fit_transform(f[:half_point])
    X_test = text_clf.named_steps["vect"].transform(f[half_point:])
    X_train = ch2.fit_transform(X_train, labels[:half_point])
    X_test = ch2.transform(X_test)
    feature_names = text_clf.named_steps["vect"].get_feature_names()
    feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
    # print feature_names"""

    # train
    text_clf = self.text_clf.fit(f[:half_point], labels[:half_point])

    # save model
    if not os.path.exists("models/kernel_models/" + pairtype + "_sentence_classifier/"):
        os.makedirs("models/kernel_models/" + pairtype + "_sentence_classifier/")
    logging.info("Training complete, saving to {}/{}/{}.pkl".format("models/kernel_models/", pairtype + "_sentence_classifier/", pairtype))
    joblib.dump(text_clf, "{}/{}/{}.pkl".format("models/kernel_models/", pairtype + "_sentence_classifier/", pairtype))

    # evaluate
    pred = text_clf.predict(f[half_point:])
    # print len(pred), sum(pred)
    self.type_sentences[pairtype] = []
    for ip, p in enumerate(pred):
        if p:
            self.type_sentences[pairtype].append(sids[half_point + ip])

    res = metrics.confusion_matrix(labels[half_point:], pred)
    return res[1][1], res[0][1], res[1][0]