Python sklearn.feature_selection module: SelectFromModel() example source code

The following 34 code examples, extracted from open-source Python projects, illustrate how to use sklearn.feature_selection.SelectFromModel().
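
Before the project excerpts, here is a minimal, self-contained sketch of the pattern they all share: fit an estimator that exposes feature_importances_ or coef_, wrap it in SelectFromModel, and transform the data to keep only the important columns. The synthetic dataset, the RandomForestClassifier estimator, and the "mean" threshold below are illustrative choices, not taken from any particular project in this list.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Illustrative data and estimator; any model exposing feature_importances_ or coef_ works.
X, y = make_classification(n_samples=200, n_features=20, n_informative=5, random_state=0)
estimator = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)

# prefit=True reuses the already-fitted estimator; threshold="mean" keeps the features
# whose importance is above the mean importance.
selector = SelectFromModel(estimator, threshold="mean", prefit=True)
X_selected = selector.transform(X)

print(X.shape, "->", X_selected.shape)      # fewer columns after selection
print(selector.get_support(indices=True))   # indices of the retained features

Each excerpt below is a variant of this pattern, differing mainly in the estimator, the threshold, and how the selected features are reported.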

Project: auto_ml    Author: ClimbsRocks    | Project source | File source
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15), threshold='20*mean'),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15), threshold='0.7*mean'),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'KeepAll': 'KeepAll'
        }
    }

    return model_map[type_of_estimator][model_name]
Project: auto_ml    Author: doordash    | Project source | File source
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15), threshold='20*mean'),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLogisticRegression(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15), threshold='0.7*mean'),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLasso(),
            'KeepAll': 'KeepAll'
        }
    }

    return model_map[type_of_estimator][model_name]
Project: AutoML-Challenge    Author: postech-mlg-exbrain    | Project source | File source
def fit(self, X, Y):
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.feature_selection import SelectFromModel

        num_features = X.shape[1]
        max_features = int(
            float(self.max_features) * (np.log(num_features) + 1))
        # Use at most half of the features
        max_features = max(1, min(int(X.shape[1] / 2), max_features))
        preprocessor = ExtraTreesRegressor(
            n_estimators=self.n_estimators, criterion=self.criterion,
            max_depth=self.max_depth, min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap,
            max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose,
            random_state=self.random_state)
        preprocessor.fit(X, Y)
        self.preprocessor = SelectFromModel(preprocessor, prefit=True)

        return self
Project: AutoML-Challenge    Author: postech-mlg-exbrain    | Project source | File source
def fit(self, X, Y, sample_weight=None):
        from sklearn.ensemble import ExtraTreesClassifier
        from sklearn.feature_selection import SelectFromModel

        num_features = X.shape[1]
        max_features = int(
            float(self.max_features) * (np.log(num_features) + 1))
        # Use at most half of the features
        max_features = max(1, min(int(X.shape[1] / 2), max_features))
        preprocessor = ExtraTreesClassifier(
            n_estimators=self.n_estimators, criterion=self.criterion,
            max_depth=self.max_depth, min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap,
            max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose,
            random_state=self.random_state, class_weight=self.class_weight
        )
        preprocessor.fit(X, Y, sample_weight=sample_weight)
        self.preprocessor = SelectFromModel(preprocessor, prefit=True)
        return self
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample,normalSample,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)


    clf = ExtraTreesClassifier()
    clf = clf.fit(data,target)   
    model = SelectFromModel(clf,prefit=True) 
    outcome = model.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print(name[i])
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample,normalSample,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)


    clf = ExtraTreesClassifier()
    clf = clf.fit(data,target)   
    model = SelectFromModel(clf,prefit=True) 
    outcome = model.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print(name[i])
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample,normalSample):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)
    name = []
    for i in data.columns:
        name.append(i)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data,target)   
    model = SelectFromModel(clf,prefit=True) 
    outcome = model.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print(name[i])
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample,normalSample,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)


    clf = ExtraTreesClassifier()
    clf = clf.fit(data,target)   
    model = SelectFromModel(clf,prefit=True) 
    outcome = model.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print(name[i])
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()

    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += "   ;   "
    return warnstr
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample,normalSample,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data,target)   
    model = SelectFromModel(clf,prefit=True) 
    outcome = model.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print(name[i])
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample,normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()

    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += "   ;   "
    return warnstr
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample, normalSample, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample,normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data, target)
    model = SelectFromModel(clf, prefit=True)
    outcome = model.get_support()

    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += "   ;   "
    print(warnstr)
    return warnstr
Project: onlineDetectForHadoop    Author: DawnsonLi    | Project source | File source
def analyseReasonWithTreeBaesd(anamolySample,normalSample,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(data,target)   
    model = SelectFromModel(clf,prefit=True) 
    outcome = model.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print(name[i])
Project: auto_ml    Author: doordash    | Project source | File source
def __init__(self, type_of_estimator, column_descriptions, feature_selection_model='SelectFromModel'):

        self.column_descriptions = column_descriptions
        self.type_of_estimator = type_of_estimator
        self.feature_selection_model = feature_selection_model
Project: ModelFlow    Author: yuezPrincetechs    | Project source | File source
def test():
    # build a small random test dataset
    np.random.seed(13)
    X=pd.DataFrame(np.random.randn(20,10))
    X.columns=['x%d'%i for i in range(10)]
    y=pd.Series(np.random.choice([0,1],20))

    # wrap sklearn's built-in feature selectors with SklearnSelector
    clf_sklearn=feature_selection.SelectKBest(feature_selection.f_classif,k=4)
    clf=SklearnSelector(estimator=clf_sklearn)
    clf.fit(X,y)
    clf.transform(X)
    print(clf.feature_selected)

    clf_sklearn=SelectFromModel(LogisticRegression())
    clf=SklearnSelector(estimator=clf_sklearn)
    clf.fit(X,y)
    clf.transform(X)
    print(clf.feature_selected)

    # voting-based selection: combine several selectors and keep features chosen by a majority
    clf_selectkbest=feature_selection.SelectKBest(feature_selection.f_classif,k=4)
    clf_selectfrommodel=SelectFromModel(LogisticRegression())
    clf_baseselector=SklearnSelector(clf_selectkbest)
    clf=VotingSelector(selectors=[('clf_selectkbest',clf_selectkbest),
                                  ('clf_selectfrommodel',clf_selectfrommodel),
                                  ('clf_baseselector',clf_baseselector)],threshold=0.5)
    clf.fit(X,y)
    clf.transform(X)
    print(clf.feature_selected)
    print(clf.df_voting)
    print(clf.score)
Project: AutoML-Challenge    Author: postech-mlg-exbrain    | Project source | File source
def fit(self, X, Y):
        import sklearn.svm
        from sklearn.feature_selection import SelectFromModel

        self.C = float(self.C)
        self.tol = float(self.tol)

        self.dual = self.dual == 'True'
        self.fit_intercept = self.fit_intercept == 'True'
        self.intercept_scaling = float(self.intercept_scaling)

        if self.class_weight == "None":
            self.class_weight = None

        preprocessor = sklearn.svm.LinearSVC(penalty=self.penalty,
                                             loss=self.loss,
                                             dual=self.dual,
                                             tol=self.tol,
                                             C=self.C,
                                             class_weight=self.class_weight,
                                             fit_intercept=self.fit_intercept,
                                             intercept_scaling=self.intercept_scaling,
                                             multi_class=self.multi_class,
                                             random_state=self.random_state)
        preprocessor.fit(X, Y)
        self.preprocessor = SelectFromModel(preprocessor, prefit=True)
        return self
Project: QH_FInSight    Author: yzkang    | Project source | File source
def lgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # SelectfromModel
    clf = LGBMClassifier(n_estimators=400)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # map each feature name to its importance and count how many have non-zero importance
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of non-zero features: ' + str(len(feature_score_dict) - m))

    # sort features by importance in descending order
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1])
    print('\n')

    f = open('../eda/lgb_feature_importance.txt', 'w')
    f.write(str(th))  # threshold may be numeric, so cast to str before writing
    f.write('\nRank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()

    # record the names of the features actually kept by SelectFromModel
    how_long = matrix_x.shape[1]  # matrix_x is the feature matrix after selection
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    print('features chosen:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    print('\n')

    f = open('../eda/lgb_feature_chose.txt', 'w')
    f.write('Feature Chose Name :\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()

    # collect the feature names that were not selected
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])

    return matrix_x, feature_not_used_name[:], len(feature_used_name)
Project: sktransformers    Author: TomAugspurger    | Project source | File source
def fit():
    X, y = generate()
    dX = dd.from_pandas(X, npartitions=10)
    y = dd.from_pandas(y, npartitions=10)

    # Preprocessing pipeline; the SGDRegressor is kept separate as clf so it can be
    # fit incrementally with partial_fit on each dask partition below.
    pre_pipe = make_pipeline(
        CategoricalEncoder(),
        DummyEncoder(),
        Imputer(),
    )
    clf = SGDRegressor()

    pipe = make_pipeline(
        SelectFromModel(clf),
        GradientBoostingRegressor(),
    )
    X_ = pre_pipe.fit_transform(dX)

    for i in range(X_.npartitions):
        for j in range(5):
            print(i, j)
            X_sub = X_.get_partition(i).compute()
            y_sub = y.get_partition(i).compute()
            clf.partial_fit(X_sub, y_sub)

    sfm = SelectFromModel(clf, prefit=True)
    return pipe, clf, sfm
Project: auto_ml    Author: ClimbsRocks    | Project source | File source
def __init__(self, type_of_estimator, column_descriptions, feature_selection_model='SelectFromModel'):

        self.column_descriptions = column_descriptions
        self.type_of_estimator = type_of_estimator
        self.feature_selection_model = feature_selection_model
Project: DocumentClassification    Author: bahmanh    | Project source | File source
def featuresByInformationGain(features,labels):
    treeCL = tree.DecisionTreeClassifier(criterion="entropy")
    treeCL = treeCL.fit(features,labels)
    transformed_features = SelectFromModel(treeCL,prefit=True).transform(features)
    return transformed_features
Project: python-machine-learning-book    Author: jeremyn    | Project source | File source
def plot_feature_importances(columns, X_train, y_train):
    feat_labels = columns[1:]

    forest = RandomForestClassifier(n_estimators=10000, random_state=0)

    forest.fit(X_train, y_train)
    importances = forest.feature_importances_

    indices = np.argsort(importances)[::-1]

    for f in range(X_train.shape[1]):
        print("%2d) %-*s %f" % (
            f+1,
            30,
            feat_labels[indices[f]],
            importances[indices[f]],
        ))
    print()

    plt.title('Feature Importances')
    plt.bar(
        range(X_train.shape[1]),
        importances[indices],
        color='lightblue',
        align='center',
    )
    plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
    plt.xlim([-1, X_train.shape[1]])
    plt.show()

    feature_selector = SelectFromModel(forest, threshold=0.15, prefit=True)
    X_selected = feature_selector.transform(X_train)
    print(X_selected.shape)
Project: kdd99-scikit    Author: PENGZhaoqing    | Project source | File source
def tree_based_selection(self, data_set, data_target, feature_names):
        """

        :param data_set:
        :return:
        """

        clf = ExtraTreesClassifier()
        clf = clf.fit(data_set, data_target)
        print(clf.feature_importances_)

        model = SelectFromModel(clf, prefit=True)
        feature_set = model.transform(data_set)

        fea_index = []
        for A_col in np.arange(data_set.shape[1]):
            for B_col in np.arange(feature_set.shape[1]):
                if (data_set[:, A_col] == feature_set[:, B_col]).all():
                    fea_index.append(A_col)

        check = {}
        for i in fea_index:
            check[feature_names[i]] = data_set[0][i]
        print(np.array(check))

        return feature_set, fea_index
Project: ML-note    Author: JasonK93    | Project source | File source
def test_SelectFromModel():
    '''
    test the method of SelectFromModel
    :return: None
    '''
    digits=load_digits()
    X=digits.data
    y=digits.target
    estimator=LinearSVC(penalty='l1',dual=False)
    selector=SelectFromModel(estimator=estimator,threshold='mean')
    selector.fit(X,y)
    selector.transform(X)
    print("Threshold %s"%selector.threshold_)
    print("Support is %s"%selector.get_support(indices=True))
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_invalid_input():
    clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=None)
    for threshold in ["gobbledigook", ".5 * gobbledigook"]:
        model = SelectFromModel(clf, threshold=threshold)
        model.fit(data, y)
        assert_raises(ValueError, model.transform, data)
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_input_estimator_unchanged():
    """
    Test that SelectFromModel fits on a clone of the estimator.
    """
    est = RandomForestClassifier()
    transformer = SelectFromModel(estimator=est)
    transformer.fit(data, y)
    assert_true(transformer.estimator is est)
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_feature_importances():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    est = RandomForestClassifier(n_estimators=50, random_state=0)
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        transformer = SelectFromModel(estimator=est, threshold=threshold)
        transformer.fit(X, y)
        assert_true(hasattr(transformer.estimator_, 'feature_importances_'))

        X_new = transformer.transform(X)
        assert_less(X_new.shape[1], X.shape[1])
        importances = transformer.estimator_.feature_importances_

        feature_mask = np.abs(importances) > func(importances)
        assert_array_almost_equal(X_new, X[:, feature_mask])

    # Check with sample weights
    sample_weight = np.ones(y.shape)
    sample_weight[y == 1] *= 100

    est = RandomForestClassifier(n_estimators=50, random_state=0)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(X, y, sample_weight=sample_weight)
    importances = transformer.estimator_.feature_importances_
    transformer.fit(X, y, sample_weight=3 * sample_weight)
    importances_bis = transformer.estimator_.feature_importances_
    assert_almost_equal(importances, importances_bis)

    # For the Lasso and related models, the threshold defaults to 1e-5
    transformer = SelectFromModel(estimator=Lasso(alpha=0.1))
    transformer.fit(X, y)
    X_new = transformer.transform(X)
    mask = np.abs(transformer.estimator_.coef_) > 1e-5
    assert_array_equal(X_new, X[:, mask])
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_partial_fit():
    est = PassiveAggressiveClassifier(random_state=0, shuffle=False)
    transformer = SelectFromModel(estimator=est)
    transformer.partial_fit(data, y,
                            classes=np.unique(y))
    old_model = transformer.estimator_
    transformer.partial_fit(data, y,
                            classes=np.unique(y))
    new_model = transformer.estimator_
    assert_true(old_model is new_model)

    X_transform = transformer.transform(data)
    transformer.fit(np.vstack((data, data)), np.concatenate((y, y)))
    assert_array_equal(X_transform, transformer.transform(data))
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_warm_start():
    est = PassiveAggressiveClassifier(warm_start=True, random_state=0)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(data, y)
    old_model = transformer.estimator_
    transformer.fit(data, y)
    new_model = transformer.estimator_
    assert_true(old_model is new_model)
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_threshold_string():
    est = RandomForestClassifier(n_estimators=50, random_state=0)
    model = SelectFromModel(est, threshold="0.5*mean")
    model.fit(data, y)
    X_transform = model.transform(data)

    # Calculate the threshold from the estimator directly.
    est.fit(data, y)
    threshold = 0.5 * np.mean(est.feature_importances_)
    mask = est.feature_importances_ > threshold
    assert_array_equal(X_transform, data[:, mask])
Project: Parallel-SGD    Author: angadgill    | Project source | File source
def test_threshold_without_refitting():
    """Test that the threshold can be set without refitting the model."""
    clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0)
    model = SelectFromModel(clf, threshold=0.1)
    model.fit(data, y)
    X_transform = model.transform(data)

    # Set a higher threshold to filter out more features.
    model.threshold = 1.0
    assert_greater(X_transform.shape[1], model.transform(data).shape[1])
Project: emotion-recognition    Author: yinxiaojian    | Project source | File source
def feature_selection(self):
        # use .csv files instead of .mat
        # vector = mat.loadmat('model\\vector.mat')
        # vector = vector['data']

        with open('model\\happy_other.csv', 'r') as f:
            reader = csv.reader(f)
            vector_happy = []
            for line in reader:
                for i in range(len(line) - 1):
                    line[i] = float(line[i])
                vector_happy.append(line)
        vector_happy = np.array(vector_happy)
        print(vector_happy)
        with open('model\\normal_sad.csv', 'r') as f:
            reader = csv.reader(f)
            vector_sad = []
            for line in reader:
                for i in range(len(line) - 1):
                    line[i] = float(line[i])
                vector_sad.append(line)
        vector_sad = np.array(vector_sad)

        self.train_vector_happy = vector_happy[:, 0:28]
        self.target_vector_happy = vector_happy[:, 28:29]
        self.train_vector_sad = vector_sad[:, 0:28]
        self.target_vector_sad = vector_sad[:, 28:29]

        clf = ExtraTreesClassifier()
        clf = clf.fit(self.train_vector_happy, self.target_vector_happy.ravel())
        model = SelectFromModel(clf, threshold='1.25*mean', prefit=True)
        joblib.dump(model, 'model\\vector_select.m')

        self.ex_vector_happy = model.transform(self.train_vector_happy)   # after extract
        print(self.ex_vector_happy)
        self.ex_vector_sad = model.transform(self.train_vector_sad)  # after extract
Project: auto_ml    Author: doordash    | Project source | File source
def fit(self, X, y=None):


        self.selector = get_feature_selection_model_from_name(self.type_of_estimator, self.feature_selection_model)

        if self.selector == 'KeepAll':
            if scipy.sparse.issparse(X):
                num_cols = X.shape[1]
            else:
                num_cols = len(X[0])

            self.support_mask = [True for col_idx in range(num_cols) ]
        else:
            if self.feature_selection_model == 'SelectFromModel':
                num_cols = X.shape[1]
                num_rows = X.shape[0]
                if self.type_of_estimator == 'regressor':
                    self.estimator = RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15)
                else:
                    self.estimator = RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15)

                self.estimator.fit(X, y)

                feature_importances = self.estimator.feature_importances_

                # Two ways of doing feature selection

                # 1. Any feature with a feature importance of at least 1/100th of our max feature
                max_feature_importance = max(feature_importances)
                threshold_by_relative_importance = 0.01 * max_feature_importance

                # 2. 1/4 the number of rows (so 100 rows means 25 columns)
                sorted_importances = sorted(feature_importances, reverse=True)
                max_cols = int(num_rows * 0.25)
                try:
                    threshold_by_max_cols = sorted_importances[max_cols]
                except IndexError:
                    threshold_by_max_cols = sorted_importances[-1]

                threshold = max(threshold_by_relative_importance, threshold_by_max_cols)
                self.support_mask = [True if x > threshold else False for x in feature_importances]

            else:
                self.selector.fit(X, y)
                self.support_mask = self.selector.get_support()

        # Get a mask of which indices it is we want to keep
        self.index_mask = [idx for idx, val in enumerate(self.support_mask) if val == True]
        return self
Project: auto_ml    Author: ClimbsRocks    | Project source | File source
def fit(self, X, y=None):
        print('Performing feature selection')


        self.selector = get_feature_selection_model_from_name(self.type_of_estimator, self.feature_selection_model)

        if self.selector == 'KeepAll':
            if scipy.sparse.issparse(X):
                num_cols = X.shape[1]
            else:
                num_cols = len(X[0])

            self.support_mask = [True for col_idx in range(num_cols) ]
        else:
            if self.feature_selection_model == 'SelectFromModel':
                num_cols = X.shape[1]
                num_rows = X.shape[0]
                if self.type_of_estimator == 'regressor':
                    self.estimator = RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15)
                else:
                    self.estimator = RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15)

                self.estimator.fit(X, y)

                feature_importances = self.estimator.feature_importances_

                # Two ways of doing feature selection

                # 1. Any feature with a feature importance of at least 1/100th of our max feature
                max_feature_importance = max(feature_importances)
                threshold_by_relative_importance = 0.01 * max_feature_importance

                # 2. 1/4 the number of rows (so 100 rows means 25 columns)
                sorted_importances = sorted(feature_importances, reverse=True)
                max_cols = int(num_rows * 0.25)
                try:
                    threshold_by_max_cols = sorted_importances[max_cols]
                except IndexError:
                    threshold_by_max_cols = sorted_importances[-1]

                threshold = max(threshold_by_relative_importance, threshold_by_max_cols)
                self.support_mask = [True if x > threshold else False for x in feature_importances]

            else:
                self.selector.fit(X, y)
                self.support_mask = self.selector.get_support()

        # Get a mask of which indices it is we want to keep
        self.index_mask = [idx for idx, val in enumerate(self.support_mask) if val == True]
        return self
Project: IBRel    Author: lasigeBioTM    | Project source | File source
def train_sentence_classifier(self, pairtype):
        self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(7,20), min_df=0.2, max_df=0.5)),
                             #('vect', CountVectorizer(analyzer='word', ngram_range=(1,5), stop_words="english", min_df=0.1)),
                             #     ('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                                  #('tfidf', TfidfVectorizer(analyzer='char_wb', ngram_range=(6,20))),
                                  #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.01, n_iter=5, random_state=42)),
                                  #('clf', SGDClassifier())
                                  #('clf', svm.SVC(kernel='rbf', C=10, verbose=True, tol=1e-5))
                                  #('clf', RandomForestClassifier(n_estimators=10))
                                    #('feature_selection', feature_selection.SelectFromModel(LinearSVC(penalty="l1"))),
                                  ('clf', MultinomialNB(alpha=0.1, fit_prior=False))
                                  #('clf', DummyClassifier(strategy="constant", constant=True))
                                 ])
        f, labels, sids = self.get_features(pairtype)
        half_point = int(len(f)*0.5)
        self.train_sentences = sids[:half_point]
        """ch2 = SelectKBest(chi2, k=20)
        X_train = text_clf.named_steps["vect"].fit_transform(f[:half_point])
        X_test = text_clf.named_steps["vect"].transform(f[half_point:])
        X_train = ch2.fit_transform(X_train, labels[:half_point])
        X_test = ch2.transform(X_test)
        feature_names = text_clf.named_steps["vect"].get_feature_names()
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
        # print feature_names"""
        # train
        text_clf = self.text_clf.fit(f[:half_point], labels[:half_point])

        #save model
        if not os.path.exists("models/kernel_models/" + pairtype + "_sentence_classifier/"):
            os.makedirs("models/kernel_models/" + pairtype + "_sentence_classifier/")
        logging.info("Training complete, saving to {}/{}/{}.pkl".format("models/kernel_models/",
                                                                        pairtype + "_sentence_classifier/", pairtype))
        joblib.dump(text_clf, "{}/{}/{}.pkl".format("models/kernel_models/",
                                                                        pairtype + "_sentence_classifier/", pairtype))

        # evaluate
        pred = text_clf.predict(f[half_point:])
        # print len(pred), sum(pred)
        self.type_sentences[pairtype] = []
        for ip, p in enumerate(pred):
            if p:
                self.type_sentences[pairtype].append(sids[half_point + ip])

        res = metrics.confusion_matrix(labels[half_point:], pred)
        return res[1][1], res[0][1], res[1][0]