The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.feature_selection.SelectKBest().
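Before the project examples, here is a minimal, self-contained sketch of the typical SelectKBest pattern; the synthetic dataset, score function, and value of k below are illustrative choices, not taken from any of the projects:

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif

# Synthetic data: 100 samples, 10 features, binary labels
X, y = make_classification(n_samples=100, n_features=10, random_state=0)

# Keep the 4 features with the highest ANOVA F-scores
selector = SelectKBest(score_func=f_classif, k=4)
X_reduced = selector.fit_transform(X, y)

print(X_reduced.shape)          # (100, 4)
print(selector.get_support())   # boolean mask of the selected columns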
def analyseReasonWithXsqure(anamolySample,normalSample,topk,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)
    name = []
    for i in data.columns:
        name.append(i)
    X_new = SelectKBest(chi2, topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print name[i]
def test_SelectKBest():
    '''
    test the method of SelectKBest

    :return: None
    '''
    X=[ [1,2,3,4,5],
        [5,4,3,2,1],
        [3,3,3,3,3,],
        [1,1,1,1,1] ]
    y=[0,1,0,1]
    print("before transform:",X)
    selector=SelectKBest(score_func=f_classif,k=3)
    selector.fit(X,y)
    print("scores_:",selector.scores_)
    print("pvalues_:",selector.pvalues_)
    print("selected index:",selector.get_support(True))
    print("after transform:",selector.transform(X))
def analyseReasonWithXsqure(anamolySample,normalSample,topk):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)
    name = []
    for i in data.columns:
        name.append(i)
    X_new = SelectKBest(chi2, topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print name[i]
def select_kbest_reg(data_frame, target, k=5):
    """
    Selecting K-Best features for regression
    :param data_frame: A pandas dataFrame with the training data
    :param target: target variable name in DataFrame
    :param k: desired number of features from the data
    :returns feature_scores: scores for each feature in the data as pandas DataFrame
    """
    feat_selector = SelectKBest(f_regression, k=k)
    _ = feat_selector.fit(data_frame.drop(target, axis=1), data_frame[target])

    feat_scores = pd.DataFrame()
    feat_scores["F Score"] = feat_selector.scores_
    feat_scores["P Value"] = feat_selector.pvalues_
    feat_scores["Support"] = feat_selector.get_support()
    feat_scores["Attribute"] = data_frame.drop(target, axis=1).columns

    return feat_scores
def test_build_param_grid_set_estimator():
    clf1 = SVC()
    clf2 = LogisticRegression()
    clf3 = SVC()
    clf4 = SGDClassifier()
    estimator = set_grid(Pipeline([('sel', set_grid(SelectKBest(), k=[2, 3])),
                                   ('clf', None)]),
                         clf=[set_grid(clf1, kernel=['linear']),
                              clf2,
                              set_grid(clf3, kernel=['poly'], degree=[2, 3]),
                              clf4])
    param_grid = [{'clf': [clf1], 'clf__kernel': ['linear'], 'sel__k': [2, 3]},
                  {'clf': [clf3], 'clf__kernel': ['poly'],
                   'clf__degree': [2, 3], 'sel__k': [2, 3]},
                  {'clf': [clf2, clf4], 'sel__k': [2, 3]}]
    assert build_param_grid(estimator) == param_grid
def select_percentile_selector(data,target):
    # Select Model
    selector = SelectPercentile(percentile = 75)  # Default is 10%
    # Fit, Format, and Return
    return format_selector(selector, data, target)

# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
def de_c2(X,y):
    """ chi2 """
    dim = X.shape[1]
    de = min(2000,dim)
    clf = SelectKBest(chi2, k = de)
    clf.fit(X,y)
    def _func(X1,X2):
        return clf.transform(X1), clf.transform(X2)
    return _func

# def de_mic(X,y):
#     """ MIC """
#     dim = X.shape[1]
#     de = min(2000,dim)
#     clf = SelectKBest(MIC, k=de)
#     clf.fit(X,y)
#     def _func(X1,X2):
#         return clf.transform(X1),clf.transform(X2)
#     return _func
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample, normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)
    X_new = SelectKBest(chi2, topk).fit(data, target)
    outcome = X_new.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    print 'x2:',warnstr
    return warnstr
def select_kbest_clf(data_frame, target, k=4):
    """
    Selecting K-Best features for classification
    :param data_frame: A pandas dataFrame with the training data
    :param target: target variable name in DataFrame
    :param k: desired number of features from the data
    :returns feature_scores: scores for each feature in the data as pandas DataFrame
    """
    feat_selector = SelectKBest(f_classif, k=k)
    _ = feat_selector.fit(data_frame.drop(target, axis=1), data_frame[target])

    feat_scores = pd.DataFrame()
    feat_scores["F Score"] = feat_selector.scores_
    feat_scores["P Value"] = feat_selector.pvalues_
    feat_scores["Support"] = feat_selector.get_support()
    feat_scores["Attribute"] = data_frame.drop(target, axis=1).columns

    return feat_scores
def reduceDimensionality(X, y, method="selectkbest", targetDim=10): """ Reduces the dimensionality of [X] to [targetDim] """ try: # Check for the required methodology first if method.lower() == "selectkbest": prettyPrint("Selecting %s best features from dataset" % targetDim, "debug") kBestSelector = SelectKBest(k=targetDim) X_new = kBestSelector.fit_transform(X, y).tolist() elif method.lower() == "pca": prettyPrint("Extracting %s features from dataset using PCA" % targetDim, "debug") pcaExtractor = PCA(n_components=targetDim) # Make sure vectors in X are positive X_new = pcaExtractor.fit_transform(X, y).tolist() else: prettyPrint("Unknown dimensionality reduction method \"%s\"" % method, "warning") return X except Exception as e: prettyPrint("Error encountered in \"reduceDimensionality\": %s" % e, "error") return X # Return the reduced dataset return X_new
def ngrams_selection(train_data, train_labels, ind, model_file,
                     ngram_range_=(1, 1), max_num_features=100,
                     analyzer_type='word'):
    """Create and save vectorizers and feature selectors on given train data.

    Args:
        train_data: list of train text samples
        train_labels: list of train labels
        ind: index of vectorizer/selector to save file
        model_file: model filename
        ngram_range_: range of n-grams
        max_num_features: maximum number of features to select
        analyzer_type: analyzer type for TfidfVectorizer 'word' or 'char'

    Returns:
        nothing
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range_, sublinear_tf=True, analyzer=analyzer_type)

    X_train = vectorizer.fit_transform(train_data)

    if max_num_features < X_train.shape[1]:
        ch2 = SelectKBest(chi2, k=max_num_features)
        ch2.fit(X_train, train_labels)
        data_struct = {'vectorizer': vectorizer, 'selector': ch2}
        print ('creating ', model_file + '_ngrams_vect_' + ind + '.bin')
        with open(model_file + '_ngrams_vect_' + ind + '.bin', 'wb') as f:
            pickle.dump(data_struct, f)
    else:
        data_struct = {'vectorizer': vectorizer}
        print ('creating', model_file + '_ngrams_vect_' + ind + '.bin')
        with open(model_file + '_ngrams_vect_' + ind + '.bin', 'wb') as f:
            pickle.dump(data_struct, f)
    return
def chiSquare(train_data, train_classes, topK):
    vectorizer = DictVectorizer()

    # Fit and transform the train data.
    x_train = vectorizer.fit_transform(train_data)
    y_train = train_classes

    if (x_train.shape[1] < topK):
        topK = x_train.shape[1]

    selector = SelectKBest(chi2, k=topK)
    x_new = selector.fit_transform(x_train, y_train)

    return vectorizer.inverse_transform(selector.inverse_transform(x_new))
def transform_select_K_best(X_train,Y_train, X_all, K=100):
    """Selects the best K features given the training data.

    Args:
        X_train: A matrix containing training data
        Y_train: Classification labels for the training data
        X_all: A matrix containing all the data
        K: The number of features to select
    """
    skb = SelectKBest(f_classif,K)
    skb.fit(X_train,Y_train)

    return skb.transform(X_all)
def test_make_pipeline():
    t1 = SelectKBest()
    t2 = SelectKBest()
    t3 = SelectKBest()
    t4 = SelectKBest()
    t5 = SelectPercentile()
    t6 = SelectKBest()
    t7 = SelectKBest()
    t8 = SelectKBest()
    t9 = SelectPercentile()
    in_steps = [[t1, None],
                [t2, t3],
                [t4, t5],  # mixed
                t6,
                [None, t7],
                [t8, None, t9],  # mixed
                None]
    pipe = make_pipeline(*in_steps, memory='/path/to/nowhere')
    union = make_union(*in_steps)

    for est, est_steps in [(pipe, pipe.steps),
                           (union, union.transformer_list)]:
        names, steps = zip(*est_steps)
        assert names == ('selectkbest-1', 'selectkbest-2', 'alt-1',
                         'selectkbest-3', 'selectkbest-4', 'alt-2', 'nonetype')
        assert steps == (t1, t2, t4, t6, None, t8, None)

        assert len(est._param_grid) == 5
        assert est._param_grid[names[0]] == [t1, None]
        assert est._param_grid[names[1]] == [t2, t3]
        assert est._param_grid[names[2]] == [t4, t5]
        assert est._param_grid[names[4]] == [None, t7]
        assert est._param_grid[names[5]] == [t8, None, t9]

    assert type(pipe) is Pipeline
    assert type(union) is FeatureUnion

    assert pipe.memory == '/path/to/nowhere'
def export_best_feature_names(self, df, labels, out_folder_path, k):
    columns, repos, observations = self.decompose_df(df)
    feature_scores = SelectKBest(chi2, k=k).fit(observations, labels).scores_
    feature_scores = np.nan_to_num(feature_scores)
    k_best_features = np.argpartition(feature_scores.ravel(), (-1) * k)[(-1) * k:]
    k_best_feature_names = columns[k_best_features]

    out_file_path = os.path.join(out_folder_path, "feature_selection.txt")
    with open(out_file_path, "w") as output_file:
        for feature_name in k_best_feature_names:
            output_file.write(feature_name + "\n")
def __init__(self,selector,return_array=False):
    '''
    Wraps an sklearn feature selector so it can be used like the other selectors here.

    selector: an sklearn.feature_selection selector instance,
        e.g. sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=4).
    return_array: if True, return a numpy.ndarray; if False, return the same type as X.

    Attributes:
        selector_: the fitted selector.
    '''
    BaseSelector.__init__(self,return_array=return_array)
    self.selector=selector
def test():
    # Construct random test data
    np.random.seed(13)
    X=pd.DataFrame(np.random.randn(20,10))
    X.columns=['x%d'%i for i in range(10)]
    y=pd.Series(np.random.choice([0,1],20))

    # Wrap sklearn's own feature selectors with SklearnSelector
    clf_sklearn=feature_selection.SelectKBest(feature_selection.f_classif,k=4)
    clf=SklearnSelector(estimator=clf_sklearn)
    clf.fit(X,y)
    clf.transform(X)
    print(clf.feature_selected)

    clf_sklearn=SelectFromModel(LogisticRegression())
    clf=SklearnSelector(estimator=clf_sklearn)
    clf.fit(X,y)
    clf.transform(X)
    print(clf.feature_selected)

    # Voting selector
    clf_selectkbest=feature_selection.SelectKBest(feature_selection.f_classif,k=4)
    clf_selectfrommodel=SelectFromModel(LogisticRegression())
    clf_baseselector=SklearnSelector(clf_selectkbest)
    clf=VotingSelector(selectors=[('clf_selectkbest',clf_selectkbest),
                                  ('clf_selectfrommodel',clf_selectfrommodel),
                                  ('clf_baseselector',clf_baseselector)],threshold=0.5)
    clf.fit(X,y)
    clf.transform(X)
    print(clf.feature_selected)
    print(clf.df_voting)
    print(clf.score)
def __init__(self, conf):
    SemiSupervisedFeatureSelection.__init__(self, conf)
    self.projection = SelectKBest(mutual_info_classif, k = conf.num_components)
def __init__(self, conf):
    SemiSupervisedFeatureSelection.__init__(self, conf)
    self.projection = SelectKBest(chi2, k = conf.num_components)
def __init__(self, conf):
    SemiSupervisedFeatureSelection.__init__(self, conf)
    self.projection = SelectKBest(f_classif, k = conf.num_components)
def getFeature():
    fileData = open("data")
    row = []
    col = []
    data = []
    evalRes = []
    rowIndex = -1
    fileList = fileData.readlines()
    random.shuffle(fileList)
    for line in fileList:
        line = line.rstrip('\n')
        dataList = re.split(' |:', line)
        if int(dataList[0]) >= 7:
            evalRes.append(1)
        else:
            if int(dataList[0]) <= 4:
                evalRes.append(-1)
            else:
                continue
        del dataList[0]
        rowIndex = rowIndex + 1
        row.extend([rowIndex] * int(len(dataList) / 2))
        col.extend(map(int, dataList[::2]))
        data.extend(map(int, dataList[1::2]))
    featureMatrix = csr_matrix((data, (row, col)))
    featureMNew = SelectKBest(chi2, k=20000).fit_transform(featureMatrix, evalRes)
    return featureMNew, evalRes
def build_model_random_forest(df, features, categorical_features, target, split=0.70):
    print "using %d features (%d columns) on %d rows and target %s. Split %f." % (
        len(features), len(df.columns), len(df), target, split)

    df['is_train'] = np.random.uniform(0, 1, len(df)) <= split
    train, test = df[df['is_train'] == True], df[df['is_train'] == False]

    # one_hot_encoding because it doesn't work in pipeline for some reason
    # for f in categorical_features:
    #     dummies = pd.get_dummies(df[f], prefix=f)
    #     for dummy in dummies.columns:
    #         df[dummy] = dummies[dummy]
    #         features.append(dummy)
    #     df = df.drop(f, 1)
    #     features.remove(f)

    clf = Pipeline([
        ("imputer", Imputer(strategy="mean", axis=0)),
        ('feature_selection', SelectKBest(k=5)),
        ("forest", RandomForestClassifier())])

    clf.fit(train[features], train[target])
    score = clf.score(test[features], test[target])
    predicted = clf.predict(test[features])
    cm = confusion_matrix(test[target], predicted)

    print "Random Forest score: %f" % score
    print "confusion_matrix : \n%s" % cm
    return clf
def make_predictions_random_forest(df, features, target, split=0.70):
    print "using %d features (%d columns) on %d rows and target %s. Split %f." % (
        len(features), len(df.columns), len(df), target, split)
    # print "unused features: ", '\n\t\t'.join([f for f in df.columns if f not in features])
    # print "columns: ", '\n\t\t'.join(df.columns)

    df['is_train'] = np.random.uniform(0, 1, len(df)) <= split
    train, test = df[df['is_train'] == True], df[df['is_train'] == False]

    clf = Pipeline([
        ("imputer", Imputer(strategy="mean", axis=0)),
        ('feature_selection', SelectKBest(k=200)),
        ("forest", RandomForestClassifier(
            min_samples_leaf=1, min_samples_split=10, n_estimators=60,
            max_depth=None, criterion='gini'))])

    clf.fit(train[features], train[target])
    score = clf.score(test[features], test[target])
    predicted = clf.predict(test[features])
    cm = confusion_matrix(test[target], predicted)
    # print classification_report(test[target], predicted)

    return score, cm


# Utility function to report best scores
def select_k_best_selector(data,target):
    # Select Model
    selector = SelectKBest(k=3)  # default is 10 features
    # Fit, Format, and Return
    return format_selector(selector, data, target)
def buildEstimators(mode):
    if mode == 'train' or mode == 'cv':
        # best parameters got by gridsearchCV, best score: 1
        estimators = [('anova_filter', SelectKBest(f_classif, k='all')),
                      ('xgb', xgb.XGBClassifier(learning_rate=0.1,n_estimators=300,max_depth=3))]
        clf = Pipeline(estimators)
    elif mode == 'test':
        clf = pickle.load(open(join(classifier_path,"xgb_classifier.plk"), "r"))
    return clf
def main():
    from sklearn import svm
    from sklearn.datasets import samples_generator
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_regression
    from sklearn.preprocessing import MinMaxScaler

    X, y = samples_generator.make_classification(n_samples=1000,
                                                  n_informative=5,
                                                  n_redundant=4,
                                                  random_state=_random_state)
    anova_filter = SelectKBest(f_regression, k=5)
    scaler = MinMaxScaler()
    clf = svm.SVC(kernel='linear')

    steps = [scaler, anova_filter, clf]
    cached_run(steps, X, y)
def main():
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()

    data = read_semeval_regression(args.input, encoding='windows-1252')

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    test = read_test_data(args.test, encoding='windows-1252')

    regressor = pipeline.fit(data[0], data[1])

    y = regressor.predict(test[2])

    with open('%sc%f-k%i-C.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as outfile:
        for id_, topic, rate in zip(test[0], test[1], y):
            print(id_, topic, rate, sep='\t', file=outfile)
def select_feats(X, y, nb_features, nb_features_to_keep=2048):
    X, y = preproc_for_sklearn(X, y, nb_features)

    if nb_features < nb_features_to_keep:
        nb_features_to_keep = nb_features_to_keep / 4

    feature_selector = SelectKBest(chi2, k=nb_features_to_keep).fit(X, y)
    selected_indices = feature_selector.get_support(indices=True)

    return selected_indices
def predictKFoldKNN(X, y, K=10, kfold=10, selectKBest=0):
    """
    Classifies the data using K-nearest neighbors and k-fold CV
    :param X: The list of feature vectors
    :type X: list
    :param y: The list of labels corresponding to the feature vectors
    :type y: list
    :param K: The number of nearest neighbors to consider in classification
    :type K: int
    :param kfold: The number of folds in the CV
    :type kfold: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: An array of predicted classes
    """
    try:
        # Prepare data
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = neighbors.KNeighborsClassifier(n_neighbors=K)
        # Select K Best features if enabled
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []

    return predicted
def predictAndTestKNN(X, y, Xtest, ytest, K=10, selectKBest=0):
    """
    Trains a K-NN classifier using the training data and tests it using the test data
    :param X: The matrix of training feature vectors
    :type X: list
    :param y: The labels corresponding to the training feature vectors
    :type y: list
    :param Xtest: The matrix of test feature vectors
    :type Xtest: list
    :param ytest: The labels corresponding to the test feature vectors
    :type ytest: list
    :param K: The number of nearest neighbors to consider in classification
    :type K: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: Two lists of the validation and test accuracies across the k-folds
    """
    try:
        predicted, predicted_test = [], []
        # Define classifier and cross validation iterator
        clf = neighbors.KNeighborsClassifier(n_neighbors=K)
        # Start the cross validation learning
        X, y, Xtest, ytest = numpy.array(X), numpy.array(y), numpy.array(Xtest), numpy.array(ytest)
        # Select K Best features if enabled
        prettyPrint("Selecting %s best features from feature vectors" % selectKBest)
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        Xtest_new = SelectKBest(chi2, k=selectKBest).fit_transform(Xtest, ytest) if selectKBest else Xtest
        # Fit model
        prettyPrint("Fitting model")
        clf.fit(X_new, y)
        # Validate and test model
        prettyPrint("Validating model using training data")
        predicted = clf.predict(X_new)
        prettyPrint("Testing model")
        predicted_test = clf.predict(Xtest_new)
    except Exception as e:
        prettyPrintError(e)
        return [], []

    return predicted, predicted_test
def predictKFoldSVMSSK(X, y, kfold=10, subseqLength=3, selectKBest=0):
    """
    Classifies the data using Support vector machines with the SSK kernel and k-fold CV
    :param X: The list of text documents containing traces
    :type X: list
    :param y: The labels of documents in 'X'
    :type y: list
    :param kfold: The number of folds
    :type kfold: int (default: 10)
    :param subseqLength: Length of subsequence used by the SSK
    :type subseqLength: int (default: 3)
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: An array of predicted classes
    """
    try:
        predicted = []
        # Retrieve Gram Matrix from string kernel
        if verboseON():
            prettyPrint("Generating Gram Matrix from documents", "debug")
        X_gram = string_kernel(X, X)
        y = numpy.array(y)
        # Define classifier
        clf = svm.SVC(kernel="precomputed")
        X_gram_new = SelectKBest(chi2, k=selectKBest).fit_transform(X_gram, y) if selectKBest > 0 else X_gram
        prettyPrint("Performing %s-fold CV on the %s best features" % (kfold, selectKBest))
        predicted = cross_val_predict(clf, X_gram_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []

    return predicted
def predictKFoldSVM(X, y, kernel="linear", C=1, selectKBest=0, kfold=10): """ Classifies the data using Support vector machines and k-fold CV :param X: The matrix of feature vectors :type X: list :param y: The vector containing the labels corresponding to feature vectors :type y: list :param kernel: The kernel used to elevate data into higher dimensionalities :type kernel: str :param C: The penalty parameter of the error term :type C: int :param selectKBest: The number of best features to select :type selectKBest: int :param kfold: The number of folds to use in K-fold CV :type kfold: int :return: A list of predicted labels across the k-folds """ try: # Prepare data X, y = numpy.array(X), numpy.array(y) # Define classifier clf = svm.SVC(kernel=kernel, C=C) # Select K Best features if enabled X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist() except Exception as e: prettyPrintError(e) return [] return predicted
def predictKFoldRandomForest(X, y, estimators=10, criterion="gini", maxdepth=None, selectKBest=0, kfold=10): """ Classifies the data using decision trees and k-fold CV :param X: The matrix of feature vectors :type X: list :param y: The vector containing labels corresponding to the feature vectors :type y: list :param estimators: The number of random trees to use in classification :type estimators: int :param criterion: The splitting criterion employed by the decision tree :type criterion: str :param splitter: The method used to split the data :type splitter: str :param maxDepth: The maximum depth the tree is allowed to grow :type maxDepth: int :param selectKBest: The number of best features to select :type selectKBest: int :param kfold: The number of folds to use in K-fold CV :type kfold: int :return: A list of predicted labels across the k-folds """ try: # Prepare data X, y = numpy.array(X), numpy.array(y) # Define classifier clf = ensemble.RandomForestClassifier(n_estimators=estimators, criterion=criterion, max_depth=maxdepth) X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist() except Exception as e: prettyPrintError(e) return [] return predicted
def MIC(x, y):
    # Maximal Information Coefficient
    base = MINE()
    base.compute_score(x, y)
    # MIC yields only a single score, but a SelectKBest score function
    # must return two factors (score, p-value), so a dummy 0.5 is returned.
    return base.mic(), 0.5
def de_ps(X,y):
    dim = X.shape[1]
    de = min(2000,dim)
    clf = SelectKBest(lambda X, Y: np.array(map(lambda x:pearsonr(x, Y), X.T)).T, k=de)
    clf.fit(X,y)
    def _func(X1,X2):
        return clf.transform(X1),clf.transform(X2)
    return _func
def de_c2(X,y):
    dim = X.shape[1]
    de = min(2000,dim)
    clf = SelectKBest(chi2, k = de)
    clf.fit(X,y)
    def _func(X1,X2):
        return clf.transform(X1),clf.transform(X2)
    return _func
def de_mic(X,y):
    dim = X.shape[1]
    de = min(2000,dim)
    clf = SelectKBest(MIC, k=de)
    clf.fit(X,y)
    def _func(X1,X2):
        return clf.transform(X1),clf.transform(X2)
    return _func
def de_f_and_p_value(X,y):
    dim = X.shape[1]
    de = min(2000,dim)
    clf = SelectKBest(f_classif,k=de)
    clf.fit(X, y)
    def _func(X1,X2):
        return clf.transform(X1),clf.transform(X2)
    return _func
def de_f_and_p_value(X,y): """ f&p value """ dim = X.shape[1] de = min(2000,dim) clf = SelectKBest(f_classif,k=de) clf.fit(X, y) def _func(X1,X2): return clf.transform(X1),clf.transform(X2) return _func
def new(method='centroid',n_features=8):
    # Clustering method
    nc = METHODS[method]
    # Orthogonal feature selector
    if n_features is None:
        n_features = 'all'
    selector = SelectKBest(f_classif, k=n_features)
    # NOTE: the last operation in the list must be
    # a classifier or clustering model
    print(colored('Cluster model created','yellow'))
    return [selector, nc]
def get_local_words(word_count, threshold, y_train, train_seq, num_words):
    feature_index = delete_low_freq_words(word_count, threshold)
    print(len(train_seq), len(feature_index))
    word_freq_matrix = np.zeros([len(train_seq), len(feature_index)])

    for (seq_idx, seq) in enumerate(train_seq):
        word_freq_list = np.zeros(len(feature_index))

        for word in seq:
            if (word not in feature_index):
                continue
            else:
                word_idx = feature_index[word]
                word_freq_matrix[seq_idx][word_idx] += 1

    sk = SelectKBest(chi2, k="all")
    sk.fit_transform(csr_matrix(word_freq_matrix), y_train)
    score_list = sk.scores_

    word_score = {}
    for (feature, idx) in feature_index.items():
        word_score[feature] = score_list[idx]

    word_score = sorted(word_score.items(), key=lambda x: x[1], reverse=True)

    local_word_list = []
    for (word, score) in word_score[:num_words]:
        local_word_list.append(word)

    del word_freq_matrix
    return local_word_list
def k_best_features(self):
    # get total number of features.
    num_features = self.features.shape[1]
    feature_list = []

    # find k-best features, with k from 1 to num_features.
    for i in range(num_features):
        skBest = SelectKBest(k=i)
        skBest.fit_transform(self.features, self.labels)

        # get boolean indices of the best features.
        k_features = skBest.get_support()

        # append the features to the feature list.
        feature_list += self.features.columns[k_features].tolist()

    return feature_list
def selectFeatures(k_features=5, *args):
    """
    # Select k best features using the SelectKBest class in Sklearn.
    # Inputs: k=no. of features to select, args=(XTrain,yTrain)
    # returns: np array of k features.
    """
    X, y = args
    skb = SelectKBest(k=k_features)
    return skb.fit_transform(X, y)
def featuresByChiSq(features,labels,nFeature=5000):
    chi2_model = SelectKBest(chi2,k=nFeature)
    dtm = chi2_model.fit_transform(features,labels)
    return dtm,chi2_model
def train_and_score(X, y):
    X_train, X_test, y_train, y_test = split_data(X, y)

    clf = Pipeline([
        ('reduce_dim', SelectKBest(chi2, k=2)),
        ('train', LinearSVC(C=100))
    ])

    scores = cross_val_score(clf, X_train, y_train, cv=5, n_jobs=2)
    print("Mean Model Accuracy:", np.array(scores).mean())

    clf.fit(X_train, y_train)

    confuse(y_test, clf.predict(X_test))
    print()
def analyseReasonWithXsqure(anamolySample,normalSample,topk,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)
    X_new = SelectKBest(chi2, topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print name[i]
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    X_new = SelectKBest(chi2, topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print name[i]
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample,normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)
    X_new = SelectKBest(chi2, topk).fit(data, target)
    outcome = X_new.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    return warnstr