The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.feature_selection.SelectKBest().
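Before the project examples, here is a minimal, self-contained sketch of the typical SelectKBest pattern; the synthetic dataset, score function, and value of k below are illustrative choices, not taken from any of the projects:

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif

# Synthetic data: 100 samples, 10 features, binary labels
X, y = make_classification(n_samples=100, n_features=10, random_state=0)

# Keep the 4 features with the highest ANOVA F-scores
selector = SelectKBest(score_func=f_classif, k=4)
X_reduced = selector.fit_transform(X, y)

print(X_reduced.shape)          # (100, 4)
print(selector.get_support())   # boolean mask of the selected columns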
def analyseReasonWithXsqure(anamolySample,normalSample,topk,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)
    name = []
    for i in data.columns:
        name.append(i)
    X_new = SelectKBest(chi2, topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print name[i]
def test_SelectKBest():
    '''
    test the method of SelectKBest

    :return: None
    '''
    X=[ [1,2,3,4,5],
        [5,4,3,2,1],
        [3,3,3,3,3,],
        [1,1,1,1,1] ]
    y=[0,1,0,1]
    print("before transform:",X)
    selector=SelectKBest(score_func=f_classif,k=3)
    selector.fit(X,y)
    print("scores_:",selector.scores_)
    print("pvalues_:",selector.pvalues_)
    print("selected index:",selector.get_support(True))
    print("after transform:",selector.transform(X))
def analyseReasonWithXsqure(anamolySample,normalSample,topk):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)
    name = []
    for i in data.columns:
        name.append(i)
    X_new = SelectKBest(chi2, topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print name[i]
def select_kbest_reg(data_frame, target, k=5):
    """
    Selecting K-Best features for regression
    :param data_frame: A pandas dataFrame with the training data
    :param target: target variable name in DataFrame
    :param k: desired number of features from the data
    :returns feature_scores: scores for each feature in the data as pandas DataFrame
    """
    feat_selector = SelectKBest(f_regression, k=k)
    _ = feat_selector.fit(data_frame.drop(target, axis=1), data_frame[target])

    feat_scores = pd.DataFrame()
    feat_scores["F Score"] = feat_selector.scores_
    feat_scores["P Value"] = feat_selector.pvalues_
    feat_scores["Support"] = feat_selector.get_support()
    feat_scores["Attribute"] = data_frame.drop(target, axis=1).columns

    return feat_scores
def test_build_param_grid_set_estimator():
    clf1 = SVC()
    clf2 = LogisticRegression()
    clf3 = SVC()
    clf4 = SGDClassifier()
    estimator = set_grid(Pipeline([('sel', set_grid(SelectKBest(), k=[2, 3])),
                                   ('clf', None)]),
                         clf=[set_grid(clf1, kernel=['linear']),
                              clf2,
                              set_grid(clf3, kernel=['poly'], degree=[2, 3]),
                              clf4])
    param_grid = [{'clf': [clf1], 'clf__kernel': ['linear'], 'sel__k': [2, 3]},
                  {'clf': [clf3], 'clf__kernel': ['poly'],
                   'clf__degree': [2, 3], 'sel__k': [2, 3]},
                  {'clf': [clf2, clf4], 'sel__k': [2, 3]}]
    assert build_param_grid(estimator) == param_grid
def select_percentile_selector(data,target):
    # Select Model
    selector = SelectPercentile(percentile = 75)  # Default is 10%
    # Fit, Format, and Return
    return format_selector(selector, data, target)

# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
def de_c2(X,y):
    """ chi2 """
    dim = X.shape[1]
    de = min(2000,dim)
    clf = SelectKBest(chi2, k = de)
    clf.fit(X,y)
    def _func(X1,X2):
        return clf.transform(X1), clf.transform(X2)
    return _func

# def de_mic(X,y):
#     """ MIC """
#     dim = X.shape[1]
#     de = min(2000,dim)
#     clf = SelectKBest(MIC, k=de)
#     clf.fit(X,y)
#     def _func(X1,X2):
#         return clf.transform(X1),clf.transform(X2)
#     return _func
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample, normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)
    X_new = SelectKBest(chi2, topk).fit(data, target)
    outcome = X_new.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    print 'x2:',warnstr
    return warnstr
def select_kbest_clf(data_frame, target, k=4):
    """
    Selecting K-Best features for classification
    :param data_frame: A pandas dataFrame with the training data
    :param target: target variable name in DataFrame
    :param k: desired number of features from the data
    :returns feature_scores: scores for each feature in the data as pandas DataFrame
    """
    feat_selector = SelectKBest(f_classif, k=k)
    _ = feat_selector.fit(data_frame.drop(target, axis=1), data_frame[target])

    feat_scores = pd.DataFrame()
    feat_scores["F Score"] = feat_selector.scores_
    feat_scores["P Value"] = feat_selector.pvalues_
    feat_scores["Support"] = feat_selector.get_support()
    feat_scores["Attribute"] = data_frame.drop(target, axis=1).columns

    return feat_scores
def reduceDimensionality(X, y, method="selectkbest", targetDim=10): """ Reduces the dimensionality of [X] to [targetDim] """ try: # Check for the required methodology first if method.lower() == "selectkbest": prettyPrint("Selecting %s best features from dataset" % targetDim, "debug") kBestSelector = SelectKBest(k=targetDim) X_new = kBestSelector.fit_transform(X, y).tolist() elif method.lower() == "pca": prettyPrint("Extracting %s features from dataset using PCA" % targetDim, "debug") pcaExtractor = PCA(n_components=targetDim) # Make sure vectors in X are positive X_new = pcaExtractor.fit_transform(X, y).tolist() else: prettyPrint("Unknown dimensionality reduction method \"%s\"" % method, "warning") return X except Exception as e: prettyPrint("Error encountered in \"reduceDimensionality\": %s" % e, "error") return X # Return the reduced dataset return X_new
def ngrams_selection(train_data, train_labels, ind, model_file,
                     ngram_range_=(1, 1), max_num_features=100,
                     analyzer_type='word'):
    """Create and save vectorizers and feature selectors on given train data.

    Args:
        train_data: list of train text samples
        train_labels: list of train labels
        ind: index of vectorizer/selector to save file
        model_file: model filename
        ngram_range_: range of n-grams
        max_num_features: maximum number of features to select
        analyzer_type: analyzer type for TfidfVectorizer 'word' or 'char'

    Returns:
        nothing
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range_, sublinear_tf=True, analyzer=analyzer_type)

    X_train = vectorizer.fit_transform(train_data)

    if max_num_features < X_train.shape[1]:
        ch2 = SelectKBest(chi2, k=max_num_features)
        ch2.fit(X_train, train_labels)
        data_struct = {'vectorizer': vectorizer, 'selector': ch2}
        print ('creating ', model_file + '_ngrams_vect_' + ind + '.bin')
        with open(model_file + '_ngrams_vect_' + ind + '.bin', 'wb') as f:
            pickle.dump(data_struct, f)
    else:
        data_struct = {'vectorizer': vectorizer}
        print ('creating', model_file + '_ngrams_vect_' + ind + '.bin')
        with open(model_file + '_ngrams_vect_' + ind + '.bin', 'wb') as f:
            pickle.dump(data_struct, f)
    return
def chiSquare(train_data, train_classes, topK):
    vectorizer = DictVectorizer()

    # Fit and transform the train data.
    x_train = vectorizer.fit_transform(train_data)
    y_train = train_classes

    if (x_train.shape[1] < topK):
        topK = x_train.shape[1]

    selector = SelectKBest(chi2, k=topK)
    x_new = selector.fit_transform(x_train, y_train)

    return vectorizer.inverse_transform(selector.inverse_transform(x_new))
def transform_select_K_best(X_train,Y_train, X_all, K=100):
    """Selects the best K features given the training data.

    Args:
        X_train: A matrix containing training data
        Y_train: Classification labels for the training data
        X_all: A matrix containing all the data
        K: The number of features to select
    """
    skb = SelectKBest(f_classif,K)
    skb.fit(X_train,Y_train)

    return skb.transform(X_all)
def test_make_pipeline():
    t1 = SelectKBest()
    t2 = SelectKBest()
    t3 = SelectKBest()
    t4 = SelectKBest()
    t5 = SelectPercentile()
    t6 = SelectKBest()
    t7 = SelectKBest()
    t8 = SelectKBest()
    t9 = SelectPercentile()
    in_steps = [[t1, None],
                [t2, t3],
                [t4, t5],  # mixed
                t6,
                [None, t7],
                [t8, None, t9],  # mixed
                None]
    pipe = make_pipeline(*in_steps, memory='/path/to/nowhere')
    union = make_union(*in_steps)

    for est, est_steps in [(pipe, pipe.steps),
                           (union, union.transformer_list)]:
        names, steps = zip(*est_steps)
        assert names == ('selectkbest-1', 'selectkbest-2', 'alt-1',
                         'selectkbest-3', 'selectkbest-4', 'alt-2', 'nonetype')
        assert steps == (t1, t2, t4, t6, None, t8, None)

        assert len(est._param_grid) == 5
        assert est._param_grid[names[0]] == [t1, None]
        assert est._param_grid[names[1]] == [t2, t3]
        assert est._param_grid[names[2]] == [t4, t5]
        assert est._param_grid[names[4]] == [None, t7]
        assert est._param_grid[names[5]] == [t8, None, t9]

    assert type(pipe) is Pipeline
    assert type(union) is FeatureUnion

    assert pipe.memory == '/path/to/nowhere'
def export_best_feature_names(self, df, labels, out_folder_path, k):
    columns, repos, observations = self.decompose_df(df)
    feature_scores = SelectKBest(chi2, k=k).fit(observations, labels).scores_
    feature_scores = np.nan_to_num(feature_scores)
    k_best_features = np.argpartition(feature_scores.ravel(), (-1) * k)[(-1) * k:]
    k_best_feature_names = columns[k_best_features]

    out_file_path = os.path.join(out_folder_path, "feature_selection.txt")
    with open(out_file_path, "w") as output_file:
        for feature_name in k_best_feature_names:
            output_file.write(feature_name + "\n")
def __init__(self,selector,return_array=False):
    '''
    Wraps an sklearn feature selector so it can be used like the other selectors here.

    selector: an sklearn.feature_selection selector instance,
        e.g. sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=4).
    return_array: if True, return a numpy.ndarray; if False, return the same type as X.

    Attributes:
        selector_: the fitted selector.
    '''
    BaseSelector.__init__(self,return_array=return_array)
    self.selector=selector
def test():
    # Construct random test data
    np.random.seed(13)
    X=pd.DataFrame(np.random.randn(20,10))
    X.columns=['x%d'%i for i in range(10)]
    y=pd.Series(np.random.choice([0,1],20))

    # Wrap sklearn's own feature selectors with SklearnSelector
    clf_sklearn=feature_selection.SelectKBest(feature_selection.f_classif,k=4)
    clf=SklearnSelector(estimator=clf_sklearn)
    clf.fit(X,y)
    clf.transform(X)
    print(clf.feature_selected)

    clf_sklearn=SelectFromModel(LogisticRegression())
    clf=SklearnSelector(estimator=clf_sklearn)
    clf.fit(X,y)
    clf.transform(X)
    print(clf.feature_selected)

    # Voting selector
    clf_selectkbest=feature_selection.SelectKBest(feature_selection.f_classif,k=4)
    clf_selectfrommodel=SelectFromModel(LogisticRegression())
    clf_baseselector=SklearnSelector(clf_selectkbest)
    clf=VotingSelector(selectors=[('clf_selectkbest',clf_selectkbest),
                                  ('clf_selectfrommodel',clf_selectfrommodel),
                                  ('clf_baseselector',clf_baseselector)],threshold=0.5)
    clf.fit(X,y)
    clf.transform(X)
    print(clf.feature_selected)
    print(clf.df_voting)
    print(clf.score)
def __init__(self, conf):
    SemiSupervisedFeatureSelection.__init__(self, conf)
    self.projection = SelectKBest(mutual_info_classif, k = conf.num_components)
def __init__(self, conf):
    SemiSupervisedFeatureSelection.__init__(self, conf)
    self.projection = SelectKBest(chi2, k = conf.num_components)
def __init__(self, conf):
    SemiSupervisedFeatureSelection.__init__(self, conf)
    self.projection = SelectKBest(f_classif, k = conf.num_components)
def getFeature():
    fileData = open("data")
    row = []
    col = []
    data = []
    evalRes = []
    rowIndex = -1
    fileList = fileData.readlines()
    random.shuffle(fileList)
    for line in fileList:
        line = line.rstrip('\n')
        dataList = re.split(' |:', line)
        if int(dataList[0]) >= 7:
            evalRes.append(1)
        else:
            if int(dataList[0]) <= 4:
                evalRes.append(-1)
            else:
                continue
        del dataList[0]
        rowIndex = rowIndex + 1
        row.extend([rowIndex] * int(len(dataList) / 2))
        col.extend(map(int, dataList[::2]))
        data.extend(map(int, dataList[1::2]))
    featureMatrix = csr_matrix((data, (row, col)))
    featureMNew = SelectKBest(chi2, k=20000).fit_transform(featureMatrix, evalRes)
    return featureMNew, evalRes
def build_model_random_forest(df, features, categorical_features, target, split=0.70):
    print "using %d features (%d columns) on %d rows and target %s. Split %f." % (
        len(features), len(df.columns), len(df), target, split)

    df['is_train'] = np.random.uniform(0, 1, len(df)) <= split
    train, test = df[df['is_train'] == True], df[df['is_train'] == False]

    # one_hot_encoding because it doesn't work in pipeline for some reason
    # for f in categorical_features:
    #     dummies = pd.get_dummies(df[f], prefix=f)
    #     for dummy in dummies.columns:
    #         df[dummy] = dummies[dummy]
    #         features.append(dummy)
    #     df = df.drop(f, 1)
    #     features.remove(f)

    clf = Pipeline([
        ("imputer", Imputer(strategy="mean", axis=0)),
        ('feature_selection', SelectKBest(k=5)),
        ("forest", RandomForestClassifier())])

    clf.fit(train[features], train[target])
    score = clf.score(test[features], test[target])
    predicted = clf.predict(test[features])
    cm = confusion_matrix(test[target], predicted)

    print "Random Forest score: %f" % score
    print "confusion_matrix : \n%s" % cm
    return clf
def make_predictions_random_forest(df, features, target, split=0.70):
    print "using %d features (%d columns) on %d rows and target %s. Split %f." % (
        len(features), len(df.columns), len(df), target, split)
    # print "unused features: ", '\n\t\t'.join([f for f in df.columns if f not in features])
    # print "columns: ", '\n\t\t'.join(df.columns)

    df['is_train'] = np.random.uniform(0, 1, len(df)) <= split
    train, test = df[df['is_train'] == True], df[df['is_train'] == False]

    clf = Pipeline([
        ("imputer", Imputer(strategy="mean", axis=0)),
        ('feature_selection', SelectKBest(k=200)),
        ("forest", RandomForestClassifier(
            min_samples_leaf=1, min_samples_split=10, n_estimators=60,
            max_depth=None, criterion='gini'))])

    clf.fit(train[features], train[target])
    score = clf.score(test[features], test[target])
    predicted = clf.predict(test[features])
    cm = confusion_matrix(test[target], predicted)
    # print classification_report(test[target], predicted)

    return score, cm


# Utility function to report best scores
def select_k_best_selector(data,target):
    # Select Model
    selector = SelectKBest(k=3)  # default is 10 features
    # Fit, Format, and Return
    return format_selector(selector, data, target)
def buildEstimators(mode):
    if mode == 'train' or mode == 'cv':
        # best parameters got by gridsearchCV, best score: 1
        estimators = [('anova_filter', SelectKBest(f_classif, k='all')),
                      ('xgb', xgb.XGBClassifier(learning_rate=0.1,n_estimators=300,max_depth=3))]
        clf = Pipeline(estimators)
    elif mode == 'test':
        clf = pickle.load(open(join(classifier_path,"xgb_classifier.plk"), "r"))
    return clf
def main():
    from sklearn import svm
    from sklearn.datasets import samples_generator
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_regression
    from sklearn.preprocessing import MinMaxScaler

    X, y = samples_generator.make_classification(n_samples=1000,
                                                  n_informative=5,
                                                  n_redundant=4,
                                                  random_state=_random_state)
    anova_filter = SelectKBest(f_regression, k=5)
    scaler = MinMaxScaler()
    clf = svm.SVC(kernel='linear')

    steps = [scaler, anova_filter, clf]
    cached_run(steps, X, y)
def main():
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()

    data = read_semeval_regression(args.input, encoding='windows-1252')

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    test = read_test_data(args.test, encoding='windows-1252')

    regressor = pipeline.fit(data[0], data[1])

    y = regressor.predict(test[2])

    with open('%sc%f-k%i-C.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as outfile:
        for id_, topic, rate in zip(test[0], test[1], y):
            print(id_, topic, rate, sep='\t', file=outfile)
def select_feats(X, y, nb_features, nb_features_to_keep=2048):
    X, y = preproc_for_sklearn(X, y, nb_features)

    if nb_features < nb_features_to_keep:
        nb_features_to_keep = nb_features_to_keep / 4

    feature_selector = SelectKBest(chi2, k=nb_features_to_keep).fit(X, y)
    selected_indices = feature_selector.get_support(indices=True)

    return selected_indices
def predictKFoldKNN(X, y, K=10, kfold=10, selectKBest=0):
    """
    Classifies the data using K-nearest neighbors and k-fold CV
    :param X: The list of feature vectors
    :type X: list
    :param y: The list of labels corresponding to the feature vectors
    :type y: list
    :param K: The number of nearest neighbors to consider in classification
    :type K: int
    :param kfold: The number of folds in the CV
    :type kfold: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: An array of predicted classes
    """
    try:
        # Prepare data
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = neighbors.KNeighborsClassifier(n_neighbors=K)
        # Select K Best features if enabled
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []

    return predicted
def predictAndTestKNN(X, y, Xtest, ytest, K=10, selectKBest=0):
    """
    Trains a K-NN classifier using the training data and tests it using the test data
    :param X: The matrix of training feature vectors
    :type X: list
    :param y: The labels corresponding to the training feature vectors
    :type y: list
    :param Xtest: The matrix of test feature vectors
    :type Xtest: list
    :param ytest: The labels corresponding to the test feature vectors
    :type ytest: list
    :param K: The number of nearest neighbors to consider in classification
    :type K: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: Two lists of the validation and test accuracies across the k-folds
    """
    try:
        predicted, predicted_test = [], []
        # Define classifier and cross validation iterator
        clf = neighbors.KNeighborsClassifier(n_neighbors=K)
        # Start the cross validation learning
        X, y, Xtest, ytest = numpy.array(X), numpy.array(y), numpy.array(Xtest), numpy.array(ytest)
        # Select K Best features if enabled
        prettyPrint("Selecting %s best features from feature vectors" % selectKBest)
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        Xtest_new = SelectKBest(chi2, k=selectKBest).fit_transform(Xtest, ytest) if selectKBest else Xtest
        # Fit model
        prettyPrint("Fitting model")
        clf.fit(X_new, y)
        # Validate and test model
        prettyPrint("Validating model using training data")
        predicted = clf.predict(X_new)
        prettyPrint("Testing model")
        predicted_test = clf.predict(Xtest_new)
    except Exception as e:
        prettyPrintError(e)
        return [], []

    return predicted, predicted_test
def predictKFoldSVMSSK(X, y, kfold=10, subseqLength=3, selectKBest=0):
    """
    Classifies the data using Support vector machines with the SSK kernel and k-fold CV
    :param X: The list of text documents containing traces
    :type X: list
    :param y: The labels of documents in 'X'
    :type y: list
    :param kfold: The number of folds
    :type kfold: int (default: 10)
    :param subseqLength: Length of subsequence used by the SSK
    :type subseqLength: int (default: 3)
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: An array of predicted classes
    """
    try:
        predicted = []
        # Retrieve Gram Matrix from string kernel
        if verboseON():
            prettyPrint("Generating Gram Matrix from documents", "debug")
        X_gram = string_kernel(X, X)
        y = numpy.array(y)
        # Define classifier
        clf = svm.SVC(kernel="precomputed")
        X_gram_new = SelectKBest(chi2, k=selectKBest).fit_transform(X_gram, y) if selectKBest > 0 else X_gram
        prettyPrint("Performing %s-fold CV on the %s best features" % (kfold, selectKBest))
        predicted = cross_val_predict(clf, X_gram_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []

    return predicted
def predictKFoldSVM(X, y, kernel="linear", C=1, selectKBest=0, kfold=10): """ Classifies the data using Support vector machines and k-fold CV :param X: The matrix of feature vectors :type X: list :param y: The vector containing the labels corresponding to feature vectors :type y: list :param kernel: The kernel used to elevate data into higher dimensionalities :type kernel: str :param C: The penalty parameter of the error term :type C: int :param selectKBest: The number of best features to select :type selectKBest: int :param kfold: The number of folds to use in K-fold CV :type kfold: int :return: A list of predicted labels across the k-folds """ try: # Prepare data X, y = numpy.array(X), numpy.array(y) # Define classifier clf = svm.SVC(kernel=kernel, C=C) # Select K Best features if enabled X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist() except Exception as e: prettyPrintError(e) return [] return predicted
def predictKFoldRandomForest(X, y, estimators=10, criterion="gini", maxdepth=None, selectKBest=0, kfold=10): """ Classifies the data using decision trees and k-fold CV :param X: The matrix of feature vectors :type X: list :param y: The vector containing labels corresponding to the feature vectors :type y: list :param estimators: The number of random trees to use in classification :type estimators: int :param criterion: The splitting criterion employed by the decision tree :type criterion: str :param splitter: The method used to split the data :type splitter: str :param maxDepth: The maximum depth the tree is allowed to grow :type maxDepth: int :param selectKBest: The number of best features to select :type selectKBest: int :param kfold: The number of folds to use in K-fold CV :type kfold: int :return: A list of predicted labels across the k-folds """ try: # Prepare data X, y = numpy.array(X), numpy.array(y) # Define classifier clf = ensemble.RandomForestClassifier(n_estimators=estimators, criterion=criterion, max_depth=maxdepth) X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist() except Exception as e: prettyPrintError(e) return [] return predicted
def MIC(x, y):
    # Maximal Information Coefficient
    base = MINE()
    base.compute_score(x, y)
    # MIC yields only a single score, but a SelectKBest score function
    # must return two factors (score, p-value), so a dummy 0.5 is returned.
    return base.mic(), 0.5
def de_ps(X,y):
    dim = X.shape[1]
    de = min(2000,dim)
    clf = SelectKBest(lambda X, Y: np.array(map(lambda x:pearsonr(x, Y), X.T)).T, k=de)
    clf.fit(X,y)
    def _func(X1,X2):
        return clf.transform(X1),clf.transform(X2)
    return _func
def de_c2(X,y):
    dim = X.shape[1]
    de = min(2000,dim)
    clf = SelectKBest(chi2, k = de)
    clf.fit(X,y)
    def _func(X1,X2):
        return clf.transform(X1),clf.transform(X2)
    return _func
def de_mic(X,y):
    dim = X.shape[1]
    de = min(2000,dim)
    clf = SelectKBest(MIC, k=de)
    clf.fit(X,y)
    def _func(X1,X2):
        return clf.transform(X1),clf.transform(X2)
    return _func
def de_f_and_p_value(X,y):
    dim = X.shape[1]
    de = min(2000,dim)
    clf = SelectKBest(f_classif,k=de)
    clf.fit(X, y)
    def _func(X1,X2):
        return clf.transform(X1),clf.transform(X2)
    return _func
def de_f_and_p_value(X,y): """ f&p value """ dim = X.shape[1] de = min(2000,dim) clf = SelectKBest(f_classif,k=de) clf.fit(X, y) def _func(X1,X2): return clf.transform(X1),clf.transform(X2) return _func
def new(method='centroid',n_features=8):
    # Clustering method
    nc = METHODS[method]
    # Orthogonal feature selector
    if n_features is None:
        n_features = 'all'
    selector = SelectKBest(f_classif, k=n_features)
    # NOTE: the last operation in the list must be
    # a classifier or clustering model
    print(colored('Cluster model created','yellow'))
    return [selector, nc]
def get_local_words(word_count, threshold, y_train, train_seq, num_words):
    feature_index = delete_low_freq_words(word_count, threshold)
    print(len(train_seq), len(feature_index))
    word_freq_matrix = np.zeros([len(train_seq), len(feature_index)])

    for (seq_idx, seq) in enumerate(train_seq):
        word_freq_list = np.zeros(len(feature_index))

        for word in seq:
            if (word not in feature_index):
                continue
            else:
                word_idx = feature_index[word]
                word_freq_matrix[seq_idx][word_idx] += 1

    sk = SelectKBest(chi2, k="all")
    sk.fit_transform(csr_matrix(word_freq_matrix), y_train)
    score_list = sk.scores_

    word_score = {}
    for (feature, idx) in feature_index.items():
        word_score[feature] = score_list[idx]

    word_score = sorted(word_score.items(), key=lambda x: x[1], reverse=True)

    local_word_list = []
    for (word, score) in word_score[:num_words]:
        local_word_list.append(word)

    del word_freq_matrix
    return local_word_list
def k_best_features(self):
    # get total number of features.
    num_features = self.features.shape[1]
    feature_list = []

    # find k-best features, with k from 1 to num_features.
    for i in range(num_features):
        skBest = SelectKBest(k=i)
        skBest.fit_transform(self.features, self.labels)

        # get boolean indices of the best features.
        k_features = skBest.get_support()

        # append the features to the feature list.
        feature_list += self.features.columns[k_features].tolist()

    return feature_list
def selectFeatures(k_features=5, *args):
    """
    # Select k best features using the SelectKBest class in Sklearn.
    # Inputs: k=no. of features to select, args=(XTrain,yTrain)
    # returns: np array of k features.
    """
    X, y = args
    skb = SelectKBest(k=k_features)
    return skb.fit_transform(X, y)
def featuresByChiSq(features,labels,nFeature=5000):
    chi2_model = SelectKBest(chi2,k=nFeature)
    dtm = chi2_model.fit_transform(features,labels)
    return dtm,chi2_model
def train_and_score(X, y):
    X_train, X_test, y_train, y_test = split_data(X, y)

    clf = Pipeline([
        ('reduce_dim', SelectKBest(chi2, k=2)),
        ('train', LinearSVC(C=100))
    ])

    scores = cross_val_score(clf, X_train, y_train, cv=5, n_jobs=2)
    print("Mean Model Accuracy:", np.array(scores).mean())

    clf.fit(X_train, y_train)

    confuse(y_test, clf.predict(X_test))
    print()
def analyseReasonWithXsqure(anamolySample,normalSample,topk,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)
    X_new = SelectKBest(chi2, topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(0,len(name)):
        if outcome[i]:
            print name[i]
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    X_new = SelectKBest(chi2, topk).fit(data, target)
    outcome = X_new.get_support()
    for i in range(0, len(name)):
        if outcome[i]:
            print name[i]
def analyseReasonWithXsqure(anamolySample, normalSample, topk, name):
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = pd.concat([anamolySample,normalSample])
    for i in range(0, len(normalSample)):
        target.append(0)
    X_new = SelectKBest(chi2, topk).fit(data, target)
    outcome = X_new.get_support()
    warnstr = ""
    for i in range(0, len(name)):
        if outcome[i]:
            warnstr += name[i]
            warnstr += " ; "
    return warnstr