The following 12 code examples, extracted from open-source Python projects, illustrate how to use sklearn.feature_selection.SelectPercentile().
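Before the project examples, here is a minimal self-contained sketch of the basic fit/transform cycle. The iris data and the 50th percentile are illustrative choices, not taken from any example below:

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectPercentile, f_classif

X, y = load_iris(return_X_y=True)

# Keep the top 50% of features, ranked by ANOVA F-value.
selector = SelectPercentile(f_classif, percentile=50)
X_reduced = selector.fit_transform(X, y)

print(X.shape, '->', X_reduced.shape)   # (150, 4) -> (150, 2)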
def get_binary(self):
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'),
                                  norm='l2', use_idf=True)),
        ('feat_select', SelectPercentile(percentile=10)),
        # Note: n_iter was renamed max_iter in scikit-learn 0.19 and removed in 0.21.
        ('clf', OneVsRestClassifier(SGDClassifier(
            alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
            eta0=0.0, fit_intercept=True, l1_ratio=0.15,
            learning_rate='optimal', loss='log', n_iter=10, n_jobs=1,
            penalty='l2', power_t=0.5, random_state=None, shuffle=True,
            verbose=0, warm_start=False)))
    ])
def get_sgdc(self):
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'),
                                  norm='l2', use_idf=True)),
        ('feat_select', SelectPercentile(percentile=10)),
        ('clf', SGDClassifier(
            alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
            eta0=0.0, fit_intercept=True, l1_ratio=0.15,
            learning_rate='optimal', loss='log', n_iter=10, n_jobs=1,
            penalty='l2', power_t=0.5, random_state=None, shuffle=True,
            verbose=0, warm_start=False))
    ])
def select_percentile_selector(data, target):
    # Select model
    selector = SelectPercentile(percentile=75)  # the parameter's default is 10
    # Fit, format, and return
    return format_selector(selector, data, target)
    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html
def select_feat(X, y, percentile=20):
    """Select the best `percentile` % of features (20 by default) using the
    ANOVA F-value (*f_classif* from scikit-learn)."""
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(X, y)
    return selector.transform(X)
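When you also need to know which columns survived, a fitted selector exposes get_support() and scores_. A small sketch, with make_classification standing in for the X and y that select_feat above would receive:

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectPercentile, f_classif

# Synthetic data stands in for the X, y arguments of select_feat.
X, y = make_classification(n_samples=200, n_features=10,
                           n_informative=3, random_state=0)

selector = SelectPercentile(f_classif, percentile=20)
selector.fit(X, y)

print('retained columns:', selector.get_support(indices=True))
print('per-feature F scores:', selector.scores_.round(2))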
def get_classification_data(self, division_dummies=True, samples=None, percentile=100):
    raw = PlayerCollection.filter_by_class(self.raw)
    df = PlayerCollection.raw_to_df(raw)
    players, divisions = PlayerCollection.aggregate_df(df)
    players, divisions = PlayerCollection.to_matrix(players, divisions)
    players, divisions = PlayerCollection.subsample(players, divisions, samples)
    X_train, X_test, y_train, y_test = train_test_split(
        players, divisions, random_state=42, stratify=divisions)
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    X_test = selector.transform(X_test)
    if division_dummies:
        # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
        y_train = pd.get_dummies(y_train).to_numpy()
        y_test = pd.get_dummies(y_test).to_numpy()
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test
def test_make_pipeline():
    # Note: this exercises an extended make_pipeline/make_union that accept
    # lists of alternative estimators per step (recorded in _param_grid);
    # it is not the stock sklearn.pipeline helper of the same name.
    t1 = SelectKBest()
    t2 = SelectKBest()
    t3 = SelectKBest()
    t4 = SelectKBest()
    t5 = SelectPercentile()
    t6 = SelectKBest()
    t7 = SelectKBest()
    t8 = SelectKBest()
    t9 = SelectPercentile()
    in_steps = [[t1, None],
                [t2, t3],
                [t4, t5],  # mixed
                t6,
                [None, t7],
                [t8, None, t9],  # mixed
                None]
    pipe = make_pipeline(*in_steps, memory='/path/to/nowhere')
    union = make_union(*in_steps)

    for est, est_steps in [(pipe, pipe.steps),
                           (union, union.transformer_list)]:
        names, steps = zip(*est_steps)
        assert names == ('selectkbest-1', 'selectkbest-2', 'alt-1',
                         'selectkbest-3', 'selectkbest-4', 'alt-2', 'nonetype')
        assert steps == (t1, t2, t4, t6, None, t8, None)

        assert len(est._param_grid) == 5
        assert est._param_grid[names[0]] == [t1, None]
        assert est._param_grid[names[1]] == [t2, t3]
        assert est._param_grid[names[2]] == [t4, t5]
        assert est._param_grid[names[4]] == [None, t7]
        assert est._param_grid[names[5]] == [t8, None, t9]

    assert type(pipe) is Pipeline
    assert type(union) is FeatureUnion
    assert pipe.memory == '/path/to/nowhere'
def __init__(self, filename=None):
    super().__init__(filename)
    if not filename:
        self.clf = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'))),
            ('anova', SelectPercentile(f_classif)),
            ('clf', MultinomialNB())
        ])
def variance_threshold_selector(data, target):
    # Select model
    selector = VarianceThreshold(0)  # threshold defaults to 0.0, i.e. only remove features with the same value in all samples
    # Fit, format, and return
    return format_selector(selector, data, target)
    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html
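For contrast with the percentile-based selectors, a minimal sketch of what the threshold of 0 means in practice. The toy matrix is made up for illustration:

import numpy as np
from sklearn.feature_selection import VarianceThreshold

# The middle column is constant, so it is the only one dropped at threshold 0.
X = np.array([[1, 7, 0],
              [2, 7, 1],
              [3, 7, 0]])

selector = VarianceThreshold(0)
print(selector.fit_transform(X))   # keeps columns 0 and 2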
def __init__(self, clf, params, name):
    self.clf = Pipeline([
        ('select', SelectPercentile(score_func=mutual_info_classif, percentile=70)),
        ('clf', clf)
    ])
    params['select__percentile'] = [60, 70, 80, 90]
    self.clf = GridSearchCV(self.clf, param_grid=params, scoring='f1_macro')
    self.name = name
    self.scaler = MinMaxScaler()
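Since the wrapper class and its clf/params arguments are not shown, here is a hedged reconstruction of the same search with an illustrative classifier (LogisticRegression, the C grid, and the synthetic data are stand-ins), showing how the tuned percentile comes back from GridSearchCV:

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectPercentile, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=300, n_features=20, random_state=0)

# Same pipeline structure as the wrapper above.
pipe = Pipeline([
    ('select', SelectPercentile(score_func=mutual_info_classif, percentile=70)),
    ('clf', LogisticRegression(max_iter=1000)),
])
search = GridSearchCV(pipe,
                      param_grid={'select__percentile': [60, 70, 80, 90],
                                  'clf__C': [0.1, 1.0, 10.0]},
                      scoring='f1_macro')
search.fit(X, y)

print(search.best_params_)   # e.g. {'clf__C': 1.0, 'select__percentile': 80}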
def preprocess(X, y):
    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = \
        model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)
    joblib.dump(vectorizer, 'vectorizer_intent.pkl')

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    joblib.dump(selector, 'selector_intent.pkl')
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    return features_train_transformed, features_test_transformed, labels_train, labels_test
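The two joblib.dump calls exist so the fitted transforms can be replayed at prediction time. A minimal sketch of the loading side (the file names match the dumps above; the query string is made up):

import joblib

vectorizer = joblib.load('vectorizer_intent.pkl')
selector = joblib.load('selector_intent.pkl')

# A made-up incoming utterance; apply the fitted transforms in the same order.
query = ['book me a table for two tomorrow evening']
features = selector.transform(vectorizer.transform(query)).toarray()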
def buildVectorizer(classes, examples, parameters):
    featureChoice = None
    doFeatureSelection = False
    tfidf = False
    featureSelectPerc = 10

    if "featureChoice" in parameters:
        featureChoice = parameters["featureChoice"]
    if "doFeatureSelection" in parameters and parameters["doFeatureSelection"] == "True":
        doFeatureSelection = True
    if "featureSelectPerc" in parameters:
        featureSelectPerc = int(parameters["featureSelectPerc"])
    if "tfidf" in parameters and parameters["tfidf"] == "True":
        tfidf = True

    print("Starting vectorizer...")
    vectorizer = Vectorizer(classes, examples, featureChoice, tfidf)
    vectors = vectorizer.getTrainingVectors()
    print("Vectors of size:", vectors.shape)

    if doFeatureSelection:
        print("Trimming training vectors...")
        from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2
        # featureSelector = SelectKBest(chi2, k=100)
        featureSelector = SelectPercentile(chi2, percentile=featureSelectPerc)
        vectorsTrimmed = featureSelector.fit_transform(vectors, classes)
        vectorsTrimmed = coo_matrix(vectorsTrimmed)
        print("Trimmed training vectors of size:", vectorsTrimmed.shape)
    else:
        vectorsTrimmed = vectors
        featureSelector = None

    return vectorsTrimmed, vectorizer, featureSelector
def preprocess(words_file="../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """
    This function takes a pre-made list of email texts (by default word_data.pkl)
    and the corresponding authors (by default email_authors.pkl) and performs
    a number of preprocessing steps:
        -- splits into training/testing sets (10% testing)
        -- vectorizes into a tfidf matrix
        -- selects/keeps the most helpful features

    After this, the features and labels are put into numpy arrays, which play
    nice with sklearn functions.

    4 objects are returned:
        -- training/testing features
        -- training/testing labels
    """

    ### the words (features) and authors (labels), already largely preprocessed;
    ### this preprocessing will be repeated in the text learning mini-project
    with open(authors_file, "rb") as authors_file_handler:
        authors = pickle.load(authors_file_handler)
    with open(words_file, "rb") as words_file_handler:
        word_data = pickle.load(words_file_handler)

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training); note sklearn.cross_validation was
    ### renamed sklearn.model_selection in scikit-learn 0.18
    features_train, features_test, labels_train, labels_test = \
        cross_validation.train_test_split(word_data, authors,
                                          test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=1)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print("no. of Chris training emails:", sum(labels_train))
    print("no. of Sara training emails:", len(labels_train) - sum(labels_train))

    return features_train_transformed, features_test_transformed, labels_train, labels_test
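A fitted SelectPercentile also keeps the per-feature scores and p-values it ranked on, which helps sanity-check an aggressive setting like percentile=1. A short sketch, with the 20 newsgroups corpus standing in for the email data above (it downloads on first run):

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif

data = fetch_20newsgroups(subset='train',
                          categories=['sci.space', 'rec.autos'])
X = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                    stop_words='english').fit_transform(data.data)

selector = SelectPercentile(f_classif, percentile=1).fit(X, data.target)
top = np.argsort(selector.scores_)[::-1][:5]
print('highest-scoring feature indices:', top)
print('their p-values:', selector.pvalues_[top])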