The following 19 code examples, extracted from open source Python projects, illustrate how to use the sklearn.ensemble module.
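Before diving into the extracted examples, here is a minimal, self-contained sketch (not one of the extracted examples) of the fit/predict pattern that all sklearn.ensemble estimators share; the toy dataset and parameter values are illustrative assumptions.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Toy data standing in for a real dataset
X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))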
import sklearn.ensemble
import sklearn.linear_model
import sklearn.neighbors
import sklearn.svm


def get_model_class(method):
    """
    Returns the class associated with a method string.

    :param method: A string describing the method to use.
    :return: A class corresponding to the method.
    """
    if method == 'logistic':
        return sklearn.linear_model.LogisticRegression
    elif method == 'svm':
        return sklearn.svm.SVC
    elif method == 'mirowski-svm':
        return sklearn.svm.SVC
    elif method == 'sgd':
        return sklearn.linear_model.SGDClassifier
    elif method == 'random-forest':
        return sklearn.ensemble.RandomForestClassifier
    elif method == 'nearest-centroid':
        return sklearn.neighbors.NearestCentroid
    elif method == 'knn':
        return sklearn.neighbors.KNeighborsClassifier
    elif method == 'bagging':
        return sklearn.ensemble.BaggingClassifier
    else:
        raise NotImplementedError("Method {} is not supported".format(method))
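A quick usage sketch for the lookup above: resolve a method string to a class, then instantiate and fit it. The toy dataset and the n_estimators value are assumptions for illustration.

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, n_features=5, random_state=0)
model_cls = get_model_class('random-forest')   # -> sklearn.ensemble.RandomForestClassifier
model = model_cls(n_estimators=50)             # hyperparameters are chosen by the caller
model.fit(X, y)
print(model.predict(X[:5]))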
import numpy
from sklearn import ensemble
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_predict
# prettyPrintError is the source project's own logging helper


def predictKFoldRandomForest(X, y, estimators=10, criterion="gini",
                             maxdepth=None, selectKBest=0, kfold=10):
    """
    Classifies the data using random forests and k-fold CV

    :param X: The matrix of feature vectors
    :type X: list
    :param y: The vector containing labels corresponding to the feature vectors
    :type y: list
    :param estimators: The number of random trees to use in classification
    :type estimators: int
    :param criterion: The splitting criterion employed by the decision trees
    :type criterion: str
    :param maxdepth: The maximum depth the trees are allowed to grow
    :type maxdepth: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :param kfold: The number of folds to use in K-fold CV
    :type kfold: int
    :return: A list of predicted labels across the k-folds
    """
    try:
        # Prepare data
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = ensemble.RandomForestClassifier(n_estimators=estimators,
                                              criterion=criterion,
                                              max_depth=maxdepth)
        # Select K best features if enabled
        X_new = SelectKBest(chi2, k=selectKBest).fit_transform(X, y) if selectKBest > 0 else X
        predicted = cross_val_predict(clf, X_new, y, cv=kfold).tolist()
    except Exception as e:
        prettyPrintError(e)
        return []

    return predicted
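A possible call, sketched with synthetic non-negative features (the chi2-based selection requires non-negative values); the data and parameter choices are illustrative assumptions.

import numpy

rng = numpy.random.RandomState(0)
X = rng.rand(300, 20)              # non-negative features, as chi2 requires
y = rng.randint(0, 2, 300)
labels = predictKFoldRandomForest(X.tolist(), y.tolist(), estimators=50,
                                  selectKBest=10, kfold=5)
print(labels[:10])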
def model_diagnostics(mod, X, y, type='single'):
    if type == 'ensemble':
        # Note: the 'ensemble' branch assumes a custom model whose predict
        # method also takes the labels, unlike the scikit-learn API
        g = mod.predict(X, y)
    else:
        g = mod.predict(X)
    # diagnostics is defined elsewhere in the source project
    return diagnostics(g, y)


# functions for accuracy statistics
def ensemble_regression(self, scoring_metric='neg_mean_squared_error', model_by_name=None):
    # TODO stub
    self.validate_regression('Ensemble Regression')
    raise HealthcareAIError('We apologize. An ensemble linear regression has not yet been implemented.')
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
# calculate_model_accuracy is the source project's own helper


def train_model(target, features, model_params, test_split):
    """
    Train the Gradient Boosting model on the provided data and calculate the
    accuracy of the model

    Input:
    @param target: 1D array of the variable that the model is to be trained to predict
    @param features: 2D array (n_samples x n_features) to use in training the model
    @param model_params: A dictionary of model parameters; the full specification can be
        found on the scikit-learn page for
        [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
    @param test_split: The fraction of the data to be withheld for testing the model /
        calculating the accuracy
    """
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=test_split)
    model = GradientBoostingRegressor(**model_params)
    model.fit(features_train, target_train)
    # Evaluate on the withheld test data, as the docstring describes
    accuracy = calculate_model_accuracy(model, features_test, target_test)
    return model, accuracy
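A hedged call sketch for train_model: make_regression stands in for real data, the parameter dictionary simply forwards keyword arguments to GradientBoostingRegressor, and calculate_model_accuracy must be provided by the surrounding project.

from sklearn.datasets import make_regression

# Synthetic regression data as an illustrative assumption
X, y = make_regression(n_samples=500, n_features=8, noise=0.1, random_state=0)
params = {'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 3}
model, accuracy = train_model(y, X, params, test_split=0.2)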
import numpy
from sklearn import ensemble, neighbors, svm
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectKBest, chi2
# prettyPrint and prettyPrintError are the source project's own logging helpers


def predictAndTestEnsemble(X, y, Xtest, ytest, classifiers=[], selectKBest=0):
    """
    Trains an ensemble of classifiers (with default params) using a training dataset,
    and returns the majority-vote predictions on the same training dataset and on an
    out-of-sample test dataset

    :param X: The matrix of training feature vectors
    :type X: list
    :param y: The labels corresponding to the training feature vectors
    :type y: list
    :param Xtest: The matrix of test feature vectors
    :type Xtest: list
    :param ytest: The labels corresponding to the test feature vectors
    :type ytest: list
    :param classifiers: A list of classifiers to use in the ensemble
    :type classifiers: list of str
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: Two lists of the predicted labels for the training and test datasets
    """
    try:
        predicted, predicted_test = [], []
        # Prepare the data
        X, y = numpy.array(X), numpy.array(y)
        Xtest, ytest = numpy.array(Xtest), numpy.array(ytest)
        # Define classifiers
        ensembleClassifiers = []
        for c in classifiers:
            if c.lower().find("knn") != -1:
                K = int(c.split('-')[-1])
                clf = neighbors.KNeighborsClassifier(n_neighbors=K)
            elif c.lower().find("svm") != -1:
                clf = svm.SVC(kernel='linear', C=1)
            elif c.lower().find("forest") != -1:
                E = int(c.split('-')[-1])
                clf = ensemble.RandomForestClassifier(n_estimators=E)
            # Add to list
            ensembleClassifiers.append((c, clf))
        # Select K best features if applicable; fit the selector on the training
        # data only and reuse it on the test data so both keep the same features
        if selectKBest > 0:
            selector = SelectKBest(chi2, k=selectKBest).fit(X, y)
            X_new, Xtest_new = selector.transform(X), selector.transform(Xtest)
        else:
            X_new, Xtest_new = X, Xtest
        # Train and fit the voting classifier
        voting = VotingClassifier(estimators=ensembleClassifiers, voting='hard')
        prettyPrint("Fitting ensemble model")
        voting = voting.fit(X_new, y)
        prettyPrint("Validating model")
        predicted = voting.predict(X_new)
        # Same for the test dataset
        prettyPrint("Testing the model")
        predicted_test = voting.predict(Xtest_new)
    except Exception as e:
        prettyPrintError(e)
        return [], []

    return predicted, predicted_test
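The parsing above implies that classifier strings encode a hyperparameter after the last hyphen, e.g. 'knn-5' for K=5 and 'forest-100' for 100 trees. A hedged call sketch with synthetic non-negative data (required by the chi2 selector):

import numpy

rng = numpy.random.RandomState(0)
X, y = rng.rand(200, 20), rng.randint(0, 2, 200)
Xtest, ytest = rng.rand(50, 20), rng.randint(0, 2, 50)
pred_train, pred_test = predictAndTestEnsemble(
    X.tolist(), y.tolist(), Xtest.tolist(), ytest.tolist(),
    classifiers=["knn-5", "svm", "forest-100"], selectKBest=10)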
import numpy
from sklearn import ensemble
from sklearn.feature_selection import SelectKBest, chi2
# prettyPrint and prettyPrintError are the source project's own logging helpers


def predictAndTestRandomForest(X, y, Xtest, ytest, estimators=10, criterion="gini",
                               maxdepth=None, selectKBest=0):
    """
    Trains a random forest using the training data and tests it using an
    out-of-sample test dataset

    :param X: The matrix of training feature vectors
    :type X: list
    :param y: The labels corresponding to the training feature vectors
    :type y: list
    :param Xtest: The matrix of test feature vectors
    :type Xtest: list
    :param ytest: The labels corresponding to the test feature vectors
    :type ytest: list
    :param estimators: The number of random trees to use in classification
    :type estimators: int
    :param criterion: The splitting criterion employed by the decision trees
    :type criterion: str
    :param maxdepth: The maximum depth the trees are allowed to grow
    :type maxdepth: int
    :param selectKBest: The number of best features to select
    :type selectKBest: int
    :return: Two lists of the predicted labels for the training and test datasets
    """
    try:
        predicted, predicted_test = [], []
        # Define the classifier
        clf = ensemble.RandomForestClassifier(n_estimators=estimators,
                                              criterion=criterion,
                                              max_depth=maxdepth)
        X, y = numpy.array(X), numpy.array(y)
        Xtest, ytest = numpy.array(Xtest), numpy.array(ytest)
        # Select K best features if enabled; fit the selector on the training
        # data only and reuse it on the test data so both keep the same features
        if selectKBest > 0:
            prettyPrint("Selecting %s best features from feature vectors" % selectKBest)
            selector = SelectKBest(chi2, k=selectKBest).fit(X, y)
            X_new, Xtest_new = selector.transform(X), selector.transform(Xtest)
        else:
            X_new, Xtest_new = X, Xtest
        # Fit model
        prettyPrint("Fitting model")
        clf.fit(X_new, y)
        # Validate and test model
        prettyPrint("Validating model using training data")
        predicted = clf.predict(X_new)
        prettyPrint("Testing model")
        predicted_test = clf.predict(Xtest_new)
    except Exception as e:
        prettyPrintError(e)
        return [], []

    return predicted, predicted_test
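A hedged usage sketch; accuracy_score from sklearn.metrics is added here purely to turn the returned label lists into scores, and the synthetic data is an assumption.

import numpy
from sklearn.metrics import accuracy_score

rng = numpy.random.RandomState(1)
X, y = rng.rand(200, 20), rng.randint(0, 2, 200)
Xtest, ytest = rng.rand(50, 20), rng.randint(0, 2, 50)
pred_train, pred_test = predictAndTestRandomForest(
    X.tolist(), y.tolist(), Xtest.tolist(), ytest.tolist(),
    estimators=100, selectKBest=10)
print(accuracy_score(y, pred_train), accuracy_score(ytest, pred_test))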
def ensemble_classification(self, scoring_metric='roc_auc', trained_model_by_name=None):
    """
    This provides a simple way to put data in and have healthcare.ai train a few models
    and pick the best one for your data.

    Args:
        scoring_metric (str): The metric used to rank the models. Defaults to 'roc_auc'
        trained_model_by_name (dict): A dictionary of trained models to compare for a custom ensemble

    Returns:
        TrainedSupervisedModel: The best TrainedSupervisedModel found.
    """
    self.validate_classification('Ensemble Classification')
    self.validate_score_metric_for_number_of_classes(scoring_metric)
    score_by_name = {}

    # Here is the default list of algorithms to try for the ensemble
    # Adding an ensemble method is as easy as adding a new key:value pair in the `trained_model_by_name` dictionary
    if trained_model_by_name is None:
        # TODO because these now all return TSMs it will be additionally slow by all the factor models.
        # TODO Could these be trained separately then after the best is found, train the factor model and add to TSM?
        trained_model_by_name = {
            'KNN': self.knn(randomized_search=True, scoring_metric=scoring_metric),
            'Logistic Regression': self.logistic_regression(randomized_search=True),
            'Random Forest Classifier': self.random_forest_classifier(
                trees=200, randomized_search=True, scoring_metric=scoring_metric)}

    for name, model in trained_model_by_name.items():
        # Unroll estimator from trained supervised model
        estimator = hcai_tsm.get_estimator_from_trained_supervised_model(model)
        # Get the score objects for the estimator
        score = self.metrics(estimator)
        self._console_log('{} algorithm: score = {}'.format(name, score))
        # TODO this may need to ferret out each classification score separately
        score_by_name[name] = score[scoring_metric]

    sorted_names_and_scores = sorted(score_by_name.items(), key=lambda x: x[1])
    best_algorithm_name, best_score = sorted_names_and_scores[-1]
    best_model = trained_model_by_name[best_algorithm_name]

    self._console_log('Based on the scoring metric {}, the best algorithm found is: {}'.format(
        scoring_metric, best_algorithm_name))
    self._console_log('{} {} = {}'.format(best_algorithm_name, scoring_metric, best_score))

    return best_model
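Assuming an already-constructed healthcare.ai trainer exposing the method above (the trainer setup itself is omitted, since it depends on the project's SupervisedModelTrainer), a call with a custom model dictionary might look like this; knn and random_forest_classifier are the trainer methods referenced in the default dictionary above.

# `trainer` is assumed to be an existing healthcare.ai trainer instance
custom_models = {
    'KNN': trainer.knn(randomized_search=True, scoring_metric='roc_auc'),
    'Random Forest Classifier': trainer.random_forest_classifier(
        trees=200, randomized_search=True, scoring_metric='roc_auc'),
}
best_model = trainer.ensemble_classification(scoring_metric='roc_auc',
                                             trained_model_by_name=custom_models)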
def rand_forest_train(self):
    # Load the labeled user data
    users = pd.read_csv('names.csv')

    # Use the similarity, platform, reputation and entropy columns as features
    X = users[['similarity', 'platform', 'reputation', 'entropy']]
    y = users['human_or_machine']

    # Split the data, holding out 25% as the test set
    from sklearn.model_selection import train_test_split  # formerly sklearn.cross_validation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

    # Vectorize the feature dictionaries
    from sklearn.feature_extraction import DictVectorizer
    vec = DictVectorizer(sparse=False)
    X_train = vec.fit_transform(X_train.to_dict(orient='records'))
    X_test = vec.transform(X_test.to_dict(orient='records'))

    # Train a single decision tree and predict on the test set
    from sklearn.tree import DecisionTreeClassifier
    dtc = DecisionTreeClassifier()
    dtc.fit(X_train, y_train)
    dtc_y_pred = dtc.predict(X_test)

    # Train a random forest classifier and predict on the test set
    from sklearn.ensemble import RandomForestClassifier
    rfc = RandomForestClassifier()
    rfc.fit(X_train, y_train)
    rfc_y_pred = rfc.predict(X_test)

    # Train a gradient boosting classifier and predict on the test set
    from sklearn.ensemble import GradientBoostingClassifier
    gbc = GradientBoostingClassifier()
    gbc.fit(X_train, y_train)
    gbc_y_pred = gbc.predict(X_test)

    from sklearn.metrics import classification_report

    # Report the decision tree's accuracy along with precision, recall and F1
    print("Decision tree accuracy:", dtc.score(X_test, y_test))
    print(classification_report(dtc_y_pred, y_test))

    # Report the random forest's accuracy along with precision, recall and F1
    print("Random forest accuracy:", rfc.score(X_test, y_test))
    print(classification_report(rfc_y_pred, y_test))

    # Report the gradient boosting classifier's accuracy along with precision, recall and F1
    print("Gradient boosting accuracy:", gbc.score(X_test, y_test))
    print(classification_report(gbc_y_pred, y_test))

    # Load the unlabeled data and predict with the trained random forest
    users = pd.read_csv('values.csv')
    X = users[['similarity', 'platform', 'reputation', 'entropy']]
    X = vec.transform(X.to_dict(orient='records'))
    print(rfc.predict(X))

    self.dtc = dtc
    self.rfc = rfc
    self.gbc = gbc
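Because names.csv and values.csv are project-specific files, here is a self-contained sketch of the same three-way comparison on synthetic data; every dataset and parameter value below is an illustrative assumption.

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Synthetic stand-in for the four-feature user dataset
X, y = make_classification(n_samples=1000, n_features=4, random_state=33)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

for name, clf in [('Decision tree', DecisionTreeClassifier()),
                  ('Random forest', RandomForestClassifier()),
                  ('Gradient boosting', GradientBoostingClassifier())]:
    clf.fit(X_train, y_train)
    print(name, 'accuracy:', clf.score(X_test, y_test))
    print(classification_report(y_test, clf.predict(X_test)))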