Python sklearn 模块,ensemble() 实例源码

我们从Python开源项目中，提取了以下19个代码示例，用于说明如何使用 sklearn.ensemble 模块。

项目:kaggle-seizure-prediction    作者:sics-lm    | 项目源码 | 文件源码
def get_model_class(method):
    """
    Look up the scikit-learn estimator class registered for a method name.

    The sklearn attribute is resolved only for the entry that matches, so an
    unknown name never touches sklearn at all.

    :param method: A string describing the method to use.
    :return: The estimator class corresponding to ``method``.
    :raises NotImplementedError: If ``method`` is not one of the known names.
    """
    # (method name, sklearn submodule, estimator class name)
    known_methods = (
        ('logistic', 'linear_model', 'LogisticRegression'),
        ('svm', 'svm', 'SVC'),
        ('mirowski-svm', 'svm', 'SVC'),
        ('sgd', 'linear_model', 'SGDClassifier'),
        ('random-forest', 'ensemble', 'RandomForestClassifier'),
        ('nearest-centroid', 'neighbors', 'NearestCentroid'),
        ('knn', 'neighbors', 'KNeighborsClassifier'),
        ('bagging', 'ensemble', 'BaggingClassifier'),
    )
    for label, submodule, class_name in known_methods:
        if method == label:
            return getattr(getattr(sklearn, submodule), class_name)
    raise NotImplementedError("Method {} is not supported".format(method))
项目:Aion    作者:aleisalem    | 项目源码 | 文件源码
def predictKFoldRandomForest(X, y, estimators=10, criterion="gini", maxdepth=None, selectKBest=0, kfold=10):
    """
    Classifies the data using a random forest and k-fold CV

    When feature selection is enabled it is fitted inside every fold (as a
    pipeline step), so the held-out fold never influences which features are
    kept.

    :param X: The matrix of feature vectors
    :type X: list
    :param y: The vector containing labels corresponding to the feature vectors
    :type y: list
    :param estimators: The number of random trees to use in classification
    :type estimators: int
    :param criterion: The splitting criterion employed by the decision trees
    :type criterion: str
    :param maxdepth: The maximum depth the trees are allowed to grow
    :type maxdepth: int
    :param selectKBest: The number of best features to select (0 disables selection)
    :type selectKBest: int
    :param kfold: The number of folds to use in K-fold CV
    :type kfold: int
    :return: A list of predicted labels across the k-folds, or an empty list if any step raised
    """
    try:
        # Prepare data
        X, y = numpy.array(X), numpy.array(y)
        # Define classifier
        clf = ensemble.RandomForestClassifier(n_estimators=estimators, criterion=criterion, max_depth=maxdepth)
        if selectKBest > 0:
            # Selecting features on the full data set before cross validation
            # leaks the held-out labels; a pipeline re-fits the selector on
            # the training part of each fold instead.
            from sklearn.pipeline import make_pipeline
            model = make_pipeline(SelectKBest(chi2, k=selectKBest), clf)
        else:
            model = clf
        predicted = cross_val_predict(model, X, y, cv=kfold).tolist()
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back no predictions.
        prettyPrintError(e)
        return []

    return predicted
项目:document_classification    作者:scotthlee    | 项目源码 | 文件源码
def model_diagnostics(mod, X, y, type='single'):
    """
    Predict with ``mod`` and pass the predictions and ``y`` to ``diagnostics``.

    :param mod: Object exposing a ``predict`` method
    :param X: Input handed to ``mod.predict``
    :param y: Labels; also forwarded to ``predict`` when ``type`` is 'ensemble'
    :param type: 'ensemble' calls ``predict(X, y)``; anything else calls ``predict(X)``
    :return: Whatever ``diagnostics`` returns for the predictions and ``y``
    """
    predictions = mod.predict(X, y) if type == 'ensemble' else mod.predict(X)
    return diagnostics(predictions, y)

#functions for accuracy statistics
项目:healthcareai-py    作者:HealthCatalyst    | 项目源码 | 文件源码
def ensemble_regression(self, scoring_metric='neg_mean_squared_error', model_by_name=None):
        """
        Placeholder for ensemble regression; always raises after validation.

        Args:
            scoring_metric: Accepted but not used by this stub.
            model_by_name: Accepted but not used by this stub.

        Raises:
            HealthcareAIError: Always, once ``validate_regression`` has returned.
        """
        # TODO stub
        not_implemented_message = 'We apologize. An ensemble linear regression has not yet been implemented.'
        self.validate_regression('Ensemble Regression')
        raise HealthcareAIError(not_implemented_message)
项目:crankshaft    作者:CartoDB    | 项目源码 | 文件源码
def train_model(target, features, model_params, test_split):
    """
        Train the Gradient Boosting model on a training split of the provided data and calculate the
        accuracy of the model on the withheld test split
        Input:
            @param target: 1D Array of the variable that the model is to be trained to predict
            @param features: 2D Array NSamples * NFeatures to use in training the model
            @param model_params: A dictionary of model parameters, the full specification can be found on the
                scikit learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
            @param test_split: The fraction of the data to be withheld for testing the model / calculating the accuracy
        Output:
            The fitted model and the value returned by calculate_model_accuracy for the withheld split
    """
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=test_split)
    model = GradientBoostingRegressor(**model_params)
    model.fit(features_train, target_train)
    # Score on the withheld rows only: scoring on the full data set would
    # include the rows the model was fitted on and overstate its accuracy.
    accuracy = calculate_model_accuracy(model, features_test, target_test)
    return model, accuracy
项目:crankshaft    作者:CartoDB    | 项目源码 | 文件源码
def train_model(target, features, model_params, test_split):
    """
        Train the Gradient Boosting model on a training split of the provided data and calculate the
        accuracy of the model on the withheld test split
        Input:
            @param target: 1D Array of the variable that the model is to be trained to predict
            @param features: 2D Array NSamples * NFeatures to use in training the model
            @param model_params: A dictionary of model parameters, the full specification can be found on the
                scikit learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
            @param test_split: The fraction of the data to be withheld for testing the model / calculating the accuracy
        Output:
            The fitted model and the value returned by calculate_model_accuracy for the withheld split
    """
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=test_split)
    model = GradientBoostingRegressor(**model_params)
    model.fit(features_train, target_train)
    # Score on the withheld rows only: scoring on the full data set would
    # include the rows the model was fitted on and overstate its accuracy.
    accuracy = calculate_model_accuracy(model, features_test, target_test)
    return model, accuracy
项目:crankshaft    作者:CartoDB    | 项目源码 | 文件源码
def train_model(target, features, model_params, test_split):
    """
        Train the Gradient Boosting model on a training split of the provided data and calculate the
        accuracy of the model on the withheld test split
        Input:
            @param target: 1D Array of the variable that the model is to be trained to predict
            @param features: 2D Array NSamples * NFeatures to use in training the model
            @param model_params: A dictionary of model parameters, the full specification can be found on the
                scikit learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
            @param test_split: The fraction of the data to be withheld for testing the model / calculating the accuracy
        Output:
            The fitted model and the value returned by calculate_model_accuracy for the withheld split
    """
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=test_split)
    model = GradientBoostingRegressor(**model_params)
    model.fit(features_train, target_train)
    # Score on the withheld rows only: scoring on the full data set would
    # include the rows the model was fitted on and overstate its accuracy.
    accuracy = calculate_model_accuracy(model, features_test, target_test)
    return model, accuracy
项目:crankshaft    作者:CartoDB    | 项目源码 | 文件源码
def train_model(target, features, model_params, test_split):
    """
        Train the Gradient Boosting model on a training split of the provided data and calculate the
        accuracy of the model on the withheld test split
        Input:
            @param target: 1D Array of the variable that the model is to be trained to predict
            @param features: 2D Array NSamples * NFeatures to use in training the model
            @param model_params: A dictionary of model parameters, the full specification can be found on the
                scikit learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
            @param test_split: The fraction of the data to be withheld for testing the model / calculating the accuracy
        Output:
            The fitted model and the value returned by calculate_model_accuracy for the withheld split
    """
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=test_split)
    model = GradientBoostingRegressor(**model_params)
    model.fit(features_train, target_train)
    # Score on the withheld rows only: scoring on the full data set would
    # include the rows the model was fitted on and overstate its accuracy.
    accuracy = calculate_model_accuracy(model, features_test, target_test)
    return model, accuracy
项目:crankshaft    作者:CartoDB    | 项目源码 | 文件源码
def train_model(target, features, model_params, test_split):
    """
        Train the Gradient Boosting model on a training split of the provided data and calculate the
        accuracy of the model on the withheld test split
        Input:
            @param target: 1D Array of the variable that the model is to be trained to predict
            @param features: 2D Array NSamples * NFeatures to use in training the model
            @param model_params: A dictionary of model parameters, the full specification can be found on the
                scikit learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
            @param test_split: The fraction of the data to be withheld for testing the model / calculating the accuracy
        Output:
            The fitted model and the value returned by calculate_model_accuracy for the withheld split
    """
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=test_split)
    model = GradientBoostingRegressor(**model_params)
    model.fit(features_train, target_train)
    # Score on the withheld rows only: scoring on the full data set would
    # include the rows the model was fitted on and overstate its accuracy.
    accuracy = calculate_model_accuracy(model, features_test, target_test)
    return model, accuracy
项目:crankshaft    作者:CartoDB    | 项目源码 | 文件源码
def train_model(target, features, model_params, test_split):
    """
        Train the Gradient Boosting model on a training split of the provided data and calculate the
        accuracy of the model on the withheld test split
        Input:
            @param target: 1D Array of the variable that the model is to be trained to predict
            @param features: 2D Array NSamples * NFeatures to use in training the model
            @param model_params: A dictionary of model parameters, the full specification can be found on the
                scikit learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
            @param test_split: The fraction of the data to be withheld for testing the model / calculating the accuracy
        Output:
            The fitted model and the value returned by calculate_model_accuracy for the withheld split
    """
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=test_split)
    model = GradientBoostingRegressor(**model_params)
    model.fit(features_train, target_train)
    # Score on the withheld rows only: scoring on the full data set would
    # include the rows the model was fitted on and overstate its accuracy.
    accuracy = calculate_model_accuracy(model, features_test, target_test)
    return model, accuracy
项目:crankshaft    作者:CartoDB    | 项目源码 | 文件源码
def train_model(target, features, model_params, test_split):
    """
        Train the Gradient Boosting model on a training split of the provided data and calculate the
        accuracy of the model on the withheld test split
        Input:
            @param target: 1D Array of the variable that the model is to be trained to predict
            @param features: 2D Array NSamples * NFeatures to use in training the model
            @param model_params: A dictionary of model parameters, the full specification can be found on the
                scikit learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
            @param test_split: The fraction of the data to be withheld for testing the model / calculating the accuracy
        Output:
            The fitted model and the value returned by calculate_model_accuracy for the withheld split
    """
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=test_split)
    model = GradientBoostingRegressor(**model_params)
    model.fit(features_train, target_train)
    # Score on the withheld rows only: scoring on the full data set would
    # include the rows the model was fitted on and overstate its accuracy.
    accuracy = calculate_model_accuracy(model, features_test, target_test)
    return model, accuracy
项目:crankshaft    作者:CartoDB    | 项目源码 | 文件源码
def train_model(target, features, model_params, test_split):
    """
        Train the Gradient Boosting model on a training split of the provided data and calculate the
        accuracy of the model on the withheld test split
        Input:
            @param target: 1D Array of the variable that the model is to be trained to predict
            @param features: 2D Array NSamples * NFeatures to use in training the model
            @param model_params: A dictionary of model parameters, the full specification can be found on the
                scikit learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
            @param test_split: The fraction of the data to be withheld for testing the model / calculating the accuracy
        Output:
            The fitted model and the value returned by calculate_model_accuracy for the withheld split
    """
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=test_split)
    model = GradientBoostingRegressor(**model_params)
    model.fit(features_train, target_train)
    # Score on the withheld rows only: scoring on the full data set would
    # include the rows the model was fitted on and overstate its accuracy.
    accuracy = calculate_model_accuracy(model, features_test, target_test)
    return model, accuracy
项目:crankshaft    作者:CartoDB    | 项目源码 | 文件源码
def train_model(target, features, model_params, test_split):
    """
        Train the Gradient Boosting model on a training split of the provided data and calculate the
        accuracy of the model on the withheld test split
        Input:
            @param target: 1D Array of the variable that the model is to be trained to predict
            @param features: 2D Array NSamples * NFeatures to use in training the model
            @param model_params: A dictionary of model parameters, the full specification can be found on the
                scikit learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
            @param test_split: The fraction of the data to be withheld for testing the model / calculating the accuracy
        Output:
            The fitted model and the value returned by calculate_model_accuracy for the withheld split
    """
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=test_split)
    model = GradientBoostingRegressor(**model_params)
    model.fit(features_train, target_train)
    # Score on the withheld rows only: scoring on the full data set would
    # include the rows the model was fitted on and overstate its accuracy.
    accuracy = calculate_model_accuracy(model, features_test, target_test)
    return model, accuracy
项目:crankshaft    作者:CartoDB    | 项目源码 | 文件源码
def train_model(target, features, model_params, test_split):
    """
        Train the Gradient Boosting model on a training split of the provided data and calculate the
        accuracy of the model on the withheld test split
        Input:
            @param target: 1D Array of the variable that the model is to be trained to predict
            @param features: 2D Array NSamples * NFeatures to use in training the model
            @param model_params: A dictionary of model parameters, the full specification can be found on the
                scikit learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
            @param test_split: The fraction of the data to be withheld for testing the model / calculating the accuracy
        Output:
            The fitted model and the value returned by calculate_model_accuracy for the withheld split
    """
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=test_split)
    model = GradientBoostingRegressor(**model_params)
    model.fit(features_train, target_train)
    # Score on the withheld rows only: scoring on the full data set would
    # include the rows the model was fitted on and overstate its accuracy.
    accuracy = calculate_model_accuracy(model, features_test, target_test)
    return model, accuracy
项目:crankshaft    作者:CartoDB    | 项目源码 | 文件源码
def train_model(target, features, model_params, test_split):
    """
        Train the Gradient Boosting model on a training split of the provided data and calculate the
        accuracy of the model on the withheld test split
        Input:
            @param target: 1D Array of the variable that the model is to be trained to predict
            @param features: 2D Array NSamples * NFeatures to use in training the model
            @param model_params: A dictionary of model parameters, the full specification can be found on the
                scikit learn page for [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
            @param test_split: The fraction of the data to be withheld for testing the model / calculating the accuracy
        Output:
            The fitted model and the value returned by calculate_model_accuracy for the withheld split
    """
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=test_split)
    model = GradientBoostingRegressor(**model_params)
    model.fit(features_train, target_train)
    # Score on the withheld rows only: scoring on the full data set would
    # include the rows the model was fitted on and overstate its accuracy.
    accuracy = calculate_model_accuracy(model, features_test, target_test)
    return model, accuracy
项目:Aion    作者:aleisalem    | 项目源码 | 文件源码
def predictAndTestEnsemble(X, y, Xtest, ytest, classifiers=None, selectKBest=0):
    """
    Trains a hard-voting ensemble of classifiers on a training dataset and
    returns its predictions for that same training dataset and for an
    out-of-sample test dataset

    :param X: The matrix of training feature vectors
    :type X: list
    :param y: The labels corresponding to the training feature vectors
    :type y: list
    :param Xtest: The matrix of test feature vectors
    :type Xtest: list
    :param ytest: The labels corresponding to the test feature vectors (not used for fitting or feature selection)
    :type ytest: list
    :param classifiers: Names of the classifiers to use in the ensemble. A name containing "knn" or "forest"
        must end in "-<int>" (neighbours / trees); a name containing "svm" selects a linear SVM
    :type classifiers: list of str
    :param selectKBest: The number of best features to select (0 disables selection)
    :type selectKBest: int
    :return: The predicted labels for the training data and for the test data, or two empty lists if any step raised
    """
    try:
        predicted, predicted_test = [], []
        # Prepare the data
        X, y, Xtest, ytest = numpy.array(X), numpy.array(y), numpy.array(Xtest), numpy.array(ytest)
        # Define classifiers
        ensembleClassifiers = []
        for c in (classifiers if classifiers is not None else []):
            name = c.lower()
            if "knn" in name:
                K = int(c.split('-')[-1])
                clf = neighbors.KNeighborsClassifier(n_neighbors=K)
            elif "svm" in name:
                clf = svm.SVC(kernel='linear', C=1)
            elif "forest" in name:
                E = int(c.split('-')[-1])
                clf = ensemble.RandomForestClassifier(n_estimators=E)
            else:
                # Without this branch an unknown name silently re-used the
                # previous estimator (or hit an unbound local on the first pass).
                raise ValueError("Unsupported classifier \"%s\"" % c)
            # Add to list
            ensembleClassifiers.append((c, clf))
        # Select K Best features if applicable. The selector is fitted on the
        # training data only and then applied to both sets, so the test
        # columns are exactly the ones the model was trained on.
        if selectKBest > 0:
            selector = SelectKBest(chi2, k=selectKBest).fit(X, y)
            X_new, Xtest_new = selector.transform(X), selector.transform(Xtest)
        else:
            X_new, Xtest_new = X, Xtest
        # Train and fit the voting classifier
        voting = VotingClassifier(estimators=ensembleClassifiers, voting='hard')
        prettyPrint("Fitting ensemble model")
        voting = voting.fit(X_new, y)
        prettyPrint("Validating model")
        predicted = voting.predict(X_new)
        # Same for the test dataset
        prettyPrint("Testing the model")
        predicted_test = voting.predict(Xtest_new)

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back no predictions.
        prettyPrintError(e)
        return [], []

    return predicted, predicted_test
项目:Aion    作者:aleisalem    | 项目源码 | 文件源码
def predictAndTestRandomForest(X, y, Xtest, ytest, estimators=10, criterion="gini", maxdepth=None, selectKBest=0):
    """
    Trains a random forest using the training data and returns its predictions
    for the training data and for the test data

    :param X: The matrix of training feature vectors
    :type X: list
    :param y: The labels corresponding to the training feature vectors
    :type y: list
    :param Xtest: The matrix of test feature vectors
    :type Xtest: list
    :param ytest: The labels corresponding to the test feature vectors (not used for fitting or feature selection)
    :type ytest: list
    :param estimators: The number of random trees to use in classification
    :type estimators: int
    :param criterion: The splitting criterion employed by the decision trees
    :type criterion: str
    :param maxdepth: The maximum depth the trees are allowed to grow
    :type maxdepth: int
    :param selectKBest: The number of best features to select (0 disables selection)
    :type selectKBest: int
    :return: The predicted labels for the training data and for the test data, or two empty lists if any step raised
    """
    try:
        predicted, predicted_test = [], []
        # Define classifier
        clf = ensemble.RandomForestClassifier(n_estimators=estimators, criterion=criterion, max_depth=maxdepth)
        # Prepare the data
        X, y, Xtest, ytest = numpy.array(X), numpy.array(y), numpy.array(Xtest), numpy.array(ytest)
        # Select K Best features if enabled. The selector is fitted on the
        # training data only and then applied to both sets, so the test
        # columns are exactly the ones the model was trained on.
        prettyPrint("Selecting %s best features from feature vectors" % selectKBest)
        if selectKBest > 0:
            selector = SelectKBest(chi2, k=selectKBest).fit(X, y)
            X_new, Xtest_new = selector.transform(X), selector.transform(Xtest)
        else:
            X_new, Xtest_new = X, Xtest
        # Fit model
        prettyPrint("Fitting model")
        clf.fit(X_new, y)
        # Validate and test model
        prettyPrint("Validating model using training data")
        predicted = clf.predict(X_new)
        prettyPrint("Testing model")
        predicted_test = clf.predict(Xtest_new)

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back no predictions.
        prettyPrintError(e)
        return [], []

    return predicted, predicted_test
项目:healthcareai-py    作者:HealthCatalyst    | 项目源码 | 文件源码
def ensemble_classification(self, scoring_metric='roc_auc', trained_model_by_name=None):
        """
        Score a set of trained models and return the one with the highest score.

        If no models are supplied, a KNN, a logistic regression and a random
        forest classifier are trained first (in that order).

        Args:
            scoring_metric (str): The metric used to rank the models. Defaults to 'roc_auc'
            trained_model_by_name (dict): A dictionary of trained models to compare for a custom ensemble

        Returns:
            TrainedSupervisedModel: The best TrainedSupervisedModel found.
        """
        self.validate_classification('Ensemble Classification')
        self.validate_score_metric_for_number_of_classes(scoring_metric)

        if trained_model_by_name is None:
            # Default candidates; adding another algorithm only takes one more entry here.
            # TODO because these now all return TSMs it will be additionally slow by all the factor models.
            # TODO Could these be trained separately then after the best is found, train the factor model and add to TSM?
            candidates = {}
            candidates['KNN'] = self.knn(randomized_search=True, scoring_metric=scoring_metric)
            candidates['Logistic Regression'] = self.logistic_regression(randomized_search=True)
            candidates['Random Forest Classifier'] = self.random_forest_classifier(
                trees=200,
                randomized_search=True,
                scoring_metric=scoring_metric)
            trained_model_by_name = candidates

        score_by_name = {}
        for algorithm, trained in trained_model_by_name.items():
            # Pull the bare estimator out of the trained supervised model and score it
            scores = self.metrics(hcai_tsm.get_estimator_from_trained_supervised_model(trained))
            self._console_log('{} algorithm: score = {}'.format(algorithm, scores))

            # TODO this may need to ferret out each classification score separately
            score_by_name[algorithm] = scores[scoring_metric]

        # Ascending sort: the last pair carries the highest score
        winner, winning_score = sorted(score_by_name.items(), key=lambda pair: pair[1])[-1]
        best_model = trained_model_by_name[winner]

        self._console_log(
            'Based on the scoring metric {}, the best algorithm found is: {}'.format(scoring_metric, winner))
        self._console_log('{} {} = {}'.format(winner, scoring_metric, winning_score))

        return best_model
项目:SinaWeiboSpider    作者:SuperSaiyanSSS    | 项目源码 | 文件源码
def rand_forest_train(self):
        """
        Train and compare three classifiers on 'names.csv', then predict for 'values.csv'.

        Fits a decision tree, a random forest and a gradient boosting classifier
        on a 75/25 split of 'names.csv', prints each model's score and
        classification report on the held-out quarter, prints the random
        forest's predictions for 'values.csv', and stores the fitted models on
        ``self.dtc``, ``self.rfc`` and ``self.gbc``.
        """
        # NOTE(review): the label strings passed to print() below are mojibake
        # in this copy of the file; restore them from the original source.
        feature_columns = ['similarity', 'platform', 'reputation', 'entropy']

        # Load the labelled users and separate features from the target column
        users = pd.read_csv('names.csv')
        X = users[feature_columns]
        y = users['human_or_machine']

        # Hold out 25% of the rows for evaluation.
        # sklearn.cross_validation was removed in scikit-learn 0.20; fall back
        # to it only on installations that predate sklearn.model_selection.
        try:
            from sklearn.model_selection import train_test_split
        except ImportError:
            from sklearn.cross_validation import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

        # Turn the rows into feature vectors ('records' is the spelling every
        # pandas version accepts; the short form 'record' has been removed)
        from sklearn.feature_extraction import DictVectorizer
        vec = DictVectorizer(sparse=False)
        X_train = vec.fit_transform(X_train.to_dict(orient='records'))
        X_test = vec.transform(X_test.to_dict(orient='records'))

        # Single decision tree
        from sklearn.tree import DecisionTreeClassifier
        dtc = DecisionTreeClassifier()
        dtc.fit(X_train, y_train)
        dtc_y_pred = dtc.predict(X_test)

        # Random forest
        from sklearn.ensemble import RandomForestClassifier
        rfc = RandomForestClassifier()
        rfc.fit(X_train, y_train)
        rfc_y_pred = rfc.predict(X_test)

        # Gradient boosting
        from sklearn.ensemble import GradientBoostingClassifier
        gbc = GradientBoostingClassifier()
        gbc.fit(X_train, y_train)
        gbc_y_pred = gbc.predict(X_test)

        from sklearn.metrics import classification_report
        # classification_report expects (y_true, y_pred); passing them the
        # other way round swaps precision and recall in the printed table.
        print("??????????", dtc.score(X_test, y_test))
        print(classification_report(y_test, dtc_y_pred))

        print("????????????", rfc.score(X_test, y_test))
        print(classification_report(y_test, rfc_y_pred))

        print("????????????", gbc.score(X_test, y_test))
        print(classification_report(y_test, gbc_y_pred))

        # Predict for the unlabelled users with the random forest
        users = pd.read_csv('values.csv')
        X = users[feature_columns]
        X = vec.transform(X.to_dict(orient='records'))
        print(rfc.predict(X))

        self.dtc = dtc
        self.rfc = rfc
        self.gbc = gbc