Python sklearn.tree module: DecisionTreeClassifier() source-code examples

We extracted the following 50 code examples from open-source Python projects to illustrate how to use sklearn.tree.DecisionTreeClassifier().
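
Before the project samples, here is a minimal, self-contained sketch (not taken from any project below) of the basic fit/score cycle that all of these examples build on:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# load a toy dataset and hold out a quarter of it for testing
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# fit a depth-limited tree and report mean accuracy on the held-out split
clf = DecisionTreeClassifier(max_depth=3, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))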

Project: MachineLearningBasics    Author: zoebchhatriwala    | project source | file source
def main():

    iris = datasets.load_iris()
    x = iris.data
    y = iris.target

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5)

    clrTree = tree.DecisionTreeClassifier()
    clrTree = clrTree.fit(x_train, y_train)
    outTree = clrTree.predict(x_test)

    clrKN = KNeighborsClassifier()
    clrKN = clrKN.fit(x_train, y_train)
    outKN = clrKN.predict(x_test)

    # Prediction accuracy
    print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, outTree)*100)+"%")
    print("Accuracy for KNeighbors Classifier: " + str(accuracy_score(y_test, outKN)*100)+"%")
Project: johnson-county-ddj-public    Author: dssg    | project source | file source
def get_feature_importance(self,clf, model_name ):
        clfs = {'RandomForestClassifier':'feature_importances',
                'ExtraTreesClassifier': 'feature_importances',
                'AdaBoostClassifier': 'feature_importances',
                'LogisticRegression': 'coef',
                'svm.SVC': 'coef',
                'GradientBoostingClassifier': 'feature_importances',
                'GaussianNB': None,
                'DecisionTreeClassifier': 'feature_importances',
                'SGDClassifier': 'coef',
                'KNeighborsClassifier': None,
                'linear.SVC': 'coef'}

        if clfs[model_name] == 'feature_importances':
            return list(clf.feature_importances_)
        elif clfs[model_name] == 'coef':
            return clf.coef_.tolist()
        else:
            return None
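
The lookup table above has to be kept in sync with every supported model name. As a table-free alternative, a sketch (my illustration, not part of the project) can probe the fitted estimator's attributes directly:

def get_feature_importance_generic(clf):
    # tree-based models expose feature_importances_
    if hasattr(clf, 'feature_importances_'):
        return list(clf.feature_importances_)
    # linear models expose coef_
    if hasattr(clf, 'coef_'):
        return clf.coef_.tolist()
    # models such as GaussianNB and KNeighborsClassifier expose neither
    return None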
Project: rltk    Author: usc-isi-i2    | project source | file source
def get_classifier_class(class_name):
    name_table = {
        'svm': SVC,
        'k_neighbors': KNeighborsClassifier,
        'gaussian_process': GaussianProcessClassifier,
        'decision_tree': DecisionTreeClassifier,
        'random_forest': RandomForestClassifier,
        'ada_boost': AdaBoostClassifier,
        'mlp': MLPClassifier,
        'gaussian_naive_bayes': GaussianNB,
        'quadratic_discriminant_analysis': QuadraticDiscriminantAnalysis
    }

    if class_name not in name_table:
        raise ValueError('No such classifier')

    return name_table[class_name]
Project: oss-github-analysis-project    Author: itu-oss-project-team    | project source | file source
def __create_classifiers(self):
        classifiers = list()
        classifiers.append({"func": linear_model.SGDClassifier(loss="log"),
                            "name": "sgd"})
        classifiers.append({"func": neighbors.KNeighborsClassifier(1, weights='distance'),
                            "name": "knn1"})
        classifiers.append({"func": neighbors.KNeighborsClassifier(3, weights='distance'),
                            "name": "knn3"})
        classifiers.append({"func": neighbors.KNeighborsClassifier(5, weights='distance'),
                            "name": "knn5"})
        classifiers.append({"func": GaussianNB(),
                            "name": "naive_bayes"})

        # classifiers.append({"func": tree.DecisionTreeClassifier(), "name": "decision_tree"})
        # classifiers.append({"func": MLPClassifier(max_iter=10000), "name": "mlp"})
        # classifiers.append({"func": RandomForestClassifier(), "name": "random_forest"})
        return classifiers
Project: johnson-county-ddj-public    Author: dssg    | project source | file source
def define_model(self, model, parameters, n_cores = 0):
        clfs = {'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
                'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
                'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
                'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
                'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
                'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
                'GaussianNB': GaussianNB(),
                'DecisionTreeClassifier': DecisionTreeClassifier(),
                'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
                'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3), 
                'linear.SVC': svm.LinearSVC() }

        if model not in clfs:
            raise ConfigError("Unsupported model {}".format(model))

        clf = clfs[model]
        clf.set_params(**parameters)
        return clf
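
A hypothetical call (illustrative values, not from the project) showing how the table defaults are overridden through set_params:

# pick the decision tree from the table and override two of its parameters
clf = self.define_model('DecisionTreeClassifier',
                        {'max_depth': 4, 'criterion': 'entropy'})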
Project: AirTicketPredicting    Author: junlulocky    | project source | file source
def __init__(self, isTrain, isOutlierRemoval):
        super(ClassificationAdaBoost, self).__init__(isTrain, isOutlierRemoval)
        # data preprocessing
        self.dataPreprocessing()

        self.dt_stump = DecisionTreeClassifier(max_depth=10)
        self.ada = AdaBoostClassifier(
            base_estimator=self.dt_stump,
            learning_rate=1,
            n_estimators=7,
            algorithm="SAMME.R")
        # self.dt_stump = DecisionTreeClassifier(max_depth=14)
        # self.ada = AdaBoostClassifier(
        #     base_estimator=self.dt_stump,
        #     learning_rate=1,
        #     n_estimators=50,
        #     algorithm="SAMME")
Project: playground    Author: Pennsy    | project source | file source
def learn_decision_tree(data):
    DT = tree.DecisionTreeClassifier(max_depth=7)
    scorer = make_scorer(matthews_corrcoef)
    for i in range(5):
        scores = cross_val_score(DT, data.X_train, data.y_train, cv=10, scoring=scorer)
        print("iteration",i, "dt mean:", scores.mean())
        scores = list(scores)
        print("Decision Tree train scores:\n", scores)
    return DT
    # DT = DT.fit(train_data[:, :-1], train_data[:, -1])
    # predictionsDT = DT.predict(validation_data[:, :-1])

    # validating predicions
    # dtError = 0
    # for i in range(0, len(validation_data)):
    #         if(validation_data[i][20] != predictionsDT[i]):
    #                 dtError = dtError + 1
    # print("DT Error : ", float(dtError)/len(validation_data)*100.0)
Project: onlineDetectForHadoop    Author: DawnsonLi    | project source | file source
def analyseReasonWithDecisonTree(anamolySample,normalSample,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(data,target)

    dot_data = tree.export_graphviz(clf, out_file=None,feature_names=name,filled = True,special_characters=True) 
    graph = pydotplus.graph_from_dot_data(dot_data) 
    s = str(time.time())
    graph.write_pdf(s+"DT.pdf")
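
The Graphviz route above needs pydotplus and an installed dot binary. On scikit-learn >= 0.21, a matplotlib-only alternative (a sketch, reusing clf and name from the function above) is:

import matplotlib.pyplot as plt
from sklearn import tree

fig, ax = plt.subplots(figsize=(12, 8))
tree.plot_tree(clf, feature_names=name, filled=True, ax=ax)  # draw the fitted tree
fig.savefig("DT.png")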
Project: MachineLearningBasics    Author: zoebchhatriwala    | project source | file source
def main():
    iris = load_iris()
    test_idx = [0, 50, 100]

    # training Data
    train_target = np.delete(iris.target, test_idx)
    train_data = np.delete(iris.data, test_idx, axis=0)

    # testing data
    test_target = iris.target[test_idx]
    test_data = iris.data[test_idx]

    # Train Classifier
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(train_data, train_target)

    print(clf.predict(test_data))


# Run main
Project: MachineLearningBasics    Author: zoebchhatriwala    | project source | file source
def main():
    #  0: smooth, 1: bumpy
    features = [[130, 0], [140, 0], [150, 1], [170, 1]]

    # 0: apple, 1: orange
    labels = [0, 0, 1, 1]

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(features, labels)

    # 160, smooth
    predict = [[160, 0]]

    if clf.predict(predict)[0] == 0:
        print('you are describing an apple')
    elif clf.predict(predict)[0] == 1:
        print('you are describing an orange')
    else:
        print('Can\'t Guess')
Project: TrackToTrip    Author: ruipgil    | project source | file source
def score(train_labels, train_features, test_labels, test_features, save_file, use_tree=False):
    if use_tree:
        train_clf = Classifier(tree.DecisionTreeClassifier())
    else:
        train_clf = Classifier()

    print train_clf.clf
    print ''

    t_start = time.clock()
    train_clf.learn(train_features, train_labels)
    t_end = time.clock()
    if save_file:
        train_clf.save_to_file(open(save_file, 'w'))

    p_start = time.clock()
    predicted = train_clf.clf.predict(test_features)
    p_end = time.clock()

    test_labels_t = train_clf.labels.transform(test_labels)
    print classification_report(test_labels_t, predicted, target_names=train_clf.labels.classes_)
    print 'Training time: %fs' % (t_end - t_start)
    print 'Predicting time: %fs' % (p_end - p_start)
    print 'Mean squared error: %f' % mean_squared_error(test_labels_t, predicted)
    return train_clf.score(test_features, test_labels)
Project: easyML    Author: aarshayj    | project source | file source
def __init__(
        self,data_block, predictors=[],cv_folds=10,
        scoring_metric='accuracy',additional_display_metrics=[]):

        base_classification.__init__(
            self, alg=DecisionTreeClassifier(), data_block=data_block, 
            predictors=predictors,cv_folds=cv_folds,
            scoring_metric=scoring_metric, 
            additional_display_metrics=additional_display_metrics
            )

        self.model_output = pd.Series(self.default_parameters)
        self.model_output['Feature_Importance'] = "-"

        #Set parameters to default values:
        self.set_parameters(set_default=True)
Project: XTREE    Author: ai-se    | project source | file source
def learns(tests,trains,indep=lambda x: x[:-1],
                    dep = lambda x: x[-1],
                    rf  = Abcd(),
                    lg  = Abcd(),
                    dt  = Abcd(),
                    nb  = Abcd()):
  x1,y1,x2,y2= trainTest(tests,trains,indep,dep) 
  forest = RandomForestClassifier(n_estimators = 50)  
  forest = forest.fit(x1,y1)
  for n,got in enumerate(forest.predict(x2)):
    rf(predicted = got, actual = y2[n])
  logreg = linear_model.LogisticRegression(C=1e5)
  logreg.fit(x1, y1)
  for n,got in enumerate(logreg.predict(x2)):
    lg(predicted = got, actual = y2[n])
  bayes =  GaussianNB()
  bayes.fit(x1,y1)
  for n,got in enumerate(bayes.predict(x2)):
    nb(predicted = got, actual = y2[n])
  dectree = DecisionTreeClassifier(criterion="entropy",
                         random_state=1)
  dectree.fit(x1,y1)
  for n,got in enumerate(dectree.predict(x2)):
    dt(predicted = got, actual = y2[n])
Project: XTREE    Author: ai-se    | project source | file source
def CART(train, test, tunings=None, smoteit=True, duplicate=True):
  "  CART"
  # Apply random forest Classifier to predict the number of bugs.
  if smoteit:
    train = SMOTE(train, atleast=50, atmost=101, resample=duplicate)

  if not tunings:
    clf = DecisionTreeClassifier()
  else:
    clf = DecisionTreeClassifier(max_depth=int(tunings[0]),
                                 min_samples_split=int(tunings[1]),
                                 min_samples_leaf=int(tunings[2]),
                                 max_features=float(tunings[3]) / 100,
                                 max_leaf_nodes=int(tunings[4]),
                                 criterion='entropy')
  train_DF = formatData(train)
  test_DF = formatData(test)
  features = train_DF.columns[:-2]
  klass = train_DF[train_DF.columns[-2]]
  # set_trace()
  clf.fit(train_DF[features].astype('float32'), klass.astype('float32'))
  preds = clf.predict(test_DF[test_DF.columns[:-2]].astype('float32')).tolist()
  return preds
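
For reference, a hypothetical call (illustrative values, not from the project) showing how the positional tunings map onto tree parameters (tunings[3] is a percentage: 50 means max_features=0.5):

# max_depth=10, min_samples_split=2, min_samples_leaf=1,
# max_features=50% of the features, max_leaf_nodes=20
preds = CART(train, test, tunings=[10, 2, 1, 50, 20], smoteit=False)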
Project: MLBox    Author: AxeldeRomblay    | project source | file source
def __init__(self,
                 threshold=0.6,
                 subsample=1.,
                 estimator=DecisionTreeClassifier(max_depth=6),
                 n_folds=2,
                 stratify=True,
                 random_state=1,
                 n_jobs=-1):

        self.threshold = threshold
        self.subsample = subsample
        self.estimator = estimator
        self.n_folds = n_folds
        self.stratify = stratify
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.__Ddrifts = dict()
        self.__fitOK = False
Project: skboost    Author: hbldh    | project source | file source
def __init__(self,
                 base_estimator=DecisionTreeClassifier(max_depth=10),
                 softmax=None,
                 n_estimators=50,
                 learning_rate=1.0,
                 random_state=None,
                 verbose=False):

        super(MILBoostClassifier, self).__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=random_state)

        if not isinstance(softmax, SoftmaxFunction):
            raise TypeError("Softmax input must be an object of class `SoftmaxFunction`")
        self.softmax_fcn = softmax
        self._verbose = verbose

        self._bag_labels = None
        self._inferred_y = None
        self._bag_partitioning = None
Project: AirTicketPredicting    Author: junlulocky    | project source | file source
def parameterChoosing(self):
        # Set the parameters by cross-validation
        tuned_parameters = [{'max_depth': range(2,60),
                             'max_features': ['sqrt', 'log2', None]
                             }
                            ]

        clf = GridSearchCV(DecisionTreeClassifier(max_depth=5), tuned_parameters, cv=5, scoring='precision_weighted')
        clf.fit(self.X_train, self.y_train.ravel())

        print "Best parameters set found on development set:\n"
        print clf.best_params_

        print "Grid scores on development set:\n"
        for params, mean_score, scores in clf.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

        print "Detailed classification report:\n"
        y_true, y_pred = self.y_test, clf.predict(self.X_test)
        print classification_report(y_true, y_pred)
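
Note that grid_scores_ was removed in scikit-learn 0.20; on current versions the equivalent report (a sketch, assuming the fitted clf above) reads from cv_results_:

results = clf.cv_results_
for mean, std, params in zip(results['mean_test_score'],
                             results['std_test_score'],
                             results['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))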
Project: SentiCR    Author: senticr    | project source | file source
def get_classifier(self):
        algo=self.algo

        if algo=="GBT":
            return GradientBoostingClassifier()
        elif algo=="RF":
            return  RandomForestClassifier()
        elif algo=="ADB":
            return AdaBoostClassifier()
        elif algo =="DT":
            return  DecisionTreeClassifier()
        elif algo=="NB":
            return  BernoulliNB()
        elif algo=="SGD":
            return  SGDClassifier()
        elif algo=="SVC":
            return LinearSVC()
        elif algo=="MLPC":
            return MLPClassifier(activation='logistic',  batch_size='auto',
            early_stopping=True, hidden_layer_sizes=(100,), learning_rate='adaptive',
            learning_rate_init=0.1, max_iter=5000, random_state=1,
            solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
            warm_start=False)
        return 0
Project: Adaboost    Author: shzygmyx    | project source | file source
def __init__(self, X, y, estimator = DecisionTreeClassifier, itern = 20, mode = "sign"):
        self.X = X
        self.y = y.copy()
        self.estimator = estimator
        self.mode = mode
        self.itern = itern
        self.estimators = [] # estimators produced by boosting algorithm
        self.alphas = np.array([])  # weights of each boost estimator
        self.m = self.X.shape[0] # number of samples
        self.w = np.array([1/self.m] * self.m) # weights of samples
        self.cls_list = [] # list used to store classes' name and numbers
        self.cls0 = y[0]
        for i in range(self.m):
            if y[i] not in self.cls_list:
                self.cls_list.append(y[i])
            if y[i] == self.cls0:
                self.y[i] = 1
            else:
                self.y[i] = -1
        if len(self.cls_list) != 2:
            raise TypeError(
            '''This Adaboost only support two-class problem, for multiclass 
            problem, please use AdaboostMH.''')
        self.train()
Project: Adaboost    Author: shzygmyx    | project source | file source
def __init__(self, X, y, estimator = DecisionTreeClassifier, itern = 20, mode = "sign"):
        self.X = X
        self.y = y
        self.estimator = estimator
        self.itern = itern
        self.mode = mode
        self.m = self.X.shape[0] # number of samples
        self.cls_list = [] # list used to store classes' name and numbers
#        if type(y[0]) != np.ndarray:
#           self.y = y.reshape(len(y),-1)
        for i in range(self.m):
            for cls in self.y[i]:
                if cls not in self.cls_list:
                    self.cls_list.append(cls)
        self.k = len(self.cls_list) # number of classes
        self.boost = self.train()
Project: Adaboost    Author: shzygmyx    | project source | file source
def __init__(self, X, y, code_dic = None, estimator = DecisionTreeClassifier, itern = 20):
        self.X = X
        self.y = y
        self.estimator = estimator
        self.itern = itern
        self.m = self.X.shape[0] # number of samples
        self.cls_list = [] # list used to store classes' name and numbers
        for i in range(self.m):
            if y[i] not in self.cls_list:
                self.cls_list.append(y[i])
        if code_dic is not None:
            self.k = len(code_dic[self.cls_list[0]]) # dimension of encoding space
        else: # generate default encoding dictionary
            self.k = len(self.cls_list)
            code_dic = {}
            for i in range(self.k):
                code = np.array([-1] * self.k)
                code[i] = 1
                code_dic[self.cls_list[i]] = code
        self.code_dic = code_dic #store {label: array-like code}
        self.boost = self.train()
Project: dask-searchcv    Author: dask    | project source | file source
def test_no_refit_multiple_metrics():
    clf = DecisionTreeClassifier()
    scoring = {'score_1': 'accuracy', 'score_2': 'accuracy'}

    gs = dcv.GridSearchCV(clf, {'max_depth': [1, 2, 3]}, refit=False,
                          scoring=scoring)
    gs.fit(da_X, da_y)
    assert not hasattr(gs, "best_estimator_")
    assert not hasattr(gs, "best_index_")
    assert not hasattr(gs, "best_score_")
    assert not hasattr(gs, "best_params_")

    for fn_name in ('predict', 'predict_proba', 'predict_log_proba'):
        with pytest.raises(NotFittedError) as exc:
            getattr(gs, fn_name)(X)
        assert (('refit=False. %s is available only after refitting on the '
                 'best parameters' % fn_name) in str(exc.value))
Project: Malicious_Domain_Whois    Author: h-j-13    | project source | file source
def build_decision_tree(filename):
    """
        ??????????????
    """
    f=open(sys.argv[1],'r')
    reader=csv.reader(f)
    x=[]
    y=[]
    for line in reader:
        if line[1] in ['1','2','3']:#??????,??????
            x.append(line[2:4]+line[5:])
            y.append(line[1])
    x_train,x_test,y_train,y_test=cross_validation.train_test_split(x,y, test_size=0.2, random_state=42)
    clf=tree.DecisionTreeClassifier(max_depth=5)
    clf=clf.fit(x_train,y_train)
    score=clf.score(x_test,y_test)
    print score
    return clf,score
Project: US-TransportationMode    Author: vlomonaco    | project source | file source
def decision_tree(self, sensors_set):
        features = list(self.dataset.get_sensors_set_features(sensors_set))
        print("DECISION TREE.....")
        print("CLASSIFICATION BASED ON THESE SENSORS: ", self.dataset.get_remained_sensors(sensors_set))
        print("NUMBER OF FEATURES: ", len(features))
        train_features, train_classes, test_features, test_classes = self.__get_sets_for_classification(
            self.dataset.get_train, self.dataset.get_test, features)
        classifier_decision_tree = tree.DecisionTreeClassifier()
        classifier_decision_tree.fit(train_features, train_classes)
        test_prediction = classifier_decision_tree.predict(test_features)
        acc = accuracy_score(test_classes, test_prediction)
        df_feature = pd.DataFrame(
            {'accuracy': acc, 'features': features, 'importance': classifier_decision_tree.feature_importances_})
        df_feature = df_feature.sort_values(by='importance', ascending=False)
        print("ACCURACY : " + str(acc))
        print("END TREE")

        if not os.path.exists(const.DIR_RESULTS):
            os.makedirs(const.DIR_RESULTS)
        df_feature.to_csv(const.DIR_RESULTS + "/" + str(sensors_set) + const.FILE_DECISION_TREE_RESULTS, index=False)

    # random forest algorithm: train on the whole train set, test on the whole test set
Project: coremltools    Author: apple    | project source | file source
def setUpClass(self):
        """
        Set up the unit test by loading the dataset and training a model.
        """
        from sklearn.datasets import load_boston
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.preprocessing import MultiLabelBinarizer
        import numpy as np

        scikit_data = load_boston()
        scikit_model = DecisionTreeClassifier(random_state = 1)
        t = scikit_data.target
        target = np.digitize(t, np.histogram(t)[1]) - 1
        scikit_model.fit(scikit_data.data, target)

        # Save the data and the model
        self.scikit_data = scikit_data
        self.target = target
        self.scikit_model = scikit_model
Project: OpinionSpam    Author: Coder-Yu    | project source | file source
def fitAndPredict(self):
        # classifier = LogisticRegression()
        # classifier.fit(self.trainingSet, self.trainingLabel)
        # pred_labels = classifier.predict(self.testSet)
        # print 'Logistic:'
        # print classification_report(self.testLabel, pred_labels)

        self.classifier = SVC()
        self.classifier.fit(self.trainingSet, self.trainingLabel)
        pred_labels = {}
        for user in self.testDict:
            pred_labels[user] = self.classifier.predict([[self.BDS[user]]])
        # print 'SVM:'
        # print classification_report(self.testLabel, pred_labels)

        # classifier = DecisionTreeClassifier(criterion='entropy')
        # classifier.fit(self.trainingSet, self.trainingLabel)
        # pred_labels = classifier.predict(self.testSet)
        # print 'Decision Tree:'
        # print classification_report(self.testLabel, pred_labels)
        # return self.trainingSet, self.trainingLabel, self.testSet, self.testLabel

        return pred_labels
Project: OpinionSpam    Author: Coder-Yu    | project source | file source
def fitAndPredict(self):
        # classifier = LogisticRegression()
        # classifier.fit(self.trainingSet, self.trainingLabel)
        # pred_labels = classifier.predict(self.testSet)
        # print 'Logistic:'
        # print classification_report(self.testLabel, pred_labels)
        pred_labels = {}
        classifier = SVC()
        classifier.fit(self.trainingSet, self.trainingLabel)

        for user in self.testDict:
            pred_labels[user] = classifier.predict([[self.MUD[user], self.RUD[user], self.QUD[user]]])
        # print 'SVM:'
        # print classification_report(self.testLabel, pred_labels)
        return pred_labels

        # classifier = DecisionTreeClassifier(criterion='entropy')
        # classifier.fit(self.trainingSet, self.trainingLabel)
        # pred_labels = classifier.predict(self.testSet)
        # print 'Decision Tree:'
        # print classification_report(self.testLabel, pred_labels)
        # return self.trainingSet, self.trainingLabel, self.testSet, self.testLabel
Project: OpinionSpam    Author: Coder-Yu    | project source | file source
def fitAndPredict(self):
        # classifier = LogisticRegression()
        # classifier.fit(self.trainingSet, self.trainingLabel)
        # pred_labels = classifier.predict(self.testSet)
        # print 'Logistic:'
        # print classification_report(self.testLabel, pred_labels)

        classifier = SVC()
        classifier.fit(self.trainingSet, self.trainingLabel)
        pred_labels = {}
        for user in self.testDict:
            pred_labels[user] = classifier.predict([[self.entropy[user], self.FMD[user]]])
        # print 'SVM:'
        #print classification_report(self.testLabel, pred_labels)

        # classifier = DecisionTreeClassifier(criterion='entropy')
        # classifier.fit(self.trainingSet, self.trainingLabel)
        # pred_labels = classifier.predict(self.testSet)
        # print 'Decision Tree:'
        # print classification_report(self.testLabel, pred_labels)
        # return self.trainingSet, self.trainingLabel, self.testSet, self.testLabel

        return pred_labels
Project: Instagram_spider_application    Author: panda0881    | project source | file source
def train_decision_tree(file_name):
    file = open(file_name, 'r')
    train_data = json.load(file)
    file.close()
    train_list = list()
    train_result = list()
    for train_pair in train_data:
        tmp = formatting_data(train_pair[0])
        train_list.append(tmp)
        train_result.append(train_pair[1])
    my_clf = tree.DecisionTreeClassifier()
    my_clf.fit(train_list, train_result)
    return my_clf


# Setting up all the necessary preparation
Project: fake_news    Author: bmassman    | project source | file source
def article_trainers(articles: ArticleDB):
    """
    Run repeated models against article db to predict validity score for
    articles.
    """
    models = [(DecisionTreeClassifier, {}),
              (RandomForestClassifier, {}),
              (LogisticRegression, {'C': [0.01, 0.1, 1, 10, 100]}),
              (MultinomialNB, {'alpha': [0.1, 1.0, 10.0, 100.0]}),
              (LinearSVC, {'C': [0.01, 0.1, 1, 10, 100]})]
    trained_models = []
    for classifier, param_grid in models:
        res = train_model(articles, classifier, param_grid, probabilities=True)
        trained_models.append((str(res), res))
    ensemble_learner = VotingClassifier(estimators=trained_models[:4],
                                        voting='soft')
    train_model(articles, ensemble_learner, {})
Project: whereareyou    Author: futurice    | project source | file source
def train_model(data, with_mac=True):
    global without_mac_clf, mac_clf
    df = pd.DataFrame.from_dict(data)
    y = df.pop("location")
    features = [f for f in df.columns if f != 'mac']
    df = df.rename(columns=dict(zip(features, [POWER_SLAVE_PREFIX + f for f in features])))
    model_name = MODEL_MAC_NAME if with_mac else MODEL_NAME
    if with_mac:
        df = df.apply(LabelEncoder().fit_transform)
    else:
        df.drop("mac", axis=1, inplace=True)
    clf = DecisionTreeClassifier()
    clf.fit(df, y)
    joblib.dump(clf, model_name)
    if with_mac and mac_clf is None:
        mac_clf = clf
    if not with_mac and without_mac_clf is None:
        without_mac_clf = clf
    export_graphviz(clf, feature_names=list(df.columns), class_names=y.unique(), filled=True, rounded=True, out_file='model.dot')
    os.system("dot -Tpng model.dot -o model.png")
Project: DataMiningCompetitionFirstPrize    Author: lzddzh    | project source | file source
def learn(x, y, test_x):
    # set sample weight


    weight_list = []
    for j in range(len(y)):
        if y[j] == "0":
            weight_list.append(variables.weight_0_gdbt_b)
        if y[j] == "1000":
            weight_list.append(variables.weight_1000_gdbt_b)
        if y[j] == "1500":
            weight_list.append(variables.weight_1500_gdbt_b)
        if y[j] == "2000":
            weight_list.append(variables.weight_2000_gdbt_b)

    clf = tree.DecisionTreeClassifier(min_samples_split=500).fit(x, y, weight_list)
    print clf.feature_importances_

    prediction_list = clf.predict(test_x)

    return prediction_list
Project: python-machine-learning-book    Author: jeremyn    | project source | file source
def use_bagging_classifier():
    tree = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=None,
        random_state=3,
    )
    bag = BaggingClassifier(
        base_estimator=tree,
        n_estimators=500,
        max_samples=1.0,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=False,
        random_state=1
    )
    return use_ensemble_classifier(tree, 'Decision tree', bag, 'Bagging')
Project: SDLib    Author: Coder-Yu    | project source | file source
def predict(self):
        # classifier = LogisticRegression()
        # classifier.fit(self.training, self.trainingLabels)
        # pred_labels = classifier.predict(self.test)
        # print 'Logistic:'
        # print classification_report(self.testLabels, pred_labels)
        #
        # classifier = SVC()
        # classifier.fit(self.training, self.trainingLabels)
        # pred_labels = classifier.predict(self.test)
        # print 'SVM:'
        # print classification_report(self.testLabels, pred_labels)

        classifier = DecisionTreeClassifier(criterion='entropy')
        classifier.fit(self.training, self.trainingLabels)
        pred_labels = classifier.predict(self.test)
        print 'Decision Tree:'
        return pred_labels
Project: IBRel    Author: lasigeBioTM    | project source | file source
def __init__(self, path, etype, **kwargs):
        super(EnsembleModel, self).__init__(path, etype=etype, **kwargs)
        self.basedir = "models/ensemble/"
        self.goldstd = kwargs.get("goldstd")
        self.data = {}
        self.offsets = []
        self.pipeline = Pipeline(
            [
                #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                #('clf', SGDClassifier())
                # ('clf', svm.NuSVC(nu=0.01 ))
                ('clf', RandomForestClassifier(class_weight={False:1, True:1}, n_jobs=-1, criterion="entropy", warm_start=True))
                # ('clf', tree.DecisionTreeClassifier(criterion="entropy")),
                # ('clf', MultinomialNB())
                # ('clf', GaussianNB())
                #('clf', svm.SVC(kernel="rbf", degree=2, C=1)),
                #('clf', svm.SVC(kernel="linear", C=2))
                #('clf', DummyClassifier(strategy="constant", constant=True))
            ])
Project: pyTweetBot    Author: nschaetti    | project source | file source
def __init__(self, classes):
        """
        Constructor
        :param classes: Classes
        :param lang: Spacy language
        """
        super(DecisionTree, self).__init__(classes)
        # Properties
        self._token2index = dict()
        self._voc_size = 0
        self._samples = list()
        self._n_samples = 0
        self._tree_classifier = DecisionTreeClassifier(random_state=0)
    # end __init__

    ##############################################
    # Public
    ##############################################

    ##############################################
    # Override
    ##############################################

    # To str
Project: onlineDetectForHadoop    Author: DawnsonLi    | project source | file source
def analyseReasonWithDecisonTree(anamolySample,normalSample,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)
    print len(data)
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(data, target)

    dot_data = tree.export_graphviz(clf, out_file=None,feature_names=name,filled = True,special_characters=True) 
    graph = pydotplus.graph_from_dot_data(dot_data) 
    s = str(time.time())
    graph.write_pdf(s+"DT.pdf")
Project: onlineDetectForHadoop    Author: DawnsonLi    | project source | file source
def analyseReasonWithDecisonTree(anamolySample,normalSample):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)
    print len(data)
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(data, target)
    name = []
    for i in data.columns:
        name.append(i)
    dot_data = tree.export_graphviz(clf, out_file=None,feature_names=name,filled = True,special_characters=True) 
    graph = pydotplus.graph_from_dot_data(dot_data) 
    s = str(time.time())
    graph.write_pdf(s+"DT.pdf")
Project: onlineDetectForHadoop    Author: DawnsonLi    | project source | file source
def analyseReasonWithDecisonTree(anamolySample,normalSample,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)
    print len(data)
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(data, target)

    dot_data = tree.export_graphviz(clf, out_file=None,feature_names=name,filled = True,special_characters=True) 
    graph = pydotplus.graph_from_dot_data(dot_data) 
    s = str(time.time())
    graph.write_pdf(s+"DT.pdf")
Project: onlineDetectForHadoop    Author: DawnsonLi    | project source | file source
def analyseReasonWithDecisonTree(anamolySample,normalSample,name):
    data = anamolySample
    target = []
    for i in range(0,len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0,len(normalSample)):
        target.append(0)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(data,target)

    dot_data = tree.export_graphviz(clf, out_file=None,feature_names=name,filled = True,special_characters=True) 
    graph = pydotplus.graph_from_dot_data(dot_data) 
    s = str(time.time())
    graph.write_pdf(s+"DT.pdf")
Project: kdd99-scikit    Author: PENGZhaoqing    | project source | file source
def train(self, training_set, training_target, fea_index):

        clf = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=30, class_weight="balanced")
        clf = clf.fit(training_set, training_target)

        class_names = np.unique([str(i) for i in training_target])
        feature_names = [attr_list[i] for i in fea_index]

        dot_data = tree.export_graphviz(clf, out_file=None,
                                        feature_names=feature_names,
                                        class_names=class_names,
                                        filled=True, rounded=True,
                                        special_characters=True)

        graph = pydotplus.graph_from_dot_data(dot_data)
        graph.write_pdf("output/tree-vis.pdf")
        joblib.dump(clf, 'output/CART.pkl')
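
The pickled model can be restored later with the matching load call (a sketch, assuming the output path above):

from sklearn.externals import joblib  # on newer scikit-learn: import joblib

clf = joblib.load('output/CART.pkl')  # restore the trained CART model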
Project: Stock-Market-Analysis-and-Prediction    Author: samshara    | project source | file source
def performDTClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    Decision Tree Classification 
    """
    # n = parameters[0]
    # l =  parameters[1]
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    if savemodel == True:
        #fname_out = '{}-{}.pickle'.format(fout, datetime.now())
        fname_out = fout+'.pickle'
        with open(fname_out, 'wb') as f:
            pickle.dump(clf, f, -1)    

    accuracy = clf.score(X_test, y_test)

    return accuracy

#TODO: use hdf datastructure for dataframes
Project: jamespy_py3    Author: jskDr    | project source | file source
def clst( X_train, y_train, X_test, y_test, nb_classes):
    model = tree.DecisionTreeClassifier()
    model.fit( X_train, y_train)
    dt_score = model.score( X_test, y_test)
    print( "DT-C:", dt_score)

    model = svm.SVC( kernel = 'linear')
    model.fit( X_train, y_train)
    sv_score = model.score( X_test, y_test)
    print( "SVC:", sv_score)

    model = kkeras.MLPC( [X_train.shape[1], 30, 10, nb_classes])
    model.fit( X_train, y_train, X_test, y_test, nb_classes)
    mlp_score = model.score( X_test, y_test)
    print( "DNN:", mlp_score)

    model = ensemble.RandomForestClassifier( n_estimators=10)
    model.fit( X_train, y_train)
    rf_score = model.score( X_test, y_test)
    print( "RF:", rf_score)

    return dt_score, sv_score, mlp_score, rf_score
Project: menrva    Author: amirziai    | project source | file source
def decision_tree(X, y, regression, max_depth=3):
    from sklearn.tree import export_graphviz
    from sklearn.externals.six import StringIO  
    from IPython.core.pylabtools import figsize
    from IPython.display import Image
    figsize(12.5, 6)
    import pydot

    if regression:
        clf = DecisionTreeRegressor(max_depth=max_depth)
    else:
        clf = DecisionTreeClassifier(max_depth=max_depth)

    clf.fit(X, y)
    dot_data = StringIO()  
    export_graphviz(clf, out_file=dot_data, feature_names=list(X.columns),
                    filled=True, rounded=True,)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())  
    return Image(graph.create_png())
Project: menrva    Author: amirziai    | project source | file source
def fit_vanilla(x_train, x_test, y_train, y_test):
    scores = dict()

    # Decision tree
    dt = DecisionTreeClassifier(random_state=random_state)
    scores['dt'] = clf_scores(dt, x_train, y_train, x_test, y_test)

    # Logistic Regression
    lr = LogisticRegression(random_state=random_state, n_jobs=-1)
    scores['lr'] = clf_scores(lr, x_train, y_train, x_test, y_test)

    # Random Forest
    rf = RandomForestClassifier(random_state=random_state, n_jobs=-1)
    scores['rf'] = clf_scores(rf, x_train, y_train, x_test, y_test)

    return scores
Project: Parallel-SGD    Author: angadgill    | project source | file source
def bench_scikit_tree_classifier(X, Y):
    """Benchmark with scikit-learn decision tree classifier"""

    from sklearn.tree import DecisionTreeClassifier

    gc.collect()

    # start time
    tstart = datetime.now()
    clf = DecisionTreeClassifier()
    clf.fit(X, Y).predict(X)
    delta = (datetime.now() - tstart)
    # stop time

    scikit_classifier_results.append(
        delta.seconds + delta.microseconds / mu_second)
Project: Parallel-SGD    Author: angadgill    | project source | file source
def test_probability():
    # Predict probabilities using DecisionTreeClassifier.

    for name, Tree in CLF_TREES.items():
        clf = Tree(max_depth=1, max_features=1, random_state=42)
        clf.fit(iris.data, iris.target)

        prob_predict = clf.predict_proba(iris.data)
        assert_array_almost_equal(np.sum(prob_predict, 1),
                                  np.ones(iris.data.shape[0]),
                                  err_msg="Failed with {0}".format(name))
        assert_array_equal(np.argmax(prob_predict, 1),
                           clf.predict(iris.data),
                           err_msg="Failed with {0}".format(name))
        assert_almost_equal(clf.predict_proba(iris.data),
                            np.exp(clf.predict_log_proba(iris.data)), 8,
                            err_msg="Failed with {0}".format(name))
Project: Parallel-SGD    Author: angadgill    | project source | file source
def test_importances_gini_equal_mse():
    # Check that gini is equivalent to mse for binary output variable

    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    # The gini index and the mean square error (variance) might differ due
    # to numerical instability. Since those instabilities mainly occurs at
    # high tree depth, we restrict this maximal depth.
    clf = DecisionTreeClassifier(criterion="gini", max_depth=5,
                                 random_state=0).fit(X, y)
    reg = DecisionTreeRegressor(criterion="mse", max_depth=5,
                                random_state=0).fit(X, y)

    assert_almost_equal(clf.feature_importances_, reg.feature_importances_)
    assert_array_equal(clf.tree_.feature, reg.tree_.feature)
    assert_array_equal(clf.tree_.children_left, reg.tree_.children_left)
    assert_array_equal(clf.tree_.children_right, reg.tree_.children_right)
    assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples)
Project: Parallel-SGD    Author: angadgill    | project source | file source
def test_sample_weight_invalid():
    # Check sample weighting raises errors.
    X = np.arange(100)[:, np.newaxis]
    y = np.ones(100)
    y[:50] = 0.0

    clf = DecisionTreeClassifier(random_state=0)

    sample_weight = np.random.rand(100, 1)
    assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)

    sample_weight = np.array(0)
    assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)

    sample_weight = np.ones(101)
    assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)

    sample_weight = np.ones(99)
    assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)
Project: Parallel-SGD    Author: angadgill    | project source | file source
def test_huge_allocations():
    n_bits = int(platform.architecture()[0].rstrip('bit'))

    X = np.random.randn(10, 2)
    y = np.random.randint(0, 2, 10)

    # Sanity check: we cannot request more memory than the size of the address
    # space. Currently raises OverflowError.
    huge = 2 ** (n_bits + 1)
    clf = DecisionTreeClassifier(splitter='best', max_leaf_nodes=huge)
    assert_raises(Exception, clf.fit, X, y)

    # Non-regression test: MemoryError used to be dropped by Cython
    # because of missing "except *".
    huge = 2 ** (n_bits - 1) - 1
    clf = DecisionTreeClassifier(splitter='best', max_leaf_nodes=huge)
    assert_raises(MemoryError, clf.fit, X, y)