Python sklearn.tree 模块，export_graphviz() 实例源码

我们从Python开源项目中，提取了以下30个代码示例，用于说明如何使用sklearn.tree.export_graphviz()。

项目：onlineDetectForHadoop 作者：DawnsonLi | 项目源码 | 文件源码

def analyseReasonWithDecisonTree(anamolySample,normalSample,name):
    """Fit a decision tree separating anomalous from normal samples and save it as a PDF.

    Args:
        anamolySample: list of feature rows, labelled as class 1.
        normalSample: list of feature rows, labelled as class 0.
        name: feature names forwarded to ``export_graphviz``.

    The rendered tree is written to ``<time.time()>DT.pdf`` (relative path).
    """
    # Build a fresh list instead of extending ``anamolySample`` in place, so
    # the caller's list is not silently grown with the normal samples.
    data = list(anamolySample) + list(normalSample)
    target = [1] * len(anamolySample) + [0] * len(normalSample)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(data, target)

    dot_data = tree.export_graphviz(clf, out_file=None, feature_names=name,
                                    filled=True, special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    # The timestamp prefix gives every run its own output file.
    graph.write_pdf(str(time.time()) + "DT.pdf")

项目：onlineDetectForHadoop 作者：DawnsonLi | 项目源码 | 文件源码

def analyseReasonWithDecisonTree(anamolySample,normalSample,name):
    """Train a decision tree on anomalous vs. normal samples and render it to PDF.

    ``anamolySample`` rows are labelled 1 and ``normalSample`` rows 0.  The
    training data is the return value of ``anamolySample.append(normalSample)``
    (presumably pandas objects -- confirm against the caller).  ``name``
    supplies the feature names shown in the exported graph.
    """
    labels = [1] * len(anamolySample) + [0] * len(normalSample)
    samples = anamolySample.append(normalSample)
    print(len(samples))

    classifier = tree.DecisionTreeClassifier()
    classifier = classifier.fit(samples, labels)

    dot_source = tree.export_graphviz(
        classifier,
        out_file=None,
        feature_names=name,
        filled=True,
        special_characters=True,
    )
    rendered = pydotplus.graph_from_dot_data(dot_source)
    # Output file name is the current timestamp followed by "DT.pdf".
    stamp = str(time.time())
    rendered.write_pdf(stamp + "DT.pdf")

项目：onlineDetectForHadoop 作者：DawnsonLi | 项目源码 | 文件源码

def analyseReasonWithDecisonTree(anamolySample,normalSample):
    """Train a decision tree on anomalous vs. normal samples and render it to PDF.

    ``anamolySample`` rows are labelled 1 and ``normalSample`` rows 0.  The
    training data is the return value of ``anamolySample.append(normalSample)``
    and its ``columns`` attribute provides the feature names for the graph.
    """
    labels = [1] * len(anamolySample) + [0] * len(normalSample)
    samples = anamolySample.append(normalSample)
    print(len(samples))

    classifier = tree.DecisionTreeClassifier()
    classifier = classifier.fit(samples, labels)

    # Feature names come from the combined data's column labels.
    column_names = list(samples.columns)
    dot_source = tree.export_graphviz(
        classifier,
        out_file=None,
        feature_names=column_names,
        filled=True,
        special_characters=True,
    )
    rendered = pydotplus.graph_from_dot_data(dot_source)
    # Output file name is the current timestamp followed by "DT.pdf".
    stamp = str(time.time())
    rendered.write_pdf(stamp + "DT.pdf")

项目：onlineDetectForHadoop 作者：DawnsonLi | 项目源码 | 文件源码

def analyseReasonWithDecisonTree(anamolySample,normalSample,name):
    """Fit a decision tree that tells anomalies from normal rows and save it as PDF.

    Rows of ``anamolySample`` get label 1, rows of ``normalSample`` label 0.
    The classifier is trained on whatever ``anamolySample.append(normalSample)``
    returns (presumably pandas objects -- confirm against the caller), and
    ``name`` is passed through as the feature names of the exported graph.
    """
    class_labels = [1] * len(anamolySample)
    combined = anamolySample.append(normalSample)
    class_labels += [0] * len(normalSample)
    print(len(combined))

    model = tree.DecisionTreeClassifier()
    model = model.fit(combined, class_labels)

    graph_text = tree.export_graphviz(
        model,
        out_file=None,
        feature_names=name,
        filled=True,
        special_characters=True,
    )
    pdf_graph = pydotplus.graph_from_dot_data(graph_text)
    # The PDF is named after the current timestamp plus the "DT.pdf" suffix.
    prefix = str(time.time())
    pdf_graph.write_pdf(prefix + "DT.pdf")

项目：onlineDetectForHadoop 作者：DawnsonLi | 项目源码 | 文件源码

def analyseReasonWithDecisonTree(anamolySample,normalSample,name):
    """Fit a decision tree separating anomalous from normal samples and save it as a PDF.

    Args:
        anamolySample: list of feature rows, labelled as class 1.
        normalSample: list of feature rows, labelled as class 0.
        name: feature names forwarded to ``export_graphviz``.

    The rendered tree is written to ``<time.time()>DT.pdf`` (relative path).
    """
    # Concatenate into a new list: extending ``anamolySample`` in place would
    # leave the caller's anomaly list polluted with the normal samples.
    data = list(anamolySample) + list(normalSample)
    target = [1] * len(anamolySample) + [0] * len(normalSample)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(data, target)

    dot_data = tree.export_graphviz(clf, out_file=None, feature_names=name,
                                    filled=True, special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    # The timestamp prefix gives every run its own output file.
    graph.write_pdf(str(time.time()) + "DT.pdf")

项目：kdd99-scikit 作者：PENGZhaoqing | 项目源码 | 文件源码

def train(self, training_set, training_target, fea_index):
        """Train an entropy-based decision tree, render it and persist the model.

        Args:
            training_set: training feature rows.
            training_target: class label for each training row.
            fea_index: indices into the module-level ``attr_list`` selecting
                the names of the features present in ``training_set``.

        Writes ``output/tree-vis.pdf`` (tree rendering) and ``output/CART.pkl``
        (the fitted classifier dumped with joblib).
        """
        clf = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=30, class_weight="balanced")
        clf = clf.fit(training_set, training_target)

        # export_graphviz matches class_names to the classifier's classes by
        # position.  Sorting the *stringified* labels (e.g. "10" < "2") can
        # disagree with clf.classes_ for numeric targets, so take the names
        # straight from the fitted classifier.
        class_names = [str(label) for label in clf.classes_]
        feature_names = [attr_list[i] for i in fea_index]

        dot_data = tree.export_graphviz(clf, out_file=None,
                                        feature_names=feature_names,
                                        class_names=class_names,
                                        filled=True, rounded=True,
                                        special_characters=True)

        graph = pydotplus.graph_from_dot_data(dot_data)
        graph.write_pdf("output/tree-vis.pdf")
        joblib.dump(clf, 'output/CART.pkl')

项目：kaggle-tools 作者：yassineAlouini | 项目源码 | 文件源码

def visualize_tree(clf, feature_names, class_names, output_file,
                   method='pdf'):
    """Render a decision tree with Graphviz and return the pydotplus graph.

    Args:
        clf: decision tree passed to ``export_graphviz``.
        feature_names: feature names shown in the split nodes.
        class_names: class names shown in the nodes.
        output_file: output path without extension (used when method='pdf').
        method: 'pdf' writes ``output_file + ".pdf"``; 'inline' builds a PNG
            ``Image``; any other value only returns the graph.

    Returns:
        The graph produced by ``pydotplus.graph_from_dot_data``.
    """
    dot_data = StringIO()
    # Use the caller-supplied names; the previous version ignored both
    # parameters and always read them from a global ``iris`` data set.
    tree.export_graphviz(clf, out_file=dot_data,
                         feature_names=feature_names,
                         class_names=class_names,
                         filled=True, rounded=True,
                         special_characters=True,
                         impurity=False)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    if method == 'pdf':
        graph.write_pdf(output_file + ".pdf")
    elif method == 'inline':
        # NOTE(review): the Image object is created but neither returned nor
        # explicitly displayed -- confirm this is enough for inline output.
        Image(graph.create_png())

    return graph

# An example using the iris dataset

项目：python_utils 作者：Jayhello | 项目源码 | 文件源码

def visualize_tree(tree, feature_name, dot_file):
    """Export a decision tree to a DOT file and render it to PNG with Graphviz.

    tree -- scikit-learn decision tree.
    feature_name -- list of feature names.
    dot_file -- path of the DOT file to write; the PNG is written next to it
                with the file extension replaced by ``.png``.

    Exits the interpreter (SystemExit) if the ``dot`` executable cannot be
    run or reports a failure.
    """
    import os

    # Write to the requested path: the previous version always wrote
    # "tree.dot" and then rendered ``dot_file``, which could be another file.
    with open(dot_file, 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_name)

    # Swap only the extension; str.replace('dot', 'png') would also rewrite
    # any "dot" inside directory or base names.
    dt_png = os.path.splitext(dot_file)[0] + '.png'
    command = ["dot", "-Tpng", dot_file, "-o", dt_png]
    try:
        subprocess.check_call(command)
    except (OSError, subprocess.CalledProcessError) as e:
        print(e)
        raise SystemExit("Could not run dot, ie graphviz, to "
                         "produce visualization")

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_friedman_mse_in_graphviz():
    """Check that exported node labels mention the ``friedman_mse`` criterion.

    Exports a friedman_mse regressor and the first tree of each boosting
    stage of a GradientBoostingClassifier into one buffer, then asserts that
    every bracketed attribute list containing "samples" also contains
    "friedman_mse".
    """
    clf = DecisionTreeRegressor(criterion="friedman_mse", random_state=0)
    clf.fit(X, y)
    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data)

    clf = GradientBoostingClassifier(n_estimators=2, random_state=0)
    clf.fit(X, y)
    for estimator in clf.estimators_:
        export_graphviz(estimator[0], out_file=dot_data)

    # Raw string: "\[" and "\]" are regex escapes, not valid Python string
    # escapes, and trigger an invalid-escape warning in a plain literal.
    for finding in finditer(r"\[.*?samples.*?\]", dot_data.getvalue()):
        assert_in("friedman_mse", finding.group())

项目：FPTuner 作者：soarlab | 项目源码 | 文件源码

def scikitExportDecisionTree2Dot (fname):
    """Write the module-level ``DT_MODEL`` to ``fname`` in Graphviz DOT format.

    Raises AssertionError when ``DT_MODEL`` is None.
    """
    assert(DT_MODEL is not None)

    # The context manager closes the file even if the export raises, which
    # the explicit open()/close() pair did not guarantee.
    with open(fname, "w") as fdot:
        tree.export_graphviz(DT_MODEL, out_file=fdot)

项目：EmotiW-2017-Audio-video-Emotion-Recognition 作者：xujinchang | 项目源码 | 文件源码

def use_tree(X_data,y_data):
    """Fit and return an entropy-criterion decision tree limited to depth 3.

    The classifier uses ``max_features='sqrt'`` and ``random_state=0`` and is
    trained on ``X_data`` / ``y_data``.
    """
    classifier = DecisionTreeClassifier(
        criterion='entropy',
        max_features='sqrt',
        max_depth=3,
        random_state=0,
    )
    classifier.fit(X_data, y_data)
    return classifier
    # Plotting code (disabled, unreachable after the return):
    # X_comined = np.vstack((X_data,X_valid))
    # y_comined = np.hstack((y_data,y_valid))
    # plot_decision_regions(X_comined,y_comined,classifier=tree,test_idx=range(105,150))
    # plt.show()
    # export_graphviz(tree,out_file='tree.dot',feature_names = ['petal length','petal width'])

项目：SIDR 作者：damurdock | 项目源码 | 文件源码

def constructModel(corpus, classList, features, modelOutput):
    """
    Trains a bagging classifier on the corpus and optionally exports its trees.

    Args:
        corpus: A list of lists; in each item the last element is the class
            and the preceding elements are the feature values. The list is
            sorted in place.
        classList: A list of class names.
        features: List of variables used by each contig.
        modelOutput: Location to save model as GraphViz DOT, or False to save no model.
    Returns:
        classifier: A BaggingClassifier fitted on 70% of the corpus.
    """
    corpus.sort()  # just in case
    X = [item[:-1] for item in corpus]  # all but the last item
    Y = [item[-1] for item in corpus]  # only the last item
    X_train, X_test, Y_train, Y_test = mscv.train_test_split(X, Y, test_size=0.3, random_state=0)
    # TODO: implement classifier testing and comparison, now only baggingClassifier is used as per paper
    # (earlier candidates: tree.DecisionTreeClassifier(),
    #  ensemble.RandomForestClassifier(n_estimators=10),
    #  ensemble.AdaBoostClassifier(n_estimators=100),
    #  ensemble.GradientBoostingClassifier(n_estimators=100))
    baggingClassifier = ensemble.BaggingClassifier()
    baggingClassifier = baggingClassifier.fit(X_train, Y_train)
    click.echo("Bagging classifier built, score is %s out of 1.00" % baggingClassifier.score(X_test, Y_test))
    if modelOutput:
        # export_graphviz only understands a single decision tree; handing it
        # the ensemble itself fails.  Write one DOT graph per fitted base
        # estimator instead (a DOT file may hold several graphs).
        with open(modelOutput, 'w') as dotfile:
            for estimator in baggingClassifier.estimators_:
                tree.export_graphviz(estimator, out_file=dotfile, feature_names=features,
                                     class_names=classList, filled=True, rounded=True,
                                     special_characters=True)
    return baggingClassifier

项目：easyML 作者：aarshayj | 项目源码 | 文件源码

def export_model(self, IDcol):
        """Export the model file and create a submission with the model index.

        Delegates to ``export_model_base`` under the ``'decision_tree'`` name;
        the exported output is meant to be used for creating an ensemble.
        """
        model_tag = 'decision_tree'
        self.export_model_base(IDcol, model_tag)

    ## UNDER DEVELOPMENT CODE FOR PRINTING TREES
    # def get_tree(self):
    #     return self.alg.tree_
    # Print the tree in visual format
    # Inputs:
    #     export_pdf - if True, a pdf will be exported with the 
    #     filename as specified in pdf_name argument
    #     pdf_name - name of the pdf file if export_pdf is True
    # def printTree(self, export_pdf=True, file_name="Decision_Tree.pdf"):
    #     dot_data = StringIO() 
    #     export_graphviz(
    #             self.alg, out_file=dot_data, feature_names=self.predictors,
    #             filled=True, rounded=True, special_characters=True)

    #     export_graphviz(
    #         self.alg, out_file='data.dot', feature_names=self.predictors,  
    #         filled=True, rounded=True, special_characters=True
    #         ) 
    #     graph = pydot.graph_from_dot_data(dot_data.getvalue())

    #     if export_pdf:
    #         graph.write_pdf(file_name)

    #     return graph

#####################################################################
##### RANDOM FOREST
#####################################################################

项目：rdocChallenge 作者：Elyne | 项目源码 | 文件源码

def save_decision_tree(treePath, model, fold_idx, featNames):
    """Export ``model`` as ``fold<fold_idx>.dot`` inside directory ``treePath``.

    The directory is created when it does not exist.  Nodes are filled,
    show proportions, and use the four class names
    absent / mild / moderate / severe.
    """
    if not os.path.exists(treePath):
        os.makedirs(treePath)
    # os.path.join keeps the file inside ``treePath`` whether or not the
    # caller passed a trailing separator; plain concatenation would write
    # "<treePath>fold0.dot" next to the directory that was just created.
    dot_path = os.path.join(treePath, 'fold' + str(fold_idx) + '.dot')
    export_graphviz(model, out_file=dot_path, feature_names=featNames, filled=True,
                    class_names=["absent","mild","moderate","severe"], proportion=True)

项目：enhancement 作者：lwzswufe | 项目源码 | 文件源码

def classify(y, x, test_y, test_x):
    """Fit a depth-4 tree on three-way thresholded ``y`` and print summary statistics.

    ``y`` is discretised into +1 (above 0.02), -1 (below -0.02) and 0, a
    DecisionTreeClassifier is fitted on ``x``, the tree is exported to a text
    file and the module-level ``left``, ``right``, ``feature`` and
    ``threshold`` globals are filled from the fitted tree before
    ``disp_tree()`` is called.
    """
    global data_df, factor_name, left, right, feature, ratio, threshold

    labels = np.zeros(len(y))
    labels[y > 0.02] = 1
    labels[y < -0.02] = -1

    # Each leaf must hold at least 5% of the samples.
    leaf_floor = int(0.05 * len(y))
    clf = DecisionTreeClassifier(max_depth=4, min_samples_leaf=leaf_floor)
    clf.fit(x, labels)
    y_p = clf.predict(x)

    # NOTE(review): the ``test_y`` argument is replaced by ``y`` and
    # ``test_x`` is never read, so all statistics below are in-sample --
    # confirm this is intended.
    test_y = y

    fname = "D:\\Cache\\tree.txt"
    with open(fname, 'w') as dot_out:
        tree.export_graphviz(clf, out_file=dot_out)
    factor_exchange(factor_name, fname)

    fitted = clf.tree_
    left = fitted.children_left
    right = fitted.children_right
    feature = fitted.feature
    threshold = fitted.threshold
    disp_tree()
    # precision, recall, thresholds = precision_recall_curve(y_c, clf.predict(x))

    overall_mean = str(np.average(test_y))
    overall_win = str(np.sum(test_y > 0) / len(test_y))
    print("mean income is:", overall_mean,
          "\nwin ratio is: ", overall_win)

    # Statistics restricted to samples predicted positive / negative.
    picked_mean = str(np.average(test_y[y_p > 0]))
    picked_win = str(np.sum(test_y[y_p > 0] > 0) / np.sum(y_p > 0))
    picked_total = str(np.sum(np.sum(y_p > 0)))
    negative_mean = str(np.average(test_y[y_p < 0]))
    print("after training\n"
          "mean class_1 is: ", picked_mean,
          "\nwin ratio is: ", picked_win,
          "\ntotal class_1 is:", picked_total,
          "\nmean class_0 is: ", negative_mean)

项目：MLLearning 作者：buptdjd | 项目源码 | 文件源码

def DecisionTreeModel(self, dummy_x, dummy_y):
        """Fit an entropy-criterion DecisionTreeClassifier on the given data and return it."""
        model = DecisionTreeClassifier(criterion='entropy')
        model.fit(dummy_x, dummy_y)
        return model


# with open('dt_information_gain.dot', 'w') as f:
#     f = export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f)

项目：smart_sniffer 作者：ScarWar | 项目源码 | 文件源码

def create_graphviz_file(self, file_name):
        """Render ``self.clf`` to ``<file_name>.pdf`` and print a confirmation.

        Feature and class names are taken from ``self.feature_names`` and
        ``self.target_names``.
        """
        dot_source = tree.export_graphviz(
            self.clf,
            out_file=None,
            feature_names=self.feature_names,
            class_names=self.target_names,
            filled=True,
            rounded=True,
            special_characters=True,
        )
        pdf_target = file_name + ".pdf"
        pdp.graph_from_dot_data(dot_source).write_pdf(pdf_target)
        print("Decision graph created")

项目：South-African-Heart-Disease-data-analysis-using-python 作者：khushi4tiwari | 项目源码 | 文件源码

def decisionTree(X,y,attributeNames,classNames,fileName,s="",X_train=None,y_train=None, X_test=None, y_test=None):
    """Fit a Gini decision tree, export it to ``fileName`` and return the error rate.

    Args:
        X, y: full data set; used for both training and testing unless all
            four of X_train, y_train, X_test and y_test are supplied.
        attributeNames: feature names written into the exported graph.
        classNames: accepted for interface compatibility; not used here.
        fileName: destination passed to ``export_graphviz`` as ``out_file``.
        s: free-text label printed before fitting.

    Returns:
        Fraction of test rows whose prediction falls on the other side of 0.5
        than the true label (labels are treated as binary around 0.5).
    """
    print("Doing decision tree for: ")
    print(s)

    # Without a complete train/test split, evaluate on the training data.
    if(X_train is None or X_test is None or y_train is None or y_test is None):
        X_train = X
        X_test = X
        y_train = y
        y_test = y

    # Fit classification tree, Gini split criterion; min_samples_split=100
    # stops splitting nodes with fewer than 100 samples.
    dtc = tree.DecisionTreeClassifier(criterion='gini', min_samples_split=100)
    dtc = dtc.fit(X_train,y_train)

    # Export tree graph for visualization purposes:
    # (note: you can use i.e. Graphviz application to visualize the file)
    out = tree.export_graphviz(dtc, out_file=fileName, feature_names=attributeNames)
    # Older scikit-learn versions return the open file handle, newer ones
    # return None when out_file is given; only close what was returned.
    if out is not None:
        out.close()

    correct = 0
    wrong = 0

    for i in range(0,len(X_test)):
        x = X_test[i,:]
        x_class = dtc.predict(x)[0]
        if((x_class < 0.5 and y_test[i] < 0.5) or (x_class > 0.5 and y_test[i] > 0.5)):
            correct += 1
        else:
            wrong += 1

    rate = double(wrong) / double(correct + wrong)
    print(rate)
    print('\n')

    return rate

项目：MLTrading 作者：MasterMSTC | 项目源码 | 文件源码

def train_predictor(df, markov_blanket, p_train=0.6):
    """Fit a small decision tree on the Markov-blanket columns and report its scores.

    Args:
        df: data frame containing the ``markov_blanket`` columns and the
            target column ``"TAR10"``.
        markov_blanket: iterable of column names used as features.
        p_train: fraction of the leading rows used for training; the
            remaining rows form the check set.

    Prints in-sample / out-of-sample accuracy and confusion matrices, and
    exports the tree to a DOT file.

    Returns:
        The RandomForestClassifier created at the top of the function.
        NOTE(review): this forest is never fitted here, while the fitted
        model is ``clf1`` -- confirm which object callers expect.
    """
    rf = RandomForestClassifier(n_estimators=5)
    clf1 = tree.DecisionTreeClassifier(max_leaf_nodes=10,class_weight=None)

    x = df[list(markov_blanket)].values
    y = df["TAR10"].values

    # Split by position: first n_train rows train, the rest check.
    n_samples = x.shape[0]
    n_train = int(np.round(p_train * n_samples))
    xt = x[:n_train, :]
    yt = y[:n_train]
    xc = x[n_train:, :]
    yc = y[n_train:]

    ynames = ["lateral", "alcista"]
    xnames = list(markov_blanket)
    clf1.fit(xt, yt)
    sys.stdout.write("Result INS is {}\n".format(clf1.score(xt, yt)))
    sys.stdout.write("Result OOS is {}\n".format(clf1.score(xc, yc)))
    scores = confusion_matrix(yt, clf1.predict(xt), labels=[0, 1])
    scores2 = confusion_matrix(yc, clf1.predict(xc), labels=[0, 1])
    print(scores)
    print(scores2)
    # Raw string: "\M" and "\T" are not valid escapes in a plain literal and
    # only worked by accident (with an invalid-escape warning).
    tree.export_graphviz(clf1, out_file=r'D:\MLmaster\Tree.dot', class_names=ynames, feature_names=xnames)
    return rf

项目：python_utils 作者：Jayhello | 项目源码 | 文件源码

def iris_demo():
    """Fit a decision tree on the iris data set, export it to DOT and render it."""
    iris = load_iris()
    # iris.data is the feature matrix, iris.target the class labels
    # (presumably 150x4 features and labels 0/1/2 -- the original comment
    # was mis-encoded).
    classifier = tree.DecisionTreeClassifier().fit(iris.data, iris.target)

    dot_path = 'tree.dot'
    tree.export_graphviz(classifier, out_file=dot_path)
    visualize_tree(classifier, iris.feature_names, dot_path)

    # (graph,) = pydot.graph_from_dot_file('tree.dot')
    # graph.write_png('somefile.png')

项目：python_utils 作者：Jayhello | 项目源码 | 文件源码

def loan_demo():
    """Fit a decision tree on the loan data, export it to ``loan.dot`` and render it."""
    samples, outcomes = get_loan_data_lh()
    classifier = tree.DecisionTreeClassifier().fit(samples, outcomes)

    dot_path = 'loan.dot'
    tree.export_graphviz(classifier, out_file=dot_path)

    column_labels = ['age', 'has work', 'own house', 'loan level']
    visualize_tree(classifier, column_labels, dot_path)

项目：Parallel-SGD 作者：angadgill | 项目源码 | 文件源码

def test_graphviz_errors():
    """Check that export_graphviz raises IndexError for empty name lists."""
    clf = DecisionTreeClassifier(max_depth=3, min_samples_split=2)
    clf.fit(X, y)

    # An empty feature_names list and an empty class_names list must each be
    # rejected; every check gets a fresh output buffer.
    for bad_option in ("feature_names", "class_names"):
        sink = StringIO()
        assert_raises(IndexError, export_graphviz, clf, sink, **{bad_option: []})

项目：Oedipus 作者：tum-i22 | 项目源码 | 文件源码

def classifyTree(Xtr, ytr, Xte, yte, splitCriterion="gini", maxDepth=0, visualizeTree=False):
    """ Classifies data using CART

    Args:
        Xtr, ytr: training samples and labels.
        Xte, yte: test samples and ground-truth labels.
        splitCriterion: split criterion handed to DecisionTreeClassifier.
        maxDepth: maximum tree depth; 0, a negative value or None means
            no depth limit.
        visualizeTree: when True, the learned tree is written to
            "tree_<timestamp>.pdf".

    Returns:
        (accuracyRate, timing, probabilities, predicted). If an error occurs
        it is logged and the values computed so far are returned
        (defaults: 0.0, 0.0, [], []).
    """
    # Initialise every returned name up front so the error path can return
    # instead of failing on an unbound ``predicted``.
    accuracyRate, probabilities, timing, predicted = 0.0, [], 0.0, []
    # scikit-learn rejects max_depth <= 0, so the default of 0 is mapped to
    # None (unlimited depth).
    treeDepth = maxDepth if maxDepth and maxDepth > 0 else None
    try:
        # Perform classification
        cartClassifier = tree.DecisionTreeClassifier(criterion=splitCriterion, max_depth=treeDepth)
        startTime = time.time()
        prettyPrint("Training a CART tree for classification using \"%s\" and maximum depth of %s" % (splitCriterion, treeDepth), "debug")
        cartClassifier.fit(numpy.array(Xtr), numpy.array(ytr))
        prettyPrint("Submitting the test samples", "debug")
        predicted = cartClassifier.predict(Xte)
        endTime = time.time()
        # Compare the predicted and ground truth and append result to list
        accuracyRate = round(metrics.accuracy_score(predicted, yte), 2)
        # Also append the probability estimates
        probs = cartClassifier.predict_proba(Xte)
        probabilities.append(probs)
        timing = endTime-startTime # Keep track of performance
        if visualizeTree:
            # Visualize the tree
            dot_data = StringIO()
            tree.export_graphviz(cartClassifier, out_file=dot_data)
            graph = pydot.graph_from_dot_data(dot_data.getvalue())
            # Newer pydot releases return a list of graphs.
            if isinstance(graph, (list, tuple)):
                graph = graph[0]
            # Build the name once so the log message matches the file that
            # is actually written.
            pdfName = "tree_%s.pdf" % getTimestamp()
            prettyPrint("Saving learned CART to \"%s\"" % pdfName, "debug")
            graph.write_pdf(pdfName)

    except Exception as e:
        prettyPrint("Error encountered in \"classifyTree\": %s" % e, "error")

    return accuracyRate, timing, probabilities, predicted

项目：data-preppy 作者：gurgeh | 项目源码 | 文件源码

def visualize_tree(clf, outname, headers):
    """Render decision tree ``clf`` to the PDF file ``outname``.

    ``headers`` provides the feature names.  The DOT text is re-encoded from
    latin1 to UTF-8 before it is handed to pydot.
    """
    from sklearn.externals.six import StringIO
    import pydot
    buf = StringIO()
    tree.export_graphviz(clf, out_file=buf, feature_names=list(headers))
    dot_utf8 = buf.getvalue().decode('latin1').encode('utf8')
    rendered = pydot.graph_from_dot_data(dot_utf8)
    rendered.write_pdf(outname)

项目：SLIC_cityscapes 作者：wpqmanu | 项目源码 | 文件源码

def decision_tree_classifier(all_feature_data):
    """Fit a default DecisionTreeClassifier and export it to ``cityscapes.dot``.

    ``all_feature_data[0]`` holds the feature rows and ``all_feature_data[1]``
    the labels.  The training accuracy is printed and the fitted classifier
    is returned.
    """
    features = np.asarray(all_feature_data[0])[:, :]
    targets = np.asarray(all_feature_data[1])
    # features = sklearn.preprocessing.normalize(features, axis=0)

    # Default hyper-parameters; an explicit configuration (gini, best
    # splitter, max_depth=5, ...) was tried earlier and left disabled.
    clf = DecisionTreeClassifier()
    fit_clf = clf.fit(features, targets)

    predictions = fit_clf.predict(features)
    hits = np.sum(predictions == targets)
    accuracy = float(hits) / len(targets)
    print("Training accuracy is " + str(accuracy))

    with open("cityscapes.dot", 'w') as dot_out:
        tree.export_graphviz(clf, out_file=dot_out)

    # Disabled alternatives: PDF rendering through pydotplus and 10-fold
    # cross validation with cross_val_score.

    return fit_clf

项目：loglizer 作者：logpai | 项目源码 | 文件源码

def makePrediction(para,rawData,totalNumRows,labels):
    """Train a decision tree on the leading rows and evaluate it on the rest.

    Args:
        para: dict providing 'trainingSetPercent', the fraction of
            ``totalNumRows`` used for training.
        rawData: feature rows; the first part trains, the rest tests.
        totalNumRows: number of rows used to size the training set.
        labels: label for each row; 1 is counted as failure, 0 as success.

    Returns:
        (predictFailure, testFailure, sameFailureNum, precision, recall,
        F_measure). The last three are 0.0 when no failure was predicted
        correctly.
    """
    traingSetSize=int(math.floor(totalNumRows*para['trainingSetPercent']))
    print('%d instances are selected as training dataset!'%traingSetSize)
    trainX=np.array(rawData[0:traingSetSize])
    trainY=np.array(labels[0:traingSetSize])
    clf=tree.DecisionTreeClassifier()
    clf=clf.fit(trainX,trainY)

    testingX=rawData[traingSetSize:]
    testingY=labels[traingSetSize:]
    prediction=list(clf.predict(testingX))
    if len(prediction)!=len(testingY):
        print ('prediction and testingY have different length and SOMEWHERE WRONG!')

    # Count agreements, and agreements on the failure class (label 1).
    sameLabelNum=0
    sameFailureNum=0
    for predicted, actual in zip(prediction, testingY):
        if predicted==actual:
            sameLabelNum+=1
            if predicted==1:
                sameFailureNum+=1

    accuracy=float(sameLabelNum)/len(testingY)
    print ('accuracy is %.5f:'%accuracy)

    predictSuccess=sum(1 for item in prediction if item==0)
    predictFailure=sum(1 for item in prediction if item==1)
    testSuccess=sum(1 for tt in testingY if tt==0)
    testFailure=sum(1 for tt in testingY if tt==1)

    print(predictSuccess,predictFailure,testSuccess,testFailure,sameFailureNum)

    # Define the metrics on every path: previously they were only assigned
    # in the else-branch, so the return statement raised UnboundLocalError
    # whenever no failure was predicted correctly.
    precision=0.0
    recall=0.0
    F_measure=0.0
    if sameFailureNum==0:
        print ('precision is 0 and recall is 0')
    else:
        # sameFailureNum > 0 implies both denominators are non-zero.
        precision=float(sameFailureNum)/(predictFailure)
        print('precision is %.5f'%precision)
        recall=float(sameFailureNum)/(testFailure)
        print('recall is %.5f'%recall)
        F_measure=2*precision*recall/(precision+recall)
        print('F_measure is %.5f'%F_measure)
    return predictFailure,testFailure,sameFailureNum,precision,recall,F_measure

项目：Ossian 作者：CSTR-Edinburgh | 项目源码 | 文件源码

def do_training(self, speech_corpus, text_corpus):
        """Train the decision tree classifier, pickle it and export a DOT file.

        Does nothing when ``self.model`` is already set.  Features come from
        ``utterance.dump_features`` for every utterance in ``speech_corpus``;
        each example's 'response' entry is the training target.
        ``text_corpus`` is not used by this method.

        Writes ``self.model_file`` (pickle of [x_vectoriser, y_vectoriser,
        model]) and ``self.model_file + '.dot'``, then calls ``self.verify``.
        """
        if self.model:  ## if already trained...
            return

        ## 1) get data:
        #### [Added dump_features method to Utterance class, use that: ]
        x_data = []
        y_data = []
        for utterance in speech_corpus:

            utt_feats = utterance.dump_features(self.target_nodes,
                                                self.context_list, return_dict=True)

            for example in utt_feats:
                assert 'response' in example,example
                # Move the target out of the feature dict.
                y_data.append({'response': example.pop('response')})
                x_data.append(example)

        ## Handle categorical features (strings) but to keep numerical ones
        ## as they are:

        x_vectoriser = DictVectorizer()
        x_data = x_vectoriser.fit_transform(x_data).toarray()

        y_vectoriser = DictVectorizer()
        y_data = y_vectoriser.fit_transform(y_data).toarray()

        ## 2) train classifier:
        model = tree.DecisionTreeClassifier(min_samples_leaf=self.min_samples_leaf)

        model.fit(x_data, y_data)
        print('\n Trained classifier: ')
        print(model)
        print('\n Trained x vectoriser:')
        print(x_vectoriser)
        print('Feature names:')
        print(x_vectoriser.get_feature_names())
        print('\n Trained y vectoriser:')
        print(y_vectoriser)
        print('Feature names:')
        print(y_vectoriser.get_feature_names())

        ## 3) Save classifier by pickling:
        ## (context manager closes the file even if pickling fails)
        with open(self.model_file, 'wb') as output:
            pickle.dump([x_vectoriser, y_vectoriser, model], output)

        ## Write ASCII tree representation (which can be plotted):
        tree.export_graphviz(model, out_file=self.model_file + '.dot',
                             feature_names=x_vectoriser.get_feature_names())

        self.verify(self.voice_resources) # ## reload -- get self.model etc

项目：Oedipus 作者：tum-i22 | 项目源码 | 文件源码

def classifyTreeKFold(X, y, kFold=2, splitCriterion="gini", maxDepth=0, visualizeTree=False):
    """ Classifies data using CART and K-Fold cross validation

    Args:
        X, y: samples and labels, indexable by integer position.
        kFold: number of folds.
        splitCriterion: split criterion handed to DecisionTreeClassifier.
        maxDepth: maximum tree depth; 0, a negative value or None means
            no depth limit.
        visualizeTree: when True, each fold's tree is written to
            "tritonTree_<fold>.pdf".

    Returns:
        (accuracyRates, probabilities, timings, groundTruthLabels,
        predictedLabels), one entry per fold. Five empty lists are returned
        if an error occurs.
    """
    groundTruthLabels, predictedLabels = [], []
    accuracyRates = [] # Meant to hold the accuracy rates
    probabilities = []
    timings = []
    # scikit-learn rejects max_depth <= 0, so the default of 0 is mapped to
    # None (unlimited depth).
    treeDepth = maxDepth if maxDepth and maxDepth > 0 else None
    try:
        kFoldValidator = KFold(n=len(X), n_folds=kFold, shuffle=False)
        for currentFold, (trainingIndices, testIndices) in enumerate(kFoldValidator, 1):
            # Prepare the training and testing datasets
            trainingDataset = [X[trIndex] for trIndex in trainingIndices]
            trainingLabels = [y[trIndex] for trIndex in trainingIndices]
            testDataset = [X[teIndex] for teIndex in testIndices]
            testLabels = [y[teIndex] for teIndex in testIndices]
            # Perform classification
            startTime = time.time()
            cartClassifier = tree.DecisionTreeClassifier(criterion=splitCriterion, max_depth=treeDepth)
            prettyPrint("Training a CART tree for classification using \"%s\" and maximum depth of %s" % (splitCriterion, treeDepth), "debug")
            cartClassifier.fit(numpy.array(trainingDataset), numpy.array(trainingLabels))
            prettyPrint("Submitting the test samples", "debug")
            predicted = cartClassifier.predict(testDataset)
            endTime = time.time()
            # Add that to the groundTruthLabels and predictedLabels matrices
            groundTruthLabels.append(testLabels)
            predictedLabels.append(predicted)
            # Compare the predicted and ground truth and append result to list
            accuracyRates.append(round(metrics.accuracy_score(predicted, testLabels), 2))
            # Also append the probability estimates
            probs = cartClassifier.predict_proba(testDataset)
            probabilities.append(probs)
            timings.append(endTime-startTime) # Keep track of performance
            if visualizeTree:
                # Visualize the tree
                dot_data = StringIO()
                tree.export_graphviz(cartClassifier, out_file=dot_data)
                graph = pydot.graph_from_dot_data(dot_data.getvalue())
                # Newer pydot releases return a list of graphs.
                if isinstance(graph, (list, tuple)):
                    graph = graph[0]
                prettyPrint("Saving learned CART to \"tritonTree_%s.pdf\"" % currentFold, "debug")
                graph.write_pdf("tritonTree_%s.pdf" % currentFold)

    except Exception as e:
        prettyPrint("Error encountered in \"classifyTreeKFold\": %s" % e, "error")
        # Same arity as the success path, so callers that unpack five values
        # do not fail a second time on the error path.
        return [], [], [], [], []

    return accuracyRates, probabilities, timings, groundTruthLabels, predictedLabels

项目：android-malware-analysis 作者：mwleeds | 项目源码 | 文件源码

def train_tree_classifer(features, labels, model_output_path):
    """
    train_tree_classifer will grid-search a DecisionTree, save the best model
    and write its rendering to 'best_tree.pdf'

    features: 2D array of each input feature for each sample
    labels: array of string labels classifying each sample
    model_output_path: path for storing the trained tree model

    Prints the best parameters, a confusion matrix and a classification
    report computed on a held-out 20% of the data.
    """
    # save 20% of data for performance evaluation
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(features, labels, test_size=0.2)

    param = [
        {
            "max_depth": [None, 10, 100, 1000, 10000]
        }
    ]

    dtree = tree.DecisionTreeClassifier(random_state=0)

    # 10-fold cross validation; up to 20 parallel jobs, as each fold and each
    # parameter set can be trained independently
    clf = grid_search.GridSearchCV(dtree, param,
            cv=10, n_jobs=20, verbose=3)

    clf.fit(X_train, y_train)

    # Check the destination *directory*: testing the file itself meant a
    # model could only be saved by overwriting an already existing file.
    output_dir = os.path.dirname(model_output_path) or os.curdir
    if os.path.isdir(output_dir):
        joblib.dump(clf.best_estimator_, model_output_path)
    else:
        print("Cannot save trained tree model to {0}.".format(model_output_path))

    dot_data = tree.export_graphviz(clf.best_estimator_, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf('best_tree.pdf')

    print("\nBest parameters set:")
    print(clf.best_params_)

    y_predict=clf.predict(X_test)

    label_names=sorted(set(labels))
    print("\nConfusion matrix:")
    print("Labels: {0}\n".format(",".join(label_names)))
    print(confusion_matrix(y_test, y_predict, labels=label_names))

    print("\nClassification report:")
    print(classification_report(y_test, y_predict))

项目：SLIC_cityscapes 作者：wpqmanu | 项目源码 | 文件源码

def decision_tree_manual_classifier(all_feature_data):
    """Build the hand-written tree, then fit and export a scikit-learn tree.

    ``all_feature_data[0]`` holds the feature rows (lists) and
    ``all_feature_data[1]`` the labels.  The manual tree is built with
    ``buildtree`` and every row is run through ``classify``; afterwards a
    default DecisionTreeClassifier is fitted, its training accuracy printed
    and the tree exported to ``cityscapes.dot``.

    Returns:
        The fitted DecisionTreeClassifier.
    """
    input_data=np.asarray(all_feature_data[0])
    label=np.asarray(all_feature_data[1])

    # The manual implementation expects each row with its label appended.
    data_for_manual_tree=[row+[row_label]
                          for row, row_label in zip(all_feature_data[0], all_feature_data[1])]

    # Named ``manual_tree`` on purpose: calling it ``tree`` made that name
    # local to the function, so ``tree.export_graphviz`` below was looked up
    # on the hand-built tree instead of the module-level ``tree``.
    manual_tree = buildtree(data_for_manual_tree)

    data=input_data[:,:]
    # data=sklearn.preprocessing.normalize(data,axis=0)

    # NOTE(review): the manual predictions are computed but never used or
    # compared against ``label`` -- confirm whether this loop is still needed.
    for to_be_predicted_data in all_feature_data[0]:
        classify(to_be_predicted_data,manual_tree)

    clf = DecisionTreeClassifier()
    fit_clf=clf.fit(data,label)

    result=fit_clf.predict(data)
    accuracy=float(np.sum(result==label))/len(label)
    print("Training accuracy is " + str(accuracy))
    with open("cityscapes.dot", 'w') as f:
        tree.export_graphviz(clf, out_file=f)

    return fit_clf