The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.tree.DecisionTreeClassifier().
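Before the extracted examples, here is a minimal usage sketch (not taken from any of the projects below) showing the typical fit/predict workflow; the split ratio, max_depth, and random_state values are illustrative choices, not recommendations.

# Minimal sketch: train a DecisionTreeClassifier on the iris dataset and report accuracy.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

iris = load_iris()
# Hold out 25% of the samples for testing (illustrative split).
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=0)

clf = DecisionTreeClassifier(max_depth=3, random_state=0)  # max_depth chosen arbitrarily
clf.fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, clf.predict(X_test)))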
def main():
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5)

    clrTree = tree.DecisionTreeClassifier()
    clrTree = clrTree.fit(x_train, y_train)
    outTree = clrTree.predict(x_test)

    clrKN = KNeighborsClassifier()
    clrKN = clrKN.fit(x_train, y_train)
    outKN = clrKN.predict(x_test)

    # Prediction accuracy
    print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, outTree)*100) + "%")
    print("Accuracy for KNeighbors Classifier: " + str(accuracy_score(y_test, outKN)*100) + "%")
def get_feature_importance(self, clf, model_name):
    clfs = {'RandomForestClassifier': 'feature_importances',
            'ExtraTreesClassifier': 'feature_importances',
            'AdaBoostClassifier': 'feature_importances',
            'LogisticRegression': 'coef',
            'svm.SVC': 'coef',
            'GradientBoostingClassifier': 'feature_importances',
            'GaussianNB': None,
            'DecisionTreeClassifier': 'feature_importances',
            'SGDClassifier': 'coef',
            'KNeighborsClassifier': None,
            'linear.SVC': 'coef'}

    if clfs[model_name] == 'feature_importances':
        return list(clf.feature_importances_)
    elif clfs[model_name] == 'coef':
        return list(clf.coef_.tolist())
    else:
        return None
def get_classifier_class(class_name):
    name_table = {
        'svm': SVC,
        'k_neighbors': KNeighborsClassifier,
        'gaussian_process': GaussianProcessClassifier,
        'decision_tree': DecisionTreeClassifier,
        'random_forest': RandomForestClassifier,
        'ada_boost': AdaBoostClassifier,
        'mlp': MLPClassifier,
        'gaussian_naive_bayes': GaussianNB,
        'quadratic_discriminant_analysis': QuadraticDiscriminantAnalysis
    }

    if class_name not in name_table:
        raise ValueError('No such classifier')

    return name_table[class_name]
def __create_classifiers(self):
    classifiers = list()
    classifiers.append({"func": linear_model.SGDClassifier(loss="log"), "name": "sgd"})
    classifiers.append({"func": neighbors.KNeighborsClassifier(1, weights='distance'), "name": "knn1"})
    classifiers.append({"func": neighbors.KNeighborsClassifier(3, weights='distance'), "name": "knn3"})
    classifiers.append({"func": neighbors.KNeighborsClassifier(5, weights='distance'), "name": "knn5"})
    classifiers.append({"func": GaussianNB(), "name": "naive_bayes"})

    # classifiers.append({"func": tree.DecisionTreeClassifier(), "name": "decision_tree"})
    # classifiers.append({"func": MLPClassifier(max_iter=10000), "name": "mlp"})
    # classifiers.append({"func": RandomForestClassifier(), "name": "random_forest"})

    return classifiers
def define_model(self, model, parameters, n_cores=0):
    clfs = {'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
            'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
            'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                                                     algorithm="SAMME", n_estimators=200),
            'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
            'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
            'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5,
                                                                     max_depth=6, n_estimators=10),
            'GaussianNB': GaussianNB(),
            'DecisionTreeClassifier': DecisionTreeClassifier(),
            'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2", n_jobs=7),
            'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
            'linear.SVC': svm.LinearSVC()}

    if model not in clfs:
        raise ConfigError("Unsupported model {}".format(model))

    clf = clfs[model]
    clf.set_params(**parameters)
    return clf
def __init__(self, isTrain, isOutlierRemoval):
    super(ClassificationAdaBoost, self).__init__(isTrain, isOutlierRemoval)
    # data preprocessing
    self.dataPreprocessing()

    self.dt_stump = DecisionTreeClassifier(max_depth=10)
    self.ada = AdaBoostClassifier(
        base_estimator=self.dt_stump,
        learning_rate=1,
        n_estimators=7,
        algorithm="SAMME.R")

    # self.dt_stump = DecisionTreeClassifier(max_depth=14)
    # self.ada = AdaBoostClassifier(
    #     base_estimator=self.dt_stump,
    #     learning_rate=1,
    #     n_estimators=50,
    #     algorithm="SAMME")
def learn_decision_tree(data):
    DT = tree.DecisionTreeClassifier(max_depth=7)
    scorer = make_scorer(matthews_corrcoef)
    for i in range(5):
        scores = cross_val_score(DT, data.X_train, data.y_train, cv=10, scoring=scorer)
        print("iteration", i, "dt mean:", scores.mean())
    scores = list(scores)
    print("Decision Tree train scores:\n", scores)
    return DT

    # DT = DT.fit(train_data[:, :-1], train_data[:, -1])
    # predictionsDT = DT.predict(validation_data[:, :-1])
    # validating predicions
    # dtError = 0
    # for i in range(0, len(validation_data)):
    #     if(validation_data[i][20] != predictionsDT[i]):
    #         dtError = dtError + 1
    # print("DT Error : ", float(dtError)/len(validation_data)*100.0)
def analyseReasonWithDecisonTree(anamolySample, normalSample, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(data, target)
    dot_data = tree.export_graphviz(clf, out_file=None, feature_names=name,
                                    filled=True, special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    s = str(time.time())
    graph.write_pdf(s + "DT.pdf")
def main():
    iris = load_iris()
    test_idx = [0, 50, 100]

    # training data
    train_target = np.delete(iris.target, test_idx)
    train_data = np.delete(iris.data, test_idx, axis=0)

    # testing data
    test_target = iris.target[test_idx]
    test_data = iris.data[test_idx]

    # train classifier
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(train_data, train_target)

    print(clf.predict(test_data))

# Run main
def main():
    # 0: smooth, 1: bumpy
    features = [[130, 0], [140, 0], [150, 1], [170, 1]]
    # 0: apple, 1: orange
    labels = [0, 0, 1, 1]

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(features, labels)

    # 160, smooth
    predict = [[160, 0]]

    # Map the predicted label back to the fruit names defined above.
    if clf.predict(predict)[0] == 0:
        print('you are describing apple')
    elif clf.predict(predict)[0] == 1:
        print('you are describing orange')
    else:
        print('Can\'t Guess')
def score(train_labels, train_features, test_labels, test_features, save_file, use_tree=False):
    if use_tree:
        train_clf = Classifier(tree.DecisionTreeClassifier())
    else:
        train_clf = Classifier()
    print train_clf.clf
    print ''

    t_start = time.clock()
    train_clf.learn(train_features, train_labels)
    t_end = time.clock()

    if save_file:
        train_clf.save_to_file(open(save_file, 'w'))

    p_start = time.clock()
    predicted = train_clf.clf.predict(test_features)
    p_end = time.clock()

    test_labels_t = train_clf.labels.transform(test_labels)
    print classification_report(test_labels_t, predicted, target_names=train_clf.labels.classes_)
    print 'Training time: %fs' % (t_end - t_start)
    print 'Predicting time: %fs' % (p_end - p_start)
    print 'Mean squared error: %f' % mean_squared_error(test_labels_t, predicted)

    return train_clf.score(test_features, test_labels)
def __init__(self, data_block, predictors=[], cv_folds=10,
             scoring_metric='accuracy', additional_display_metrics=[]):

    base_classification.__init__(
        self, alg=DecisionTreeClassifier(), data_block=data_block,
        predictors=predictors, cv_folds=cv_folds,
        scoring_metric=scoring_metric,
        additional_display_metrics=additional_display_metrics)

    self.model_output = pd.Series(self.default_parameters)
    self.model_output['Feature_Importance'] = "-"

    # Set parameters to default values:
    self.set_parameters(set_default=True)
def learns(tests, trains, indep=lambda x: x[:-1], dep=lambda x: x[-1],
           rf=Abcd(), lg=Abcd(), dt=Abcd(), nb=Abcd()):
    x1, y1, x2, y2 = trainTest(tests, trains, indep, dep)

    forest = RandomForestClassifier(n_estimators=50)
    forest = forest.fit(x1, y1)
    for n, got in enumerate(forest.predict(x2)):
        rf(predicted=got, actual=y2[n])

    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(x1, y1)
    for n, got in enumerate(logreg.predict(x2)):
        lg(predicted=got, actual=y2[n])

    bayes = GaussianNB()
    bayes.fit(x1, y1)
    for n, got in enumerate(bayes.predict(x2)):
        nb(predicted=got, actual=y2[n])

    dectree = DecisionTreeClassifier(criterion="entropy", random_state=1)
    dectree.fit(x1, y1)
    for n, got in enumerate(dectree.predict(x2)):
        dt(predicted=got, actual=y2[n])
def CART(train, test, tunings=None, smoteit=True, duplicate=True):
    "CART"
    # Apply the CART decision tree classifier to predict the number of bugs.
    if smoteit:
        train = SMOTE(train, atleast=50, atmost=101, resample=duplicate)

    if not tunings:
        clf = DecisionTreeClassifier()
    else:
        clf = DecisionTreeClassifier(max_depth=int(tunings[0]),
                                     min_samples_split=int(tunings[1]),
                                     min_samples_leaf=int(tunings[2]),
                                     max_features=float(tunings[3] / 100),
                                     max_leaf_nodes=int(tunings[4]),
                                     criterion='entropy')

    train_DF = formatData(train)
    test_DF = formatData(test)
    features = train_DF.columns[:-2]
    klass = train_DF[train_DF.columns[-2]]
    # set_trace()
    clf.fit(train_DF[features].astype('float32'), klass.astype('float32'))
    preds = clf.predict(test_DF[test_DF.columns[:-2]].astype('float32')).tolist()
    return preds
def __init__(self, threshold=0.6, subsample=1.,
             estimator=DecisionTreeClassifier(max_depth=6),
             n_folds=2, stratify=True, random_state=1, n_jobs=-1):
    self.threshold = threshold
    self.subsample = subsample
    self.estimator = estimator
    self.n_folds = n_folds
    self.stratify = stratify
    self.random_state = random_state
    self.n_jobs = n_jobs
    self.__Ddrifts = dict()
    self.__fitOK = False
def __init__(self, base_estimator=DecisionTreeClassifier(max_depth=10), softmax=None,
             n_estimators=50, learning_rate=1.0, random_state=None, verbose=False):
    super(MILBoostClassifier, self).__init__(
        base_estimator=base_estimator,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=random_state)

    if not isinstance(softmax, SoftmaxFunction):
        raise TypeError("Softmax input must be an object of class `SoftmaxFunction`")
    self.softmax_fcn = softmax

    self._verbose = verbose
    self._bag_labels = None
    self._inferred_y = None
    self._bag_partitioning = None
def parameterChoosing(self):
    # Set the parameters by cross-validation
    tuned_parameters = [{'max_depth': range(2, 60),
                         'max_features': ['sqrt', 'log2', None]}]

    clf = GridSearchCV(DecisionTreeClassifier(max_depth=5), tuned_parameters, cv=5,
                       scoring='precision_weighted')
    clf.fit(self.X_train, self.y_train.ravel())

    print "Best parameters set found on development set:\n"
    print clf.best_params_

    print "Grid scores on development set:\n"
    for params, mean_score, scores in clf.grid_scores_:
        print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)

    print "Detailed classification report:\n"
    y_true, y_pred = self.y_test, clf.predict(self.X_test)
    print classification_report(y_true, y_pred)
def get_classifier(self):
    algo = self.algo
    if algo == "GBT":
        return GradientBoostingClassifier()
    elif algo == "RF":
        return RandomForestClassifier()
    elif algo == "ADB":
        return AdaBoostClassifier()
    elif algo == "DT":
        return DecisionTreeClassifier()
    elif algo == "NB":
        return BernoulliNB()
    elif algo == "SGD":
        return SGDClassifier()
    elif algo == "SVC":
        return LinearSVC()
    elif algo == "MLPC":
        return MLPClassifier(activation='logistic', batch_size='auto',
                             early_stopping=True, hidden_layer_sizes=(100,),
                             learning_rate='adaptive', learning_rate_init=0.1,
                             max_iter=5000, random_state=1, solver='lbfgs',
                             tol=0.0001, validation_fraction=0.1,
                             verbose=False, warm_start=False)
    return 0
def __init__(self, X, y, estimator=DecisionTreeClassifier, itern=20, mode="sign"):
    self.X = X
    self.y = y.copy()
    self.estimator = estimator
    self.mode = mode
    self.itern = itern
    self.estimators = []                      # estimators produced by boosting algorithm
    self.alphas = np.array([])                # weights of each boost estimator
    self.m = self.X.shape[0]                  # number of samples
    self.w = np.array([1/self.m] * self.m)    # weights of samples
    self.cls_list = []                        # list used to store classes' name and numbers
    self.cls0 = y[0]
    for i in range(self.m):
        if y[i] not in self.cls_list:
            self.cls_list.append(y[i])
        if y[i] == self.cls0:
            self.y[i] = 1
        else:
            self.y[i] = -1
    if len(self.cls_list) != 2:
        raise TypeError(
            '''This Adaboost only support two-class problem,
            for multiclass problem, please use AdaboostMH.''')
    self.train()
def __init__(self, X, y, estimator=DecisionTreeClassifier, itern=20, mode="sign"):
    self.X = X
    self.y = y
    self.estimator = estimator
    self.itern = itern
    self.mode = mode
    self.m = self.X.shape[0]    # number of samples
    self.cls_list = []          # list used to store classes' name and numbers
    # if type(y[0]) != np.ndarray:
    #     self.y = y.reshape(len(y),-1)
    for i in range(self.m):
        for cls in self.y[i]:
            if cls not in self.cls_list:
                self.cls_list.append(cls)
    self.k = len(self.cls_list)   # number of classes
    self.boost = self.train()
def __init__(self, X, y, code_dic=None, estimator=DecisionTreeClassifier, itern=20):
    self.X = X
    self.y = y
    self.estimator = estimator
    self.itern = itern
    self.m = self.X.shape[0]    # number of samples
    self.cls_list = []          # list used to store classes' name and numbers
    for i in range(self.m):
        if y[i] not in self.cls_list:
            self.cls_list.append(y[i])
    if code_dic is not None:
        self.k = len(code_dic[self.cls_list[0]])   # dimension of encoding space
    else:
        self.k = len(self.cls_list)
    if code_dic is None:
        # generate default encode dictionary
        code_dic = {}
        for i in range(self.k):
            code = np.array([-1] * self.k)
            code[i] = 1
            code_dic[self.cls_list[i]] = code
    self.code_dic = code_dic    # store {label: array-like code}
    self.boost = self.train()
def test_no_refit_multiple_metrics():
    clf = DecisionTreeClassifier()
    scoring = {'score_1': 'accuracy', 'score_2': 'accuracy'}

    gs = dcv.GridSearchCV(clf, {'max_depth': [1, 2, 3]}, refit=False, scoring=scoring)
    gs.fit(da_X, da_y)

    assert not hasattr(gs, "best_estimator_")
    assert not hasattr(gs, "best_index_")
    assert not hasattr(gs, "best_score_")
    assert not hasattr(gs, "best_params_")

    for fn_name in ('predict', 'predict_proba', 'predict_log_proba'):
        with pytest.raises(NotFittedError) as exc:
            getattr(gs, fn_name)(X)
        assert (('refit=False. %s is available only after refitting on the '
                 'best parameters' % fn_name) in str(exc.value))
def build_decision_tree(filename):
    """Build a decision tree classifier from the labelled records in the given CSV file."""
    f = open(filename, 'r')
    reader = csv.reader(f)
    x = []
    y = []
    for line in reader:
        if line[1] in ['1', '2', '3']:   # keep only rows whose label is 1, 2 or 3
            x.append(line[2:4] + line[5:])
            y.append(line[1])
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(x, y,
                                                                         test_size=0.2,
                                                                         random_state=42)
    clf = tree.DecisionTreeClassifier(max_depth=5)
    clf = clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    print score
    return clf, score
def decision_tree(self, sensors_set):
    features = list(self.dataset.get_sensors_set_features(sensors_set))
    print("DECISION TREE.....")
    print("CLASSIFICATION BASED ON THESE SENSORS: ", self.dataset.get_remained_sensors(sensors_set))
    print("NUMBER OF FEATURES: ", len(features))
    train_features, train_classes, test_features, test_classes = self.__get_sets_for_classification(
        self.dataset.get_train, self.dataset.get_test, features)

    classifier_decision_tree = tree.DecisionTreeClassifier()
    classifier_decision_tree.fit(train_features, train_classes)
    test_prediction = classifier_decision_tree.predict(test_features)
    acc = accuracy_score(test_classes, test_prediction)

    df_feature = pd.DataFrame(
        {'accuracy': acc, 'features': features,
         'importance': classifier_decision_tree.feature_importances_})
    df_feature = df_feature.sort_values(by='importance', ascending=False)

    print("ACCURACY : " + str(acc))
    print("END TREE")

    if not os.path.exists(const.DIR_RESULTS):
        os.makedirs(const.DIR_RESULTS)
    df_feature.to_csv(const.DIR_RESULTS + "/" + str(sensors_set) + const.FILE_DECISION_TREE_RESULTS,
                      index=False)

# random forest algorithm: train on all of the train set and test on all of the test set
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.preprocessing import MultiLabelBinarizer
    import numpy as np

    scikit_data = load_boston()
    scikit_model = DecisionTreeClassifier(random_state=1)
    t = scikit_data.target
    target = np.digitize(t, np.histogram(t)[1]) - 1
    scikit_model.fit(scikit_data.data, target)

    # Save the data and the model
    self.scikit_data = scikit_data
    self.target = target
    self.scikit_model = scikit_model
def fitAndPredict(self):
    # classifier = LogisticRegression()
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print 'Logistic:'
    # print classification_report(self.testLabel, pred_labels)

    self.classifier = SVC()
    self.classifier.fit(self.trainingSet, self.trainingLabel)
    pred_labels = {}
    for user in self.testDict:
        pred_labels[user] = self.classifier.predict([[self.BDS[user]]])
    # print 'SVM:'
    # print classification_report(self.testLabel, pred_labels)

    # classifier = DecisionTreeClassifier(criterion='entropy')
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print 'Decision Tree:'
    # print classification_report(self.testLabel, pred_labels)
    # return self.trainingSet, self.trainingLabel, self.testSet, self.testLabel
    return pred_labels
def fitAndPredict(self):
    # classifier = LogisticRegression()
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print 'Logistic:'
    # print classification_report(self.testLabel, pred_labels)

    pred_labels = {}
    classifier = SVC()
    classifier.fit(self.trainingSet, self.trainingLabel)
    for user in self.testDict:
        pred_labels[user] = classifier.predict([[self.MUD[user], self.RUD[user], self.QUD[user]]])
    # print 'SVM:'
    # print classification_report(self.testLabel, pred_labels)
    return pred_labels

    # classifier = DecisionTreeClassifier(criterion='entropy')
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print 'Decision Tree:'
    # print classification_report(self.testLabel, pred_labels)
    # return self.trainingSet, self.trainingLabel, self.testSet, self.testLabel
def fitAndPredict(self):
    # classifier = LogisticRegression()
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print 'Logistic:'
    # print classification_report(self.testLabel, pred_labels)

    classifier = SVC()
    classifier.fit(self.trainingSet, self.trainingLabel)
    pred_labels = {}
    for user in self.testDict:
        pred_labels[user] = classifier.predict([[self.entropy[user], self.FMD[user]]])
    # print 'SVM:'
    # print classification_report(self.testLabel, pred_labels)

    # classifier = DecisionTreeClassifier(criterion='entropy')
    # classifier.fit(self.trainingSet, self.trainingLabel)
    # pred_labels = classifier.predict(self.testSet)
    # print 'Decision Tree:'
    # print classification_report(self.testLabel, pred_labels)
    # return self.trainingSet, self.trainingLabel, self.testSet, self.testLabel
    return pred_labels
def train_decision_tree(file_name):
    file = open(file_name, 'r')
    train_data = json.load(file)
    file.close()

    train_list = list()
    train_result = list()
    for train_pair in train_data:
        tmp = formatting_data(train_pair[0])
        train_list.append(tmp)
        train_result.append(train_pair[1])

    my_clf = tree.DecisionTreeClassifier()
    my_clf.fit(train_list, train_result)
    return my_clf

# Setting up all the necessary preparation
def article_trainers(articles: ArticleDB):
    """
    Run repeated models against article db to predict validity score for articles.
    """
    models = [(DecisionTreeClassifier, {}),
              (RandomForestClassifier, {}),
              (LogisticRegression, {'C': [0.01, 0.1, 1, 10, 100]}),
              (MultinomialNB, {'alpha': [0.1, 1.0, 10.0, 100.0]}),
              (LinearSVC, {'C': [0.01, 0.1, 1, 10, 100]})]
    trained_models = []
    for classifier, param_grid in models:
        res = train_model(articles, classifier, param_grid, probabilities=True)
        trained_models.append((str(res), res))
    ensemble_learner = VotingClassifier(estimators=trained_models[:4], voting='soft')
    train_model(articles, ensemble_learner, {})
def train_model(data, with_mac=True):
    global without_mac_clf, mac_clf
    df = pd.DataFrame.from_dict(data)
    y = df.pop("location")
    features = [f for f in df.columns if f != 'mac']
    df = df.rename(columns=dict(zip(features, [POWER_SLAVE_PREFIX + f for f in features])))
    model_name = MODEL_MAC_NAME if with_mac else MODEL_NAME
    if with_mac:
        df = df.apply(LabelEncoder().fit_transform)
    else:
        df.drop("mac", axis=1, inplace=True)
    clf = DecisionTreeClassifier()
    clf.fit(df, y)
    joblib.dump(clf, model_name)
    if with_mac and mac_clf is None:
        mac_clf = clf
    if not with_mac and without_mac_clf is None:
        without_mac_clf = clf
    export_graphviz(clf, feature_names=list(df.columns), class_names=y.unique(),
                    filled=True, rounded=True, out_file='model.dot')
    os.system("dot -Tpng model.dot -o model.png")
def learn(x, y, test_x):
    # set sample weight
    weight_list = []
    for j in range(len(y)):
        if y[j] == "0":
            weight_list.append(variables.weight_0_gdbt_b)
        if y[j] == "1000":
            weight_list.append(variables.weight_1000_gdbt_b)
        if y[j] == "1500":
            weight_list.append(variables.weight_1500_gdbt_b)
        if y[j] == "2000":
            weight_list.append(variables.weight_2000_gdbt_b)

    clf = tree.DecisionTreeClassifier(min_samples_split=500).fit(x, y, weight_list)
    print clf.feature_importances_
    prediction_list = clf.predict(test_x)
    return prediction_list
def use_bagging_classifier():
    tree = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=None,
        random_state=3,
    )
    bag = BaggingClassifier(
        base_estimator=tree,
        n_estimators=500,
        max_samples=1.0,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=False,
        random_state=1
    )
    return use_ensemble_classifier(tree, 'Decision tree', bag, 'Bagging')
def predict(self):
    # classifier = LogisticRegression()
    # classifier.fit(self.training, self.trainingLabels)
    # pred_labels = classifier.predict(self.test)
    # print 'Logistic:'
    # print classification_report(self.testLabels, pred_labels)
    #
    # classifier = SVC()
    # classifier.fit(self.training, self.trainingLabels)
    # pred_labels = classifier.predict(self.test)
    # print 'SVM:'
    # print classification_report(self.testLabels, pred_labels)

    classifier = DecisionTreeClassifier(criterion='entropy')
    classifier.fit(self.training, self.trainingLabels)
    pred_labels = classifier.predict(self.test)
    print 'Decision Tree:'
    return pred_labels
def __init__(self, path, etype, **kwargs):
    super(EnsembleModel, self).__init__(path, etype=etype, **kwargs)
    self.basedir = "models/ensemble/"
    self.goldstd = kwargs.get("goldstd")
    self.data = {}
    self.offsets = []
    self.pipeline = Pipeline([
        # ('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
        # ('clf', SGDClassifier())
        # ('clf', svm.NuSVC(nu=0.01))
        ('clf', RandomForestClassifier(class_weight={False: 1, True: 1}, n_jobs=-1,
                                       criterion="entropy", warm_start=True))
        # ('clf', tree.DecisionTreeClassifier(criterion="entropy")),
        # ('clf', MultinomialNB())
        # ('clf', GaussianNB())
        # ('clf', svm.SVC(kernel="rbf", degree=2, C=1)),
        # ('clf', svm.SVC(kernel="linear", C=2))
        # ('clf', DummyClassifier(strategy="constant", constant=True))
    ])
def __init__(self, classes):
    """
    Constructor
    :param classes: Classes
    :param lang: Spacy language
    """
    super(DecisionTree, self).__init__(classes)

    # Properties
    self._token2index = dict()
    self._voc_size = 0
    self._samples = list()
    self._n_samples = 0
    self._tree_classifier = DecisionTreeClassifier(random_state=0)
# end __init__

##############################################
# Public
##############################################

##############################################
# Override
##############################################

# To str
def analyseReasonWithDecisonTree(anamolySample, normalSample, name):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    print len(data)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(data, target)
    dot_data = tree.export_graphviz(clf, out_file=None, feature_names=name,
                                    filled=True, special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    s = str(time.time())
    graph.write_pdf(s + "DT.pdf")
def analyseReasonWithDecisonTree(anamolySample, normalSample):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    print len(data)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(data, target)

    name = []
    for i in data.columns:
        name.append(i)
    dot_data = tree.export_graphviz(clf, out_file=None, feature_names=name,
                                    filled=True, special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    s = str(time.time())
    graph.write_pdf(s + "DT.pdf")
def train(self, training_set, training_target, fea_index):
    clf = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=30,
                                      class_weight="balanced")
    clf = clf.fit(training_set, training_target)

    class_names = np.unique([str(i) for i in training_target])
    feature_names = [attr_list[i] for i in fea_index]
    dot_data = tree.export_graphviz(clf, out_file=None,
                                    feature_names=feature_names,
                                    class_names=class_names,
                                    filled=True, rounded=True,
                                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("output/tree-vis.pdf")
    joblib.dump(clf, 'output/CART.pkl')
def performDTClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    Decision Tree Classification
    """
    # n = parameters[0]
    # l = parameters[1]
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)

    if savemodel == True:
        # fname_out = '{}-{}.pickle'.format(fout, datetime.now())
        fname_out = fout + '.pickle'
        with open(fname_out, 'wb') as f:
            pickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)

    return accuracy

# TODO: use hdf datastructure for dataframes
def clst(X_train, y_train, X_test, y_test, nb_classes):
    model = tree.DecisionTreeClassifier()
    model.fit(X_train, y_train)
    dt_score = model.score(X_test, y_test)
    print("DT-C:", dt_score)

    model = svm.SVC(kernel='linear')
    model.fit(X_train, y_train)
    sv_score = model.score(X_test, y_test)
    print("SVC:", sv_score)

    model = kkeras.MLPC([X_train.shape[1], 30, 10, nb_classes])
    model.fit(X_train, y_train, X_test, y_test, nb_classes)
    mlp_score = model.score(X_test, y_test)
    print("DNN:", mlp_score)

    model = ensemble.RandomForestClassifier(n_estimators=10)
    model.fit(X_train, y_train)
    rf_score = model.score(X_test, y_test)
    print("RF:", rf_score)

    return dt_score, sv_score, mlp_score, rf_score
def decision_tree(X, y, regression, max_depth=3):
    from sklearn.tree import export_graphviz
    from sklearn.externals.six import StringIO
    from IPython.core.pylabtools import figsize
    from IPython.display import Image
    figsize(12.5, 6)
    import pydot

    if regression:
        clf = DecisionTreeRegressor(max_depth=max_depth)
    else:
        clf = DecisionTreeClassifier(max_depth=max_depth)
    clf.fit(X, y)

    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data,
                    feature_names=list(X.columns),
                    filled=True, rounded=True)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    return Image(graph.create_png())
def fit_vanilla(x_train, x_test, y_train, y_test):
    scores = dict()

    # Decision tree
    dt = DecisionTreeClassifier(random_state=random_state)
    scores['dt'] = clf_scores(dt, x_train, y_train, x_test, y_test)

    # Logistic Regression
    lr = LogisticRegression(random_state=random_state, n_jobs=-1)
    scores['lr'] = clf_scores(lr, x_train, y_train, x_test, y_test)

    # Random Forest
    rf = RandomForestClassifier(random_state=random_state, n_jobs=-1)
    scores['rf'] = clf_scores(rf, x_train, y_train, x_test, y_test)

    return scores
def bench_scikit_tree_classifier(X, Y):
    """Benchmark with scikit-learn decision tree classifier"""
    from sklearn.tree import DecisionTreeClassifier

    gc.collect()

    # start time
    tstart = datetime.now()
    clf = DecisionTreeClassifier()
    clf.fit(X, Y).predict(X)
    delta = (datetime.now() - tstart)
    # stop time

    scikit_classifier_results.append(
        delta.seconds + delta.microseconds / mu_second)
def test_probability():
    # Predict probabilities using DecisionTreeClassifier.
    for name, Tree in CLF_TREES.items():
        clf = Tree(max_depth=1, max_features=1, random_state=42)
        clf.fit(iris.data, iris.target)

        prob_predict = clf.predict_proba(iris.data)
        assert_array_almost_equal(np.sum(prob_predict, 1),
                                  np.ones(iris.data.shape[0]),
                                  err_msg="Failed with {0}".format(name))
        assert_array_equal(np.argmax(prob_predict, 1),
                           clf.predict(iris.data),
                           err_msg="Failed with {0}".format(name))
        assert_almost_equal(clf.predict_proba(iris.data),
                            np.exp(clf.predict_log_proba(iris.data)), 8,
                            err_msg="Failed with {0}".format(name))
def test_importances_gini_equal_mse():
    # Check that gini is equivalent to mse for binary output variable
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    # The gini index and the mean square error (variance) might differ due
    # to numerical instability. Since those instabilities mainly occur at
    # high tree depth, we restrict this maximal depth.
    clf = DecisionTreeClassifier(criterion="gini", max_depth=5,
                                 random_state=0).fit(X, y)
    reg = DecisionTreeRegressor(criterion="mse", max_depth=5,
                                random_state=0).fit(X, y)

    assert_almost_equal(clf.feature_importances_, reg.feature_importances_)
    assert_array_equal(clf.tree_.feature, reg.tree_.feature)
    assert_array_equal(clf.tree_.children_left, reg.tree_.children_left)
    assert_array_equal(clf.tree_.children_right, reg.tree_.children_right)
    assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples)
def test_sample_weight_invalid():
    # Check sample weighting raises errors.
    X = np.arange(100)[:, np.newaxis]
    y = np.ones(100)
    y[:50] = 0.0

    clf = DecisionTreeClassifier(random_state=0)

    sample_weight = np.random.rand(100, 1)
    assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)

    sample_weight = np.array(0)
    assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)

    sample_weight = np.ones(101)
    assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)

    sample_weight = np.ones(99)
    assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)
def test_huge_allocations():
    n_bits = int(platform.architecture()[0].rstrip('bit'))

    X = np.random.randn(10, 2)
    y = np.random.randint(0, 2, 10)

    # Sanity check: we cannot request more memory than the size of the address
    # space. Currently raises OverflowError.
    huge = 2 ** (n_bits + 1)
    clf = DecisionTreeClassifier(splitter='best', max_leaf_nodes=huge)
    assert_raises(Exception, clf.fit, X, y)

    # Non-regression test: MemoryError used to be dropped by Cython
    # because of missing "except *".
    huge = 2 ** (n_bits - 1) - 1
    clf = DecisionTreeClassifier(splitter='best', max_leaf_nodes=huge)
    assert_raises(MemoryError, clf.fit, X, y)