def build_models_NLP(train_pos_vec, train_neg_vec):
    """
    Fit a BernoulliNB and a LogisticRegression model on the training data.

    Positive vectors are labelled "pos" and negative vectors "neg".
    Returns the fitted models as ``(nb_model, lr_model)``.
    """
    labels = ["pos"] * len(train_pos_vec) + ["neg"] * len(train_neg_vec)

    # Positive samples first, then negative ones, so rows line up with labels.
    features = list(train_pos_vec) + list(train_neg_vec)

    # BernoulliNB uses alpha=1.0 and binarize=None.
    nb_model = BernoulliNB(alpha=1.0, binarize=None, class_prior=None, fit_prior=True)
    nb_model.fit(features, labels)

    # LogisticRegression is built with its default parameters.
    lr_model = LogisticRegression()
    lr_model.fit(features, labels)

    return nb_model, lr_model
def get_classifier(self):
    """Return a newly constructed estimator for the code in ``self.algo``.

    Returns 0 when the code matches none of the known algorithm keys.
    """
    requested = self.algo
    # Each factory is a lambda so that an estimator is only looked up and
    # constructed for the key that was actually requested.
    factories = (
        ("GBT", lambda: GradientBoostingClassifier()),
        ("RF", lambda: RandomForestClassifier()),
        ("ADB", lambda: AdaBoostClassifier()),
        ("DT", lambda: DecisionTreeClassifier()),
        ("NB", lambda: BernoulliNB()),
        ("SGD", lambda: SGDClassifier()),
        ("SVC", lambda: LinearSVC()),
        ("MLPC", lambda: MLPClassifier(
            activation='logistic',
            batch_size='auto',
            early_stopping=True,
            hidden_layer_sizes=(100,),
            learning_rate='adaptive',
            learning_rate_init=0.1,
            max_iter=5000,
            random_state=1,
            solver='lbfgs',
            tol=0.0001,
            validation_fraction=0.1,
            verbose=False,
            warm_start=False,
        )),
    )
    for key, build in factories:
        if requested == key:
            return build()
    return 0
def Fit(self, bags, bagData):
    """Train ten leave-one-fold-out BernoulliNB / GaussianNB model pairs.

    For every fold index ``i`` in 0..9 a BernoulliNB and a GaussianNB are
    fitted on the samples of all other folds and appended to ``self.Bayes``
    and ``self.GBayes`` respectively.

    bags: not used by this method.
    bagData: indexable by fold; ``bagData[j][0]`` is iterated for the samples
        of fold ``j`` and ``bagData[j][1]`` supplies the matching labels.
    """
    fold_count = 10
    self.Bayes, self.GBayes = [], []
    # range() instead of xrange() so the method runs on Python 2 and 3.
    for held_out in range(fold_count):
        bnb = BernoulliNB()
        gnb = GaussianNB()
        x, xg, y = [], [], []
        for fold in range(fold_count):
            if fold == held_out:
                continue
            for sample in bagData[fold][0]:
                # The same raw sample is encoded once per model type.
                x.append(self.Convert(sample))
                xg.append(self.ConvertGauss(sample))
            y.extend(bagData[fold][1])
        bnb.fit(x, y)
        gnb.fit(xg, y)
        self.Bayes.append(bnb)
        self.GBayes.append(gnb)
def test_discretenb_pickle():
    # Naive Bayes classifiers must predict identically after a pickle
    # round trip.

    def _pickle_roundtrip(estimator):
        # Serialise into an in-memory buffer and load a fresh copy back.
        buf = BytesIO()
        pickle.dump(estimator, buf)
        return pickle.load(BytesIO(buf.getvalue()))

    for estimator_cls in (BernoulliNB, MultinomialNB, GaussianNB):
        fitted = estimator_cls().fit(X2, y2)
        expected = fitted.predict(X2)
        assert_array_equal(expected, _pickle_roundtrip(fitted).predict(X2))

        if estimator_cls is GaussianNB:
            # TODO re-enable me when partial_fit is implemented for GaussianNB
            continue

        # Same check for an estimator trained with partial_fit
        incremental = estimator_cls().partial_fit(X2[:3], y2[:3],
                                                  classes=np.unique(y2))
        incremental.partial_fit(X2[3:], y2[3:])
        assert_array_equal(expected,
                           _pickle_roundtrip(incremental).predict(X2))
def test_input_check_partial_fit():
    # Input validation of partial_fit / predict for the discrete NB classes.
    for estimator_cls in (BernoulliNB, MultinomialNB):
        known_classes = np.unique(y2)

        # X and y with a different number of samples must be rejected
        assert_raises(ValueError, estimator_cls().partial_fit, X2, y2[:-1],
                      classes=known_classes)

        # the first call to partial_fit requires the classes argument
        assert_raises(ValueError, estimator_cls().partial_fit, X2, y2)

        model = estimator_cls()
        model.partial_fit(X2, y2, classes=known_classes)

        # later calls may not pass a different set of classes
        assert_raises(ValueError, model.partial_fit, X2, y2,
                      classes=np.arange(42))

        # dropping a feature column must be rejected by partial_fit ...
        fewer_features = X2[:, :-1]
        assert_raises(ValueError, model.partial_fit, fewer_features, y2)

        # ... and by predict
        assert_raises(ValueError, model.predict, fewer_features)
def test_discretenb_provide_prior_with_partial_fit():
    # A discrete NB model trained in two partial_fit steps must end up with
    # the same class log prior as one trained with a single fit call, both
    # with and without an explicitly provided prior.
    iris = load_iris()
    split = train_test_split(iris.data, iris.target, test_size=0.4,
                             random_state=415)
    first_data, second_data, first_target, second_target = split

    for estimator_cls in (BernoulliNB, MultinomialNB):
        for prior in (None, [0.3, 0.3, 0.4]):
            full_model = estimator_cls(class_prior=prior)
            full_model.fit(iris.data, iris.target)

            partial_model = estimator_cls(class_prior=prior)
            partial_model.partial_fit(first_data, first_target,
                                      classes=[0, 1, 2])
            partial_model.partial_fit(second_data, second_target)

            assert_array_almost_equal(full_model.class_log_prior_,
                                      partial_model.class_log_prior_)
def test_feature_log_prob_bnb():
    # Test for issue #4268.
    # With alpha=1.0 the feature log prob computed by BernoulliNB must equal
    # the expression given in Manning, Raghavan, and Schuetze's
    # "Introduction to Information Retrieval" book:
    # http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
    samples = np.array([[0, 0, 0],
                        [1, 1, 0],
                        [0, 1, 0],
                        [1, 0, 1],
                        [0, 1, 0]])
    targets = np.array([0, 0, 1, 2, 2])

    model = BernoulliNB(alpha=1.0)
    model.fit(samples, targets)

    # log of the smoothed numerator and denominator of
    # P(feature presence | class)
    log_numerator = np.log(model.feature_count_ + 1.0)
    log_class_totals = np.log(model.class_count_ + 2.0)
    # Repeat the per-class denominator once per feature column.
    log_denominator = np.tile(log_class_totals, (samples.shape[1], 1)).T

    # The manual estimate must match exactly
    assert_array_equal(model.feature_log_prob_,
                       (log_numerator - log_denominator))
def __init__(self, info, verbose=True, debug_mode=False):
    """Select and configure the underlying model from the dataset description.

    info: mapping read for the keys 'label_num', 'target_num', 'task' and
        'metric', plus 'is_sparse' (regression and classification) and
        'has_categorical' (classification only).
    verbose: forwarded as the ``verbose`` argument of the selected estimator.
    debug_mode: when >= 2 a RandomPredictor is used and no other model is
        built.

    Sets ``self.name``, ``self.model``, ``self.predict_method`` and
    ``self.postprocessor``.
    """
    self.label_num = info['label_num']
    self.target_num = info['target_num']
    self.task = info['task']
    self.metric = info['metric']
    # To calibrate proba (alternative previously tried: balance=True)
    self.postprocessor = MultiLabelEnsemble(LogisticRegression(), balance=False)

    if debug_mode >= 2:
        self.name = "RandomPredictor"
        self.model = RandomPredictor(self.target_num)
        self.predict_method = self.model.predict_proba
        return

    if info['task'] == 'regression':
        # Explicit comparison with True is kept on purpose: it preserves the
        # original branch selection for non-boolean flag values.
        if info['is_sparse'] == True:  # noqa: E712
            self.name = "BaggingRidgeRegressor"
            # unfortunately, no warm start...
            self.model = BaggingRegressor(base_estimator=Ridge(), n_estimators=1, verbose=verbose)
        else:
            self.name = "GradientBoostingRegressor"
            self.model = GradientBoostingRegressor(
                n_estimators=1, max_depth=4, min_samples_split=14,
                verbose=verbose, warm_start=True)
        # Regressors expose plain predictions, not probabilities.
        self.predict_method = self.model.predict
    else:
        if info['has_categorical']:
            # Out of laziness, we do not convert categorical variables...
            self.name = "RandomForestClassifier"
            # unfortunately, no warm start...
            self.model = RandomForestClassifier(n_estimators=1, verbose=verbose)
        elif info['is_sparse']:
            self.name = "BaggingNBClassifier"
            # unfortunately, no warm start...
            self.model = BaggingClassifier(base_estimator=BernoulliNB(), n_estimators=1, verbose=verbose)
        else:
            self.name = "GradientBoostingClassifier"
            # Constructed directly instead of eval()-ing a source string
            # assembled from self.name and str(verbose).
            self.model = GradientBoostingClassifier(
                n_estimators=1, verbose=verbose, random_state=1,
                warm_start=True)
        if info['task'] == 'multilabel.classification':
            self.model = MultiLabelEnsemble(self.model)
        # Classifiers always predict probabilities.
        self.predict_method = self.model.predict_proba
def test_basic(self, single_chunk_binary_classification):
    # nb.PartialBernoulliNB trained via fit() must end up with the same
    # coef_ as nb_.BernoulliNB trained via partial_fit() on the same data.
    X, y = single_chunk_binary_classification

    wrapped = nb.PartialBernoulliNB(classes=[0, 1])
    reference = nb_.BernoulliNB()

    wrapped.fit(X, y)
    reference.partial_fit(X, y, classes=[0, 1])

    assert_eq(wrapped.coef_, reference.coef_)
def generate_base_classification():
    """Return the (estimator class, parameters) pairs for classification.

    Each entry pairs a scikit-learn classifier class with whatever the
    module-level ``params`` helper returns for the listed parameter names
    (presumably a dict of candidate values -- confirm against ``params``).
    """
    from sklearn.svm import LinearSVC, NuSVC, SVC
    from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

    # dict.update() returns None, so the grid has to be extended in its own
    # statement; calling .update() inline inside the tuple stored None as
    # the KNeighborsClassifier parameters.
    knn_params = params('n_neighbors', 'leaf_size', 'p')
    knn_params.update({
        'algorithm': ['auto', 'brute', 'kd_tree', 'ball_tree']
    })

    models = [
        #(LinearSVC, params('C', 'loss')),
        #(NuSVC, params('nu', 'kernel', 'degree')),
        #(SVC, params('C', 'kernel')),
        #(ExtraTreeClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf')),
        (DecisionTreeClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf')),
        (RandomForestClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf', 'n_estimators')),
        #(GaussianProcessClassifier, None),
        (LogisticRegression, params('C', 'penalty')),
        #(PassiveAggressiveClassifier, params('C', 'loss')),
        #(RidgeClassifier, params('alpha')),
        # NOTE: if the SGDClassifier entry is re-enabled, extend its 'loss'
        # list in a separate statement too -- list.extend() also returns None.
        #(SGDClassifier, params('loss', 'penalty', 'alpha')['loss'].extend(['log', 'modified_huber'])),
        (KNeighborsClassifier, knn_params),
        (MultinomialNB, params('alpha')),
        #(GaussianNB, None),
        #(BernoulliNB, params('alpha'))
    ]
    return models
def train_model(data, target):
    """
    Train and evaluate a Bernoulli Naive Bayes classifier.

    Splits the data into a training set and a validation set (40% held
    out), fits a BernoulliNB on the training set, evaluates the predictions
    for the validation set with ``evaluate_model`` and pickles the fitted
    classifier to 'nb_classifier.pickle'.

    Returns the fitted classifier.
    """
    # TO TRY: stratification for dividing preclassified tweets into
    # homogeneous subgroups before sampling, in order to improve the
    # representativeness of the sampling.
    # NOTE(review): `cross_validation` looks like the legacy
    # sklearn.cross_validation module, which newer scikit-learn releases
    # replaced with sklearn.model_selection -- confirm against the imports.
    train_tweets, validation_tweets, train_sentiment, validation_sentiment = cross_validation.train_test_split(data, target, test_size=0.4)

    # Fit the Naive Bayes classifier with the training tweets and their
    # corresponding sentiment.
    classifier = BernoulliNB().fit(train_tweets, train_sentiment)
    predicted = classifier.predict(validation_tweets)

    # Evaluate the accuracy of the predictions on the held-out split.
    evaluate_model(validation_sentiment, predicted)

    # Pickle the classifier; the context manager closes the file even when
    # pickle.dump raises.
    with open('nb_classifier.pickle', 'wb') as pickle_file:
        pickle.dump(classifier, pickle_file)

    return classifier


################################################################################
def train_BNB(X, y):
    """Fit a default BernoulliNB on ``X`` and ``y`` and return the model."""
    bnb = BernoulliNB()
    # Fit on the function's own arguments rather than on the names
    # X_train / y_train, which are not defined in this function.
    bnb.fit(X, y)
    return bnb
def test_BernoulliNB(*data):
    '''
    Fit a default BernoulliNB and print its training and testing scores.

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    train_x, test_x, train_y, test_y = data

    model = naive_bayes.BernoulliNB()
    model.fit(train_x, train_y)

    training_score = model.score(train_x, train_y)
    print('Training Score: {0}'.format(training_score))
    testing_score = model.score(test_x, test_y)
    print('Testing Score: {0}'.format(testing_score))
def test_BernoulliNB_alpha(*data):
    '''
    Plot BernoulliNB training and testing scores for a range of alpha values.

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    train_x, test_x, train_y, test_y = data
    alphas = np.logspace(-2, 5, num=200)

    # One (training score, testing score) pair per alpha value.
    score_pairs = []
    for alpha in alphas:
        model = naive_bayes.BernoulliNB(alpha=alpha)
        model.fit(train_x, train_y)
        score_pairs.append((model.score(train_x, train_y),
                            model.score(test_x, test_y)))
    train_scores = [pair[0] for pair in score_pairs]
    test_scores = [pair[1] for pair in score_pairs]

    # Graph both curves against alpha on a logarithmic x axis.
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alphas, train_scores, label="Training Score")
    ax.plot(alphas, test_scores, label="Testing Score")
    ax.set_xlabel(r"$\alpha$")
    ax.set_ylabel("score")
    ax.set_ylim(0, 1.0)
    ax.set_title("BernoulliNB")
    ax.set_xscale("log")
    ax.legend(loc="best")
    plt.show()
def test_BernoulliNB_binarize(*data):
    '''
    Plot BernoulliNB training and testing scores for a range of binarize
    thresholds.

    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    train_x, test_x, train_y, test_y = data

    # Thresholds span the full value range of both sets, padded by 0.1.
    smallest = min(np.min(train_x.ravel()), np.min(test_x.ravel()))
    largest = max(np.max(train_x.ravel()), np.max(test_x.ravel()))
    min_x = smallest - 0.1
    max_x = largest + 0.1
    binarizes = np.linspace(min_x, max_x, endpoint=True, num=100)

    # One (training score, testing score) pair per threshold.
    score_pairs = []
    for threshold in binarizes:
        model = naive_bayes.BernoulliNB(binarize=threshold)
        model.fit(train_x, train_y)
        score_pairs.append((model.score(train_x, train_y),
                            model.score(test_x, test_y)))
    train_scores = [pair[0] for pair in score_pairs]
    test_scores = [pair[1] for pair in score_pairs]

    # Graph both curves against the threshold.
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(binarizes, train_scores, label="Training Score")
    ax.plot(binarizes, test_scores, label="Testing Score")
    ax.set_xlabel("binarize")
    ax.set_ylabel("score")
    ax.set_ylim(0, 1.0)
    ax.set_xlim(min_x - 1, max_x + 1)
    ax.set_title("BernoulliNB")
    ax.legend(loc="best")
    plt.show()
def sk_bernoulli_demo():
    """Fit BernoulliNB on random binary data and print one prediction."""
    x = np.random.randint(2, size=(6, 100))
    y = np.array([1, 2, 3, 4, 4, 5])
    clf = BernoulliNB()
    clf.fit(x, y)
    # Slice (x[2:3]) instead of indexing (x[2]) so predict() receives a 2-D
    # array holding a single sample; print() as a function works on both
    # Python 2 and Python 3.
    print(clf.predict(x[2:3]))
def test_discrete_prior():
    # Class priors learned from X2 / y2 must be log(2 / 6) for each of the
    # three classes.
    expected_log_prior = np.log(np.array([2, 2, 2]) / 6.0)
    for estimator_cls in (BernoulliNB, MultinomialNB):
        model = estimator_cls().fit(X2, y2)
        assert_array_almost_equal(expected_log_prior,
                                  model.class_log_prior_, 8)
def test_discretenb_partial_fit():
    # Generator test: yields one (check function, estimator class) pair per
    # discrete naive Bayes class.
    for estimator_cls in (MultinomialNB, BernoulliNB):
        yield (check_partial_fit, estimator_cls)
def test_input_check_fit():
    # Input validation of fit / predict for all naive Bayes classes.
    for estimator_cls in (BernoulliNB, MultinomialNB, GaussianNB):
        # fit must reject X and y with a different number of samples
        unfitted = estimator_cls()
        assert_raises(ValueError, unfitted.fit, X2, y2[:-1])

        # predict must reject input with a different number of features
        fitted = estimator_cls().fit(X2, y2)
        fewer_features = X2[:, :-1]
        assert_raises(ValueError, fitted.predict, fewer_features)
def test_discretenb_uniform_prior():
    # With fit_prior=False and class_prior=None the discrete NB classes
    # must fall back to a uniform prior.
    uniform = np.array([.5, .5])
    for estimator_cls in (BernoulliNB, MultinomialNB):
        model = estimator_cls()
        model.set_params(fit_prior=False)
        model.fit([[0], [0], [1]], [0, 0, 1])
        learned_prior = np.exp(model.class_log_prior_)
        assert_array_equal(learned_prior, uniform)
def test_discretenb_provide_prior():
    # A class prior passed to the constructor must be used as-is.
    for estimator_cls in (BernoulliNB, MultinomialNB):
        model = estimator_cls(class_prior=[0.5, 0.5])
        model.fit([[0], [0], [1]], [0, 0, 1])
        learned_prior = np.exp(model.class_log_prior_)
        assert_array_equal(learned_prior, np.array([.5, .5]))

        # A two-entry prior is inconsistent with three classes
        three_class_X = [[0], [1], [2]]
        three_class_y = [0, 1, 2]
        assert_raises(ValueError, model.fit, three_class_X, three_class_y)
        assert_raises(ValueError, model.partial_fit, [[0], [1]], [0, 1],
                      classes=[0, 1, 1])
def test_sample_weight_multiclass():
    # Generator test: yields one (check function, estimator class) pair per
    # discrete naive Bayes class.
    for estimator_cls in (BernoulliNB, MultinomialNB):
        yield (check_sample_weight_multiclass, estimator_cls)
def test_coef_intercept_shape():
    # coef_ and intercept_ should have shapes as in other linear models.
    # Non-regression test for issue #2127.
    features = [[1, 0, 0],
                [1, 1, 1]]
    labels = [1, 2]  # binary classification

    for model in (MultinomialNB(), BernoulliNB()):
        model.fit(features, labels)
        coef_shape = model.coef_.shape
        intercept_shape = model.intercept_.shape
        assert_equal(coef_shape, (1, 3))
        assert_equal(intercept_shape, (1,))
def create_new_user(sess_id):
    """Store a new user plus a fresh BernoulliNB classifier for a session.

    A User with the given session id and a Classifiers row referencing that
    user's id are added to the database session and committed. Returns the
    new (unfitted) BernoulliNB instance.
    """
    fresh_clf = BernoulliNB()

    db.session.add(User(session_id=sess_id))

    # The id is read back through a query by session id; presumably this
    # relies on the session flushing the pending user first -- verify.
    matching_users = User.query.filter_by(session_id=sess_id).all()
    owner_id = matching_users[0].id

    db.session.add(Classifiers(user_id=owner_id, pickled_classifier=fresh_clf))
    db.session.commit()
    return fresh_clf
def get_pipeline_builder():
    """Return a PipelineBuilder with all available pipeline steps registered.

    Registers the feature extractors, dimension reductors, normalizers and
    classification models, each with its key, class, display name and a
    parameter dict (empty except for CountVectorizer's n-gram ranges).
    """
    builder = PipelineBuilder()

    # Feature Extraction
    builder.add_extractor('CountVectorizer', CountVectorizer, 'Count Vectorizer',
                          {'ngram_range': [(1, 1), (1, 2), (1, 3)]})
    builder.add_extractor('HashingVectorizer', HashingVectorizer, 'Hashing Vectorizer', {})
    builder.add_extractor('TfidfVectorizer', TfidfVectorizer, 'TfIdf Vectorizer', {})

    # Dimension Reduction
    builder.add_reductor('No_Reduction', ModelNull, 'None', {})
    builder.add_reductor('TruncatedSVD', TruncatedSVD, 'Truncated SVD', {})

    # Normalization
    builder.add_normalizer('No_Normalization', ModelNull, 'None', {})
    builder.add_normalizer('Normalizer', Normalizer, 'Normalizer', {})

    # Classification Models
    builder.add_classifier('MultinomialNB', MultinomialNB, 'Multinomial Naive Bayes', {})
    builder.add_classifier('BernoulliNB', BernoulliNB, 'Bernoulli Naive Bayes', {})
    builder.add_classifier('KNeighborsClassifier', KNeighborsClassifier, 'K-Neighbors', {})
    builder.add_classifier('RadiusNeighborsClassifier', RadiusNeighborsClassifier, 'Radius Neighbors', {})

    return builder
def train(self):
    """Build the feature sets, train every classifier and pickle the results.

    Reads "data/positive.txt" and "data/negative.txt", labels each line
    "pos" / "neg", and collects the lower-cased words whose POS tag starts
    with "J" (presumably adjectives -- confirm against the tag set in use).
    The 5000 most common of those words become the feature vocabulary.
    The shuffled feature sets are split at index 10000 into training and
    test data; an NLTK naive Bayes classifier and five scikit-learn
    classifiers are trained, their test accuracy is printed, and each is
    pickled under "pickle/".

    Intermediate results are kept on the instance (``pos``, ``neg``,
    ``doc``, ``words``, ``wordFeat``, ``featSet``, ``testSet``,
    ``triainSet``). Loop variables are plain locals and are no longer
    stored on the instance.
    """

    def _read_text(path):
        # Read a whole text file and close the handle afterwards.
        with open(path, "r") as handle:
            return handle.read()

    def _dump(obj, path):
        # Pickle obj to path and close the handle afterwards.
        with open(path, "wb") as handle:
            pickle.dump(obj, handle)

    def _collect(text, label):
        # Record every line as a labelled document and gather its "J" words.
        for line in text.split('\n'):
            self.doc.append((line, label))
            for token, tag in nltk.pos_tag(word_tokenize(line)):
                if tag[0] == "J":
                    self.words.append(token.lower())

    self.pos = _read_text("data/positive.txt")
    self.neg = _read_text("data/negative.txt")
    self.words = []
    self.doc = []
    _collect(self.pos, "pos")
    _collect(self.neg, "neg")
    _dump(self.doc, "pickle/doc.pickle")

    self.words = nltk.FreqDist(self.words)
    # Keep only the word of each (word, count) pair.
    self.wordFeat = [word for (word, _count) in self.words.most_common(5000)]
    _dump(self.wordFeat, "pickle/wordFeat.pickle")

    self.featSet = [(trainClassifier().featureFind(rev, self.wordFeat), category)
                    for (rev, category) in self.doc]
    random.shuffle(self.featSet)
    self.testSet = self.featSet[10000:]
    # The attribute name (including its spelling) is kept for compatibility.
    self.triainSet = self.featSet[:10000]
    _dump(self.featSet, "pickle/featSet.pickle")

    ONB = nltk.NaiveBayesClassifier.train(self.triainSet)
    print("Original Naive Bayes Algo accuracy:", round((nltk.classify.accuracy(ONB, self.testSet)) * 100, 2), "%")
    _dump(ONB, "pickle/ONB.pickle")

    # (printed label, scikit-learn estimator class, pickle path)
    sklearn_models = (
        ("MultinomialNB accuracy:", MultinomialNB, "pickle/MNB.pickle"),
        ("BernoulliNB accuracy percent:", BernoulliNB, "pickle/BNB.pickle"),
        ("LogisticRegression accuracy:", LogisticRegression, "pickle/LR.pickle"),
        ("LinearSVC accuracy:", LinearSVC, "pickle/LSVC.pickle"),
        ("SGDClassifier accuracy:", SGDClassifier, "pickle/SGDC.pickle"),
    )
    for label, estimator_cls, pickle_path in sklearn_models:
        wrapped = SklearnClassifier(estimator_cls())
        wrapped.train(self.triainSet)
        print(label, round((nltk.classify.accuracy(wrapped, self.testSet)) * 100, 2), "%")
        _dump(wrapped, pickle_path)
def test_bnb():
    # BernoulliNB with alpha=1.0 must reproduce the values of the toy
    # example in Manning, Raghavan, and Schuetze's "Introduction to
    # Information Retrieval" book:
    # http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html

    # Training data points are:
    # Chinese Beijing Chinese (class: China)
    # Chinese Chinese Shanghai (class: China)
    # Chinese Macao (class: China)
    # Tokyo Japan Chinese (class: Japan)

    # Feature columns: Beijing, Chinese, Japan, Macao, Shanghai, Tokyo
    train_docs = np.array([[1, 1, 0, 0, 0, 0],
                           [0, 1, 0, 0, 1, 0],
                           [0, 1, 0, 1, 0, 0],
                           [0, 1, 1, 0, 0, 1]])

    # Classes are China (0), Japan (1)
    train_classes = np.array([0, 0, 0, 1])

    model = BernoulliNB(alpha=1.0)
    model.fit(train_docs, train_classes)

    # Class prior
    expected_prior = np.array([0.75, 0.25])
    assert_array_almost_equal(np.exp(model.class_log_prior_), expected_prior)

    # Per-class feature probabilities
    expected_feature_prob = np.array(
        [[0.4, 0.8, 0.2, 0.4, 0.4, 0.2],
         [1/3.0, 2/3.0, 2/3.0, 1/3.0, 1/3.0, 2/3.0]])
    assert_array_almost_equal(np.exp(model.feature_log_prob_),
                              expected_feature_prob)

    # Testing data point is:
    # Chinese Chinese Chinese Tokyo Japan
    test_doc = np.array([[0, 1, 1, 0, 0, 1]])

    # Predictive probabilities: normalise the unnormalised values
    unnormalised = np.array([[0.005183999999999999, 0.02194787379972565]])
    expected_proba = unnormalised / np.sum(unnormalised)
    assert_array_almost_equal(model.predict_proba(test_doc), expected_proba)