We extracted the following 50 code examples from open source Python projects to illustrate how to use sklearn.naive_bayes.MultinomialNB().
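Before the project-specific examples, here is a minimal, self-contained sketch of the typical usage pattern: vectorize text with CountVectorizer, fit MultinomialNB, and predict. The toy documents and labels below are made up purely for illustration and do not come from any of the projects listed.

# Minimal illustrative sketch (hypothetical toy data):
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

docs = ["free money now", "meeting at noon", "win cash prizes", "project status update"]
labels = [1, 0, 1, 0]  # 1 = spam, 0 = ham

vect = CountVectorizer()
X = vect.fit_transform(docs)      # sparse term-count matrix
clf = MultinomialNB(alpha=1.0)    # Laplace smoothing
clf.fit(X, labels)

print(clf.predict(vect.transform(["free cash"])))  # prints [1] for this toy data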
def test_ovr_partial_fit():
    # Test if partial_fit is working as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)
    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == pred), 0.65)

    # Test when mini batches don't have all classes
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovr.partial_fit(iris.data[60:], iris.target[60:])
    pred = ovr.predict(iris.data)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(iris.data, iris.target).predict(iris.data)
    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == pred), 0.65)
def test_ovr_multilabel():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]])
    y = np.array([[0, 1, 1], [0, 1, 0], [1, 1, 1], [1, 0, 1], [1, 0, 0]])

    for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
                     LinearRegression(), Ridge(),
                     ElasticNet(), Lasso(alpha=0.5)):
        clf = OneVsRestClassifier(base_clf).fit(X, y)
        y_pred = clf.predict([[0, 4, 4]])[0]
        assert_array_equal(y_pred, [0, 1, 1])
        assert_true(clf.multilabel_)
def predict_job(job_list):
    """Assign a classification to a url"""
    # TODO: Add case where len is 1 or 0....
    job_list = [job for j in job_list for job in j]
    new_job_list = [regex.tokenize_and_stem(i) for i in job_list]
    new_job_list = [' '.join(job) for job in new_job_list]
    vect = CountVectorizer()
    x_series = pd.Series(X)
    X_train_dtm = vect.fit_transform(x_series)
    y_train = pd.Series(y)
    job_list_series = pd.Series(new_job_list)
    job_list_dtm = vect.transform(job_list_series)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred = nb.predict(job_list_dtm)
    # for i in range(len(job_list)):
    #     print(job_list[i], y_pred[i])
    return y_pred

# print(predict_job([('Founder',), ('Founder',), ('Architect & Full-stack developer',),
#                    ('Senior Engineer',), ('Technical Consultant',)]))
def train_classifier(self, trainvectors, labels, no_label_encoding=False, alpha='default', fit_prior=True, iterations=10):
    if alpha == '':
        paramsearch = GridSearchCV(estimator=naive_bayes.MultinomialNB(),
                                   param_grid=dict(alpha=numpy.linspace(0, 2, 20)[1:]), n_jobs=6)
        paramsearch.fit(trainvectors, self.label_encoder.transform(labels))
        selected_alpha = paramsearch.best_estimator_.alpha
    elif alpha == 'default':
        selected_alpha = 1.0
    else:
        selected_alpha = float(alpha)
    if fit_prior == 'False':
        fit_prior = False
    else:
        fit_prior = True
    self.model = naive_bayes.MultinomialNB(alpha=selected_alpha, fit_prior=fit_prior)
    if no_label_encoding:
        self.model.fit(trainvectors, labels)
    else:
        self.model.fit(trainvectors, self.label_encoder.transform(labels))
def __init__(self, df, weight=True, min_ct=0, total_iter=5):
    self.logger = logging.getLogger(__name__)
    super(MultinomialNaiveBayes, self).__init__(total_iterations=total_iter)  # call base constructor
    # self.set_min_count(min_ct)
    self.is_weighted_sample = weight

    # process data
    # df = self._filter_rows(df)  # filter out low count rows
    # row_sums = df.sum(axis=1).astype(float)
    # df = df.div(row_sums, axis=0)  # normalize each row
    # df = df.mul(100)
    # df.to_csv('tmp.nbclf.txt', sep='\t')
    df = df.fillna(df.mean())
    total = df['total']
    df = df[['recurrent missense', 'recurrent indel', 'frame shift',
             'nonsense', 'missense', 'synonymous', 'inframe indel',
             'no protein', 'lost stop', 'splicing mutation']]
    df = df.mul(total, axis=0).astype(int)  # get back counts instead of pct
    self.x, self.y = features.randomize(df)

    # setup classifier
    self.clf = MultinomialNB(alpha=1,         # laplacian smooth, i.e. pseudocounts
                             fit_prior=True)  # use data for prior class probs
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.iteritems():
            tweet = re.sub(r, repl, tweet)
        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor, analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
def create_ngram_model(params=None):
    def preprocessor(tweet):
        global emoticons_replaced
        tweet = tweet.lower()
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.iteritems():
            tweet = re.sub(r, repl, tweet)
        return tweet

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor, analyzer="word")
    clf = MultinomialNB()
    pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
def article_trainers(articles: ArticleDB):
    """
    Run repeated models against article db to predict validity score for articles.
    """
    models = [(DecisionTreeClassifier, {}),
              (RandomForestClassifier, {}),
              (LogisticRegression, {'C': [0.01, 0.1, 1, 10, 100]}),
              (MultinomialNB, {'alpha': [0.1, 1.0, 10.0, 100.0]}),
              (LinearSVC, {'C': [0.01, 0.1, 1, 10, 100]})]
    trained_models = []
    for classifier, param_grid in models:
        res = train_model(articles, classifier, param_grid, probabilities=True)
        trained_models.append((str(res), res))
    ensemble_learner = VotingClassifier(estimators=trained_models[:4], voting='soft')
    train_model(articles, ensemble_learner, {})
def test_integrated_plot_numpy_named_arrays(self):
    model = naive_bayes.MultinomialNB()

    X = np.array([
        (1.1, 9.52, 1.23, 0.86, 7.89, 0.13),
        (3.4, 2.84, 8.65, 0.45, 7.43, 0.16),
        (1.2, 3.22, 6.56, 0.24, 3.45, 0.17),
        (3.8, 6.18, 2.45, 0.28, 2.53, 0.13),
        (5.1, 9.12, 1.06, 0.19, 1.43, 0.13),
        (4.4, 8.84, 4.97, 0.98, 1.35, 0.13),
        (3.2, 3.22, 5.03, 0.68, 3.53, 0.32),
        (7.8, 2.18, 6.87, 0.35, 3.25, 0.38),
    ], dtype=[('a', '<f8'), ('b', '<f8'), ('c', '<f8'),
              ('d', '<f8'), ('e', '<f8'), ('f', '<f8')])

    y = np.array([1, 1, 0, 1, 0, 0, 1, 0])

    visualizer = DecisionBoundariesVisualizer(model, features=['a', 'f'])
    visualizer.fit_draw_poof(X, y=y)

    self.assertEquals(visualizer.features_, ['a', 'f'])
    self.assert_images_similar(visualizer)
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
    super(ScikitRE, self).__init__()
    self.modelname = relationtype + "_" + modelname
    self.relationtype = relationtype
    self.pairtype = relationtype
    self.corpus = corpus
    self.pairs = []
    self.features = []
    self.labels = []
    self.pred = []
    self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
    self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
    self.generate_data(corpus, modelname, relationtype)
    self.text_clf = Pipeline([
        ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3, 20), min_df=0.0, max_df=0.7)),
        # ('vect', CountVectorizer(ngram_range=(1, 3), binary=False, max_features=None)),
        # ('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
        # ('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
        # ('clf', SGDClassifier())
        # ('clf', svm.NuSVC(nu=0.01))
        # ('clf', RandomForestClassifier(class_weight={False: 1, True: 2}, n_jobs=-1))
        ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
        # ('clf', DummyClassifier(strategy="constant", constant=True))
    ])
def __init__(self, fit_scaler=None, transform_scaler='bin'):
    self.fit_scaler = fit_scaler
    self.transform_scaler = transform_scaler

    if fit_scaler in MNBScaler.fit_scalers:
        self.fit_scaler_ = None if fit_scaler is None else MNBScaler.fit_scalers[fit_scaler]()
    else:
        raise ValueError("fit_scaler should be one of %r but %s specified"
                         % (MNBScaler.fit_scalers.keys(), fit_scaler))

    if transform_scaler in MNBScaler.transform_scalers:
        self.transform_scaler_ = None if transform_scaler is None else \
            self.fit_scaler_ if transform_scaler == 'auto' else \
            MNBScaler.transform_scalers[transform_scaler]()
    else:
        raise ValueError("transform_scaler should be one of %r but %s specified"
                         % (MNBScaler.transform_scalers.keys(), transform_scaler))

    self.mnb_ = MultinomialNB()
def MultinomialNBPredictModel(localTrainLabel, config):
    train = pd.read_csv('../feature/trainQlist.csv', header=0, sep=",")
    test = pd.read_csv('../feature/testQlist.csv', header=0, sep=",")
    print "Train tf-idf vector Model..."
    encode = TfidfVectorizer(decode_error='ignore', norm="l2", binary=False,
                             sublinear_tf=True, min_df=50)
    localTrainFeature = encode.fit_transform(train['qlist'].values)
    localTestFeature = encode.transform(test['qlist'].values)  # transform the test questions, not the train ones
    print localTrainFeature.shape, localTestFeature.shape
    print 'train...'
    model = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
    model.fit(X=localTrainFeature, y=localTrainLabel)
    print 'predict...'
    if config['prob'] == False:
        return model.predict(localTestFeature), test['uid'].values
    else:
        return model.predict_log_proba(localTestFeature), test['uid'].values

# -- xgboost local cross validation model frame
def test_ovr_multilabel_dataset():
    base_clf = MultinomialNB(alpha=1)
    for au, prec, recall in zip((True, False), (0.51, 0.66), (0.51, 0.80)):
        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=2,
                                                       length=50,
                                                       allow_unlabeled=au,
                                                       random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test, Y_test = X[80:], Y[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        assert_true(clf.multilabel_)
        assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"),
                            prec, decimal=2)
        assert_almost_equal(recall_score(Y_test, Y_pred, average="micro"),
                            recall, decimal=2)
def test_ovr_single_label_predict_proba():
    base_clf = MultinomialNB(alpha=1)
    X, Y = iris.data, iris.target
    X_train, Y_train = X[:80], Y[:80]
    X_test = X[80:]
    clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

    # decision function only estimator. Fails in current implementation.
    decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
    assert_raises(AttributeError, decision_only.predict_proba, X_test)

    Y_pred = clf.predict(X_test)
    Y_proba = clf.predict_proba(X_test)

    assert_almost_equal(Y_proba.sum(axis=1), 1.0)
    # predict assigns a label if the probability that the
    # sample has the label is greater than 0.5.
    pred = np.array([l.argmax() for l in Y_proba])
    assert_false((pred - Y_pred).any())
def test_ovo_partial_fit_predict():
    X, y = shuffle(iris.data, iris.target)
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:100], y[:100], np.unique(y))
    ovo1.partial_fit(X[100:], y[100:])
    pred1 = ovo1.predict(X)

    ovo2 = OneVsOneClassifier(MultinomialNB())
    ovo2.fit(X, y)
    pred2 = ovo2.predict(X)
    assert_equal(len(ovo1.estimators_), n_classes * (n_classes - 1) / 2)
    assert_greater(np.mean(y == pred1), 0.65)
    assert_almost_equal(pred1, pred2)

    # Test when mini-batches don't have all target classes
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovo1.partial_fit(iris.data[60:], iris.target[60:])
    pred1 = ovo1.predict(iris.data)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(iris.data, iris.target).predict(iris.data)

    assert_almost_equal(pred1, pred2)
    assert_equal(len(ovo1.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == pred1), 0.65)
def test_input_check_partial_fit():
    for cls in [BernoulliNB, MultinomialNB]:
        # check shape consistency
        assert_raises(ValueError, cls().partial_fit, X2, y2[:-1],
                      classes=np.unique(y2))

        # classes is required for first call to partial_fit
        assert_raises(ValueError, cls().partial_fit, X2, y2)

        # check consistency of consecutive classes values
        clf = cls()
        clf.partial_fit(X2, y2, classes=np.unique(y2))
        assert_raises(ValueError, clf.partial_fit, X2, y2,
                      classes=np.arange(42))

        # check consistency of input shape for partial_fit
        assert_raises(ValueError, clf.partial_fit, X2[:, :-1], y2)

        # check consistency of input shape for predict
        assert_raises(ValueError, clf.predict, X2[:, :-1])
def getClassifier(self, **kwargs):
    """Returns a classifier and vectorizer to predict the query"""
    self.path = kwargs.get('path', 'trainer')
    self.df = self.trainWith(self.path)
    self.vectorizer = CountVectorizer()
    counts = self.vectorizer.fit_transform(self.df['message'].values)
    self.classifier = MultinomialNB()
    targets = self.df['class'].values
    self.classifier.fit(counts, targets)
    os.chdir(self.old_loc)
    return self.classifier, self.vectorizer
def classify(n=50):
    # clf = MultinomialNB(fit_prior=False)
    # clf = SVC(gamma=2, C=1, class_weight={0.0: 0.063829777, 1.0: 1.0})
    clf = SGDClassifier(loss="log", penalty="l1", class_weight={0.0: 0.022, 1.0: 1.0})
    clf.fit(mat[:n], rel[:n])
    return clf
def train_test():
    """Identify accuracy via training set"""
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
    vect = CountVectorizer()
    X_train_dtm = vect.fit_transform(X_train)  # creates vocab set and dtm for each raw document!
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)  # make class predictions for X_test_dtm
    # w = list(X_test)
    return metrics.accuracy_score(y_test, y_pred_class)

# print(train_test())
def nb_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print metrics.accuracy_score(y_test, y_pred)
def __create_ngram_model(self, lang):
    if lang == 'en':
        tfidf_ngrams = EnglishTfidfVectorizer(decode_error='ignore')
    elif lang == 'ja':
        tfidf_ngrams = JapaneseTfidfVectorizer(decode_error='ignore')
    clf = MultinomialNB()
    pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])
    return pipeline
def get_naive_bayes(is_multiclass=True):
    return MultinomialNB()
def _estimator(self):
    return MultinomialNB()
def train_classifier(self, trainvectors, labels, alpha='default', fit_prior=True, iterations=10):
    if alpha == '':
        paramsearch = GridSearchCV(estimator=naive_bayes.MultinomialNB(),
                                   param_grid=dict(alpha=numpy.linspace(0, 2, 20)[1:]), n_jobs=6)
        paramsearch.fit(trainvectors, self.label_encoder.transform(labels))
        selected_alpha = paramsearch.best_estimator_.alpha
    elif alpha == 'default':
        selected_alpha = 1.0
    else:
        selected_alpha = alpha
    if fit_prior == 'False':
        fit_prior = False
    else:
        fit_prior = True
    self.model = naive_bayes.MultinomialNB(alpha=selected_alpha, fit_prior=fit_prior)
    self.model.fit(trainvectors, self.label_encoder.transform(labels))
def test_basic(self, single_chunk_count_classification):
    X, y = single_chunk_count_classification
    a = nb.PartialMultinomialNB(classes=[0, 1])
    b = nb_.MultinomialNB()
    a.fit(X, y)
    b.partial_fit(X, y, classes=[0, 1])
    assert_eq(a.coef_, b.coef_)
def case1():
    from sklearn import datasets
    news = datasets.fetch_20newsgroups(subset='all')
    # print len(news.data)
    # print len(news.target)
    # print '*'*10
    # print news.data[0]
    # print '*'*10
    # print news.target[0]
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    vec = CountVectorizer()
    x = vec.fit_transform(news.data)
    # print x.shape
    # print x[:2]
    print x[:10, :10].toarray()
    TFIDF = TfidfTransformer()
    x_tfidf = TFIDF.fit_transform(x)
    print x_tfidf[:10, :10].toarray()
    from sklearn.cross_validation import train_test_split
    Xtrain, Xtest, ytrain, ytest = train_test_split(x, news.target, test_size=0.3, random_state=233)
    tf_Xtrain, tf_Xtest, tf_ytrain, tf_ytest = train_test_split(x_tfidf, news.target, test_size=0.3, random_state=233)
    from sklearn.naive_bayes import MultinomialNB
    mnb = MultinomialNB()
    tf_mnb = MultinomialNB()
    mnb.fit(Xtrain, ytrain)
    tf_mnb.fit(tf_Xtrain, tf_ytrain)
def test_init_no_file():
    mm = mnb_modelmanager.MNBModelManager()
    assert isinstance(mm, mnb_modelmanager.MNBModelManager)
    assert isinstance(mm.clf, Pipeline)
    assert isinstance(mm.clf.named_steps['clf'], MultinomialNB)
def test_init_mnb():
    ct = classifytext.ClassifyText(type=classifytext.MNB)
    assert isinstance(ct.mm, mnb_modelmanager.MNBModelManager)
    assert isinstance(ct.mm.clf, Pipeline)
    # assert isinstance(ct.mm.clf.named_steps['clf'], MultinomialNB)
def __init__(self, filename=None):
    super().__init__(filename)
    if not filename:
        self.clf = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'))),
            ('anova', SelectPercentile(f_classif)),
            ('clf', MultinomialNB())
        ])
def NbClass(x_train, y_train):
    from sklearn.naive_bayes import MultinomialNB
    clf = MultinomialNB(alpha=0.01).fit(x_train, y_train)
    return clf

# ========Logistic Regression========#
def bayes(self):
    self.mnb = MultinomialNB()
    self.y_train = self.y_train.astype('int')
    self.mnb.fit(self.x_trainvect, self.y_train)
def fit(self, dataset, filename):
    self.logger.debug("fit")
    self.clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])
    self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
    joblib.dump(self.clf, filename + ".pkl", compress=9)
def train(self, datadict, labels=None):
    '''
    Runs the classifier training using the dictionary of label, features

    @param datadict: dictionary of label, features
    @param labels: (optional) list of labels. If given, the order of labels is used from this list.
    '''
    # Set labels from data dict
    if labels is None:
        self.labels = datadict.keys()
    else:
        self.labels = labels

    # Train the GMM for BoF computation
    if self.model.gmm is None:
        print >> sys.stderr, 'Model not trained yet.'
        self.model.train(datadict, self.labels)

    print >> sys.stderr, 'Computing', self.model.__class__.__name__, '...'
    # Parse dictionary into BoF representations and labels
    bofs, bofl = self._parse_dict(datadict, self.labels)

    # Create Multinomial Bayes
    print >> sys.stderr, 'Training Multinomial Bayes ...'
    self.bay = bayes.MultinomialNB(alpha=0.5, fit_prior=False)
    self.bay.fit(bofs, bofl)
    return
def train_expert(action_context):
    logreg = OneVsRestClassifier(LogisticRegression())
    mnb = OneVsRestClassifier(MultinomialNB())
    logreg.fit(action_context.iloc[:, 2:], action_context.iloc[:, 1])
    mnb.fit(action_context.iloc[:, 2:], action_context.iloc[:, 1])
    return [logreg, mnb]
def train_expert(history_context, history_action):
    n_round = len(history_context)
    history_context = np.array([history_context[t] for t in range(n_round)])
    history_action = np.array([history_action[t] for t in range(n_round)])
    logreg = OneVsRestClassifier(LogisticRegression())
    mnb = OneVsRestClassifier(MultinomialNB())
    logreg.fit(history_context, history_action)
    mnb.fit(history_context, history_action)
    return [logreg, mnb]
def get_pipeline(sample_col, parallel_jobs=None):
    feat_ext_objs = [feat_ext_class(sample_col)
                     for feat_ext_class in get_objs(FEAT_EXTS_DIR, 'Worker')]
    feat_ext_tuples = [(feat_ext_obj.feature_name, feat_ext_obj)
                       for feat_ext_obj in feat_ext_objs]
    pipeline = Pipeline([
        ('features', FeatureUnion(feat_ext_tuples, n_jobs=parallel_jobs)),
        ('describe_data', describe_data.Transformer()),
        ('classifier', MultinomialNB()),
    ])
    return pipeline
def train_classifier(self):
    """ Train the classifier """
    self.extract_feature()
    self.clf = MultinomialNB().fit(self.train_tfidf, self.data['train'].target)
def NBModel(self, train_data, test_data, train_labels, test_labels):
    model = MultinomialNB(alpha=0.01)
    model.fit(train_data, train_labels)
    self.saveModel(model, 'NB')
    predict = model.predict(test_data)
    return metrics.accuracy_score(test_labels, predict)
def _init_classifiers(self):
    mnb = MultinomialNB()
    cnb = ComplementNB()
    nnb = NegationNB()
    unb = UniversalSetNB()
    snb = SelectiveNB()
    return [mnb, cnb, nnb, unb, snb]
def create_ngram_model(params=None):
    tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3), analyzer="word", binary=False)
    clf = MultinomialNB()
    pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
def create_ngram_model():
    tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3), analyzer="word", binary=False)
    clf = MultinomialNB()
    pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])
    return pipeline
def make_classifier():
    pipeline = Pipeline([
        ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
        ('classifier', MultinomialNB())
    ])
    return pipeline
def generate_base_classification():
    from sklearn.svm import LinearSVC, NuSVC, SVC
    from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

    models = [
        # (LinearSVC, params('C', 'loss')),
        # (NuSVC, params('nu', 'kernel', 'degree')),
        # (SVC, params('C', 'kernel')),
        # (ExtraTreeClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf')),
        (DecisionTreeClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf')),
        (RandomForestClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf', 'n_estimators')),
        # (GaussianProcessClassifier, None),
        (LogisticRegression, params('C', 'penalty')),
        # (PassiveAggressiveClassifier, params('C', 'loss')),
        # (RidgeClassifier, params('alpha')),

        # we do in-place modification of what the method params returns in order to add
        # more loss functions that weren't defined in the method
        # (SGDClassifier, params('loss', 'penalty', 'alpha')['loss'].extend(['log', 'modified_huber'])),

        (KNeighborsClassifier, params('n_neighbors', 'leaf_size', 'p').update({
            'algorithm': ['auto', 'brute', 'kd_tree', 'ball_tree']
        })),
        (MultinomialNB, params('alpha')),
        # (GaussianNB, None),
        # (BernoulliNB, params('alpha'))
    ]

    return models
def makEnsemble(X, xlist, Y):
    # naive bayes
    clf = MultinomialNB()
    clf.fit(xlist, Y)
    featureSelectModel.append(clf)

    # K nearest neighbours
    clf = KNeighborsClassifier()
    clf.fit(xlist, Y)
    featureSelectModel.append(clf)

    # Logistic regression
    clf = LogisticRegression(C=1)
    clf.fit(xlist, Y)
    featureSelectModel.append(clf)

    # random forest
    clf = RandomForestClassifier(n_estimators=400)
    clf.fit(X, Y)
    wholeFeatureModel.append(clf)

    # extra forest
    clf = ExtraTreesClassifier(n_estimators=400)
    clf.fit(X, Y)
    wholeFeatureModel.append(clf)

    # decision forest
    clf = DecisionTreeClassifier(max_depth=None, min_samples_split=1, random_state=0)
    clf.fit(X, Y)
    wholeFeatureModel.append(clf)

    # gradient boosting
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 1,
              'learning_rate': 0.01}
    clf = GradientBoostingClassifier(**params)
    clf.fit(X, Y)
    wholeFeatureModel.append(clf)
def find(lst, elem):
    return [i for i, x in enumerate(lst) if x == elem]

# clf = MultinomialNB()
def naive_bayes_classifier(train_x, train_y):
    from sklearn.naive_bayes import MultinomialNB
    model = MultinomialNB(alpha=0.01)
    model.fit(train_x, train_y)
    return model

# KNN Classifier
def __init__(self, **kwargs):
    self.estimator = mock.MagicMock(spec=MultinomialNB())
    Wrapper.__init__(self, self.estimator)
    MockVisualizer.__init__(self, **kwargs)
def test_real_data_set_viz(self):
    model = naive_bayes.MultinomialNB()
    data = datasets.load_iris()
    feature_names = [name.replace(' ', '_') for name in data.feature_names]
    df = pd.DataFrame(data.data, columns=feature_names)
    X = df[['sepal_length_(cm)', 'sepal_width_(cm)']].as_matrix()
    y = data.target

    visualizer = DecisionBoundariesVisualizer(model)
    visualizer.fit_draw_poof(X, y)
    self.assert_images_similar(visualizer)
def test_quick_method(self):
    model = naive_bayes.MultinomialNB()
    data = datasets.load_iris()
    feature_names = [name.replace(' ', '_') for name in data.feature_names]
    df = pd.DataFrame(data.data, columns=feature_names)
    X = df[['sepal_length_(cm)', 'sepal_width_(cm)']].as_matrix()
    y = data.target

    visualizer = decisionviz(model, X, y)