Python sklearn.naive_bayes module: MultinomialNB() example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.naive_bayes.MultinomialNB().
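
Before the project examples, here is a minimal, self-contained sketch of the basic fit/predict cycle. The toy count matrix below is synthetic and not drawn from any of the projects that follow.

import numpy as np
from sklearn.naive_bayes import MultinomialNB

# Synthetic word counts per document (rows) and their class labels.
X = np.array([[2, 1, 0], [0, 1, 3], [1, 0, 4], [3, 2, 0]])
y = np.array([0, 1, 1, 0])

clf = MultinomialNB(alpha=1.0)  # alpha is the Laplace/Lidstone smoothing parameter
clf.fit(X, y)
print(clf.predict([[0, 2, 5]]))        # predicted class for a new count vector
print(clf.predict_proba([[0, 2, 5]]))  # per-class probabilities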

Project: Parallel-SGD    Author: angadgill    | project source | file source
def test_ovr_partial_fit():
    # Test if partial_fit is working as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)

    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == pred), 0.65)

    # Test when mini-batches don't have all classes
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovr.partial_fit(iris.data[60:], iris.target[60:])
    pred = ovr.predict(iris.data)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(iris.data, iris.target).predict(iris.data)

    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == pred), 0.65)
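
The test above drives partial_fit through OneVsRestClassifier; as a minimal standalone sketch of the same incremental pattern on MultinomialNB directly (synthetic count data assumed):

import numpy as np
from sklearn.naive_bayes import MultinomialNB

rng = np.random.default_rng(0)
X = rng.integers(0, 5, size=(200, 10))  # synthetic non-negative count features
y = rng.integers(0, 3, size=200)        # three synthetic classes

clf = MultinomialNB()
# The full class list must be declared on the first partial_fit call,
# since later mini-batches are not guaranteed to contain every class.
clf.partial_fit(X[:100], y[:100], classes=np.unique(y))
clf.partial_fit(X[100:], y[100:])
print(clf.predict(X[:5]))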
Project: Parallel-SGD    Author: angadgill    | project source | file source
def test_ovr_multilabel():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]])
    y = np.array([[0, 1, 1],
                  [0, 1, 0],
                  [1, 1, 1],
                  [1, 0, 1],
                  [1, 0, 0]])

    for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
                     LinearRegression(), Ridge(),
                     ElasticNet(), Lasso(alpha=0.5)):
        clf = OneVsRestClassifier(base_clf).fit(X, y)
        y_pred = clf.predict([[0, 4, 4]])[0]
        assert_array_equal(y_pred, [0, 1, 1])
        assert_true(clf.multilabel_)
Project: linkedin_recommend    Author: duggalr2    | project source | file source
def predict_job(job_list):
    """Assign a classification to a url"""
    # TODO: Add case where len is 1 or 0....
    job_list = [job for tup in job_list for job in tup]  # flatten the list of tuples
    new_job_list = [regex.tokenize_and_stem(i) for i in job_list]
    new_job_list = [' '.join(job) for job in new_job_list]
    vect = CountVectorizer()
    x_series = pd.Series(X)  # X and y are module-level training data
    X_train_dtm = vect.fit_transform(x_series)
    y_train = pd.Series(y)
    job_list_series = pd.Series(new_job_list)
    job_list_dtm = vect.transform(job_list_series)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred = nb.predict(job_list_dtm)
    # for i in range(len(job_list)):
    #     print(job_list[i], y_pred[i])
    return y_pred

# print(predict_job([('Founder',), ('Founder',), ('Architect & Full-stack developer',), ('Senior Engineer',), ('Technical Consultant',)]))
Project: quoll    Author: LanguageMachines    | project source | file source
def train_classifier(self, trainvectors, labels, no_label_encoding=False, alpha='default', fit_prior=True, iterations=10):
        if alpha == '':
            paramsearch = GridSearchCV(estimator=naive_bayes.MultinomialNB(), param_grid=dict(alpha=numpy.linspace(0,2,20)[1:]), n_jobs=6)
            paramsearch.fit(trainvectors,self.label_encoder.transform(labels))
            selected_alpha = paramsearch.best_estimator_.alpha
        elif alpha == 'default':
            selected_alpha = 1.0
        else:
            selected_alpha = float(alpha)
        if fit_prior == 'False':
            fit_prior = False
        else:
            fit_prior = True
        self.model = naive_bayes.MultinomialNB(alpha=selected_alpha,fit_prior=fit_prior)
        if no_label_encoding:
            self.model.fit(trainvectors, labels)
        else:
            self.model.fit(trainvectors, self.label_encoder.transform(labels))
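
A note on the grid search above: numpy.linspace(0,2,20)[1:] deliberately drops its first point, alpha=0, because MultinomialNB expects a strictly positive smoothing value; scikit-learn warns about and clips alpha values that are effectively zero.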
Project: 2020plus    Author: KarchinLab    | project source | file source
def __init__(self, df, weight=True, min_ct=0, total_iter=5):
        self.logger = logging.getLogger(__name__)
        super(MultinomialNaiveBayes, self).__init__(total_iterations=total_iter)  # call base constructor
        #self.set_min_count(min_ct)
        self.is_weighted_sample = weight

        # process data
        #df = self._filter_rows(df)  # filter out low count rows
        # row_sums = df.sum(axis=1).astype(float)
        # df = df.div(row_sums, axis=0)  # normalize each row
        # df = df.mul(100)
        # df.to_csv('tmp.nbclf.txt', sep='\t')
        df = df.fillna(df.mean())
        total = df['total']
        df = df[['recurrent missense', 'recurrent indel', 'frame shift',
                 'nonsense', 'missense', 'synonymous', 'inframe indel', 'no protein',
                 'lost stop', 'splicing mutation']]
        df = df.mul(total, axis=0).astype(int)  # get back counts instead of pct
        self.x, self.y = features.randomize(df)

        # setup classifier
        self.clf = MultinomialNB(alpha=1,  # laplacian smooth, i.e. pseudocounts
                                 fit_prior=True)  # use data for prior class probs
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition    Author: PacktPublishing    | project source | file source
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.iteritems():
            tweet = re.sub(r, repl, tweet)

        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition    Author: PacktPublishing    | project source | file source
def create_ngram_model(params=None):
    def preprocessor(tweet):
        global emoticons_replaced
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.iteritems():
            tweet = re.sub(r, repl, tweet)

        return tweet

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    clf = MultinomialNB()
    pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
Project: fake_news    Author: bmassman    | project source | file source
def article_trainers(articles: ArticleDB):
    """
    Run repeated models against article db to predict validity score for
    articles.
    """
    models = [(DecisionTreeClassifier, {}),
              (RandomForestClassifier, {}),
              (LogisticRegression, {'C': [0.01, 0.1, 1, 10, 100]}),
              (MultinomialNB, {'alpha': [0.1, 1.0, 10.0, 100.0]}),
              (LinearSVC, {'C': [0.01, 0.1, 1, 10, 100]})]
    trained_models = []
    for classifier, param_grid in models:
        res = train_model(articles, classifier, param_grid, probabilities=True)
        trained_models.append((str(res), res))
    ensemble_learner = VotingClassifier(estimators=trained_models[:4],
                                        voting='soft')
    train_model(articles, ensemble_learner, {})
Project: yellowbrick    Author: DistrictDataLabs    | project source | file source
def test_integrated_plot_numpy_named_arrays(self):
        model = naive_bayes.MultinomialNB()

        X = np.array([
             (1.1, 9.52, 1.23, 0.86, 7.89, 0.13),
             (3.4, 2.84, 8.65, 0.45, 7.43, 0.16),
             (1.2, 3.22, 6.56, 0.24, 3.45, 0.17),
             (3.8, 6.18, 2.45, 0.28, 2.53, 0.13),
             (5.1, 9.12, 1.06, 0.19, 1.43, 0.13),
             (4.4, 8.84, 4.97, 0.98, 1.35, 0.13),
             (3.2, 3.22, 5.03, 0.68, 3.53, 0.32),
             (7.8, 2.18, 6.87, 0.35, 3.25, 0.38),
            ], dtype=[('a','<f8'), ('b','<f8'),
                ('c','<f8'), ('d','<f8'),
                ('e','<f8'), ('f','<f8')]
        )

        y = np.array([1, 1, 0, 1, 0, 0, 1, 0])

        visualizer = DecisionBoundariesVisualizer(model, features=['a', 'f'])
        visualizer.fit_draw_poof(X, y=y)
        self.assertEquals(visualizer.features_, ['a', 'f'])
        self.assert_images_similar(visualizer)
Project: IBRel    Author: lasigeBioTM    | project source | file source
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
        super(ScikitRE, self).__init__()
        self.modelname = relationtype + "_" + modelname
        self.relationtype = relationtype
        self.pairtype = relationtype
        self.corpus = corpus
        self.pairs = []
        self.features = []
        self.labels = []
        self.pred = []
        self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
        self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
        self.generate_data(corpus, modelname, relationtype)
        self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3,20), min_df=0.0, max_df=0.7)),
                                  #('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
                                  #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                                  #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                                  #('clf', SGDClassifier())
                                  #('clf', svm.NuSVC(nu=0.01 ))
                                   #('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
                                  ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
                                  #('clf', DummyClassifier(strategy="constant", constant=True))
                                 ])
Project: MorphoBabushka    Author: nvanva    | project source | file source
def __init__(self, fit_scaler=None, transform_scaler='bin'):
        self.fit_scaler=fit_scaler
        self.transform_scaler=transform_scaler
        if fit_scaler in MNBScaler.fit_scalers:
            self.fit_scaler_ = None if fit_scaler is None else MNBScaler.fit_scalers[fit_scaler]()
        else:
            raise ValueError("fit_scaler should be one of %r but %s specified" %
                             (MNBScaler.fit_scalers.keys(), fit_scaler))

        if transform_scaler in MNBScaler.transform_scalers:
            self.transform_scaler_ = None if transform_scaler is None else \
                             self.fit_scaler_ if transform_scaler=='auto' else \
                            MNBScaler.transform_scalers[transform_scaler]()
        else:
            raise ValueError("transform_scaler should be one of %r but %s specified" %
                             (MNBScaler.transform_scalers.keys(), transform_scaler))
        self.mnb_ = MultinomialNB()
Project: 2016CCF-SouGou    Author: AbnerYang    | project source | file source
def MultinomialNBPredictModel(localTrainLabel, config):
    train = pd.read_csv('../feature/trainQlist.csv', header = 0, sep = ",")
    test = pd.read_csv('../feature/testQlist.csv', header = 0, sep = ",")
    print "Train tf-idf vector Model..."    
    encode = TfidfVectorizer(decode_error = 'ignore', norm = "l2", binary = False, sublinear_tf = True, min_df = 50)
    localTrainFeature = encode.fit_transform(train['qlist'].values)
    localTestFeature = encode.transform(test['qlist'].values)  # map test questions into the fitted vocabulary

    print localTrainFeature.shape, localTestFeature.shape

    print 'train...'
    model = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
    model.fit(X = localTrainFeature, y = localTrainLabel)
    print 'predict...'
    if config['prob'] == False:
        return model.predict(localTestFeature), test['uid'].values
    else:
        return model.predict_log_proba(localTestFeature), test['uid'].values

#-- xgboost local cross-validation model frame
Project: Parallel-SGD    Author: angadgill    | project source | file source
def test_ovr_multilabel_dataset():
    base_clf = MultinomialNB(alpha=1)
    for au, prec, recall in zip((True, False), (0.51, 0.66), (0.51, 0.80)):
        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=2,
                                                       length=50,
                                                       allow_unlabeled=au,
                                                       random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test, Y_test = X[80:], Y[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        assert_true(clf.multilabel_)
        assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"),
                            prec,
                            decimal=2)
        assert_almost_equal(recall_score(Y_test, Y_pred, average="micro"),
                            recall,
                            decimal=2)
Project: Parallel-SGD    Author: angadgill    | project source | file source
def test_ovr_single_label_predict_proba():
    base_clf = MultinomialNB(alpha=1)
    X, Y = iris.data, iris.target
    X_train, Y_train = X[:80], Y[:80]
    X_test = X[80:]
    clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

    # decision function only estimator. Fails in current implementation.
    decision_only = OneVsRestClassifier(svm.SVR()).fit(X_train, Y_train)
    assert_raises(AttributeError, decision_only.predict_proba, X_test)

    Y_pred = clf.predict(X_test)
    Y_proba = clf.predict_proba(X_test)

    assert_almost_equal(Y_proba.sum(axis=1), 1.0)
    # predict assigns a label if the probability that the
    # sample has the label is greater than 0.5.
    pred = np.array([l.argmax() for l in Y_proba])
    assert_false((pred - Y_pred).any())
Project: Parallel-SGD    Author: angadgill    | project source | file source
def test_ovo_partial_fit_predict():
    X, y = shuffle(iris.data, iris.target)
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:100], y[:100], np.unique(y))
    ovo1.partial_fit(X[100:], y[100:])
    pred1 = ovo1.predict(X)

    ovo2 = OneVsOneClassifier(MultinomialNB())
    ovo2.fit(X, y)
    pred2 = ovo2.predict(X)
    assert_equal(len(ovo1.estimators_), n_classes * (n_classes - 1) / 2)
    assert_greater(np.mean(y == pred1), 0.65)
    assert_almost_equal(pred1, pred2)

    # Test when mini-batches don't have all target classes
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovo1.partial_fit(iris.data[60:], iris.target[60:])
    pred1 = ovo1.predict(iris.data)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(iris.data, iris.target).predict(iris.data)

    assert_almost_equal(pred1, pred2)
    assert_equal(len(ovo1.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == pred1), 0.65)
Project: Parallel-SGD    Author: angadgill    | project source | file source
def test_input_check_partial_fit():
    for cls in [BernoulliNB, MultinomialNB]:
        # check shape consistency
        assert_raises(ValueError, cls().partial_fit, X2, y2[:-1],
                      classes=np.unique(y2))

        # classes is required for first call to partial fit
        assert_raises(ValueError, cls().partial_fit, X2, y2)

        # check consistency of consecutive classes values
        clf = cls()
        clf.partial_fit(X2, y2, classes=np.unique(y2))
        assert_raises(ValueError, clf.partial_fit, X2, y2,
                      classes=np.arange(42))

        # check consistency of input shape for partial_fit
        assert_raises(ValueError, clf.partial_fit, X2[:, :-1], y2)

        # check consistency of input shape for predict
        assert_raises(ValueError, clf.predict, X2[:, :-1])
Project: NSIT-Bot    Author: gabru-md    | project source | file source
def getClassifier(self,**kwargs):
        """
        returns the trained classifier and vectorizer used to predict a query
        """

        self.path = kwargs.get('path','trainer')
        self.df = self.trainWith(self.path)


        self.vectorizer = CountVectorizer() 

        counts = self.vectorizer.fit_transform(self.df['message'].values)

        self.classifier = MultinomialNB()

        targets = self.df['class'].values
        self.classifier.fit(counts, targets) 


        os.chdir(self.old_loc)
        return self.classifier,self.vectorizer
Project: code-uai16    Author: thanhan    | project source | file source
def classify(n = 50):
    #clf = MultinomialNB(fit_prior=False)
    #clf = SVC(gamma=2, C=1, class_weight = {0.0:0.063829777, 1.0:1.0})
    clf = SGDClassifier(loss="log", penalty="l1", class_weight = {0.0:0.022, 1.0:1.0})

    clf.fit(mat[:n], rel[:n])
    return clf
Project: linkedin_recommend    Author: duggalr2    | project source | file source
def train_test():
    """Identify accuracy via training set"""
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
    vect = CountVectorizer()
    X_train_dtm = vect.fit_transform(X_train)  # creates vocab set and dtm for each raw document!
    X_test_dtm = vect.transform(X_test)

    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)  # make class predictions for X_test_dtm
    # w = list(X_test)
    return metrics.accuracy_score(y_test, y_pred_class)

# print(train_test())
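
One design point worth noting in train_test: the vectorizer learns its vocabulary only from the training split (fit_transform), while the test split is mapped through plain transform, so no test-set vocabulary leaks into training.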
Project: Flavor-Network    Author: lingcheng99    | project source | file source
def nb_test(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)
    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print metrics.accuracy_score(y_test,y_pred)
Project: TextCategorization    Author: Y-oHr-N    | project source | file source
def __create_ngram_model(self, lang):
        if   lang == 'en':
            tfidf_ngrams = EnglishTfidfVectorizer(decode_error='ignore')
        elif lang == 'ja':
            tfidf_ngrams = JapaneseTfidfVectorizer(decode_error='ignore')

        clf              = MultinomialNB()
        pipeline         = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])

        return pipeline
Project: website-fingerprinting    Author: AxelGoetz    | project source | file source
def get_naive_bayes(is_multiclass=True):
    return MultinomialNB()
Project: probablyPOTUS    Author: jjardel    | project source | file source
def _estimator(self):

        return MultinomialNB()
Project: quoll    Author: LanguageMachines    | project source | file source
def train_classifier(self, trainvectors, labels, alpha='default', fit_prior=True, iterations=10):
        if alpha == '':
            paramsearch = GridSearchCV(estimator=naive_bayes.MultinomialNB(), param_grid=dict(alpha=numpy.linspace(0,2,20)[1:]), n_jobs=6)
            paramsearch.fit(trainvectors,self.label_encoder.transform(labels))
            selected_alpha = paramsearch.best_estimator_.alpha
        elif alpha == 'default':
            selected_alpha = 1.0
        else:
            selected_alpha = alpha
        if fit_prior == 'False':
            fit_prior = False
        else:
            fit_prior = True
        self.model = naive_bayes.MultinomialNB(alpha=selected_alpha,fit_prior=fit_prior)
        self.model.fit(trainvectors, self.label_encoder.transform(labels))
Project: dask-ml    Author: dask    | project source | file source
def test_basic(self, single_chunk_count_classification):
        X, y = single_chunk_count_classification
        a = nb.PartialMultinomialNB(classes=[0, 1])
        b = nb_.MultinomialNB()
        a.fit(X, y)
        b.partial_fit(X, y, classes=[0, 1])
        assert_eq(a.coef_, b.coef_)
Project: base_function    Author: Rockyzsu    | project source | file source
def case1():
    from sklearn import datasets
    news = datasets.fetch_20newsgroups(subset='all')
    # print len(news.data)
    # print len(news.target)

    # print '*'*10
    # print news.data[0]
    # print '*'*10
    # print news.target[0]
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    vec = CountVectorizer()
    x = vec.fit_transform(news.data)
    # print x.shape
    # print x[:2]
    print x[:10,:10].toarray()
    TFIDF = TfidfTransformer()
    x_tfidf = TFIDF.fit_transform(x)
    print x_tfidf[:10,:10].toarray()


    from sklearn.cross_validation import train_test_split
    Xtrain, Xtest, ytrain,ytest =train_test_split(x,news.target,test_size = 0.3,random_state=233)

    tf_Xtrain, tf_Xtest, tf_ytrain,tf_ytest =train_test_split(x_tfidf,news.target,test_size = 0.3,random_state=233)


    from sklearn.naive_bayes import MultinomialNB
    mnb =MultinomialNB()
    tf_mnb = MultinomialNB()

    mnb.fit(Xtrain, ytrain)
    tf_mnb.fit(tf_Xtrain,tf_ytrain)
项目:UrbanSearch    作者:urbansearchTUD    | 项目源码 | 文件源码
def test_init_no_file():
    mm = mnb_modelmanager.MNBModelManager()
    assert isinstance(mm, mnb_modelmanager.MNBModelManager)
    assert isinstance(mm.clf, Pipeline)
    assert isinstance(mm.clf.named_steps['clf'], MultinomialNB)
Project: UrbanSearch    Author: urbansearchTUD    | project source | file source
def test_init_mnb():
    ct = classifytext.ClassifyText(type=classifytext.MNB)
    assert isinstance(ct.mm, mnb_modelmanager.MNBModelManager)
    assert isinstance(ct.mm.clf, Pipeline)
    # assert isinstance(ct.mm.clf.named_steps['clf'], MultinomialNB)
Project: UrbanSearch    Author: urbansearchTUD    | project source | file source
def __init__(self, filename=None):
        super().__init__(filename)

        if not filename:
            self.clf = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'))),
                ('anova', SelectPercentile(f_classif)),
                ('clf', MultinomialNB())
            ])
Project: sentiment-analysis    Author: lplping    | project source | file source
def NbClass(x_train, y_train):
    from sklearn.naive_bayes import MultinomialNB
    clf=MultinomialNB(alpha=0.01).fit(x_train, y_train) 
    return clf

#========Logistic Regression========#
Project: SofPythonBot    Author: UtkucanBykl    | project source | file source
def bayes(self):
        self.mnb = MultinomialNB()
        self.y_train=self.y_train.astype('int')
        self.mnb.fit(self.x_trainvect,self.y_train)
Project: text-classification    Author: cahya-wirawan    | project source | file source
def fit(self, dataset, filename):
        self.logger.debug("fit")
        self.clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', MultinomialNB())
                             ])
        self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
        joblib.dump(self.clf, filename + ".pkl", compress=9)
Project: bof-aed    Author: rgrzeszi    | project source | file source
def train(self, datadict, labels=None):
        '''
        Runs the classifier training using the dictionary of label, features
        @param datadict: dictionary of label, features
        @param labels: (optional) list of labels. If given, the label order is taken from this list.
        '''

        # Set labels from data dict
        if labels is None:
            self.labels = datadict.keys()
        else:
            self.labels = labels
        # Train the GMM for BoF computation
        if self.model.gmm is None:
            print >> sys.stderr, 'Model not trained yet.'
            self.model.train(datadict, self.labels)

        print >> sys.stderr,'Computing',self.model.__class__.__name__,'...'
        # Parse dictionary into BoF representations and labels
        bofs, bofl = self._parse_dict(datadict, self.labels)

        #Create Multinomial Bayes
        print >> sys.stderr,'Training Multinomial Bayes ...'
        self.bay = bayes.MultinomialNB(alpha=0.5, fit_prior=False)
        self.bay.fit(bofs, bofl)
        return
Project: striatum    Author: ntucllab    | project source | file source
def train_expert(action_context):
    logreg = OneVsRestClassifier(LogisticRegression())
    mnb = OneVsRestClassifier(MultinomialNB())
    logreg.fit(action_context.iloc[:, 2:], action_context.iloc[:, 1])
    mnb.fit(action_context.iloc[:, 2:], action_context.iloc[:, 1])
    return [logreg, mnb]
Project: striatum    Author: ntucllab    | project source | file source
def train_expert(history_context, history_action):
    n_round = len(history_context)
    history_context = np.array([history_context[t] for t in range(n_round)])
    history_action = np.array([history_action[t] for t in range(n_round)])
    logreg = OneVsRestClassifier(LogisticRegression())
    mnb = OneVsRestClassifier(MultinomialNB())
    logreg.fit(history_context, history_action)
    mnb.fit(history_context, history_action)
    return [logreg, mnb]
Project: Guess-Genre-By-Lyrics    Author: ormatt    | project source | file source
def get_pipeline(sample_col, parallel_jobs=None):
    feat_ext_objs = [feat_ext_class(sample_col)
                     for feat_ext_class in get_objs(FEAT_EXTS_DIR, 'Worker')]

    feat_ext_tuples = [(feat_ext_obj.feature_name, feat_ext_obj)
                       for feat_ext_obj in feat_ext_objs]

    pipeline = Pipeline([
        ('features', FeatureUnion(feat_ext_tuples, n_jobs=parallel_jobs)),
        ('describe_data', describe_data.Transformer()),
        ('classifier', MultinomialNB()),
    ])
    return pipeline
Project: DataScience-And-MachineLearning-Handbook-For-Coders    Author: wxyyxc1992    | project source | file source
def train_classifier(self):
        """
        Train the Multinomial Naive Bayes classifier on the extracted features.
        """

        self.extract_feature()

        self.clf = MultinomialNB().fit(
            self.train_tfidf, self.data['train'].target)
Project: android_malware_detection    Author: congyuandong    | project source | file source
def NBModel(self, train_data, test_data, train_labels, test_labels):
        model = MultinomialNB(alpha = 0.01)
        model.fit(train_data, train_labels)
        self.saveModel(model, 'NB')
        predict = model.predict(test_data)
        return metrics.accuracy_score(test_labels, predict)
Project: opentc    Author: cahya-wirawan    | project source | file source
def fit(self, dataset, filename):
        self.logger.debug("fit")
        self.clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', MultinomialNB())
                             ])
        self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
        joblib.dump(self.clf, filename + ".pkl", compress=9)
Project: Bayes    Author: krzjoa    | project source | file source
def _init_classifiers(self):
        mnb = MultinomialNB()
        cnb = ComplementNB()
        nnb = NegationNB()
        unb = UniversalSetNB()
        snb = SelectiveNB()
        return [mnb, cnb, nnb, unb, snb]
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition    Author: PacktPublishing    | project source | file source
def create_ngram_model(params=None):
    tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3),
                                   analyzer="word", binary=False)
    clf = MultinomialNB()
    pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
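
As a hedged usage sketch (the parameter values here are illustrative, not taken from the book), callers tune this pipeline by passing a dict whose keys follow scikit-learn's <step>__<parameter> routing convention, matching the step names 'vect' and 'clf' above:

pipeline = create_ngram_model(params={
    'vect__ngram_range': (1, 2),  # narrower n-gram range than the default (1, 3)
    'vect__min_df': 2,            # ignore terms seen in fewer than 2 documents
    'clf__alpha': 0.1,            # weaker smoothing for MultinomialNB
})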
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition    Author: PacktPublishing    | project source | file source
def create_ngram_model():
    tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3),
                                   analyzer="word", binary=False)
    clf = MultinomialNB()
    pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])
    return pipeline
Project: pygameweb    Author: pygame    | project source | file source
def make_classifier():
    pipeline = Pipeline([
        ('count_vectorizer',   CountVectorizer(ngram_range=(1, 2))),
        ('classifier',         MultinomialNB())
    ])
    return pipeline
Project: eezzy    Author: 3Blades    | project source | file source
def generate_base_classification():
    from sklearn.svm import LinearSVC, NuSVC, SVC
    from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
    models = [
        #(LinearSVC, params('C', 'loss')),
#         (NuSVC, params('nu', 'kernel', 'degree')),
        #(SVC, params('C', 'kernel')),
        #(ExtraTreeClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf')),
        (DecisionTreeClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf')),
        (RandomForestClassifier, params('criterion', 'min_samples_split', 'min_samples_leaf', 'n_estimators')),
        #(GaussianProcessClassifier, None),
        (LogisticRegression, params('C', 'penalty')),
        #(PassiveAggressiveClassifier, params('C', 'loss')),
        #(RidgeClassifier, params('alpha')),
        # we do in-place modification of what the method params return in order to add
        # more loss functions that weren't defined in the method
        #(SGDClassifier, params('loss', 'penalty', 'alpha')['loss'].extend(['log', 'modified_huber'])),
        (KNeighborsClassifier, dict(params('n_neighbors', 'leaf_size', 'p'),
                                    algorithm=['auto', 'brute', 'kd_tree', 'ball_tree'])),  # dict() merge; .update() returns None
        (MultinomialNB, params('alpha')),
        #(GaussianNB, None),
        #(BernoulliNB, params('alpha'))
    ]

    return models
Project: AnswerClassify    Author: kenluck2001    | project source | file source
def makEnsemble( X, xlist, Y ):
    #naive bayes
    clf = MultinomialNB()
    clf.fit( xlist, Y )
    featureSelectModel.append (clf)

    #K nearest neighbours
    clf = KNeighborsClassifier()
    clf.fit( xlist, Y )
    featureSelectModel.append (clf)

    #Logistic regression
    clf = LogisticRegression(C=1)
    clf.fit( xlist, Y )
    featureSelectModel.append (clf)

    #random forest
    clf  = RandomForestClassifier(n_estimators = 400)
    clf.fit( X, Y )
    wholeFeatureModel.append (clf)

    #extra forest
    clf = ExtraTreesClassifier(n_estimators = 400)
    clf.fit( X, Y )
    wholeFeatureModel.append (clf)

    #decision forest
    clf = DecisionTreeClassifier(max_depth=None, min_samples_split=1, random_state=0)
    clf.fit( X, Y )
    wholeFeatureModel.append (clf)

    #gradient boosting
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 1,
                  'learning_rate': 0.01}
    clf = GradientBoostingClassifier(**params)
    clf.fit( X, Y )
    wholeFeatureModel.append (clf)
Project: AnswerClassify    Author: kenluck2001    | project source | file source
def find(lst, elem):
    return [i for i, x in enumerate(lst) if x == elem ]


#clf = MultinomialNB()
Project: Chinese_text_classifier    Author: swordLong    | project source | file source
def naive_bayes_classifier(train_x, train_y):
    from sklearn.naive_bayes import MultinomialNB
    model = MultinomialNB(alpha=0.01)
    model.fit(train_x, train_y)
    return model


# KNN Classifier
Project: yellowbrick    Author: DistrictDataLabs    | project source | file source
def __init__(self, **kwargs):
        self.estimator = mock.MagicMock(spec=MultinomialNB())

        Wrapper.__init__(self, self.estimator)
        MockVisualizer.__init__(self, **kwargs)
Project: yellowbrick    Author: DistrictDataLabs    | project source | file source
def test_real_data_set_viz(self):
        model = naive_bayes.MultinomialNB()

        data = datasets.load_iris()
        feature_names = [name.replace(' ', '_') for name in  data.feature_names ]
        df = pd.DataFrame(data.data, columns=feature_names)
        X = df[['sepal_length_(cm)', 'sepal_width_(cm)']].as_matrix()
        y = data.target

        visualizer = DecisionBoundariesVisualizer(model)
        visualizer.fit_draw_poof(X, y)
        self.assert_images_similar(visualizer)
Project: yellowbrick    Author: DistrictDataLabs    | project source | file source
def test_quick_method(self):
        model = naive_bayes.MultinomialNB()

        data = datasets.load_iris()
        feature_names = [name.replace(' ', '_') for name in  data.feature_names ]
        df = pd.DataFrame(data.data, columns=feature_names)
        X = df[['sepal_length_(cm)', 'sepal_width_(cm)']].as_matrix()
        y = data.target

        visualizer = decisionviz(model, X, y)