The following 49 code examples, extracted from open source Python projects, illustrate how to use sklearn.metrics.log_loss().
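Before the project code, here is a minimal sketch of the basic call (the toy labels and probabilities are invented for illustration): log_loss(y_true, y_pred) takes the true labels and the predicted class probabilities and returns the mean cross-entropy, so lower is better.

from sklearn.metrics import log_loss

# Toy data for illustration: 4 samples, 2 classes.
y_true = [0, 1, 1, 0]
y_prob = [[0.9, 0.1],   # one row of class probabilities per sample
          [0.2, 0.8],
          [0.3, 0.7],
          [0.6, 0.4]]

# Mean negative log-likelihood of the true labels under y_prob.
print(log_loss(y_true, y_prob))  # ~0.30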
def rf1(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 300
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    for n, (itrain, ival) in enumerate(skf.split(train2, y)):
        print('step %d of %d'%(n+1, skf.n_splits), now())
        clf = ensemble.RandomForestRegressor(n_estimators=1000,
                                             max_depth=3,
                                             random_state=13)
        clf.fit(train2[itrain], y[itrain])
        p = clf.predict(train2[ival])
        v.loc[ival, cname] += p
        score = metrics.log_loss(y[ival], p)
        z[cname] += np.log1p(clf.predict(test2))
        print(cname, 'step %d: score'%(n+1), score, now())
        scores.append(score)
    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits
def test_stacked_classfier_extkfold(self):
    bclf = LogisticRegression(random_state=1)
    clfs = [RandomForestClassifier(n_estimators=40, criterion='gini', random_state=1),
            RidgeClassifier(random_state=1),
            ]
    sl = StackedClassifier(bclf,
                           clfs,
                           n_folds=3,
                           verbose=0,
                           Kfold=StratifiedKFold(self.iris.target, 3),
                           stack_by_proba=False,
                           oob_score_flag=True,
                           oob_metrics=log_loss)
    sl.fit(self.iris.data, self.iris.target)
    score = sl.score(self.iris.data, self.iris.target)
    self.assertGreater(score, 0.9, "Failed with score = {0}".format(score))
def opt_2_obj_func(w, X, y, n_class):
    """ Function to be minimized in the EN_OPT_2 ensembler.
        In this case there is only one weight for each classification result to be combined.

        Parameters:
        ----------
        w: ndarray size=(n_preds)
           Candidate solution to the optimization problem (vector of weights).
        X: ndarray size=(n_samples, n_preds * n_class)
           Solutions to be combined, horizontally concatenated.
        y: ndarray size=(n_samples,)
           Class labels
        n_class: int
           Number of classes in the problem, i.e. = 12
    """
    w = np.abs(w)
    sol = np.zeros((X.shape[0], n_class))
    for i in range(len(w)):
        sol += X[:, i*n_class:(i+1)*n_class] * w[i]
    # Minimizing the log loss
    sc_ll = log_loss(y, sol)
    return sc_ll
def cross_validate(train):
    # separate training and validation sets
    X_train, X_valid = split_train_validation(train)
    scores = []
    preds = []
    for i in range(len(X_train)):
        # convert X_train, X_valid, etc. to xgboost matrices
        dtrain = xgb.DMatrix(X_train[i][['phone_brand', 'device_model', 'timestamp']],
                             label=X_train[i]['group'], missing=np.nan)
        dvalid = xgb.DMatrix(X_valid[i][['phone_brand', 'device_model', 'timestamp']],
                             label=X_valid[i]['group'], missing=np.nan)
        # predict with xgboost
        parameters = {'max_depth': 4, 'eta': 0.1, 'silent': 1,
                      'subsample': 0.8, 'colsample_bytree': 0.8,
                      'objective': 'multi:softprob', 'booster': 'gbtree',
                      'early_stopping_rounds': 50,
                      'num_class': 12, 'num_boost_round': 1000, 'eval_metric': 'mlogloss'}
        plst = list(parameters.items())
        bst = xgb.train(plst, dtrain)
        pred = bst.predict(dvalid)
        scores.append(log_loss(X_valid[i]['group'].tolist(), pred))
        pred = pd.DataFrame(pred, index=X_valid[i].index, columns=target_encoder.classes_)
        preds.append(pred)
    return scores, preds
def check_log_loss(max_depth, n_splits, test_size):
    model = RandomForestClassifier(max_depth=max_depth, n_jobs=-1, random_state=777)
    trn_scores = []
    vld_scores = []
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=777)
    for i, (t_ind, v_ind) in enumerate(sss.split(feature_train, trainY)):
        print('# Iter {} / {}'.format(i + 1, n_splits))
        x_trn = feature_train.values[t_ind]
        y_trn = trainY[t_ind]
        x_vld = feature_train.values[v_ind]
        y_vld = trainY[v_ind]

        model.fit(x_trn, y_trn)
        score = log_loss(y_trn, model.predict_proba(x_trn))
        trn_scores.append(score)
        score = log_loss(y_vld, model.predict_proba(x_vld))
        vld_scores.append(score)
    print("max_depth: %d n_splits: %d test_size: %f" % (max_depth, n_splits, test_size))
    print('# TRN logloss: {}'.format(np.mean(trn_scores)))
    print('# VLD logloss: {}'.format(np.mean(vld_scores)))
def runET(train_X, train_y, test_X, test_y=None, validation=1, n_est_val=50, depth_val=None,
          split_val=2, leaf_val=1, feat_val='auto', jobs_val=4, random_state_val=0):
    clf = ensemble.ExtraTreesClassifier(
        n_estimators=n_est_val,
        max_depth=depth_val,
        min_samples_split=split_val,
        min_samples_leaf=leaf_val,
        max_features=feat_val,
        criterion='entropy',
        n_jobs=jobs_val,
        random_state=random_state_val)
    clf.fit(train_X, train_y)
    pred_train_y = clf.predict_proba(train_X)[:, 1]
    pred_test_y = clf.predict_proba(test_X)[:, 1]
    if validation:
        train_loss = log_loss(train_y, pred_train_y)
        loss = log_loss(test_y, pred_test_y)
        print("Train, Test loss : ", train_loss, loss)
        return pred_test_y, loss
    else:
        return pred_test_y
def extratreescv(n_estimators,
                 min_samples_split,
                 min_samples_leaf,
                 max_features,
                 max_depth,
                 min_weight_fraction_leaf):
    clf = ExtraTreesClassifier(n_estimators=int(n_estimators),
                               min_samples_split=int(min_samples_split),
                               min_samples_leaf=int(min_samples_leaf),
                               max_features=int(max_features),
                               max_depth=int(max_depth),
                               min_weight_fraction_leaf=min_weight_fraction_leaf,
                               n_jobs=-1,
                               random_state=1234,
                               verbose=1)
    clf.fit(x0, y0)
    ll = -log_loss(y1, clf.predict_proba(x1)[:, 1])
    return ll
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch,
                            early_stopping_rounds=100, verbose_eval=False)
            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: '%(xgb_params['seed'], n+1, skf.n_splits), score, now())
            scores.append(score)
    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def pac_metric(solution, prediction, task='binary.classification'):
    ''' Probabilistic Accuracy based on log_loss metric.
    We assume the solution is in {0, 1} and prediction in [0, 1].
    Otherwise, run normalize_array. '''
    debug_flag = False
    [sample_num, label_num] = solution.shape
    if label_num == 1:
        task = 'binary.classification'
    eps = 1e-15
    the_log_loss = log_loss(solution, prediction, task)
    # Compute the base log loss (using the prior probabilities)
    pos_num = 1. * sum(solution)  # float conversion!
    frac_pos = pos_num / sample_num  # prior proba of positive class
    the_base_log_loss = prior_log_loss(frac_pos, task)
    # Alternative computation of the same thing (slower)
    # Should always return the same thing except in the multi-label case,
    # for which the analytic solution makes more sense
    if debug_flag:
        base_prediction = np.empty(prediction.shape)
        for k in range(sample_num):
            base_prediction[k, :] = frac_pos
        base_log_loss = log_loss(solution, base_prediction, task)
        diff = np.array(abs(the_base_log_loss - base_log_loss))
        if len(diff.shape) > 0:
            diff = max(diff)
        if diff > 1e-10:
            print('Arrggh {} != {}'.format(the_base_log_loss, base_log_loss))
    # Exponentiate to turn into an accuracy-like score.
    # In the multi-label case, we need to average AFTER taking the exp
    # because it is an NL operation
    pac = mvmean(np.exp(-the_log_loss))
    base_pac = mvmean(np.exp(-the_base_log_loss))
    # Normalize: 0 for random, 1 for perfect
    score = (pac - base_pac) / sp.maximum(eps, (1 - base_pac))
    return score
def log_loss(solution, prediction, task='binary.classification'):
    ''' Log loss for binary and multiclass. '''
    [sample_num, label_num] = solution.shape
    eps = 1e-15
    pred = np.copy(prediction)  # beware: changes in prediction occur through this
    sol = np.copy(solution)
    if (task == 'multiclass.classification') and (label_num > 1):
        # Make sure the lines add up to one for multi-class classification
        norma = np.sum(prediction, axis=1)
        for k in range(sample_num):
            pred[k, :] /= sp.maximum(norma[k], eps)
        # Make sure there is a single label active per line for multi-class classification
        sol = binarize_predictions(solution, task='multiclass.classification')
        # For the base prediction, this solution is ridiculous in the multi-label case
    # Bound predictions to avoid log(0), 1/0, ...
    pred = sp.minimum(1 - eps, sp.maximum(eps, pred))
    # Compute the log loss
    pos_class_log_loss = -mvmean(sol * np.log(pred), axis=0)
    if (task != 'multiclass.classification') or (label_num == 1):
        # The multi-label case is a bunch of binary problems.
        # The second class is the negative class for each column.
        neg_class_log_loss = -mvmean((1 - sol) * np.log(1 - pred), axis=0)
        log_loss = pos_class_log_loss + neg_class_log_loss
        # Each column is an independent problem, so we average.
        # The probabilities in one line do not add up to one.
        # log_loss = mvmean(log_loss)
        # print('binary {}'.format(log_loss))
        # In the multi-label case, the right thing is to AVERAGE, not sum.
        # We return all the scores so we can normalize correctly later on.
    else:
        # For the multiclass case, the probabilities in one line add up to one.
        log_loss = pos_class_log_loss
        # We sum the contributions of the columns.
        log_loss = np.sum(log_loss)
        # print('multiclass {}'.format(log_loss))
    return log_loss
def log_loss_(solution, prediction):
    return metrics.log_loss(solution, prediction)
def train_and_eval_sklearn_classifier(clf, data):

    x_train = data['x_train']
    y_train = data['y_train']

    x_test = data['x_test']
    y_test = data['y_test']

    clf.fit(x_train, y_train)

    try:
        p = clf.predict_proba(x_train)[:, 1]  # sklearn convention
    except IndexError:
        p = clf.predict_proba(x_train)

    ll = log_loss(y_train, p)
    auc = AUC(y_train, p)
    acc = accuracy(y_train, np.round(p))

    print("\n# training | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(ll, auc, acc))

    #

    try:
        p = clf.predict_proba(x_test)[:, 1]  # sklearn convention
    except IndexError:
        p = clf.predict_proba(x_test)

    ll = log_loss(y_test, p)
    auc = AUC(y_test, p)
    acc = accuracy(y_test, np.round(p))

    print("# testing  | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(ll, auc, acc))

    # return {'loss': 1 - auc, 'log_loss': ll, 'auc': auc}
    return {'loss': ll, 'log_loss': ll, 'auc': auc}

###
# "clf", even though it's a regressor
def predict_proba_with_loss(self, X, y):
    y_pred = self.predict_proba(X)
    loss = log_loss(y, y_pred)
    return y_pred, loss

# smallest prob given to an actual catastrophe
def predict_test_file(preds, sess, test_file, feature_cnt, _indices, _values, _values2,
                      _cont_values, _text_values, _shape, _cont_shape, _text_shape, _y, _ind,
                      epoch, batch_size, tag, path, output_prediction=True):
    day = date.today()
    if output_prediction:
        wt = open(path + '/' + str(day) + '_deepFM_pred_' + tag + str(epoch) + '.txt', 'w')
    gt_scores = []
    pred_scores = []
    for test_input_in_sp in load_data_cache(test_file):
        predictios = sess.run(preds, feed_dict={
            _indices: test_input_in_sp['indices'],
            _values: test_input_in_sp['values'],
            _shape: test_input_in_sp['shape'],
            _cont_shape: test_input_in_sp['cont_shape'],
            _text_values: test_input_in_sp['text_values'],
            _text_shape: test_input_in_sp['text_shape'],
            _y: test_input_in_sp['labels'],
            _values2: test_input_in_sp['values2'],
            _cont_values: test_input_in_sp['cont_values'],
            _ind: test_input_in_sp['feature_indices']
        }).reshape(-1).tolist()
        if output_prediction:
            for (gt, preded) in zip(test_input_in_sp['labels'].reshape(-1).tolist(), predictios):
                wt.write('{0:d},{1:f}\n'.format(int(gt), preded))
                gt_scores.append(gt)
                # pred_scores.append(1.0 if preded >= 0.5 else 0.0)
                pred_scores.append(preded)
        else:
            gt_scores.extend(test_input_in_sp['labels'].reshape(-1).tolist())
            pred_scores.extend(predictios)
    auc = metrics.roc_auc_score(np.asarray(gt_scores), np.asarray(pred_scores))
    logloss = metrics.log_loss(np.asarray(gt_scores), np.asarray(pred_scores))
    # print('auc is ', auc, ', at epoch ', epoch)
    if output_prediction:
        wt.close()
    return auc, logloss
def print_k_result(ys, Ep, ll, acc, name):
    acc.append(accuracy_score(ys, Ep.argmax(axis=1)))
    ll.append(log_loss(ys, Ep))
    print("{}: accuracy = {:.4g}, log-loss = {:.4g}"
          .format(name, acc[-1], ll[-1]))
def main():
    validate = True
    n = SData(validate=validate)
    Xtrain = n.train_features.as_matrix()
    ytrain = n.train_targets
    Xtest = n.test_features.as_matrix()
    ytest = n.test_targets

    Xtrain = np.reshape(Xtrain, (Xtrain.shape[0], Xtrain.shape[1], 1))
    Xtest = np.reshape(Xtest, (Xtest.shape[0], Xtest.shape[1], 1))

    rnn = RNN([1, 100, 100, 1])
    rnn.fit(Xtrain, ytrain)
    p = rnn.predict(Xtest)
    p_prob = rnn.predict(Xtest)

    if validate:
        mse = mean_squared_error(ytest, p)
        print("MSE: {}".format(mse))
        loss = log_loss(ytest, p_prob)
        print("Log loss: {}".format(loss))
    else:
        base_path = dirname(__file__)
        results_df = DataFrame(data={'probability': results})
        joined = DataFrame(t_id).join(results_df)
        joined.to_csv(join(base_path, 'results', 'dl.csv'), index=False)
def Log_loss(Ytest, Ydist):
    return log_loss(Ytest, Ydist, eps=1e-15, normalize=True)
    # N_test, L = Ytest.shape
    # return sum((Ytest == Ypred) * 1.) / N_test / L
def parse_args():
    parser = argparse.ArgumentParser(description="Run FM.")
    parser.add_argument('--path', nargs='?', default='./data/',
                        help='Input data path.')
    parser.add_argument('--dataset', nargs='?', default='frappe',
                        help='Choose a dataset.')
    parser.add_argument('--epoch', type=int, default=100,
                        help='Number of epochs.')
    parser.add_argument('--pretrain', type=int, default=-1,
                        help='flag for pretrain. 1: initialize from pretrain; 0: randomly initialize; -1: save the model to pretrain file')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='Batch size.')
    parser.add_argument('--hidden_factor', type=int, default=64,
                        help='Number of hidden factors.')
    parser.add_argument('--lamda', type=float, default=0,
                        help='Regularizer for bilinear part.')
    parser.add_argument('--keep_prob', type=float, default=0.5,
                        help='Keep probability (1-dropout_ratio) for the Bi-Interaction layer. 1: no dropout')
    parser.add_argument('--lr', type=float, default=0.05,
                        help='Learning rate.')
    parser.add_argument('--loss_type', nargs='?', default='square_loss',
                        help='Specify a loss type (square_loss or log_loss).')
    parser.add_argument('--optimizer', nargs='?', default='AdagradOptimizer',
                        help='Specify an optimizer type (AdamOptimizer, AdagradOptimizer, GradientDescentOptimizer, MomentumOptimizer).')
    parser.add_argument('--verbose', type=int, default=1,
                        help='Show the results per X epochs (0, 1 ... any positive integer)')
    parser.add_argument('--batch_norm', type=int, default=0,
                        help='Whether to perform batch normalization (0 or 1)')
    return parser.parse_args()
def evaluate(self, data):  # evaluate the results for an input set
    num_example = len(data['Y'])
    feed_dict = {self.train_features: data['X'],
                 self.train_labels: [[y] for y in data['Y']],
                 self.dropout_keep: 1.0,
                 self.train_phase: False}
    predictions = self.sess.run((self.out), feed_dict=feed_dict)
    y_pred = np.reshape(predictions, (num_example,))
    y_true = np.reshape(data['Y'], (num_example,))
    if self.loss_type == 'square_loss':
        predictions_bounded = np.maximum(y_pred, np.ones(num_example) * min(y_true))  # bound the lower values
        predictions_bounded = np.minimum(predictions_bounded, np.ones(num_example) * max(y_true))  # bound the higher values
        RMSE = math.sqrt(mean_squared_error(y_true, predictions_bounded))
        return RMSE
    elif self.loss_type == 'log_loss':
        logloss = log_loss(y_true, y_pred)  # I haven't checked the log_loss
        return logloss
def parse_args():
    parser = argparse.ArgumentParser(description="Run Neural FM.")
    parser.add_argument('--path', nargs='?', default='../data/',
                        help='Input data path.')
    parser.add_argument('--dataset', nargs='?', default='frappe',
                        help='Choose a dataset.')
    parser.add_argument('--epoch', type=int, default=200,
                        help='Number of epochs.')
    parser.add_argument('--pretrain', type=int, default=0,
                        help='Pre-train flag. 0: train from scratch; 1: load from pretrain file')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='Batch size.')
    parser.add_argument('--hidden_factor', type=int, default=64,
                        help='Number of hidden factors.')
    parser.add_argument('--layers', nargs='?', default='[64]',
                        help="Size of each layer.")
    parser.add_argument('--keep_prob', nargs='?', default='[0.8,0.5]',
                        help='Keep probability (i.e., 1-dropout_ratio) for each deep layer and the Bi-Interaction layer. 1: no dropout. Note that the last index is for the Bi-Interaction layer.')
    parser.add_argument('--lamda', type=float, default=0,
                        help='Regularizer for bilinear part.')
    parser.add_argument('--lr', type=float, default=0.05,
                        help='Learning rate.')
    parser.add_argument('--loss_type', nargs='?', default='square_loss',
                        help='Specify a loss type (square_loss or log_loss).')
    parser.add_argument('--optimizer', nargs='?', default='AdagradOptimizer',
                        help='Specify an optimizer type (AdamOptimizer, AdagradOptimizer, GradientDescentOptimizer, MomentumOptimizer).')
    parser.add_argument('--verbose', type=int, default=1,
                        help='Show the results per X epochs (0, 1 ... any positive integer)')
    parser.add_argument('--batch_norm', type=int, default=1,
                        help='Whether to perform batch normalization (0 or 1)')
    parser.add_argument('--activation', nargs='?', default='relu',
                        help='Which activation function to use for deep layers: relu, sigmoid, tanh, identity')
    parser.add_argument('--early_stop', type=int, default=1,
                        help='Whether to perform early stopping (0 or 1)')
    return parser.parse_args()
def evaluate(self, data):  # evaluate the results for an input set
    num_example = len(data['Y'])
    feed_dict = {self.train_features: data['X'],
                 self.train_labels: [[y] for y in data['Y']],
                 self.dropout_keep: self.no_dropout,
                 self.train_phase: False}
    predictions = self.sess.run((self.out), feed_dict=feed_dict)
    y_pred = np.reshape(predictions, (num_example,))
    y_true = np.reshape(data['Y'], (num_example,))
    if self.loss_type == 'square_loss':
        predictions_bounded = np.maximum(y_pred, np.ones(num_example) * min(y_true))  # bound the lower values
        predictions_bounded = np.minimum(predictions_bounded, np.ones(num_example) * max(y_true))  # bound the higher values
        RMSE = math.sqrt(mean_squared_error(y_true, predictions_bounded))
        return RMSE
    elif self.loss_type == 'log_loss':
        logloss = log_loss(y_true, y_pred)  # I haven't checked the log_loss
        return logloss
def check_score(subm_file):
    real_answ = "../modified_data/answers_stage1.csv"
    real = pd.read_csv(real_answ)
    pred = pd.read_csv(subm_file)
    real['s'] = 0
    real.loc[real['Type_1'] > 0, 's'] = 0
    real.loc[real['Type_2'] > 0, 's'] = 1
    real.loc[real['Type_3'] > 0, 's'] = 2
    pred = pd.merge(pred, real[['image_name', 's']], on=['image_name'], left_index=True)
    score = log_loss(pred['s'], pred[['Type_1', 'Type_2', 'Type_3']].as_matrix())
    return score
def predict(self, clf, X, y, X_test, stage):
    np.random.seed(self.seed)
    n_train = X.shape[0]
    kf = KFold(n_train, n_folds=self.n_fold, shuffle=True)
    best_score = []
    y_pred_sum = np.zeros((X_test.shape[0], self.num_class))
    if stage == 'base':
        meta_feat = np.zeros((n_train + X_test.shape[0], self.num_class))
    i = 0
    for train, val in kf:
        i += 1
        print(i)
        X_train, X_val, y_train, y_val = X[train], X[val], y[train], y[val]

        ## CV sets
        # train
        clf.fit(X_train, y_train)
        curr_pred = clf.predict_proba(X_val)
        curr_best_score = log_loss(y_val, curr_pred)
        print(curr_best_score)
        best_score += [curr_best_score]
        # predict
        if stage == 'base':
            meta_feat[val, :] = curr_pred
        else:
            y_pred = clf.predict_proba(X_test)
            y_pred_sum = y_pred_sum + y_pred
    print(np.mean(best_score), np.std(best_score))

    ## test set
    if stage == 'base':
        # train
        clf.fit(X, y)
        # predict
        meta_feat[n_train:, :] = clf.predict_proba(X_test)
        return meta_feat
    else:
        y_pred = y_pred_sum / self.n_fold
        return y_pred
def print_clf(clf, trainx, testx, trainy, testy):
    start = time.time()
    model = clf.fit(trainx, trainy)
    end = time.time()
    pred = model.predict(testx)
    print("log_loss: ", log_loss(testy, model.predict_proba(testx)))
    print(confusion_matrix(np.array(testy), pred))
def loss_scorer(estimator, x, y):
    loss = log_loss(y, estimator.predict_proba(x))
    assert loss >= 0
    # Minimal loss is best; however, we try to maximize the score.
    # To account for this we return the negative loss.
    return -loss
def logloss(y, p):
    """Bounded log loss error.

    Args:
        y (numpy.array): target
        p (numpy.array): prediction

    Returns:
        bounded log loss error
    """

    p[p < 1e-15] = 1e-15
    p[p > 1 - 1e-15] = 1 - 1e-15
    return log_loss(y, p)
def evalData(z, test_set_y):
    """ z is the prediction; test_set_y is the truth """
    diff = z - test_set_y
    fpr, tpr, thresholds = metrics.roc_curve(test_set_y.ravel(), z.ravel(), pos_label=1)
    auc = metrics.auc(fpr, tpr)
    ap = metrics.average_precision_score(test_set_y.ravel(), z.ravel())
    Q = test_set_y.shape[0]
    Pk10 = 0
    Pk20 = 0
    Pk30 = 0
    Pk50 = 0
    Pk37 = 0
    for i in range(Q):
        Pk10 += ranking_precision_score(test_set_y[i], z[i], k=10)
        Pk20 += ranking_precision_score(test_set_y[i], z[i], k=20)
        Pk30 += ranking_precision_score(test_set_y[i], z[i], k=30)
        Pk37 += ranking_precision_score(test_set_y[i], z[i], k=37)
        Pk50 += ranking_precision_score(test_set_y[i], z[i], k=50)
    Pk10 = Pk10 / Q
    Pk20 = Pk20 / Q
    Pk30 = Pk30 / Q
    Pk50 = Pk50 / Q
    Pk37 = Pk37 / Q
    cross = metrics.log_loss(test_set_y, z)
    print('\n')
    print('AUC', auc, 'MSE', np.mean(diff ** 2), 'Cross-entropy:', cross)
    print('Precision at k=10: ', Pk10, ' k=20: ', Pk20, ' k=30: ', Pk30, ' k=50: ', Pk50, ' k=37: ', Pk37)
    return Pk37
def opt_1_obj_func(w, X, y, n_class):
    """ Function to be minimized in the EN_OPT_1 ensembler.

        Parameters:
        ----------
        w: ndarray size=(n_preds * n_class)
           Candidate solution to the optimization problem (vector of weights).
        X: ndarray size=(n_samples, n_preds * n_class)
           Solutions to be combined, horizontally concatenated.
        y: ndarray size=(n_samples,)
           Class labels
        n_class: int
           Number of classes in the problem, i.e. = 12
    """
    # Constrain the weights for each class to sum to 1.
    # This constraint can be defined in the scipy.minimize function, but doing it here
    # gives more flexibility to the scipy.minimize function (e.g. more solvers
    # are allowed).
    w_range = np.arange(len(w)) % n_class
    for i in range(n_class):
        w[w_range == i] = w[w_range == i] / np.sum(w[w_range == i])

    sol = np.zeros((X.shape[0], n_class))
    for i in range(len(w)):
        sol[:, i % n_class] += X[:, i] * w[i]

    # The quantity to minimize is the log_loss.
    sc_ll = log_loss(y, sol)
    return sc_ll
def _features_sel_cv(self, X, Y, splits, model_id, data_id, log, early_stop_cv=None):
    # workaround to put the worst fold first, for use with early-stop CV
    splits_new_order_temp = []
    for train_indexes, test_indexes in splits:
        splits_new_order_temp += [[train_indexes, test_indexes]]
    splits_new_order = [splits_new_order_temp[2], splits_new_order_temp[1],
                        splits_new_order_temp[3], splits_new_order_temp[0],
                        splits_new_order_temp[4]]

    scores = []
    i = 0
    for train_indexes, test_indexes in splits_new_order:
        i += 1
        X_train = X.loc[train_indexes]
        Y_train = Y.loc[train_indexes][QML_RES_COL]
        X_test = X.loc[test_indexes]
        Y_test = Y.loc[test_indexes][QML_RES_COL]

        res = self.qm.qpredict(model_id, data_id, data=(X_train, Y_train, X_test),
                               Y_test=Y_test, force=True, save_result=False)
        score = log_loss(Y_test, res.astype(np.float64), eps=1e-14)
        log(' {} {}'.format(i, score))
        sys.stdout.flush()
        scores.append(score)
        if early_stop_cv is not None:
            if early_stop_cv(score):
                scores = [score]
                break

    total_score = sum(scores) / len(scores)
    return total_score
def eval_func(ytrue, ypredproba):
    return metrics.log_loss(ytrue, ypredproba)
def xgb_accuracy(ypred, dtrain):
    ytrue = dtrain.get_label().astype(int)
    ypred = np.where(ypred <= 0., 1e-5, ypred)
    ypred = np.where(ypred >= 1., 1. - 1e-5, ypred)
    return 'logloss', metrics.log_loss(ytrue, ypred)
def logloss(self, label, pred_prob):
    return metrics.log_loss(label, pred_prob)
def on_train_begin(self, model):
    self.validation = {}
    self.validation['epoch'] = []
    self.validation['auc'] = []
    self.validation['time'] = []
    self.validation['log_loss'] = []
    self.validation['roc'] = []
def make_mf_classification(X, y, clf, X_test, n_folds=5, seed=1024, nb_epoch=50,
                           max_features=0.75, name='xgb', path=''):
    '''
    Fit metafeature by @clf and get prediction for test.
    Assumed that @clf is a classifier.
    '''
    n = X.shape[0]
    print(clf)
    np.random.seed(seed)
    feature_index = np.arange(X.shape[1])
    for epoch in range(nb_epoch):
        print("Start epoch:", epoch)
        mf_tr = np.zeros((X.shape[0], len(np.unique(y))))
        mf_te = np.zeros((X_test.shape[0], len(np.unique(y))))
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X, y)
        np.random.shuffle(feature_index)
        new_index = feature_index[:int(max_features * len(feature_index))]
        for ind_tr, ind_te in skf:
            if ssp.issparse(X):
                X_tr = X[ind_tr].tocsc()[:, new_index]
                X_te = X[ind_te].tocsc()[:, new_index]
            else:
                X_tr = X[ind_tr][:, new_index]
                X_te = X[ind_te][:, new_index]
            y_tr = y[ind_tr]
            y_te = y[ind_te]

            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict_proba(X_te)
            mf_te += clf.predict_proba(X_test[:, new_index])
            score = log_loss(y_te, mf_tr[ind_te])
            print('\tpred[{}] score:{}'.format(epoch, score))
        mf_te /= n_folds
        pd.to_pickle(mf_tr, path + 'X_mf_%s_%s_random_r.pkl' % (name, epoch))
        pd.to_pickle(mf_te, path + 'X_t_mf_%s_%s_random_r.pkl' % (name, epoch))