The following 50 code examples, extracted from open source Python projects, illustrate how to use sklearn.model_selection.KFold().
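Before the project-specific examples, here is a minimal, self-contained sketch of the pattern they all share (the toy data and parameter values are illustrative only, not taken from any project below):

import numpy as np
from sklearn.model_selection import KFold

X = np.arange(20).reshape(10, 2)  # toy data: 10 samples, 2 features
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# split() yields one (train_idx, test_idx) pair of integer index arrays per fold
for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
    print(f"fold {fold}: train={train_idx}, test={test_idx}")

Note that KFold only generates indices; the examples below differ mainly in how they use those indices to slice their data and in what they train on each fold.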
def cross_validation():
    M = read_dataset()
    n_fold = 10
    rating_idx = np.array(M.nonzero()).T
    # shuffle=True is required for random_state to take effect
    # (recent scikit-learn versions raise an error otherwise)
    kf = KFold(n_splits=n_fold, shuffle=True, random_state=0)
    with tf.Session() as sess:
        model = VAEMF(sess, num_user, num_item,
                      hidden_encoder_dim=hidden_encoder_dim,
                      hidden_decoder_dim=hidden_decoder_dim,
                      latent_dim=latent_dim, output_dim=output_dim,
                      learning_rate=learning_rate, batch_size=batch_size,
                      reg_param=reg_param)
        for i, (train_idx, test_idx) in enumerate(kf.split(rating_idx)):
            print("{0}/{1} Fold start| Train size={2}, Test size={3}"
                  .format(i, n_fold, train_idx.size, test_idx.size))
            model.train(M, train_idx=train_idx, test_idx=test_idx,
                        n_steps=n_steps)
def cross_validation():
    M = read_dataset()
    n_fold = 10
    rating_idx = np.array(M.nonzero()).T
    # shuffle=True is required for random_state to take effect
    kf = KFold(n_splits=n_fold, shuffle=True, random_state=0)
    with tf.Session() as sess:
        model = VAEMF(sess, num_user, num_item,
                      hidden_encoder_dim=hidden_encoder_dim,
                      hidden_decoder_dim=hidden_decoder_dim,
                      latent_dim=latent_dim, output_dim=output_dim,
                      learning_rate=learning_rate, batch_size=batch_size,
                      reg_param=reg_param, one_hot=one_hot)
        for i, (train_idx, test_idx) in enumerate(kf.split(rating_idx)):
            print("{0}/{1} Fold start| Train size={2}, Test size={3}"
                  .format(i, n_fold, train_idx.size, test_idx.size))
            model.train(M, train_idx=train_idx, test_idx=test_idx,
                        n_steps=n_steps)
def kfold_train(self, n_splits=3):
    logger.info('train classifier using kFold')
    kf = KFold(n_splits=n_splits, shuffle=True)
    scores = []
    precisions = []
    recalls = []
    for train_index, test_index in kf.split(self.data):
        train_text = self.data.iloc[train_index]['text'].values
        train_y = self.data.iloc[train_index]['class'].values
        test_text = self.data.iloc[test_index]['text'].values
        test_y = self.data.iloc[test_index]['class'].values
        self.cls.train(train_text, train_y)
        predictions = self.cls.predict(test_text)
        self.confusion += confusion_matrix(test_y, predictions)
        scores.append(f1_score(test_y, predictions, pos_label='geography'))
        recalls.append(recall_score(test_y, predictions, pos_label='geography'))
        precisions.append(precision_score(test_y, predictions, pos_label='geography'))
    self.score = sum(scores) / len(scores)
    self.precision = sum(precisions) / len(precisions)
    self.recall = sum(recalls) / len(recalls)
    return self.cls
def __init__(self, name, X, y, task, test_size=None, cv=None, random_state=42):
    self.name = name
    self.X = X
    self.y = y
    self.task = task
    self.random_state = random_state
    if test_size is not None:
        self.test_size = test_size
        self.validation_method = "train_test_split"
        self.X_train, self.X_test, self.y_train, self.y_test = \
            model_selection.train_test_split(self.X, self.y,
                                             test_size=test_size,
                                             random_state=random_state)
    elif cv is not None:
        self.validation_method = "cv"
        if task == "regression":
            # shuffle=True is required for random_state to take effect
            self.kfold = model_selection.KFold(n_splits=cv, shuffle=True,
                                               random_state=random_state)
        elif task == "classification":
            self.kfold = model_selection.StratifiedKFold(n_splits=cv, shuffle=True,
                                                         random_state=random_state)
def set_kfold(self, no_folds=10, fold_id=0):
    inst = KFold(n_splits=no_folds, shuffle=True, random_state=125)
    self.fold_id = fold_id
    self.KFolds = list(inst.split(np.arange(self.no_samples)))
    self.train_idx, self.test_idx = self.KFolds[fold_id]
    self.no_samples_train = self.train_idx.shape[0]
    self.no_samples_test = self.test_idx.shape[0]
    self.print_ext('Data ready. no_samples_train:', self.no_samples_train,
                   'no_samples_test:', self.no_samples_test)
    if self.train_batch_size == 0:
        self.train_batch_size = self.no_samples_train
    if self.test_batch_size == 0:
        self.test_batch_size = self.no_samples_test
    self.train_batch_size = min(self.train_batch_size, self.no_samples_train)
    self.test_batch_size = min(self.test_batch_size, self.no_samples_test)

# This function is cropped before batch
# Slice each sample to improve performance
def run_cross_validation_create_models_unet2(nfolds=5):
    from sklearn.model_selection import KFold
    files_full = glob.glob(INPUT_PATH + "*/*.png")
    files = []
    for f in files_full:
        if '_mask' not in f:
            continue
        files.append(f)
    kf = KFold(n_splits=nfolds, shuffle=True, random_state=66)
    num_fold = 0
    sum_score = 0
    for train_index, test_index in kf.split(range(len(files))):
        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(train_index))
        print('Split valid: ', len(test_index))
        if num_fold != 2:
            continue
        score = train_single_model(num_fold, train_index, test_index, files)
        sum_score += score
    print('Avg loss: {}'.format(sum_score / nfolds))
def run_cross_validation_create_models_unet1(nfolds=5):
    from sklearn.model_selection import KFold
    files_full = glob.glob(INPUT_PATH + "*/*.png")
    files = []
    for f in files_full:
        if '_mask' in f:
            continue
        files.append(f)
    kf = KFold(n_splits=nfolds, shuffle=True, random_state=66)
    num_fold = 0
    sum_score = 0
    for train_index, test_index in kf.split(range(len(files))):
        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(train_index))
        print('Split valid: ', len(test_index))
        score = train_single_model(num_fold, train_index, test_index, files)
        sum_score += score
    print('Avg loss: {}'.format(sum_score / nfolds))
def model_cross_valid(X, Y):
    seed = 7
    # shuffle=True is required for random_state to take effect
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

    def build_model(model_name):
        model = model_name()
        return model

    scoring = 'neg_mean_squared_error'
    # + random forest boost lstm gbdt
    for model_name in [LinearRegression, ElasticNet]:
    # for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor]:
        model = build_model(model_name)
        results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        print(model_name, results.mean())
def cross_validate(classifier, n_folds=5):
    '''Custom cross-validation module I always use'''
    train_X = classifier['train_X']
    train_y = classifier['train_y']
    model = classifier['model']
    score = 0.0
    skf = KFold(n_splits=n_folds)
    for train_index, test_index in skf.split(train_X):
        X_train, X_test = train_X[train_index], train_X[test_index]
        y_train, y_test = train_y[train_index], train_y[test_index]
        clf = model.fit(X_train, y_train)
        pred = clf.predict_proba(X_test)[:, 1]
        # print('cross', roc_auc_score(y_test, pred))
        score = score + roc_auc_score(y_test, pred)
    return score / n_folds
def fit(self, X, y):
    self.base_models_ = [list() for x in self.base_models]
    self.meta_model_ = clone(self.meta_model)
    kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=15)

    # Train cloned base models, then create the out-of-fold predictions
    # that are needed to train the cloned meta-model.
    out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
    for i, model in enumerate(self.base_models):
        for train_index, holdout_index in kfold.split(X, y):
            instance = clone(model)
            self.base_models_[i].append(instance)
            instance.fit(X[train_index], y[train_index])
            y_pred = instance.predict(X[holdout_index])
            out_of_fold_predictions[holdout_index, i] = y_pred

    # Now train the cloned meta-model using the out-of-fold predictions as new features.
    self.meta_model_.fit(out_of_fold_predictions, y)
    return self

# Do the predictions of all base models on the test data and use the averaged
# predictions as meta-features for the final prediction, which is done by the meta-model.
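The closing comment describes the companion prediction step, which the snippet does not include. A minimal sketch of what such a predict method could look like, following that comment and the attribute names used in fit above (the column-stacking and fold-averaging details are assumptions, not taken from the source):

def predict(self, X):
    # For each base model, average the predictions of its per-fold clones,
    # then stack those averages column-wise as meta-features.
    meta_features = np.column_stack([
        np.column_stack([clone_.predict(X) for clone_ in fold_clones]).mean(axis=1)
        for fold_clones in self.base_models_
    ])
    return self.meta_model_.predict(meta_features)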
def train_cross_validation(args, sess, model, phi_xs_train, ys_train):
    kf = KFold(n_splits=args.K)
    w_best = None
    validation_loss = 0
    for train_index, validation_index in kf.split(phi_xs_train):
        sess.run(tf.global_variables_initializer())
        model.fit(sess, phi_xs_train[train_index], ys_train[train_index],
                  epoch=args.epoch, batch_size=args.batch_size)
        loss = model.eval(sess, phi_xs_train[validation_index],
                          ys_train[validation_index])
        logging.info('Validation loss = %f' % (loss))
        validation_loss += loss
        model.reset(sess)
    return validation_loss / float(args.K)
def evaluate(self, individual):
    # print(" *** evaluate *** ")
    # model = individual.createNetwork()
    # return random.random(), random.seed(42)

    # perform KFold cross-validation
    kf = KFold(n_splits=3)
    scores = []
    for train, test in kf.split(self.X):  # train, test are indices
        X_train, X_test = self.X[train], self.X[test]
        y_train, y_test = self.y[train], self.y[test]
        model = individual.createNetwork()
        model.fit(X_train, y_train, batch_size=Config.batch_size,
                  nb_epoch=Config.epochs, verbose=0)
        yy_test = model.predict(X_test)
        scores.append(error(y_test, yy_test))
    fitness = np.mean(scores)
    return fitness,
def kfold(self, k=5, stratify=False, shuffle=True, seed=33):
    """K-Folds cross validation iterator.

    Parameters
    ----------
    k : int, default 5
    stratify : bool, default False
    shuffle : bool, default True
    seed : int, default 33

    Yields
    -------
    X_train, y_train, X_test, y_test, train_index, test_index
    """
    if stratify:
        kf = StratifiedKFold(n_splits=k, random_state=seed, shuffle=shuffle)
    else:
        kf = KFold(n_splits=k, random_state=seed, shuffle=shuffle)

    for train_index, test_index in kf.split(self.X_train, self.y_train):
        X_train, y_train = idx(self.X_train, train_index), self.y_train[train_index]
        X_test, y_test = idx(self.X_train, test_index), self.y_train[test_index]
        yield X_train, y_train, X_test, y_test, train_index, test_index
def do_kfold(proc_images, proc_labels, split=10):
    trainimages = []
    trainlabels = []
    testimages = []
    testlabels = []

    # shuffle samples before splitting
    rand_idx = random.sample(range(0, len(proc_images)), len(proc_images))
    proc_images = proc_images[rand_idx]
    proc_labels = proc_labels[rand_idx]

    kf = KFold(n_splits=split)
    for train_index, test_index in kf.split(proc_images):
        x_train, x_test = proc_images[train_index], proc_images[test_index]
        y_train, y_test = proc_labels[train_index], proc_labels[test_index]
        trainimages.append(x_train)
        testimages.append(x_test)
        trainlabels.append(y_train)
        testlabels.append(y_test)

    np.save("trainimages.npy", trainimages)
    np.save("testimages.npy", testimages)
    np.save("trainlabels.npy", trainlabels)
    np.save("testlabels.npy", testlabels)
    return (trainimages, testimages, trainlabels, testlabels)
def testClassificationQuality(self):
    score = 0
    kfold = KFold(n_splits=10, shuffle=True, random_state=0)
    tweetClassification = TweetClassification()
    for ind_train, ind_test in kfold.split(self.tweets):
        dataTest = self.tweets[ind_test]
        dataTrain = self.tweets[ind_train]
        targetTest = self.target[ind_test]
        targetTrain = self.target[ind_train]
        tweetClassification.fit(dataTrain, targetTrain)
        score += tweetClassification.score(dataTest, targetTest)
    return score / 10
def evaluate_cross_validation(self, clf, data, target, cluster):
    score = 0
    kfold = KFold(n_splits=cluster, shuffle=True, random_state=0)
    for ind_train, ind_test in kfold.split(data):
        dataTest = data[ind_test]
        dataTrain = data[ind_train]
        targetTest = target[ind_test]
        targetTrain = target[ind_train]
        clf.fit(dataTrain, targetTrain)
        score += clf.score(dataTest, targetTest)
    # average over the actual number of folds, not a hard-coded 10
    print('-' * 30)
    print("Mean score: %0.3f" % (score / cluster))
    print('-' * 30)
    return score / cluster
def test_cross_val_predict():
    # Make sure it works in cross_val_predict for multiclass.
    X, y = load_iris(return_X_y=True)
    y = LabelBinarizer().fit_transform(y)
    X = StandardScaler().fit_transform(X)

    mlp = MLPClassifier(n_epochs=10,
                        solver_kwargs={'learning_rate': 0.05},
                        random_state=4567).fit(X, y)

    cv = KFold(n_splits=4, random_state=457, shuffle=True)
    y_oos = cross_val_predict(mlp, X, y, cv=cv, method='predict_proba')
    auc = roc_auc_score(y, y_oos, average=None)

    assert np.all(auc >= 0.96)
def gs_numpy(method, X, Y, alphas_log=(-1, 1, 9), n_splits=5, n_jobs=-1, disp=True):
    """
    Grid search method with numpy arrays of X and Y.
    Previously, np.mat was used for compatibility with Matlab notation.
    """
    if disp:
        print(X.shape, Y.shape)

    clf = getattr(linear_model, method)()
    params = {'alpha': np.logspace(*alphas_log)}
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    # kf5 = kf5_c.split(X)

    gs = model_selection.GridSearchCV(clf, params, scoring='r2', cv=kf5_c, n_jobs=n_jobs)
    gs.fit(X, Y)

    return gs
def cv_SVR(xM, yV, svr_params, n_splits=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    Cross validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)

    clf = svm.SVR(**svr_params)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=shuffle)
    kf_n = kf_n_c.split(xM)  # the original referenced an undefined kf5_ext_c
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def gs_classfier(classifier, xM, yVc, params, n_splits=5, n_jobs=-1):
    """
    gs = gs_classfier(classifier, xM, yVc, params, n_splits=5, n_jobs=-1)

    Inputs
    ======
    classifier = svm.SVC(), for example
    param = {"C": np.logspace(-2,2,5)}
    """
    # print(xM.shape, yVc.shape)
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    gs = model_selection.GridSearchCV(classifier, params, cv=kf5_c, n_jobs=n_jobs)
    gs.fit(xM, yVc)
    return gs
def gs_Ridge_BIKE(A_list, yV, XX=None, alphas_log=(1, -1, 9), n_splits=5, n_jobs=-1):
    """
    A_list is a list of A matrices, where A is a similarity matrix.
    X is a concatenated linear descriptor; if no X is used, X can be empty.
    """
    clf = binary_model.BIKE_Ridge(A_list, XX)
    params = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ln is the number of molecules
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    # kf_n = kf5_ext_c.split(A_list[0])
    gs = model_selection.GridSearchCV(clf, params, scoring='r2', cv=kf_n_c, n_jobs=n_jobs)

    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)
    return gs
def gs_BIKE_Ridge(A_list, yV, alphas_log=(1, -1, 9), X_concat=None, n_splits=5, n_jobs=-1):
    """
    A_list is a list of A matrices, where A is a similarity matrix.
    X is a concatenated linear descriptor; if no X is used, X can be empty.
    """
    clf = binary_model.BIKE_Ridge(A_list, X_concat)
    params = {'alpha': np.logspace(*alphas_log)}
    ln = A_list[0].shape[0]  # ln is the number of molecules
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    # kf_n = kf5_ext_c.split(A_list[0])
    gs = model_selection.GridSearchCV(clf, params, scoring='r2', cv=kf_n_c, n_jobs=n_jobs)

    AX_idx = np.array([list(range(ln))]).T
    gs.fit(AX_idx, yV)
    return gs
def _cv_r0(method, xM, yV, alpha, n_splits=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    kf_n = kf_n_c.split(xM)  # the original referenced an undefined kf5_ext_c
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def cvLOO(method, xM, yV, alpha, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Leave-one-out cross validation: one fold per sample.
    """
    n_splits = xM.shape[0]
    # print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    # model_selection.KFold takes only n_splits; the old positional
    # sample-count argument of cross_validation.KFold is gone
    kf_n = model_selection.KFold(n_splits=n_splits)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        jutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def cv_SVR(xM, yV, svr_params, n_splits=5, n_jobs=-1, grid_std=None, graph=True, shuffle=True):
    """
    Cross validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)

    clf = svm.SVR(**svr_params)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=shuffle)
    kf_n = kf_n_c.split(xM)  # the original referenced an undefined kf5_ext_c
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def gs_param(model, X, y, param_grid, n_splits=5, shuffle=True, n_jobs=-1, graph=False):
    """
    gs = gs_param(model, X, y, param_grid, n_splits=5, shuffle=True, n_jobs=-1)

    Inputs
    ======
    model = svm.SVC(), or linear_model.LinearRegression(), for example
    param = {"C": np.logspace(-2,2,5)}
    """
    # print(xM.shape, yVc.shape)
    kf5_c = model_selection.KFold(n_splits=n_splits, shuffle=shuffle)
    gs = model_selection.GridSearchCV(model, param_grid, cv=kf5_c, n_jobs=n_jobs)
    gs.fit(X, y)

    if graph:
        plt.plot(gs.cv_results_["mean_train_score"], label='E[Train]')
        plt.plot(gs.cv_results_["mean_test_score"], label='E[Test]')
        plt.legend(loc=0)
        plt.grid()

    return gs
def _cv_r0(method, xM, yV, alpha, n_splits=5, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Cross validation is performed so as to generate prediction output
    for all input molecules.
    """
    print(xM.shape, yV.shape)

    clf = getattr(linear_model, method)(alpha=alpha)
    kf_n_c = model_selection.KFold(n_splits=n_splits, shuffle=True)
    kf_n = kf_n_c.split(xM)  # the original referenced an undefined kf5_ext_c
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def cvLOO(method, xM, yV, alpha, n_jobs=-1, grid_std=None, graph=True):
    """
    method can be 'Ridge' or 'Lasso'.
    Leave-one-out cross validation: one fold per sample.
    """
    n_splits = xM.shape[0]
    # print(xM.shape, yV.shape)
    clf = getattr(linear_model, method)(alpha=alpha)
    # model_selection.KFold takes only n_splits; the old positional
    # sample-count argument of cross_validation.KFold is gone
    kf_n = model_selection.KFold(n_splits=n_splits)
    yV_pred = model_selection.cross_val_predict(clf, xM, yV, cv=kf_n, n_jobs=n_jobs)

    if graph:
        print('The prediction output using cross-validation is given by:')
        kutil.cv_show(yV, yV_pred, grid_std=grid_std)

    return yV_pred
def gs_Lasso(xM, yV, alphas_log=(-1, 1, 9), n_folds=5, n_jobs=-1):
    print(xM.shape, yV.shape)

    clf = linear_model.Lasso()
    # params = {'alpha': np.logspace(1, -1, 9)}
    params = {'alpha': np.logspace(*alphas_log)}
    # KFold's keyword is n_splits, not n_folds
    kf5_c = model_selection.KFold(n_splits=n_folds, shuffle=True)
    kf5 = kf5_c.split(xM)

    gs = model_selection.GridSearchCV(clf, params, scoring='r2', cv=kf5, n_jobs=n_jobs)
    gs.fit(xM, yV)

    return gs
def create_training_test_sets(self):
    """Split the data set into training and test folds."""
    # load input data
    input_data = np.asarray(np.loadtxt('input/data.txt'), dtype=np.float32)
    self.input_dim = input_data.shape[1] - 1
    self.output_dim = 1

    # align to batch size
    batches = input_data.shape[0] // (self.batch_size * self.n_splits)
    input_data = input_data[:batches * (self.batch_size * self.n_splits)]
    self.data_size = input_data.shape[0]
    print(f'Loaded input data, shape = {input_data.shape}')

    # create splits
    kfold = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
    print(f'Splits: {self.n_splits}')

    # assume y is in the last column by default
    for idx_train, idx_test in kfold.split(input_data):
        self.train_x.append(input_data[idx_train, :-1])
        self.train_y.append(input_data[idx_train, -1:])
        self.test_x.append(input_data[idx_test, :-1])
        self.test_y.append(input_data[idx_test, -1:])

    # layers described as [number of neurons, dropout probability]
    if self.layers_description is None:
        self.layers_description = [[self.input_dim, 0.0], [100, 0.0],
                                   [100, 0.0], [self.output_dim, 0.0]]
def transform(self, M, **kwargs):
    """
    Takes a dataframe that has an :code:`item_id` index and other
    'features' columns for prediction, and applies a Keras sequential
    model to it.

    :param M: a dataframe that has an :code:`item_id` index, and "features" columns.
    :type M: pandas.DataFrame
    :rtype: a tuple with the trained Keras model and its keyword arguments
    """
    rows, columns = M.shape
    factors = M.merge(self.validation_matrix, left_index=True, right_index=True)
    factors = factors.values

    if self.classification:
        kfold = StratifiedKFold(n_splits=self.kfold_n_splits,
                                random_state=self.kfold_seed,
                                shuffle=self.kfold_shuffle)
    else:
        kfold = KFold(n_splits=self.kfold_n_splits,
                      random_state=self.kfold_seed,
                      shuffle=self.kfold_shuffle)

    X = factors[:, :columns]
    Y = factors[:, columns:]
    for train_index, test_index in kfold.split(X, Y):
        self.keras_model.fit(
            X[train_index], Y[train_index],
            # validate on the held-out fold (the original paired
            # X[test_index] with Y[train_index], a shape mismatch)
            validation_data=[X[test_index], Y[test_index]],
            **self.keras_kwargs)

    return self.keras_model, kwargs
def setup_data(self, path):
    """Read and iteratively yield data to the agent."""
    print('loading: ' + path)

    questions = []
    y = []

    # open data file with labels
    # (path will be provided to setup_data from opt['datafile'] defined above)
    with open(path) as labels_file:
        context = csv.reader(labels_file)
        next(context)
        for item in context:
            label, text = item
            questions.append(text)
            y.append([self.answer_candidates[int(label)]])

    episode_done = True

    indexes = range(len(questions))
    if self.datatype_strict != 'test':
        random_state = random.getstate()
        random.setstate(self.random_state)
        kf_seed = random.randrange(500000)
        kf = KFold(self.opt.get('bagging_folds_number'), shuffle=True,
                   random_state=kf_seed)
        i = 0
        for train_index, test_index in kf.split(questions):
            indexes = train_index if self.datatype_strict == 'train' else test_index
            if i >= self.opt.get('bagging_fold_index', 0):
                break
        self.random_state = random.getstate()
        random.setstate(random_state)

    # define iterator over all queries
    for i in indexes:
        # get current label, both as a digit and as a text
        # yield tuple with information and episode_done? flag
        yield (questions[i], y[i]), episode_done
def kf_worker(X_tr, Y_tr, mu_range, tr_idx, vld_idx, i, results):
    """Worker for parallel KFold implementation."""
    betas = RLS_path(X_tr, Y_tr, mu_range)
    results[i] = {'betas': betas, 'tr_idx': tr_idx, 'vld_idx': vld_idx}
def test_large_grid():
    """In this test, we purposely overfit a RandomForest to completely random
    data in order to assert that the test error will far exceed the train error.
    """
    if not SK18:
        # pre-0.18 scikit-learn signature
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
    else:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }

    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy',
                              n_jobs=1, cv=custom_cv, random_state=42)

    # this will fail because we haven't fit yet
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

    # fit the grid
    grid.fit(X_train, y_train)

    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)

    # coverage:
    assert grid._estimator_type == 'classifier'

    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

    # evaluate score (SHOULD be better than random...)
    accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

    # grid score reports:
    # assert fails for bad percentile
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 0.0})
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 1.0})

    # assert fails for bad y_axis
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'y_axis': 'bad_axis'})

    # assert passes otherwise
    report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
def adaBoost(self, settings, data=None, dropna=True):
    df = self.__loadData(data, dropna)
    features = df.columns[:-1]
    X = df[features]
    y = df.iloc[:, -1].values

    seed = 7
    num_trees = 500
    # shuffle=True is required for random_state to take effect
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    print(kfold)

    model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
    results = model_selection.cross_val_score(model, X, y, cv=kfold)
    model.fit(X, y)

    print(results.mean())
    print(model.score(X, y))
    return True
def saga_decision_function(dataset, k, link_alpha, prop_alpha, l1_ratio):
    fn = cache_fname("linear_val_df", (dataset, k, link_alpha, prop_alpha, l1_ratio))
    if os.path.exists(fn):
        logging.info("Loading {}".format(fn))
        with open(fn, "rb") as f:
            return dill.load(f)

    ds = 'erule' if dataset == 'cdcp' else 'ukp-essays'  # sorry
    path = os.path.join("data", "process", ds, "folds", "{}", "{}")

    # sorry again: get val docs
    n_folds = 5 if dataset == 'ukp' else 3
    load, ids = get_dataset_loader(dataset, "train")
    for k_, (_, val) in enumerate(KFold(n_folds).split(ids)):
        if k_ == k:
            break
    val_docs = list(load(ids[val]))

    X_tr_link, y_tr_link = load_csr(path.format(k, 'train.npz'), return_y=True)
    X_te_link, y_te_link = load_csr(path.format(k, 'val.npz'), return_y=True)

    X_tr_prop, y_tr_prop = load_csr(path.format(k, 'prop-train.npz'), return_y=True)
    X_te_prop, y_te_prop = load_csr(path.format(k, 'prop-val.npz'), return_y=True)

    baseline = BaselineStruct(link_alpha, prop_alpha, l1_ratio)
    baseline.fit(X_tr_link, y_tr_link, X_tr_prop, y_tr_prop)

    Y_marg = baseline.decision_function(X_te_link, X_te_prop, val_docs)

    with open(fn, "wb") as f:
        logging.info("Saving {}".format(fn))
        dill.dump((Y_marg, baseline), f)
    return Y_marg, baseline
def linear_cv_score(dataset, alpha, l1_ratio, constraints):
    fn = cache_fname("linear_cv_score", (dataset, alpha, l1_ratio, constraints))
    if os.path.exists(fn):
        logging.info("Loading {}".format(fn))
        with open(fn, "rb") as f:
            return dill.load(f)

    load, ids = get_dataset_loader(dataset, split="train")
    n_folds = 5 if dataset == 'ukp' else 3

    scores = []
    for k, (tr, val) in enumerate(KFold(n_folds).split(ids)):
        Y_marg, bl = saga_decision_function(dataset, k, alpha, alpha, l1_ratio)
        val_docs = list(load(ids[val]))
        Y_true = [doc.label for doc in val_docs]
        Y_pred = bl.fast_decode(Y_marg, val_docs, constraints)
        scores.append(bl._score(Y_true, Y_pred))

    with open(fn, "wb") as f:
        logging.info("Saving {}".format(fn))
        dill.dump(scores, f)
    return scores
def svmstruct_cv_score(dataset, C, class_weight, constraints,
                       compat_features, second_order_features):
    fn = cache_fname("svmstruct_cv_score", (dataset, C, class_weight,
                                            constraints, compat_features,
                                            second_order_features))
    if os.path.exists(fn):
        logging.info("Cached file already exists.")
        with open(fn, "rb") as f:
            return dill.load(f)

    load, ids = get_dataset_loader(dataset, split="train")
    n_folds = 5 if dataset == 'ukp' else 3

    # below are boolean logical ops
    grandparents = second_order_features and dataset == 'ukp'
    coparents = second_order_features
    siblings = second_order_features and dataset == 'cdcp'

    scores = []
    all_Y_pred = []

    for k, (tr, val) in enumerate(KFold(n_folds).split(ids)):
        train_docs = list(load(ids[tr]))
        val_docs = list(load(ids[val]))
        clf, Y_val, Y_pred = fit_predict(train_docs, val_docs, dataset, C,
                                         class_weight, constraints,
                                         compat_features, second_order_features,
                                         grandparents, coparents, siblings)
        all_Y_pred.extend(Y_pred)
        scores.append(clf.model._score(Y_val, Y_pred))

    with open(fn, "wb") as f:
        dill.dump((scores, all_Y_pred), f)
    return scores, all_Y_pred
def split_kfold_r(y):
    skf = KFold(5)
    ilst = []
    for tri, tei in skf.split(y):
        ilst.append((tri, tei))
    return ilst
def split_fold(in_pattern, rettrain=True, fold=0, cvs=5, include_vlaidation=True, split_seed=0):
    """
    Splits the elements of the in_pattern into training and test sets
    :param in_pattern: string of tfrecord patterns
    :param rettrain: return training set (True) or leave-out set (False)
    :param fold: which fold to process
    :param cvs: how many folds you want
    :param include_vlaidation: include validation set
    :return: subset of tfrecords
    """
    assert fold < cvs
    files = gfile.Glob(in_pattern)
    if split_seed > 0:
        kf = KFold(n_splits=cvs, shuffle=True, random_state=split_seed)
    else:
        kf = KFold(n_splits=cvs)
    for i, (train, test) in enumerate(kf.split(files)):
        if i == fold:
            break
    if rettrain:
        retfiles = list(np.array(files)[train])
    else:
        retfiles = list(np.array(files)[test])
    if include_vlaidation:
        addition = [fname.replace('train', 'validate') for fname in retfiles]
        retfiles += addition
    return retfiles
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    tprs = np.zeros((nrof_folds, nrof_thresholds))
    fprs = np.zeros((nrof_folds, nrof_thresholds))
    accuracy = np.zeros((nrof_folds))

    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff), 1)
    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(
                threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx, threshold_idx], fprs[fold_idx, threshold_idx], _ = calculate_accuracy(
                threshold, dist[test_set], actual_issame[test_set])
        _, _, accuracy[fold_idx] = calculate_accuracy(
            thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])

    tpr = np.mean(tprs, 0)
    fpr = np.mean(fprs, 0)
    return tpr, fpr, accuracy
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10):
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = KFold(n_splits=nrof_folds, shuffle=False)

    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)

    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff), 1)
    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(
                threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train) >= far_target:
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            threshold = 0.0

        val[fold_idx], far[fold_idx] = calculate_val_far(
            threshold, dist[test_set], actual_issame[test_set])

    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean
def test_using_kfold(X, y, clf, splits=5):
    kf = KFold(n_splits=splits, shuffle=True)
    scores = []
    for k, (train, test) in enumerate(kf.split(X, y)):
        logger.info("Fitting and transforming the model on one fold")
        clf.fit(X[train], y[train])
        score = clf.score(X[test], y[test])
        logger.info("[Fold {0}] score: {1:.5f}".format(k + 1, score))
        scores.append(score)
    utils.persistence.dump(CLF_KFOLD_DUMP_NAME, clf)
    scores_mean = np.mean(scores)
    logger.info("Score: {}".format(scores_mean))
    return clf
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, num_folds=10):
    """Calculate TPR and FPR under different thresholds, and accuracy under the best threshold."""
    assert (embeddings1.shape[0] == embeddings2.shape[0])
    assert (embeddings1.shape[1] == embeddings2.shape[1])
    num_pairs = min(len(actual_issame), embeddings1.shape[0])
    num_threshold = len(thresholds)
    k_fold = KFold(n_splits=num_folds, shuffle=False)

    tprs = np.zeros((num_folds, num_threshold))
    fprs = np.zeros((num_folds, num_threshold))
    acc = np.zeros((num_folds))

    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff), 1)
    indices = np.arange(num_pairs)

    for fold_id, (train_set, test_set) in enumerate(k_fold.split(indices)):
        # Find the best threshold
        acc_train = np.zeros((num_threshold))
        for thres_id, thres in enumerate(thresholds):
            _, _, acc_train[thres_id] = calculate_acc(
                thres, dist[train_set], actual_issame[train_set])
        best_id = np.argmax(acc_train)

        # Calculate tprs and fprs on the test set
        for thres_id, thres in enumerate(thresholds):
            tprs[fold_id, thres_id], fprs[fold_id, thres_id], _ = calculate_acc(
                thres, dist[test_set], actual_issame[test_set])

        # Use the best threshold to calculate accuracy
        _, _, acc[fold_id] = calculate_acc(
            thresholds[best_id], dist[test_set], actual_issame[test_set])

    tpr = np.mean(tprs, 0)  # true positive rate under different thresholds
    fpr = np.mean(fprs, 0)  # false positive rate under different thresholds
    return tpr, fpr, acc