The following 13 code examples, extracted from open-source Python projects, illustrate how to use sklearn.utils.resample().
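Before the project examples, here is a minimal sketch of the basic API (the variable names are illustrative, not from any of the projects below): resample draws one bootstrap replicate by sampling the given arrays with replacement, keeping the rows of all arrays aligned.

from sklearn.utils import resample
import numpy as np

X = np.arange(10).reshape(5, 2)   # toy feature matrix
y = np.array([0, 0, 1, 1, 1])     # toy labels

# One bootstrap replicate: rows are drawn with replacement,
# and X and y stay aligned row-for-row.
X_bs, y_bs = resample(X, y, replace=True, n_samples=5, random_state=0)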
def dispersion_test(yhat, y, k=100):
    """Implement the regression-based dispersion test with k re-samplings.

    Args:
        yhat (np.array): predicted mutation count
        y (np.array): observed mutation count
        k (int): number of bootstrap re-samplings

    Returns:
        float, float: p-value, theta
    """
    theta = 0
    pval = 0
    for i in range(k):
        y_sub, yhat_sub = resample(y, yhat, random_state=i)
        # (np.power((y - yhat), 2) - y) / yhat for Poisson regression
        aux = (np.power((y_sub - yhat_sub), 2) - yhat_sub) / yhat_sub
        mod = sm.OLS(aux, yhat_sub)
        res = mod.fit()
        theta += res.params[0]
        pval += res.pvalues[0]
    theta = theta / k
    pval = pval / k
    return pval, theta
def plot_mean_bootstrap_exponential_readme():
    X = np.random.exponential(7, 4)
    classical_samples = [np.mean(resample(X)) for _ in range(10000)]
    posterior_samples = mean(X, 10000)
    l, r = highest_density_interval(posterior_samples)
    classical_l, classical_r = highest_density_interval(classical_samples)
    plt.subplot(2, 1, 1)
    plt.title('Bayesian Bootstrap of mean')
    sns.distplot(posterior_samples, label='Bayesian Bootstrap Samples')
    plt.plot([l, r], [0, 0], linewidth=5.0, marker='o', label='95% HDI')
    plt.xlim(-1, 18)
    plt.legend()
    plt.subplot(2, 1, 2)
    plt.title('Classical Bootstrap of mean')
    sns.distplot(classical_samples, label='Classical Bootstrap Samples')
    plt.plot([classical_l, classical_r], [0, 0], linewidth=5.0, marker='o', label='95% HDI')
    plt.xlim(-1, 18)
    plt.legend()
    plt.savefig('readme_exponential.png', bbox_inches='tight')
def _fit_one_bootstrap(self, i):
    m = clone(self.model)
    m._ensemble = True
    X, y = self.X_, self.y_
    n = X.shape[0]
    n_samples = math.ceil(0.8 * n)
    # Get bootstrap set
    X_bs, y_bs = resample(X, y, replace=True, n_samples=n_samples,
                          random_state=self.bs_seed + i)
    m.fit(X_bs, y_bs)
    if self.model.shadow_features:
        return m.interval_, m._omegas, m._biase, m._shadowintervals
    else:
        return m.interval_, m._omegas, m._biase
def plot_mean_bootstrap():
    X = [-1, 0, 1]
    posterior_samples = mean(X, 10000)
    sns.distplot(posterior_samples)
    classical_samples = [np.mean(resample(X)) for _ in range(10000)]
    sns.distplot(classical_samples)
    plt.show()
def plot_mean_resample_bootstrap():
    X = [-1, 0, 1]
    posterior_samples = bayesian_bootstrap(X, np.mean, 10000, 100)
    sns.distplot(posterior_samples)
    classical_samples = [np.mean(resample(X)) for _ in range(10000)]
    sns.distplot(classical_samples)
    plt.show()
def plot_median():
    X = np.random.uniform(-1, 1, 10)
    posterior_samples = bayesian_bootstrap(X, np.median, 10000, 100)
    sns.distplot(posterior_samples)
    classical_samples = [np.median(resample(X)) for _ in range(10000)]
    sns.distplot(classical_samples)
    plt.show()
def plot_var_bootstrap():
    X = np.random.uniform(-1, 1, 100)
    posterior_samples = var(X, 10000)
    sns.distplot(posterior_samples)
    classical_samples = [np.var(resample(X)) for _ in range(10000)]
    sns.distplot(classical_samples)
    plt.show()
def plot_var_resample_bootstrap():
    X = np.random.uniform(-1, 1, 100)
    posterior_samples = bayesian_bootstrap(X, np.var, 10000, 500)
    sns.distplot(posterior_samples)
    classical_samples = [np.var(resample(X)) for _ in range(10000)]
    sns.distplot(classical_samples)
    plt.show()
def plot_regression_bootstrap():
    X = np.array([[0], [1], [2], [3]])
    y = np.array([0, 1, 2, 3]) + np.random.normal(0, 1, 4)
    classical_samples = [LinearRegression().fit(*resample(X, y)).coef_
                         for _ in tqdm(range(10000))]
    posterior_samples = bayesian_bootstrap_regression(
        X, y, lambda X, y: LinearRegression().fit(X, y).coef_, 10000, 1000)
    plt.scatter(X.reshape(-1, 1), y)
    plt.show()
    sns.distplot(classical_samples)
    sns.distplot(posterior_samples)
    plt.show()
def bootstrap_sample(test_x, test_y, model, n):
    """Stratified bootstrap sampling of test data to generate confidence intervals.

    Arguments
    ----------
    test_x (pandas DataFrame): test data features.
    test_y (pandas Series): test outcome.
    model: fitted classifier exposing predict_proba.
    n (int): number of bootstrap samples.

    Returns
    -------
    CI (tuple): tuple with lower and upper limit of 95% confidence interval
    """
    aucs = []
    for sample in range(n):
        ind_pos = np.where(test_y.values > 0)
        ind_neg = np.where(test_y.values <= 0)
        pos_x = test_x[ind_pos[0], ]
        neg_x = test_x[ind_neg[0], ]
        pos_y = test_y.iloc[ind_pos[0]]
        neg_y = test_y.iloc[ind_neg[0]]
        resampled_pos_x, resampled_pos_y = resample(pos_x, pos_y)
        resampled_neg_x, resampled_neg_y = resample(neg_x, neg_y)
        resampled_x = scipy.sparse.vstack((resampled_pos_x, resampled_neg_x))
        resampled_y = pd.concat((resampled_pos_y, resampled_neg_y), axis=0)
        probs = model.predict_proba(resampled_x)
        aucs.append(roc_auc_score(resampled_y.replace(to_replace=-1, value=0),
                                  probs[:, 1]))
    # Return 95% confidence interval
    CI = (np.percentile(aucs, 2.5), np.percentile(aucs, 97.5))
    return CI
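The example above stratifies by hand, resampling the positive and negative rows separately before stacking them back together. As a side note, newer scikit-learn releases (0.21 and later) let resample do this directly via its stratify argument; a minimal sketch with illustrative variable names, not taken from the project above:

from sklearn.utils import resample
import numpy as np

X = np.random.rand(100, 3)            # toy features
y = np.array([0] * 80 + [1] * 20)     # imbalanced toy labels

# One stratified bootstrap replicate: the 80/20 class ratio of y
# is preserved in (X_bs, y_bs). Requires scikit-learn >= 0.21.
X_bs, y_bs = resample(X, y, replace=True, stratify=y, random_state=0)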
def bootstrap_auc(df, col, pred_col, n_bootstrap=1000):
    """Calculate the bootstrapped AUC for a given col trying to predict a pred_col.

    Parameters
    ----------
    df : pandas.DataFrame
    col : str
        column to retrieve the values from
    pred_col : str
        the column we're trying to predict
    n_bootstrap : int
        the number of bootstrap samples

    Returns
    -------
    list : AUCs for each sampling
    """
    scores = np.zeros(n_bootstrap)
    old_len = len(df)
    df.dropna(subset=[col], inplace=True)
    new_len = len(df)
    if new_len < old_len:
        logger.info("Dropping NaN values in %s to go from %d to %d rows" %
                    (col, old_len, new_len))
    preds = df[pred_col].astype(int)
    for i in range(n_bootstrap):
        sampled_counts, sampled_pred = resample(df[col], preds)
        if is_single_class(sampled_pred, col=pred_col):
            continue
        scores[i] = roc_auc_score(sampled_pred, sampled_counts)
    return scores
def test_resample_noarg():
    # Border case not worth mentioning in doctests
    assert_true(resample() is None)
def test_resample_value_errors():
    # Check that invalid arguments yield ValueError
    assert_raises(ValueError, resample, [0], [0, 1])
    assert_raises(ValueError, resample, [0, 1], [0, 1], n_samples=3)
    assert_raises(ValueError, resample, [0, 1], [0, 1], meaning_of_life=42)