The following 11 code examples, extracted from open-source Python projects, illustrate how to use statsmodels.api.WLS.
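Before the project examples, here is a minimal, self-contained sketch of the basic sm.WLS call. The synthetic data and variable names are illustrative only; the key idea is that the weights are typically the inverse of each observation's error variance.

import numpy as np
import statsmodels.api as sm

# Synthetic heteroscedastic data: noise standard deviation varies per observation.
rng = np.random.default_rng(0)
x = rng.normal(size=100)
sigma = np.linspace(0.5, 2.0, 100)
y = 1.0 + 2.0 * x + rng.normal(scale=sigma)

X = sm.add_constant(x)             # add an intercept column
weights = 1.0 / sigma ** 2         # weight each point by 1 / variance
results = sm.WLS(y, X, weights=weights).fit()
print(results.params)              # estimated [intercept, slope]
print(results.bse)                 # standard errors of the estimates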
def lm(data, xseq, **params):
    """
    Fit OLS / WLS if data has weight
    """
    X = sm.add_constant(data['x'])
    Xseq = sm.add_constant(xseq)

    try:
        model = sm.WLS(data['y'], X, weights=data['weight'])
    except KeyError:
        model = sm.OLS(data['y'], X)

    results = model.fit(**params['method_args'])

    data = pd.DataFrame({'x': xseq})
    data['y'] = results.predict(Xseq)

    if params['se']:
        alpha = 1 - params['level']
        prstd, iv_l, iv_u = wls_prediction_std(
            results, Xseq, alpha=alpha)
        data['se'] = prstd
        data['ymin'] = iv_l
        data['ymax'] = iv_u

    return data
def __init__(self, x, y, w):
    self.x = x
    self.y = y
    self.w = w

    WLS = sm.WLS(y, x, w)
    OLS = sm.OLS(WLS.wendog, WLS.wexog).fit()
    self.coefficient = OLS.params[0]
    self.WSSResidual = OLS.ssr
    self.fittedvalues = OLS.predict(x)
    self.residual = OLS.resid

    if len(x) == 1:
        self.mean_square_error = np.nan
        self.standard_error = np.nan
        self.sigma = np.nan
        self.std_resid = np.nan
    else:
        self.mean_square_error = OLS.mse_resid
        self.standard_error = OLS.params[0] / OLS.tvalues[0]
        self.sigma = np.sqrt(self.mean_square_error)
        self.std_resid = OLSInfluence(OLS).resid_studentized_internal
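The constructor above does not call sm.WLS(...).fit() directly; it builds the WLS model and then fits OLS on the whitened data the model exposes (wendog, wexog), which yields the same coefficients. A small sketch with made-up data (variable names are illustrative) showing that equivalence:

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(1)
x = rng.normal(size=(50, 1))         # single regressor, no intercept, as in the example above
w = rng.uniform(0.5, 2.0, size=50)   # arbitrary positive weights
y = 3.0 * x[:, 0] + rng.normal(size=50)

wls_model = sm.WLS(y, x, weights=w)
direct = wls_model.fit()
# OLS on the weight-whitened design reproduces the WLS coefficient estimates.
whitened = sm.OLS(wls_model.wendog, wls_model.wexog).fit()
print(direct.params, whitened.params)   # equal up to floating point

Fitting the whitened design with plain OLS also makes OLS-only diagnostics such as OLSInfluence applicable to the weighted fit, which is what the example relies on.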
def testWLS(self):
    # WLS centered SS changed (fixed) in 0.5.0
    sm_version = sm.version.version
    if sm_version < LooseVersion('0.5.0'):
        raise nose.SkipTest("WLS centered SS not fixed in statsmodels"
                            " version {0}".format(sm_version))

    X = DataFrame(np.random.randn(30, 4), columns=['A', 'B', 'C', 'D'])
    Y = Series(np.random.randn(30))
    weights = X.std(1)

    self._check_wls(X, Y, weights)

    weights.ix[[5, 15]] = np.nan
    Y[[2, 21]] = np.nan
    self._check_wls(X, Y, weights)
def _check_wls(self, x, y, weights):
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = ols(y=y, x=x, weights=1 / weights)

    combined = x.copy()
    combined['__y__'] = y
    combined['__weights__'] = weights
    combined = combined.dropna()

    endog = combined.pop('__y__').values
    aweights = combined.pop('__weights__').values
    exog = sm.add_constant(combined.values, prepend=False)

    sm_result = sm.WLS(endog, exog, weights=1 / aweights).fit()

    assert_almost_equal(sm_result.params, result._beta_raw)
    assert_almost_equal(sm_result.resid, result._resid_raw)

    self.checkMovingOLS('rolling', x, y, weights=weights)
    self.checkMovingOLS('expanding', x, y, weights=weights)
def __init__(self, y, x, intercept=True, weights=None, nw_lags=None,
             nw_overlap=False):
    import warnings
    warnings.warn("The pandas.stats.ols module is deprecated and will be "
                  "removed in a future version. We refer to external packages "
                  "like statsmodels, see some examples here: "
                  "http://statsmodels.sourceforge.net/stable/regression.html",
                  FutureWarning, stacklevel=4)

    try:
        import statsmodels.api as sm
    except ImportError:
        import scikits.statsmodels.api as sm

    self._x_orig = x
    self._y_orig = y
    self._weights_orig = weights
    self._intercept = intercept
    self._nw_lags = nw_lags
    self._nw_overlap = nw_overlap

    (self._y, self._x, self._weights, self._x_filtered,
     self._index, self._time_has_obs) = self._prepare_data()

    if self._weights is not None:
        self._x_trans = self._x.mul(np.sqrt(self._weights), axis=0)
        self._y_trans = self._y * np.sqrt(self._weights)
        self.sm_ols = sm.WLS(self._y.get_values(),
                             self._x.get_values(),
                             weights=self._weights.values).fit()
    else:
        self._x_trans = self._x
        self._y_trans = self._y
        self.sm_ols = sm.OLS(self._y.get_values(),
                             self._x.get_values()).fit()
def regression_apply(row, timepoints, weighted):
    """
    :py:meth:`pandas.DataFrame.apply` apply function for calculating
    enrichment using linear regression. If *weighted* is ``True``, perform
    weighted least squares; else perform ordinary least squares.

    Weights for weighted least squares are included in *row*.

    Returns a :py:class:`pandas.Series` containing regression coefficients,
    residuals, and statistics.
    """
    # retrieve log ratios from the row
    y = row[['L_{}'.format(t) for t in timepoints]]

    # re-scale the x's to fall within [0, 1]
    xvalues = [x / float(max(timepoints)) for x in timepoints]

    # perform the fit
    X = sm.add_constant(xvalues)  # fit intercept
    if weighted:
        W = row[['W_{}'.format(t) for t in timepoints]]
        fit = sm.WLS(y, X, weights=W).fit()
    else:
        fit = sm.OLS(y, X).fit()

    # re-format as a data frame row
    values = np.concatenate([fit.params,
                             [fit.bse['x1'], fit.tvalues['x1'], fit.pvalues['x1']],
                             fit.resid])
    index = ['intercept', 'slope', 'SE_slope', 't', 'pvalue_raw'] + \
            ['e_{}'.format(t) for t in timepoints]
    return pd.Series(data=values, index=index)
def validate(self):
    """
    Raises an informative ``ValueError`` if the time points in the analysis
    are not suitable. Calls validate method on all child SeqLibs.
    """
    # check the time points
    if 0 not in self.timepoints:
        raise ValueError("Missing timepoint 0 [{}]".format(self.name))
    if self.timepoints[0] != 0:
        raise ValueError("Invalid negative timepoint [{}]".format(self.name))
    if len(self.timepoints) < 2:
        raise ValueError("Multiple timepoints required [{}]".format(self.name))
    elif len(self.timepoints) < 3 and self.scoring_method in ("WLS", "OLS"):
        raise ValueError("Insufficient number of timepoints for regression scoring [{}]".format(self.name))

    # check the wild type sequences
    if self.has_wt_sequence():
        for child in self.children[1:]:
            if self.children[0].wt != child.wt:
                logging.warning("Inconsistent wild type sequences",
                                extra={'oname': self.name})
                break

    # check that we're not doing wild type normalization on something with no wild type
    #if not self.has_wt_sequence() and self.logr_method == "wt":
    #    raise ValueError("No wild type sequence for wild type normalization [{}]".format(self.name))

    # validate children
    for child in self.children:
        child.validate()
def calculate(self):
    """
    Wrapper method to calculate counts and enrichment scores for all data
    in the :py:class:`~selection.Selection`.
    """
    if len(self.labels) == 0:
        raise ValueError("No data present across all sequencing libraries [{}]".format(self.name))

    for label in self.labels:
        self.merge_counts_unfiltered(label)
        self.filter_counts(label)

    if self.is_barcodevariant() or self.is_barcodeid():
        self.combine_barcode_maps()

    if self.scoring_method == "counts":
        pass
    elif self.scoring_method == "ratios":
        for label in self.labels:
            self.calc_ratios(label)
    elif self.scoring_method == "simple":
        for label in self.labels:
            self.calc_simple_ratios(label)
    elif self.scoring_method in ("WLS", "OLS"):
        if len(self.timepoints) <= 2:
            raise ValueError("Regression-based scoring requires three or more time points.")
        for label in self.labels:
            self.calc_log_ratios(label)
            if self.scoring_method == "WLS":
                self.calc_weights(label)
            self.calc_regression(label)
    else:
        raise ValueError('Invalid scoring method "{}" [{}]'.format(self.scoring_method, self.name))

    if self.scoring_method in ("ratios", "WLS", "OLS") and self.component_outliers:
        if self.is_barcodevariant() or self.is_barcodeid():
            self.calc_outliers("barcodes")
        if self.is_coding():
            self.calc_outliers("variants")
def calc_regression(self, label):
    """
    Calculate least squares regression for *label*. If *weighted* is ``True``,
    calculates weighted least squares; else ordinary least squares. Regression
    results are stored in ``'/main/label/scores'``
    """
    if self.check_store("/main/{}/scores".format(label)):
        return
    elif "/main/{}/scores".format(label) in self.store.keys():
        # need to remove the current keys because we are using append
        self.store.remove("/main/{}/scores".format(label))

    logging.info("Calculating {} regression coefficients ({})".format(self.scoring_method, label),
                 extra={'oname': self.name})

    # append is required because it takes the "min_itemsize" argument, and put doesn't
    longest = self.store.select("/main/{}/log_ratios".format(label),
                                "columns='index'").index.map(len).max()
    chunk = 1
    if self.scoring_method == "WLS":
        for data in self.store.select_as_multiple(["/main/{}/log_ratios".format(label),
                                                   "/main/{}/weights".format(label)],
                                                  chunksize=self.chunksize):
            logging.info("Calculating weighted least squares for chunk {} ({} rows)".format(chunk, len(data.index)),
                         extra={'oname': self.name})
            result = data.apply(regression_apply, args=[self.timepoints, True], axis="columns")
            self.store.append("/main/{}/scores".format(label), result,
                              min_itemsize={"index": longest})
            chunk += 1
    elif self.scoring_method == "OLS":
        for data in self.store.select("/main/{}/log_ratios".format(label),
                                      chunksize=self.chunksize):
            logging.info("Calculating ordinary least squares for chunk {} ({} rows)".format(chunk, len(data.index)),
                         extra={'oname': self.name})
            result = data.apply(regression_apply, args=[self.timepoints, False], axis="columns")
            self.store.append("/main/{}/scores".format(label), result,
                              min_itemsize={"index": longest})
            chunk += 1
    else:
        raise ValueError('Invalid regression scoring method "{}" [{}]'.format(self.scoring_method, self.name))

    # need to read from the file, calculate percentiles, and rewrite it
    logging.info("Calculating slope standard error percentiles ({})".format(label),
                 extra={'oname': self.name})
    data = self.store['/main/{}/scores'.format(label)]
    data['score'] = data['slope']
    data['SE'] = data['SE_slope']
    data['SE_pctile'] = [stats.percentileofscore(data['SE'], x, "weak") for x in data['SE']]
    # reorder columns
    data = data[['score', 'SE', 'SE_pctile', 'slope', 'intercept', 'SE_slope', 't', 'pvalue_raw']]
    self.store.put("/main/{}/scores".format(label), data, format="table",
                   data_columns=data.columns)
def wt_plot(self, pdf):
    """
    Create a plot of the linear fit of the wild type variant.

    *pdf* is an open PdfPages instance.

    Only created for selections that use WLS or OLS scoring and have a wild
    type specified. Uses :py:func:`~plots.fit_axes` for the plotting.
    """
    logging.info("Creating wild type fit plot", extra={'oname': self.name})

    # get the data and calculate log ratios
    if "variants" in self.labels:
        wt_label = "variants"
    elif "identifiers" in self.labels:
        wt_label = "identifiers"
    data = self.store.select("/main/{}/counts".format(wt_label),
                             where='index = "{}"'.format(WILD_TYPE_VARIANT)).ix[0]
    sums = self.store['/main/{}/counts'.format(wt_label)].sum(axis="index")  # sum of complete cases (N')
    yvalues = np.log(data + 0.5) - np.log(sums + 0.5)
    xvalues = [tp / float(max(self.timepoints)) for tp in self.timepoints]

    # fit the line
    X = sm.add_constant(xvalues)  # fit intercept
    if self.scoring_method == "WLS":
        W = 1 / (1 / (data + 0.5) + 1 / (sums + 0.5))
        fit = sm.WLS(yvalues, X, weights=W).fit()
    elif self.scoring_method == "OLS":
        fit = sm.OLS(yvalues, X).fit()
    else:
        raise ValueError('Invalid regression scoring method "{}" [{}]'.format(self.scoring_method, self.name))
    intercept, slope = fit.params
    slope_se = fit.bse['x1']

    # make the plot
    fig, ax = plt.subplots()
    fig.set_tight_layout(True)
    fit_axes(ax, xvalues, yvalues, slope, intercept, xlabels=self.timepoints)
    fit_axes_text(ax, cornertext="Slope {:3.2f}\nSE {:.1f}".format(slope, slope_se))
    ax.set_title("Wild Type Shape\n{}".format(self.name))
    ax.set_ylabel("Log Ratio (Complete Cases)")

    pdf.savefig(fig)
    plt.close(fig)
def scatterColor(x0, y, w):
    """Creates scatter plot with points colored by variable.
    All input arrays must have matching lengths

    :param x0: x values to plot
    :type x0: list
    :param y: y values to plot
    :type y: list
    :param w: z values to plot

    :returns: plot; slope and intercept of the RLM best fit line shown on the plot

    .. warning:: all input arrays must have matching lengths and scalar values

    .. note:: See documentation at
        http://statsmodels.sourceforge.net/0.6.0/generated/statsmodels.robust.robust_linear_model.RLM.html
        for the RLM line
    """
    import matplotlib as mpl
    import matplotlib.cm as cm
    import statsmodels.api as sm
    from scipy.stats import linregress

    cmap = plt.cm.get_cmap('RdYlBu')
    norm = mpl.colors.Normalize(vmin=w.min(), vmax=w.max())
    m = cm.ScalarMappable(norm=norm, cmap=cmap)
    m.set_array(w)
    sc = plt.scatter(x0, y, label='', color=m.to_rgba(w))

    xa = sm.add_constant(x0)
    est = sm.RLM(y, xa).fit()
    r2 = sm.WLS(y, xa, weights=est.weights).fit().rsquared
    slope = est.params[1]

    x_prime = np.linspace(np.min(x0), np.max(x0), 100)[:, np.newaxis]
    x_prime = sm.add_constant(x_prime)
    y_hat = est.predict(x_prime)

    const = est.params[0]
    y2 = [i * slope + const for i in x0]

    lin = linregress(x0, y)
    x1 = np.arange(np.min(x0), np.max(x0), 0.1)
    y1 = [i * lin[0] + lin[1] for i in x1]
    y2 = [i * slope + const for i in x1]

    plt.plot(x1, y1, c='g',
             label='simple linear regression m = {:.2f} b = {:.0f}, r^2 = {:.2f}'.format(lin[0], lin[1], lin[2] ** 2))
    plt.plot(x1, y2, c='r',
             label='rlm regression m = {:.2f} b = {:.0f}, r2 = {:.2f}'.format(slope, const, r2))
    plt.legend()

    cbar = plt.colorbar(m)
    cbar.set_label('Julian Date')

    return slope, const