我们从Python开源项目中,提取了以下7个代码示例,用于说明如何使用statsmodels.api.GLM。
def _get_intercept_stats(self, add_slopes=True): # start with mean and variance of Y on the link scale mod = sm.GLM(endog=self.model.y.data, exog=np.repeat(1, len(self.model.y.data)), family=self.model.family.smfamily(), missing='drop' if self.model.dropna else 'none').fit() mu = mod.params # multiply SE by sqrt(N) to turn it into (approx.) SD(Y) on link scale sd = (mod.cov_params()[0] * len(mod.mu))**.5 # modify mu and sd based on means and SDs of slope priors. if len(self.model.fixed_terms) > 1 and add_slopes: means = np.array([x['mu'] for x in self.priors.values()]) sds = np.array([x['sd'] for x in self.priors.values()]) # add to intercept prior index = list(self.priors.keys()) mu -= np.dot(means, self.stats['mean_x'][index]) sd = (sd**2 + np.dot(sds**2, self.stats['mean_x'][index]**2))**.5 return mu, sd
def __init__(self, model, taylor): self.model = model self.stats = model.dm_statistics if hasattr(model, 'dm_statistics') \ else None self.dm = pd.DataFrame({lev: t.data[:, i] for t in model.fixed_terms.values() for i, lev in enumerate(t.levels)}) self.priors = {} missing = 'drop' if self.model.dropna else 'none' self.mle = sm.GLM(endog=self.model.y.data, exog=self.dm, family=self.model.family.smfamily(), missing=missing).fit() self.taylor = taylor with open(join(dirname(__file__), 'config', 'derivs.txt'), 'r') as file: self.deriv = [next(file).strip('\n') for x in range(taylor+1)]
def run_glm(X, y, model_name): """ Train the binomial/negative binomial GLM Args: X (np.array): scaled X. y (pd.df): four columns response table. Returns: sm.model: trained GLM models. """ # Add const manually. sm.add_constant cannot add 1 for shape (1, n) X = np.c_[X, np.ones(X.shape[0])] if model_name == 'Binomial': logger.info('Building binomial GLM') # make two columns response (# success, # failure) y_binom = np.zeros((y.shape[0], 2), dtype=np.int_) y_binom[:, 0] = y.nMut y_binom[:, 1] = y.length * y.N - y.nMut glm = sm.GLM(y_binom, X, family=sm.families.Binomial()) elif model_name == 'NegativeBinomial': logger.info('Building negative binomial GLM') # use nMut as response and length*N as exposure glm = sm.GLM(y.nMut.values, X, family=sm.families.NegativeBinomial(), exposure=y.length.values*y.N.values+1) else: sys.stderr.write('Unknown GLM name {}. Must be Binomial or NegativeBinomial'.format(model_name)) sys.exit(1) model = glm.fit() return model
def glm(data, xseq, **params): """ Fit GLM """ X = sm.add_constant(data['x']) Xseq = sm.add_constant(xseq) results = sm.GLM(data['y'], X).fit(**params['method_args']) data = pd.DataFrame({'x': xseq}) data['y'] = results.predict(Xseq) if params['se']: # TODO: Depends on statsmodel > 0.7 # https://github.com/statsmodels/statsmodels/pull/2151 # https://github.com/statsmodels/statsmodels/pull/3406 # Remove the try/except when a compatible version is released try: prediction = results.get_prediction(Xseq) ci = prediction.conf_int(1 - params['level']) data['ymin'] = ci[:, 0] data['ymax'] = ci[:, 1] except (AttributeError, TypeError): warnings.warn( "Cannot compute confidence intervals." "Install latest/development version of statmodels.") return data
def glmfit(X,Y): ''' Wrapper for statsmodels glmfit that prepares a constant parameter and configuration options for poisson-GLM fitting. Please see the documentation for glmfit in statsmodels for more details. This method will automatically add a constant colum to the feature matrix Y Parameters ---------- X : array-like A nobs x k array where `nobs` is the number of observations and `k` is the number of regressors. An intercept is not included by default and should be added by the user (models specified using a formula include an intercept by default). See `statsmodels.tools.add_constant`. Y : array-like 1d array of poisson counts. This array can be 1d or 2d. ''' # check for and maybe add constant value to X if not all(X[:,0]==X[0,0]): X = hstack([ ones((shape(X)[0],1),dtype=X.dtype), X]) poisson_model = sm.GLM(Y,X,family=sm.families.Poisson()) poisson_results = poisson_model.fit() M = poisson_results.params return M
def _scale_random(self, term): # these default priors are only defined for HalfNormal priors if term.prior.args['sd'].name != 'HalfNormal': return sd_corr = term.prior.scale # recreate the corresponding fixed effect data fix_data = term.data.sum(axis=1) # handle intercepts and cell means if term.constant: mu, sd = self._get_intercept_stats() sd *= sd_corr # handle slopes else: exists = [x for x in self.dm.columns if np.array_equal(fix_data, self.dm[x].values)] # handle case where there IS a corresponding fixed effect if exists and exists[0] in self.priors.keys(): sd = self.priors[exists[0]]['sd'] # handle case where there IS NOT a corresponding fixed effect else: # the usual case: add the random effect data as a fixed effect # in the design matrix if not exists: fix_dataframe = pd.DataFrame(fix_data) # things break if column names are integers (the default) fix_dataframe.rename( columns={c: '_'+str(c) for c in fix_dataframe.columns}, inplace=True) exog = self.dm.join(fix_dataframe) # this handles the corner case where there technically is the # corresponding fixed effect, but the parameterization differs # between the fixed- and random-effect specification. usually # this means the fixed effects use cell-means coding but the # random effects use k-1 coding else: group = term.name.split('|')[1] exog = self.model.random_terms.values() exog = [v.data.sum(1) for v in exog if v.name.split('|')[-1] == group] index = ['_'+str(i) for i in range(len(exog))] exog = pd.DataFrame(exog, index=index).T # this will replace self.mle (which is missing predictors) missing = 'drop' if self.model.dropna else 'none' full_mod = sm.GLM(endog=self.model.y.data, exog=exog, family=self.model.family.smfamily(), missing=missing).fit() sd = self._get_slope_stats(exog=exog, predictor=fix_data, full_mod=full_mod, sd_corr=sd_corr) # set the prior SD. term.prior.args['sd'].update(sd=np.squeeze(np.atleast_1d(sd)))
def fit(self, y, X): """Fit the model. Args: y (pandas.DataFrame): The vector of endogenous variable. X (pandas.DataFrame): The matrix of exogenous variables. """ # Creating the GLM model from StatsModels and fitting it model = sm.GLM(y, X, family=sm.families.Binomial()) try: fitted = model.fit() except PerfectSeparationError as e: raise StatsError(str(e)) out = {} parameters = fitted.params.index # Results about the model fit. out = { "MODEL": { "log_likelihood": fitted.llf, "nobs": fitted.nobs }, } # Getting the confidence intervals conf_ints = fitted.conf_int() for param in parameters: # If GWAS, check that inference could be done on the SNP. if param == "SNPs" and np.isnan(fitted.pvalues[param]): raise StatsError("Inference did not converge.") out[param] = { "coef": fitted.params[param], "std_err": fitted.bse[param], "lower_ci": conf_ints.loc[param, 0], "upper_ci": conf_ints.loc[param, 1], "t_value": fitted.tvalues[param], "p_value": fitted.pvalues[param], } return out