The following 48 code examples, extracted from open-source Python projects, illustrate how to use statsmodels.api.OLS. Unless a snippet shows its own imports, the examples assume the usual aliases: import statsmodels.api as sm, import numpy as np, and import pandas as pd.
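Before the project examples, here is a minimal, self-contained sketch of the core pattern almost all of them share (the toy data and variable names are illustrative, not taken from any project below): build a design matrix, add an intercept with sm.add_constant, fit with sm.OLS(...).fit(), and read estimates off the results object.

import numpy as np
import statsmodels.api as sm

# Toy data: y depends linearly on x plus noise (illustrative only).
rng = np.random.default_rng(0)
x = rng.normal(size=100)
y = 2.0 + 3.0 * x + rng.normal(scale=0.5, size=100)

X = sm.add_constant(x)        # prepend an intercept column
results = sm.OLS(y, X).fit()  # ordinary least squares fit

print(results.params)         # approximately [2.0, 3.0]: intercept, slope
print(results.summary())      # full regression report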
def build_model(y, x, add_constant=True):
    '''
    Build a linear regression model from the provided data

    Provided:
    y: a series or single column dataframe holding our solution vector for linear regression
    x: a dataframe that runs parallel to y, with all the features for our linear regression
    add_constant: a boolean; if true it will add a constant column to our provided x data.
        Otherwise this method assumes you've done so already, or do not want one for some
        good reason

    Return:
    a linear regression model from StatsModels and the data which was used to train the
    model. If add_constant was true this will be a new dataframe, otherwise it will be x
    '''
    if add_constant:
        x = sm.add_constant(x)
    model = sm.OLS(y, x).fit()
    return (model, x)
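A hypothetical call, with a made-up two-column DataFrame, might look like this:

import pandas as pd

df = pd.DataFrame({'sqft': [1100.0, 1400.0, 1800.0, 2100.0],
                   'price': [200.0, 260.0, 330.0, 390.0]})
model, X = build_model(df['price'], df[['sqft']])
print(model.params)   # intercept ('const') and slope ('sqft')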
def dispersion_test(yhat, y, k=100):
    """
    Implement the regression-based dispersion test with k re-samplings.

    Args:
        yhat (np.array): predicted mutation count
        y (np.array): observed mutation count
        k (int): number of re-sampling iterations

    Returns:
        float, float: p-value, theta
    """
    theta = 0
    pval = 0
    for i in range(k):
        y_sub, yhat_sub = resample(y, yhat, random_state=i)
        # (np.power((y - yhat), 2) - y) / yhat for Poisson regression
        aux = (np.power((y_sub - yhat_sub), 2) - yhat_sub) / yhat_sub
        mod = sm.OLS(aux, yhat_sub)
        res = mod.fit()
        theta += res.params[0]
        pval += res.pvalues[0]
    theta = theta / k
    pval = pval / k
    return pval, theta
def lm(data, xseq, **params):
    """
    Fit OLS / WLS if data has weight
    """
    X = sm.add_constant(data['x'])
    Xseq = sm.add_constant(xseq)

    try:
        model = sm.WLS(data['y'], X, weights=data['weight'])
    except KeyError:
        model = sm.OLS(data['y'], X)

    results = model.fit(**params['method_args'])

    data = pd.DataFrame({'x': xseq})
    data['y'] = results.predict(Xseq)

    if params['se']:
        alpha = 1 - params['level']
        prstd, iv_l, iv_u = wls_prediction_std(results, Xseq, alpha=alpha)
        data['se'] = prstd
        data['ymin'] = iv_l
        data['ymax'] = iv_u

    return data
def _degradation_CI(results):
    '''
    Description
    -----------
    Monte Carlo estimation of uncertainty in degradation rate from OLS results

    Parameters
    ----------
    results: OLSResults object from fitting a model of the form:
        results = sm.OLS(endog=df.energy_ma, exog=df.loc[:, ['const', 'years']]).fit()

    Returns
    -------
    68.2% confidence interval for degradation rate
    '''
    sampled_normal = np.random.multivariate_normal(results.params, results.cov_params(), 10000)
    dist = sampled_normal[:, 1] / sampled_normal[:, 0]
    Rd_CI = np.percentile(dist, [50 - 34.1, 50 + 34.1]) * 100.0
    return Rd_CI
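A hypothetical end-to-end use of _degradation_CI, with synthetic monthly energy data standing in for the real inputs, could look like this:

import numpy as np
import pandas as pd
import statsmodels.api as sm

years = np.arange(0, 10, 1 / 12.0)  # ten years of monthly observations
energy = 1.0 - 0.005 * years + np.random.normal(0, 0.01, len(years))
df = pd.DataFrame({'energy_ma': energy, 'years': years})
df['const'] = 1.0

results = sm.OLS(endog=df.energy_ma, exog=df.loc[:, ['const', 'years']]).fit()
print(_degradation_CI(results))  # ~68% confidence interval for Rd, in % per year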
def __get_tail_weighted_time_period(self):
    """
    Method to approximate the weighted-average development age assuming
    exponential tail fit.

    Returns: float32
    """
    #n = self.triangle.ncol-1
    #y = self.f[:n]
    #x = np.array([i + 1 for i in range(len(y))])
    #ldf_reg = stats.linregress(x, np.log(y - 1))
    #time_pd = (np.log(self.f[n] - 1) - ldf_reg[1]) / ldf_reg[0]

    n = self.triangle.ncol - 1
    y = Series(self.f[:n])
    x = [num + 1 for num, item in enumerate(y)]
    y.index = x
    x = sm.add_constant((y.index)[y > 1])
    y = y[y > 1]
    ldf_reg = sm.OLS(np.log(y - 1), x).fit()
    time_pd = (np.log(self.f[n] - 1) - ldf_reg.params[0]) / ldf_reg.params[1]
    #tail_factor = np.exp(tail_model.params[0] + np.array([i+2 for i in range(n,n+100)]) * tail_model.params[1]).astype(float) + 1)

    return time_pd
def __init__(self, x, y, w):
    self.x = x
    self.y = y
    self.w = w
    WLS = sm.WLS(y, x, w)
    OLS = sm.OLS(WLS.wendog, WLS.wexog).fit()
    self.coefficient = OLS.params[0]
    self.WSSResidual = OLS.ssr
    self.fittedvalues = OLS.predict(x)
    self.residual = OLS.resid
    if len(x) == 1:
        self.mean_square_error = np.nan
        self.standard_error = np.nan
        self.sigma = np.nan
        self.std_resid = np.nan
    else:
        self.mean_square_error = OLS.mse_resid
        self.standard_error = OLS.params[0] / OLS.tvalues[0]
        self.sigma = np.sqrt(self.mean_square_error)
        self.std_resid = OLSInfluence(OLS).resid_studentized_internal
def __get_MCL_model(self):
    modelsI = []
    modelsP = []
    for i in range(len(self.Incurred.data.columns)):
        modelsI.append(cl.WRTO(self.Incurred.data.iloc[:, i].dropna(),
                               self.Paid.data.iloc[:, i].dropna(),
                               w=1 / self.Incurred.data.iloc[:, i].dropna()))
        modelsP.append(cl.WRTO(self.Paid.data.iloc[:, i].dropna(),
                               self.Incurred.data.iloc[:, i].dropna(),
                               w=1 / self.Paid.data.iloc[:, i].dropna()))
    q_f = np.array([item.coefficient for item in modelsI])
    qinverse_f = np.array([item.coefficient for item in modelsP])
    rhoI_sigma = np.array([item.sigma for item in modelsI])
    rhoP_sigma = np.array([item.sigma for item in modelsP])
    #y = np.log(rhoI_sigma[:-1])
    #x = np.array([i + 1 for i in range(len(y))])
    #x = sm.add_constant(x)
    #OLS = sm.OLS(y,x).fit()
    #tailsigma = np.exp((x[:,1][-1] + 1) * OLS.params[1] + OLS.params[0])
    return rhoI_sigma, rhoP_sigma, q_f, qinverse_f
def model_fit_to_dataframe(fit):
    """
    Take an object containing a statsmodels OLS model fit and extract
    the main model fit metrics into a data frame.

    Parameters
    ----------
    fit : a statsmodels fit object
        Model fit object obtained from a linear model trained using
        `statsmodels.OLS`.

    Returns
    -------
    df_fit : pandas DataFrame
        Data frame with the main model fit metrics.
    """
    df_fit = pd.DataFrame({"N responses": [int(fit.nobs)]})
    df_fit['N features'] = int(fit.df_model)
    df_fit['R2'] = fit.rsquared
    df_fit['R2_adjusted'] = fit.rsquared_adj
    return df_fit
def checkOLS(self, exog, endog, x, y):
    reference = sm.OLS(endog, sm.add_constant(exog, prepend=False)).fit()

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = ols(y=y, x=x)

    # check that sparse version is the same
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        sparse_result = ols(y=y.to_sparse(), x=x.to_sparse())
    _compare_ols_results(result, sparse_result)

    assert_almost_equal(reference.params, result._beta_raw)
    assert_almost_equal(reference.df_model, result._df_model_raw)
    assert_almost_equal(reference.df_resid, result._df_resid_raw)
    assert_almost_equal(reference.fvalue, result._f_stat_raw[0])
    assert_almost_equal(reference.pvalues, result._p_value_raw)
    assert_almost_equal(reference.rsquared, result._r2_raw)
    assert_almost_equal(reference.rsquared_adj, result._r2_adj_raw)
    assert_almost_equal(reference.resid, result._resid_raw)
    assert_almost_equal(reference.bse, result._std_err_raw)
    assert_almost_equal(reference.tvalues, result._t_stat_raw)
    assert_almost_equal(reference.cov_params(), result._var_beta_raw)
    assert_almost_equal(reference.fittedvalues, result._y_fitted_raw)

    _check_non_raw_results(result)
def ols(data, iv, dv1, dv2, dv3, title='OLS Summary', table_title=''):
    if len(table_title) == 0:
        table_title = "Independent Variable : " + str(iv)

    features = str(iv + ' ~ ' + dv1 + ' + ' + dv2 + ' + ' + dv3)
    y, X = dmatrices(features, data=data, return_type='dataframe')
    mod = sm.OLS(y, X)
    res = mod.fit()

    result = pd.DataFrame.transpose(pd.DataFrame([res.params, res.tvalues, res.pvalues, res.bse]))
    result.columns = ['coef', 't', 'p_t', 'std_error']
    data = result.round(decimals=4)

    style = """<style type="text/css">
    .tg {border-collapse:collapse;border-spacing:0;border:none;}
    .tg td{font-family:Arial, sans-serif;font-size:14px;padding:10px 5px;border-style:solid;border-width:0px;overflow:hidden;word-break:normal;}
    .tg th{font-family:Arial, sans-serif;font-size:14px;font-weight:normal;padding:10px 5px;border-style:solid;border-width:0px;overflow:hidden;word-break:normal;}
    .tg .tg-ejgj{font-family:Verdana, Geneva, sans-serif !important;;vertical-align:top}
    .tg .tg-anay{font-family:Verdana, Geneva, sans-serif !important;;text-align:right;vertical-align:top}
    .tg .tg-jua3{font-weight:bold;font-family:Verdana, Geneva, sans-serif !important;;text-align:right;vertical-align:top}
    h5{font-family:Verdana;} h4{font-family:Verdana;}
    hr{height: 3px; background-color: #333;} .hr2{height: 1px; background-color: #333;}
    </style>"""

    header = ('<table class="tg" style="undefined;table-layout: fixed; width: 500px; '
              'border-style: hidden; border-collapse: collapse;"> <colgroup> '
              '<col style="width: 150px"> <col style="width: 120px"> <col style="width: 120px"> '
              '<col style="width: 120px"> <col style="width: 120px"> </colgroup> '
              '<h5>' + str(table_title) + '</h5> <h4><i>' + str(title) + '</i></h4> '
              '<hr align="left", width="630">')

    # one header cell per statistic, then one row per regression term
    cells = '<tr> <th class="tg-ejgj"></th>'
    for key in data.keys():
        cells += ' <th class="tg-anay">' + str(key) + '</th>'
    cells += ' </tr>'
    for i in range(4):
        cells += ' <tr> <td class="tg-ejgj">' + str(data.index[i]) + '</td>'
        for key in data.keys():
            cells += ' <td class="tg-jua3">' + str(data[key][i]) + '</td>'
        cells += ' </tr>'

    html = display(HTML(style + ' ' + header + ' ' + cells))
    return html
def estimate(self):
    """
    Estimate the FE model with OLS

    :return: results
    """
    # demean
    self.__demean()
    # first, make a dataframe out of the panel of indvars
    x_dataframe = self.x_demeaned.transpose(2, 0, 1).to_frame(False)  # set to False to not drop NaNs
    # unstack the depvar dataframe into a series
    y_series = self.y_demeaned.unstack()
    # fit regression model with statsmodels
    results = sm.OLS(y_series.values, x_dataframe.values, missing='drop').fit()
    print(results.summary(self.depvar, self.indvars))
    self.result = results
def estimate(self):
    """
    Estimate the first-differenced OLS

    :return: results
    """
    # first difference data
    self.__first_diff()
    # first, make a dataframe out of the panel of indvars
    x_dataframe = self.fd_x.transpose(2, 0, 1).to_frame(False)  # set to False to not drop NaNs
    # unstack the depvar dataframe into a series
    y_series = self.fd_y.unstack()
    # fit regression model with statsmodels
    results = sm.OLS(y_series.values, x_dataframe.values, missing='drop').fit()
    print(results.summary(self.depvar, self.indvars))
    self.result = results
def solution2():
    print("Start problem 2")
    print("Start data preparation for problem 2")
    dataPreparation()
    print("Results: ")
    tags = ['tweets_#gohawks', 'tweets_#gopatriots', 'tweets_#nfl',
            'tweets_#patriots', 'tweets_#sb49', 'tweets_#superbowl']
    # features: tweets, retweets, sum of followers, maximum followers, time of the day
    for tag in tags:
        tweets, parameters = [], []
        f = open('problem 2 ' + tag + ' data.txt')
        line = f.readline()
        while len(line):
            p = line.split()
            tweets.append(float(p[0]))
            parameters.append([float(p[i]) for i in range(len(p))])
            line = f.readline()
        # align so that each hour's count is predicted from the previous hour's features
        del(tweets[0])
        next_hour_tweets = tweets
        parameters.pop()
        res = sm.OLS(next_hour_tweets, parameters).fit()
        print("Result of " + tag)
        print(res.summary())
        f.close()
def macro_reg_ret(ret, macro_data):
    '''
    Regress each stock's return series on each macro factor, one factor at a
    time, and collect the estimated factor loadings.

    :param DataFrame ret: stock returns (time x stocks)
    :param DataFrame macro_data: macro factor data (time x factors)
    :return: DataFrame macro_loading: loading of each stock on each macro factor
    '''
    macro_loading = pd.DataFrame(np.zeros([ret.shape[1], macro_data.shape[1]]))
    # loop over stocks
    for i in range(ret.shape[1]):
        y = ret.values[:, i]
        # regress on each macro factor separately
        for j in range(macro_data.shape[1]):
            x = macro_data.values[:, j]
            model = sm.OLS(y, x).fit()
            macro_loading.iloc[i, j] = model.params[0]
    return macro_loading
def ret_reg_loading(tech_loading, ret, dummy):
    '''
    For each technical factor, run a cross-sectional regression of returns on
    that factor's loading (plus industry dummies) on every day, and count the
    days on which the factor is significant.

    :param tech_loading: technical factor loadings per stock
    :param ret: daily returns (days x stocks)
    :param dummy: industry dummy variables
    :return: dict significant_days: number of significant days per factor
    '''
    # initialize the per-factor counters
    significant_days = dict()
    for tech in tech_loading.columns:
        significant_days[tech] = 0
    # regress day by day on each factor's loading
    for tech in tech_loading.columns:
        for i in range(ret.shape[0]):
            model = sm.OLS(ret.iloc[i, :].values,
                           pd.concat([tech_loading[tech], dummy], axis=1).values).fit()
            pvalue = model.pvalues[0]
            if pvalue < 0.1:
                significant_days[tech] += 1
    return significant_days
def ind_reg_ret(ind_ret, ret):
    '''
    Regress each stock's returns on each industry's returns, one industry at a
    time, and collect the estimated loadings.

    :param DataFrame ind_ret: industry returns (time x industries)
    :param DataFrame ret: stock returns (time x stocks)
    :return: DataFrame ind_loading: loading of each stock on each industry
    '''
    # p-value significance threshold (defined in the original but unused below)
    pvalue_threshold = 0.1
    ind_loading = np.zeros([ret.shape[1], ind_ret.shape[1]])
    # loop over stocks
    for i in range(ret.shape[1]):
        # regress on each industry separately
        for j in range(ind_ret.shape[1]):
            model = sm.OLS(ret.values[:, i], ind_ret.values[:, j]).fit()
            # record the estimated loading
            ind_loading[i, j] = model.params[0]
    ind_loading = pd.DataFrame(ind_loading, columns=ind_ret.columns)
    return ind_loading
def Regress(self, print_data):
    a = np.ndarray(shape=(len(self._rows), len(self._REGRESS_COLUMNS)))
    b = np.zeros(shape=(len(self._rows)))
    rn = 0
    if print_data:
        print('Found %d rows' % (len(self._rows)))
        print('Columns')
        print('\t'.join(sorted(self._REGRESS_COLUMNS)))
    for juris, cols in self._rows.iteritems():
        if not self._IsValidInputData(cols):
            continue
        b[rn] = self._values[juris]
        cn = 0
        for c in sorted(self._REGRESS_COLUMNS):
            a[rn, cn] = cols[c]
            cn += 1
        if print_data:
            print('\t'.join(str(x) for x in a[rn:rn+1, ][0].tolist()))
        rn += 1
    a.resize((rn, len(self._REGRESS_COLUMNS)))
    b.resize(rn)
    if print_data:
        print('LogValue')
        print('\n'.join(str(x) for x in b.tolist()))
    results = sm.OLS(b, a).fit()
    print(results.summary(yname='log(Votes)',
                          xname=sorted(self._REGRESS_COLUMNS),
                          alpha=0.01))
    i = 0
    for juris, cols in sorted(self._rows.iteritems()):
        p = []
        for c in sorted(self._REGRESS_COLUMNS):
            p.append(cols[c])
        pred = results.predict(p)
        diff = math.e**self._values[juris] - math.e**pred
        print('%-20s\t%7d\t%7d\t%7.4f\t%7d' % (juris, math.e**pred,
                                               math.e**self._values[juris],
                                               diff / math.e**pred, diff))
    return results
def get_tail_factor(self, colname_sep='-'):
    """Estimate tail factor, idea from Thomas Mack:
    Returns a tail factor based off of an exponential fit to the LDFs. This will
    fail if the product of the 2nd and 3rd to last LDF is < 1.0001. This also
    fails if the estimated tail is larger than 2.0. In other areas, the
    exponential fit is rejected if the slope parameter p-value > 0.5. This is
    currently representative of the R implementation of this package, but may
    be enhanced in the future to be p-value based.

    Parameters:
        colname_sep : str
            text to join the names of two adjacent columns representing the
            age-to-age factor column name.

    Returns:
        Pandas.Series of the tail factor.
    """
    LDF = np.array(self.get_LDF()[:self.triangle.ncol - 1])
    if self.tail == False:
        tail_factor = 1
    elif len(LDF[LDF > 1]) < 2:
        warn("Not enough factors larger than 1.0 to fit an exponential regression.")
        tail_factor = 1
    elif (LDF[-3] * LDF[-2] > 1.0001):
        y = Series(LDF)
        x = sm.add_constant((y.index + 1)[y > 1])
        y = LDF[LDF > 1]
        n, = np.where(LDF == y[-1])[0]
        tail_model = sm.OLS(np.log(y - 1), x).fit()
        tail_factor = np.product(np.exp(tail_model.params[0] +
                                        np.array([i + 2 for i in range(n, n + 100)]) *
                                        tail_model.params[1]).astype(float) + 1)
        if tail_factor > 2:
            warn("The estimated tail factor was bigger than 2 and has been reset to 1.")
            tail_factor = 1
        if tail_model.f_pvalue > 0.05:
            warn("The p-value of the exponential tail fit is insignificant and tail has been set to 1.")
            tail_factor = 1
    else:
        warn("LDF[-3] * LDF[-2] is not greater than 1.0001 and tail has been set to 1.")
        tail_factor = 1
    return Series(tail_factor, index=[self.triangle.data.columns[-1] + colname_sep + 'Ult'])
def apply_half_life(self, time_series):
    lag = np.roll(time_series, 1)
    lag[0] = 0
    ret = time_series - lag
    ret[0] = 0

    # adds intercept terms to X variable for regression
    lag2 = sm.add_constant(lag)

    model = sm.OLS(ret, lag2)
    res = model.fit()

    self.half_life = -np.log(2) / res.params[1]
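This is the standard half-life estimate for a mean-reverting series: one-period changes are regressed on the lagged level, and the half-life follows from the slope as -ln(2)/slope. A self-contained sketch on a synthetic AR(1) series (the data here is assumed, not from the project):

import numpy as np
import statsmodels.api as sm

# Synthetic AR(1) series with coefficient 0.9, so the true half-life
# is ln(2) / (1 - 0.9), roughly 6.9 periods.
rng = np.random.default_rng(42)
x = np.zeros(500)
for t in range(1, 500):
    x[t] = 0.9 * x[t - 1] + rng.normal()

lag = np.roll(x, 1)
lag[0] = 0
ret = x - lag
ret[0] = 0

res = sm.OLS(ret, sm.add_constant(lag)).fit()
print(-np.log(2) / res.params[1])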
def hedge_ratio(Y, X):
    # Look into using Kalman Filter to calculate the hedge ratio
    X = sm.add_constant(X)
    model = sm.OLS(Y, X).fit()
    return model.params[1]
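In a pairs-trading context the returned slope sizes the offsetting leg. A usage sketch with two made-up, roughly cointegrated price series:

import numpy as np

rng = np.random.default_rng(7)
a = np.cumsum(rng.normal(size=250)) + 100.0    # random-walk price of asset A
b = 1.5 * a + rng.normal(scale=0.5, size=250)  # asset B moves ~1.5x with A

ratio = hedge_ratio(b, a)
print(ratio)             # close to 1.5
spread = b - ratio * a   # the spread a pairs trader would monitor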
def ols_coefficients_to_dataframe(coefs):
    """
    Take a series containing OLS coefficients and convert it to a data frame.

    Parameters
    ----------
    coefs : pandas Series
        Series with feature names in the index and the coefficient values
        as the data, obtained from a linear model trained using
        `statsmodels.OLS`.

    Returns
    -------
    df_coef : pandas DataFrame
        Data frame with two columns, the first being the feature name and
        the second being the coefficient value.

    Note
    ----
    The first row in the output data frame is always for the intercept
    and the rest are sorted by feature name.
    """
    # first create a sorted data frame for all the non-intercept features
    non_intercept_columns = [c for c in coefs.index if c != 'const']
    df_non_intercept = pd.DataFrame(coefs.filter(non_intercept_columns),
                                    columns=['coefficient'])
    df_non_intercept.index.name = 'feature'
    df_non_intercept = df_non_intercept.sort_index()
    df_non_intercept.reset_index(inplace=True)

    # now create a data frame that just has the intercept
    df_intercept = pd.DataFrame([{'feature': 'Intercept',
                                  'coefficient': coefs['const']}])

    # append the non-intercept frame to the intercept one
    df_coef = df_intercept.append(df_non_intercept, ignore_index=True)

    # we always want to have the feature column first
    df_coef = df_coef[['feature', 'coefficient']]

    return df_coef
def checkForSeries(self, x, y, series_x, series_y, **kwds):
    # Consistency check with simple OLS.
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = ols(y=y, x=x, **kwds)
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        reference = ols(y=series_y, x=series_x, **kwds)
    self.compare(reference, result)
def __init__(self, y, x, intercept=True, weights=None, nw_lags=None,
             nw_overlap=False):
    import warnings
    warnings.warn("The pandas.stats.ols module is deprecated and will be "
                  "removed in a future version. We refer to external packages "
                  "like statsmodels, see some examples here: "
                  "http://statsmodels.sourceforge.net/stable/regression.html",
                  FutureWarning, stacklevel=4)

    try:
        import statsmodels.api as sm
    except ImportError:
        import scikits.statsmodels.api as sm

    self._x_orig = x
    self._y_orig = y
    self._weights_orig = weights
    self._intercept = intercept
    self._nw_lags = nw_lags
    self._nw_overlap = nw_overlap

    (self._y, self._x, self._weights, self._x_filtered,
     self._index, self._time_has_obs) = self._prepare_data()

    if self._weights is not None:
        self._x_trans = self._x.mul(np.sqrt(self._weights), axis=0)
        self._y_trans = self._y * np.sqrt(self._weights)
        self.sm_ols = sm.WLS(self._y.get_values(),
                             self._x.get_values(),
                             weights=self._weights.values).fit()
    else:
        self._x_trans = self._x
        self._y_trans = self._y
        self.sm_ols = sm.OLS(self._y.get_values(),
                             self._x.get_values()).fit()
def _prepare_data(self):
    """
    Cleans the input for single OLS.

    Parameters
    ----------
    lhs: Series
        Dependent variable in the regression.
    rhs: dict, whose values are Series, DataFrame, or dict
        Explanatory variables of the regression.

    Returns
    -------
    Series, DataFrame
        Cleaned lhs and rhs
    """
    (filt_lhs, filt_rhs, filt_weights,
     pre_filt_rhs, index, valid) = _filter_data(self._y_orig, self._x_orig,
                                                self._weights_orig)
    if self._intercept:
        filt_rhs['intercept'] = 1.
        pre_filt_rhs['intercept'] = 1.

    if hasattr(filt_weights, 'to_dense'):
        filt_weights = filt_weights.to_dense()

    return (filt_lhs, filt_rhs, filt_weights,
            pre_filt_rhs, index, valid)
def summary_as_matrix(self):
    """Returns the formatted results of the OLS as a DataFrame."""
    results = self._results
    beta = results['beta']
    data = {'beta': results['beta'],
            't-stat': results['t_stat'],
            'p-value': results['p_value'],
            'std err': results['std_err']}
    return DataFrame(data, beta.index).T
def Orthogonalize(left_data, right_data, left_name, right_name):
    """Orthogonalize one factor against another, date by date.

    Both inputs are indexed by [date, IDs] with factor columns
    [date, IDs, factor1, factor2, ...].

    Parameters
    ----------
    left_name: str
        column name of the factor to be orthogonalized
    right_name: str or list
        column name(s) of the factor(s) to orthogonalize against
    """
    def OLS(data, left_name):
        tempData = data.copy()
        yData = np.array(tempData.pop(left_name))
        xData = np.array(tempData)
        NaNInd = pd.notnull(yData) & pd.notnull(xData).all(axis=1)
        model = sm.OLS(yData, xData, missing='drop')
        res = model.fit()
        data[left_name + '_orthogonalized'] = np.nan
        data.ix[NaNInd, left_name + '_orthogonalized'] = res.resid
        return data

    factor_1 = left_data[[left_name]]
    if not isinstance(right_name, list):
        right_name = [right_name]
    factor_2 = right_data[right_name]
    factor = pd.concat([factor_1, factor_2], axis=1)
    factor['alpha'] = 1  # intercept term
    factor = factor.groupby(level=0).apply(OLS, left_name=left_name)
    return factor[[left_name + '_orthogonalized']]
def Calculate_Liquidity_Coeff(df_list):
    # list of lists of liquidity coefficients per bond
    liq_arr_list = []
    #print 'df_list size: ', len(df_list)
    for df in df_list:
        if df.empty:
            continue
        # A temporary array for holding liquidity beta for each month
        liq_arr = [np.nan] * num_months_CONST
        #print df['cusip_id'][0]
        # Group dataframe on index by month
        for date, df_group in df.groupby(pd.TimeGrouper("M")):
            month = ''.join([str(date.month), str(date.year)])
            month_key = month_keys[month]
            # When there are some data in the current month,
            if df_group.shape[0] > 0:
                # Run regression (as equation (2)) to get liquidity measure
                y, X = dmatrices('excess_return_1 ~ yld_pt + volume_and_sign',
                                 data=df_group, return_type='dataframe')
                #print date, X.shape
                mod = sm.OLS(y, X)
                res = mod.fit()
                # set specific months with liquidity factors;
                # res.params[2] is the liquidity coefficient
                liq_arr[month_key] = res.params[2]
        liq_arr_list.append(liq_arr)  # store all liquidity coefficients for each month per bond
    return liq_arr_list
def Run_Regression(liq_month_list):
    df = pd.DataFrame(index=month_list)  # set dates as index
    # Liquidity pi_t
    df['liq_month_list_1'] = liq_month_list
    # Liquidity pi_t-1
    df['liq_month_list'] = df['liq_month_list_1'].shift(1)
    # After the shift, the first row is no longer valid data
    df = df.drop(df.index[0], inplace=False)

    ## FIX A scaling factor to delta_pi_t is dropped here because we do not
    ## have historical amount outstanding of the bonds in the portfolio.
    ## The scaling factor is (M_t-1 - M_1), where M_t is total dollar value at
    ## end of month t-1, representing the total dollar value of the bonds
    ## in month t

    # delta_pi_t = pi_t - pi_t-1
    df['liq_delta_1'] = df['liq_month_list_1'] - df['liq_month_list']
    # delta_pi_t-1
    df['liq_delta'] = df['liq_delta_1'].shift(1)
    # After the shift, the first row is no longer valid data
    df = df.drop(df.index[0], inplace=False)

    # Run linear regression using equation (4)
    y, X = dmatrices('liq_delta_1 ~ liq_delta + liq_month_list',
                     data=df, return_type='dataframe')
    mod = sm.OLS(y, X)
    res = mod.fit()

    # Calculate the predicted change in liquidity values
    df['liq_proxy_values'] = \
        res.params[0] + res.params[1] * df['liq_delta'] + \
        res.params[2] * df['liq_month_list']

    # Calculate the actual - predicted change in liquidity --> the residual
    # term --> the liquidity risk
    df['residual_term'] = df['liq_delta_1'] - df['liq_proxy_values']

    # Scale the magnitude of the liquidity risk for convenient use in later steps
    df['residual_term'] = df['residual_term'] * 10000
    return df
def fit(self, y, X, handler=None):
    """Fit the model.

    Args:
        y (pandas.DataFrame): The vector of endogenous variable.
        X (pandas.DataFrame): The matrix of exogenous variables.
    """
    # Creating the OLS model from StatsModels and fitting it
    model = sm.OLS(y, X)
    handler = self._results_handler if handler is None else handler
    return handler(model.fit())
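The handler argument lets callers post-process the fitted results. A minimal sketch of the idea outside the class (params_handler here is a hypothetical stand-in for the snippet's _results_handler):

import numpy as np
import pandas as pd
import statsmodels.api as sm

X = pd.DataFrame({'const': 1.0, 'x': np.arange(10.0)})
y = pd.DataFrame({'y': 3.0 + 0.5 * np.arange(10.0)})

def params_handler(fitted):
    # keep only the coefficients, discarding the rest of the results object
    return fitted.params

model = sm.OLS(y, X)
print(params_handler(model.fit()))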
def ols(self, x, y):
    # drop the diagonal entries of the flattened square matrices before regressing
    xflatten = np.delete(x, [i * (x.shape[0] + 1) for i in range(x.shape[0])])
    yflatten = np.delete(y, [i * (y.shape[0] + 1) for i in range(y.shape[0])])
    xflatten = sm.add_constant(xflatten)
    model = sm.OLS(yflatten, xflatten)
    results = model.fit()
    print results.summary()
def regression_apply(row, timepoints, weighted):
    """
    :py:meth:`pandas.DataFrame.apply` apply function for calculating
    enrichment using linear regression. If *weighted* is ``True`` perform
    weighted least squares; else perform ordinary least squares.

    Weights for weighted least squares are included in *row*.

    Returns a :py:class:`pandas.Series` containing regression coefficients,
    residuals, and statistics.
    """
    # retrieve log ratios from the row
    y = row[['L_{}'.format(t) for t in timepoints]]

    # re-scale the x's to fall within [0, 1]
    xvalues = [x / float(max(timepoints)) for x in timepoints]

    # perform the fit
    X = sm.add_constant(xvalues)  # fit intercept
    if weighted:
        W = row[['W_{}'.format(t) for t in timepoints]]
        fit = sm.WLS(y, X, weights=W).fit()
    else:
        fit = sm.OLS(y, X).fit()

    # re-format as a data frame row
    values = np.concatenate([fit.params,
                             [fit.bse['x1'], fit.tvalues['x1'], fit.pvalues['x1']],
                             fit.resid])
    index = ['intercept', 'slope', 'SE_slope', 't', 'pvalue_raw'] + \
            ['e_{}'.format(t) for t in timepoints]
    return pd.Series(data=values, index=index)
def validate(self):
    """
    Raises an informative ``ValueError`` if the time points in the analysis
    are not suitable. Calls validate method on all child SeqLibs.
    """
    # check the time points
    if 0 not in self.timepoints:
        raise ValueError("Missing timepoint 0 [{}]".format(self.name))

    if self.timepoints[0] != 0:
        raise ValueError("Invalid negative timepoint [{}]".format(self.name))

    if len(self.timepoints) < 2:
        raise ValueError("Multiple timepoints required [{}]".format(self.name))
    elif len(self.timepoints) < 3 and self.scoring_method in ("WLS", "OLS"):
        raise ValueError("Insufficient number of timepoints for regression scoring [{}]".format(self.name))

    # check the wild type sequences
    if self.has_wt_sequence():
        for child in self.children[1:]:
            if self.children[0].wt != child.wt:
                logging.warning("Inconsistent wild type sequences",
                                extra={'oname': self.name})
                break

    # check that we're not doing wild type normalization on something with no wild type
    #if not self.has_wt_sequence() and self.logr_method == "wt":
    #    raise ValueError("No wild type sequence for wild type normalization [{}]".format(self.name))

    # validate children
    for child in self.children:
        child.validate()
def calculate(self):
    """
    Wrapper method to calculate counts and enrichment scores
    for all data in the :py:class:`~selection.Selection`.
    """
    if len(self.labels) == 0:
        raise ValueError("No data present across all sequencing libraries [{}]".format(self.name))
    for label in self.labels:
        self.merge_counts_unfiltered(label)
        self.filter_counts(label)
    if self.is_barcodevariant() or self.is_barcodeid():
        self.combine_barcode_maps()
    if self.scoring_method == "counts":
        pass
    elif self.scoring_method == "ratios":
        for label in self.labels:
            self.calc_ratios(label)
    elif self.scoring_method == "simple":
        for label in self.labels:
            self.calc_simple_ratios(label)
    elif self.scoring_method in ("WLS", "OLS"):
        if len(self.timepoints) <= 2:
            raise ValueError("Regression-based scoring requires three or more time points.")
        for label in self.labels:
            self.calc_log_ratios(label)
            if self.scoring_method == "WLS":
                self.calc_weights(label)
            self.calc_regression(label)
    else:
        raise ValueError('Invalid scoring method "{}" [{}]'.format(self.scoring_method, self.name))
    if self.scoring_method in ("ratios", "WLS", "OLS") and self.component_outliers:
        if self.is_barcodevariant() or self.is_barcodeid():
            self.calc_outliers("barcodes")
        if self.is_coding():
            self.calc_outliers("variants")
def calc_regression(self, label):
    """
    Calculate least squares regression for *label*. If the scoring method is
    ``"WLS"``, calculates weighted least squares; else ordinary least squares.

    Regression results are stored in ``'/main/label/scores'``
    """
    if self.check_store("/main/{}/scores".format(label)):
        return
    elif "/main/{}/scores".format(label) in self.store.keys():
        # need to remove the current keys because we are using append
        self.store.remove("/main/{}/scores".format(label))

    logging.info("Calculating {} regression coefficients ({})".format(self.scoring_method, label),
                 extra={'oname': self.name})
    # append is required because it takes the "min_itemsize" argument, and put doesn't
    longest = self.store.select("/main/{}/log_ratios".format(label),
                                "columns='index'").index.map(len).max()
    chunk = 1
    if self.scoring_method == "WLS":
        for data in self.store.select_as_multiple(["/main/{}/log_ratios".format(label),
                                                   "/main/{}/weights".format(label)],
                                                  chunksize=self.chunksize):
            logging.info("Calculating weighted least squares for chunk {} ({} rows)".format(chunk, len(data.index)),
                         extra={'oname': self.name})
            result = data.apply(regression_apply, args=[self.timepoints, True], axis="columns")
            self.store.append("/main/{}/scores".format(label), result,
                              min_itemsize={"index": longest})
            chunk += 1
    elif self.scoring_method == "OLS":
        for data in self.store.select("/main/{}/log_ratios".format(label), chunksize=self.chunksize):
            logging.info("Calculating ordinary least squares for chunk {} ({} rows)".format(chunk, len(data.index)),
                         extra={'oname': self.name})
            result = data.apply(regression_apply, args=[self.timepoints, False], axis="columns")
            self.store.append("/main/{}/scores".format(label), result,
                              min_itemsize={"index": longest})
            chunk += 1
    else:
        raise ValueError('Invalid regression scoring method "{}" [{}]'.format(self.scoring_method, self.name))

    # need to read from the file, calculate percentiles, and rewrite it
    logging.info("Calculating slope standard error percentiles ({})".format(label),
                 extra={'oname': self.name})
    data = self.store['/main/{}/scores'.format(label)]
    data['score'] = data['slope']
    data['SE'] = data['SE_slope']
    data['SE_pctile'] = [stats.percentileofscore(data['SE'], x, "weak") for x in data['SE']]
    # reorder columns
    data = data[['score', 'SE', 'SE_pctile', 'slope', 'intercept', 'SE_slope', 't', 'pvalue_raw']]
    self.store.put("/main/{}/scores".format(label), data, format="table",
                   data_columns=data.columns)
def breusch_pagan_test(y, x):
    results = sm.OLS(y, x).fit()
    resid = results.resid
    n = len(resid)
    sigma2 = sum(resid**2) / n
    f = resid**2 / sigma2 - 1
    results2 = sm.OLS(f, x).fit()
    fv = results2.fittedvalues
    bp = 0.5 * sum(fv**2)
    df = results2.df_model
    # the BP statistic is asymptotically chi-squared distributed; the original
    # used sp.stats.chi.cdf (the chi distribution), which appears to be a bug
    p_value = 1 - sp.stats.chi2.cdf(bp, df)
    return round(bp, 6), df, round(p_value, 7)
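A quick check on heteroscedastic toy data (assumed data; a small p-value rejects homoscedasticity). Recent statsmodels versions also ship their own implementation, statsmodels.stats.diagnostic.het_breuschpagan, which is worth comparing against:

import numpy as np
import scipy as sp
import scipy.stats  # ensures sp.stats is available for the function above
import statsmodels.api as sm

rng = np.random.default_rng(3)
x = rng.uniform(1, 10, 200)
y = 1.0 + 2.0 * x + rng.normal(scale=x)  # noise variance grows with x

X = sm.add_constant(x)
bp, df, p = breusch_pagan_test(y, X)
print(bp, df, p)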
def sm_multireg(self, Xtrain, ytrain, Xtest, ytest):
    self.normalize(Xtrain)
    results = sm.OLS(ytrain, Xtrain).fit_regularized()
    #print "coefficient: ", results.params

    # train accuracy
    predictions = results.predict(Xtrain)
    correct = 0
    for i in range(len(predictions)):
        if round(predictions[i], 1) == ytrain[i]:
            correct += 1
    accuracy = correct * 1.0 / len(ytrain)
    print "train accuracy: ", accuracy * 100, "%"

    # calculate SSE, SSM & SST
    SSE = 0
    for i in range(len(predictions)):
        SSE += (predictions[i] - ytrain[i])**2
    yAverage = np.mean(ytrain)
    SSM = 0
    for pred in predictions:
        SSM += (pred - yAverage)**2
    print "SSM:", SSM
    SST = SSE + SSM
    print "SST:", SST

    # calculate PVE = SSM / SST
    PVE = SSM / SST
    print "PVE:", PVE

    # test accuracy
    self.normalize(Xtest)
    predictions = results.predict(Xtest)
    correct = 0
    for i in range(len(predictions)):
        print round(predictions[i], 1), ytest[i]
        if round(predictions[i], 1) == ytest[i]:
            correct += 1
    accuracy = correct * 1.0 / len(ytest)
    print "test accuracy: ", accuracy * 100, "%"

    return results
def fit_ols(y, x, idx=-1):
    ols = OLS(y, add_constant(x))
    results = ols.fit()
    return results.params.values[idx], results.cov_params().values[idx, idx]
def weak_instruments(self, n_sims=20):
    np.random.seed(1692)

    model = feedforward.FeedForwardModel(19, 1, dense_size=60, n_dense_layers=2)

    treatment_effects = []
    ols_betas, ols_ses = [], []
    old_corrs, new_corrs = [], []
    for _ in xrange(n_sims):
        df = self.treatment_gen.simulate_data(False)

        X = np.hstack((self.x, df['new_treat'].values[:, None]))
        Z = np.hstack((self.x, df['instrument'].values[:, None]))

        ols_beta, ols_se = self.fit_ols(df['treatment_effect'], X)
        ols_betas.append(ols_beta)
        ols_ses.append(ols_se)

        old_corr = df[['instrument', 'new_treat']].corr().values[0, 1]
        new_instrument, new_corr = model.fit_instruments(X, Z, df['treatment_effect'].values, batchsize=128)
        new_corrs.append(new_corr)
        old_corrs.append(old_corr)

        Z2 = Z.copy()
        Z2[:, -1] = new_instrument[:, 0]
        iv = IV2SLS(df['treatment_effect'].values.flatten(), add_constant(X), add_constant(Z2))

        model.reset_params()

    if new_corr:
        logger.info("Old corr: %.2f, New corr: %.2f", np.mean(old_corrs), np.mean(new_corrs))
    logger.info("Treatment effect (OLS): %.3f (%.4f)", np.mean(ols_betas), np.mean(ols_ses))
    logger.info("Treatment effect: %.3f (%.4f)", np.mean(treatment_effects), np.std(treatment_effects))
def estimate(self, n_sims=20):
    np.random.seed(1692)

    model = feedforward.FeedForwardModel(19, 1, dense_size=10, n_dense_layers=2)

    treatment_effects = []
    ols_betas, ols_ses = [], []
    old_corrs, new_corrs = [], []
    for _ in xrange(n_sims):
        df = self.treatment_gen.simulate_data(True)

        X = np.hstack((self.x, df['new_treat'].values[:, None]))
        Z = np.hstack((self.x, df['instrument'].values[:, None]))

        ols_beta, ols_se = self.fit_ols(df['treatment_effect'], X)
        ols_betas.append(ols_beta)
        ols_ses.append(ols_se)

        new_corr = model.fit(X, Z, df['treatment_effect'].values, instrument=True, batchsize=64)
        if new_corr:
            old_corr = df[['instrument', 'new_treat']].corr().values[0, 1]
            new_corrs.append(new_corr)
            old_corrs.append(old_corr)

        treatment_effects.append(model.get_treatment_effect(X))
        model.reset_params()

    if new_corr:
        logger.info("Old corr: %.2f, New corr: %.2f", np.mean(old_corrs), np.mean(new_corrs))
    logger.info("Treatment effect (OLS): %.3f (%.4f)", np.mean(ols_betas), np.mean(ols_ses))
    logger.info("Treatment effect: %.3f (%.4f)", np.mean(treatment_effects), np.std(treatment_effects))
def solution3():
    print("Start problem 3")
    print("Start data preparation for problem 3")
    # first prepare data for problem 3; all text files will be saved at the current path
    data_preparation()
    print("Results: ")
    tags = ['tweets_#gohawks', 'tweets_#gopatriots', 'tweets_#nfl',
            'tweets_#patriots', 'tweets_#sb49', 'tweets_#superbowl']
    # idx for tag index in tags
    idx = 0
    for tag in tags:
        tweets, parameters = [], []
        f = open('problem 3 ' + tag + ' data.txt')
        line = f.readline()
        while len(line):
            p = line.split()
            tweets.append(float(p[0]))
            parameters.append([float(p[i]) for i in range(len(p))])
            line = f.readline()
        # align so that each hour's count is predicted from the previous hour's features
        del(tweets[0])
        next_hour_tweets = np.array(tweets)
        parameters.pop()
        X = np.matrix(parameters)
        res = sm.OLS(next_hour_tweets, X).fit()
        print("Result of " + tag)
        print(res.summary())
        if tag == 'tweets_#gohawks' or tag == 'tweets_#nfl' or tag == 'tweets_#patriots':
            scatterPlot(idx, 0, 1, 2, X, next_hour_tweets, tag)
        elif tag == 'tweets_#gopatriots':
            scatterPlot(idx, 1, 3, 4, X, next_hour_tweets, tag)
        elif tag == 'tweets_#sb49':
            scatterPlot(idx, 0, 1, 5, X, next_hour_tweets, tag)
        elif tag == 'tweets_#superbowl':
            scatterPlot(idx, 0, 2, 5, X, next_hour_tweets, tag)
        idx += 1
        f.close()
def macro_regression(ret, macro_index, macro_index_list):
    '''
    Regress each stock's returns on each macro factor, one factor at a time;
    collect the loadings and count, per factor, the number of stocks for which
    the factor is significant.

    :param DataFrame ret: stock returns (time x stocks)
    :param DataFrame macro_index: macro factor data (time x factors)
    :param [str] macro_index_list: names of the macro factors
    :return: [DataFrame, dict] [loading, significant_list]: factor loadings;
        per-factor count of significant regressions
    '''
    loading = pd.DataFrame(np.zeros([ret.shape[1], macro_index.shape[1]]),
                           columns=macro_index.columns)
    # counter of significant regressions per factor
    significant_list = dict()
    for i in range(len(macro_index_list)):
        significant_list[macro_index_list[i]] = 0
    # p-value significance threshold
    pvalue_threshold = 0.1
    # loop over stocks
    for i in range(ret.shape[1]):
        y = ret.values[:, i]
        # regress on each macro factor separately
        for j in range(len(macro_index_list)):
            x = macro_index.values[:, j]
            model = sm.OLS(y, x).fit()
            loading.iloc[i, j] = model.params[0]
            # compare the single p-value explicitly
            # (the original compared the whole pvalues array)
            if model.pvalues[0] < pvalue_threshold:
                significant_list[macro_index_list[j]] += 1
    return [loading, significant_list]
def rm_reg_ri(rm, ret):
    '''
    Regress each stock's returns on the market return and collect the betas.
    '''
    # one loading per stock
    X = np.array(np.zeros([ret.shape[1], 1]))
    # loop over stocks
    for i in range(ret.shape[1]):
        model = sm.OLS(ret.values[:, i], rm.values).fit()
        X[i, :] = model.params[0]
    X = pd.DataFrame(X)
    return X
def fb_reg_over_time(ret, data, interval):
    '''
    First-pass time-series regressions: for each stock, regress its returns on
    each factor over the given time interval and record the estimated loadings.

    :param DataFrame ret: returns (time x stocks)
    :param {string: DataFrame} data: factor data keyed by factor name
    :param [int] interval: time interval over which to run the regressions
    :return: DataFrame X: loadings, one row per stock and one column per factor
    '''
    # X[i, j] is the loading of stock (i+1) on factor (j+1) (rows: stocks, cols: factors)
    X = np.zeros([ret.shape[1], len(data)])
    # num_of_factor: index of the current factor, incremented by 1 per factor
    num_of_factor = 0
    # name of factors, prepared for converting X to DataFrame, with columns=factor_name
    factor_name = []
    # iterate over factors; i is a tuple of (factor name, factor DataFrame)
    for i in data.items():
        factor_name = factor_name + [i[0]]
        interval_data = i[1].ix[interval]
        # regress each stock's returns on the current factor
        for j in range(i[1].shape[1]):
            model = sm.OLS(ret[j].values, interval_data[j].values).fit()
            # record the estimated loading
            X[j, num_of_factor] = model.params[0]
        num_of_factor += 1
    # convert X to a DataFrame
    X = pd.DataFrame(X)
    X.fillna(0, inplace=True)
    X.columns = factor_name
    return X
def fb_reg_over_all_time(ret, data):
    '''
    Same as fb_reg_over_time, but the regressions are run over the full sample
    instead of a sub-interval.

    :param DataFrame ret: returns (time x stocks)
    :param {string: DataFrame} data: factor data keyed by factor name
    :return: DataFrame X: loadings, one row per stock and one column per factor
    '''
    # X[i, j] is the loading of stock (i+1) on factor (j+1) (rows: stocks, cols: factors)
    X = np.zeros([ret.shape[1], len(data)])
    # num_of_factor: index of the current factor, incremented by 1 per factor
    num_of_factor = 0
    # name of factors, prepared for converting X to DataFrame, with columns=factor_name
    factor_name = []
    # iterate over factors; i is a tuple of (factor name, factor DataFrame)
    for i in data.items():
        factor_name = factor_name + [i[0]]
        # regress each stock's returns on the current factor
        for j in range(i[1].shape[1]):
            model = sm.OLS(ret[j].values, i[1][j].values).fit()
            # record the estimated loading
            X[j, num_of_factor] = model.params[0]
        num_of_factor += 1
    # convert X to a DataFrame
    X = pd.DataFrame(X)
    X.fillna(0, inplace=True)
    X.columns = factor_name
    return X
def fit(self, values, targets, penalty=0.0):
    self._knots = _get_percentiles(values, num_percentiles=self.num_percentiles)
    self.basis_matrix = _get_basis_vector(values, self._knots)

    # penalized least squares via an augmented system: stack the scaled
    # penalty matrix under the basis matrix and pad the targets with zeros
    X = np.vstack((self.basis_matrix,
                   np.sqrt(penalty) * self._penalty_matrix()))
    y = np.asarray(targets + np.zeros((self.num_percentiles + 2, 1)).flatten().tolist())

    self.spline = sm.OLS(y, X).fit()
    return self
def _filter_data(lhs, rhs, weights=None):
    """
    Cleans the input for single OLS.

    Parameters
    ----------
    lhs : Series
        Dependent variable in the regression.
    rhs : dict, whose values are Series, DataFrame, or dict
        Explanatory variables of the regression.
    weights : array-like, optional
        1d array of weights. If None, equivalent to an unweighted OLS.

    Returns
    -------
    Series, DataFrame
        Cleaned lhs and rhs
    """
    if not isinstance(lhs, Series):
        if len(lhs) != len(rhs):
            raise AssertionError("length of lhs must equal length of rhs")
        lhs = Series(lhs, index=rhs.index)

    rhs = _combine_rhs(rhs)
    lhs = DataFrame({'__y__': lhs}, dtype=float)
    pre_filt_rhs = rhs.dropna(how='any')

    combined = rhs.join(lhs, how='outer')
    if weights is not None:
        combined['__weights__'] = weights

    valid = (combined.count(1) == len(combined.columns)).values
    index = combined.index
    combined = combined[valid]

    if weights is not None:
        filt_weights = combined.pop('__weights__')
    else:
        filt_weights = None

    filt_lhs = combined.pop('__y__')
    filt_rhs = combined

    if hasattr(filt_weights, 'to_dense'):
        filt_weights = filt_weights.to_dense()

    return (filt_lhs.to_dense(), filt_rhs.to_dense(), filt_weights,
            pre_filt_rhs.to_dense(), index, valid)
def _residualNet(data, uselog=True, c=None, p=None, x=None, useaggregate=True,
                 numericalControls=[], categoricalControls=[]):
    '''
    Given the data on a bipartite network of the form source,target,flow

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data. It has source,target,volume (trade, number of people etc.).
    c,p,x : str (optional)
        Labels of the columns in data used for source,target,volume.
        If not provided it will use the first, second, and third.
    numericalControls : list
        List of columns to use as numerical controls.
    categoricalControls : list
        List of columns to use as categorical controls.
    uselog : boolean (True)
        If True it will use the logarithm of the provided weight.
    useaggregate : boolean (True)
        If True it will calculate the aggregate of the volume on both sides
        (c and p) and use them as numerical controls.

    Returns
    -------
    net : pandas.DataFrame
        Table with c,p,x,x_res, where x_res is the residual of regressing x
        on the given control variables.
    '''
    c = data.columns.values[0] if c is None else c
    p = data.columns.values[1] if p is None else p
    x = data.columns.values[2] if x is None else x

    data_ = data[[c, p, x] + numericalControls + categoricalControls]
    if useaggregate:
        data_ = merge(data_, data.groupby(c).sum()[[x]].reset_index().rename(columns={x: x + '_' + c}))
        data_ = merge(data_, data.groupby(p).sum()[[x]].reset_index().rename(columns={x: x + '_' + p}))
        numericalControls += [x + '_' + c, x + '_' + p]

    if uselog:
        data_ = data_[data_[x] != 0]
        data_[x] = np.log10(data_[x])
        if useaggregate:
            data_[x + '_' + c] = np.log10(data_[x + '_' + c])
            data_[x + '_' + p] = np.log10(data_[x + '_' + p])

    # expand each categorical control into 0/1 dummy columns (first level dropped)
    _categoricalControls = []
    for var in set(categoricalControls):  # the original had ser(...), an apparent typo for set(...)
        vals = list(set(data_[var]))
        for v in vals[1:]:
            _categoricalControls.append(var + '_' + str(v))
            data_[var + '_' + str(v)] = 0
            data_.loc[data_[var] == v, var + '_' + str(v)] = 1

    Y = data_[x].values
    X = data_[list(set(numericalControls)) + list(set(_categoricalControls))].values
    X = sm.add_constant(X)

    model = sm.OLS(Y, X).fit()
    data_[x + '_res'] = Y - model.predict(X)
    return data_[[c, p, x, x + '_res']]
def wt_plot(self, pdf):
    """
    Create a plot of the linear fit of the wild type variant.

    *pdf* is an open PdfPages instance.

    Only created for selections that use WLS or OLS scoring and have a wild type specified.

    Uses :py:func:`~plots.fit_axes` for the plotting.
    """
    logging.info("Creating wild type fit plot", extra={'oname': self.name})

    # get the data and calculate log ratios
    if "variants" in self.labels:
        wt_label = "variants"
    elif "identifiers" in self.labels:
        wt_label = "identifiers"
    data = self.store.select("/main/{}/counts".format(wt_label),
                             where='index = "{}"'.format(WILD_TYPE_VARIANT)).ix[0]
    sums = self.store['/main/{}/counts'.format(wt_label)].sum(axis="index")  # sum of complete cases (N')
    yvalues = np.log(data + 0.5) - np.log(sums + 0.5)
    xvalues = [tp / float(max(self.timepoints)) for tp in self.timepoints]

    # fit the line
    X = sm.add_constant(xvalues)  # fit intercept
    if self.scoring_method == "WLS":
        W = 1 / (1 / (data + 0.5) + 1 / (sums + 0.5))
        fit = sm.WLS(yvalues, X, weights=W).fit()
    elif self.scoring_method == "OLS":
        fit = sm.OLS(yvalues, X).fit()
    else:
        raise ValueError('Invalid regression scoring method "{}" [{}]'.format(self.scoring_method, self.name))
    intercept, slope = fit.params
    slope_se = fit.bse['x1']

    # make the plot
    fig, ax = plt.subplots()
    fig.set_tight_layout(True)
    fit_axes(ax, xvalues, yvalues, slope, intercept, xlabels=self.timepoints)
    fit_axes_text(ax, cornertext="Slope {:3.2f}\nSE {:.1f}".format(slope, slope_se))
    ax.set_title("Wild Type Shape\n{}".format(self.name))
    ax.set_ylabel("Log Ratio (Complete Cases)")

    pdf.savefig(fig)
    plt.close(fig)