Python statsmodels.api module: OLS usage examples

The following 48 code examples, extracted from open-source Python projects, illustrate how to use statsmodels.api.OLS.

Project: DSI-personal-reference-kit    Author: teb311
def build_model(y, x, add_constant=True):
    '''
        Build a linear regression model from the provided data

        Provided:
        y: a series or single column dataframe holding our solution vector for linear regression
        x: a dataframe that runs parallel to y, with all the features for our linear regression
        add_constant: a boolean; if true it will add a constant column to our provided x data. Otherwise
                      this method assumes you've done so already, or do not want one for some good reason

        Return: a linear regression model from StatsModels and the data which was used to train the model.
                If add_constant was true this will be a new dataframe, otherwise it will be x
    '''
    if add_constant:
        x = sm.add_constant(x)

    model = sm.OLS(y, x).fit()
    return (model, x)
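A minimal usage sketch with hypothetical data (assumes numpy/pandas/statsmodels are available and build_model is defined as above; the column name 'feature' is illustrative):

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.RandomState(0)
x = pd.DataFrame({'feature': rng.normal(size=100)})
y = 1.0 + 2.0 * x['feature'] + rng.normal(scale=0.1, size=100)

model, x_const = build_model(y, x)  # the constant column is added internally
print(model.params)                 # expect const near 1.0 and feature near 2.0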
Project: DriverPower    Author: smshuai
def dispersion_test(yhat, y, k=100):
    """ Implement the regression based dispersion test with k re-sampling.

    Args:
        yhat (np.array): predicted mutation count
        y (np.array): observed mutation count
        k (int): number of re-sampling iterations

    Returns:
        float, float: p-value, theta

    """
    theta = 0
    pval = 0
    for i in range(k):
        y_sub, yhat_sub = resample(y, yhat, random_state=i)
        # (np.power((y - yhat), 2) - y) / yhat for Poisson regression
        aux = (np.power((y_sub - yhat_sub), 2) - yhat_sub) / yhat_sub
        mod = sm.OLS(aux, yhat_sub)
        res = mod.fit()
        theta += res.params[0]
        pval += res.pvalues[0]
    theta = theta/k
    pval = pval/k
    return pval, theta
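A hypothetical call on synthetic Poisson data (assumes dispersion_test is importable together with its dependencies: numpy, statsmodels.api as sm, and sklearn.utils.resample). For truly Poisson counts the dispersion theta should come out near zero:

import numpy as np

rng = np.random.RandomState(42)
yhat = rng.uniform(1.0, 10.0, size=500)   # predicted mutation counts
y = rng.poisson(yhat).astype(float)       # observed counts, generated as Poisson
pval, theta = dispersion_test(yhat, y, k=50)
print(pval, theta)                        # theta should be close to 0 here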
Project: plotnine    Author: has2k1
def lm(data, xseq, **params):
    """
    Fit OLS, or WLS if the data have weights
    """
    X = sm.add_constant(data['x'])
    Xseq = sm.add_constant(xseq)

    try:
        model = sm.WLS(data['y'], X, weights=data['weight'])
    except KeyError:
        model = sm.OLS(data['y'], X)

    results = model.fit(**params['method_args'])
    data = pd.DataFrame({'x': xseq})
    data['y'] = results.predict(Xseq)

    if params['se']:
        alpha = 1 - params['level']
        prstd, iv_l, iv_u = wls_prediction_std(
            results, Xseq, alpha=alpha)
        data['se'] = prstd
        data['ymin'] = iv_l
        data['ymax'] = iv_u

    return data
Project: rdtools    Author: NREL
def _degradation_CI(results):
    '''
    Description
    -----------
    Monte Carlo estimation of uncertainty in degradation rate from OLS results

    Parameters
    ----------
    results: OLSResults object from fitting a model of the form:
    results = sm.OLS(endog = df.energy_ma, exog = df.loc[:,['const','years']]).fit()
    Returns
    -------
    68.2% confidence interval for degradation rate

    '''

    sampled_normal = np.random.multivariate_normal(results.params, results.cov_params(), 10000)
    dist = sampled_normal[:, 1] / sampled_normal[:, 0]
    Rd_CI = np.percentile(dist, [50 - 34.1, 50 + 34.1]) * 100.0
    return Rd_CI
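An illustrative sketch of the expected input, built from hypothetical data of the documented form (a frame with 'const', 'years', and 'energy_ma' columns; assumes numpy/pandas/statsmodels):

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.RandomState(0)
years = np.linspace(0, 5, 60)
energy_ma = 1.0 - 0.005 * years + rng.normal(0, 0.002, size=60)  # about -0.5%/year
df = pd.DataFrame({'const': 1.0, 'years': years, 'energy_ma': energy_ma})

results = sm.OLS(endog=df.energy_ma, exog=df.loc[:, ['const', 'years']]).fit()
print(_degradation_CI(results))  # 68.2% CI in %/year (slope/intercept * 100)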
Project: chainladder-python    Author: jbogaardt
def __get_tail_weighted_time_period(self):
        """ Method to approximate the weighted-average development age assuming
        exponential tail fit.

        Returns: float32
        """
        #n = self.triangle.ncol-1
        #y = self.f[:n]
        #x = np.array([i + 1 for i in range(len(y))]) 
        #ldf_reg = stats.linregress(x, np.log(y - 1))
        #time_pd = (np.log(self.f[n] - 1) - ldf_reg[1]) / ldf_reg[0]

        n = self.triangle.ncol-1
        y = Series(self.f[:n])
        x = [num+1 for num, item in enumerate(y)]
        y.index = x
        x = sm.add_constant((y.index)[y>1])
        y = y[y>1]
        ldf_reg = sm.OLS(np.log(y-1),x).fit()
        time_pd = (np.log(self.f[n] - 1) - ldf_reg.params[0]) / ldf_reg.params[1]
        #tail_factor = np.exp(tail_model.params[0] + np.array([i+2 for i in range(n,n+100)]) * tail_model.params[1]).astype(float) + 1)

        return time_pd
Project: chainladder-python    Author: jbogaardt
def __init__(self, x, y, w):
        self.x = x
        self.y = y
        self.w = w
        WLS = sm.WLS(y,x, w)
        OLS = sm.OLS(WLS.wendog,WLS.wexog).fit()
        self.coefficient = OLS.params[0]
        self.WSSResidual = OLS.ssr
        self.fittedvalues = OLS.predict(x)
        self.residual = OLS.resid
        if len(x) == 1:
            self.mean_square_error = np.nan 
            self.standard_error = np.nan
            self.sigma = np.nan
            self.std_resid = np.nan
        else:
            self.mean_square_error = OLS.mse_resid 
            self.standard_error = OLS.params[0]/OLS.tvalues[0]
            self.sigma = np.sqrt(self.mean_square_error)
            self.std_resid = OLSInfluence(OLS).resid_studentized_internal
Project: chainladder-python    Author: jbogaardt
def __get_MCL_model(self): 
        modelsI=[]
        modelsP=[]
        for i in range(len(self.Incurred.data.columns)):
                modelsI.append(cl.WRTO(self.Incurred.data.iloc[:,i].dropna(), self.Paid.data.iloc[:,i].dropna(), w=1/self.Incurred.data.iloc[:,i].dropna()))
                modelsP.append(cl.WRTO(self.Paid.data.iloc[:,i].dropna(), self.Incurred.data.iloc[:,i].dropna(), w=1/self.Paid.data.iloc[:,i].dropna()))       
        q_f = np.array([item.coefficient for item in modelsI])
        qinverse_f = np.array([item.coefficient for item in modelsP])
        rhoI_sigma = np.array([item.sigma for item in modelsI])
        rhoP_sigma = np.array([item.sigma for item in modelsP])
        #y = np.log(rhoI_sigma[:-1])
        #x = np.array([i + 1 for i in range(len(y))])
        #x = sm.add_constant(x)
        #OLS = sm.OLS(y,x).fit()
        #tailsigma = np.exp((x[:,1][-1]+ 1) * OLS.params[1] + OLS.params[0])
        return rhoI_sigma, rhoP_sigma, q_f, qinverse_f
Project: rsmtool    Author: EducationalTestingService
def model_fit_to_dataframe(fit):
    """
    Take an object containing a statsmodels OLS model fit and extract
    the main model fit metrics into a data frame.

    Parameters
    ----------
    fit : a statsmodels fit object
        Model fit object obtained from a linear model trained using
        `statsmodels.OLS`.

    Returns
    -------
    df_fit : pandas DataFrame
        Data frame with the main model fit metrics.
    """

    df_fit = pd.DataFrame({"N responses": [int(fit.nobs)]})
    df_fit['N features'] = int(fit.df_model)
    df_fit['R2'] = fit.rsquared
    df_fit['R2_adjusted'] = fit.rsquared_adj
    return df_fit
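A minimal sketch of calling it on a freshly fitted model (hypothetical data; assumes numpy and statsmodels):

import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(0)
X = sm.add_constant(rng.normal(size=(50, 3)))
y = X @ np.array([1.0, 0.5, -0.2, 0.3]) + rng.normal(scale=0.1, size=50)

fit = sm.OLS(y, X).fit()
print(model_fit_to_dataframe(fit))
# one-row frame with "N responses", "N features", "R2", "R2_adjusted"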
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def checkOLS(self, exog, endog, x, y):
        reference = sm.OLS(endog, sm.add_constant(exog, prepend=False)).fit()
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = ols(y=y, x=x)

        # check that sparse version is the same
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            sparse_result = ols(y=y.to_sparse(), x=x.to_sparse())
        _compare_ols_results(result, sparse_result)

        assert_almost_equal(reference.params, result._beta_raw)
        assert_almost_equal(reference.df_model, result._df_model_raw)
        assert_almost_equal(reference.df_resid, result._df_resid_raw)
        assert_almost_equal(reference.fvalue, result._f_stat_raw[0])
        assert_almost_equal(reference.pvalues, result._p_value_raw)
        assert_almost_equal(reference.rsquared, result._r2_raw)
        assert_almost_equal(reference.rsquared_adj, result._r2_adj_raw)
        assert_almost_equal(reference.resid, result._resid_raw)
        assert_almost_equal(reference.bse, result._std_err_raw)
        assert_almost_equal(reference.tvalues, result._t_stat_raw)
        assert_almost_equal(reference.cov_params(), result._var_beta_raw)
        assert_almost_equal(reference.fittedvalues, result._y_fitted_raw)

        _check_non_raw_results(result)
Project: astetik    Author: mikkokotila
def ols(data,iv,dv1,dv2,dv3,title='OLS Summary',table_title=''):

    if len(table_title) == 0:

        table_title = "Independent Variable : " + str(iv)

    features = str(iv + ' ~ ' + dv1 + ' + ' + dv2 + ' + ' + dv3)
    y, X = dmatrices(features, data=data, return_type='dataframe')
    mod = sm.OLS(y, X)
    res = mod.fit()
    result = pd.DataFrame.transpose(pd.DataFrame([res.params,res.tvalues,res.pvalues,res.bse]))
    result.columns = ['coef','t','p_t','std_error']

    data = result.round(decimals=4)

    html = display(HTML("<style type=\"text/css\"> .tg {border-collapse:collapse;border-spacing:0;border:none;} .tg td{font-family:Arial, sans-serif;font-size:14px;padding:10px 5px;border-style:solid;border-width:0px;overflow:hidden;word-break:normal;} .tg th{font-family:Arial, sans-serif;font-size:14px;font-weight:normal;padding:10px 5px;border-style:solid;border-width:0px;overflow:hidden;word-break:normal;} .tg .tg-ejgj{font-family:Verdana, Geneva, sans-serif !important;;vertical-align:top} .tg .tg-anay{font-family:Verdana, Geneva, sans-serif !important;;text-align:right;vertical-align:top} .tg .tg-jua3{font-weight:bold;font-family:Verdana, Geneva, sans-serif !important;;text-align:right;vertical-align:top} h5{font-family:Verdana;} h4{font-family:Verdana;} hr{height: 3px; background-color: #333;} .hr2{height: 1px; background-color: #333;} </style> <table class=\"tg\" style=\"undefined;table-layout: fixed; width: 500px; border-style: hidden; border-collapse: collapse;\"> <colgroup> <col style=\"width: 150px\"> <col style=\"width: 120px\"> <col style=\"width: 120px\"> <col style=\"width: 120px\"> <col style=\"width: 120px\"> </colgroup> <h5>" + str(table_title) + "</h5> <h4><i>" + str(title) + "</i></h4> <hr align=\"left\", width=\"630\"> <tr> <th class=\"tg-ejgj\"></th> <th class=\"tg-anay\">" + str(data.keys()[0]) + "</th> <th class=\"tg-anay\">" + str(data.keys()[1]) + "</th> <th class=\"tg-anay\">" + str(data.keys()[2]) + "</th> <th class=\"tg-anay\">" + str(data.keys()[3]) + "</th> </tr> <tr> <td class=\"tg-ejgj\">" + str(data.index[0]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[0]][0]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[1]][0]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[2]][0]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[3]][0]) + "</td> </tr> <tr> <td class=\"tg-ejgj\">" + str(data.index[1]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[0]][1]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[1]][1]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[2]][1]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[3]][1]) + "</td> </tr> <tr> <td class=\"tg-ejgj\">" + str(data.index[2]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[0]][2]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[1]][2]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[2]][2]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[3]][2]) + "</td> </tr> <tr> <td class=\"tg-ejgj\">" + str(data.index[3]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[0]][3]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[1]][3]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[2]][3]) + "</td> <td class=\"tg-jua3\">" + str(data[data.keys()[3]][3]) + "</td> </tr>"))

    return html
Project: panel_reg    Author: metjush
def estimate(self):
        """
        Estimate the FE model with OLS
        :return: results
        """
        # demean
        self.__demean()

        # first, make a dataframe out of the panel of indvars
        x_dataframe = self.x_demeaned.transpose(2,0,1).to_frame(False) # set to False to not drop NaNs
        # unstack the depvar dataframe into a series
        y_series = self.y_demeaned.unstack()

        # fit regression model with statsmodels
        results = sm.OLS(y_series.values, x_dataframe.values, missing='drop').fit()

        print(results.summary(self.depvar, self.indvars))

        self.result = results
Project: panel_reg    Author: metjush
def estimate(self):
        """
        Estimate the first differenced OLS
        :return: results
        """
        # first difference data
        self.__first_diff()

        # first, make a dataframe out of the panel of indvars
        x_dataframe = self.fd_x.transpose(2,0,1).to_frame(False) # set to False to not drop NaNs
        # unstack the depvar dataframe into a series
        y_series = self.fd_y.unstack()

        # fit regression model with statsmodels
        results = sm.OLS(y_series.values, x_dataframe.values, missing='drop').fit()

        print(results.summary(self.depvar, self.indvars))

        self.result = results
Project: Machine-Learning    Author: zjuzpz
def solution2():
    print("Start problem 2")
    print("Start data preparation for problem 2")
    dataPreparation()
    print("Results: ")
    tags = ['tweets_#gohawks', 'tweets_#gopatriots', 'tweets_#nfl', \
    'tweets_#patriots', 'tweets_#sb49', 'tweets_#superbowl']
    #tweets, retweets, sum of followers, maximum followers, time of the day
    for tag in tags:
        tweets, parameters = [], []
        f = open('problem 2 ' + tag + ' data.txt')
        line = f.readline()
        while len(line):
            p = line.split()
            tweets.append(float(p[0]))
            parameters.append([float(p[i]) for i in range(len(p))])
            line = f.readline()
        del(tweets[0])
        next_hour_tweets = tweets
        parameters.pop()
        res = sm.OLS(next_hour_tweets, parameters).fit()
        print("Result of " + tag)
        print(res.summary())
        f.close()
Project: Multiple-factor-risk-model    Author: icezerowjj
def macro_reg_ret(ret, macro_data):
    '''
    Regress each asset's return series on each macroeconomic variable
    (one univariate OLS per asset/variable pair) and collect the loadings.
    :param DataFrame ret: asset return series (time x assets)
    :param DataFrame macro_data: macroeconomic index data (time x variables)
    :return: DataFrame macro_loading: loading of each asset on each macro variable
    '''
    macro_loading = pd.DataFrame(np.zeros([ret.shape[1], macro_data.shape[1]]))
    # loop over assets (columns of ret)
    for i in range(ret.shape[1]):
        y = ret.values[:, i]
        # regress on each macro variable separately
        for j in range(macro_data.shape[1]):
            x = macro_data.values[:, j]
            model = sm.OLS(y, x).fit()
            macro_loading.iloc[i, j] = model.params[0]
    return macro_loading
Project: Multiple-factor-risk-model    Author: icezerowjj
def ret_reg_loading(tech_loading,ret,dummy):
    '''
    For each technical factor, run one cross-sectional regression per day of
    returns on that factor's loading (plus the dummy variables) and count the
    days on which the factor is significant (p-value < 0.1).
    :param tech_loading: DataFrame of technical factor loadings per stock
    :param ret: DataFrame of returns (days x stocks)
    :return: dict significant_days: number of significant days per factor
    '''
    # counter of significant days for each factor
    significant_days=dict()
    for tech in tech_loading.columns:
        significant_days[tech]=0
    # regress returns on each technical factor loading in turn
    for tech in tech_loading.columns:
        # one cross-sectional regression per day
        for i in range(ret.shape[0]):
            model = sm.OLS(ret.iloc[i,:].values, pd.concat([tech_loading[tech],dummy],axis=1).values).fit()
            pvalue=model.pvalues[0]
            if pvalue<0.1:
                significant_days[tech]+=1
    return significant_days
Project: Multiple-factor-risk-model    Author: icezerowjj
def ind_reg_ret(ind_ret,ret):
    '''
    Regress each stock's return series on each industry return series
    (one univariate OLS per stock/industry pair) and collect the loadings.
    :param DataFrame ind_ret: industry returns (time x industries)
    :param DataFrame ret: stock returns (time x stocks)
    :return: DataFrame ind_loading: loading of each stock on each industry
    '''
    # p-value significance threshold of 0.1 (defined but not used below)
    pvalue_threshold=0.1
    ind_loading=np.zeros([ret.shape[1],ind_ret.shape[1]])
    # loop over stocks
    for i in range(ret.shape[1]):
        # regress on each industry return series in turn
        for j in range(ind_ret.shape[1]):
            model=sm.OLS(ret.values[:,i],ind_ret.values[:,j]).fit()
            # store the regression coefficient as the loading
            ind_loading[i,j]=model.params[0]
        # (legacy comment: stocks whose p-value fell below the threshold were once tallied in significant_stocks_list)
    ind_loading=pd.DataFrame(ind_loading,columns=ind_ret.columns)
    return ind_loading
Project: elections    Author: justin-nonwork
def Regress(self, print_data):
    a = np.ndarray(shape=(len(self._rows), len(self._REGRESS_COLUMNS)))
    b = np.zeros(shape=(len(self._rows)))
    rn = 0
    if print_data:
      print('Found %d rows' % (len(self._rows)))
      print('Columns')
      print('\t'.join(sorted(self._REGRESS_COLUMNS)))
    for juris,cols in self._rows.iteritems():
      if not self._IsValidInputData(cols):
        continue
      b[rn] = self._values[juris]
      cn = 0
      for c in sorted(self._REGRESS_COLUMNS):
        a[rn,cn] = cols[c]
        cn += 1
      if print_data:
        print('\t'.join(str(x) for x in a[rn:rn+1,][0].tolist()))
      rn += 1
    a.resize((rn, len(self._REGRESS_COLUMNS)))
    b.resize(rn)
    if print_data:
      print('LogValue')
      print('\n'.join(str(x) for x in b.tolist()))
    results = sm.OLS(b, a).fit()
    print(results.summary(yname='log(Votes)',
      xname=sorted(self._REGRESS_COLUMNS), alpha=0.01))
    i = 0
    for juris,cols in sorted(self._rows.iteritems()):
      p = []
      for c in sorted(self._REGRESS_COLUMNS):
        p.append(cols[c])
      pred = results.predict(p)
      diff = math.e**self._values[juris] - math.e**pred
      print('%-20s\t%7d\t%7d\t%7.4f\t%7d' % (juris, math.e**pred,
        math.e**self._values[juris], diff / math.e**pred, diff))
    return results
Project: chainladder-python    Author: jbogaardt
def get_tail_factor(self, colname_sep='-'):
        """Estimate tail factor, idea from Thomas Mack:
        Returns a tail factor based off of an exponential fit to the LDFs.  This will
        fail if the product of 2nd and 3rd to last LDF < 1.0001.  This also fails if
        the estimated tail is larger than 2.0.  In other areas, exponential fit is
        rejected if the slope parameter p-value >0.5.  This is currently representative
        of the R implementation of this package, but may be enhanced in the future to be
        p-value based.

        Parameters:    
            colname_sep : str
                text to join the names of two adjacent columns representing the
                age-to-age factor column name.

        Returns:
            Pandas.Series of the tail factor.        
        """
        LDF = np.array(self.get_LDF()[:self.triangle.ncol-1])
        if self.tail==False:
            tail_factor=1
        elif len(LDF[LDF>1]) < 2:
            warn("Not enough factors larger than 1.0 to fit an exponential regression.")
            tail_factor = 1
        elif (LDF[-3] * LDF[-2] > 1.0001):
            y = Series(LDF)
            x = sm.add_constant((y.index+1)[y>1])
            y = LDF[LDF>1]
            n, = np.where(LDF==y[-1])[0]
            tail_model = sm.OLS(np.log(y-1),x).fit()
            tail_factor = np.product(np.exp(tail_model.params[0] + np.array([i+2 for i in range(n,n+100)]) * tail_model.params[1]).astype(float) + 1)
            if tail_factor > 2:
                warn("The estimate tail factor was bigger than 2 and has been reset to 1.")
                tail_factor = 1
            if tail_model.f_pvalue > 0.05:
                warn("The p-value of the exponential tail fit is insignificant and tail has been set to 1.")
                tail_factor = 1
        else:
            warn("LDF[-2] * LDF[-1] is not greater than 1.0001 and tail has been set to 1.")
            tail_factor = 1
        return Series(tail_factor, index = [self.triangle.data.columns[-1] + colname_sep + 'Ult'])
Project: Quantopian_Pairs_Trader    Author: bartchr808
def apply_half_life(self, time_series):
        lag = np.roll(time_series, 1)
        lag[0] = 0
        ret = time_series - lag
        ret[0] = 0

        # adds intercept terms to X variable for regression
        lag2 = sm.add_constant(lag)

        model = sm.OLS(ret, lag2)
        res = model.fit()

        self.half_life = -np.log(2) / res.params[1]
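The half-life here comes from regressing the one-step change on the lagged level (delta_y_t = a + b * y_{t-1}) and taking -ln(2)/b. A standalone sketch of the same computation on a synthetic AR(1) series (hypothetical data; assumes numpy/statsmodels):

import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(0)
ts = np.zeros(1000)
for t in range(1, 1000):
    ts[t] = 0.9 * ts[t - 1] + rng.normal()  # AR(1) with phi = 0.9

lag = np.roll(ts, 1); lag[0] = 0
ret = ts - lag; ret[0] = 0
res = sm.OLS(ret, sm.add_constant(lag)).fit()
print(-np.log(2) / res.params[1])  # roughly ln(2) / (1 - 0.9), i.e. about 7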
Project: Quantopian_Pairs_Trader    Author: bartchr808
def hedge_ratio(Y, X):
    # Look into using Kalman Filter to calculate the hedge ratio
    X = sm.add_constant(X)
    model = sm.OLS(Y, X).fit()
    return model.params[1]
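A hypothetical pairs-trading usage on two simulated price series (assumes numpy and the module-level statsmodels import used above):

import numpy as np

rng = np.random.RandomState(2)
X = np.cumsum(rng.normal(size=500))            # simulated price path of asset X
Y = 1.5 * X + rng.normal(scale=0.5, size=500)  # tracks X with beta = 1.5

beta = hedge_ratio(Y, X)
print(beta)  # near 1.5; the traded spread would then be Y - beta * X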
Project: rsmtool    Author: EducationalTestingService
def ols_coefficients_to_dataframe(coefs):
    """
    Take a series containing OLS coefficients and convert it
    to a data frame.

    Parameters
    ----------
    coefs : pandas Series
        Series with feature names in the index and the coefficient
        values as the data, obtained from a linear model trained
        using `statsmodels.OLS`.

    Returns
    -------
    df_coef : pandas DataFrame
        Data frame with two columns, the first being the feature name
        and the second being the coefficient value.

    Note
    ----
    The first row in the output data frame is always for the intercept
    and the rest are sorted by feature name.
    """
    # first create a sorted data frame for all the non-intercept features
    non_intercept_columns = [c for c in coefs.index if c != 'const']
    df_non_intercept = pd.DataFrame(coefs.filter(non_intercept_columns), columns=['coefficient'])
    df_non_intercept.index.name = 'feature'
    df_non_intercept = df_non_intercept.sort_index()
    df_non_intercept.reset_index(inplace=True)

    # now create a data frame that just has the intercept
    df_intercept = pd.DataFrame([{'feature': 'Intercept',
                                  'coefficient': coefs['const']}])

    # append the non-intercept frame to the intercept one
    df_coef = df_intercept.append(df_non_intercept, ignore_index=True)

    # we always want to have the feature column first
    df_coef = df_coef[['feature', 'coefficient']]

    return df_coef
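A minimal sketch (hypothetical feature names; assumes numpy/pandas/statsmodels, and a pandas version compatible with the DataFrame.append call above):

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.RandomState(0)
X = pd.DataFrame(rng.normal(size=(40, 2)), columns=['zeta', 'alpha'])
y = 2.0 * X['zeta'] + X['alpha'] + rng.normal(scale=0.1, size=40)

fit = sm.OLS(y, sm.add_constant(X)).fit()
print(ols_coefficients_to_dataframe(fit.params))
# row 0 is the Intercept; the remaining rows are sorted by feature name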
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def checkForSeries(self, x, y, series_x, series_y, **kwds):
        # Consistency check with simple OLS.
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = ols(y=y, x=x, **kwds)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            reference = ols(y=series_y, x=series_x, **kwds)

        self.compare(reference, result)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def __init__(self, y, x, intercept=True, weights=None, nw_lags=None,
                 nw_overlap=False):
        import warnings
        warnings.warn("The pandas.stats.ols module is deprecated and will be "
                      "removed in a future version. We refer to external packages "
                      "like statsmodels, see some examples here: http://statsmodels.sourceforge.net/stable/regression.html",
                      FutureWarning, stacklevel=4)

        try:
            import statsmodels.api as sm
        except ImportError:
            import scikits.statsmodels.api as sm

        self._x_orig = x
        self._y_orig = y
        self._weights_orig = weights
        self._intercept = intercept
        self._nw_lags = nw_lags
        self._nw_overlap = nw_overlap

        (self._y, self._x, self._weights, self._x_filtered,
         self._index, self._time_has_obs) = self._prepare_data()

        if self._weights is not None:
            self._x_trans = self._x.mul(np.sqrt(self._weights), axis=0)
            self._y_trans = self._y * np.sqrt(self._weights)
            self.sm_ols = sm.WLS(self._y.get_values(),
                                 self._x.get_values(),
                                 weights=self._weights.values).fit()
        else:
            self._x_trans = self._x
            self._y_trans = self._y
            self.sm_ols = sm.OLS(self._y.get_values(),
                                 self._x.get_values()).fit()
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def _prepare_data(self):
        """
        Cleans the input for single OLS.

        Parameters
        ----------
        lhs: Series
            Dependent variable in the regression.
        rhs: dict, whose values are Series, DataFrame, or dict
            Explanatory variables of the regression.

        Returns
        -------
        Series, DataFrame
            Cleaned lhs and rhs
        """
        (filt_lhs, filt_rhs, filt_weights,
         pre_filt_rhs, index, valid) = _filter_data(self._y_orig, self._x_orig,
                                                    self._weights_orig)
        if self._intercept:
            filt_rhs['intercept'] = 1.
            pre_filt_rhs['intercept'] = 1.

        if hasattr(filt_weights, 'to_dense'):
            filt_weights = filt_weights.to_dense()

        return (filt_lhs, filt_rhs, filt_weights,
                pre_filt_rhs, index, valid)
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def summary_as_matrix(self):
        """Returns the formatted results of the OLS as a DataFrame."""
        results = self._results
        beta = results['beta']
        data = {'beta': results['beta'],
                't-stat': results['t_stat'],
                'p-value': results['p_value'],
                'std err': results['std_err']}
        return DataFrame(data, beta.index).T
Project: PythonPackages    Author: wanhanwan
def Orthogonalize(left_data, right_data, left_name, right_name):
    """?????
       ?????????[date,IDs,factor1,factor2...]
    ??
    --------
    left_name: str
        ??1?????
    right_name: str or list
        ??2??????????????????
    industry: str
        ??????????None?????????
    """

    def OLS(data, left_name):
        tempData = data.copy()
        yData = np.array(tempData.pop(left_name))
        xData = np.array(tempData)
        NaNInd = pd.notnull(yData) & pd.notnull(xData).all(axis=1)
        model = sm.OLS(yData, xData, missing='drop')
        res = model.fit()
        data[left_name+'_orthogonalized'] = np.nan
        data.ix[NaNInd, left_name+'_orthogonalized'] = res.resid
        return data

    factor_1 = left_data[[left_name]]
    if not isinstance(right_name, list):
        right_name = [right_name]
    factor_2 = right_data[right_name]
    factor = pd.concat([factor_1, factor_2], axis=1)
    factor['alpha'] = 1  # constant (intercept) term
    factor = factor.groupby(level=0).apply(OLS, left_name=left_name)
    return factor[[left_name+'_orthogonalized']]
Project: MarketMakingProfitability    Author: MiesJansen
def Calculate_Liquidity_Coeff(df_list):
    #list of lists of liq coeff per bond
    liq_arr_list = []

    #print 'df_list size: ', len(df_list)
    for df in df_list:
        if df.empty:
            continue

        # A temporary array for holding liquidity beta for each month
        liq_arr = [np.nan] * num_months_CONST
        #print df['cusip_id'][0]

        # Group dataframe on index by month
        for date, df_group in df.groupby(pd.TimeGrouper("M")):
            month = ''.join([str(date.month),str(date.year)])
            month_key = month_keys[month]

            # When there are some data in current month,
            if df_group.shape[0] > 0:
                # Run regression (as equation (2)) to get liquidity measure
                y,X = dmatrices('excess_return_1 ~ yld_pt + volume_and_sign', 
                                data=df_group, return_type='dataframe')
                #print date, X.shape
                mod = sm.OLS(y,X)
                res = mod.fit()

                #set specific months with liquidity factors
                #res.params[2] = liquidity coefficient
                liq_arr[month_key] = res.params[2]

        liq_arr_list.append(liq_arr)    #store all liq coeff for each month per bond

    return liq_arr_list
Project: MarketMakingProfitability    Author: MiesJansen
def Run_Regression(liq_month_list):
    df = pd.DataFrame(index = month_list)    #set dates as index

    # Liquidity pi_t
    df['liq_month_list_1'] = liq_month_list
    # Liquidity pi_t-1
    df['liq_month_list'] = df['liq_month_list_1'].shift(1)
    # After the shift, the first row is no longer valid data
    df = df.drop(df.index[0], inplace = False)

    ## FIX: A scaling factor for delta_pi_t is dropped here because we do not
    ##  have the historical amount outstanding of the bonds in the portfolio.
    ##  The scaling factor is (M_t-1 - M_1), where M_t-1 is the total dollar
    ##  value of the bonds in the portfolio at the end of month t-1
    # delta_pi_t = pi_t - pi_t-1
    df['liq_delta_1'] = df['liq_month_list_1'] - df['liq_month_list']
    # delta_pi_t-1
    df['liq_delta'] = df['liq_delta_1'].shift(1)
    # After the shift, the first row is no longer valid data
    df = df.drop(df.index[0], inplace = False)

    # Run linear regression using equation (4)
    y,X = dmatrices('liq_delta_1 ~ liq_delta + liq_month_list', 
                    data = df, return_type = 'dataframe')
    mod = sm.OLS(y,X)
    res = mod.fit()

    # Calculate the predicted change in liquidty values
    df['liq_proxy_values'] = \
        res.params[0] + res.params[1] * df['liq_delta'] + \
        res.params[2] * df['liq_month_list']

    # Calculate the actual - predicted change in liquidity --> the residual term
    #  --> the liquidity risk
    df['residual_term'] = df['liq_delta_1'] - df['liq_proxy_values']

    # Scale the magnitude of the liquidity risk for convenient use in later steps
    df['residual_term'] = df['residual_term'] * 10000

    return df
Project: genetest    Author: pgxcentre
def fit(self, y, X, handler=None):
        """Fit the model.

        Args:
            y (pandas.DataFrame): The vector of the endogenous variable.
            X (pandas.DataFrame): The matrix of exogenous variables.
            handler (callable, optional): Handler applied to the fitted model;
                defaults to this instance's results handler.

        """
        # Creating the OLS model from StatsModels and fitting it
        model = sm.OLS(y, X)
        handler = self._results_handler if handler is None else handler
        return handler(model.fit())
Project: mrqap-python    Author: lisette-espin
def ols(self, x, y):
        xflatten = np.delete(x, [i*(x.shape[0]+1)for i in range(x.shape[0])])
        yflatten = np.delete(y, [i*(y.shape[0]+1)for i in range(y.shape[0])])
        xflatten = sm.add_constant(xflatten)
        model = sm.OLS(yflatten,xflatten)
        results = model.fit()
        print results.summary()
Project: Enrich2    Author: FowlerLab
def regression_apply(row, timepoints, weighted):
    """
    :py:meth:`pandas.DataFrame.apply` apply function for calculating 
    enrichment using linear regression. If *weighted* is ``True`` perform
    weighted least squares; else perform ordinary least squares.

    Weights for weighted least squares are included in *row*.

    Returns a :py:class:`pandas.Series` containing regression coefficients,
    residuals, and statistics.
    """
    # retrieve log ratios from the row
    y = row[['L_{}'.format(t) for t in timepoints]]

    # re-scale the x's to fall within [0, 1]
    xvalues = [x / float(max(timepoints)) for x in timepoints]

    # perform the fit
    X = sm.add_constant(xvalues) # fit intercept
    if weighted:
        W = row[['W_{}'.format(t) for t in timepoints]]
        fit = sm.WLS(y, X, weights=W).fit()
    else:
        fit = sm.OLS(y, X).fit()

    # re-format as a data frame row
    values = np.concatenate([fit.params,  [fit.bse['x1'], fit.tvalues['x1'], 
                            fit.pvalues['x1']], fit.resid])
    index = ['intercept', 'slope', 'SE_slope', 't', 'pvalue_raw'] + \
            ['e_{}'.format(t) for t in timepoints]
    return pd.Series(data=values, index=index)
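An illustrative call on a single hand-built row (hypothetical log-ratio values; assumes pandas plus the numpy/statsmodels imports used in the rest of this module):

import pandas as pd

timepoints = [0, 2, 5]
row = pd.Series({'L_0': 0.0, 'L_2': 0.8, 'L_5': 2.1})  # log ratios per timepoint
print(regression_apply(row, timepoints, weighted=False))
# index: intercept, slope, SE_slope, t, pvalue_raw, e_0, e_2, e_5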
Project: Enrich2    Author: FowlerLab
def validate(self):
        """
        Raises an informative ``ValueError`` if the time points in the analysis are not suitable.

        Calls validate method on all child SeqLibs.
        """
        # check the time points
        if 0 not in self.timepoints:
            raise ValueError("Missing timepoint 0 [{}]".format(self.name))
        if self.timepoints[0] != 0:
            raise ValueError("Invalid negative timepoint [{}]".format(self.name))
        if len(self.timepoints) < 2:
            raise ValueError("Multiple timepoints required [{}]".format(self.name))
        elif len(self.timepoints) < 3 and self.scoring_method in ("WLS", "OLS"):
            raise ValueError("Insufficient number of timepoints for regression scoring [{}]".format(self.name))

        # check the wild type sequences
        if self.has_wt_sequence():
            for child in self.children[1:]:
                if self.children[0].wt != child.wt:
                    logging.warning("Inconsistent wild type sequences", extra={'oname' : self.name})
                    break

        # check that we're not doing wild type normalization on something with no wild type
        #if not self.has_wt_sequence() and self.logr_method == "wt":
        #    raise ValueError("No wild type sequence for wild type normalization [{}]".format(self.name))

        # validate children
        for child in self.children:
            child.validate()
Project: Enrich2    Author: FowlerLab
def calculate(self):
        """
        Wrapper method to calculate counts and enrichment scores 
        for all data in the :py:class:`~selection.Selection`.
        """
        if len(self.labels) == 0:
            raise ValueError("No data present across all sequencing libraries [{}]".format(self.name))
        for label in self.labels:
            self.merge_counts_unfiltered(label)
            self.filter_counts(label)
        if self.is_barcodevariant() or self.is_barcodeid():
            self.combine_barcode_maps()
        if self.scoring_method == "counts":
            pass
        elif self.scoring_method == "ratios":
            for label in self.labels:
                self.calc_ratios(label)
        elif self.scoring_method == "simple":
            for label in self.labels:
                self.calc_simple_ratios(label)
        elif self.scoring_method in ("WLS", "OLS"):
            if len(self.timepoints) <= 2:
                raise ValueError("Regression-based scoring requires three or more time points.")
            for label in self.labels:
                self.calc_log_ratios(label)
                if self.scoring_method == "WLS":
                    self.calc_weights(label)
                self.calc_regression(label)
        else:
            raise ValueError('Invalid scoring method "{}" [{}]'.format(self.scoring_method, self.name))

        if self.scoring_method in ("ratios" , "WLS", "OLS") and self.component_outliers:
            if self.is_barcodevariant() or self.is_barcodeid():
                self.calc_outliers("barcodes")
            if self.is_coding():
                self.calc_outliers("variants")
Project: Enrich2    Author: FowlerLab
def calc_regression(self, label):
        """
        Calculate least squares regression for *label*. If the scoring method is "WLS", calculates weighted least squares; otherwise ordinary least squares.

        Regression results are stored in ``'/main/label/scores'``

        """
        if self.check_store("/main/{}/scores".format(label)):
            return
        elif "/main/{}/scores".format(label) in self.store.keys():
            # need to remove the current keys because we are using append
            self.store.remove("/main/{}/scores".format(label))

        logging.info("Calculating {} regression coefficients ({})".format(self.scoring_method, label), extra={'oname' : self.name})
        # append is required because it takes the "min_itemsize" argument, and put doesn't
        longest = self.store.select("/main/{}/log_ratios".format(label), "columns='index'").index.map(len).max()
        chunk = 1
        if self.scoring_method == "WLS":
            for data in self.store.select_as_multiple(["/main/{}/log_ratios".format(label), "/main/{}/weights".format(label)], chunksize=self.chunksize):
                logging.info("Calculating weighted least squares for chunk {} ({} rows)".format(chunk, len(data.index)), extra={'oname' : self.name})
                result = data.apply(regression_apply, args=[self.timepoints, True], axis="columns")
                self.store.append("/main/{}/scores".format(label), result, min_itemsize={"index" : longest})
                chunk += 1
        elif self.scoring_method == "OLS":
            for data in self.store.select("/main/{}/log_ratios".format(label), chunksize=self.chunksize):
                logging.info("Calculating ordinary least squares for chunk {} ({} rows)".format(chunk, len(data.index)), extra={'oname' : self.name})
                result = data.apply(regression_apply, args=[self.timepoints, False], axis="columns")
                self.store.append("/main/{}/scores".format(label), result, min_itemsize={"index" : longest})
                chunk += 1
        else:
            raise ValueError('Invalid regression scoring method "{}" [{}]'.format(self.scoring_method, self.name))

        # need to read from the file, calculate percentiles, and rewrite it
        logging.info("Calculating slope standard error percentiles ({})".format(label), extra={'oname' : self.name})
        data = self.store['/main/{}/scores'.format(label)]
        data['score'] = data['slope']
        data['SE'] = data['SE_slope']
        data['SE_pctile'] = [stats.percentileofscore(data['SE'], x, "weak") for x in data['SE']]
        data = data[['score', 'SE', 'SE_pctile', 'slope', 'intercept', 'SE_slope', 't', 'pvalue_raw']] # reorder columns
        self.store.put("/main/{}/scores".format(label), data, format="table", data_columns=data.columns)
Project: Python-for-Finance-Second-Edition    Author: PacktPublishing
def breusch_pagan_test(y,x): 
    results=sm.OLS(y,x).fit() 
    resid=results.resid
    n=len(resid)
    sigma2 = sum(resid**2)/n 
    f = resid**2/sigma2 - 1
    results2=sm.OLS(f,x).fit() 
    fv=results2.fittedvalues 
    bp=0.5 * sum(fv**2) 
    df=results2.df_model
    p_value=1-sp.stats.chi2.cdf(bp,df)  # the BP statistic is asymptotically chi-squared
    return round(bp,6), df, round(p_value,7)
#
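A cross-check sketch against the built-in statsmodels test, on hypothetical heteroscedastic data. The two statistics are defined slightly differently (het_breuschpagan uses the n * R-squared LM form, while the snippet above uses a 0.5 * sum-of-squared-fitted-values variant), so expect close but not identical values:

import numpy as np
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan

rng = np.random.RandomState(0)
x = sm.add_constant(rng.uniform(size=200))
y = x @ np.array([1.0, 2.0]) + rng.normal(size=200) * (1 + x[:, 1])

print(breusch_pagan_test(y, x))
resid = sm.OLS(y, x).fit().resid
print(het_breuschpagan(resid, x))  # (lm, lm_pvalue, fvalue, f_pvalue)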
Project: aq_weather    Author: eliucidate
def sm_multireg(self,Xtrain,ytrain, Xtest, ytest):    
                self.normalize(Xtrain)
                results = sm.OLS(ytrain, Xtrain).fit_regularized()
                #print "coefficient: ", results.params
                # train accuracy
                predictions = results.predict(Xtrain)
                correct = 0
                for i in range(len(predictions)):
                    if round(predictions[i], 1) == ytrain[i]:
                        correct += 1
                accuracy = correct * 1.0 / len(ytrain)
                print "train accuracy: ", accuracy * 100, "%"
                # calculate SSE, SSM & SST
                SSE = 0
                for i in range(len(predictions)):
                    SSE += (predictions[i] - ytrain[i])**2
                yAverage = np.mean(ytrain)
                SSM = 0
                for pred in predictions:
                    SSM += (pred - yAverage)**2
                print "SSM:", SSM
                SST = SSE + SSM
                print "SST:", SST
                # calculate PVE = SSM / SST
                PVE = SSM / SST
                print "PVE:", PVE
                # test accuracy
                self.normalize(Xtest)
                predictions = results.predict(Xtest)
                correct = 0
                for i in range(len(predictions)):
                    print round(predictions[i], 1), ytest[i]
                    if round(predictions[i], 1) == ytest[i]:
                        correct += 1
                accuracy = correct * 1.0 / len(ytest)
                print "test accuracy: ", accuracy * 100, "%"
                return results
Project: deep-iv    Author: allentran
def fit_ols(y, x, idx=-1):
        ols = OLS(y, add_constant(x))
        results = ols.fit()
        return results.params.values[idx], results.cov_params().values[idx, idx]
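A minimal sketch treating fit_ols as a plain function (hypothetical data; assumes numpy/pandas plus the OLS/add_constant imports the snippet relies on):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
x = pd.DataFrame({'x1': rng.normal(size=200), 'treat': rng.normal(size=200)})
y = 0.5 * x['x1'] + 2.0 * x['treat'] + rng.normal(size=200)

beta, var_beta = fit_ols(y, x)   # idx=-1 selects the last column ('treat')
print(beta, np.sqrt(var_beta))   # point estimate near 2.0 and its standard error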
Project: deep-iv    Author: allentran
def weak_instruments(self, n_sims=20):

        np.random.seed(1692)

        model = feedforward.FeedForwardModel(19, 1, dense_size=60, n_dense_layers=2)

        treatment_effects = []
        ols_betas, ols_ses = [], []
        old_corrs, new_corrs = [], []
        for _ in xrange(n_sims):
            df = self.treatment_gen.simulate_data(False)

            X = np.hstack((self.x, df['new_treat'].values[:, None]))
            Z = np.hstack((self.x, df['instrument'].values[:, None]))

            ols_beta, ols_se = self.fit_ols(df['treatment_effect'], X)
            ols_betas.append(ols_beta)
            ols_ses.append(ols_se)

            old_corr = df[['instrument', 'new_treat']].corr().values[0, 1]
            new_instrument, new_corr = model.fit_instruments(X, Z, df['treatment_effect'].values, batchsize=128)
            new_corrs.append(new_corr)
            old_corrs.append(old_corr)

            Z2 = Z.copy()
            Z2[:, -1] = new_instrument[:, 0]

            iv = IV2SLS(df['treatment_effect'].values.flatten(), add_constant(X), add_constant(Z2))

            model.reset_params()

        if new_corr:
            logger.info("Old corr: %.2f, New corr: %.2f", np.mean(old_corrs), np.mean(new_corrs))
        logger.info("Treatment effect (OLS): %.3f (%.4f)", np.mean(ols_betas), np.mean(ols_ses))
        logger.info("Treatment effect: %.3f (%.4f)", np.mean(treatment_effects), np.std(treatment_effects))
Project: deep-iv    Author: allentran
def estimate(self, n_sims=20):

        np.random.seed(1692)

        model = feedforward.FeedForwardModel(19, 1, dense_size=10, n_dense_layers=2)

        treatment_effects = []
        ols_betas, ols_ses = [], []
        old_corrs, new_corrs = [], []
        for _ in xrange(n_sims):
            df = self.treatment_gen.simulate_data(True)

            X = np.hstack((self.x, df['new_treat'].values[:, None]))
            Z = np.hstack((self.x, df['instrument'].values[:, None]))

            ols_beta, ols_se = self.fit_ols(df['treatment_effect'], X)
            ols_betas.append(ols_beta)
            ols_ses.append(ols_se)

            new_corr = model.fit(X, Z, df['treatment_effect'].values, instrument=True, batchsize=64)
            if new_corr:
                old_corr = df[['instrument', 'new_treat']].corr().values[0, 1]
                new_corrs.append(new_corr)
                old_corrs.append(old_corr)

            treatment_effects.append(model.get_treatment_effect(X))
            model.reset_params()

        if new_corr:
            logger.info("Old corr: %.2f, New corr: %.2f", np.mean(old_corrs), np.mean(new_corrs))
        logger.info("Treatment effect (OLS): %.3f (%.4f)", np.mean(ols_betas), np.mean(ols_ses))
        logger.info("Treatment effect: %.3f (%.4f)", np.mean(treatment_effects), np.std(treatment_effects))
Project: Machine-Learning    Author: zjuzpz
def solution3():
    print("Start problem 3")
    print("Start data preparation for problem 3")
    # firstly prepare data for problem 3, all text files will be saved at current path
    data_preparation()
    print("Results: ")

    tags = ['tweets_#gohawks', 'tweets_#gopatriots', 'tweets_#nfl', \
    'tweets_#patriots', 'tweets_#sb49', 'tweets_#superbowl']

    # idx for tag index in tags
    idx=0
    for tag in tags:
        tweets, parameters = [], []
        f = open('problem 3 ' + tag + ' data.txt')
        line = f.readline()
        while len(line):
            p = line.split()
            tweets.append(float(p[0]))
            parameters.append([float(p[i]) for i in range(len(p))])
            line = f.readline()
        del(tweets[0])
        next_hour_tweets = np.array(tweets)
        parameters.pop()
        X = np.matrix(parameters)
        res = sm.OLS(next_hour_tweets, X).fit()
        print("Result of " + tag)
        print(res.summary())
        if tag == 'tweets_#gohawks' or tag=='tweets_#nfl' or tag=='tweets_#patriots':
            scatterPlot(idx,0,1,2,X,next_hour_tweets,tag)
        elif tag ==  'tweets_#gopatriots':
            scatterPlot(idx,1,3,4,X,next_hour_tweets, tag)
        elif tag == 'tweets_#sb49':
            scatterPlot(idx,0,1,5,X,next_hour_tweets,tag)
        elif tag == 'tweets_#superbowl':
            scatterPlot(idx,0,2,5,X, next_hour_tweets,tag)
        idx += 1    
        f.close()
Project: QuantProject    Author: sirius27
def macro_regression(ret, macro_index,macro_index_list):
    '''
    Regress each asset's return series on each macroeconomic variable
    (one univariate OLS per asset/variable pair), collect the loadings, and
    count the number of assets for which each macro variable is significant.
    :param DataFrame ret: asset return series (time x assets)
    :param DataFrame macro_index: macroeconomic index data (time x variables)
    :param list macro_index_list: names of the macro variables
    :return: [DataFrame,dict] [loading,significant_list]: loadings of each asset on each macro variable; count of significant assets per macro variable
    '''
    loading = pd.DataFrame(np.zeros([ret.shape[1], macro_index.shape[1]]), columns=macro_index.columns)
    # count of assets for which each macro variable is significant
    significant_list = dict()
    # initialize the counters
    for i in range(len(macro_index_list)):
        significant_list[macro_index_list[i]] = 0
    # p-value significance threshold
    pvalue_threshold = 0.1
    # loop over assets
    for i in range(ret.shape[1]):
        y = ret.values[:, i]
        # regress on each macro variable separately
        for j in range(len(macro_index_list)):
            x = macro_index.values[:, j]
            model = sm.OLS(y, x).fit()
            loading.iloc[i, j] = model.params[0]
            if model.pvalues[0] < pvalue_threshold:
                significant_list[macro_index_list[j]] += 1
    return [loading, significant_list]
Project: Multiple-factor-risk-model    Author: icezerowjj
def rm_reg_ri(rm,ret):
    '''
    Regress each stock's return series on the market return and collect the betas.
    '''
    # container for the loadings
    X=np.array(np.zeros([ret.shape[1],1]))
    # regress each stock's return series on the market return
    for i in range(ret.shape[1]):
        model=sm.OLS(ret.values[:,i],rm.values).fit()
        X[i,:]=model.params[0]
    X=pd.DataFrame(X)
    return X
Project: Multiple-factor-risk-model    Author: icezerowjj
def fb_reg_over_time(ret, data,interval):
    '''
    First-pass time-series regression over a given interval: regress each
    stock's returns on each factor's data to obtain the factor loadings.
    :param DataFrame ret: returns (time x stocks)
    :param {string:DataFrame} data: factor name -> factor data
    :param [int] interval: labels of the time interval to regress over
    :return: DataFrame X: factor loadings (stocks x factors), columns named by factor
    '''
    # X holds the loadings: X[i, j] is stock (i+1)'s loading on factor (j+1) (rows = stocks, cols = factors)
    X = np.zeros([ret.shape[1], len(data)])
    # num_of_factor indexes the current factor; incremented once per factor
    num_of_factor = 0
    # names of the factors, prepared for converting X to a DataFrame with columns=factor_name
    factor_name = []
    # iterate over the factor dict; i is a (name, DataFrame) tuple
    for i in data.items():
        factor_name = factor_name + [i[0]]
        interval_data=i[1].ix[interval]
        # regress each stock in turn
        for j in range(i[1].shape[1]):
            # regress stock j's returns on the factor data over the interval
            model = sm.OLS(ret[j].values, interval_data[j].values).fit()
            # store the regression coefficient as the loading
            X[j, num_of_factor] = model.params[0]
        # move on to the next factor
        num_of_factor += 1
    # convert X to a DataFrame
    X = pd.DataFrame(X)
    X.fillna(0, inplace=True)
    X.columns = factor_name
    # return the loadings DataFrame
    return X
Project: Multiple-factor-risk-model    Author: icezerowjj
def fb_reg_over_all_time(ret, data):
    '''
    First-pass time-series regression over the full sample: regress each
    stock's returns on each factor's data to obtain the factor loadings.
    :param DataFrame ret: returns (time x stocks)
    :param {string:DataFrame} data: factor name -> factor data
    :return: DataFrame X: factor loadings (stocks x factors), columns named by factor
    '''
    # X holds the loadings: X[i, j] is stock (i+1)'s loading on factor (j+1) (rows = stocks, cols = factors)
    X = np.zeros([ret.shape[1], len(data)])
    # num_of_factor indexes the current factor; incremented once per factor
    num_of_factor = 0
    # names of the factors, prepared for converting X to a DataFrame with columns=factor_name
    factor_name = []
    # iterate over the factor dict; i is a (name, DataFrame) tuple
    for i in data.items():
        factor_name = factor_name + [i[0]]
        # regress each stock in turn
        for j in range(i[1].shape[1]):
            # regress stock j's returns on factor i's full-sample data
            model = sm.OLS(ret[j].values, i[1][j].values).fit()
            # store the regression coefficient as the loading
            X[j, num_of_factor] = model.params[0]
        # move on to the next factor
        num_of_factor += 1
    # convert X to a DataFrame
    X = pd.DataFrame(X)
    X.fillna(0, inplace=True)
    X.columns = factor_name
    # return the loadings DataFrame
    return X
Project: dstk    Author: jotterbach
def fit(self, values, targets, penalty=0.0):
        self._knots = _get_percentiles(values, num_percentiles=self.num_percentiles)
        self.basis_matrix = _get_basis_vector(values, self._knots)

        X = np.vstack((self.basis_matrix, np.sqrt(penalty) * self._penalty_matrix()))
        y = np.asarray(targets + np.zeros((self.num_percentiles + 2, 1)).flatten().tolist())

        self.spline = sm.OLS(y, X).fit()
        return self
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda    Author: SignalMedia
def _filter_data(lhs, rhs, weights=None):
    """
    Cleans the input for single OLS.

    Parameters
    ----------
    lhs : Series
        Dependent variable in the regression.
    rhs : dict, whose values are Series, DataFrame, or dict
        Explanatory variables of the regression.
    weights : array-like, optional
        1d array of weights.  If None, equivalent to an unweighted OLS.

    Returns
    -------
    Series, DataFrame
        Cleaned lhs and rhs
    """
    if not isinstance(lhs, Series):
        if len(lhs) != len(rhs):
            raise AssertionError("length of lhs must equal length of rhs")
        lhs = Series(lhs, index=rhs.index)

    rhs = _combine_rhs(rhs)
    lhs = DataFrame({'__y__': lhs}, dtype=float)
    pre_filt_rhs = rhs.dropna(how='any')

    combined = rhs.join(lhs, how='outer')
    if weights is not None:
        combined['__weights__'] = weights

    valid = (combined.count(1) == len(combined.columns)).values
    index = combined.index
    combined = combined[valid]

    if weights is not None:
        filt_weights = combined.pop('__weights__')
    else:
        filt_weights = None

    filt_lhs = combined.pop('__y__')
    filt_rhs = combined

    if hasattr(filt_weights, 'to_dense'):
        filt_weights = filt_weights.to_dense()

    return (filt_lhs.to_dense(), filt_rhs.to_dense(), filt_weights,
            pre_filt_rhs.to_dense(), index, valid)
Project: binet    Author: crisjf
def _residualNet(data,uselog=True,c=None,p=None,x=None,useaggregate=True,numericalControls=[],categoricalControls=[]):
    '''
    Given the data on a bipartite network of the form source,target,flow

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data. It has source,target,volume (trade, number of people etc.).
    c,p,x : str (optional)
        Labels of the columns in data used for source,target,volume. 
        If not provided it will use the first, second, and third.
    numericalControls : list
        List of columns to use as numerical controls.
    categoricalControls : list
        List of columns to use as categorical controls.
    uselog : boolean (True)
        If True it will use the logarithm of the provided weight.
    useaggregate : boolean (True)
        If true it will calculate the aggregate of the volume on both sides (c and p) and use them as numerical controls.

    Returns
    -------
    net : pandas.Dataframe
        Table with c,p,x,x_res, where x_res is the residual of regressing x on the given control variables.
    '''
    c = data.columns.values[0] if c is None else c
    p = data.columns.values[1] if p is None else p
    x = data.columns.values[2] if x is None else x
    data_ = data[[c,p,x]+numericalControls+categoricalControls]
    if useaggregate:
        data_ = merge(data_,data.groupby(c).sum()[[x]].reset_index().rename(columns={x:x+'_'+c}))
        data_ = merge(data_,data.groupby(p).sum()[[x]].reset_index().rename(columns={x:x+'_'+p}))
        numericalControls+=[x+'_'+c,x+'_'+p]
    if uselog:
        data_ = data_[data_[x]!=0]
        data_[x] = np.log10(data_[x])
        if useaggregate:
            data_[x+'_'+c] = np.log10(data_[x+'_'+c])
            data_[x+'_'+p] = np.log10(data_[x+'_'+p])
    _categoricalControls = []
    for var in set(categoricalControls):
        vals = list(set(data_[var]))
        for v in vals[1:]:
            _categoricalControls.append(var+'_'+str(v))
            data_[var+'_'+str(v)]=0
            data_.loc[data_[var]==v,var+'_'+str(v)]=1

    Y = data_[x].values
    X = data_[list(set(numericalControls))+list(set(_categoricalControls))].values
    X = sm.add_constant(X)

    model = sm.OLS(Y,X).fit()
    data_[x+'_res'] = Y-model.predict(X)
    return data_[[c,p,x,x+'_res']]
Project: Enrich2    Author: FowlerLab
def wt_plot(self, pdf):
        """
        Create a plot of the linear fit of the wild type variant.

        *pdf* is an open PdfPages instance.

        Only created for selections that use WLS or OLS scoring and have a wild type specified. 
        Uses :py:func:`~plots.fit_axes` for the plotting.
        """
        logging.info("Creating wild type fit plot", extra={'oname' : self.name})

        # get the data and calculate log ratios
        if "variants" in self.labels:
            wt_label = "variants"
        elif "identifiers" in self.labels:
            wt_label = "identifiers"
        data = self.store.select("/main/{}/counts".format(wt_label), where='index = "{}"'.format(WILD_TYPE_VARIANT)).ix[0]
        sums = self.store['/main/{}/counts'.format(wt_label)].sum(axis="index")  # sum of complete cases (N')
        yvalues = np.log(data + 0.5) - np.log(sums + 0.5)
        xvalues = [tp / float(max(self.timepoints)) for tp in self.timepoints]

        # fit the line
        X = sm.add_constant(xvalues) # fit intercept
        if self.scoring_method == "WLS":
            W =  1 / (1 / (data + 0.5) + 1 / (sums + 0.5))
            fit = sm.WLS(yvalues, X, weights=W).fit()
        elif self.scoring_method == "OLS":
            fit = sm.OLS(yvalues, X).fit()
        else:
            raise ValueError('Invalid regression scoring method "{}" [{}]'.format(self.scoring_method, self.name))
        intercept, slope = fit.params
        slope_se = fit.bse['x1']

        # make the plot
        fig, ax = plt.subplots()
        fig.set_tight_layout(True)
        fit_axes(ax, xvalues, yvalues, slope, intercept, xlabels=self.timepoints)
        fit_axes_text(ax, cornertext="Slope {:3.2f}\nSE {:.1f}".format(slope, slope_se))
        ax.set_title("Wild Type Shape\n{}".format(self.name))
        ax.set_ylabel("Log Ratio (Complete Cases)")

        pdf.savefig(fig)
        plt.close(fig)