The following 50 code examples, extracted from open-source Python projects, illustrate how to use the scipy.stats module.
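Before the project examples, here is a minimal sketch (not taken from any of the projects below) of the common scipy.stats pattern: freeze a distribution and call its pdf, cdf, and ppf methods.

import scipy.stats

# freeze a normal distribution with mean 0 and standard deviation 1
dist = scipy.stats.norm(loc=0.0, scale=1.0)

print(dist.pdf(0.0))    # density at 0
print(dist.cdf(1.96))   # cumulative probability up to 1.96
print(dist.ppf(0.975))  # inverse of the cdf, approximately 1.96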
def ppf(x, n):
    try:
        from scipy import stats
    except ImportError:
        stats = None
    if stats:
        if n < 30:
            return stats.t.ppf(x, n)
        return stats.norm.ppf(x)
    else:
        if n < 30:
            # TODO: implement power series:
            # http://eprints.maths.ox.ac.uk/184/1/tdist.pdf
            raise ImportError(
                'You must have scipy installed to use t-student '
                'when sample_size is below 30')
        return norm_ppf(x)

# According to http://sphweb.bumc.bu.edu/otlt/MPH-Modules/BS/
# BS704_Confidence_Intervals/BS704_Confidence_Intervals_print.html
def get_sanity_check_counter():
    '''
    Provide a dictionary with the standard sanity check counter values.
    It is typically used like:
        counters['ZeroCount'] = get_sanity_check_counter()
    All of these values are unused, except for:
        bins:
            - [-inf, inf] (a long counter is appended by SecureCounters,
              it should only ever have blinding values added)
        estimated_value: 0.0 (TODO: used for checking if stats have changed)
        sigma: 0.0 (used for adding noise, 0.0 means no noise is added)
    '''
    sanity_check = {}
    sanity_check['bins'] = []
    single_bin = [float('-inf'), float('inf')]
    sanity_check['bins'].append(single_bin)
    sanity_check['sensitivity'] = 0.0
    sanity_check['estimated_value'] = 0.0
    sanity_check['sigma'] = 0.0
    sanity_check['epsilon'] = 0.0
    sanity_check['expected_noise_ratio'] = 0.0
    return sanity_check
def test_rank_methods_frame(self):
    tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
    import scipy
    from scipy.stats import rankdata

    xs = np.random.randint(0, 21, (100, 26))
    xs = (xs - 10.0) / 10.0
    cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

    for vals in [xs, xs + 1e6, xs * 1e-6]:
        df = DataFrame(vals, columns=cols)

        for ax in [0, 1]:
            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = df.rank(axis=ax, method=m)
                sprank = np.apply_along_axis(
                    rankdata, ax, vals,
                    m if m != 'first' else 'ordinal')
                sprank = sprank.astype(np.float64)
                expected = DataFrame(sprank, columns=cols)
                if LooseVersion(scipy.__version__) >= '0.17.0':
                    expected = expected.astype('float64')
                tm.assert_frame_equal(result, expected)
def gleu(self, stats, smooth=False):
    """Compute GLEU from collected statistics obtained by call(s) to gleu_stats"""
    # smooth 0 counts for sentence-level scores
    if smooth:
        stats = [s if s != 0 else 1 for s in stats]
    if len(filter(lambda x: x == 0, stats)) > 0:
        return 0
    (c, r) = stats[:2]
    log_gleu_prec = sum([math.log(float(x) / y)
                         for x, y in zip(stats[2::2], stats[3::2])]) / 4
    for i, (x, y) in enumerate(zip(stats[2::2], stats[3::2])):
        pass
        # print 'Precision', i+1, '=', x, '/', y, '=', 1.*x/y
    # log_gleu_prec = sum([math.log(float(x) / y)
    #                      for x, y in zip(stats[2::2], stats[3::2])]) / 4
    return math.exp(min([0, 1 - float(r) / c]) + log_gleu_prec)
def clean_mapping_stats(mapping_stats_original, convert_to_percentage=None):
    """Remove whitespace from all values and convert to numbers"""
    if convert_to_percentage is None:
        convert_to_percentage = set()

    mapping_stats_original = mapping_stats_original.applymap(
        lambda x: (x.replace(',', '').strip().strip('%')
                   if isinstance(x, str) else x))

    numeric = mapping_stats_original.apply(maybe_to_numeric)

    numeric.columns = numeric.columns.map(str.strip)

    # for 10X mapping stats
    numeric.columns = numeric.columns.map(
        lambda x: ('Percent {}'.format(x.replace('Fraction ', ''))
                   if x in convert_to_percentage else x)
    )
    return numeric
def prior_GP_var_half_cauchy(y_invK_y, n_y, tau_range):
    """ Imposing a half-Cauchy prior onto the standard deviation (tau)
        of the Gaussian Process which is in turn a prior imposed over
        a function y = f(x).
        The scale parameter of the half-Cauchy prior is tau_range.
        The function returns the MAP estimate of tau^2 and
        log(p(tau|tau_range)) for the MAP value of tau^2,
        where tau_range describes the reasonable range of tau
        in the half-Cauchy prior.
        An alternative form of prior is inverse-Gamma prior on tau^2.
        Inverse-Gamma prior penalizes for both very small and very
        large values of tau, while half-Cauchy prior only penalizes
        for very large values of tau.
        For more information on usage, see description in BRSA class:
        `.BRSA`
    """
    tau2 = (y_invK_y - n_y * tau_range**2
            + np.sqrt(n_y**2 * tau_range**4
                      + (2 * n_y + 8) * tau_range**2 * y_invK_y
                      + y_invK_y**2)) \
        / 2 / (n_y + 2)
    log_ptau = scipy.stats.halfcauchy.logpdf(
        tau2**0.5, scale=tau_range)
    return tau2, log_ptau
def _zscore(a):
    """ Calculating z-score of data on the first axis.
        If the numbers in any column are all equal, scipy.stats.zscore
        will return NaN for this column. We shall correct them all to
        be zeros.

    Parameters
    ----------
    a: numpy array

    Returns
    -------
    zscore: numpy array
        The z-scores of input "a", with any columns including non-finite
        numbers replaced by all zeros.
    """
    assert a.ndim > 1, 'a must have more than one dimensions'
    zscore = scipy.stats.zscore(a, axis=0)
    zscore[:, np.logical_not(np.all(np.isfinite(zscore), axis=0))] = 0
    return zscore
def mad(self, median=None):
    """
    Return the median absolute deviation for this segment only, scaled by
    phi-1(3/4) to approximate the standard deviation.
    :param median: If this is not None, it will be used as the median from
                   which the deviation is calculated.
    :return: A NDarray of shape (n_channels, 1) with the MAD for the
             channels of this segment.
    """
    c = scipy.stats.norm.ppf(3 / 4)
    if median is None:
        subject_median = self.median()
    else:
        subject_median = median
    median_residuals = self.mat_struct.data - subject_median  # deviation between median and data
    mad = np.median(np.abs(median_residuals), axis=1)[:, np.newaxis]
    return mad / c
def median_absolute_deviation(data, subject_median=None, axis=0):
    """
    Returns the median absolute deviation (MAD) for the given data.
    :param data: A DataFrame holding the data to calculate the MAD from.
    :param subject_median: If given, these will be used as the medians which
                           the deviation is calculated from. If None, the
                           median of the given data is used.
    :param axis: The axis to calculate over.
    :return: A Series object with the median absolute deviations over the
             given *axis* for the *data*.
    """
    """A robust estimate of the standard deviation"""
    # The scaling factor for estimating the standard deviation from the MAD
    c = scipy.stats.norm.ppf(3 / 4)
    if isinstance(data, pd.DataFrame):
        if subject_median is None:
            subject_median = data.median(axis=axis)
        median_residuals = data - subject_median
        mad = median_residuals.abs().median(axis=axis)
    else:
        if subject_median is None:
            subject_median = np.median(data, axis=axis)
        median_residuals = data - subject_median[:, np.newaxis]  # deviation between median and data
        mad = np.median(np.abs(median_residuals), axis=axis)
    return mad / c
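As a quick check of the scaling used above, here is a minimal standalone sketch (assuming only NumPy and SciPy) that compares the MAD-based estimate with the sample standard deviation for normally distributed data; the two values should be close.

import numpy as np
import scipy.stats

rng = np.random.default_rng(0)
x = rng.normal(loc=0.0, scale=2.0, size=100000)

c = scipy.stats.norm.ppf(3 / 4)            # scaling factor, about 0.6745
mad = np.median(np.abs(x - np.median(x)))  # median absolute deviation
print(mad / c, x.std())                    # both should be close to 2.0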
def read_folder(stats_folder, metrics=None):
    """
    Reads all segment statistics files from the given folder and returns the
    statistics as a single DataFrame.
    :param stats_folder: The folder to read statistics files from.
    :param metrics: A subset of the metrics to load.
    :return: A DataFrame with all the statistics available in stats_folder.
    """
    if metrics is None:
        metrics = ['absolute mean', 'absolute median', 'kurtosis', 'skew', 'std']
    glob_pattern = os.path.join(stats_folder, '*segments_statistics.csv')
    files = glob.glob(glob_pattern)
    if files:
        stats = pd.concat([read_stats(stat_file, metrics) for stat_file in files])
        return stats
    else:
        raise IOError("No segment statistics file in folder {}".format(stats_folder))
def get_subject_metric(stats_df, metric_name, aggregator='{dataframe}.median()',
                       channel_ordering=None, use_cache=True):
    """
    Returns the metric given by stats_df as a NDArray-like of shape
    (n_channels, 1), obtained by aggregating the given metric from the
    dataframe.
    :param stats_df: The statistics dataframe acquired from read_stats.
    :param metric_name: The metric to collect.
    :param aggregator: A string with an expression used to aggregate the per
                       segment statistic to a single statistic for the whole
                       subject.
    :param channel_ordering: An optional ordered sequence of channel names,
                             which will ensure that the outputted statistics
                             vector has the same order as the segment which
                             the statistic should be applied on.
    :param use_cache: If True, the metrics will be cached by the function so
                      that calling it multiple times for the same subject is
                      fast.
    :return: A NDArray of shape (n_channels, 1) where each element along
             axis 0 corresponds to the aggregated statistic for that channel.
    """
    cache = get_subject_metric.cache
    assert isinstance(stats_df, pd.DataFrame)
    if use_cache and id(stats_df) in cache and (metric_name, aggregator) in cache[id(stats_df)]:
        return cache[id(stats_df)][(metric_name, aggregator)]

    # The stats dataframes have a 2-level column index, where the first level
    # holds the channel names and the second the metric name. To get the
    # metric but keep the channels, we slice the first level with all the
    # entries using slice(None); this is equivalent to [:] for regular slices.
    if channel_ordering is None:
        segment_metrics = stats_df.loc[:, (slice(None), metric_name)]
    else:
        segment_metrics = stats_df.loc[:, (channel_ordering, metric_name)]

    aggregated_metric = eval(aggregator.format(dataframe='segment_metrics'))
    added_axis = aggregated_metric[:, np.newaxis]
    cache[id(stats_df)][(metric_name, aggregator)] = added_axis
    return added_axis
def cdf_to_ppf(self, data, params):
    '''
    Take the specific distribution's fitted parameters and calculate the
    cdf. Apply the inverse normal distribution to the cdf to get the SPI
    or SPEI. This process is best described in Lloyd-Hughes and Saunders,
    2002, which is included in the documentation.
    '''
    # Calculate the CDF of observed precipitation on a given time scale
    cdf = self.distr.cdf(
        data, *params[:-2], loc=params[-2], scale=params[-1]
    )
    # Apply inverse normal distribution
    norm_ppf = scipy.stats.norm.ppf(cdf)
    return norm_ppf
def joint_density(X, Y, bounds=None):
    """
    Plots joint distribution of variables.
    Inherited from method in src/graphics.py module in project
    git://github.com/aflaxman/pymc-example-tfr-hdi.git
    """
    if bounds:
        X_min, X_max, Y_min, Y_max = bounds
    else:
        X_min = X.min()
        X_max = X.max()
        Y_min = Y.min()
        Y_max = Y.max()

    pylab.plot(X, Y, linestyle='none', marker='o', color='green',
               mec='green', alpha=.2, zorder=-99)

    gkde = scipy.stats.gaussian_kde([X, Y])
    x, y = pylab.mgrid[X_min:X_max:(X_max - X_min) / 25.,
                       Y_min:Y_max:(Y_max - Y_min) / 25.]
    z = pylab.array(gkde.evaluate([x.flatten(), y.flatten()])).reshape(x.shape)
    pylab.contour(x, y, z, linewidths=2)

    pylab.axis([X_min, X_max, Y_min, Y_max])
def scatterfit(x, y, a=None, b=None):
    """
    Compute the mean deviation of the data about the linear model given if
    a, b (*y=ax+b*) are provided as arguments. Otherwise, compute the mean
    deviation about the best-fit line.

    :param x,y: assumed to be Numpy arrays.
    :param a,b: scalars.
    :rtype: float sd with the mean deviation.
    """
    if a is None:
        # Performs linear regression
        a, b, r, p, err = scipy.stats.linregress(x, y)

    # Std. deviation of an individual measurement (Bevington, eq. 6.15)
    N = numpy.size(x)
    sd = 1. / (N - 2.) * numpy.sum((y - a * x - b)**2)
    sd = numpy.sqrt(sd)

    return sd
def r2p(r, n):
    """
    Given a value of the Pearson r coefficient, this method gives the
    corresponding p-value of the null hypothesis (i.e. no correlation).
    Code copied from the scipy.stats.pearsonr source.

    Usage:

    >>> p = r2p(r, n)

    where 'r' is the Pearson coefficient and n is the number of data points.

    :returns: p-value of the null hypothesis.
    """
    df = n - 2
    if abs(r) == 1.0:
        prob = 0.0
    else:
        t_squared = r * r * (df / ((1.0 - r) * (1.0 + r)))
        prob = scipy.stats.betai(0.5 * df, 0.5, df / (df + t_squared))
    return prob
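Note that scipy.stats.betai was deprecated and later removed from SciPy. A minimal sketch of the same computation using scipy.special.betainc, the regularized incomplete beta function (assuming a SciPy version where betai is no longer available), could look like this:

import scipy.special

def r2p_betainc(r, n):
    # p-value of the null hypothesis of no correlation; same formula as r2p
    # above, but using the regularized incomplete beta function directly
    df = n - 2
    if abs(r) == 1.0:
        return 0.0
    t_squared = r * r * (df / ((1.0 - r) * (1.0 + r)))
    return scipy.special.betainc(0.5 * df, 0.5, df / (df + t_squared))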
def test_run_roi_stats_via_API():
    "Tests whether roi stats can be computed (not their accuracy) and the return values match in size."

    summary_methods = ['median', 'mean', 'std', 'variation', 'entropy', 'skew', 'kurtosis']
    # 'mode' returns more than one value; 'gmean' requires only positive values,
    # 'hmean' can not always be computed
    from scipy.stats import trim_mean, kstat
    from functools import partial
    trimmed_mean = partial(trim_mean, proportiontocut=0.05)
    third_kstat = partial(kstat, n=3)

    summary_methods.extend([trimmed_mean, third_kstat])

    # checking support for nan-handling callables
    summary_methods.extend([np.nanmedian, np.nanmean])

    for summary_method in summary_methods:
        roi_medians = graynet.roiwise_stats_indiv(subject_id_list, fs_dir,
                                                  base_feature=base_feature,
                                                  chosen_roi_stats=summary_method,
                                                  atlas=atlas, smoothing_param=fwhm,
                                                  out_dir=out_dir, return_results=True)
        for sub in subject_id_list:
            if roi_medians[sub].size != num_roi_wholebrain:
                raise ValueError('invalid summary stats - #nodes do not match.')
def coeff_summary(self):
    """
    Get the estimates of the terms in the model.

    :return: A DataFrame of betas, se, t (or z), p, llci, ulci for all
             variables of the model.
    """
    results = self.estimation_results
    if results:
        if "t" in results.keys():  # Model has t-stats rather than z-stats
            coeffs = np.array(
                [results["betas"], results["se"], results["t"], results["p"],
                 results["llci"], results["ulci"]]).T
            df = pd.DataFrame(coeffs, index=results["names"],
                              columns=["coeff", "se", "t", "p", "LLCI", "ULCI"])
        else:  # Model has z-stats.
            coeffs = np.array(
                [results["betas"], results["se"], results["z"], results["p"],
                 results["llci"], results["ulci"]]).T
            df = pd.DataFrame(coeffs, index=results["names"],
                              columns=["coeff", "se", "Z", "p", "LLCI", "ULCI"])
    else:
        raise NotImplementedError(
            "The model has not been estimated yet. Please estimate the model first."
        )
    return df
def _estimate(self):
    """
    Estimates the direct effect of X on Y, and returns the results as a
    dictionary.

    :return: dict
        A dictionary of parameters and model estimates.
    """
    mod_values = [i for i in product(*self._moderators_values)]
    mod_symb = self._moderators_symb
    betas, se, llci, ulci = self._get_conditional_direct_effects(mod_symb, mod_values)
    t = betas / se
    if self._is_logit:
        p = stats.norm.sf(np.abs(t)) * 2
    else:
        df_e = self._model.estimation_results["df_e"]
        p = stats.t.sf(np.abs(t), df_e) * 2
    estimation_results = {"betas": betas, "se": se, "t": t, "p": p,
                          "llci": llci, "ulci": ulci}
    return estimation_results
def _compute_log_likelihood(self, obs, pstates_idx):
    q = np.zeros(shape=(len(obs), self.n_hidden_states, self.n_features))
    for col, (feature_name, feature_distr) in enumerate(zip(self.emission_name,
                                                            self.emission_distr)):
        if feature_distr == 'normal':
            mu = self.emission[feature_name]['mu'][pstates_idx]
            sigma = self.emission[feature_name]['sigma'][pstates_idx]
            for j in range(self.n_hidden_states):
                q[:, j, col] = np.log(
                    np.maximum(MIN_PROBA,
                               stats.norm.pdf(obs[:, col], loc=mu[:, j], scale=sigma[:, j])))
        if feature_distr == 'lognormal':
            logmu = self.emission[feature_name]['logmu'][pstates_idx]
            logsigma = self.emission[feature_name]['logsigma'][pstates_idx]
            for j in range(self.n_hidden_states):
                q[:, j, col] = np.log(
                    np.maximum(MIN_PROBA,
                               stats.lognorm.pdf(obs[:, col], logsigma[:, j],
                                                 loc=0, scale=np.exp(logmu[:, j]))))
    q = q.sum(axis=2)
    return q
def test_rank_methods_series(self):
    tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
    import scipy
    from scipy.stats import rankdata

    xs = np.random.randn(9)
    xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
    np.random.shuffle(xs)

    index = [chr(ord('a') + i) for i in range(len(xs))]

    for vals in [xs, xs + 1e6, xs * 1e-6]:
        ts = Series(vals, index=index)

        for m in ['average', 'min', 'max', 'first', 'dense']:
            result = ts.rank(method=m)
            sprank = rankdata(vals, m if m != 'first' else 'ordinal')
            expected = Series(sprank, index=index)
            if LooseVersion(scipy.__version__) >= '0.17.0':
                expected = expected.astype('float64')
            tm.assert_series_equal(result, expected)
def test_skew(self):
    tm._skip_if_no_scipy()
    from scipy.stats import skew

    alt = lambda x: skew(x, bias=False)
    self._check_stat_op('skew', alt)

    # test corner cases, skew() returns NaN unless there's at least 3 values
    min_N = 3
    for i in range(1, min_N + 1):
        s = Series(np.ones(i))
        df = DataFrame(np.ones((i, i)))
        if i < min_N:
            self.assertTrue(np.isnan(s.skew()))
            self.assertTrue(np.isnan(df.skew()).all())
        else:
            self.assertEqual(0, s.skew())
            self.assertTrue((df.skew() == 0).all())
def test_kurt(self):
    tm._skip_if_no_scipy()
    from scipy.stats import kurtosis

    alt = lambda x: kurtosis(x, bias=False)
    self._check_stat_op('kurt', alt)

    index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                       labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
                               [0, 1, 0, 1, 0, 1]])
    s = Series(np.random.randn(6), index=index)
    self.assertAlmostEqual(s.kurt(), s.kurt(level=0)['bar'])

    # test corner cases, kurt() returns NaN unless there's at least 4 values
    min_N = 4
    for i in range(1, min_N + 1):
        s = Series(np.ones(i))
        df = DataFrame(np.ones((i, i)))
        if i < min_N:
            self.assertTrue(np.isnan(s.kurt()))
            self.assertTrue(np.isnan(df.kurt()).all())
        else:
            self.assertEqual(0, s.kurt())
            self.assertTrue((df.kurt() == 0).all())
def perform_test(self, dat):
    """
    dat: an instance of Data
    """
    with util.ContextTimer() as t:
        alpha = self.alpha
        X = dat.data()
        n = X.shape[0]

        # H: length-n vector
        _, H = self.compute_stat(dat, return_pointwise_stats=True)
        test_stat = np.sqrt(n / 2) * np.mean(H)
        stat_var = np.mean(H**2)
        pvalue = stats.norm.sf(test_stat, loc=0, scale=np.sqrt(stat_var))

    results = {'alpha': self.alpha, 'pvalue': pvalue, 'test_stat': test_stat,
               'h0_rejected': pvalue < alpha, 'time_secs': t.secs,
               }
    return results
def correlation(x, y, name="Pearson"):
    # Linear or rank correlation
    # Returns:
    #   r, rho, or tau (float) = the test statistic
    #   pvalue (float) = the p-value for the test
    assert len(x) == len(y), "Arrays x & y must be equal length."

    if name == "Pearson":
        (r, pvalue) = scipy.stats.pearsonr(x, y)
        return r, pvalue
    elif name == "Spearman":
        (rho, pvalue) = scipy.stats.spearmanr(x, y)
        return rho, pvalue
    elif name == "Kendall":
        (tau, pvalue) = scipy.stats.kendalltau(x, y)
        return tau, pvalue
    else:
        error = ("The {} correlation is not available. "
                 "Please use 'Pearson', 'Spearman', or 'Kendall'.")
        raise ValueError(error.format(name))
def _pval_from_histogram(T, H0, tail):
    """Get p-values from stats values given an H0 distribution

    For each stat compute a p-value as percentile of its statistics
    within all statistics in surrogate data
    """
    if tail not in [-1, 0, 1]:
        raise ValueError('invalid tail parameter')

    # from pct to fraction
    if tail == -1:  # up tail
        pval = np.array([np.sum(H0 <= t) for t in T])
    elif tail == 1:  # low tail
        pval = np.array([np.sum(H0 >= t) for t in T])
    else:  # both tails
        pval = np.array([np.sum(abs(H0) >= abs(t)) for t in T])

    pval = (pval + 1.0) / (H0.size + 1.0)  # the init data is one resampling
    return pval
def instantiate_dist(name, *opts):
    """
    Creates distribution for the given command-line arguments.
    """
    try:
        dist = getattr(stats, name)
    except AttributeError:
        raise ValueError("No such distribution {}".format(name))
    opts = list(opts)
    try:
        loc = float(opts.pop(0))
    except IndexError:
        loc = 0
    try:
        scale = float(opts.pop(0))
    except IndexError:
        scale = 1
    return dist(*map(float, opts), loc=loc, scale=scale)
def resample(self, data=[], temperature=1., stats=None):
    stats = self._get_statistics(data) if stats is None else stats

    alphas_n, betas_n, mu_n, nus_n = self._natural_to_standard(
        self.natural_hypparam + stats / temperature)

    D = mu_n.shape[0]
    self.sigmas = 1 / np.random.gamma(alphas_n, scale=1 / betas_n)
    self.mu = np.sqrt(self.sigmas / nus_n) * np.random.randn(D) + mu_n

    assert not np.isnan(self.mu).any()
    assert not np.isnan(self.sigmas).any()

    # NOTE: next line is to use Gibbs sampling to initialize mean field
    self.mf_mu = self.mu

    assert self.sigmas.ndim == 1
    return self
def max_likelihood(self, data, weights=None, stats=None):
    if stats is not None:
        n, tot = stats
    elif weights is None:
        n, tot = super(NegativeBinomialIntegerR, self)._get_statistics(data)
    else:
        n, tot = super(NegativeBinomialIntegerR, self)._get_weighted_statistics(data, weights)

    if n > 1:
        rs = self.r_support
        ps = self._max_likelihood_ps(n, tot, rs)

        # TODO TODO this isn't right for weighted data: do weighted sums
        if isinstance(data, np.ndarray):
            likelihoods = np.array([self.log_likelihood(data, r=r, p=p).sum()
                                    for r, p in zip(rs, ps)])
        else:
            likelihoods = np.array([sum(self.log_likelihood(d, r=r, p=p).sum() for d in data)
                                    for r, p in zip(rs, ps)])

        argmax = likelihoods.argmax()
        self.r = self.r_support[argmax]
        self.p = ps[argmax]
    return self
def bootstrap_extradata(self, nBoot, extradataA, nbins=20):
    pops = []
    meanpop = [[] for i in data.cat]
    pylab.figure(figsize=(14, 14))
    for i in xrange(min(4, len(extradataA))):
        # pylab.subplot(2,2,i+1)
        if i == 0:
            pylab.title("Bootstrap on means", fontsize=20.)
        pop = extradataA[i]  # & (self.GFP > 2000)]#
        for index in xrange(nBoot):
            newpop = np.random.choice(pop, size=len(pop), replace=True)
            # meanpop[i].append(np.mean(newpop))
            pops.append(newpop)
        pylab.legend()
        # pylab.title(cat[i])
        pylab.xlabel("Angle(degree)", fontsize=15)
        pylab.xlim([0., 90.])
    for i in xrange(len(extradataA)):
        for j in xrange(i + 1, len(extradataA)):
            statT, pvalue = scipy.stats.ttest_ind(pops[i], pops[j], equal_var=False)
            print "cat{0} & cat{1} get {2} ({3})".format(i, j, pvalue, statT)
    pylab.savefig("/users/biocomp/frose/frose/Graphics/FINALRESULTS-diff-f3/mean_nBootstrap{0}_bins{1}_GFPsup{2}_FLO_{3}.png".format(nBoot, nbins, 'all', randint(0, 999)))
def satisfies_dp(sensitivity, epsilon, delta, std):
    '''
    Return True if (epsilon, delta)-differential privacy is satisfied.
    '''
    # find lowest value at which epsilon differential-privacy is satisfied
    lower_x = -(float(epsilon) * (std**2.0) / sensitivity) + sensitivity / 2.0

    # determine lower tail probability of normal distribution w/ mean of zero
    lower_tail_prob = scipy.stats.norm.cdf(lower_x, 0, std)

    # explicitly return Boolean value to avoid returning numpy type
    if (lower_tail_prob <= delta):
        return True
    else:
        return False
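A minimal usage sketch for the check above, with hypothetical parameter values, that increases the Gaussian noise scale until the (epsilon, delta) condition is met:

# hypothetical parameters: unit sensitivity, epsilon = 0.5, delta = 1e-5
sensitivity, epsilon, delta = 1.0, 0.5, 1e-5

std = 1.0
while not satisfies_dp(sensitivity, epsilon, delta, std):
    std *= 1.1  # grow the noise scale until the tail-probability check passes
print("smallest tested std satisfying the check:", std)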
def print_privacy_allocation(stats_parameters, sigmas, epsilons, excess_noise_ratio):
    '''
    Print information about sigmas and noise ratios for each statistic.
    '''
    # print epsilons sorted by epsilon allocation
    epsilons_sorted = epsilons.keys()
    epsilons_sorted.sort(key=lambda x: epsilons[x], reverse=True)
    print('Epsilon values\n-----')
    for param in epsilons_sorted:
        print('{}: {}'.format(param, epsilons[param]))

    # print equalized ratio of sigma to expected value
    for param, stats in stats_parameters.iteritems():
        ratio = get_expected_noise_ratio(excess_noise_ratio,
                                         sigmas[param],
                                         stats[1])
        print('Ratio of sigma value to expected value: {}'.format(ratio))
        break
    print ""

    # print allocation of privacy budget
    sigma_params_sorted = sigmas.keys()
    # add dummy counter for full counters.noise.yaml
    dummy_counter = DEFAULT_DUMMY_COUNTER_NAME
    sigma_params_sorted.append(dummy_counter)
    sigmas[dummy_counter] = get_sanity_check_counter()['sigma']
    epsilons[dummy_counter] = get_sanity_check_counter()['epsilon']
    sigma_params_sorted.sort()
    print('Sigma values\n-----')
    print('counters:')
    for param in sigma_params_sorted:
        sigma = sigmas[param]
        print(' {}:'.format(param))
        print(' sigma: {:4f}'.format(sigma))
    print ""
def _setup_initial_blobs(self):
    # Define models
    self.score_model = ModelHelper(name="score_" + self.model_id)
    self.train_model = ModelHelper(name="train_" + self.model_id)

    # Create input, output, labels, and loss blobs
    self.input_blob = "ModelInput_" + self.model_id
    workspace.FeedBlob(self.input_blob, np.zeros(1, dtype=np.float32))
    self.output_blob = "ModelOutput_" + self.model_id
    workspace.FeedBlob(self.output_blob, np.zeros(1, dtype=np.float32))
    self.labels_blob = "ModelLabels_" + self.model_id
    workspace.FeedBlob(self.labels_blob, np.zeros(1, dtype=np.float32))
    self.loss_blob = "loss"  # "ModelLoss_" + self.model_id
    workspace.FeedBlob(self.loss_blob, np.zeros(1, dtype=np.float32))

    # Create blobs for model parameters
    self.weights: List[str] = []
    self.biases: List[str] = []

    for x in six.moves.range(len(self.layers) - 1):
        dim_in = self.layers[x]
        dim_out = self.layers[x + 1]

        weight_name = "Weights_" + str(x) + "_" + self.model_id
        bias_name = "Biases_" + str(x) + "_" + self.model_id
        self.weights.append(weight_name)
        self.biases.append(bias_name)

        bias = np.zeros(
            shape=[
                dim_out,
            ], dtype=np.float32
        )
        workspace.FeedBlob(bias_name, bias)

        gain = np.sqrt(2) if self.activations[x] == 'relu' else 1
        weights = scipy.stats.norm(0, gain * np.sqrt(1 / dim_in)).rvs(
            size=[dim_out, dim_in]
        ).astype(np.float32)
        workspace.FeedBlob(weight_name, weights)
def _setup_initial_blobs(self):
    MLTrainer._setup_initial_blobs(self)

    self.output_conv_blob = "Conv_output_{}".format(self.model_id)
    workspace.FeedBlob(self.output_conv_blob, np.zeros(1, dtype=np.float32))

    self.conv_weights: List[str] = []
    self.conv_biases: List[str] = []

    for x in six.moves.range(len(self.dims) - 1):
        dim_in = self.dims[x]
        dim_out = self.dims[x + 1]
        kernel_h = self.conv_height_kernels[x]
        kernel_w = self.conv_width_kernels[x]

        weight_shape = [dim_out, kernel_h, kernel_w, dim_in]
        bias_shape = [dim_out, ]

        conv_weight_name = "ConvWeights_" + str(x) + "_" + self.model_id
        bias_name = "ConvBiases_" + str(x) + "_" + self.model_id
        self.conv_weights.append(conv_weight_name)
        self.conv_biases.append(bias_name)

        conv_bias = np.zeros(shape=bias_shape, dtype=np.float32)
        workspace.FeedBlob(bias_name, conv_bias)

        conv_weights = scipy.stats.norm(0, np.sqrt(1 / dim_in)).rvs(
            size=weight_shape
        ).astype(np.float32)
        workspace.FeedBlob(conv_weight_name, conv_weights)
def robust_std(l, alpha=1 / scipy.stats.norm.ppf(0.75)):
    """
    Compute a robust estimate of the standard deviation for the list of
    values *l* (by default, for normally distributed samples --- see
    https://en.wikipedia.org/wiki/Median_absolute_deviation).
    """
    return alpha * mad(l)[0]
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = ArgumentParser('Report mean and standard deviation for the stream of numbers read from stdin',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    args = parser.parse_args(argv[1:])
    stats = Stats()
    for line in sys.stdin:
        stats(float(line))
    print(stats)
def get(name):
    try:
        return getattr(stats, name)
    except AttributeError:
        raise PlotnineError(
            "Unknown distribution '{}'".format(name))
def get_continuous_distribution(name):
    if name not in continuous:
        msg = "Unknown continuous distribution '{}'"
        raise ValueError(msg.format(name))

    return getattr(stats, name)
def get_gleu_stats(self, scores):
    """calculate mean and confidence interval from all GLEU iterations"""
    mean = np.mean(scores)
    std = np.std(scores)
    ci = scipy.stats.norm.interval(0.95, loc=mean, scale=std)
    return ['%f' % mean,
            '%f' % std,
            '(%.3f,%.3f)' % (ci[0], ci[1])]
def diff_exp(matrix, group1, group2, index):
    """Computes differential expression between group 1 and group 2
    for each column in the dataframe counts.

    Returns a dataframe of Z-scores and p-values."""
    g1 = matrix[group1, :]
    g2 = matrix[group2, :]
    g1mu = g1.mean(0)
    g2mu = g2.mean(0)

    mean_diff = np.asarray(g1mu - g2mu).flatten()
    # E[X^2] - (E[X])^2
    pooled_sd = np.sqrt(
        ((g1.power(2)).mean(0) - np.power(g1mu, 2)) / len(group1)
        + ((g2.power(2)).mean(0) - np.power(g2mu, 2)) / len(group2))
    pooled_sd = np.asarray(pooled_sd).flatten()

    z_scores = np.zeros_like(pooled_sd)
    nz = pooled_sd > 0
    z_scores[nz] = np.nan_to_num(mean_diff[nz] / pooled_sd[nz])

    # t-test
    p_vals = (1 - stats.norm.cdf(np.abs(z_scores))) * 2

    df = pd.DataFrame(OrderedDict([('z', z_scores), ('p', p_vals)]),
                      index=index)

    return df
def calculate_plate_summaries(self):
    """Get mean reads, percent mapping, etc summaries for each plate"""
    well_map = self.cell_metadata.groupby(Plates.SAMPLE_MAPPING)

    # these stats are from STAR mapping
    star_cols = ['Number of input reads', 'Uniquely mapped reads number']
    star_stats = self.mapping_stats[star_cols].groupby(
        self.cell_metadata[Plates.SAMPLE_MAPPING]).sum()

    total_reads = star_stats['Number of input reads']
    unique_reads = star_stats['Uniquely mapped reads number']

    percent_ercc = well_map.sum()['ercc'].divide(total_reads, axis=0)
    percent_mapped_reads = unique_reads / total_reads - percent_ercc

    plate_summaries = pd.DataFrame(OrderedDict([
        (Plates.MEAN_READS_PER_CELL, total_reads / well_map.size()),
        (Plates.MEDIAN_GENES_PER_CELL, well_map.median()['n_genes']),
        ('Percent not uniquely aligned',
         100 * well_map.sum()['alignment_not_unique'].divide(total_reads, axis=0)),
        (Plates.PERCENT_MAPPED_READS, 100 * percent_mapped_reads),
        ('Percent no feature',
         100 * well_map.sum()['no_feature'].divide(total_reads, axis=0)),
        ('Percent Rn45s',
         100 * self.genes['Rn45s'].groupby(
             self.cell_metadata[Plates.SAMPLE_MAPPING]).sum() / total_reads),
        (Plates.PERCENT_ERCC, 100 * percent_ercc),
        ('n_wells', well_map.size())
    ]))

    return plate_summaries
def _bin_exp(self, n_bin, scale=1.0):
    """ Calculate the bin locations to approximate exponential distribution.
        It breaks the cumulative probability of exponential distribution
        into n_bin equal bins, each covering 1 / n_bin probability. Then it
        calculates the center of mass in each bin and returns the centers
        of mass. So, it approximates the exponential distribution with
        n_bin Delta functions weighted by 1 / n_bin, at the locations
        of these centers of mass.

    Parameters:
    -----------
    n_bin: int
        The number of bins to approximate the exponential distribution

    scale: float, default: 1.0
        The scale parameter of the exponential distribution, defined in
        the same way as scipy.stats. It does not influence the ratios
        between the bins, but just controls the spacing between the bins.
        So generally users should not change its default.

    Returns:
    --------
    bins: numpy array of size [n_bin,]
        The centers of mass for each segment of the exponential
        distribution.
    """
    boundaries = np.flip(scipy.stats.expon.isf(
        np.linspace(0, 1, n_bin + 1),
        scale=scale), axis=0)
    bins = np.empty(n_bin)
    for i in np.arange(n_bin):
        bins[i] = utils.center_mass_exp(
            (boundaries[i], boundaries[i + 1]), scale=scale)
    return bins
def _set_SNR_grids(self):
    """ Set the grids and weights for SNR used in numerical integration
        of SNR parameters.
    """
    if self.SNR_prior == 'unif':
        SNR_grids = np.linspace(0, 1, self.SNR_bins)
        SNR_weights = np.ones(self.SNR_bins) / (self.SNR_bins - 1)
        SNR_weights[0] = SNR_weights[0] / 2.0
        SNR_weights[-1] = SNR_weights[-1] / 2.0
    elif self.SNR_prior == 'lognorm':
        dist = scipy.stats.lognorm
        alphas = np.arange(np.mod(self.SNR_bins, 2),
                           self.SNR_bins + 2, 2) / self.SNR_bins
        # The goal here is to divide the area under the pdf curve
        # to segments representing equal probabilities.
        bounds = dist.interval(alphas, (self.logS_range,))
        bounds = np.unique(bounds)
        # bounds contain the boundaries which equally separate
        # the probability mass of the distribution
        SNR_grids = np.zeros(self.SNR_bins)
        for i in np.arange(self.SNR_bins):
            SNR_grids[i] = dist.expect(
                lambda x: x, args=(self.logS_range,),
                lb=bounds[i], ub=bounds[i + 1]) * self.SNR_bins
        # Center of mass of each segment between consecutive
        # bounds are set as the grids for SNR.
        SNR_weights = np.ones(self.SNR_bins) / self.SNR_bins
    else:  # SNR_prior == 'exp'
        SNR_grids = self._bin_exp(self.SNR_bins)
        SNR_weights = np.ones(self.SNR_bins) / self.SNR_bins

    SNR_weights = SNR_weights / np.sum(SNR_weights)
    return SNR_grids, SNR_weights
def lnfunc(self, args):
    # define the function which needs to be sampled (log posterior)
    # Do not use self.args in this function. You can use self.eargs.
    return scipy.stats.norm.logpdf(args['x'], loc=args['mu'], scale=args['sigma'])
def lnfunc(self, args):
    # log posterior
    temp1 = scipy.stats.norm.logpdf(args['xt'], loc=args['mu'], scale=args['sigma'])
    temp2 = scipy.stats.norm.logpdf(args['x'], loc=args['xt'], scale=args['sigma_x'])
    return temp1 + temp2
def lnfunc(self, args):
    if self.eargs['outliers'] == False:
        temp1 = (args['y'] - (self.args['m'] * self.args['x'] + self.args['c'])) / args['sigma_y']
        return -0.5 * (temp1 * temp1) - np.log(np.sqrt(2 * np.pi) * args['sigma_y'])
    else:
        temp11 = scipy.stats.norm.pdf(args['y'],
                                      loc=(self.args['m'] * self.args['x'] + self.args['c']),
                                      scale=args['sigma_y'])
        sigma_b = np.sqrt(np.square(args['sigma_y']) + np.square(args['sigma_b']))
        temp22 = scipy.stats.norm.pdf(args['y'], loc=self.args['mu_b'], scale=sigma_b)
        return np.log((1 - args['p_b']) * temp11 + args['p_b'] * temp22)
def lnfunc(self, args):
    # log posterior, xt has been integrated out
    sigma = np.sqrt(args['sigma'] * args['sigma'] + args['sigma_x'] * args['sigma_x'])
    temp = scipy.stats.norm.logpdf(args['x'], loc=args['mu'], scale=sigma)
    return temp
def lnfunc(self, args):
    # log posterior
    temp1 = []
    for i, y in enumerate(self.data['y']):
        temp1.append(np.sum(self.lngauss(y - args['alpha'][i], self.eargs['sigma'])))
    temp1 = np.array(temp1)
    temp2 = scipy.stats.norm.logpdf(args['alpha'], loc=args['mu'], scale=args['omega'])
    return temp1 + temp2
def __init__(self, stats, channels, target_dim):
    self.stats = stats
    self.channels = channels
    self.target_dim = (target_dim[0], target_dim[1])
    self.illum_corr_func = np.zeros((self.target_dim[0], self.target_dim[1], len(self.channels)))

# Based on Sing et al. 2014 paper
def channel_function(self, mean_channel, disk_size):
    # TODO: get np.type from other source or parameterize or compute :/
    # We currently assume 16 bit images
    operator = skimage.morphology.disk(disk_size)
    filtered_channel = skimage.filters.median(mean_channel.astype(np.uint16), operator)
    filtered_channel = skimage.transform.resize(filtered_channel, self.target_dim,
                                                preserve_range=True)
    robust_minimum = scipy.stats.scoreatpercentile(filtered_channel, 2)
    filtered_channel = np.maximum(filtered_channel, robust_minimum)
    illum_corr_func = filtered_channel / robust_minimum
    return illum_corr_func
def compute_all(self, median_filter_size):
    disk_size = median_filter_size / 2  # From diameter to radius
    for ch in range(len(self.channels)):
        self.illum_corr_func[:, :, ch] = self.channel_function(
            self.stats["mean_image"][:, :, ch], disk_size)

# TODO: Is image a uint16 or float32/64? What is its data type?
# TODO: Update the test as appropriate. Not being used? Not needed?