The following 50 code examples, extracted from open-source Python projects, illustrate how to use the scipy.stats module.
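Before the project examples, here is a minimal sketch (not taken from any of the projects below) of the common scipy.stats pattern: freeze a distribution and call its pdf, cdf, and ppf methods.

import scipy.stats

# freeze a normal distribution with mean 0 and standard deviation 1
dist = scipy.stats.norm(loc=0.0, scale=1.0)

print(dist.pdf(0.0))    # density at 0
print(dist.cdf(1.96))   # cumulative probability up to 1.96
print(dist.ppf(0.975))  # inverse of the cdf, approximately 1.96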
def ppf(x, n):
    try:
        from scipy import stats
    except ImportError:
        stats = None
    if stats:
        if n < 30:
            return stats.t.ppf(x, n)
        return stats.norm.ppf(x)
    else:
        if n < 30:
            # TODO: implement power series:
            # http://eprints.maths.ox.ac.uk/184/1/tdist.pdf
            raise ImportError(
                'You must have scipy installed to use t-student '
                'when sample_size is below 30')
        return norm_ppf(x)

# According to http://sphweb.bumc.bu.edu/otlt/MPH-Modules/BS/
# BS704_Confidence_Intervals/BS704_Confidence_Intervals_print.html
def get_sanity_check_counter():
    '''
    Provide a dictionary with the standard sanity check counter values.
    It is typically used like:
        counters['ZeroCount'] = get_sanity_check_counter()
    All of these values are unused, except for:
        bins:
            - [-inf, inf] (a long counter is appended by SecureCounters,
              it should only ever have blinding values added)
        estimated_value: 0.0 (TODO: used for checking if stats have changed)
        sigma: 0.0 (used for adding noise, 0.0 means no noise is added)
    '''
    sanity_check = {}
    sanity_check['bins'] = []
    single_bin = [float('-inf'), float('inf')]
    sanity_check['bins'].append(single_bin)
    sanity_check['sensitivity'] = 0.0
    sanity_check['estimated_value'] = 0.0
    sanity_check['sigma'] = 0.0
    sanity_check['epsilon'] = 0.0
    sanity_check['expected_noise_ratio'] = 0.0
    return sanity_check
def test_rank_methods_frame(self):
    tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
    import scipy
    from scipy.stats import rankdata

    xs = np.random.randint(0, 21, (100, 26))
    xs = (xs - 10.0) / 10.0
    cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

    for vals in [xs, xs + 1e6, xs * 1e-6]:
        df = DataFrame(vals, columns=cols)

        for ax in [0, 1]:
            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = df.rank(axis=ax, method=m)
                sprank = np.apply_along_axis(
                    rankdata, ax, vals,
                    m if m != 'first' else 'ordinal')
                sprank = sprank.astype(np.float64)
                expected = DataFrame(sprank, columns=cols)
                if LooseVersion(scipy.__version__) >= '0.17.0':
                    expected = expected.astype('float64')
                tm.assert_frame_equal(result, expected)
def gleu(self, stats, smooth=False):
    """Compute GLEU from collected statistics obtained by call(s) to gleu_stats"""
    # smooth 0 counts for sentence-level scores
    if smooth:
        stats = [s if s != 0 else 1 for s in stats]
    if len(filter(lambda x: x == 0, stats)) > 0:
        return 0
    (c, r) = stats[:2]
    log_gleu_prec = sum([math.log(float(x) / y)
                         for x, y in zip(stats[2::2], stats[3::2])]) / 4
    for i, (x, y) in enumerate(zip(stats[2::2], stats[3::2])):
        pass
        # print 'Precision', i+1, '=', x, '/', y, '=', 1.*x/y
    # log_gleu_prec = sum([math.log(float(x) / y)
    #                      for x, y in zip(stats[2::2], stats[3::2])]) / 4
    return math.exp(min([0, 1 - float(r) / c]) + log_gleu_prec)
def clean_mapping_stats(mapping_stats_original, convert_to_percentage=None):
    """Remove whitespace from all values and convert to numbers"""
    if convert_to_percentage is None:
        convert_to_percentage = set()

    mapping_stats_original = mapping_stats_original.applymap(
        lambda x: (x.replace(',', '').strip().strip('%')
                   if isinstance(x, str) else x))

    numeric = mapping_stats_original.apply(maybe_to_numeric)

    numeric.columns = numeric.columns.map(str.strip)

    # for 10X mapping stats
    numeric.columns = numeric.columns.map(
        lambda x: ('Percent {}'.format(x.replace('Fraction ', ''))
                   if x in convert_to_percentage else x)
    )
    return numeric
def prior_GP_var_half_cauchy(y_invK_y, n_y, tau_range):
    """ Imposing a half-Cauchy prior onto the standard deviation (tau)
        of the Gaussian Process which is in turn a prior imposed over
        a function y = f(x).
        The scale parameter of the half-Cauchy prior is tau_range.
        The function returns the MAP estimate of tau^2 and
        log(p(tau|tau_range)) for the MAP value of tau^2,
        where tau_range describes the reasonable range of tau
        in the half-Cauchy prior.
        An alternative form of prior is inverse-Gamma prior on tau^2.
        Inverse-Gamma prior penalizes for both very small and very
        large values of tau, while half-Cauchy prior only penalizes
        for very large values of tau.
        For more information on usage, see description in BRSA class:
        `.BRSA`
    """
    tau2 = (y_invK_y - n_y * tau_range**2
            + np.sqrt(n_y**2 * tau_range**4
                      + (2 * n_y + 8) * tau_range**2 * y_invK_y
                      + y_invK_y**2)) \
        / 2 / (n_y + 2)
    log_ptau = scipy.stats.halfcauchy.logpdf(
        tau2**0.5, scale=tau_range)
    return tau2, log_ptau
def _zscore(a):
    """ Calculating z-score of data on the first axis.
        If the numbers in any column are all equal, scipy.stats.zscore
        will return NaN for this column. We shall correct them all to
        be zeros.

    Parameters
    ----------
    a: numpy array

    Returns
    -------
    zscore: numpy array
        The z-scores of input "a", with any columns including non-finite
        numbers replaced by all zeros.
    """
    assert a.ndim > 1, 'a must have more than one dimensions'
    zscore = scipy.stats.zscore(a, axis=0)
    zscore[:, np.logical_not(np.all(np.isfinite(zscore), axis=0))] = 0
    return zscore
def mad(self, median=None):
    """
    Return the median absolute deviation for this segment only, scaled by
    phi-1(3/4) to approximate the standard deviation.
    :param median: If this is not None, it will be used as the median from
                   which the deviation is calculated.
    :return: A NDarray of shape (n_channels, 1) with the MAD for the
             channels of this segment.
    """
    c = scipy.stats.norm.ppf(3 / 4)
    if median is None:
        subject_median = self.median()
    else:
        subject_median = median
    median_residuals = self.mat_struct.data - subject_median  # deviation between median and data
    mad = np.median(np.abs(median_residuals), axis=1)[:, np.newaxis]
    return mad / c
def median_absolute_deviation(data, subject_median=None, axis=0):
    """
    Returns the median absolute deviation (MAD) for the given data.
    :param data: A DataFrame holding the data to calculate the MAD from.
    :param subject_median: If given, these will be used as the medians which
                           the deviation is calculated from. If None, the
                           median of the given data is used.
    :param axis: The axis to calculate over.
    :return: A Series object with the median absolute deviations over the
             given *axis* for the *data*.
    """
    """A robust estimate of the standard deviation"""
    # The scaling factor for estimating the standard deviation from the MAD
    c = scipy.stats.norm.ppf(3 / 4)
    if isinstance(data, pd.DataFrame):
        if subject_median is None:
            subject_median = data.median(axis=axis)
        median_residuals = data - subject_median
        mad = median_residuals.abs().median(axis=axis)
    else:
        if subject_median is None:
            subject_median = np.median(data, axis=axis)
        median_residuals = data - subject_median[:, np.newaxis]  # deviation between median and data
        mad = np.median(np.abs(median_residuals), axis=axis)
    return mad / c
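As a quick check of the scaling used above, here is a minimal standalone sketch (assuming only NumPy and SciPy) that compares the MAD-based estimate with the sample standard deviation for normally distributed data; the two values should be close.

import numpy as np
import scipy.stats

rng = np.random.default_rng(0)
x = rng.normal(loc=0.0, scale=2.0, size=100000)

c = scipy.stats.norm.ppf(3 / 4)            # scaling factor, about 0.6745
mad = np.median(np.abs(x - np.median(x)))  # median absolute deviation
print(mad / c, x.std())                    # both should be close to 2.0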
def read_folder(stats_folder, metrics=None):
    """
    Reads all segment statistics files from the given folder and returns the
    statistics as a single DataFrame.
    :param stats_folder: The folder to read statistics files from.
    :param metrics: A subset of the metrics to load.
    :return: A DataFrame with all the statistics available in stats_folder.
    """
    if metrics is None:
        metrics = ['absolute mean', 'absolute median', 'kurtosis', 'skew', 'std']
    glob_pattern = os.path.join(stats_folder, '*segments_statistics.csv')
    files = glob.glob(glob_pattern)
    if files:
        stats = pd.concat([read_stats(stat_file, metrics) for stat_file in files])
        return stats
    else:
        raise IOError("No segment statistics file in folder {}".format(stats_folder))
def get_subject_metric(stats_df, metric_name, aggregator='{dataframe}.median()',
                       channel_ordering=None, use_cache=True):
    """
    Returns the metric given by stats_df as a NDArray-like of shape
    (n_channels, 1), obtained by aggregating the given metric from the
    dataframe.
    :param stats_df: The statistics dataframe acquired from read_stats.
    :param metric_name: The metric to collect.
    :param aggregator: A string with an expression used to aggregate the per
                       segment statistic to a single statistic for the whole
                       subject.
    :param channel_ordering: An optional ordered sequence of channel names,
                             which will ensure that the outputted statistics
                             vector has the same order as the segment which
                             the statistic should be applied on.
    :param use_cache: If True, the metrics will be cached by the function so
                      that calling it multiple times for the same subject is
                      fast.
    :return: A NDArray of shape (n_channels, 1) where each element along
             axis 0 corresponds to the aggregated statistic for that channel.
    """
    cache = get_subject_metric.cache
    assert isinstance(stats_df, pd.DataFrame)
    if use_cache and id(stats_df) in cache and (metric_name, aggregator) in cache[id(stats_df)]:
        return cache[id(stats_df)][(metric_name, aggregator)]

    # The stats dataframes have a 2-level column index, where the first level
    # holds the channel names and the second the metric name. To get the
    # metric but keep the channels, we slice the first level with all the
    # entries using slice(None); this is equivalent to [:] for regular slices.
    if channel_ordering is None:
        segment_metrics = stats_df.loc[:, (slice(None), metric_name)]
    else:
        segment_metrics = stats_df.loc[:, (channel_ordering, metric_name)]

    aggregated_metric = eval(aggregator.format(dataframe='segment_metrics'))
    added_axis = aggregated_metric[:, np.newaxis]
    cache[id(stats_df)][(metric_name, aggregator)] = added_axis
    return added_axis
def cdf_to_ppf(self, data, params):
    '''
    Take the specific distribution's fitted parameters and calculate the
    cdf. Apply the inverse normal distribution to the cdf to get the SPI
    or SPEI. This process is best described in Lloyd-Hughes and Saunders,
    2002, which is included in the documentation.
    '''
    # Calculate the CDF of observed precipitation on a given time scale
    cdf = self.distr.cdf(
        data, *params[:-2], loc=params[-2], scale=params[-1]
    )
    # Apply inverse normal distribution
    norm_ppf = scipy.stats.norm.ppf(cdf)
    return norm_ppf
def joint_density(X, Y, bounds=None):
    """
    Plots joint distribution of variables.
    Inherited from method in src/graphics.py module in project
    git://github.com/aflaxman/pymc-example-tfr-hdi.git
    """
    if bounds:
        X_min, X_max, Y_min, Y_max = bounds
    else:
        X_min = X.min()
        X_max = X.max()
        Y_min = Y.min()
        Y_max = Y.max()

    pylab.plot(X, Y, linestyle='none', marker='o', color='green',
               mec='green', alpha=.2, zorder=-99)

    gkde = scipy.stats.gaussian_kde([X, Y])
    x, y = pylab.mgrid[X_min:X_max:(X_max - X_min) / 25.,
                       Y_min:Y_max:(Y_max - Y_min) / 25.]
    z = pylab.array(gkde.evaluate([x.flatten(), y.flatten()])).reshape(x.shape)
    pylab.contour(x, y, z, linewidths=2)

    pylab.axis([X_min, X_max, Y_min, Y_max])
def scatterfit(x, y, a=None, b=None):
    """
    Compute the mean deviation of the data about the linear model given if
    a, b (*y=ax+b*) are provided as arguments. Otherwise, compute the mean
    deviation about the best-fit line.

    :param x,y: assumed to be Numpy arrays.
    :param a,b: scalars.
    :rtype: float sd with the mean deviation.
    """
    if a is None:
        # Performs linear regression
        a, b, r, p, err = scipy.stats.linregress(x, y)

    # Std. deviation of an individual measurement (Bevington, eq. 6.15)
    N = numpy.size(x)
    sd = 1. / (N - 2.) * numpy.sum((y - a * x - b)**2)
    sd = numpy.sqrt(sd)

    return sd
def r2p(r, n):
    """
    Given a value of the Pearson r coefficient, this method gives the
    corresponding p-value of the null hypothesis (i.e. no correlation).
    Code copied from the scipy.stats.pearsonr source.

    Usage:

    >>> p = r2p(r, n)

    where 'r' is the Pearson coefficient and n is the number of data points.

    :returns: p-value of the null hypothesis.
    """
    df = n - 2
    if abs(r) == 1.0:
        prob = 0.0
    else:
        t_squared = r * r * (df / ((1.0 - r) * (1.0 + r)))
        prob = scipy.stats.betai(0.5 * df, 0.5, df / (df + t_squared))
    return prob
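Note that scipy.stats.betai was deprecated and later removed from SciPy. A minimal sketch of the same computation using scipy.special.betainc, the regularized incomplete beta function (assuming a SciPy version where betai is no longer available), could look like this:

import scipy.special

def r2p_betainc(r, n):
    # p-value of the null hypothesis of no correlation; same formula as r2p
    # above, but using the regularized incomplete beta function directly
    df = n - 2
    if abs(r) == 1.0:
        return 0.0
    t_squared = r * r * (df / ((1.0 - r) * (1.0 + r)))
    return scipy.special.betainc(0.5 * df, 0.5, df / (df + t_squared))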
def test_run_roi_stats_via_API():
    "Tests whether roi stats can be computed (not their accuracy) and the return values match in size."

    summary_methods = ['median', 'mean', 'std', 'variation', 'entropy', 'skew', 'kurtosis']
    # 'mode' returns more than one value; 'gmean' requires only positive values,
    # 'hmean' can not always be computed
    from scipy.stats import trim_mean, kstat
    from functools import partial
    trimmed_mean = partial(trim_mean, proportiontocut=0.05)
    third_kstat = partial(kstat, n=3)

    summary_methods.extend([trimmed_mean, third_kstat])

    # checking support for nan-handling callables
    summary_methods.extend([np.nanmedian, np.nanmean])

    for summary_method in summary_methods:
        roi_medians = graynet.roiwise_stats_indiv(subject_id_list, fs_dir,
                                                  base_feature=base_feature,
                                                  chosen_roi_stats=summary_method,
                                                  atlas=atlas, smoothing_param=fwhm,
                                                  out_dir=out_dir, return_results=True)
        for sub in subject_id_list:
            if roi_medians[sub].size != num_roi_wholebrain:
                raise ValueError('invalid summary stats - #nodes do not match.')
def coeff_summary(self):
    """
    Get the estimates of the terms in the model.

    :return: A DataFrame of betas, se, t (or z), p, llci, ulci for all
             variables of the model.
    """
    results = self.estimation_results
    if results:
        if "t" in results.keys():  # Model has t-stats rather than z-stats
            coeffs = np.array(
                [results["betas"], results["se"], results["t"], results["p"],
                 results["llci"], results["ulci"]]).T
            df = pd.DataFrame(coeffs, index=results["names"],
                              columns=["coeff", "se", "t", "p", "LLCI", "ULCI"])
        else:  # Model has z-stats.
            coeffs = np.array(
                [results["betas"], results["se"], results["z"], results["p"],
                 results["llci"], results["ulci"]]).T
            df = pd.DataFrame(coeffs, index=results["names"],
                              columns=["coeff", "se", "Z", "p", "LLCI", "ULCI"])
    else:
        raise NotImplementedError(
            "The model has not been estimated yet. Please estimate the model first."
        )
    return df
def _estimate(self):
    """
    Estimates the direct effect of X on Y, and returns the results as a
    dictionary.

    :return: dict
        A dictionary of parameters and model estimates.
    """
    mod_values = [i for i in product(*self._moderators_values)]
    mod_symb = self._moderators_symb
    betas, se, llci, ulci = self._get_conditional_direct_effects(mod_symb, mod_values)
    t = betas / se
    if self._is_logit:
        p = stats.norm.sf(np.abs(t)) * 2
    else:
        df_e = self._model.estimation_results["df_e"]
        p = stats.t.sf(np.abs(t), df_e) * 2
    estimation_results = {"betas": betas, "se": se, "t": t, "p": p,
                          "llci": llci, "ulci": ulci}
    return estimation_results
def _compute_log_likelihood(self, obs, pstates_idx):
    q = np.zeros(shape=(len(obs), self.n_hidden_states, self.n_features))
    for col, (feature_name, feature_distr) in enumerate(zip(self.emission_name,
                                                            self.emission_distr)):
        if feature_distr == 'normal':
            mu = self.emission[feature_name]['mu'][pstates_idx]
            sigma = self.emission[feature_name]['sigma'][pstates_idx]
            for j in range(self.n_hidden_states):
                q[:, j, col] = np.log(
                    np.maximum(MIN_PROBA,
                               stats.norm.pdf(obs[:, col], loc=mu[:, j], scale=sigma[:, j])))
        if feature_distr == 'lognormal':
            logmu = self.emission[feature_name]['logmu'][pstates_idx]
            logsigma = self.emission[feature_name]['logsigma'][pstates_idx]
            for j in range(self.n_hidden_states):
                q[:, j, col] = np.log(
                    np.maximum(MIN_PROBA,
                               stats.lognorm.pdf(obs[:, col], logsigma[:, j],
                                                 loc=0, scale=np.exp(logmu[:, j]))))
    q = q.sum(axis=2)
    return q
def test_rank_methods_series(self):
    tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
    import scipy
    from scipy.stats import rankdata

    xs = np.random.randn(9)
    xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
    np.random.shuffle(xs)

    index = [chr(ord('a') + i) for i in range(len(xs))]

    for vals in [xs, xs + 1e6, xs * 1e-6]:
        ts = Series(vals, index=index)

        for m in ['average', 'min', 'max', 'first', 'dense']:
            result = ts.rank(method=m)
            sprank = rankdata(vals, m if m != 'first' else 'ordinal')
            expected = Series(sprank, index=index)
            if LooseVersion(scipy.__version__) >= '0.17.0':
                expected = expected.astype('float64')
            tm.assert_series_equal(result, expected)
def test_skew(self):
    tm._skip_if_no_scipy()
    from scipy.stats import skew

    alt = lambda x: skew(x, bias=False)
    self._check_stat_op('skew', alt)

    # test corner cases, skew() returns NaN unless there's at least 3 values
    min_N = 3
    for i in range(1, min_N + 1):
        s = Series(np.ones(i))
        df = DataFrame(np.ones((i, i)))
        if i < min_N:
            self.assertTrue(np.isnan(s.skew()))
            self.assertTrue(np.isnan(df.skew()).all())
        else:
            self.assertEqual(0, s.skew())
            self.assertTrue((df.skew() == 0).all())
def test_kurt(self):
    tm._skip_if_no_scipy()
    from scipy.stats import kurtosis

    alt = lambda x: kurtosis(x, bias=False)
    self._check_stat_op('kurt', alt)

    index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                       labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
                               [0, 1, 0, 1, 0, 1]])
    s = Series(np.random.randn(6), index=index)
    self.assertAlmostEqual(s.kurt(), s.kurt(level=0)['bar'])

    # test corner cases, kurt() returns NaN unless there's at least 4 values
    min_N = 4
    for i in range(1, min_N + 1):
        s = Series(np.ones(i))
        df = DataFrame(np.ones((i, i)))
        if i < min_N:
            self.assertTrue(np.isnan(s.kurt()))
            self.assertTrue(np.isnan(df.kurt()).all())
        else:
            self.assertEqual(0, s.kurt())
            self.assertTrue((df.kurt() == 0).all())
def perform_test(self, dat):
    """
    dat: an instance of Data
    """
    with util.ContextTimer() as t:
        alpha = self.alpha
        X = dat.data()
        n = X.shape[0]

        # H: length-n vector
        _, H = self.compute_stat(dat, return_pointwise_stats=True)
        test_stat = np.sqrt(n / 2) * np.mean(H)
        stat_var = np.mean(H**2)
        pvalue = stats.norm.sf(test_stat, loc=0, scale=np.sqrt(stat_var))

    results = {'alpha': self.alpha, 'pvalue': pvalue, 'test_stat': test_stat,
               'h0_rejected': pvalue < alpha, 'time_secs': t.secs,
               }
    return results
def correlation(x, y, name="Pearson"):
    # Linear or rank correlation
    # Returns:
    #   r, rho, or tau (float) = the test statistic
    #   pvalue (float) = the p-value for the test
    assert len(x) == len(y), "Arrays x & y must be equal length."

    if name == "Pearson":
        (r, pvalue) = scipy.stats.pearsonr(x, y)
        return r, pvalue
    elif name == "Spearman":
        (rho, pvalue) = scipy.stats.spearmanr(x, y)
        return rho, pvalue
    elif name == "Kendall":
        (tau, pvalue) = scipy.stats.kendalltau(x, y)
        return tau, pvalue
    else:
        error = ("The {} correlation is not available. "
                 "Please use 'Pearson', 'Spearman', or 'Kendall'.")
        raise ValueError(error.format(name))
def _pval_from_histogram(T, H0, tail):
    """Get p-values from stats values given an H0 distribution

    For each stat compute a p-value as percentile of its statistics
    within all statistics in surrogate data
    """
    if tail not in [-1, 0, 1]:
        raise ValueError('invalid tail parameter')

    # from pct to fraction
    if tail == -1:  # up tail
        pval = np.array([np.sum(H0 <= t) for t in T])
    elif tail == 1:  # low tail
        pval = np.array([np.sum(H0 >= t) for t in T])
    else:  # both tails
        pval = np.array([np.sum(abs(H0) >= abs(t)) for t in T])

    pval = (pval + 1.0) / (H0.size + 1.0)  # the init data is one resampling
    return pval
def instantiate_dist(name, *opts):
    """
    Creates distribution for the given command-line arguments.
    """
    try:
        dist = getattr(stats, name)
    except AttributeError:
        raise ValueError("No such distribution {}".format(name))
    opts = list(opts)
    try:
        loc = float(opts.pop(0))
    except IndexError:
        loc = 0
    try:
        scale = float(opts.pop(0))
    except IndexError:
        scale = 1
    return dist(*map(float, opts), loc=loc, scale=scale)
def resample(self, data=[], temperature=1., stats=None):
    stats = self._get_statistics(data) if stats is None else stats

    alphas_n, betas_n, mu_n, nus_n = self._natural_to_standard(
        self.natural_hypparam + stats / temperature)

    D = mu_n.shape[0]
    self.sigmas = 1 / np.random.gamma(alphas_n, scale=1 / betas_n)
    self.mu = np.sqrt(self.sigmas / nus_n) * np.random.randn(D) + mu_n

    assert not np.isnan(self.mu).any()
    assert not np.isnan(self.sigmas).any()

    # NOTE: next line is to use Gibbs sampling to initialize mean field
    self.mf_mu = self.mu

    assert self.sigmas.ndim == 1
    return self
def max_likelihood(self, data, weights=None, stats=None):
    if stats is not None:
        n, tot = stats
    elif weights is None:
        n, tot = super(NegativeBinomialIntegerR, self)._get_statistics(data)
    else:
        n, tot = super(NegativeBinomialIntegerR, self)._get_weighted_statistics(data, weights)

    if n > 1:
        rs = self.r_support
        ps = self._max_likelihood_ps(n, tot, rs)

        # TODO TODO this isn't right for weighted data: do weighted sums
        if isinstance(data, np.ndarray):
            likelihoods = np.array([self.log_likelihood(data, r=r, p=p).sum()
                                    for r, p in zip(rs, ps)])
        else:
            likelihoods = np.array([sum(self.log_likelihood(d, r=r, p=p).sum() for d in data)
                                    for r, p in zip(rs, ps)])

        argmax = likelihoods.argmax()
        self.r = self.r_support[argmax]
        self.p = ps[argmax]
    return self
def bootstrap_extradata(self, nBoot, extradataA, nbins=20):
    pops = []
    meanpop = [[] for i in data.cat]
    pylab.figure(figsize=(14, 14))
    for i in xrange(min(4, len(extradataA))):
        # pylab.subplot(2,2,i+1)
        if i == 0:
            pylab.title("Bootstrap on means", fontsize=20.)
        pop = extradataA[i]  # & (self.GFP > 2000)]#
        for index in xrange(nBoot):
            newpop = np.random.choice(pop, size=len(pop), replace=True)
            # meanpop[i].append(np.mean(newpop))
            pops.append(newpop)
        pylab.legend()
        # pylab.title(cat[i])
        pylab.xlabel("Angle(degree)", fontsize=15)
        pylab.xlim([0., 90.])
    for i in xrange(len(extradataA)):
        for j in xrange(i + 1, len(extradataA)):
            statT, pvalue = scipy.stats.ttest_ind(pops[i], pops[j], equal_var=False)
            print "cat{0} & cat{1} get {2} ({3})".format(i, j, pvalue, statT)
    pylab.savefig("/users/biocomp/frose/frose/Graphics/FINALRESULTS-diff-f3/mean_nBootstrap{0}_bins{1}_GFPsup{2}_FLO_{3}.png".format(nBoot, nbins, 'all', randint(0, 999)))
def satisfies_dp(sensitivity, epsilon, delta, std):
    '''
    Return True if (epsilon, delta)-differential privacy is satisfied.
    '''
    # find lowest value at which epsilon differential-privacy is satisfied
    lower_x = -(float(epsilon) * (std**2.0) / sensitivity) + sensitivity / 2.0

    # determine lower tail probability of normal distribution w/ mean of zero
    lower_tail_prob = scipy.stats.norm.cdf(lower_x, 0, std)

    # explicitly return Boolean value to avoid returning numpy type
    if (lower_tail_prob <= delta):
        return True
    else:
        return False
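A minimal usage sketch for the check above, with hypothetical parameter values, that increases the Gaussian noise scale until the (epsilon, delta) condition is met:

# hypothetical parameters: unit sensitivity, epsilon = 0.5, delta = 1e-5
sensitivity, epsilon, delta = 1.0, 0.5, 1e-5

std = 1.0
while not satisfies_dp(sensitivity, epsilon, delta, std):
    std *= 1.1  # grow the noise scale until the tail-probability check passes
print("smallest tested std satisfying the check:", std)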
def print_privacy_allocation(stats_parameters, sigmas, epsilons, excess_noise_ratio):
    '''
    Print information about sigmas and noise ratios for each statistic.
    '''
    # print epsilons sorted by epsilon allocation
    epsilons_sorted = epsilons.keys()
    epsilons_sorted.sort(key=lambda x: epsilons[x], reverse=True)
    print('Epsilon values\n-----')
    for param in epsilons_sorted:
        print('{}: {}'.format(param, epsilons[param]))

    # print equalized ratio of sigma to expected value
    for param, stats in stats_parameters.iteritems():
        ratio = get_expected_noise_ratio(excess_noise_ratio,
                                         sigmas[param],
                                         stats[1])
        print('Ratio of sigma value to expected value: {}'.format(ratio))
        break
    print ""

    # print allocation of privacy budget
    sigma_params_sorted = sigmas.keys()
    # add dummy counter for full counters.noise.yaml
    dummy_counter = DEFAULT_DUMMY_COUNTER_NAME
    sigma_params_sorted.append(dummy_counter)
    sigmas[dummy_counter] = get_sanity_check_counter()['sigma']
    epsilons[dummy_counter] = get_sanity_check_counter()['epsilon']
    sigma_params_sorted.sort()
    print('Sigma values\n-----')
    print('counters:')
    for param in sigma_params_sorted:
        sigma = sigmas[param]
        print(' {}:'.format(param))
        print(' sigma: {:4f}'.format(sigma))
    print ""
def _setup_initial_blobs(self):
    # Define models
    self.score_model = ModelHelper(name="score_" + self.model_id)
    self.train_model = ModelHelper(name="train_" + self.model_id)

    # Create input, output, labels, and loss blobs
    self.input_blob = "ModelInput_" + self.model_id
    workspace.FeedBlob(self.input_blob, np.zeros(1, dtype=np.float32))
    self.output_blob = "ModelOutput_" + self.model_id
    workspace.FeedBlob(self.output_blob, np.zeros(1, dtype=np.float32))
    self.labels_blob = "ModelLabels_" + self.model_id
    workspace.FeedBlob(self.labels_blob, np.zeros(1, dtype=np.float32))
    self.loss_blob = "loss"  # "ModelLoss_" + self.model_id
    workspace.FeedBlob(self.loss_blob, np.zeros(1, dtype=np.float32))

    # Create blobs for model parameters
    self.weights: List[str] = []
    self.biases: List[str] = []

    for x in six.moves.range(len(self.layers) - 1):
        dim_in = self.layers[x]
        dim_out = self.layers[x + 1]

        weight_name = "Weights_" + str(x) + "_" + self.model_id
        bias_name = "Biases_" + str(x) + "_" + self.model_id
        self.weights.append(weight_name)
        self.biases.append(bias_name)

        bias = np.zeros(
            shape=[
                dim_out,
            ], dtype=np.float32
        )
        workspace.FeedBlob(bias_name, bias)

        gain = np.sqrt(2) if self.activations[x] == 'relu' else 1
        weights = scipy.stats.norm(0, gain * np.sqrt(1 / dim_in)).rvs(
            size=[dim_out, dim_in]
        ).astype(np.float32)
        workspace.FeedBlob(weight_name, weights)
def _setup_initial_blobs(self):
    MLTrainer._setup_initial_blobs(self)

    self.output_conv_blob = "Conv_output_{}".format(self.model_id)
    workspace.FeedBlob(self.output_conv_blob, np.zeros(1, dtype=np.float32))

    self.conv_weights: List[str] = []
    self.conv_biases: List[str] = []

    for x in six.moves.range(len(self.dims) - 1):
        dim_in = self.dims[x]
        dim_out = self.dims[x + 1]
        kernel_h = self.conv_height_kernels[x]
        kernel_w = self.conv_width_kernels[x]

        weight_shape = [dim_out, kernel_h, kernel_w, dim_in]
        bias_shape = [dim_out, ]

        conv_weight_name = "ConvWeights_" + str(x) + "_" + self.model_id
        bias_name = "ConvBiases_" + str(x) + "_" + self.model_id
        self.conv_weights.append(conv_weight_name)
        self.conv_biases.append(bias_name)

        conv_bias = np.zeros(shape=bias_shape, dtype=np.float32)
        workspace.FeedBlob(bias_name, conv_bias)

        conv_weights = scipy.stats.norm(0, np.sqrt(1 / dim_in)).rvs(
            size=weight_shape
        ).astype(np.float32)
        workspace.FeedBlob(conv_weight_name, conv_weights)
def robust_std(l, alpha=1 / scipy.stats.norm.ppf(0.75)):
    """
    Compute a robust estimate of the standard deviation for the list of
    values *l* (by default, for normally distributed samples --- see
    https://en.wikipedia.org/wiki/Median_absolute_deviation).
    """
    return alpha * mad(l)[0]
def main(argv=None):
    if argv is None:
        argv = sys.argv
    parser = ArgumentParser('Report mean and standard deviation for the stream of numbers read from stdin',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    args = parser.parse_args(argv[1:])
    stats = Stats()
    for line in sys.stdin:
        stats(float(line))
    print(stats)
def get(name):
    try:
        return getattr(stats, name)
    except AttributeError:
        raise PlotnineError(
            "Unknown distribution '{}'".format(name))
def get_continuous_distribution(name):
    if name not in continuous:
        msg = "Unknown continuous distribution '{}'"
        raise ValueError(msg.format(name))

    return getattr(stats, name)
def get_gleu_stats(self, scores):
    """calculate mean and confidence interval from all GLEU iterations"""
    mean = np.mean(scores)
    std = np.std(scores)
    ci = scipy.stats.norm.interval(0.95, loc=mean, scale=std)
    return ['%f' % mean,
            '%f' % std,
            '(%.3f,%.3f)' % (ci[0], ci[1])]
def diff_exp(matrix, group1, group2, index):
    """Computes differential expression between group 1 and group 2
    for each column in the dataframe counts.

    Returns a dataframe of Z-scores and p-values."""
    g1 = matrix[group1, :]
    g2 = matrix[group2, :]
    g1mu = g1.mean(0)
    g2mu = g2.mean(0)

    mean_diff = np.asarray(g1mu - g2mu).flatten()
    # E[X^2] - (E[X])^2
    pooled_sd = np.sqrt(
        ((g1.power(2)).mean(0) - np.power(g1mu, 2)) / len(group1)
        + ((g2.power(2)).mean(0) - np.power(g2mu, 2)) / len(group2))
    pooled_sd = np.asarray(pooled_sd).flatten()

    z_scores = np.zeros_like(pooled_sd)
    nz = pooled_sd > 0
    z_scores[nz] = np.nan_to_num(mean_diff[nz] / pooled_sd[nz])

    # t-test
    p_vals = (1 - stats.norm.cdf(np.abs(z_scores))) * 2

    df = pd.DataFrame(OrderedDict([('z', z_scores), ('p', p_vals)]),
                      index=index)

    return df
def calculate_plate_summaries(self):
    """Get mean reads, percent mapping, etc summaries for each plate"""
    well_map = self.cell_metadata.groupby(Plates.SAMPLE_MAPPING)

    # these stats are from STAR mapping
    star_cols = ['Number of input reads', 'Uniquely mapped reads number']
    star_stats = self.mapping_stats[star_cols].groupby(
        self.cell_metadata[Plates.SAMPLE_MAPPING]).sum()

    total_reads = star_stats['Number of input reads']
    unique_reads = star_stats['Uniquely mapped reads number']

    percent_ercc = well_map.sum()['ercc'].divide(total_reads, axis=0)
    percent_mapped_reads = unique_reads / total_reads - percent_ercc

    plate_summaries = pd.DataFrame(OrderedDict([
        (Plates.MEAN_READS_PER_CELL, total_reads / well_map.size()),
        (Plates.MEDIAN_GENES_PER_CELL, well_map.median()['n_genes']),
        ('Percent not uniquely aligned',
         100 * well_map.sum()['alignment_not_unique'].divide(total_reads, axis=0)),
        (Plates.PERCENT_MAPPED_READS, 100 * percent_mapped_reads),
        ('Percent no feature',
         100 * well_map.sum()['no_feature'].divide(total_reads, axis=0)),
        ('Percent Rn45s',
         100 * self.genes['Rn45s'].groupby(
             self.cell_metadata[Plates.SAMPLE_MAPPING]).sum() / total_reads),
        (Plates.PERCENT_ERCC, 100 * percent_ercc),
        ('n_wells', well_map.size())
    ]))

    return plate_summaries
def _bin_exp(self, n_bin, scale=1.0):
    """ Calculate the bin locations to approximate exponential distribution.
        It breaks the cumulative probability of exponential distribution
        into n_bin equal bins, each covering 1 / n_bin probability. Then it
        calculates the center of mass in each bin and returns the centers
        of mass. So, it approximates the exponential distribution with
        n_bin Delta functions weighted by 1 / n_bin, at the locations
        of these centers of mass.

    Parameters:
    -----------
    n_bin: int
        The number of bins to approximate the exponential distribution

    scale: float, default: 1.0
        The scale parameter of the exponential distribution, defined in
        the same way as scipy.stats. It does not influence the ratios
        between the bins, but just controls the spacing between the bins.
        So generally users should not change its default.

    Returns:
    --------
    bins: numpy array of size [n_bin,]
        The centers of mass for each segment of the exponential
        distribution.
    """
    boundaries = np.flip(scipy.stats.expon.isf(
        np.linspace(0, 1, n_bin + 1),
        scale=scale), axis=0)
    bins = np.empty(n_bin)
    for i in np.arange(n_bin):
        bins[i] = utils.center_mass_exp(
            (boundaries[i], boundaries[i + 1]), scale=scale)
    return bins
def _set_SNR_grids(self):
    """ Set the grids and weights for SNR used in numerical integration
        of SNR parameters.
    """
    if self.SNR_prior == 'unif':
        SNR_grids = np.linspace(0, 1, self.SNR_bins)
        SNR_weights = np.ones(self.SNR_bins) / (self.SNR_bins - 1)
        SNR_weights[0] = SNR_weights[0] / 2.0
        SNR_weights[-1] = SNR_weights[-1] / 2.0
    elif self.SNR_prior == 'lognorm':
        dist = scipy.stats.lognorm
        alphas = np.arange(np.mod(self.SNR_bins, 2),
                           self.SNR_bins + 2, 2) / self.SNR_bins
        # The goal here is to divide the area under the pdf curve
        # to segments representing equal probabilities.
        bounds = dist.interval(alphas, (self.logS_range,))
        bounds = np.unique(bounds)
        # bounds contain the boundaries which equally separate
        # the probability mass of the distribution
        SNR_grids = np.zeros(self.SNR_bins)
        for i in np.arange(self.SNR_bins):
            SNR_grids[i] = dist.expect(
                lambda x: x, args=(self.logS_range,),
                lb=bounds[i], ub=bounds[i + 1]) * self.SNR_bins
        # Center of mass of each segment between consecutive
        # bounds are set as the grids for SNR.
        SNR_weights = np.ones(self.SNR_bins) / self.SNR_bins
    else:  # SNR_prior == 'exp'
        SNR_grids = self._bin_exp(self.SNR_bins)
        SNR_weights = np.ones(self.SNR_bins) / self.SNR_bins

    SNR_weights = SNR_weights / np.sum(SNR_weights)
    return SNR_grids, SNR_weights
def lnfunc(self, args):
    # define the function which needs to be sampled (log posterior)
    # Do not use self.args in this function. You can use self.eargs.
    return scipy.stats.norm.logpdf(args['x'], loc=args['mu'], scale=args['sigma'])
def lnfunc(self, args):
    # log posterior
    temp1 = scipy.stats.norm.logpdf(args['xt'], loc=args['mu'], scale=args['sigma'])
    temp2 = scipy.stats.norm.logpdf(args['x'], loc=args['xt'], scale=args['sigma_x'])
    return temp1 + temp2
def lnfunc(self, args):
    if self.eargs['outliers'] == False:
        temp1 = (args['y'] - (self.args['m'] * self.args['x'] + self.args['c'])) / args['sigma_y']
        return -0.5 * (temp1 * temp1) - np.log(np.sqrt(2 * np.pi) * args['sigma_y'])
    else:
        temp11 = scipy.stats.norm.pdf(args['y'],
                                      loc=(self.args['m'] * self.args['x'] + self.args['c']),
                                      scale=args['sigma_y'])
        sigma_b = np.sqrt(np.square(args['sigma_y']) + np.square(args['sigma_b']))
        temp22 = scipy.stats.norm.pdf(args['y'], loc=self.args['mu_b'], scale=sigma_b)
        return np.log((1 - args['p_b']) * temp11 + args['p_b'] * temp22)
def lnfunc(self, args):
    # log posterior, xt has been integrated out
    sigma = np.sqrt(args['sigma'] * args['sigma'] + args['sigma_x'] * args['sigma_x'])
    temp = scipy.stats.norm.logpdf(args['x'], loc=args['mu'], scale=sigma)
    return temp
def lnfunc(self, args):
    # log posterior
    temp1 = []
    for i, y in enumerate(self.data['y']):
        temp1.append(np.sum(self.lngauss(y - args['alpha'][i], self.eargs['sigma'])))
    temp1 = np.array(temp1)
    temp2 = scipy.stats.norm.logpdf(args['alpha'], loc=args['mu'], scale=args['omega'])
    return temp1 + temp2
def __init__(self, stats, channels, target_dim):
    self.stats = stats
    self.channels = channels
    self.target_dim = (target_dim[0], target_dim[1])
    self.illum_corr_func = np.zeros((self.target_dim[0], self.target_dim[1], len(self.channels)))

# Based on Sing et al. 2014 paper
def channel_function(self, mean_channel, disk_size):
    # TODO: get np.type from other source or parameterize or compute :/
    # We currently assume 16 bit images
    operator = skimage.morphology.disk(disk_size)
    filtered_channel = skimage.filters.median(mean_channel.astype(np.uint16), operator)
    filtered_channel = skimage.transform.resize(filtered_channel, self.target_dim,
                                                preserve_range=True)
    robust_minimum = scipy.stats.scoreatpercentile(filtered_channel, 2)
    filtered_channel = np.maximum(filtered_channel, robust_minimum)
    illum_corr_func = filtered_channel / robust_minimum
    return illum_corr_func
def compute_all(self, median_filter_size):
    disk_size = median_filter_size / 2  # From diameter to radius
    for ch in range(len(self.channels)):
        self.illum_corr_func[:, :, ch] = self.channel_function(
            self.stats["mean_image"][:, :, ch], disk_size)

# TODO: Is image a uint16 or float32/64? What is its data type?
# TODO: Update the test as appropriate. Not being used? Not needed?