The following 50 code examples, drawn from open-source Python projects, illustrate how numpy.percentile() is used in practice. The excerpts are shown as they appear in their projects, so module-level imports (typically import numpy as np) and project-specific helpers are omitted.
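Before the project excerpts, here is a minimal usage sketch of numpy.percentile itself; the toy array and the values in the comments are purely illustrative.

import numpy as np

a = np.array([[10, 7, 4], [3, 2, 1]])

# Median of the flattened array (linear interpolation by default).
np.percentile(a, 50)                    # 3.5

# Several percentiles at once, computed along a given axis.
np.percentile(a, [25, 50, 75], axis=1)  # shape (3, 2): one row per requested percentile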
def remove_outliers(idx_ts):
    """Remove outliers from a given timestamps array.

    Parameters
    ----------
    idx_ts : numpy.ndarray
        given timestamp array.

    Returns
    -------
    idx_ts_new : numpy.ndarray
        outliers removed timestamp array.
    """
    idx_ts_new = idx_ts.copy()
    summary = np.percentile(idx_ts_new, [25, 50, 75])
    high_lim = summary[0] - 1.5 * (summary[2] - summary[1])
    low_lim = summary[2] + 1.5 * (summary[2] - summary[1])
    idx_ts_new = idx_ts_new[~(idx_ts_new >= low_lim)]
    idx_ts_new = idx_ts_new[~(idx_ts_new <= high_lim)]
    return idx_ts_new
def _classify_gems(counts0, counts1):
    """ Infer number of distinct transcriptomes present in each GEM (1 or 2) and
        report cr_constants.GEM_CLASS_GENOME0 for a single cell w/ transcriptome 0,
        report cr_constants.GEM_CLASS_GENOME1 for a single cell w/ transcriptome 1,
        report cr_constants.GEM_CLASS_MULTIPLET for multiple transcriptomes """
    # Assumes that most of the GEMs are single-cell; model counts independently
    thresh0, thresh1 = [cr_constants.DEFAULT_MULTIPLET_THRESHOLD] * 2
    if sum(counts0 > counts1) >= 1 and sum(counts1 > counts0) >= 1:
        thresh0 = np.percentile(counts0[counts0 > counts1], cr_constants.MULTIPLET_PROB_THRESHOLD)
        thresh1 = np.percentile(counts1[counts1 > counts0], cr_constants.MULTIPLET_PROB_THRESHOLD)

    doublet = np.logical_and(counts0 >= thresh0, counts1 >= thresh1)
    dtype = np.dtype('|S%d' % max(len(cls) for cls in cr_constants.GEM_CLASSES))
    result = np.where(doublet, cr_constants.GEM_CLASS_MULTIPLET, cr_constants.GEM_CLASS_GENOME0).astype(dtype)
    result[np.logical_and(np.logical_not(result == cr_constants.GEM_CLASS_MULTIPLET),
                          counts1 > counts0)] = cr_constants.GEM_CLASS_GENOME1

    return result
def get_normalized_dispersion(mat_mean, mat_var, nbins=20):
    mat_disp = (mat_var - mat_mean) / np.square(mat_mean)

    quantiles = np.percentile(mat_mean, np.arange(0, 100, 100 / nbins))
    quantiles = np.append(quantiles, mat_mean.max())

    # merge bins with no difference in value
    quantiles = np.unique(quantiles)

    if len(quantiles) <= 1:
        # pathological case: the means are all identical. just return raw dispersion.
        return mat_disp

    # calc median dispersion per bin
    (disp_meds, _, disp_bins) = scipy.stats.binned_statistic(mat_mean, mat_disp,
                                                             statistic='median', bins=quantiles)

    # calc median absolute deviation of dispersion per bin
    disp_meds_arr = disp_meds[disp_bins - 1]  # 0th bin is empty since our quantiles start from 0
    disp_abs_dev = abs(mat_disp - disp_meds_arr)
    (disp_mads, _, disp_bins) = scipy.stats.binned_statistic(mat_mean, disp_abs_dev,
                                                             statistic='median', bins=quantiles)

    # calculate normalized dispersion
    disp_mads_arr = disp_mads[disp_bins - 1]
    disp_norm = (mat_disp - disp_meds_arr) / disp_mads_arr
    return disp_norm
def prctile(data, p_vals=[0, 25, 50, 75, 100], sorted_=False):
    """``prctile(data, 50)`` returns the median, but p_vals can also be a sequence.

    Provides for small samples or extremes IMHO better values than
    matplotlib.mlab.prctile or np.percentile, however also slower.

    """
    ps = [p_vals] if np.isscalar(p_vals) else p_vals

    if not sorted_:
        data = sorted(data)
    n = len(data)
    d = []
    for p in ps:
        fi = p * n / 100 - 0.5
        if fi <= 0:  # maybe extrapolate?
            d.append(data[0])
        elif fi >= n - 1:
            d.append(data[-1])
        else:
            i = int(fi)
            d.append((i + 1 - fi) * data[i] + (fi - i) * data[i + 1])
    return d[0] if np.isscalar(p_vals) else d
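A quick, hypothetical comparison on a four-point sample shows how the hand-rolled prctile above differs from np.percentile with its default linear interpolation; the data is made up for illustration.

import numpy as np

data = [1.0, 2.0, 3.0, 4.0]
prctile(data, 50)                  # 2.5 (scalar in, scalar out)
prctile(data, [25, 50, 75])        # [1.5, 2.5, 3.5]
np.percentile(data, [25, 50, 75])  # array([1.75, 2.5 , 3.25])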
def fit_predict(self, ts):
    """
    Unsupervised training of TSBitMaps.

    :param ts: 1-D numpy array or pandas.Series
    :return labels: `+1` for normal observations and `-1` for abnormal observations
    """
    assert self._lag_window_size > self._feature_window_size, 'lag_window_size must be >= feature_window_size'
    self._ref_ts = ts
    scores = self._slide_chunks(ts)
    self._ref_bitmap_scores = scores

    thres = np.percentile(scores[self._lag_window_size: -self._lead_window_size + 1], self._q)

    labels = np.full(len(ts), 1)
    for idx, score in enumerate(scores):
        if score > thres:
            labels[idx] = -1

    return labels
def fit(self, X, y=None):
    old_threshold = None
    threshold = None
    self.threshold_ = 0.0
    self._fit(X, y)
    count = 0
    while count < 100 and (old_threshold is None or abs(threshold - old_threshold) > 0.01):
        old_threshold = threshold
        ss = self.decision_function(X, y)
        threshold = percentile(ss, 100 * self.contamination)
        self._fit(X[ss > threshold], y[ss > threshold] if y is not None else None)
        count += 1
    self.threshold_ = threshold
    return self
def _update_quantile(self):
    states = np.array(self._quantile_states, dtype=np.float32)
    limited_action_values = self.action_values(states, self._limited_action)
    base_action_values = np.max(
        np.array(
            [
                self.action_values(states, action)
                for action in six.moves.range(self.num_actions)
                if action != self._limited_action
            ]
        ),
        axis=0
    )
    target = np.percentile(
        limited_action_values - base_action_values,
        self._quantile
    )
    print("REWARD PENALTY TARGET:", target)
    self.quantile_value += self._quantile_update_rate * target
    print("QUANTILE:", self.quantile_value)
def b_value(mags, mt, perc=[2.5, 97.5], n_reps=None):
    """Compute the b-value and optionally its confidence interval."""
    # Extract magnitudes above completeness threshold
    m = mags[mags >= mt]

    # Compute b-value
    b = (np.mean(m) - mt) * np.log(10)

    # Draw bootstrap replicates
    if n_reps is None:
        return b
    else:
        m_bs_reps = dcst.draw_bs_reps(m, np.mean, size=n_reps)

        # Compute b-value from replicates
        b_bs_reps = (m_bs_reps - mt) * np.log(10)

        # Compute confidence interval
        conf_int = np.percentile(b_bs_reps, perc)

        return b, conf_int
def simple_slope_percentiles(res, df, target, varying, percs=[25, 50, 75]):
    exog = {}
    for param in res.fe_params.index:
        if len(param.split(":")) != 1:
            continue
        if param == "Intercept":
            exog[param] = 1.0
        else:
            exog[param] = np.median(df[param])

    ret_vals = collections.OrderedDict()
    for varying_perc in percs:
        exog[varying] = np.percentile(df[varying], varying_perc)

        ret_vals[exog[varying]] = collections.defaultdict(list)
        for target_perc in [25, 75]:
            exog[target] = np.percentile(df[target], target_perc)

            exog_arr = np.array([exog[param] if len(param.split(":")) == 1
                                 else exog[param.split(":")[0]] * exog[param.split(":")[1]]
                                 for param in res.fe_params.index])

            ret_vals[exog[varying]]["endog"].append(res.model.predict(res.fe_params, exog=exog_arr))
            ret_vals[exog[varying]]["target"].append(exog[target])
    return ret_vals
def simple_slope_categories(res, df, target, cat, cats):
    exog = {}
    for param in res.fe_params.index:
        if len(param.split(":")) != 1:
            continue
        if param == "Intercept":
            exog[param] = 1.0
        elif param in cats:
            exog[param] = 0
        else:
            exog[param] = np.mean(df[param])

    if cat != None:
        exog[cat] = 1

    x_points = []
    y_points = []
    for target_perc in [10, 90]:
        exog[target] = np.percentile(df[target], target_perc)
        # exog[target] = target_perc

        exog_arr = np.array([exog[param] if len(param.split(":")) == 1
                             else exog[param.split(":")[0]] * exog[param.split(":")[1]]
                             for param in res.fe_params.index])

        y_points.append(res.model.predict(res.fe_params, exog=exog_arr))
        x_points.append(exog[target])
    return x_points, y_points
def test_value_at_risk(self):
    value_at_risk = self.empyrical.value_at_risk

    returns = [1.0, 2.0]
    assert_almost_equal(value_at_risk(returns, cutoff=0.0), 1.0)
    assert_almost_equal(value_at_risk(returns, cutoff=0.3), 1.3)
    assert_almost_equal(value_at_risk(returns, cutoff=1.0), 2.0)

    returns = [1, 81, 82, 83, 84, 85]
    assert_almost_equal(value_at_risk(returns, cutoff=0.1), 41)
    assert_almost_equal(value_at_risk(returns, cutoff=0.2), 81)
    assert_almost_equal(value_at_risk(returns, cutoff=0.3), 81.5)

    # Test a returns stream of 21 data points at different cutoffs.
    returns = np.random.normal(0, 0.02, 21)
    for cutoff in (0, 0.0499, 0.05, 0.20, 0.999, 1):
        assert_almost_equal(
            value_at_risk(returns, cutoff),
            np.percentile(returns, cutoff * 100),
        )
def value_at_risk(returns, cutoff=0.05):
    """
    Value at risk (VaR) of a returns stream.

    Parameters
    ----------
    returns : pandas.Series or 1-D numpy.array
        Non-cumulative daily returns.
    cutoff : float, optional
        Decimal representing the percentage cutoff for the bottom percentile of
        returns. Defaults to 0.05.

    Returns
    -------
    VaR : float
        The VaR value.
    """
    return np.percentile(returns, 100 * cutoff)
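For illustration only, the function above applied to a small, made-up array of daily returns:

import numpy as np

daily_returns = np.array([-0.03, -0.01, 0.002, 0.004, 0.01, 0.012, 0.02])
value_at_risk(daily_returns)               # 5th percentile of the returns, roughly -0.024
value_at_risk(daily_returns, cutoff=0.01)  # a stricter 1% cutoff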
def get_packets_per_second(trace, features):
    """
    Gets the total number of packets per second along with mean, standard deviation, min, max and median

    @return a 1D list that contains the packets per second
    """
    packets_per_second = {}
    for val in trace:
        second = floor(val[0])

        if second not in packets_per_second:
            packets_per_second[second] = 0

        packets_per_second[second] += 1

    l = list(packets_per_second.values())

    features.append(sum(l) / len(l))
    features.append(np.std(l))
    features.append(np.percentile(l, 50))
    features.append(min(l))
    features.append(max(l))

    return l
def get_inter_arrival_time(trace, in_trace, out_trace, features):
    """
    For both the complete trace, the trace only containing incoming and the trace only
    containing outcoming packets, we calculate the inter-arrival time.
    Next, we add the max, mean, standard deviation and third quartile as features.

    @param in_trace contains all the incoming packets and nothing else
    @param out_trace contains all the outcoming packets and nothing else
    """
    complete_inter_arrival_time = inter_packet_time(trace)
    in_inter_arrival_time = inter_packet_time(in_trace)
    out_inter_arrival_time = inter_packet_time(out_trace)

    inter_arrival_times = [complete_inter_arrival_time, in_inter_arrival_time, out_inter_arrival_time]

    for time in inter_arrival_times:
        if len(time) == 0:
            features.extend([0, 0, 0, 0])
        else:
            features.append(max(time))
            features.append(sum(time) / len(time))
            features.append(np.std(time))
            features.append(np.percentile(time, 75))
def get_transmission_time_stats(trace, in_trace, out_trace, features):
    """
    For the complete trace and the traces only containing incoming and outcoming packets,
    we extract the following:
    - First, second and third quartile
    - Total transmission time
    """
    in_times = [x[0] for x in in_trace]
    out_times = [x[0] for x in out_trace]
    total_times = [x[0] for x in trace]

    times = [total_times, in_times, out_times]

    for time in times:
        if len(time) == 0:
            features.extend([0, 0, 0, 0])
        else:
            features.append(np.percentile(time, 25))
            features.append(np.percentile(time, 50))
            features.append(np.percentile(time, 75))
            features.append(np.percentile(time, 100))
def deltasCSVWriter(self, name='ant'):
    "Changes"
    header = array([h.name[1:] for h in self.test.headers[:-2]])
    oldRows = [r for r, p in zip(self.test._rows, self.pred) if p > 0]
    delta = array([self.delta(t) for t in oldRows])
    y = median(delta, axis=0)
    yhi, ylo = percentile(delta, q=[75, 25], axis=0)
    dat1 = sorted(
        [(h, a, b, c) for h, a, b, c in zip(header, y, ylo, yhi)],
        key=lambda F: F[1])
    dat = asarray([(d[0], n, d[1], d[2], d[3])
                   for d, n in zip(dat1, range(1, 21))])
    with open('/Users/rkrsn/git/GNU-Plots/rkrsn/errorbar/%s.csv' % (name), 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ')
        for el in dat[()]:
            writer.writerow(el)
    # new = [self.newRow(t) for t in oldRows]
def _nanpercentile(a, q, axis=None, out=None, overwrite_input=False,
                   interpolation='linear', keepdims=False):
    """
    Private function that doesn't support extended axis or keepdims.
    These methods are extended to this function using _ureduce
    See nanpercentile for parameter usage
    """
    if axis is None:
        part = a.ravel()
        result = _nanpercentile1d(part, q, overwrite_input, interpolation)
    else:
        result = np.apply_along_axis(_nanpercentile1d, axis, a, q,
                                     overwrite_input, interpolation)
        # apply_along_axis fills in collapsed axis with results.
        # Move that axis to the beginning to match percentile's
        # convention.
        if q.ndim != 0:
            result = np.rollaxis(result, axis)

    if out is not None:
        out[...] = result
    return result
def test_out(self):
    mat = np.random.rand(3, 3)
    nan_mat = np.insert(mat, [0, 2], np.nan, axis=1)
    resout = np.zeros(3)
    tgt = np.percentile(mat, 42, axis=1)
    res = np.nanpercentile(nan_mat, 42, axis=1, out=resout)
    assert_almost_equal(res, resout)
    assert_almost_equal(res, tgt)
    # 0-d output:
    resout = np.zeros(())
    tgt = np.percentile(mat, 42, axis=None)
    res = np.nanpercentile(nan_mat, 42, axis=None, out=resout)
    assert_almost_equal(res, resout)
    assert_almost_equal(res, tgt)
    res = np.nanpercentile(nan_mat, 42, axis=(0, 1), out=resout)
    assert_almost_equal(res, resout)
    assert_almost_equal(res, tgt)
def test_multiple_percentiles(self):
    perc = [50, 100]
    mat = np.ones((4, 3))
    nan_mat = np.nan * mat
    # For checking consistency in higher dimensional case
    large_mat = np.ones((3, 4, 5))
    large_mat[:, 0:2:4, :] = 0
    large_mat[:, :, 3:] *= 2
    for axis in [None, 0, 1]:
        for keepdim in [False, True]:
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter('always')
                val = np.percentile(mat, perc, axis=axis, keepdims=keepdim)
                nan_val = np.nanpercentile(nan_mat, perc, axis=axis,
                                           keepdims=keepdim)
                assert_equal(nan_val.shape, val.shape)

                val = np.percentile(large_mat, perc, axis=axis,
                                    keepdims=keepdim)
                nan_val = np.nanpercentile(large_mat, perc, axis=axis,
                                           keepdims=keepdim)
                assert_equal(nan_val, val)

    megamat = np.ones((3, 4, 5, 6))
    assert_equal(np.nanpercentile(megamat, perc, axis=(1, 2)).shape, (2, 3, 6))
def test_keepdims(self):
    d = np.ones((3, 5, 7, 11))
    assert_equal(np.percentile(d, 7, axis=None, keepdims=True).shape,
                 (1, 1, 1, 1))
    assert_equal(np.percentile(d, 7, axis=(0, 1), keepdims=True).shape,
                 (1, 1, 7, 11))
    assert_equal(np.percentile(d, 7, axis=(0, 3), keepdims=True).shape,
                 (1, 5, 7, 1))
    assert_equal(np.percentile(d, 7, axis=(1,), keepdims=True).shape,
                 (3, 1, 7, 11))
    assert_equal(np.percentile(d, 7, (0, 1, 2, 3), keepdims=True).shape,
                 (1, 1, 1, 1))
    assert_equal(np.percentile(d, 7, axis=(0, 1, 3), keepdims=True).shape,
                 (1, 1, 7, 1))

    assert_equal(np.percentile(d, [1, 7], axis=(0, 1, 3),
                               keepdims=True).shape, (2, 1, 1, 7, 1))
    assert_equal(np.percentile(d, [1, 7], axis=(0, 3),
                               keepdims=True).shape, (2, 1, 5, 7, 1))
def test_out_nan(self):
    with warnings.catch_warnings(record=True):
        warnings.filterwarnings('always', '', RuntimeWarning)
        o = np.zeros((4,))
        d = np.ones((3, 4))
        d[2, 1] = np.nan
        assert_equal(np.percentile(d, 0, 0, out=o), o)
        assert_equal(
            np.percentile(d, 0, 0, interpolation='nearest', out=o), o)
        o = np.zeros((3,))
        assert_equal(np.percentile(d, 1, 1, out=o), o)
        assert_equal(
            np.percentile(d, 1, 1, interpolation='nearest', out=o), o)
        o = np.zeros(())
        assert_equal(np.percentile(d, 1, out=o), o)
        assert_equal(
            np.percentile(d, 1, interpolation='nearest', out=o), o)
def quantile(x, q, weights=None):
    """
    Like numpy.percentile, but:

    * Values of q are quantiles [0., 1.] rather than percentiles [0., 100.]
    * scalar q not supported (q must be iterable)
    * optional weights on x

    """
    if weights is None:
        return np.percentile(x, [100. * qi for qi in q])
    else:
        idx = np.argsort(x)
        xsorted = x[idx]
        cdf = np.add.accumulate(weights[idx])
        cdf /= cdf[-1]
        return np.interp(q, cdf, xsorted).tolist()
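A small, hypothetical example of the weighted branch; the weights are made up and simply pull the estimate toward the heavily weighted point.

import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
quantile(x, [0.25, 0.5, 0.75])                               # unweighted: falls back to np.percentile
quantile(x, [0.5], weights=np.array([1.0, 1.0, 1.0, 10.0]))  # weighted median is pulled toward 4.0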
def _degradation_CI(results):
    '''
    Description
    -----------
    Monte Carlo estimation of uncertainty in degradation rate from OLS results

    Parameters
    ----------
    results: OLSResults object from fitting a model of the form:
        results = sm.OLS(endog = df.energy_ma, exog = df.loc[:,['const','years']]).fit()

    Returns
    -------
    68.2% confidence interval for degradation rate
    '''

    sampled_normal = np.random.multivariate_normal(results.params, results.cov_params(), 10000)
    dist = sampled_normal[:, 1] / sampled_normal[:, 0]
    Rd_CI = np.percentile(dist, [50 - 34.1, 50 + 34.1]) * 100.0
    return Rd_CI
def info(self, burn=1000, plot=False):
    """
    Print the summary statistics and optionally plot the results
    """
    rows = len(self.varnames)
    cols = 2
    chain = np.array(self.chain[burn:])
    nsize = chain.shape[0]
    # print(rows, cols)
    print('%4s %16s %12s %12s [%12s, %12s, %12s]' % ('no', 'name', 'mean', 'stddev', '16%', '50%', '84%'))
    for i, name in enumerate(self.varnames):
        temp = np.percentile(chain[:, i], [16.0, 84.0, 50.0])
        print('%4i %16s %12g %12g [%12g, %12g, %12g]' % (i, name, np.mean(chain[:, i]),
                                                         (temp[1] - temp[0]) / 2.0, temp[0], temp[2], temp[1]))
        if plot:
            ax = plt.subplot(rows, cols, 2 * i + 1)
            # plt.text(0.05, 0.9, r'$\tau$=' + '%5.1f' % (acor.acor(chain[:, i])[0]), transform=ax.transAxes)
            plt.plot(chain[:, i])
            plt.ylabel(self.model.descr[name][3])
            plt.xlabel('Iteration')
            ax = plt.subplot(rows, cols, 2 * i + 2)
            plt.hist(chain[:, i], bins=100, histtype='step')
            plt.text(0.05, 0.9, sround(np.mean(chain[:, i]), temp[0], temp[1]), transform=ax.transAxes)
            plt.xlabel(self.model.descr[name][3])
            # plt.text(0.05, 0.9, '%6g %3g (%4g-%4g)' % (np.mean(chain[:, i]), (temp[1]-temp[0])/2.0, temp[0], temp[1]), transform=ax.transAxes)
def filterout_outliers(l_data, l_date):
    '''
    Return a list with data filtered from outliers

    :param l_data: list. data to be analyzed
    '''
    # === old code to filter outliers =======
    # Q3 = np.percentile(l_data, 98)
    # Q1 = np.percentile(l_data, 2)
    # step = (Q3 - Q1) * 1.5
    # step = max(3000., step)
    # na_val = np.array(l_data)
    # na_val = na_val[(na_val >= Q1 - step) & (na_val <= Q3 + step)]
    # return na_val
    # =======================================
    # group by minute
    df_filter = pd.Series(np.array(l_date) / 60).astype(int)
    l_filter = list((df_filter != df_filter.shift()).values)
    l_filter[0] = True
    l_filter[-1] = True
    return np.array(pd.Series(l_data)[l_filter].values)
def _cut_windows_vertically(self, door_top, roof_top, sky_sig, win_strip):
    win_sig = np.percentile(win_strip, 85, axis=1)
    win_sig[sky_sig > 0.5] = 0
    if win_sig.max() > 0:
        win_sig /= win_sig.max()
    win_sig[:roof_top] = 0
    win_sig[door_top:] = 0
    runs, starts, values = run_length_encode(win_sig > 0.5)
    win_heights = runs[values]
    win_tops = starts[values]
    if len(win_heights) > 0:
        win_bottom = win_tops[-1] + win_heights[-1]
        win_top = win_tops[0]
        win_vertical_spacing = np.diff(win_tops).mean() if len(win_tops) > 1 else 0
    else:
        win_bottom = win_top = win_vertical_spacing = -1

    self.top = int(win_top)
    self.bottom = int(win_bottom)
    self.vertical_spacing = int(win_vertical_spacing)
    self.vertical_scores = make_list(win_sig)
    self.heights = np.array(win_heights)
    self.tops = np.array(win_tops)
def _first_interval(x, n_bins):
    """
    Gets the first interval based on the percentiles,
    either a closed interval containing the same value multiple times
    or a closed-open interval with a different lower and upper bound.
    """
    # calculate the percentiles
    percentiles = np.linspace(0., 100., n_bins + 1)
    bounds = np.percentile(x, q=percentiles, interpolation='higher')
    lower = bounds[0]
    upper = bounds[1]

    if lower == upper:
        return lower, upper, True, True
    else:
        return lower, upper, True, False

#------- private methods for categorical binnings-------#
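As a hypothetical illustration of the function above, a sample whose lowest quartile is a single repeated value yields a closed interval, which is exactly the case the two boolean flags encode.

import numpy as np

x = np.array([1, 1, 1, 2, 3, 4, 5, 6])
_first_interval(x, n_bins=4)  # (1, 1, True, True): the first bin collapses to the value 1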
def random_submatrix(Z, g, s, o, threshpct=99, returnIdx=False):
    # dtype=bool replaces the original np.bool alias, which newer NumPy removed
    xg = np.zeros(Z.shape[0], dtype=bool)
    xg[:g] = True
    np.random.shuffle(xg)
    xt = np.zeros(Z.shape[1], dtype=bool)
    xt[:s] = True
    np.random.shuffle(xt)
    X0 = Z[xg][:, xt]
    thresh = np.percentile(X0, threshpct)
    X0[(X0 > thresh)] = thresh
    xo = np.zeros(X0.shape[0], dtype=bool)
    xo[:o] = True
    np.random.shuffle(xo)
    Xobs = X0[xo]
    if returnIdx:
        return X0, xo, Xobs, xt, xg
    else:
        return X0, xo, Xobs
def filter_and_smooth_angular_velocity(angular_velocity,
                                       low_pass_kernel_size, clip_percentile, plot=False):
    """Reduce the noise in a velocity signal."""

    max_value = np.percentile(angular_velocity, clip_percentile)
    print("Clipping angular velocity norms to {} rad/s ...".format(max_value))
    angular_velocity_clipped = np.clip(angular_velocity, -max_value, max_value)
    print("Done clipping angular velocity norms...")

    low_pass_kernel = np.ones((low_pass_kernel_size, 1)) / low_pass_kernel_size
    print("Smoothing with kernel size {} samples...".format(low_pass_kernel_size))
    angular_velocity_smoothed = signal.correlate(angular_velocity_clipped,
                                                 low_pass_kernel, 'same')
    print("Done smoothing angular velocity norms...")

    if plot:
        plot_angular_velocities("Angular Velocities", angular_velocity,
                                angular_velocity_smoothed, True)

    return angular_velocity_smoothed.copy()
def norm_post_sim(modes, cov_matrix):
    post = multivariate_normal(modes, cov_matrix)
    nsims = 30000
    phi = np.zeros([nsims, len(modes)])

    for i in range(0, nsims):
        phi[i] = post.rvs()

    chain = np.array([phi[i][0] for i in range(len(phi))])
    for m in range(1, len(modes)):
        chain = np.vstack((chain, [phi[i][m] for i in range(len(phi))]))

    mean_est = [np.mean(np.array([phi[i][j] for i in range(len(phi))])) for j in range(len(modes))]
    median_est = [np.median(np.array([phi[i][j] for i in range(len(phi))])) for j in range(len(modes))]
    upper_95_est = [np.percentile(np.array([phi[i][j] for i in range(len(phi))]), 95) for j in range(len(modes))]
    lower_95_est = [np.percentile(np.array([phi[i][j] for i in range(len(phi))]), 5) for j in range(len(modes))]

    return chain, mean_est, median_est, upper_95_est, lower_95_est
def calculate_aggregate(values):
    agg_measures = {
        'avg': np.mean(values),
        'std': np.std(values),
        'var': np.var(values),
        'med': np.median(values),
        '10p': np.percentile(values, 10),
        '25p': np.percentile(values, 25),
        '50p': np.percentile(values, 50),
        '75p': np.percentile(values, 75),
        '90p': np.percentile(values, 90),
        'iqr': np.percentile(values, 75) - np.percentile(values, 25),
        'iqm': interquartile_range_mean(values),
        'mad': mean_absolute_deviation(values),
        'cov': 1.0 * np.mean(values) / np.std(values),
        'gin': gini_coefficient(values),
        'skw': stats.skew(values),
        'kur': stats.kurtosis(values),
        'sum': np.sum(values)
    }

    return agg_measures
def get_pr(reference_frames, output_frames, mode='type', pr_resolution=100):
    # filter output by confidence
    confidence = collect_confidence(output_frames)
    conf_order = []
    step = 100 / float(pr_resolution)
    for j in range(1, pr_resolution + 1):
        conf_order.append(np.percentile(confidence, j * step))
    conf_order = [-1] + conf_order + [2]

    # get curve
    params = []
    for threshold in conf_order:
        params.append([reference_frames, output_frames, confidence, threshold, mode])
    all_tp, all_fp, all_fn, all_prec, all_rec = zip(*pool.map(single_point, params))

    all_prec = list(all_prec)  # + [0]
    all_rec = list(all_rec)  # + [1]

    all_rec, all_prec = zip(*sorted(zip(all_rec, all_prec)))
    AUC = metrics.auc(all_rec, all_prec)

    return all_rec, all_prec, AUC


# create complete output for 1 mode --------------------------------------------
def results_stat():
    for file in os.listdir('./results'):
        if file.endswith('.txt'):
            file_path = os.path.join('./results', file)
            data = [[], [], [], []]
            with open(file_path, 'r') as f:
                lines = f.read().split('\n')
                for line in lines:
                    if len(line) > 0:
                        line_data = list(map(eval, line.split()))  # list() so it can be indexed on Python 3
                        for i in range(4):
                            data[i].append(line_data[i])
            lines = [[], [], [], []]
            for i in range(4):
                lines[i].append(np.mean(data[i]))
                lines[i].append(np.std(data[i]))
                lines[i].append(np.min(data[i]))
                lines[i].append(np.percentile(data[i], 25))
                lines[i].append(np.percentile(data[i], 50))
                lines[i].append(np.percentile(data[i], 75))
                lines[i].append(np.max(data[i]))
            output_path = os.path.join('./stat', file)
            with open(output_path, 'w') as f:
                for line in lines:
                    f.write('\t'.join(map(str, line)) + '\n')
def splitclusters(datapointwts, seeds, theta):
    std = {}
    seeds1 = []
    seedweight = []
    cluster, p2cluster = point2cluster(datapointwts, seeds, theta)
    for cl in cluster:
        mang = seeds[cl][-1]
        if len(cluster[cl]) > 10:
            std[cl] = np.percentile([angledist(xx[2], mang) for xx in cluster[cl]], 90)
            clockwise = [xx for xx in cluster[cl] if greaterthanangle(xx[2], mang)]
            if std[cl] > 20 and len(clockwise) > 0 and len(clockwise) < len(cluster[cl]):
                seeds1.append(avgpoint(clockwise))
                seeds1.append(avgpoint([xx for xx in cluster[cl] if not greaterthanangle(xx[2], mang)]))
                seedweight.append(len(clockwise))
                seedweight.append(len(cluster[cl]) - len(clockwise))
            else:
                seeds1.append(seeds[cl])
                seedweight.append(len(cluster[cl]))
        else:
            seeds1.append(seeds[cl])
            seedweight.append(len(cluster[cl]))
    return seeds1, seedweight
def splitclustersparallel(datapointwts, seeds):
    X = [(xx[0], xx[1]) for xx in datapointwts]
    S = [(xx[0], xx[1]) for xx in seeds]
    cluster = {}
    p2cluster = []
    gedges = {}
    gedges1 = {}
    nedges = {}
    std = {}
    seeds1 = []
    seedweight = []
    roadwidth = []
    nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(S)
    distances, indices = nbrs.kneighbors(X)
    for cd in range(len(seeds)):
        cluster[cd] = []
        roadwidth.append(0)
    for ii, ll in enumerate(indices):
        dd = [taxidist(seeds[xx], datapointwts[ii][:-1], theta) for xx in ll]
        cd = ll[dd.index(min(dd))]
        cluster[cd].append(datapointwts[ii])
        p2cluster.append(cd)
    for cl in cluster:
        mang = seeds[cl][-1]
        scl = seeds[cl]
        if len(cluster[cl]) > 10:
            std[cl] = np.percentile([angledist(xx[2], mang) for xx in cluster[cl]], 90)
            roadwidth[cl] = 1 + 5 * np.std([geodist(scl, xx) * np.sin(anglebetweentwopoints(scl, xx) - scl[-1])
                                            for xx in cluster[cl]])
            print(cl, scl, [(anglebetweentwopoints(scl, xx), scl[-1]) for xx in cluster[cl]])
def _write_config(self, fasta_filename):
    """Write daligner sensitive config to fasta_filename.sensitive.config."""
    lens = [len(r.sequence) for r in ContigSetReaderWrapper(fasta_filename)]
    self.low_cDNA_size, self.high_cDNA_size = 0, 0
    if len(lens) == 1:
        self.low_cDNA_size, self.high_cDNA_size = lens[0], lens[0]
    if len(lens) >= 2:
        self.low_cDNA_size = int(np.percentile(lens, 10))
        self.high_cDNA_size = int(np.percentile(lens, 90))

    try:
        with open(fasta_filename + '.sensitive.config', 'w') as f:
            f.write("sensitive={s}\n".format(s=self.sensitive_mode))
            f.write("low={l}\n".format(l=self.low_cDNA_size))
            f.write("high={h}\n".format(h=self.high_cDNA_size))
    except IOError:
        pass  # it's OK not to have write permission
def summary(self):
    """ Method to produce a summary table of the Mack Chainladder model.

    Returns:
        This calculation is consistent with the R calculation
        BootChainLadder$summary
    """
    IBNR = self.IBNR
    tri = self.tri
    summary = pd.DataFrame()
    summary['Latest'] = tri.get_latest_diagonal()
    summary['Mean Ultimate'] = summary['Latest'] + pd.Series([np.mean(np.array(IBNR)[:, num])
                                                              for num in range(len(tri.data))], index=summary.index)
    summary['Mean IBNR'] = summary['Mean Ultimate'] - summary['Latest']
    summary['SD IBNR'] = pd.Series([np.std(np.array(IBNR)[:, num])
                                    for num in range(len(tri.data))], index=summary.index)
    summary['IBNR 75%'] = pd.Series([np.percentile(np.array(IBNR)[:, num], q=75)
                                     for num in range(len(tri.data))], index=summary.index)
    summary['IBNR 95%'] = pd.Series([np.percentile(np.array(IBNR)[:, num], q=95)
                                     for num in range(len(tri.data))], index=summary.index)
    return summary
def run_ltsd_example():
    fs, d = fetch_sample_speech_tapestry()
    winsize = 1024
    d = d.astype("float32") / 2 ** 15
    d -= d.mean()

    pad = 3 * fs
    noise_pwr = np.percentile(d, 1) ** 2
    noise_pwr = max(1E-9, noise_pwr)
    d = np.concatenate((np.zeros((pad,)) + noise_pwr * np.random.randn(pad), d))
    _, vad_segments = ltsd_vad(d, fs, winsize=winsize)
    v_up = np.where(vad_segments == True)[0]
    s = v_up[0]
    st = v_up[-1] + int(.5 * fs)
    d = d[s:st]

    bname = "tapestry.wav".split(".")[0]
    wavfile.write("%s_out.wav" % bname, fs, soundsc(d))
def _compute_data_weights_topk(self, opts, density_ratios):
    """Put a uniform distribution on K points with largest prob real data.

    This is a naive heuristic which makes the next GAN concentrate on those
    points of the training set which were classified correctly with the
    largest margins, i.e., our current mixture model is not capable of
    generating points looking similar to these ones.
    """
    threshold = np.percentile(density_ratios,
                              opts["topk_constant"] * 100.0)
    # Note that largest prob_real_data corresponds to smallest density
    # ratios.
    mask = density_ratios <= threshold
    data_weights = np.zeros(self._data_num)
    data_weights[mask] = 1.0 / np.sum(mask)
    return data_weights
def quantile_binning(data=None, bins=10, qrange=(0.0, 1.0), **kwargs):
    """Binning schema based on quantile ranges.

    This binning finds equally spaced quantiles, which should lead to all
    bins having roughly the same frequencies.

    Note: weights are not (yet) taken into account for calculating quantiles.

    Parameters
    ----------
    bins: sequence or Optional[int]
        Number of bins
    qrange: Optional[tuple]
        Two floats as minimum and maximum quantile (default: 0.0, 1.0)

    Returns
    -------
    StaticBinning
    """
    if np.isscalar(bins):
        bins = np.linspace(qrange[0] * 100, qrange[1] * 100, bins + 1)
    bins = np.percentile(data, bins)
    return static_binning(bins=make_bin_array(bins), includes_right_edge=True)
def estimate_bayes_factor(traces, logp, r=0.05):
    """
    Estimate odds ratios in a random subsample of the chains in MCMC
    AstroML (see Eqn 5.127, pg 237)
    """
    ndim, nsteps = traces.shape  # [ndim, number of steps in chain]

    # compute volume of a n-dimensional (ndim) sphere of radius r
    Vr = np.pi ** (0.5 * ndim) / gamma(0.5 * ndim + 1) * (r ** ndim)

    # use neighbor count within r as a density estimator
    bt = BallTree(traces.T)
    count = bt.query_radius(traces.T, r=r, count_only=True)

    # BF = N*p/rho
    bf = logp + np.log(nsteps) + np.log(Vr) - np.log(count)  # log10(bf)

    p25, p50, p75 = np.percentile(bf, [25, 50, 75])
    return p50, 0.7413 * (p75 - p25)

##########################################################################
def reducer(self, key, values):
    if key == 'latencies':
        values = list(values)
        yield 'average_latency', numpy.average(values)
        yield 'median_latency', numpy.median(values)
        yield '95th_percentile_latency', numpy.percentile(values, 95)
    elif key == 'sent':
        first, last, count = minmax(values)
        yield 'count', count
        yield 'sending_first', first
        yield 'sending_last', last
        yield 'sending_overhead', (last - first) / (count - 1)
        yield 'sending_throughput', (count - 1) / (last - first)
    elif key == 'received':
        first, last, count = minmax(values)
        yield 'receiving_first', first
        yield 'receiving_last', last
        yield 'receiving_overhead', (last - first) / (count - 1)
        yield 'receiving_throughput', (count - 1) / (last - first)
def calculate_batchsize_maxlen(texts):
    """ Calculates the maximum length in the provided texts and a suitable
        batch size. Rounds up maxlen to the nearest multiple of ten.

    # Arguments:
        texts: List of inputs.

    # Returns:
        Batch size,
        max length
    """
    def roundup(x):
        return int(math.ceil(x / 10.0)) * 10

    # Calculate max length of sequences considered
    # Adjust batch_size accordingly to prevent GPU overflow
    lengths = [len(tokenize(t)) for t in texts]
    maxlen = roundup(np.percentile(lengths, 80.0))
    batch_size = 250 if maxlen <= 100 else 50
    return batch_size, maxlen
def ss_octile(y):
    """Obtain the octile summary statistic.

    The statistic reaches the optimal performance upon a high number of
    observations. According to Allingham et al. (2009), it is more stable than ss_robust.

    Parameters
    ----------
    y : array_like
        Yielded points.

    Returns
    -------
    array_like of the shape (batch_size, dim_ss=8, dim_ss_point)

    """
    octiles = np.linspace(12.5, 87.5, 7)
    E1, E2, E3, E4, E5, E6, E7 = np.percentile(y, octiles, axis=1)

    # Combining the summary statistics.
    ss_octile = np.hstack((E1, E2, E3, E4, E5, E6, E7))
    ss_octile = ss_octile[:, :, np.newaxis]

    return ss_octile
def flag_outliers(series, iqr_multiplier=1.5):
    """Use Tukey's boxplot criterion for outlier identification."""
    top_quartile_cutoff = np.percentile(series.get_values(), 75)
    bottom_quartile_cutoff = np.percentile(series.get_values(), 25)

    # Compute interquartile range
    iqr = top_quartile_cutoff - bottom_quartile_cutoff

    top_outlier_cutoff = top_quartile_cutoff + iqr * iqr_multiplier
    bottom_outlier_cutoff = bottom_quartile_cutoff - iqr * iqr_multiplier

    return series[(series < bottom_outlier_cutoff) | (series > top_outlier_cutoff)]
def truncate_and_scale(data, percMin=0.01, percMax=99.9, zeroTo=1.0):
    """Truncate and scale the data as a preprocessing step.

    Parameters
    ----------
    data : nd numpy array
        Data/image to be truncated and scaled.
    percMin : float, positive
        Minimum percentile to be truncated.
    percMax : float, positive
        Maximum percentile to be truncated.
    zeroTo : float
        Data will be returned in the range from 0 to this number.

    Returns
    -------
    data : nd numpy array
        Truncated and scaled data/image.

    """
    # adjust minimum
    percDataMin = np.percentile(data, percMin)
    data[np.where(data < percDataMin)] = percDataMin
    data = data - data.min()

    # adjust maximum
    percDataMax = np.percentile(data, percMax)
    data[np.where(data > percDataMax)] = percDataMax
    data = 1. / data.max() * data
    return data * zeroTo
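A minimal sketch of how truncate_and_scale might be called; the random image is illustrative, and a copy is passed because the clipping above modifies its input array in place.

import numpy as np

img = np.random.rand(64, 64) * 1000.0
scaled = truncate_and_scale(img.copy(), percMin=2, percMax=98, zeroTo=255.0)
scaled.min(), scaled.max()  # (0.0, 255.0) after truncation and rescaling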
def preprocess_bitmap(bitmap):
    # contrast stretching
    p2, p98 = np.percentile(bitmap, (2, 98))
    assert abs(p2 - p98) > 10
    bitmap = skimage.exposure.rescale_intensity(bitmap, in_range=(p2, p98))

    # from skimage.filters import threshold_otsu
    # thresh = threshold_otsu(bitmap)
    # bitmap = bitmap > thresh

    return bitmap
def plot_tsne_totalcounts(chart, sample_properties, sample_data):
    """ Plot cells colored by total counts """
    analysis = sample_data.analysis
    if not analysis or len(sample_properties['genomes']) > 1:
        return None

    reads_per_bc = analysis.matrix.get_reads_per_bc()
    vmin, vmax = np.percentile(reads_per_bc, ws_gex_constants.TSNE_TOTALCOUNTS_PRCT_CLIP)

    return plot_dimensions_color(chart, analysis.get_tsne().transformed_tsne_matrix,
                                 reads_per_bc,
                                 ws_gex_constants.TSNE_TOTALCOUNTS_DESCRIPTION,
                                 vmin, vmax,
                                 1, 2)