The following code examples, collected from open-source Python projects, illustrate how to use numpy.sort().
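Before the extracted examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the numpy.sort() behaviours the samples rely on: ascending copies, axis selection, descending order via a reversed slice, and field-ordered sorting of structured arrays. The array values are made up for illustration.

import numpy as np

a = np.array([3, 1, 2])
print(np.sort(a))             # ascending copy: [1 2 3]
print(np.sort(a)[::-1])       # descending order via a reversed slice: [3 2 1]

m = np.array([[3, 1], [2, 4]])
print(np.sort(m, axis=0))     # sort each column independently
print(np.sort(m, axis=None))  # flatten, then sort

# Structured arrays can be sorted by named fields via the order= keyword.
d = np.array([(b'a', 2), (b'b', 1)], dtype=[('name', 'S1'), ('rank', '<i4')])
print(np.sort(d, order='rank'))
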
def test_partition_cdtype(self):
    d = np.array([('Galahad', 1.7, 38), ('Arthur', 1.8, 41),
                  ('Lancelot', 1.9, 38)],
                 dtype=[('name', '|S10'), ('height', '<f8'), ('age', '<i4')])

    tgt = np.sort(d, order=['age', 'height'])
    assert_array_equal(np.partition(d, range(d.size), order=['age', 'height']),
                       tgt)
    assert_array_equal(d[np.argpartition(d, range(d.size), order=['age', 'height'])],
                       tgt)
    for k in range(d.size):
        assert_equal(np.partition(d, k, order=['age', 'height'])[k], tgt[k])
        assert_equal(d[np.argpartition(d, k, order=['age', 'height'])][k], tgt[k])

    d = np.array(['Galahad', 'Arthur', 'zebra', 'Lancelot'])
    tgt = np.sort(d)
    assert_array_equal(np.partition(d, range(d.size)), tgt)
    for k in range(d.size):
        assert_equal(np.partition(d, k)[k], tgt[k])
        assert_equal(d[np.argpartition(d, k)][k], tgt[k])

def plot_histogram_metric(chart, sample_properties, sample_data, **kwargs):
    """ Plot a HistogramMetric from the summary json """
    summary_data = sample_data.summary
    items = summary_data.get(kwargs['metric_name'], {}).items()
    if len(items) < 1:
        return None

    ordering = kwargs.get('order_by', shared_constants.HISTOGRAM_METRIC_DEFAULT_ORDERING)
    if ordering == shared_constants.HISTOGRAM_METRIC_ORDER_INTEGER_BIN:
        items.sort(key=lambda x: convert_to_int_gracefully(x[0]))
    elif ordering == shared_constants.HISTOGRAM_METRIC_ORDER_DECREASING_FREQUENCY:
        items.sort(key=lambda x: -convert_to_int_gracefully(x[1]))
    elif ordering == shared_constants.HISTOGRAM_METRIC_ORDER_DECREASING_PROPORTION:
        items.sort(key=lambda x: -convert_to_float_gracefully(x[1]))

    x, y = zip(*items)

    chart['data'][0].update({'x': x, 'y': y})
    return chart

def preprocess_matrix(matrix, num_bcs=None, use_bcs=None, use_genes=None, force_cells=None):
    if force_cells is not None:
        bc_counts = matrix.get_reads_per_bc()
        bc_indices, _, _ = cr_stats.filter_cellular_barcodes_fixed_cutoff(bc_counts, force_cells)
        matrix = matrix.select_barcodes(bc_indices)
    elif use_bcs is not None:
        bc_seqs = cr_utils.load_csv_rownames(use_bcs)
        bc_indices = matrix.bcs_to_ints(bc_seqs)
        matrix = matrix.select_barcodes(bc_indices)
    elif num_bcs is not None and num_bcs < matrix.bcs_dim:
        bc_indices = np.sort(np.random.choice(np.arange(matrix.bcs_dim), size=num_bcs, replace=False))
        matrix = matrix.select_barcodes(bc_indices)

    if use_genes is not None:
        gene_ids = cr_utils.load_csv_rownames(use_genes)
        gene_indices = matrix.gene_ids_to_ints(gene_ids)
        matrix = matrix.select_genes(gene_indices)

    matrix, _, _ = matrix.select_nonzero_axes()
    return matrix

def create_training_test_sets(self):
    # training set
    scale = self.data_interval_right - self.data_interval_left
    train_x = sp.stats.truncnorm.rvs(-2, 2, scale=0.25 * scale, size=self.data_size).astype(np.float32)
    train_x = np.sort(train_x)
    train_y = self.true_f(train_x) + 0.2 * np.random.randn(self.data_size)
    self.train_x = [train_x.reshape((train_x.shape[0], 1))]
    self.train_y = [train_y.reshape((train_y.shape[0], 1))]

    # test set
    # scale = self.test_data_interval_right - self.test_data_interval_left
    # test_x = sp.stats.truncnorm.rvs(-2, 2, scale=0.25 * scale, size=self.test_data_size).astype(np.float32)
    # test_x = np.sort(test_x)
    # test_y = self.true_f(test_x)

    self.test_x = np.arange(self.view_xrange[0], self.view_xrange[1], 0.01, dtype=np.float32)
    self.test_y = self.true_f(self.test_x)
    self.test_x = [self.test_x.reshape((self.test_x.shape[0], 1))]
    self.test_y = [self.test_y.reshape((self.test_y.shape[0], 1))]

def create_training_test_sets(self):
    # training set
    train_x = np.random.uniform(self.data_interval_left, self.data_interval_right, size=self.data_size)
    train_x = np.sort(train_x)
    train_y = self.true_f(train_x) + 3. * np.random.randn(self.data_size)
    self.train_x = [train_x.reshape((train_x.shape[0], 1))]
    self.train_y = [train_y.reshape((train_y.shape[0], 1))]

    # test set for visualisation
    self.test_x = np.arange(self.view_xrange[0], self.view_xrange[1], 0.01, dtype=np.float32)
    self.test_x = np.reshape(self.test_x, (self.test_x.shape[0], 1))
    self.test_y = self.true_f(self.test_x)
    self.test_y = np.reshape(self.test_y, (self.test_y.shape[0], 1))
    self.test_x = [self.test_x]
    self.test_y = [self.test_y]

def iter_keys_values(self, keys, inds=None, verbose=False):
    for key in keys:
        if key not in self.keys_:
            raise RuntimeError('Key %s not found in dataset. keys: %s' % (key, self.keys_))

    idx, ii = 0, 0
    total_chunks = len(self.meta_file_.chunks)
    inds = np.sort(inds) if inds is not None else None

    for chunk_idx, chunk in enumerate(progressbar(self.meta_file_.chunks, size=total_chunks, verbose=verbose)):
        data = AttrDict.load(self.get_chunk_filename(chunk_idx))

        # if inds is None:
        items = (data[key] for key in keys)
        for item in izip(*items):
            yield item
        # else:
        #     for i, item in enumerate(data[key]):
        #         if inds[ii] == idx + i:
        #             yield item
        #             ii += 1
        #             if ii >= len(inds): break
        #     idx += len(data[key])

def get_best_split(X, y):
    """ Obtain the best splitting point and resulting children for the data set X, y
    Args:
        X, y (numpy.ndarray, data set)
        criterion (gini or entropy)
    Returns:
        dict {index: index of the feature, value: feature value, children: left and right children}
    """
    best_index, best_value, best_score, children = None, None, 1e10, None
    for index in range(len(X[0])):
        for value in np.sort(np.unique(X[:, index])):
            groups = split_node(X, y, index, value)
            impurity = weighted_mse([groups[0][1], groups[1][1]])
            if impurity < best_score:
                best_index, best_value, best_score, children = index, value, impurity, groups
    return {'index': best_index, 'value': best_value, 'children': children}

def get_best_split(X, y, criterion):
    """ Obtain the best splitting point and resulting children for the data set X, y
    Args:
        X, y (numpy.ndarray, data set)
        criterion (gini or entropy)
    Returns:
        dict {index: index of the feature, value: feature value, children: left and right children}
    """
    best_index, best_value, best_score, children = None, None, 1, None
    for index in range(len(X[0])):
        for value in np.sort(np.unique(X[:, index])):
            groups = split_node(X, y, index, value)
            impurity = weighted_impurity([groups[0][1], groups[1][1]], criterion)
            if impurity < best_score:
                best_index, best_value, best_score, children = index, value, impurity, groups
    return {'index': best_index, 'value': best_value, 'children': children}

def update_image_property(self, property_name, property_data, erase_property=False):
    if isinstance(property_data, list) or isinstance(property_data, np.ndarray):
        assert len(property_data) == len(self._labels)
        property_keys = self._labels
    elif isinstance(property_data, dict) or isinstance(property_data, array_dict):
        property_keys = np.sort(property_data.keys())
        property_data = [property_data[l] for l in property_keys]

    if property_name in self._properties.keys():
        if erase_property:
            self._properties[property_name] = array_dict(property_data, keys=property_keys)
        else:
            for l, v in zip(property_keys, property_data):
                self._properties[property_name][l] = v
    else:
        print "Creating property ", property_name, " on image"
        self._properties[property_name] = array_dict(property_data, keys=property_keys)

def testBsearch(self, dtype=dtype):
    testarray = range(1, 101)
    random.shuffle(testarray)
    a = numpy.array(testarray[:50], dtype)
    b = numpy.array([0] + testarray[50:] + range(101, 103), dtype)
    a = numpy.sort(a)
    self.assertEqual(mapped_struct.bsearch(a, 0), 0)
    self.assertEqual(mapped_struct.bsearch(a, 101), len(a))
    self.assertEqual(mapped_struct.bsearch(a, 102), len(a))
    for x in a:
        ix = mapped_struct.bsearch(a, x)
        self.assertLess(ix, len(a))
        self.assertEqual(a[ix], x)
        self.assertTrue(mapped_struct.sorted_contains(a, x))
    for x in b:
        ix = mapped_struct.bsearch(a, x)
        self.assertTrue(ix >= len(a) or a[ix] != x)
        self.assertFalse(mapped_struct.sorted_contains(a, x))

def get_score_bounds_from_range(Z_min, Z_max, rho_lb, rho_ub, L0_max=None):
    "global variables: L0_reg_ind"
    edge_values = np.vstack([Z_min * rho_lb, Z_max * rho_lb, Z_min * rho_ub, Z_max * rho_ub])

    if L0_max is None or L0_max == Z_min.shape[0]:
        s_min = np.sum(np.min(edge_values, axis=0))
        s_max = np.sum(np.max(edge_values, axis=0))
    else:
        min_values = np.min(edge_values, axis=0)
        s_min_reg = np.sum(np.sort(min_values[L0_reg_ind])[0:L0_max])
        s_min_no_reg = np.sum(min_values[~L0_reg_ind])
        s_min = s_min_reg + s_min_no_reg

        max_values = np.max(edge_values, axis=0)
        s_max_reg = np.sum(-np.sort(-max_values[L0_reg_ind])[0:L0_max])
        s_max_no_reg = np.sum(max_values[~L0_reg_ind])
        s_max = s_max_reg + s_max_no_reg

    return s_min, s_max


# setup weights

def get_score_bounds(Z_min, Z_max, rho_lb, rho_ub, L0_reg_ind=None, L0_max=None):
    edge_values = np.vstack([Z_min * rho_lb, Z_max * rho_lb, Z_min * rho_ub, Z_max * rho_ub])

    if (L0_max is None) or (L0_reg_ind is None) or (L0_max == Z_min.shape[0]):
        s_min = np.sum(np.min(edge_values, axis=0))
        s_max = np.sum(np.max(edge_values, axis=0))
    else:
        min_values = np.min(edge_values, axis=0)
        s_min_reg = np.sum(np.sort(min_values[L0_reg_ind])[0:L0_max])
        s_min_no_reg = np.sum(min_values[~L0_reg_ind])
        s_min = s_min_reg + s_min_no_reg

        max_values = np.max(edge_values, axis=0)
        s_max_reg = np.sum(-np.sort(-max_values[L0_reg_ind])[0:L0_max])
        s_max_no_reg = np.sum(max_values[~L0_reg_ind])
        s_max = s_max_reg + s_max_no_reg

    return s_min, s_max

def round_solution_pool(pool, constraints):
    pool.distinct().sort()
    P = pool.P
    L0_reg_ind = np.isnan(constraints['coef_set'].C_0j)
    L0_max = constraints['L0_max']
    rounded_pool = SolutionPool(P)

    for solution in pool.solutions:
        # sort from largest to smallest coefficients
        feature_order = np.argsort([-abs(x) for x in solution])
        rounded_solution = np.zeros(shape=(1, P))
        l0_norm_count = 0
        for k in range(0, P):
            j = feature_order[k]
            if not L0_reg_ind[j]:
                rounded_solution[0, j] = np.round(solution[j], 0)
            elif l0_norm_count < L0_max:
                rounded_solution[0, j] = np.round(solution[j], 0)
                l0_norm_count += L0_reg_ind[j]

        rounded_pool.add(objvals=np.nan, solutions=rounded_solution)

    rounded_pool.distinct().sort()
    return rounded_pool

def top_uncer_items(adata, pp, n, flag=None):
    """ Return a list of the top n most uncertain items that are not flagged """
    uncertain = np.abs(pp[:, 0] - 0.5)
    if flag != None:
        addition = np.asarray(flag, dtype=int) * 10  # flagged items are not considered; increase their value
        uncertain = uncertain + addition

    if len(uncertain) <= n:
        return np.nonzero(uncertain <= 10000000)[0]

    sorted_uncertain = np.sort(uncertain)
    thresh = sorted_uncertain[n]
    return np.nonzero(uncertain <= thresh)[0]

def items_for_expert(adata, pp, n, flag):
    """ Take n items for the expert to consider """
    combined_prob = 0.8 * np.asarray(adata.taken_crowd_prob) + 0.2 * pp[:, 1]
    uncertain = np.abs(combined_prob - 0.5)
    if flag != None:
        addition = np.asarray(flag, dtype=int) * 10  # flagged items are not considered; increase their value
        uncertain = uncertain + addition

    if len(uncertain) <= n:
        return np.nonzero(uncertain <= 10000000)[0]

    sorted_uncertain = np.sort(uncertain)
    thresh = sorted_uncertain[n]
    return np.nonzero(uncertain <= thresh)[0]

def flush():
    prints = []

    for name, vals in _since_last_flush.items():
        prints.append("{}\t{}".format(name, np.mean(list(vals.values()))))
        _since_beginning[name].update(vals)

        x_vals = np.sort(list(_since_beginning[name].keys()))
        y_vals = [_since_beginning[name][x] for x in x_vals]

        plt.clf()
        plt.plot(x_vals, y_vals)
        plt.xlabel('iteration')
        plt.ylabel(name)
        plt.savefig('generated/' + name.replace(' ', '_') + '.jpg')

    print("iter {}\t{}".format(_iter[0], "\t".join(prints)))
    _since_last_flush.clear()

    with open('log.pkl', 'wb') as f:
        pickle.dump(dict(_since_beginning), f, 4)

def plot_feature_importances(feature_names, feature_importances, N=30):
    importances = list(zip(feature_names, list(feature_importances)))
    importances = pd.DataFrame(importances, columns=["Feature", "Importance"])
    importances = importances.set_index("Feature")

    # Sort by the absolute value of the importance of the feature
    importances["sort"] = abs(importances["Importance"])
    importances = importances.sort(columns="sort", ascending=False).drop("sort", axis=1)
    importances = importances[0:N]

    # Show the most important positive feature at the top of the graph
    importances = importances.sort(columns="Importance", ascending=True)

    with plt.style.context(('ggplot')):
        fig, ax = plt.subplots(figsize=(16, 12))
        ax.tick_params(labelsize=16)
        importances.plot(kind="barh", legend=False, ax=ax)
        ax.set_frame_on(False)
        ax.set_xlabel("Relative importance", fontsize=20)
        ax.set_ylabel("Feature name", fontsize=20)
        plt.tight_layout()
        plt.title("Most important features for attack", fontsize=20).set_position([.5, 0.99])
        return fig

def test_swap_random(data, seed):
    a, b = data

    np.random.seed(seed)
    a_orig, b_orig = original.swap_random(a, b)

    dcst_private._seed_numba(seed)
    a_out, b_out = dcst.swap_random(a, b)

    assert len(a_out) == len(b_out) == len(a) == len(b)

    # Each entry should be present same number of times
    ab = np.sort(np.concatenate((a, b)))
    ab_out = np.sort(np.concatenate((a_out, b_out)))
    assert np.allclose(ab, ab_out, atol=atol, equal_nan=True)

    # Check for swaps matching
    for i in range(len(a)):
        ab = np.array([a[i], b[i]])
        ab_out = np.array([a_out[i], b_out[i]])
        assert ab[0] in ab_out
        assert ab[1] in ab_out

def _hpd_interval(self, x, width):
    """
    Code adapted from pymc3.stats.calc_min_interval:
    https://github.com/pymc-devs/pymc3/blob/master/pymc3/stats.py
    """
    x = np.sort(x)
    n = len(x)

    interval_idx_inc = int(np.floor(width * n))
    n_intervals = n - interval_idx_inc
    interval_width = x[interval_idx_inc:] - x[:n_intervals]

    if len(interval_width) == 0:
        raise ValueError('Too few elements for interval calculation')

    min_idx = np.argmin(interval_width)
    hdi_min = x[min_idx]
    hdi_max = x[min_idx + interval_idx_inc]

    index = ['hpd{}_{}'.format(width, x) for x in ['lower', 'upper']]
    return pd.Series([hdi_min, hdi_max], index=index)

def _random_curve(self, nr_curves):
    curves = []
    for i in range(nr_curves - 1):
        curve = [(0, 0)]
        # exclude the 0 and 255
        _x = numpy.sort(random.sample(range(1, 255), 32))
        _y = numpy.sort(random.sample(range(1, 255), 32))
        #_x = numpy.sort(numpy.random.randint(1, 255, 2))
        #_y = numpy.sort(numpy.random.randint(1, 255, 2))
        # _x[0] and _x[1] can't be the same
        curve.append((_x[0], _y[0]))
        curve.append((_x[1], _y[1]))
        curve.append((255, 255))
        curves.append(curve)
    curves.append([(255, 255)])
    return curves

def test_randomized_svd(rows, cols, rank, dtype, transpose, n_iter, target_gen, rgen):
    rank = min(rows, cols) - 2 if rank is 'fullrank' else rank
    A = target_gen(rows, cols, rank=rank, randstate=rgen, dtype=dtype)
    U_ref, s_ref, V_ref = utils.truncated_svd(A, k=rank)
    U, s, V = em.randomized_svd(A, rank, transpose=transpose, randstate=rgen, n_iter=n_iter)

    error_U = np.abs(U.conj().T.dot(U_ref)) - np.eye(rank)
    assert_allclose(np.linalg.norm(error_U), 0, atol=1e-3)
    error_V = np.abs(V.dot(V_ref.conj().T)) - np.eye(rank)
    assert_allclose(np.linalg.norm(error_V), 0, atol=1e-3)
    assert_allclose(s.ravel() - s_ref, 0, atol=1e-3)

    # Check that singular values are returned in descending order
    assert_array_equal(s, np.sort(s)[::-1])

def ecdf(x):
    '''Computes the empirical cumulative distribution function of a dataset

    Args:
        x (`iterable`): Data.

    Returns:
        tuple containing:
            `numpy.ndarray`: sorted data.
            `numpy.ndarray`: cumulative distribution function of the data.

    '''
    xs = np.sort(x)
    ys = np.arange(1, len(xs) + 1) / float(len(xs))
    return xs, ys

def sort_xy(x, y):
    '''Sorts a pair of x and y iterables, returning arrays in order of ascending x.

    Args:
        x (`iterable`): a list, numpy ndarray, or other iterable to sort by.
        y (`iterable`): a list, numpy ndarray, or other iterable that is y=f(x).

    Returns:
        tuple containing:
            `iterable`: an iterable containing the sorted x elements.
            `iterable`: an iterable containing the sorted y elements.

    '''
    # zip x and y, sort by the 0th element (x) of each tuple in zip()
    _ = sorted(zip(x, y), key=itemgetter(0))
    sorted_x, sorted_y = zip(*_)
    return sorted_x, sorted_y

def compute_group(cls, data, scales, **params):
    data = data.sort_values('x')
    n = params['n']

    x_unique = data['x'].unique()

    if len(x_unique) < 2:
        # Not enough data to fit
        return pd.DataFrame()

    if data['x'].dtype.kind == 'i':
        if params['fullrange']:
            xseq = scales.x.dimension()
        else:
            xseq = np.sort(x_unique)
    else:
        if params['fullrange']:
            rangee = scales.x.dimension()
        else:
            rangee = [data['x'].min(), data['x'].max()]
        xseq = np.linspace(rangee[0], rangee[1], n)

    return predictdf(data, xseq, **params)

def bootstrap_statistics(series, statistic, n_samples=1000,
                         confidence_interval=0.95, random_state=None):
    """
    Default parameters taken from R's Hmisc smean.cl.boot
    """
    if random_state is None:
        random_state = np.random

    alpha = 1 - confidence_interval
    size = (n_samples, len(series))
    inds = random_state.randint(0, len(series), size=size)
    samples = series.values[inds]
    means = np.sort(statistic(samples, axis=1))
    return pd.DataFrame({'ymin': means[int((alpha / 2) * n_samples)],
                         'ymax': means[int((1 - alpha / 2) * n_samples)],
                         'y': [statistic(series)]})

def sort_base_rules(self):
    """ Sort the population lexicographically by truth vector.
    This should help speed up likelihood calculations.
    Note, resets the filter.
    """
    # np.lexsort will sort columns by rows, with the last
    # row as the primary sort key, etc; so we rotate the
    # truth array by 90 degrees to get it to do what we want.
    new_order = np.lexsort(np.rot90(self.base_flat_truth))
    self._reordering_cache = new_order

    self.base_flat_durations = self.base_flat_durations[new_order]
    self.base_flat_variable_weights = self.base_flat_variable_weights[new_order]

    new_flat_rules = [self.base_flat_rules[i] for i in new_order]
    self.base_flat_rules = new_flat_rules

    self.base_flat_truth = self.base_flat_truth[new_order]
    self.base_primitive_index = {t: i for i, t in enumerate(new_flat_rules)}

    self.reset_filter()

def number_classes(Yin, omitLabels=[]):
    """Remaps class labels to contiguous natural numbers starting at 0.
    In many frameworks (e.g. caffe) class labels are mapped to indices at
    the output of the CNN; hence this remapping.

    Any pixels that should be ignored will have class label of -1.
    """
    if Yin is None:
        return None

    yAll = np.sort(np.unique(Yin))
    yAll = [y for y in yAll if y not in omitLabels]

    Yout = -1 * np.ones(Yin.shape, dtype=Yin.dtype)
    for yIdx, y in enumerate(yAll):
        Yout[Yin == y] = yIdx

    return Yout

def test_sort_flexible(self):
    # Test sort on flexible dtype.
    a = array(
        data=[(3, 3), (3, 2), (2, 2), (2, 1), (1, 0), (1, 1), (1, 2)],
        mask=[(0, 0), (0, 1), (0, 0), (0, 0), (1, 0), (0, 0), (0, 0)],
        dtype=[('A', int), ('B', int)])

    test = sort(a)
    b = array(
        data=[(1, 1), (1, 2), (2, 1), (2, 2), (3, 3), (3, 2), (1, 0)],
        mask=[(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 1), (1, 0)],
        dtype=[('A', int), ('B', int)])
    assert_equal(test, b)
    assert_equal(test.mask, b.mask)

    test = sort(a, endwith=False)
    b = array(
        data=[(1, 0), (1, 1), (1, 2), (2, 1), (2, 2), (3, 2), (3, 3), ],
        mask=[(1, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 1), (0, 0), ],
        dtype=[('A', int), ('B', int)])
    assert_equal(test, b)
    assert_equal(test.mask, b.mask)

def compute_precision_mapping(pt):
    thresh_all = []
    prec_all = []
    for jj in xrange(1000):
        thresh = pt['details']['score'][:, jj]
        prec = pt['details']['precision'][:, jj]
        ind = np.argsort(thresh)
        # thresh, ind = torch.sort(thresh)
        thresh = thresh[ind]
        indexes = np.unique(thresh, return_index=True)[1]
        indexes = np.sort(indexes)
        thresh = thresh[indexes]

        thresh = np.vstack((min(-1000, min(thresh) - 1), thresh[:, np.newaxis], max(1000, max(thresh) + 1)))

        prec = prec[ind]
        for i in xrange(1, len(prec)):
            prec[i] = max(prec[i], prec[i - 1])
        prec = prec[indexes]

        prec = np.vstack((prec[0], prec[:, np.newaxis], prec[-1]))
        thresh_all.append(thresh)
        prec_all.append(prec)

    precision_score = {'thresh': thresh_all, "prec": prec_all}
    return precision_score

def compute_precision_score_mapping(thresh, prec, score):
    ind = np.argsort(thresh)
    # thresh, ind = torch.sort(thresh)
    thresh = thresh[ind]
    indexes = np.unique(thresh, return_index=True)[1]
    indexes = np.sort(indexes)
    thresh = thresh[indexes]

    thresh = np.vstack((min(-1000, min(thresh) - 1), thresh[:, np.newaxis], max(1000, max(thresh) + 1)))

    prec = prec[ind]
    for i in xrange(1, len(prec)):
        prec[i] = max(prec[i], prec[i - 1])
    prec = prec[indexes]

    prec = np.vstack((prec[0], prec[:, np.newaxis], prec[-1]))

    f = interp1d(thresh[:, 0], prec[:, 0])
    val = f(score)
    return val

def argsort(a, axis=-1):
    """Returns the indices that would sort an array with a stable sorting.

    Args:
        a (cupy.ndarray): Array to sort.
        axis (int or None): Axis along which to sort. Default is -1, which
            means sort along the last axis. If None is supplied, the array
            is flattened before sorting.

    Returns:
        cupy.ndarray: Array of indices that sort ``a``.

    .. note::
        For its implementation reason, ``cupy.argsort`` does not support
        ``kind`` and ``order`` parameters.

    .. seealso:: :func:`numpy.argsort`

    """
    return a.argsort(axis=axis)

def msort(a):
    """Returns a copy of an array sorted along the first axis.

    Args:
        a (cupy.ndarray): Array to be sorted.

    Returns:
        cupy.ndarray: Array of the same type and shape as ``a``.

    .. note:
        ``cupy.msort(a)``, the CuPy counterpart of ``numpy.msort(a)``, is
        equivalent to ``cupy.sort(a, axis=0)``.

    .. seealso:: :func:`numpy.msort`

    """
    # TODO(takagi): Support float16 and bool.
    return sort(a, axis=0)


# TODO(okuta): Implement sort_complex

def create_component_sframe(g, baseid_name='page_id', layer_name='layer'):
    """Get component SFrame enriched with structural properties for each component"""
    columns = g.vertices.column_names()
    columns.remove('__id')
    columns.remove('component_id')

    # Append s to have unique column names (required by graphlab)
    gb_dict = {c + 's': gl.aggregate.CONCAT(c) for c in columns}
    gb_dict['nids'] = gl.aggregate.CONCAT('__id')
    gb_dict['node_count'] = gl.aggregate.COUNT('__id')

    comps = g.vertices.groupby('component_id', gb_dict)
    comps['width'] = comps.apply(lambda x: len(np.unique(x[layer_name + 's'])))
    comps['height'] = comps.apply(lambda x: len(np.unique(x[baseid_name + 's'])))

    edges = g.edges.groupby('component_id', {'src': gl.aggregate.CONCAT('__src_id'),
                                             'tgt': gl.aggregate.CONCAT('__dst_id')})
    comps = comps.join(edges, 'component_id')
    return comps.sort('node_count', False)

def test_multicollinearity(df, target_name, r2_threshold=0.89):
    '''Tests if any of the features could be predicted from others with R2 >= 0.89

    input: dataframe, name of target (to exclude)
    '''
    r2s = pd.DataFrame()
    for feature in df.columns.difference([target_name]):
        model = sk.linear_model.Ridge()
        model.fit(df[df.columns.difference([target_name, feature])], df[feature])
        pos = np.in1d(model.coef_, np.sort(model.coef_)[-5:])

        r2s = r2s.append(pd.DataFrame(
            {'r2': sk.metrics.r2_score(df[feature],
                                       model.predict(df[df.columns.difference([target_name, feature])])),
             'predictors': str(df.columns.difference([target_name, feature])[np.ravel(np.argwhere(pos == True))].tolist())},
            index=[feature]))
        print('Testing', feature)
        print('-----------------')

    if len(r2s[r2s['r2'] >= r2_threshold]) > 0:
        print('Multicollinearity detected')
        print(r2s[r2s['r2'] >= r2_threshold])
    else:
        print('No multicollinearity')

def wsparsify(w_gpu, percentage):
    """
    Keeps only as many entries nonzero as specified by percentage.
    """
    w = w_gpu.get()
    vals = sort(w)[::-1]
    idx = floor(prod(w.shape()) * percentage / 100)

    zw_gpu = cua.zeros_like(w_gpu)   # gpu array filled with zeros
    tw_gpu = cua.empty_like(w_gpu)   # gpu array containing threshold
    tw_gpu.fill(vals[idx])
    w_gpu = cua.if_positive(w_gpu > tw_gpu, w_gpu, zw_gpu)

    del zw_gpu
    del tw_gpu

    return w_gpu

def sparsify(x, percentage):
    """
    Keeps only as many entries nonzero as specified by percentage.
    Note that only the largest values are kept.

    --------------------------------------------------------------------------
    Usage:

    Call:   y = sparsify(x, percentage)

    Input:  x            input ndarray x
            percentage   percentage of nonzero entries in y

    Output: sparsified version of x
    --------------------------------------------------------------------------

    Copyright (C) 2011 Michael Hirsch
    """
    vals = np.sort(x.flatten())[::-1]
    idx = np.floor(np.prod(x.shape) * percentage / 100)
    x[x < vals[idx]] = 0
    return x

def buckets(x, y, size=50):
    assert len(x[0]) == len(y[0])
    num_inputs = len(x)
    samples = x + y
    num_items = len(samples)
    xy = zip(*samples)
    xy.sort(key=lambda i: len(i[0]))

    t_len = size
    idx = 0
    bucks = [[[]] for _ in range(num_items)]
    for item in xy:
        if len(item[0]) > t_len:
            if len(bucks[0][idx]) > 0:
                for buck in bucks:
                    buck.append([])
                idx += 1
            while len(item[0]) > t_len:
                t_len += size
        for i in range(num_items):
            #print item[i]
            bucks[i][idx].append(item[i])

    return bucks[:num_inputs], bucks[num_inputs:]

def biased_out(prediction, bias):
    out = []
    b_pres = []
    for pre in prediction:
        b_pres.append(pre[:, 0] - pre[:, 1])
    props = np.concatenate(b_pres)
    props = np.sort(props)[::-1]
    idx = int(bias * len(props))
    if idx == len(props):
        idx -= 1
    th = props[idx]
    print 'threshold: ', th, 1 / (1 + np.exp(-th))
    for pre in b_pres:
        pre[pre >= th] = 0
        pre[pre != 0] = 1
        out.append(pre)
    return out

def ecdf(x):
    """Empirical cumulative distribution function

    Given a 1D array of values, returns a function f(q) that outputs the
    fraction of values less than or equal to q.

    Parameters
    ----------
    x : 1D array
        values for which to compute CDF

    Returns
    ----------
    ecdf_fun: Callable[[float], float]
        function that returns the value of the CDF at a given point
    """
    xp = np.sort(x)
    yp = np.arange(len(xp) + 1) / len(xp)

    def ecdf_fun(q):
        return yp[np.searchsorted(xp, q, side="right")]

    return ecdf_fun

def calc_volume(roi):
    # oar and ptv are lists using str(z) as keys
    # each item is an ordered list of points representing a polygon
    # polygon n is inside polygon n-1, then the current accumulated polygon is
    # polygon n subtracted from the accumulated polygon up to and including polygon n-1
    # Same method DICOM uses to handle rings and islands

    volume = 0.

    all_z_values = [round(float(z), 2) for z in list(roi)]
    all_z_values = np.sort(all_z_values)
    thicknesses = np.abs(np.diff(all_z_values))
    thicknesses = np.append(thicknesses, np.min(thicknesses))
    all_z_values = all_z_values.tolist()

    for z in list(roi):
        # z in coord will not necessarily go in order of z, convert z to float to lookup thickness
        # also used to check for top and bottom slices, to add area of those contours
        thickness = thicknesses[all_z_values.index(round(float(z), 2))]
        shapely_roi = points_to_shapely_polygon(roi[z])
        if shapely_roi:
            volume += shapely_roi.area * thickness

    return round(volume / 1000., 2)

def __init__(self, p=[-0.9594, 4.294], pprior=None, N=50, x=None, **kwargs):
    f = lambda t, s: np.array([t - s * abs(t), t + s * abs(t)])
    if pprior is None:
        self.pprior = {'p' + str(i): f(t, 10) for i, t in enumerate(p)}
    self.label = self.pprior.keys()
    self.ndim = len(p)
    self.p = p
    if x is None:
        self.N = N
        self.x = np.sort(10 * np.random.rand(N))
    else:
        self.N = len(x)
        self.x = x
    self.y, self.yerr = self.data(**kwargs)

# As prior, we assume an 'uniform' prior (i.e. constant prob. density)

def test_encode_data_roundtrip():
    minrand, maxrand = np.sort(np.random.randint(-427, 8848, 2))

    testdata = np.round((np.sum(
        np.dstack(np.indices((512, 512), dtype=np.float64)),
        axis=2) / (511. + 511.)) * maxrand, 2) + minrand

    baseval = -1000
    interval = 0.1

    rtripped = _decode(data_to_rgb(testdata.copy(), baseval, interval), baseval, interval)

    assert testdata.min() == rtripped.min()
    assert testdata.max() == rtripped.max()

def projsplx_multi(Y):
    n, m = Y.shape
    if n == 1:
        X = projsplx(Y)
    else:
        Y1 = -np.sort(-Y, axis=1)
        tmpsum = np.zeros(n)
        tmax = np.zeros(n)
        bget = np.zeros(n, dtype=bool)
        for ii in xrange(0, m - 1):
            active = (bget == False)
            tmpsum[active] = tmpsum[active] + Y1[active][:, ii]
            tmax[active] = (tmpsum[active] - 1) / (ii + 1)
            deactivate = (tmax >= Y1[:, ii + 1]) & active
            bget[deactivate] = True
        active = (bget == False)
        tmax[active] = (tmpsum[active] + Y1[active][:, m - 1] - 1) / m
        X = (Y.transpose() - tmax).transpose()
        X[X < 0.0] = 0.0
    return X

def projsplx(y):
    y1 = np.array(y, copy=True)
    m = y1.shape[1]
    bget = False

    y1[0][::-1].sort()

    tmpsum = 0
    for ii in xrange(0, m - 1):
        tmpsum = tmpsum + y1[0][ii]
        tmax = (tmpsum - 1) / ii
        if tmax >= y1[0][ii + 1]:
            bget = True
            break

    if not bget:
        tmax = (tmpsum + y1[0][m] - 1) / m

    y1 = y1 - tmax
    y1[y1 < 0.0] = 0.0
    return y1

def cond_projsplx_multi(Y, a_mat):
    n, m = Y.shape
    A = a_mat
    s = -np.sort(-(A * Y), axis=1)
    index = np.argsort(-(A * Y), axis=1)
    tmpsum = np.zeros(n)
    tmpsumdom = np.zeros(n)
    bget = np.zeros(n, dtype=bool)
    A_sort = A[np.arange(np.shape(A)[0])[:, np.newaxis], index]
    cond_s = s / (A_sort**2)
    tmax = np.zeros(n)
    for ii in xrange(0, m - 1):
        active = (bget == False)
        tmpsum[active] = tmpsum[active] + cond_s[active][:, ii]
        tmpsumdom[active] = tmpsumdom[active] + 1.0 / A_sort[active][:, ii]**2
        tmax[active] = (tmpsum[active] - 1) / tmpsumdom[active]
        deactivate = (tmax >= s[:, ii + 1]) & active
        bget[deactivate] = True

    active = (bget == False)
    tmax[active] = (tmpsum[active] + cond_s[active][:, m - 1] - 1) / (tmpsumdom[active] + 1.0 / (A_sort[active][:, m - 1])**2)
    X = (Y - np.matlib.repmat(tmax.reshape(n, 1), 1, m) * 1.0 / A)
    X[X < 0.0] = 0.0
    X = X / A
    return X

def get_symmetry_code_tri(pts):
    if len(pts) == 1:
        return '_s3()'
    elif len(pts) == 3:
        # Symmetry group [[a, a, b], [a, b, a], [b, a, a]].
        # Find the equal value `a`.
        tol = 1.0e-12
        beta = pts[0] - pts[0][0]
        ct = numpy.count_nonzero(abs(beta) < tol)
        assert ct in [1, 2], beta
        val = pts[0][0] if ct == 2 else pts[0][1]
        return '_s21({:.15e})'.format(val)

    # Symmetry group [[a, b, c], [c, a, b], ...].
    assert len(pts) == 6
    # Take the two largest values from a, b, c.
    pt0 = numpy.sort(pts[0])
    return '_s111({:.15e}, {:.15e})'.format(pt0[2], pt0[1])

def get_quadrature_points(order):
    """
    Returns the quadrature points for Gauss-Lobatto quadrature
    as a function of the order of the polynomial we want to
    represent.
    See: https://en.wikipedia.org/wiki/Gaussian_quadrature
    """
    return np.sort(np.concatenate((np.array([-1, 1]),
                                   poly.basis(order).deriv().roots())))

def trataGroups(objeto):
    current = list(filter(None.__ne__, objeto))
    current = np.sort(current, axis=0)

    for i in range(len(current[0])):
        current_ = [j[i] for j in current]
        mean_ = np.round(np.mean(current_, axis=0), 4)
        deviation_ = np.round(np.std(current_, axis=0, ddof=1), 4)

    return [mean_, deviation_]

def PA(samples, variables):
    datasets = 5000
    eig_vals = []
    for i in range(datasets):
        data = np.random.standard_normal((variables, samples))
        cor_ = np.corrcoef(data)
        eig_vals.append(np.sort(np.linalg.eig(cor_)[0])[::-1])

    quantile = (np.round(np.percentile(eig_vals, 95.0, axis=0), 4))
    mean_ = (np.round(np.mean(eig_vals, axis=0), 4))

    return quantile