The following are 50 code examples, extracted from open source Python projects, that illustrate how to use numpy.argsort().
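As a quick orientation before the project examples, here is a minimal, self-contained sketch (the arrays and variable names are invented for illustration) of the argsort patterns that recur below: ascending indices, descending order via [::-1], top-k selection, and reordering a companion array with the same index array.

import numpy as np

scores = np.array([0.3, 0.9, 0.1, 0.5])
labels = np.array(['a', 'b', 'c', 'd'])

asc = np.argsort(scores)    # indices that sort scores ascending -> [2, 0, 3, 1]
desc = asc[::-1]            # descending order, as in most examples below
top2 = desc[:2]             # indices of the two largest scores

print(scores[desc])         # [0.9 0.5 0.3 0.1]
print(labels[desc])         # companion array reordered with the same indices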
def PCA(data, num_components=None):
    # mean center the data
    data -= data.mean(axis=0)
    # calculate the covariance matrix
    R = np.cov(data, rowvar=False)
    # calculate eigenvectors & eigenvalues of the covariance matrix
    # use 'eigh' rather than 'eig' since R is symmetric,
    # the performance gain is substantial
    V, E = np.linalg.eigh(R)
    # sort eigenvalue in decreasing order
    idx = np.argsort(V)[::-1]
    E = E[:,idx]
    # sort eigenvectors according to same index
    V = V[idx]
    # select the first n eigenvectors (n is desired dimension
    # of rescaled data array, or dims_rescaled_data)
    E = E[:, :num_components]
    # carry out the transformation on the data using eigenvectors
    # and return the re-scaled data, eigenvalues, and eigenvectors
    return np.dot(E.T, data.T).T, V, E
def __SubDoWavelets(self,waveforms):
    scales = 4
    dimensions = 10
    nspk,ls = waveforms.shape
    cc = pywt.wavedec(waveforms,"haar",mode="symmetric",level=scales,axis=-1)
    cc = np.hstack(cc)

    sd = list()
    for i in range(ls):
        test_data = cc[:,i]
        thr_dist = np.std(test_data,ddof=1)*3
        thr_dist_min = np.mean(test_data)-thr_dist
        thr_dist_max = np.mean(test_data)+thr_dist
        aux = test_data[(test_data>thr_dist_min)&(test_data<thr_dist_max)]
        if aux.size > 10:
            sd.append(self.__test_ks(aux))
        else:
            sd.append(0)
    ind = np.argsort(sd)
    ind = ind[::-1]
    coeff = ind[:dimensions]

    waveletspk = cc[:,coeff]
    return waveletspk
def get_feature_importance(list_of_features):
    n_estimators=10000
    random_state=0
    n_jobs=4
    x_train=data_frame[list_of_features]
    y_train=data_frame.iloc[:,-1]
    feat_labels= data_frame.columns[1:]
    forest = BaggingRegressor(n_estimators=n_estimators,random_state=random_state,n_jobs=n_jobs)
    forest.fit(x_train,y_train)
    importances=forest.feature_importances_
    indices = np.argsort(importances)[::-1]

    for f in range(x_train.shape[1]):
        print("%2d) %-*s %f" % (f+1,30,feat_labels[indices[f]], importances[indices[f]]))

    plt.title("Feature Importance")
    plt.bar(range(x_train.shape[1]),importances[indices],color='lightblue',align='center')
    plt.xticks(range(x_train.shape[1]),feat_labels[indices],rotation=90)
    plt.xlim([-1,x_train.shape[1]])
    plt.tight_layout()
    plt.show()
def update_sort_idcs(self):
    # The selected points are sorted before all the other points -- an easy
    # way to achieve this is to add the maximum score to their score
    if self.current_order == 0:
        score = self.score_x
    elif self.current_order == 1:
        score = self.score_y
    elif self.current_order == 2:
        score = self.score_z
    else:
        raise AssertionError(self.current_order)
    score = score.copy()
    if len(self.selected_points):
        score[np.array(sorted(self.selected_points))] += score.max()
    self.sort_idcs = np.argsort(score)
def removeTopPCs(X, numRemovePCs):
    t0 = time.time()
    X_mean = X.mean(axis=0)
    X -= X_mean
    XXT = symmetrize(blas.dsyrk(1.0, X, lower=0))
    s,U = la.eigh(XXT)
    if (np.min(s) < -1e-4):
        raise Exception('Negative eigenvalues found')
    s[s<0]=0
    ind = np.argsort(s)[::-1]
    U = U[:, ind]
    s = s[ind]
    s = np.sqrt(s)

    #remove null PCs
    ind = (s>1e-6)
    U = U[:, ind]
    s = s[ind]

    V = X.T.dot(U/s)
    #print 'max diff:', np.max(((U*s).dot(V.T) - X)**2)
    X = (U[:, numRemovePCs:]*s[numRemovePCs:]).dot((V.T)[numRemovePCs:, :])
    X += X_mean
    return X
def _translate(seq, f_init, f_next, trg_eos_idx, src_sel, trg_sel,
               k, cond_init_trg, normalize, n_best, **kwargs):
    sample, score = gen_sample(
        f_init, f_next, x=numpy.array(seq).reshape([len(seq), 1]),
        eos_idx=trg_eos_idx, src_selector=src_sel, trg_selector=trg_sel,
        k=k, maxlen=3*len(seq), stochastic=False, argmax=False,
        cond_init_trg=cond_init_trg, **kwargs)

    if normalize:
        lengths = numpy.array([len(s) for s in sample])
        score = score / lengths
    if n_best == 1:
        sidx = numpy.argmin(score)
    elif n_best > 1:
        sidx = numpy.argsort(score)[:n_best]
    else:
        raise ValueError('n_best cannot be negative!')
    return sample[sidx], score[sidx]
def closestNeighbor(query, embedding_array, normed=False, top_k=1):
    '''Gets the index of the closest neighbor of embedding_array to
    the query point. Distance metric is cosine.

    SLOW. DO NOT USE THIS FOR RAPID COMPUTATION.
    '''
    embedding_array = numpy.array(embedding_array)
    if not normed:
        embedding_array = numpy.array([
            (embedding_array[i] / numpy.linalg.norm(embedding_array[i]))
                for i in range(embedding_array.shape[0])
        ])
    ## assuming embeddings are unit-normed by this point;
    ## norm(query) is a constant factor, so we can ignore it
    dists = numpy.array([
        numpy.dot(query, embedding_array[i])
            for i in range(embedding_array.shape[0])
    ])
    sorted_ixes = numpy.argsort(-1 * dists)
    return sorted_ixes[:top_k]
def testMerge(self, dtype=dtype):
    testarray1 = range(1,101)
    testarray2 = range(5,106)
    a = numpy.empty((100,2), dtype=dtype)
    b = numpy.empty((100,2), dtype=dtype)
    merged = numpy.empty((200,2), dtype=dtype)
    incompatible1 = numpy.empty((200,3), dtype=dtype)
    incompatible2 = numpy.empty(200, dtype=dtype)
    a[:,0] = numpy.arange(1,101)
    a[:,1] = numpy.arange(2,102)
    b[:,0] = numpy.arange(5,105)
    b[:,1] = numpy.arange(6,106)
    ref = numpy.concatenate([a,b])
    ref = ref[numpy.argsort(ref[:,0])]
    self.assertEqual(mapped_struct.index_merge(a, b, merged), 200)
    self.assertTrue((merged == ref).all())
    self.assertRaises(ValueError, mapped_struct.index_merge, a, b, incompatible1)
    self.assertRaises(ValueError, mapped_struct.index_merge, a, incompatible1, merged)
    self.assertRaises(ValueError, mapped_struct.index_merge, a, b, incompatible2)
    self.assertRaises(ValueError, mapped_struct.index_merge, a, incompatible2, merged)
def __init__(self, pos, color, mode=None):
    """
    ===============     ==============================================================
    **Arguments:**
    pos                 Array of positions where each color is defined
    color               Array of RGBA colors.
                        Integer data types are interpreted as 0-255; float data types
                        are interpreted as 0.0-1.0
    mode                Array of color modes (ColorMap.RGB, HSV_POS, or HSV_NEG)
                        indicating the color space that should be used when
                        interpolating between stops. Note that the last mode value is
                        ignored. By default, the mode is entirely RGB.
    ===============     ==============================================================
    """
    self.pos = np.array(pos)
    order = np.argsort(self.pos)
    self.pos = self.pos[order]
    self.color = np.array(color)[order]
    if mode is None:
        mode = np.ones(len(pos))
    self.mode = mode
    self.stopsCache = {}
def __loadChnTimeWave(self,f,selectChan):
    times = list()
    waveforms = list()
    spk_startswith = "spike_{0}".format(selectChan)
    for chn_unit in f["spikes"].keys():
        if chn_unit.startswith(spk_startswith):
            time = f["spikes"][chn_unit]["times"].value
            waveform = f["spikes"][chn_unit]["waveforms"].value
            times.append(time)
            waveforms.append(waveform)
    if times:
        times = np.hstack(times)
        waveforms = np.vstack(waveforms)
        sort_index = np.argsort(times)
        waveforms = waveforms[sort_index]
        times = times[sort_index]
        return times,waveforms
    else:
        return None,None
def __load_waveforms(self,selectChan,file_name):
    spk_startswith = "spike_{0}".format(selectChan)
    with hp.File(file_name,"r") as f:
        times = list()
        waveforms = list()
        for chn_unit in f["spikes"].keys():
            if chn_unit.startswith(spk_startswith):
                tep_time = f["spikes"][chn_unit]["times"].value
                waveform = f["spikes"][chn_unit]["waveforms"].value
                times.append(tep_time)
                waveforms.append(waveform)
        if times:
            times = np.hstack(times)
            waveforms = np.vstack(waveforms)
            sort_index = np.argsort(times)
            waveforms = waveforms[sort_index]
            return waveforms
        else:
            return None
def process_each_row_get_lable(row,vocabulary_index2word_label,vocabulary_word2index_label,result_list):
    """
    :param row: it is a list.length is number of labels. e.g. 2002
    :param vocabulary_index2word_label
    :param result_list
    :return: a lable
    """
    label_list=list(np.argsort(row))
    label_list.reverse()
    #print("label_list:",label_list) # a list,length is number of labels.
    for i,index in enumerate(label_list): # if index is not exists, and not _PAD,_END, then it is the label we want.
        #print(i,"index:",index)
        flag1=vocabulary_index2word_label[index] not in result_list
        flag2=index!=vocabulary_word2index_label[_PAD]
        flag3=index!=vocabulary_word2index_label[_END]
        if flag1 and flag2 and flag3:
            #print("going to return ")
            return vocabulary_index2word_label[index]

# write question id and labels to file system.
def get_label_using_logits_batch(question_id_sublist, logits_batch, vocabulary_index2word_label, f, top_number=5):
    print("get_label_using_logits.shape:", np.array(logits_batch).shape) # (1, 128, 2002))
    for i, logits in enumerate(logits_batch):
        index_list = np.argsort(logits)[-top_number:]
        #print("index_list:",index_list)
        index_list = index_list[::-1]
        label_list = []
        for index in index_list:
            #print("index:",index)
            label = vocabulary_index2word_label[index]
            label_list.append(label)  # ('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876'])
        # print("get_label_using_logits.label_list",label_list)
        write_question_id_with_labels(question_id_sublist[i], label_list, f)
    f.flush()

# get label using logits
def process_each_row_get_lable(row,vocabulary_index2word_label,vocabulary_word2index_label,result_list):
    """
    :param row: it is a list.length is number of labels. e.g. 2002
    :param vocabulary_index2word_label
    :param result_list
    :return: a lable
    """
    label_list=list(np.argsort(row))
    label_list.reverse()
    #print("label_list:",label_list) # a list,length is number of labels.
    for i,index in enumerate(label_list): # if index is not exists, and not _PAD,_END, then it is the label we want.
        #print(i,"index:",index)
        flag1=vocabulary_index2word_label[index] not in result_list
        flag2=index!=vocabulary_word2index_label[_PAD]
        flag3=index!=vocabulary_word2index_label[_END]
        if flag1 and flag2 and flag3:
            #print("going to return ")
            return vocabulary_index2word_label[index]
def filter_sort_unique(self, max_objval=float('Inf')):
    # filter
    if max_objval < float('inf'):
        good_idx = self.objvals <= max_objval
        self.objvals = self.objvals[good_idx]
        self.solutions = self.solutions[good_idx]

    if len(self.objvals) > 0:
        sort_idx = np.argsort(self.objvals)
        self.objvals = self.objvals[sort_idx]
        self.solutions = self.solutions[sort_idx]

        # unique
        b = np.ascontiguousarray(self.solutions).view(
            np.dtype((np.void, self.solutions.dtype.itemsize * self.P)))
        _, unique_idx = np.unique(b, return_index=True)
        self.objvals = self.objvals[unique_idx]
        self.solutions = self.solutions[unique_idx]
def round_solution_pool(pool, constraints):

    pool.distinct().sort()
    P = pool.P
    L0_reg_ind = np.isnan(constraints['coef_set'].C_0j)
    L0_max = constraints['L0_max']
    rounded_pool = SolutionPool(P)

    for solution in pool.solutions:
        # sort from largest to smallest coefficients
        feature_order = np.argsort([-abs(x) for x in solution])
        rounded_solution = np.zeros(shape=(1, P))
        l0_norm_count = 0
        for k in range(0, P):
            j = feature_order[k]
            if not L0_reg_ind[j]:
                rounded_solution[0, j] = np.round(solution[j], 0)
            elif l0_norm_count < L0_max:
                rounded_solution[0, j] = np.round(solution[j], 0)
                l0_norm_count += L0_reg_ind[j]
        rounded_pool.add(objvals=np.nan, solutions=rounded_solution)

    rounded_pool.distinct().sort()
    return rounded_pool
def listen(self, results):
    score_out = results['score_out']
    y_gt = results['y_gt']
    sort_idx = np.argsort(score_out, axis=-1)
    idx_gt = np.argmax(y_gt, axis=-1)
    correct = 0
    count = 0
    for kk, ii in enumerate(idx_gt):
        sort_idx_ = sort_idx[kk][::-1]
        for jj in sort_idx_[:self.top_k]:
            if ii == jj:
                correct += 1
                break
        count += 1
    # self.log.info('Correct {}/{}'.format(correct, count))
    self.correct += correct
    self.count += count
    self.step = int(results['step'])
    # self.log.info('Step {}'.format(self.step))
    pass
def nn(model, text, vectors, query, k=5):
    """
    Return the nearest neighbour sentences to query
    text: list of sentences
    vectors: the corresponding representations for text
    query: a string to search
    """
    qf = encode(model, [query])
    qf /= norm(qf)
    scores = numpy.dot(qf, vectors.T).flatten()
    sorted_args = numpy.argsort(scores)[::-1]
    sentences = [text[a] for a in sorted_args[:k]]
    print('QUERY: ' + query)
    print('NEAREST: ')
    for i, s in enumerate(sentences):
        print(s, sorted_args[i])
def nn(model, text, vectors, query, k=5):
    """
    Return the nearest neighbour sentences to query
    text: list of sentences
    vectors: the corresponding representations for text
    query: a string to search
    """
    qf = encode(model, [query])
    qf /= norm(qf)
    scores = numpy.dot(qf, vectors.T).flatten()
    sorted_args = numpy.argsort(scores)[::-1]
    sentences = [text[a] for a in sorted_args[:k]]
    print 'QUERY: ' + query
    print 'NEAREST: '
    for i, s in enumerate(sentences):
        print s, sorted_args[i]
def _spatial_sort(glyph):
    from scipy.spatial.distance import cdist
    from numpy import argsort
    from numpy import argmin

    curr = argmin(glyph[:,0])
    visited = set([curr])
    order = [curr]

    dd = cdist(glyph, glyph)

    while len(visited)<len(glyph):
        row = dd[curr,:]
        for i in argsort(row):
            if row[i]<=0.0 or i==curr or i in visited:
                continue
            order.append(i)
            visited.add(i)
            break

    glyph[:,:] = glyph[order,:]
def label_ranking_reciprocal_rank(label,   # [sent_num]
                                  preds):  # [sent_num]
    """
    Calcualting the reciprocal rank according to definition,
    """
    rank = np.argsort(preds)[::-1]
    #pos_rank = np.take(rank, np.where(label == 1)[0])
    #return np.mean(1.0 / pos_rank)
    if_find = False
    pos = 0
    for r in rank:
        pos += 1
        if label[r] == 1:
            first_pos_r = pos
            if_find = True
            break
    assert(if_find)
    return 1.0 / first_pos_r
def _matrix_inverse(self, matrix):
    """
    Computes inverse of a matrix.
    """
    matrix = np.array(matrix)
    n_features = matrix.shape[0]
    rank = np.linalg.matrix_rank(matrix)

    if rank == n_features:
        return np.linalg.inv(matrix)
    else:
        # Matrix is not full rank, so use Hadi's technique to compute inverse
        # Reference: Ali S. Hadi (1992) "Identifying Multiple Outliers in Multivariate Data" eg. 2.3, 2.4
        eigenValues, eigenVectors = np.linalg.eig(matrix)
        eigenValues = np.abs(eigenValues)  # to deal with -0 values
        idx = eigenValues.argsort()[::-1]
        eigenValues = eigenValues[idx]
        eigenVectors = eigenVectors[:, idx]

        s = eigenValues[eigenValues != 0].min()
        w = [1 / max(e, s) for e in eigenValues]
        W = w * np.eye(n_features)

        return eigenVectors.dot(W).dot(eigenVectors.T)
def get_1000G_snps(sumstats, out_file):
    sf = np.loadtxt(sumstats,dtype=str,skiprows=1)
    h5f = h5py.File('ref/Misc/1000G_SNP_info.h5','r')
    rf = h5f['snp_chr'][:]
    h5f.close()
    ind1 = np.in1d(sf[:,1],rf[:,2])
    ind2 = np.in1d(rf[:,2],sf[:,1])
    sf1 = sf[ind1]
    rf1 = rf[ind2]
    ### check order ###
    if sum(sf1[:,1]==rf1[:,2])==len(rf1[:,2]):
        print 'Good!'
    else:
        print 'Shit happens, sorting sf1 to have the same order as rf1'
        O1 = np.argsort(sf1[:,1])
        O2 = np.argsort(rf1[:,2])
        O3 = np.argsort(O2)
        sf1 = sf1[O1][O3]
    out = ['hg19chrc snpid a1 a2 bp or p'+'\n']
    for i in range(len(sf1[:,1])):
        out.append(sf1[:,0][i]+' '+sf1[:,1][i]+' '+sf1[:,2][i]+' '+sf1[:,3][i]+' '+rf1[:,1][i]+' '+sf1[:,5][i]+' '+sf1[:,6][i]+'\n')
    ff = open(out_file,"w")
    ff.writelines(out)
    ff.close()
def plot_heatmaps(data, mis, column_label, cont, topk=30, prefix=''):
    cmap = sns.cubehelix_palette(as_cmap=True, light=.9)
    m, nv = mis.shape
    for j in range(m):
        inds = np.argsort(- mis[j, :])[:topk]
        if len(inds) >= 2:
            plt.clf()
            order = np.argsort(cont[:,j])
            subdata = data[:, inds][order].T
            subdata -= np.nanmean(subdata, axis=1, keepdims=True)
            subdata /= np.nanstd(subdata, axis=1, keepdims=True)
            columns = [column_label[i] for i in inds]
            sns.heatmap(subdata, vmin=-3, vmax=3, cmap=cmap, yticklabels=columns, xticklabels=False, mask=np.isnan(subdata))
            filename = '{}/heatmaps/group_num={}.png'.format(prefix, j)
            if not os.path.exists(os.path.dirname(filename)):
                os.makedirs(os.path.dirname(filename))
            plt.title("Latent factor {}".format(j))
            plt.yticks(rotation=0)
            plt.savefig(filename, bbox_inches='tight')
            plt.close('all')
            #plot_rels(data[:, inds], map(lambda q: column_label[q], inds), colors=cont[:, j],
            #          outfile=prefix + '/relationships/group_num=' + str(j), latent=labels[:, j], alpha=0.1)
def plot_top_relationships(data, corex, labels, column_label, topk=5, prefix=''):
    dual = (corex.moments['X_i Y_j'] * corex.moments['X_i Z_j']).T
    alpha = dual > 0.04
    cy = corex.moments['ry']
    m, nv = alpha.shape
    for j in range(m):
        inds = np.where(alpha[j] > 0)[0]
        inds = inds[np.argsort(- dual[j][inds])][:topk]
        if len(inds) >= 2:
            if dual[j, inds[0]] > 0.1:
                factor = labels[:, j]
                title = '$Y_{%d}$' % j
            else:
                k = np.argmax(np.abs(cy[j]))
                if k == j:
                    k = np.argsort(-np.abs(cy[j]))[1]
                factor = corex.moments['X_i Z_j'][inds[0], j] * labels[:, j] + corex.moments['X_i Z_j'][inds[0], k] * labels[:, k]
                title = '$Y_{%d} + Y_{%d}$' % (j, k)
            plot_rels(data[:, inds], map(lambda q: column_label[q], inds), colors=factor,
                      outfile=prefix + '/relationships/group_num=' + str(j), title=title)
def trim(g, max_parents=False, max_children=False):
    for node in g:
        if max_parents:
            parents = list(g.successors(node))
            weights = [g.edge[node][parent]['weight'] for parent in parents]
            for weak_parent in np.argsort(weights)[:-max_parents]:
                g.remove_edge(node, parents[weak_parent])
        if max_children:
            children = g.predecessors(node)
            weights = [g.edge[child][node]['weight'] for child in children]
            for weak_child in np.argsort(weights)[:-max_children]:
                g.remove_edge(children[weak_child], node)
    return g

# Misc. utilities
def deviation_plot(rp, variable_name, slope_cutoff=1, average_cutoff = 2.):
    average_panel = rp.value_panel(variable_name, types=['average'])
    average_panel = (average_panel.T - np.median(average_panel, axis=1)).T
    average_panel.sort()
    average_ranges = np.max(average_panel, axis=1) - np.min(average_panel, axis=1)
    average_panel = average_panel[np.argsort(average_ranges)][::-1]

    slope_panel = rp.value_panel(variable_name, types=['slope'])
    slope_panel = (slope_panel.T - np.median(slope_panel, axis=1)).T
    slope_panel.sort()
    slope_ranges = np.max(slope_panel, axis=1) - np.min(slope_panel, axis=1)
    slope_panel = slope_panel[np.argsort(slope_ranges)][::-1]

    return _multiplot(rp.dataset, variable_name, slope_panel, average_panel,
                      left_vmin = -1.0*slope_cutoff, left_vmax = slope_cutoff,
                      right_vmin = -1.0*average_cutoff, right_vmax = average_cutoff)
def get_local_words(preds, vocab, NEs=[], k=50):
    """
    given the word probabilities over many coordinates,
    first normalize the probability of each word in different
    locations to get a probability distribution, then compute
    the entropy of the word's distribution over all coordinates
    and return the words that are low entropy and are not
    named entities.
    """
    #normalize the probabilites of each vocab using entropy
    normalized_preds = normalize(preds, norm='l1', axis=0)
    entropies = stats.entropy(normalized_preds)
    sorted_indices = np.argsort(entropies)
    sorted_local_words = np.array(vocab)[sorted_indices].tolist()

    filtered_local_words = []
    NEset = set(NEs)
    for word in sorted_local_words:
        if word in NEset:
            continue
        filtered_local_words.append(word)
    return filtered_local_words[0:k]
def cr(self):
    # Composite Reliability
    composite = pd.DataFrame(0, index=np.arange(1), columns=self.latent)

    for i in range(self.lenlatent):
        block = self.data_[self.Variables['measurement']
                           [self.Variables['latent'] == self.latent[i]]]
        p = len(block.columns)

        if(p != 1):
            cor_mat = np.cov(block.T)
            evals, evecs = np.linalg.eig(cor_mat)
            U, S, V = np.linalg.svd(cor_mat, full_matrices=False)

            indices = np.argsort(evals)
            indices = indices[::-1]
            evecs = evecs[:, indices]
            evals = evals[indices]

            loadings = V[0, :] * np.sqrt(evals[0])

            numerador = np.sum(abs(loadings))**2
            denominador = numerador + (p - np.sum(loadings ** 2))
            cr = numerador / denominador
            composite[self.latent[i]] = cr
        else:
            composite[self.latent[i]] = 1

    composite = composite.T
    return(composite)
def _get_sorted_channels_(self, all_keys, pattern):
    sub_list = [f for f in all_keys if pattern in f]
    all_channels = [int(f.split(pattern)[1]) for f in sub_list]
    idx = numpy.argsort(all_channels)
    return sub_list, idx
def set_streams(self, stream_mode):

    if stream_mode == 'single-file':

        sources = []
        to_write = []
        count = 0
        params = self.get_description()
        my_file = h5py.File(self.file_name)
        all_matches = [re.findall('\d+', u) for u in my_file.keys()]
        all_streams = []
        for m in all_matches:
            if len(m) > 0:
                all_streams += [int(m[0])]

        idx = numpy.argsort(all_streams)

        for i in xrange(len(all_streams)):
            params['h5_key'] = my_file.keys()[idx[i]]
            new_data = type(self)(self.file_name, params)
            sources += [new_data]
            to_write += ['We found the datafile %s with t_start %d and duration %d' %(new_data.file_name, new_data.t_start, new_data.duration)]

        print_and_log(to_write, 'debug', logger)
        return sources

    elif stream_mode == 'multi-files':
        return H5File.set_streams(stream_mode)
def set_streams(self, stream_mode):

    if stream_mode == 'single-file':

        sources = []
        to_write = []
        count = 0
        params = self.get_description()
        my_file = h5py.File(self.file_name)
        all_matches = my_file.get('recordings').keys()
        all_streams = []
        for m in all_matches:
            all_streams += [int(m)]

        idx = numpy.argsort(all_streams)

        for count in xrange(len(all_streams)):
            params['recording_number'] = all_streams[idx[count]]
            new_data = type(self)(self.file_name, params)
            sources += [new_data]
            to_write += ['We found the datafile %s with t_start %d and duration %d' %(new_data.file_name, new_data.t_start, new_data.duration)]

        print_and_log(to_write, 'debug', logger)
        return sources

    elif stream_mode == 'multi-files':
        return H5File.set_streams(stream_mode)
def rho_estimation(data, update=None, compute_rho=True, mratio=0.01):

    N = len(data)
    rho = numpy.zeros(N, dtype=numpy.float32)

    if update is None:
        dist = distancematrix(data)
        didx = lambda i,j: i*N + j - i*(i+1)//2 - i - 1
        nb_selec = max(5, int(mratio*N))
        sdist = {}

        if compute_rho:
            for i in xrange(N):
                indices = numpy.concatenate((didx(i, numpy.arange(i+1, N)), didx(numpy.arange(0, i-1), i)))
                tmp = numpy.argsort(numpy.take(dist, indices))[:nb_selec]
                sdist[i] = numpy.take(dist, numpy.take(indices, tmp))
                rho[i] = numpy.mean(sdist[i])

    else:
        M = len(update[0])
        nb_selec = max(5, int(mratio*M))
        sdist = {}

        for i in xrange(N):
            dist = distancematrix(data[i].reshape(1, len(data[i])), update[0]).ravel()
            all_dist = numpy.concatenate((dist, update[1][i]))
            idx = numpy.argsort(all_dist)[:nb_selec]
            sdist[i] = numpy.take(all_dist, idx)
            rho[i] = numpy.mean(sdist[i])

    return rho, dist, sdist, nb_selec
def update_data_plot(self):
    reverse_sort = np.argsort(self.sort_idcs)

    if len(self.inspect_points):
        inspect = reverse_sort[np.array(sorted(self.inspect_points))]
        data = numpy.vstack((np.ones(len(inspect))*(2*self.raw_lags[-1]-self.raw_lags[-2]),
                             inspect+0.5)).T
        self.inspect_markers.set_offsets(data)
        self.inspect_markers.set_color(self.inspect_colors)
    else:
        self.inspect_markers.set_offsets([])
        self.inspect_markers.set_color([])

    self.ui.data_overview.draw_idle()
def eigenDecompose(self, X, K, normalize=True):
    if (X.shape[1] >= X.shape[0]):
        s,U = la.eigh(K)
    else:
        U, s, _ = la.svd(X, check_finite=False, full_matrices=False)
        if (s.shape[0] < U.shape[1]):
            s = np.concatenate((s, np.zeros(U.shape[1]-s.shape[0])))  #note: can use low-rank formulas here
        s=s**2
        if normalize:
            s /= float(X.shape[1])
    if (np.min(s) < -1e-10):
        raise Exception('Negative eigenvalues found')
    s[s<0]=0
    ind = np.argsort(s)[::-1]
    U = U[:, ind]
    s = s[ind]
    return s,U
def threshold_from_predictions(y, y_pred, false_positive_margin=0, recall=1):
    """Determines a threshold for classifying examples as positive

    Args:
        y: labels
        y_pred: scores from the classifier
        recall: Threshold is set to classify at least this fraction of positive
            labelled examples as positive
        false_positive_margin: Threshold is set to acheive desired recall, and
            then is extended to include an additional fraction of negative
            labelled examples equal to false_positive_margin (This allows adding
            a buffer to the threshold while maintaining a constant "cost")
    """
    n_positive = np.count_nonzero(y)
    n_negative = len(y) - n_positive
    if n_positive == 0:
        return np.max(y_pred)
    if false_positive_margin == 0 and recall == 1:
        return np.min(y_pred[y])
    ind = np.argsort(y_pred)
    y_pred_sorted = y_pred[ind]
    y_sorted = y[ind]
    so_far = [0, 0]
    j = 0
    for i in reversed(range(len(y_sorted))):
        so_far[y_sorted[i]] += 1
        if so_far[1] >= int(np.floor(recall * n_positive)):
            j = i
            break
    so_far = [0, 0]
    if false_positive_margin == 0:
        return y_pred_sorted[j]
    k = 0
    for i in reversed(range(j)):
        so_far[y_sorted[i]] += 1
        if so_far[0] >= false_positive_margin * n_negative:
            k = i
            break
    return y_pred_sorted[k]
def relabel_by_size(labels):
    """ Relabel clusters so they are sorted by number of members, descending.
    Args:
        labels (np.array(int)): 1-based cluster labels
    """
    order = np.argsort(np.argsort(-np.bincount(labels)))
    return 1 + order[labels]
def adjust_pvalue_bh(p):
    """ Multiple testing correction of p-values using the Benjamini-Hochberg procedure """
    descending = np.argsort(p)[::-1]
    # q = p * N / k where p = p-value, N = # tests, k = p-value rank
    scale = float(len(p)) / np.arange(len(p), 0, -1)
    q = np.minimum(1, np.minimum.accumulate(scale * p[descending]))

    # Return to original order
    return q[np.argsort(descending)]
def compute_readpairs_per_umi_threshold(reads, subsample_rate):
    ''' Compute a threshold above which the UMIs are unlikely to be PCR off-products.
        reads (np.array(int)) - Read pairs for each UMI
        subsample_rate (float) - Subsample reads to this fraction.
        Returns threshold (int) - The RPPU threshold in the subsampled space '''

    if len(np.unique(reads)) < 2:
        print 'Skipping RPPU threshold calculation.'
        return 1

    print 'RPPU subsample rate: %0.4f' % subsample_rate

    reads = np.random.binomial(reads, subsample_rate)
    reads = reads[reads > 0]

    if len(np.unique(reads)) < 2:
        print 'Subsampling gave a degenerate distribution of RPPU. Skipping RPPU threshold calculation.'
        return 1

    new_n50 = tk_stats.NX(reads, 0.5)

    print 'New N50: %d:' % new_n50

    # Log-transform counts
    log_reads = np.log(reads)

    # Run K-Means. Reshape necessary because kmeans takes a matrix.
    kmeans = sk_cluster.KMeans(2).fit(log_reads.reshape((-1,1)))
    kmeans.predict(log_reads.reshape((-1,1)))

    # Take the cluster with the smallest mean
    min_cluster = np.argsort(np.ravel(kmeans.cluster_centers_))[0]

    print 'RPPU component means: ' + str(list(iter(np.exp(kmeans.cluster_centers_))))
    print 'RPPU component members: ' + str(np.bincount(kmeans.labels_))

    # Take the max element in the min-cluster
    threshold = np.max(reads[kmeans.labels_ == min_cluster])

    return threshold
def rebalance(self):
    """ Rebalances the binary heap. Takes O(n log n) time to run.
        Avoid using, when possible. """
    # Sort array by priority
    sorted_indices_by_priority = np.argsort(-self.pq_array[:,0])
    self.pq_array = self.pq_array[sorted_indices_by_priority]
    pq_indices = range(self.size)

    # Create hash tables
    self.pq_hash = dict(zip(pq_indices,self.pq_array[:,1]))
    self.exp_hash = dict(zip(self.pq_array[:,1],pq_indices))
def rank_output(self):
    self.output_ranks = np.argsort(self.output_raw, axis=1, kind='mergesort').ravel()[::-1].astype(np.int32)
def overlay_emojiface(probs):
    if max(probs) > 0.8:
        emotion = emotions[np.argmax(probs)]
        return 'emoji/{}-{}.png'.format(emotion, emotion)
    else:
        index1, index2 = np.argsort(probs)[::-1][:2]
        emotion1 = emotions[index1]
        emotion2 = emotions[index2]
        return 'emoji/{}-{}.png'.format(emotion1, emotion2)
def __call__(self, words, weights, vocabulary_max):
    if len(words) < vocabulary_max * self.trigger_ratio:
        return words, weights

    if not isinstance(words, numpy.ndarray):
        words = numpy.array(words)

    # Tail optimization does not help with very large vocabularies
    if len(words) > vocabulary_max * 2:
        indices = numpy.argpartition(weights, len(weights) - vocabulary_max)
        indices = indices[-vocabulary_max:]
        words = words[indices]
        weights = weights[indices]
        return words, weights

    # Vocabulary typically consists of these three parts:
    # 1) the core - we found it's border - `core_end` - 15%
    # 2) the body - 70%
    # 3) the minor tail - 15%
    # (1) and (3) are roughly the same size
    # (3) can be safely discarded, (2) can be discarded with care,
    # (1) shall never be discarded.
    sorter = numpy.argsort(weights)[::-1]
    weights = weights[sorter]
    trend_start = int(len(weights) * 0.2)
    trend_finish = int(len(weights) * 0.8)
    z = numpy.polyfit(numpy.arange(trend_start, trend_finish),
                      numpy.log(weights[trend_start:trend_finish]),
                      1)
    exp_z = numpy.exp(z[1] + z[0] * numpy.arange(len(weights)))
    avg_error = numpy.abs(weights[trend_start:trend_finish] -
                          exp_z[trend_start:trend_finish]).mean()
    tail_size = numpy.argmax((numpy.abs(weights - exp_z) < avg_error)[::-1])
    weights = weights[:-tail_size][:vocabulary_max]
    words = words[sorter[:-tail_size]][:vocabulary_max]

    return words, weights