The following 50 code examples, extracted from open-source Python projects, illustrate how to use scipy.sparse.coo_matrix().
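Before the project examples, here is a minimal, self-contained sketch of the constructor they all share: coo_matrix((data, (row, col)), shape=...) builds a sparse matrix from three parallel arrays, and duplicate (row, col) pairs are summed on conversion. The values below are illustrative only, not taken from any of the projects.

import numpy as np
from scipy.sparse import coo_matrix

# Three stored entries; the two at (0, 1) are summed to 11 on conversion.
row = np.array([0, 0, 1])
col = np.array([1, 1, 2])
data = np.array([4.0, 7.0, 5.0])

M = coo_matrix((data, (row, col)), shape=(3, 3))
print(M.toarray())  # dense view: [[0, 11, 0], [0, 0, 5], [0, 0, 0]]
M_csr = M.tocsr()   # most examples convert to CSR for fast arithmetic/slicing
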
def make_ppmi_mat(old_mat, row_probs, col_probs, smooth, neg=1, normalize=False):
    prob_norm = old_mat.sum() + (old_mat.shape[0] * old_mat.shape[1]) * smooth
    old_mat = old_mat.tocoo()
    row_d = old_mat.row
    col_d = old_mat.col
    data_d = old_mat.data
    neg = np.log(neg)
    for i in range(len(old_mat.data)):  # xrange in the Python 2 original
        if data_d[i] == 0.0:
            continue
        joint_prob = (data_d[i] + smooth) / prob_norm
        denom = row_probs[row_d[i], 0] * col_probs[0, col_d[i]]
        if denom == 0.0:
            data_d[i] = 0
            continue
        data_d[i] = np.log(joint_prob / denom)
        data_d[i] = max(data_d[i] - neg, 0)
        if normalize:
            data_d[i] /= -1 * np.log(joint_prob)
    return coo_matrix((data_d, (row_d, col_d)))

def _getEdgeP(self, xEdge, yEdge, zEdge):
    if self.dim == 2:
        raise Exception('Not implemented')  # this should be a reordering of the face inner product?
    ind1, ind2, ind3 = [], [], []
    for ind in self._sortedCells:
        p = self._pointer(ind)
        w = self._levelWidth(p[-1])
        posX = [0, 0] if xEdge == 'eX0' else [w, 0] if xEdge == 'eX1' else [0, w] if xEdge == 'eX2' else [w, w]
        posY = [0, 0] if yEdge == 'eY0' else [w, 0] if yEdge == 'eY1' else [0, w] if yEdge == 'eY2' else [w, w]
        posZ = [0, 0] if zEdge == 'eZ0' else [w, 0] if zEdge == 'eZ1' else [0, w] if zEdge == 'eZ2' else [w, w]
        ind1.append(self._ex2i[self._index([p[0], p[1] + posX[0], p[2] + posX[1], p[3]])])
        ind2.append(self._ey2i[self._index([p[0] + posY[0], p[1], p[2] + posY[1], p[3]])] + self.ntEx)
        ind3.append(self._ez2i[self._index([p[0] + posZ[0], p[1] + posZ[1], p[2], p[3]])] + self.ntEx + self.ntEy)
    IND = np.r_[ind1, ind2, ind3]
    PXXX = sp.coo_matrix(
        (np.ones(self.dim * self.nC), (range(self.dim * self.nC), IND)),
        shape=(self.dim * self.nC, self.ntE)
    ).tocsr()
    Re = self._deflationMatrix('E')
    return PXXX * Re

def matixToRowColDataArr(X):
    """
    Convert sparse affinity/similarity matrix to numpy array format
    (row_array, col_array, data_array) so the cython update function
    can work efficiently on it.
    """
    # convert to coo format (from lil, csr, csc)
    if isinstance(X, coo_matrix):
        X_coo = X
    elif isinstance(X, csr_matrix) or isinstance(X, lil_matrix):
        X_coo = X.tocoo()
    else:  # others, like a numpy matrix, can be converted to a coo matrix
        X_coo = coo_matrix(X)
    # Upcast matrix to a floating point format (if necessary)
    X_coo = X_coo.asfptype()
    # get row_array, col_array, data_array in the correct dtype (for cython to work);
    # the original used np.int, which was removed from recent numpy releases
    row_array, col_array, data_array = X_coo.row.astype(int), X_coo.col.astype(int), X_coo.data
    return row_array, col_array, data_array

def loadMatrix(data_file, dataCutoff=None):
    """
    Load a similarity data file.
    If dataCutoff is not None, all values (affinity/similarity) below it are discarded.
    """
    # print('{0}, loading data'.format(datetime.now()))
    # pd.DataFrame.from_csv was removed from pandas; read_csv is the replacement
    simi = pd.read_csv(data_file, sep='\t', index_col=None)
    samples = sorted(list(set(simi.row) | set(simi.col)))
    samplesInd = {el: ind for ind, el in enumerate(samples)}
    row, col, data = (simi.row.map(lambda x: samplesInd[x]),
                      simi.col.map(lambda x: samplesInd[x]),
                      simi.data)
    if dataCutoff is not None:
        row_new, col_new, data_new = [], [], []
        for r, c, d in zip(row, col, data):
            if d > dataCutoff:
                row_new.append(r)
                col_new.append(c)
                data_new.append(d)
        simi_mat = coo_matrix((data_new, (row_new, col_new)),
                              shape=(len(samplesInd), len(samplesInd)))
    else:
        simi_mat = coo_matrix((data, (row, col)),
                              shape=(len(samplesInd), len(samplesInd)))
    return simi_mat

def makeDelta1(R):
    """Make the delta1 coboundary matrix

    :param R: Edge list, NEdges x 2. It is assumed that there is at least
        one edge incident on every vertex.
    """
    NEdges = R.shape[0]
    NVertices = int(np.max(R)) + 1
    # Make a list of edges for fast lookup
    Edges = []
    for i in range(NVertices):
        Edges.append({})
    for i in range(R.shape[0]):
        [a, b] = [int(R[i, 0]), int(R[i, 1])]
        Edges[a][b] = i
        Edges[b][a] = i
    tic = time.time()
    (I, J, V) = get3CliquesBrute(Edges)
    toc = time.time()
    print("Elapsed time 3 cliques brute: %g" % (toc - tic))
    [I, J, V] = [a.flatten() for a in [I, J, V]]
    TriNum = len(I) // 3  # integer division; len(I)/3 is a float in Python 3
    Delta1 = sparse.coo_matrix((V, (I, J)), shape=(TriNum, NEdges)).tocsr()
    return Delta1

def _create_main_design(self, **kwargs):
    r"""
    Create design matrix for main effects

    Keyword Args:
        * *df* (``DataFrame``): specify a new dataframe to create the design matrix from

    Returns:
        array_like: design matrix in sparse CSR format
    """
    df = kwargs.get('df', self.train_df)
    df.reset_index(drop=True, inplace=True)
    df['row_index'] = df.index
    df['intercept'] = 1.0  # assume intercept is always included
    id_cols = ['row_index']
    melted_df = pd.melt(df[id_cols + self.main_effects], id_cols)
    melted_df = melted_df.merge(self.main_map, on='variable')
    melted_df['col_index'] = melted_df['main_idx']
    row = melted_df.row_index
    col = melted_df.col_index
    data = melted_df.value
    return sparse.coo_matrix((data, (row, col)),
                             shape=(max(row) + 1, max(col) + 1)).tocsr()

def read_data(filename):
    """ Reads in the last.fm dataset, and returns a tuple of a pandas dataframe
    and a sparse matrix of artist/user/playcount """
    # read in triples of user/artist/playcount from the input dataset
    start = time.time()
    logging.debug("reading data from %s", filename)
    data = pandas.read_table(filename, usecols=[0, 2, 3],
                             names=['user', 'artist', 'plays'])

    # map each artist and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['artist'] = data['artist'].astype("category")

    # create a sparse matrix of all the users/plays
    plays = coo_matrix((data['plays'].astype(numpy.float32),
                        (data['artist'].cat.codes.copy(),
                         data['user'].cat.codes.copy())))

    logging.debug("read data file in %s", time.time() - start)
    return data, plays

def bm25_weight(X, K1=100, B=0.8):
    """ Weighs each row of a sparse matrix X by BM25 weighting """
    # calculate idf per term (user)
    X = coo_matrix(X)
    N = float(X.shape[0])
    idf = log(N / (1 + bincount(X.col)))

    # calculate length_norm per document (artist)
    row_sums = numpy.ravel(X.sum(axis=1))
    average_length = row_sums.mean()
    length_norm = (1.0 - B) + B * row_sums / average_length

    # weight matrix rows by bm25
    X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
    return X

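A quick toy call to sanity-check the weighting above, assuming the module-level imports the snippet implies (import numpy, from numpy import log, bincount, plus coo_matrix from scipy.sparse); the play counts are illustrative only:

import numpy
from numpy import log, bincount
from scipy.sparse import coo_matrix

# 2 artists x 2 users with a few play counts
plays = coo_matrix(([3.0, 1.0, 5.0], ([0, 0, 1], [0, 1, 1])), shape=(2, 2))
weighted = bm25_weight(plays)
print(weighted.toarray())
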
def prepare_model(self, dataset):
    '''Load the data from the training file into a format
    adapted for the KNN methods.
    '''
    filename = dataset.dirname + 'data/train_set_triplets'
    if os.path.isfile(filename + '.npy'):
        file_content = np.load(filename + '.npy')
    else:
        file_content = np.loadtxt(filename)
        np.save(filename, file_content)
    # self.user_item = ssp.coo_matrix((file_content[:, 2], (file_content[:, 0], file_content[:, 1]))).tocsr()
    self.binary_user_item = ssp.coo_matrix(
        (np.ones(file_content.shape[0]), (file_content[:, 0], file_content[:, 1]))
    ).tocsr()
    del file_content
    self.n_items = self.binary_user_item.shape[1]
    self.n_users = self.binary_user_item.shape[0]

def GLGrid(train):
    '''
    Function for obtaining the optimal GraphLab Factorization Recommender
    parameters for a given dataset.

    train: scipy.sparse.coo_matrix
        The dataset used for grid searching the best parameters.

    Returns: dict
        Dictionary of the best GraphLab parameters for the given dataset.
    '''
    c = coo_matrix(train)
    sf = graphlab.SFrame({'row': c.row, 'col': c.col, 'data': c.data})
    sf_small = sf.dropna('data', how="all")
    folds = graphlab.cross_validation.KFold(sf_small, 3)
    params = dict([('target', 'data'), ('user_id', 'row'), ('item_id', 'col'),
                   ('num_factors', [10, 20]), ('sgd_step_size', [0.02, 10])])
    job = graphlab.grid_search.create(folds,
                                      graphlab.factorization_recommender.create,
                                      params)
    params = job.get_best_params()
    return params

def prepare_input_data(self, X):
    """
    Check to make sure that the input matrix and its mask of missing
    values are valid. Returns X and missing mask.
    """
    # TODO: separate out safety checks in _preprocess_sparse as well,
    # and include them here instead
    self._check_input(X)
    shape = X.shape
    coo = coo_matrix(X)
    row_id = coo.row
    col_id = coo.col
    self.missing_mask = row_id, col_id, shape
    self._check_max_rank(X)
    self._check_missing_value_mask()
    return X

def sparseDf(self, df, matrixType="csc"):
    """
    convert a pandas sparse df to a scipy sparse array
    :param df: pandas sparse df
    :param matrixType: csc or csr
    :return: scipy sparse array
    """
    columns = df.columns
    dat, rows = map(list, zip(*[(df[col].sp_values - df[col].fill_value,
                                 df[col].sp_index.to_int_index().indices)
                                for col in columns]))
    cols = [np.ones_like(a) * i for (i, a) in enumerate(dat)]
    datF, rowsF, colsF = np.concatenate(dat), np.concatenate(rows), np.concatenate(cols)
    arr = sparse.coo_matrix((datF, (rowsF, colsF)), df.shape, dtype=np.float64)
    if matrixType == "csc":
        return arr.tocsc()
    elif matrixType == "csr":
        return arr.tocsr()  # the original returned tocsc() here too, which was a bug
    else:
        raise ValueError("Only accept csc or csr")

def solveSingle(self, inputDF, outputDict, rho, beta_target):
    I, J, V, Y = [], [], [], []
    fd = {}  # mapping feature names to consecutive integers, starting with 0
    for i, (id, x) in enumerate(inputDF.items()):
        l = outputDict.get(id)
        for k, v in x.items():
            I.append(i)
            J.append(k)
            V.append(v)
            upd(fd, k)
        Y.append(l)
    J = [fd[k] for k in J]  # map() returns an iterator in Python 3; materialize it
    X = sparse.coo_matrix((V, (I, J)), shape=(I[-1] + 1, len(fd)))
    # ordered list of feature names according to their integer ids in fd
    fd_reverse = [k for k, v in sorted(fd.items(), key=lambda t: t[1])]
    # y_new = y - X . beta_target
    # converting a proximal least squares problem to a ridge regression
    ZmUl = np.array([beta_target.get(k, 0) for k in fd_reverse])
    y_new = np.array(Y) - X * ZmUl
    ridge = Ridge(alpha=rho, fit_intercept=False)
    ret = ridge.fit(X, y_new)
    # ret = self.lr.fit(X, y_new)
    return dict(zip(fd_reverse, (ret.coef_ + ZmUl).tolist()))

def split(self, X):
    X = sp.coo_matrix(X)
    rng = np.random.RandomState(self.random_state)
    shape = X.shape
    n_data = len(X.data)
    n_train = int(self.train_size * n_data)
    for it in range(self.n_iter):
        ind = rng.permutation(n_data)
        train_ind = ind[:n_train]
        test_ind = ind[n_train:]
        X_tr = sp.coo_matrix((X.data[train_ind],
                              (X.row[train_ind], X.col[train_ind])),
                             shape=shape)
        X_te = sp.coo_matrix((X.data[test_ind],
                              (X.row[test_ind], X.col[test_ind])),
                             shape=shape)
        yield X_tr, X_te

def generate_hamiltonian(m, basis):
    """
    Generates the (sparse) Hamiltonian

    Parameters
    ----------
    basis : Basis object
        Full basis for this specific number sector.
    """
    nbas = basis.len
    Us = m.Us
    if m.W is not None:
        Us = m.W + np.diag(m.Us)
    HDi = np.arange(nbas)
    HD = NumberSector.onsite_hamiltonian(m.Es, basis.vs) \
        + NumberSector.interaction_hamiltonian(Us, basis.vs)
    Hki, Hkj, Hkv = NumberSector.hopping_hamiltonian(basis, m.hopping, basis.vs)
    return sparse.coo_matrix((Hkv, (Hki, Hkj)), shape=(nbas, nbas)).tocsr() \
        + sparse.coo_matrix((HD, (HDi, HDi)), shape=(nbas, nbas)).tocsr()

def creation_operator(i, basis0, basis1):
    """
    Create a boson on site <i>

    Parameters
    ----------
    i : int
        Site index
    basis0 : list
        Initial basis
    basis1 : list
        Final basis
    """
    index0 = np.arange(basis0.len)
    mbasis = np.copy(basis0.vs)
    mbasis[:, i] += 1
    index1 = basis1.index(mbasis)
    return sparse.coo_matrix((np.sqrt(mbasis[:, i]), (index1, index0)),
                             shape=[basis1.len, basis0.len]).tocsr()

def reblocks_matrix(self, sparse_matrix):
    # Gets all non-zero coordinates and makes sure that they are considered
    froms = sparse_matrix.row
    tos = sparse_matrix.col
    data = sparse_matrix.data
    all_indices = np.hstack((froms, tos))
    indices = np.unique(all_indices)
    compact_shape = indices.shape[0]
    # Builds the hash mapping original indices to compact ones
    matrix_hash = {}
    titles = []
    for i in range(compact_shape):
        matrix_hash[indices[i]] = i
        froms[froms == indices[i]] = matrix_hash[indices[i]]
        tos[tos == indices[i]] = matrix_hash[indices[i]]
        titles.append(indices[i])
    matrix = coo_matrix((data, (froms, tos)),
                        shape=(compact_shape, compact_shape)).toarray().astype(np.float64)
    return matrix, matrix_hash, titles

def drop_tolerance(A, t):
    """
    Drops entries of `A` having absolute value lower than `t`.

    Args:
        A (coo_matrix): Given coo matrix.
        t (float): Tolerance threshold.

    Returns:
        A coo matrix.
    """
    A = A.tocoo()
    row = []
    col = []
    data = []
    for idx, (i, j) in enumerate(zip(A.row, A.col)):
        value = A.data[idx]
        if -t < value < t:  # drop entries with |value| < t; the original
            continue        # condition `value < t or value > -t` dropped everything
        row.append(i)
        col.append(j)
        data.append(value)
    A = coo_matrix((data, (row, col)), shape=A.shape, dtype=A.dtype)
    del row, col, data
    return A

def degree_reverse_rank_perm(A, reverse=False):
    """
    Computes a permutation that sorts nodes by degree.

    Args:
        A (coo_matrix): Given coo matrix.
        reverse (bool): If True, sorts in descending order.

    Returns:
        A permutation of node indices, where `(i -> j)` is denoted
        as `perm[i] = j`.
    """
    n, _ = A.shape
    degree = {i: 0 for i in range(n)}
    for i, j in zip(A.row, A.col):
        degree[j] += 1
    bottoms = sorted(degree, key=degree.get, reverse=reverse)
    perm = [0 for _ in range(n)]
    for i in range(n):
        perm[bottoms[i]] = i
    return perm

def reorder_matrix(A, perm, fix_row=False, fix_col=False):
    """
    Reorders a given coo matrix with a given permutation.
    You can fix either the rows or the columns.

    Args:
        A (coo_matrix): Given coo matrix.
        perm (list): List of node indices denoting the permutation.
        fix_row (bool): If True, reorders columns only.
        fix_col (bool): If True, reorders rows only.

    Returns:
        A coo matrix.
    """
    A = A.tocoo()
    if not fix_row:
        row = [perm[i] for i in A.row]
    else:
        row = A.row
    if not fix_col:
        col = [perm[j] for j in A.col]
    else:
        col = A.col
    A = coo_matrix((A.data, (row, col)), shape=A.shape, dtype=A.dtype)
    del row, col
    return A

def matrix_to_row_col_data(X):
    """Convert sparse affinity matrix to arrays.

    .. note:: Deprecated.
        It will be removed in icing 0.2. This is now done by check_array.
    """
    # convert to coo format (from lil, csr, csc)
    if isinstance(X, coo_matrix):
        X_coo = X
    elif isinstance(X, csr_matrix) or isinstance(X, lil_matrix):
        X_coo = X.tocoo()
    else:  # others, like a numpy matrix, can be converted to a coo matrix
        X_coo = coo_matrix(X)
    # Upcast matrix to a floating point format (if necessary)
    X_coo = X_coo.asfptype()
    # the original used np.int, which was removed from recent numpy releases
    return X_coo.row.astype(int), X_coo.col.astype(int), X_coo.data

def hess(self, *args, **kwargs):
    """Evaluate Lagrangian Hessian at (x, z)."""
    l_vals, l_rows, l_cols = super(SciPyAmplModel, self).hess(*args, **kwargs)

    # AMPL only returns the lower triangular part of the Hessian and
    # `scipy.coo_matrix` doesn't have a `symmetric` attribute, so we
    # need to copy the upper part of the matrix
    diag_idx = np.where(l_rows == l_cols)

    # the strict upper triangle of H is obtained by switching the row and
    # col indices and removing the values on the diagonal.
    u_rows = np.delete(l_cols, diag_idx)  # creates a copy
    u_cols = np.delete(l_rows, diag_idx)
    u_vals = np.delete(l_vals, diag_idx)

    H = sp.coo_matrix((np.concatenate((l_vals, u_vals)),
                       (np.concatenate((l_rows, u_rows)),
                        np.concatenate((l_cols, u_cols)))),
                      shape=(self.nvar, self.nvar))
    return H

def hess(self, *args, **kwargs):
    """Evaluate Lagrangian Hessian at (x, z)."""
    u_vals, u_rows, u_cols = super(SciPyAdolcModel, self).hess(*args, **kwargs)

    # ADOL-C only returns the upper triangular part of the Hessian and
    # `scipy.coo_matrix` doesn't have a `symmetric` attribute, so we
    # need to copy the upper part of the matrix into the lower triangle
    diag_idx = np.where(u_rows == u_cols)

    l_rows = np.delete(u_cols, diag_idx)  # creates a copy
    l_cols = np.delete(u_rows, diag_idx)
    l_vals = np.delete(u_vals, diag_idx)

    H = sp.coo_matrix((np.concatenate((l_vals, u_vals)),
                       (np.concatenate((l_rows, u_rows)),
                        np.concatenate((l_cols, u_cols)))),
                      shape=(self.nvar, self.nvar))
    return H

def construct_sparse_feature_matrix(dat, fea_map):
    num_row = 0
    row = []
    col = []
    val = []
    # keep the same order in iterating through the data set: 0, ..., N-1
    for doc_id, fea_dic in sorted(dat.items(), key=lambda x: x[0]):
        for f, v in fea_dic.items():
            row.append(num_row)
            col.append(fea_map[f])
            val.append(v)
        num_row += 1
    spmat = sp.coo_matrix((val, (row, col)), shape=(num_row, len(fea_map))).tocsr()
    return spmat

# construct K x N mask matrix: K = number of labeled features, N = number of data points
# ** we can skip the labeled documents, if any. needs further experiments **

def construct_feature_document_indicators(dat, labeled_features):
    if len(labeled_features) == 0:
        return sp.coo_matrix(([1.], ([0], [0])), shape=(1, len(dat))).tocsr()
    num_col = 0
    row = []
    col = []
    val = []
    for doc_id, fea_dic in sorted(dat.items(), key=lambda x: x[0]):
        num_row = 0
        for fea, label_dist in sorted(labeled_features.items(), key=lambda x: x[0]):
            if fea in fea_dic:
                row.append(num_row)
                col.append(num_col)
                val.append(1.)
            num_row += 1
        num_col += 1
    spmat = sp.coo_matrix((val, (row, col)),
                          shape=(len(labeled_features), len(dat))).tocsr()
    return spmat

# construct feature expected distribution, a K x C matrix:
# K = number of labeled features, C = number of classes

def construct_label_document_indicators(dat, labeled_instances):
    if len(labeled_instances) == 0:
        return sp.coo_matrix(([1.], ([0], [0])), shape=(1, len(dat))).tocsr()
    num_row = 0
    num_col = 0
    row = []
    col = []
    val = []
    for doc_id, fea_dic in sorted(dat.items(), key=lambda x: x[0]):
        if doc_id in labeled_instances:
            row.append(num_row)
            col.append(num_col)
            val.append(1.)
            num_row += 1
        num_col += 1
    spmat = sp.coo_matrix((val, (row, col)),
                          shape=(len(labeled_instances), len(dat))).tocsr()
    return spmat

# construct L x C matrix: L = number of labeled documents, C = number of classes

def load_recmat(self):
    assert self.complete()
    with self.output().open() as f:
        data = json.load(f)
    print('Data loaded')
    subreddit_remapping = {sid: ix for ix, sid in
                           enumerate(tqdm(data['subreddit_ids'].values(), desc='remapping sub'))}
    author_remapping = {aid: ix for ix, aid in
                        enumerate(tqdm(data['author_ids'].values(), desc='remapping author'))}
    i = np.asarray([author_remapping[auth] for sub, auth in data['edges']])
    j = np.asarray([subreddit_remapping[sub] for sub, auth in data['edges']])
    entries = np.ones(len(data['edges']))
    recmat = sp.coo_matrix((entries, (i, j)),
                           shape=[len(author_remapping), len(subreddit_remapping)])
    sub_mapping = {sub: subreddit_remapping[ix] for sub, ix in data['subreddit_ids'].items()}
    auth_mapping = {auth: author_remapping[ix] for auth, ix in data['author_ids'].items()}
    return sp.csr_matrix(recmat), sub_mapping, auth_mapping

def contruct_ellipse_parallel(pars):
    Coor, cm, A_i, Vr, dims, dist, max_size, min_size, d = pars
    dist_cm = coo_matrix(np.hstack([Coor[c].reshape(-1, 1) - cm[k]
                                    for k, c in enumerate(['x', 'y', 'z'][:len(dims)])]))
    Vr.append(dist_cm.T * spdiags(A_i.toarray().squeeze(), 0, d, d)
              * dist_cm / A_i.sum(axis=0))

    if np.sum(np.isnan(Vr)) > 0:
        raise Exception('You cannot pass empty (all zeros) components!')

    D, V = eig(Vr[-1])
    dkk = [np.min((max_size**2, np.max((min_size**2, dd.real)))) for dd in D]

    # search indexes for each component
    return np.sqrt(np.sum([(dist_cm * V[:, k])**2 / dkk[k]
                           for k in range(len(dkk))], 0)) <= dist

# %% threshold_components

def transform(self, X):
    """Encode categorical columns into a sparse matrix with one-hot encoding.

    Args:
        X (numpy.array): categorical columns to encode

    Returns:
        X_new (scipy.sparse.coo_matrix): sparse matrix encoding categorical
            variables into dummy variables
    """
    for col in range(X.shape[1]):
        X_col = self._transform_col(X[:, col], col)
        if X_col is not None:
            if col == 0:
                X_new = X_col
            else:
                X_new = sparse.hstack((X_new, X_col))
        logging.debug('{} --> {} features'.format(
            col, self.label_encoder.label_maxes[col]))
    return X_new

def Zpad(self, M, N, mode='center', dtype=np.dtype('complex64'), **kwargs):
    slc = []
    if mode == 'center':
        for m, n in zip(M, N):
            slc += [slice(m // 2 + int(np.ceil(-n / 2)),
                          m // 2 + int(np.ceil(n / 2))), ]
    elif mode == 'edge':
        for m, n in zip(M, N):
            slc.append(slice(n))

    x = np.arange(np.prod(M), dtype=int).reshape(M, order='F')
    rows = x[tuple(slc)].flatten(order='F')  # indexing with a list of slices is deprecated
    cols = np.arange(rows.size)
    ones = np.ones_like(cols)
    shape = np.prod(M), np.prod(N)
    M = spp.coo_matrix((ones, (rows, cols)), shape=shape, dtype=dtype)
    return self.SpMatrix(M, **kwargs)

def interp_mat(m, N, width, table, coord, backend):
    ndim = coord.shape[0]

    if ndim == 1:
        _interp_mat = _interp1_mat
    elif ndim == 2:
        _interp_mat = _interp2_mat
    elif ndim == 3:
        _interp_mat = _interp3_mat
    else:
        # the original passed ndim as a second argument to ValueError,
        # which never formats the %r placeholder
        raise ValueError('Number of dimensions can only be 1, 2 or 3, got %r' % ndim)

    row, col, ker = _interp_mat(m, N, width, table, coord)

    return sparse.coo_matrix((ker, (row, col)),
                             shape=(m, np.prod(N, dtype=int)))

def test_closureap():
    """ Correctness of all-pairs parallel closure. """
    np.random.seed(100)
    dt = DirTree('test', (2, 5, 10), root='test_parallel')
    N = 100
    thresh = 0.1
    A = sp.rand(N, N, thresh, 'csr')
    nnz = A.getnnz()
    sparsity = float(nnz) / N ** 2
    print('Number of nnz = {}, sparsity = {:g}'.format(nnz, sparsity))
    A = np.asarray(A.todense())
    clo.closureap(A, dt)
    coords = np.asarray(fromdirtree(dt, N), dtype=coo_dtype)
    coo = (coords['weight'], (coords['row'], coords['col']))
    B = np.asarray(sp.coo_matrix(coo, shape=(N, N)).todense())
    rows = []
    for row in range(N):  # xrange in the Python 2 original
        r, _ = clo.cclosuress(A, row)
        rows.append(r)
    C = np.asarray(rows)
    assert np.allclose(B, C)
    # cleanup
    for logpath in glob('closure-*.log'):
        os.remove(logpath)

def multi_hot_encode(x: Sequence[str], prefix: str) -> (coo_matrix, Dict[str, int]):
    """
    Return a sparse matrix encoding the categorical variables in x and a
    dictionary mapping column numbers back to categorical variables.

    Each record in x must be a single string with the categorical variables
    separated by a comma. The prefix is prepended to each categorical
    variable's name to prevent collisions.
    """
    data = []
    i = []
    j = []
    col = count()
    dummy_col = defaultdict(lambda: next(col))
    for row, cat_vars in enumerate(x):
        for cat_var in cat_vars.split(','):
            prepended = f'{prefix}_{cat_var}'
            data.append(1)
            i.append(row)
            j.append(dummy_col[prepended])
    return coo_matrix((data, (i, j))), {v: k for k, v in dummy_col.items()}

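A toy call showing the encoding above, assuming the imports the function needs (count from itertools, defaultdict from collections, coo_matrix from scipy.sparse, and the typing names); the category strings are illustrative only:

mat, col_names = multi_hot_encode(['red,blue', 'blue,green'], prefix='color')
print(mat.toarray())  # [[1, 1, 0], [0, 1, 1]]
print(col_names)      # {0: 'color_red', 1: 'color_blue', 2: 'color_green'}
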
def get_lshash(text: coo_matrix) -> List[str]:
    """ Return a list of cosine LSHs encoding text. """
    def cosine_LSH(vector, planes):
        """
        Return a single cosine LSH for a particular record and given planes.
        """
        sig = 0
        for plane in planes:
            sig <<= 1
            if vector.dot(plane) >= 0:
                sig |= 1
        return str(sig)

    bits = 512
    random_projections = np.random.randn(bits, text.shape[1])
    hashes = [cosine_LSH(text.getrow(idx), random_projections)
              for idx in range(text.shape[0])]
    return hashes

def load_data(path="data/cora/", dataset="cora"):
    """Load citation network dataset (cora only for now)"""
    print('Loading {} dataset...'.format(dataset))

    idx_features_labels = np.genfromtxt("{}{}.content".format(path, dataset),
                                        dtype=np.dtype(str))
    features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
    labels = encode_onehot(idx_features_labels[:, -1])

    # build graph
    idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
    idx_map = {j: i for i, j in enumerate(idx)}
    edges_unordered = np.genfromtxt("{}{}.cites".format(path, dataset),
                                    dtype=np.int32)
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                     dtype=np.int32).reshape(edges_unordered.shape)
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]),
                        dtype=np.float32)

    # build symmetric adjacency matrix
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    print('Dataset has {} nodes, {} edges, {} features.'.format(
        adj.shape[0], edges.shape[0], features.shape[1]))

    return features.todense(), adj, labels

def label2mnt(mnt_s_out, mnt_w_out, mnt_h_out, mnt_o_out, thresh=0.5):
    mnt_s_out = np.squeeze(mnt_s_out)
    mnt_w_out = np.squeeze(mnt_w_out)
    mnt_h_out = np.squeeze(mnt_h_out)
    mnt_o_out = np.squeeze(mnt_o_out)
    assert len(mnt_s_out.shape) == 2 and len(mnt_w_out.shape) == 3 \
        and len(mnt_h_out.shape) == 3 and len(mnt_o_out.shape) == 3
    # get cls results
    mnt_sparse = sparse.coo_matrix(mnt_s_out > thresh)
    # zip() returns an iterator in Python 3; materialize it before np.array
    mnt_list = np.array(list(zip(mnt_sparse.row, mnt_sparse.col)), dtype=np.int32)
    if mnt_list.shape[0] == 0:
        return np.zeros((0, 4))
    # get regression results
    mnt_w_out = np.argmax(mnt_w_out, axis=-1)
    mnt_h_out = np.argmax(mnt_h_out, axis=-1)
    mnt_o_out = np.argmax(mnt_o_out, axis=-1)  # TODO: use ori_highest_peak (np version)
    # get final mnt
    mnt_final = np.zeros((len(mnt_list), 4))
    mnt_final[:, 0] = mnt_sparse.col * 8 + mnt_w_out[mnt_list[:, 0], mnt_list[:, 1]]
    mnt_final[:, 1] = mnt_sparse.row * 8 + mnt_h_out[mnt_list[:, 0], mnt_list[:, 1]]
    mnt_final[:, 2] = (mnt_o_out[mnt_list[:, 0], mnt_list[:, 1]] * 2 - 89.) / 180 * np.pi
    mnt_final[mnt_final[:, 2] < 0.0, 2] = mnt_final[mnt_final[:, 2] < 0.0, 2] + 2 * np.pi
    mnt_final[:, 3] = mnt_s_out[mnt_list[:, 0], mnt_list[:, 1]]
    return mnt_final

def _build(self, num_rows, num_cols):
    """
    Allocate the matrix.

    Parameters
    ----------
    num_rows : int
        number of rows in the matrix.
    num_cols : int
        number of cols in the matrix.
    """
    data, rows, cols = self._build_sparse(num_rows, num_cols)
    metadata = self._metadata
    for key, (ind1, ind2, idxs, jac_type, factor) in iteritems(metadata):
        if idxs is None:
            metadata[key] = (slice(ind1, ind2), jac_type, factor)
        else:
            # store reverse indices to avoid copying subjac data during
            # update_submat.
            metadata[key] = (np.argsort(idxs) + ind1, jac_type, factor)
    self._matrix = coo_matrix((data, (rows, cols)), shape=(num_rows, num_cols))

def MasseyVector(self):
    """This function produces X'Wy."""
    idx = self.itemlist
    table = self.table
    pair = self.pair
    j = np.ravel(pair)
    i = np.repeat(np.arange(table.shape[0], dtype=np.int32), 2, axis=0)
    data = np.array([[1, -1]], dtype=np.float32)
    data = np.ravel(np.repeat(data, table.shape[0], axis=0))
    X = coo_matrix((data, (i, j)), shape=(table.shape[0], len(idx)))
    X = X.tocsr()
    W = np.require(table.iloc[:, 4].values, np.float32)
    y = table.iloc[:, 2].values - table.iloc[:, 3].values
    Wy = np.multiply(W, y)
    return X.T * Wy

def __getitem__(self, thing: Any) -> sparse.coo_matrix:
    if type(thing) is slice or type(thing) is np.ndarray or type(thing) is int:
        gm = GraphManager(None, axis=self.axis)
        for key, g in self.items():
            # Slice the graph matrix properly without making it dense
            (a, b, w) = (g.row, g.col, g.data)
            indices = np.arange(g.shape[0])[thing]
            mask = np.logical_and(np.in1d(a, indices), np.in1d(b, indices))
            a = a[mask]
            b = b[mask]
            w = w[mask]
            d = dict(zip(np.sort(indices), np.arange(indices.shape[0])))
            a = np.array([d[x] for x in a])
            b = np.array([d[x] for x in b])
            gm[key] = sparse.coo_matrix((w, (a, b)),
                                        shape=(len(indices), len(indices)))
        return gm
    else:
        return self.__getattr__(thing)

def __init__(self, *args):
    '''
    Initialization of the perceptron with given sizes.
    '''
    self.shape = args
    n = len(args)
    self.bp_times = []

    # Build layers
    self.layers = []
    # Input layer (+1 unit for bias)
    self.layers.append(sci_sp.csc_matrix(np.ones(self.shape[0] + 1)))
    # Hidden layer(s) + output layer
    for i in range(1, n):
        self.layers.append(sci_sp.csc_matrix(np.ones(self.shape[i])))

    # Build weights matrix (randomly)
    self.weights = []
    self.sparsifiers = []
    self.has_sparsified = False
    for i in range(n - 1):
        new_weights = (2 * (npr.random((self.layers[i].size,
                                        self.layers[i + 1].size))) - 1) * 0.00001
        self.weights.append(sci_sp.csc_matrix(new_weights))
        self.sparsifiers.append(sci_sp.coo_matrix(np.ones_like(new_weights)))

def sparsify(self):
    self.has_sparsified = True
    for i in range(len(self.weights) - 1):  # not the softmax layer
        # the 50th percentile of existing weights above 0
        # (left as np.percentile, not median, for experimentation)
        thresh = np.percentile(
            np.abs(self.weights[i].toarray()[np.abs(self.weights[i].toarray()) > 0]),
            50
        )
        # add to the sparsifier; killing based on the sparsifier happens
        # in the actual backprop
        new_sparsifier = self.sparsifiers[i].toarray()
        self.sparsifiers[i] = sci_sp.coo_matrix(
            np.logical_and(np.abs(self.weights[i].toarray()) > thresh, new_sparsifier))
        self.weights[i][np.abs(self.weights[i].toarray()) < thresh] = 0
        self.weights[i].eliminate_zeros()

def merge_nearest_neighbors(filenames, total_rows):
    """ Merge nearest neighbor adjacency matrix HDF files.

    Returns:
        A sparse adjacency matrix
    """
    nn = sp_sparse.coo_matrix((total_rows, total_rows))
    for filename in filenames:
        h5 = h5py.File(filename, 'r')
        nn += sp_sparse.coo_matrix((np.ones(len(h5['i'])),
                                    (h5['i'][:], h5['j'][:])),
                                   shape=nn.shape)
        h5.close()
    return nn

def tocoo(self):
    if type(self.m) is not sp_sparse.coo_matrix:
        self.m = self.m.tocoo()

def save_pkl_files(dsm_prefix, dsm, save_in_one_file=False):
    """
    Save the space to separate pkl files.
    :param dsm_prefix:
    :param dsm:
    """
    # Save in a single file (for small spaces)
    if save_in_one_file:
        io_utils.save(dsm, dsm_prefix + '.pkl')
    # Save in multiple files: npz for the matrix and pkl for the other data members of Space
    else:
        mat = coo_matrix(dsm.cooccurrence_matrix.get_mat())
        np.savez_compressed(dsm_prefix + 'cooc.npz',
                            data=mat.data, row=mat.row, col=mat.col, shape=mat.shape)
        with open(dsm_prefix + '_row2id.pkl', 'wb') as f_out:
            pickle.dump(dsm._row2id, f_out, 2)
        with open(dsm_prefix + '_id2row.pkl', 'wb') as f_out:
            pickle.dump(dsm._id2row, f_out, 2)
        with open(dsm_prefix + '_column2id.pkl', 'wb') as f_out:
            pickle.dump(dsm._column2id, f_out, 2)
        with open(dsm_prefix + '_id2column.pkl', 'wb') as f_out:
            pickle.dump(dsm._id2column, f_out, 2)

def load_pkl_files(dsm_prefix):
    """
    Load the space from either a single pkl file or numerous files.
    :param dsm_prefix:
    """
    # Check whether there is a single pickle file for the Space object
    if os.path.isfile(dsm_prefix + '.pkl'):
        return io_utils.load(dsm_prefix + '.pkl')
    # Load the multiple files: npz for the matrix and pkl for the other data members of Space
    with np.load(dsm_prefix + 'cooc.npz') as loader:
        coo = coo_matrix((loader['data'], (loader['row'], loader['col'])),
                         shape=loader['shape'])
    cooccurrence_matrix = SparseMatrix(csr_matrix(coo))
    with open(dsm_prefix + '_row2id.pkl', 'rb') as f_in:
        row2id = pickle.load(f_in)
    with open(dsm_prefix + '_id2row.pkl', 'rb') as f_in:
        id2row = pickle.load(f_in)
    with open(dsm_prefix + '_column2id.pkl', 'rb') as f_in:
        column2id = pickle.load(f_in)
    with open(dsm_prefix + '_id2column.pkl', 'rb') as f_in:
        id2column = pickle.load(f_in)
    return Space(cooccurrence_matrix, id2row, id2column,
                 row2id=row2id, column2id=column2id)