The following 50 code examples, extracted from Python open source projects, illustrate how to use scipy.sparse.csr_matrix().
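Before the project examples, here is a minimal sketch (not taken from any of the projects below; the values are illustrative only) of the two constructor signatures that recur throughout: the coordinate form (data, (row, col)) and the native CSR form (data, indices, indptr), the latter being what several examples below reload from .npz files (see dataset_from_file and load_sparse_csr).

import numpy as np
from scipy.sparse import csr_matrix

# coordinate (COO-style) constructor: value val[k] is placed at (row[k], col[k])
row = np.array([0, 0, 1, 2])
col = np.array([0, 2, 2, 1])
val = np.array([4.0, 1.0, 3.0, 5.0])
m1 = csr_matrix((val, (row, col)), shape=(3, 3))

# native CSR constructor from (data, indices, indptr), the form saved with
# np.savez and reloaded in several examples below
m2 = csr_matrix((m1.data, m1.indices, m1.indptr), shape=m1.shape)
assert (m1 != m2).nnz == 0  # the two constructions give the same matrix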
def dataset_from_file(filename):
    """Load a dataset from file.

    Args:
        filename (string): the name of the file from which to extract the dataset

    Returns:
        tuple: the dataset (np.ndarray) and the ngrams (list of strings)
    """
    loader = np.load(filename)
    num_entries = loader['num_entries'][0]
    sp_dataset = sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                                   shape=loader['shape'])
    dataset = sp_dataset.toarray()
    samp_entries, num_features = dataset.shape
    return dataset.reshape(int(samp_entries / num_entries), num_entries, num_features), loader['ngrams']
def construct_csr_matrix_from_data_and_nodes(f, nodes, blacklisted_nodes, remove_diag=True):
    print "GenomeDISCO | "+strftime("%c")+" | processing: Loading interaction data from "+f

    total_nodes = len(nodes.keys())
    i = []
    j = []
    v = []
    #print strftime("%c")
    c = 0
    for line in gzip.open(f):
        items = line.strip().split('\t')
        n1, n2, val = nodes[items[0]]['idx'], nodes[items[1]]['idx'], float(items[2])
        i.append(n1)
        j.append(n2)
        v.append(val)
        c += 1
    csr_m = csr_matrix((v, (i, j)), shape=(total_nodes, total_nodes), dtype=float)
    if remove_diag:
        csr_m.setdiag(0)
    return filter_nodes(csr_m, blacklisted_nodes)
def to_transition(m):
    mup = m
    mdown = mup.transpose()
    mdown.setdiag(0)
    mtogether = mup + mdown
    sums = mtogether.sum(axis=1)
    nonzeros = mtogether.nonzero()
    num_elts = len(nonzeros[0])
    rows = []
    cols = []
    m_norm_data = []
    for elt in range(num_elts):
        i = nonzeros[0][elt]
        j = nonzeros[1][elt]
        rows.append(i)
        cols.append(j)
        if sums[i, 0] > 0:
            m_norm_data.append(float(mtogether[i, j]) / (float(sums[i, 0])))
        else:
            m_norm_data.append(0)
    return csr_matrix((m_norm_data, (rows, cols)), shape=mtogether.get_shape(), dtype=float)
def _compute_features(self, model):
    bls = [b[0] for b in tuple(model.repr_model())]
    nfeats_other = 1
    nfeats_ngrams = len(self.module_ngram_to_id)
    nfeats = nfeats_other + nfeats_ngrams
    feats = sp.dok_matrix((1, nfeats), dtype=np.float32)

    # other features
    feats[0, 0] = len(bls)

    # ngrams features
    for k in xrange(1, self.ngram_maxlen):
        for i in xrange(len(bls) - k):
            ngram = tuple(bls[i:i + k])
            if ngram in self.module_ngram_to_id:
                ngram_i = self.module_ngram_to_id[ngram]
                feats_i = nfeats_other + ngram_i
                feats[0, feats_i] += 1.0

    return sp.csr_matrix(feats)
def transform(self, df, **kwargs):
    """
    Takes a dataframe that has :code:`link_id`, :code:`item_id` and :code:`score` columns.
    Returns a SciPy :code:`csr_matrix`.

    :param df: The DataFrame to make a sparse matrix from.
               Must have :code:`link_id`, :code:`item_id`, and :code:`score` columns.
    :type df: pandas.DataFrame
    :rtype: scipy.sparse.csr_matrix
    """
    link_u = list(df.link_id.unique())
    item_u = list(df.item_id.unique())
    data = df.score.as_matrix()
    row = df.link_id.astype('category', categories=link_u).cat.codes
    col = df.item_id.astype('category', categories=item_u).cat.codes
    outshape = (len(link_u), len(item_u))
    in_tuple = (data, (row, col))
    kwargs = self.merge_kwargs(dict(links=link_u, items=item_u), kwargs)
    return csr_matrix(in_tuple, shape=outshape), kwargs
def calc_pmi(counts, cds):
    """
    Calculates e^PMI; PMI without the log().
    """
    sum_w = np.array(counts.sum(axis=1))[:, 0]
    sum_c = np.array(counts.sum(axis=0))[0, :]
    if cds != 1:
        sum_c = sum_c ** cds
    sum_total = sum_c.sum()
    sum_w = np.reciprocal(sum_w)
    sum_c = np.reciprocal(sum_c)

    pmi = csr_matrix(counts)
    pmi = multiply_by_rows(pmi, sum_w)
    pmi = multiply_by_columns(pmi, sum_c)
    pmi = pmi * sum_total
    return pmi
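The helpers multiply_by_rows and multiply_by_columns used by calc_pmi are not part of this excerpt. A plausible sketch, assuming they simply scale the rows and columns of a sparse matrix by the given factors via sparse diagonal matrices (hypothetical, not the project's actual code):

from scipy.sparse import csr_matrix, diags

def multiply_by_rows(matrix, row_factors):
    # assumption: scale row i of `matrix` by row_factors[i]
    return csr_matrix(diags(row_factors).dot(matrix))

def multiply_by_columns(matrix, col_factors):
    # assumption: scale column j of `matrix` by col_factors[j]
    return csr_matrix(matrix.dot(diags(col_factors)))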
def write_data(self, result_dict):
    for key, result in six.iteritems(result_dict):
        if ss.isspmatrix(result):
            if np.isnan(result.data).any():
                raise ValueError("data {} have nan".format(key))
        elif np.isnan(result).any():
            raise ValueError("data {} have nan".format(key))
        with SimpleTimer("Writing generated data {} to hdf5 file".format(key),
                         end_in_new_line=False):
            if key in self.h5f:
                # self.h5f[key][...] = result
                raise NotImplementedError("Overwriting not supported.")
            else:
                if (isinstance(result, ss.csc_matrix)
                        or isinstance(result, ss.csr_matrix)):
                    # sparse matrix
                    h5sparse.Group(self.h5f).create_dataset(key, data=result)
                else:
                    self.h5f.create_dataset(key, data=result)
        self.h5f.flush()
def __deflationMatrix(self, theSet, theHang, theIndex, withHanging=True, asOnes=False):
    reducedInd = dict()  # final reduced index
    ii = 0
    I, J, V = [], [], []
    for fx in sorted(theSet):
        if theIndex[fx] not in theHang:
            reducedInd[theIndex[fx]] = ii
            I += [theIndex[fx]]
            J += [ii]
            V += [1.0]
            ii += 1
    if withHanging:
        for hfkey in theHang.keys():
            hf = theHang[hfkey]
            I += [hfkey]*len(hf)
            J += [reducedInd[_[0]] for _ in hf]
            if asOnes:
                V += [1.0]*len(hf)
            else:
                V += [_[1] for _ in hf]
    return sp.csr_matrix((V, (I, J)), shape=(len(theSet), len(reducedInd)))
def getInterpolationMat(self, loc, locType='CC', zerosOutside=False):
    """ Produces interpolation matrix

    :param numpy.ndarray loc: Location of points to interpolate to
    :param str locType: What to interpolate (see below)
    :rtype: scipy.sparse.csr_matrix
    :return: M, the interpolation matrix

    locType can be::

        'Ex'    -> x-component of field defined on edges
        'Ey'    -> y-component of field defined on edges
        'Ez'    -> z-component of field defined on edges
        'Fx'    -> x-component of field defined on faces
        'Fy'    -> y-component of field defined on faces
        'Fz'    -> z-component of field defined on faces
        'N'     -> scalar field defined on nodes
        'CC'    -> scalar field defined on cell centers
        'CCVx'  -> x-component of vector field defined on cell centers
        'CCVy'  -> y-component of vector field defined on cell centers
        'CCVz'  -> z-component of vector field defined on cell centers
    """
    return self._getInterpolationMat(loc, locType, zerosOutside)
def _getInnerProductDeriv(self, prop, projType, doFast=True, invProp=False, invMat=False):
    """
    :param numpy.array prop: material property (tensor properties are possible) at each cell center (nC, (1, 3, or 6))
    :param str projType: 'F' for faces 'E' for edges
    :param bool doFast: do a faster implementation if available.
    :param bool invProp: inverts the material property
    :param bool invMat: inverts the matrix
    :rtype: scipy.sparse.csr_matrix
    :return: dMdm, the derivative of the inner product matrix (nE, nC*nA)
    """
    fast = None
    if hasattr(self, '_fastInnerProductDeriv') and doFast:
        fast = self._fastInnerProductDeriv(projType, prop, invProp=invProp, invMat=invMat)
    if fast is not None:
        return fast

    if invProp or invMat:
        raise NotImplementedError('inverting the property or the matrix is not yet implemented for this mesh/tensorType. You should write it!')

    tensorType = TensorType(self, prop)
    P = self._getInnerProductProjectionMatrices(projType, tensorType=tensorType)

    def innerProductDeriv(v):
        return self._getInnerProductDerivFunction(tensorType, P, projType, v)
    return innerProductDeriv
def test_FaceInnerProductAnisotropicDerivInvProp(self):
    def fun(x):
        x = np.repeat(np.atleast_2d(x), 3, axis=0).T
        x0 = np.repeat(self.x0, 3, axis=0).T

        zero = sp.csr_matrix((self.mesh.nC, self.mesh.nC))
        eye = sp.eye(self.mesh.nC)
        P = sp.vstack([sp.hstack([eye, zero, eye])])

        MfSig = self.mesh.getFaceInnerProduct(x, invProp=True)
        MfSigDeriv = self.mesh.getFaceInnerProductDeriv(x0, invProp=True)
        return MfSig*self.face_vec, MfSigDeriv(self.face_vec) * P.T

    print('Testing FaceInnerProduct Anisotropic InvProp')
    return self.assertTrue(Tests.checkDerivative(fun, self.x0, num=7,
                                                 tolerance=TOLD, plotIt=False))
def test_FaceInnerProductAnisotropicDerivInvMat(self):
    def fun(x):
        x = np.repeat(np.atleast_2d(x), 3, axis=0).T
        x0 = np.repeat(self.x0, 3, axis=0).T

        zero = sp.csr_matrix((self.mesh.nC, self.mesh.nC))
        eye = sp.eye(self.mesh.nC)
        P = sp.vstack([sp.hstack([eye, zero, eye])])

        MfSig = self.mesh.getFaceInnerProduct(x, invMat=True)
        MfSigDeriv = self.mesh.getFaceInnerProductDeriv(x0, invMat=True)
        return MfSig*self.face_vec, MfSigDeriv(self.face_vec) * P.T

    print('Testing FaceInnerProduct Anisotropic InvMat')
    return self.assertTrue(Tests.checkDerivative(fun, self.x0, num=7,
                                                 tolerance=TOLD, plotIt=False))
def test_FaceInnerProductAnisotropicDerivInvPropInvMat(self):
    def fun(x):
        x = np.repeat(np.atleast_2d(x), 3, axis=0).T
        x0 = np.repeat(self.x0, 3, axis=0).T

        zero = sp.csr_matrix((self.mesh.nC, self.mesh.nC))
        eye = sp.eye(self.mesh.nC)
        P = sp.vstack([sp.hstack([eye, zero, eye])])

        MfSig = self.mesh.getFaceInnerProduct(x, invProp=True, invMat=True)
        MfSigDeriv = self.mesh.getFaceInnerProductDeriv(x0, invProp=True, invMat=True)
        return MfSig*self.face_vec, MfSigDeriv(self.face_vec) * P.T

    print('Testing FaceInnerProduct Anisotropic InvProp InvMat')
    return self.assertTrue(Tests.checkDerivative(fun, self.x0, num=7,
                                                 tolerance=TOLD, plotIt=False))
def test_EdgeInnerProductAnisotropicDeriv(self):
    def fun(x):
        x = np.repeat(np.atleast_2d(x), 3, axis=0).T
        x0 = np.repeat(self.x0, 3, axis=0).T

        zero = sp.csr_matrix((self.mesh.nC, self.mesh.nC))
        eye = sp.eye(self.mesh.nC)
        P = sp.vstack([sp.hstack([zero, eye, zero])])

        MeSig = self.mesh.getEdgeInnerProduct(x.reshape(self.mesh.nC, 3))
        MeSigDeriv = self.mesh.getEdgeInnerProductDeriv(x0)
        return MeSig*self.edge_vec, MeSigDeriv(self.edge_vec) * P.T

    print('Testing EdgeInnerProduct Anisotropic')
    return self.assertTrue(Tests.checkDerivative(fun, self.x0, num=7,
                                                 tolerance=TOLD, plotIt=False))
def test_EdgeInnerProductAnisotropicDerivInvProp(self):
    def fun(x):
        x = np.repeat(np.atleast_2d(x), 3, axis=0).T
        x0 = np.repeat(self.x0, 3, axis=0).T

        zero = sp.csr_matrix((self.mesh.nC, self.mesh.nC))
        eye = sp.eye(self.mesh.nC)
        P = sp.vstack([sp.hstack([zero, eye, zero])])

        MeSig = self.mesh.getEdgeInnerProduct(x, invProp=True)
        MeSigDeriv = self.mesh.getEdgeInnerProductDeriv(x0, invProp=True)
        return MeSig*self.edge_vec, MeSigDeriv(self.edge_vec) * P.T

    print('Testing EdgeInnerProduct Anisotropic InvProp')
    return self.assertTrue(Tests.checkDerivative(fun, self.x0, num=7,
                                                 tolerance=TOLD, plotIt=False))
def test_EdgeInnerProductAnisotropicDerivInvPropInvMat(self):
    def fun(x):
        x = np.repeat(np.atleast_2d(x), 3, axis=0).T
        x0 = np.repeat(self.x0, 3, axis=0).T

        zero = sp.csr_matrix((self.mesh.nC, self.mesh.nC))
        eye = sp.eye(self.mesh.nC)
        P = sp.vstack([sp.hstack([zero, eye, zero])])

        MeSig = self.mesh.getEdgeInnerProduct(x, invProp=True, invMat=True)
        MeSigDeriv = self.mesh.getEdgeInnerProductDeriv(x0, invProp=True, invMat=True)
        return MeSig*self.edge_vec, MeSigDeriv(self.edge_vec) * P.T

    print('Testing EdgeInnerProduct Anisotropic InvProp InvMat')
    return self.assertTrue(Tests.checkDerivative(fun, self.x0, num=7,
                                                 tolerance=TOLD, plotIt=False))
def zero_rows_norm_eff1(self, hypothesis, structur, k):
    # find zero sum rows in hypothesis
    #print 'sum hyp'
    norma = hypothesis.sum(axis=1)
    n_zeros = np.where(norma == 0)

    # norm the structure matrix
    i_index = list()
    j_index = list()
    values = list()
    for x, i in enumerate(n_zeros[0]):
        #if x % 1000 == 0:
        #    print x, len(n_zeros[0])
        links = np.where(structur[i, :] != 0)
        value = k / len(links[0])
        for j in links[0]:
            i_index.append(i)
            j_index.append(j)
            values.append(value)
    hypothesis = hypothesis + csr_matrix((values, (i_index, j_index)),
                                         shape=hypothesis.shape, dtype=np.float)
def assign(self):
    G, path_index = self.path_edge_matrix(self.paths)
    H = np.dot(G, G.T)
    perm_paths = self.seriation(H)

    Y = sparse.csr_matrix(self.Y, dtype=np.float32)
    D = np.dot(Y.transpose(), Y)
    perm_classes = self.seriation(D)

    classmap = {}
    pathmap = {}
    for i in xrange(len(perm_classes)):
        classmap[perm_classes[i]] = path_index[perm_paths[i]]
        pathmap[path_index[perm_paths[i]]] = perm_classes[i]
    return classmap, pathmap
def predict_topk(self, X, k=5):
    row = [[] for i in xrange(k)]
    col = [[] for i in xrange(k)]
    i = 0
    for x in self.iterate_dataset(X):
        edge_weight = self.evaluate_model(x, self.w)
        ranking = self.create_ranking(edge_weight, k)
        rank = 0
        for path in ranking:
            yhat = self.path_to_class_map[path[1]]
            col[rank].append(yhat)
            row[rank].append(i)
            rank += 1
        i += 1

    Yhats = []
    for i in xrange(k):
        row1 = np.array(row[i])
        col1 = np.array(col[i])
        data1 = np.array([1 for i in xrange(len(row[i]))])
        Yhat = sparse.csr_matrix((data1, (row1, col1)), shape=(X.shape[0], self.n_classes))
        Yhats.append(Yhat)
    return Yhats
def run(self):

    # open instances
    loader = numpy.load(self.in_vectors().path)
    instances = sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape'])

    # load feature names
    with open(self.in_featurenames().path, 'r', encoding='utf-8') as infile:
        fn = infile.read().strip().split('\n')

    # calculate feature correlations
    feature_correlation = vectorizer.calculate_feature_correlation(instances)

    # write to file
    with open(self.out_feature_correlation().path, 'w', encoding='utf-8') as out:
        # for fc in feature_correlation:
        #     print(fc,len(fc))
        #     out.write('\t'.join([str(fc[0]),str(fc[1]),fn[fc[0]],fn[fc[1]],str(fc[2]),str(fc[3]),str(fc[4])]) + '\n')
        out.write('\n'.join(['\t'.join([str(fc[0]),str(fc[1]),fn[fc[0]],fn[fc[1]],str(fc[2]),str(fc[3]),str(fc[4])]) for fc in feature_correlation]))
def run(self):

    # load vectors
    loader = numpy.load(self.in_vectors().path)
    vectors = sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape'])

    # load labels
    with open(self.in_labels().path, 'r', encoding='utf-8') as file_in:
        labels = file_in.read().strip().split('\n')

    # combine vectors and labels
    if vectors.shape[0] != len(labels):
        print('instances and labels do not align, exiting program...')
    instances_list = vectors.toarray().tolist()
    for i, label in enumerate(labels):
        instances_list[i].append(label)

    # write to file
    lw = linewriter.Linewriter(instances_list)
    lw.write_csv(self.out_instances().path)
def run(self):

    # read in vectors
    loader = numpy.load(self.in_vectors().path)
    instances = sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape'])
    num_dimensions = instances.shape[1]

    # generate vectorpopulation
    random_vectorpopulation = ga_functions.random_vectorpopulation(num_dimensions, self.population_size)
    numpy.savez(self.out_vectorpopulation().path, data=random_vectorpopulation.data, indices=random_vectorpopulation.indices, indptr=random_vectorpopulation.indptr, shape=random_vectorpopulation.shape)

    # read in parameter options
    with open(self.in_parameter_options().path) as infile:
        lines = infile.read().rstrip().split('\n')
        parameter_options = [[i for i in range(len(line.split()))] for line in lines]

    # generate parameterpopulation
    random_parameterpopulation = ga_functions.random_parameterpopulation(parameter_options, self.population_size)
    numpy.savez(self.out_parameterpopulation().path, data=random_parameterpopulation.data, indices=random_parameterpopulation.indices, indptr=random_parameterpopulation.indptr, shape=random_parameterpopulation.shape)


################################################################################
###GA Iterator
################################################################################
def matixToRowColDataArr(X):
    """
    Convert a sparse affinity/similarity matrix to numpy array format
    (row_array, col_array, data_array) so the cython update function can
    work efficiently on it.
    """
    # convert to coo format (from lil, csr, csc)
    if isinstance(X, coo_matrix):
        X_coo = X
    elif (isinstance(X, csr_matrix)) or (isinstance(X, lil_matrix)):
        X_coo = X.tocoo()
    else:  # others, like a numpy matrix, can be converted to a coo matrix
        X_coo = coo_matrix(X)
    # Upcast matrix to a floating point format (if necessary)
    X_coo = X_coo.asfptype()
    # get row_array, col_array, data_array in their correct data type (for cython to work)
    row_array, col_array, data_array = X_coo.row.astype(np.int), X_coo.col.astype(np.int), X_coo.data

    return row_array, col_array, data_array
def denseToSparseAbvCutoff(self, denseMatrix, cutoff):
    """
    Remove entries in denseMatrix that are below the cutoff and convert the
    remaining entries into a sparse matrix.

    Parameters:
    ----------------------
    denseMatrix: dense numpy matrix

    cutoff: int or float

    Returns
    ----------------------
    Scipy csr_matrix
    """
    maskArray = denseMatrix >= cutoff
    sparseMatrix = csr_matrix((np.asarray(denseMatrix[maskArray]).reshape(-1),
                               np.nonzero(maskArray)),
                              shape=denseMatrix.shape)
    return sparseMatrix
def denseToSparseTopPercentage(self, denseMatrix, percentage=10.0):
    """
    Keep the top percentage (e.g. 10%) of data points, remove all others,
    and convert the result into a sparse matrix.

    Parameters:
    ----------------------
    denseMatrix: dense numpy matrix

    percentage: float, default is 10.0
        percentage of top data points to keep; the default of 10.0% means
        that for 10000 data points the top 1000 are kept.

    Returns
    ----------------------
    Scipy csr_matrix
    """
    rowN, colN = denseMatrix.shape
    totalN = rowN * colN
    topN = min(int(totalN * (percentage / 100.0)), totalN)
    arr = np.array(denseMatrix.flatten())[0]
    cutoff = arr[arr.argsort()[-(topN)]]
    sparseMatrix = self.denseToSparseAbvCutoff(denseMatrix, cutoff)
    return sparseMatrix
def test_sparse_dot(self):
    x_d = np.array([0, 7, 2, 3], dtype=np.float32)
    x_r = np.array([0, 2, 2, 3], dtype=np.int64)
    x_c = np.array([4, 3, 2, 3], dtype=np.int64)

    x_sparse = sparse.csr_matrix((x_d, (x_r, x_c)), shape=(4, 5))
    x_dense = x_sparse.toarray()

    W = np.random.random((5, 4))

    backends = [KTF]
    if KTH.th_sparse_module:
        # Theano has some dependency issues for sparse
        backends.append(KTH)

    for K in backends:
        t_W = K.variable(W)
        k_s = K.eval(K.dot(K.variable(x_sparse), t_W))
        k_d = K.eval(K.dot(K.variable(x_dense), t_W))

        assert k_s.shape == k_d.shape
        assert_allclose(k_s, k_d, atol=1e-05)
def stream(self, fn, no_features=False):
    with open(fn, 'rt') as f:
        n_samples, n_feats, n_classes = list(map(int, f.readline().split()))
        for i, line in enumerate(f):
            if i == 0:
                continue
            if self.verbose and i % 10000 == 0:
                print("%s docs encoded" % i)
            res = self.quantize(line, no_features)
            if no_features:
                yield {"labels": res}, res
            else:
                (c, d), y = res
                yield {"labels": y}, sp.csr_matrix((d, ([0] * len(d), c)),
                                                   shape=(1, n_feats),
                                                   dtype='float32'), y
def predict(self, X, fmt='sparse'):
    assert fmt in ('sparse', 'dict')
    s = []
    num = X.shape[0] if isinstance(X, sp.csr_matrix) else len(X)
    for i in range(num):
        Xi = X[i]
        mean = self.predictor.predict(Xi.data, Xi.indices, self.blend,
                                      self.gamma, self.leaf_probs)
        if fmt == 'sparse':
            s.append(mean)
        else:
            od = OrderedDict()
            for idx in reversed(mean.data.argsort()):
                od[mean.indices[idx]] = mean.data[idx]
            s.append(od)

    if fmt == 'sparse':
        return sp.vstack(s)
    return s
def custom_block_diag(blocks):
    """
    Create a csr sparse block-diagonal matrix from identically-sized blocks.

    Blocks don't need to be identical, but they do need to be the same shape.
    """
    L = len(blocks)
    K = blocks[0].shape[0]
    _data = [x.flatten() for x in blocks]
    m = np.arange(_data[0].shape[0])
    flat_data = np.zeros(L * len(m))
    for n in range(L):
        flat_data[m + n * len(m)] = _data[n][m]
    # now make the block diagonal array
    i = np.repeat(np.arange(L * K), K)
    j = np.tile(np.tile(np.arange(K), K), L) + np.repeat(np.arange(0, L * K, K), K * K)
    return sparse.csr_matrix((flat_data, (i, j)), shape=(L * K, L * K))
def solve_linear(model: Model.fem_model):
    K_bar, F_bar, index = model.K_, model.F_, model.index
    Dvec = model.D
    Logger.info('Solving linear model with %d DOFs...' % model.DOF)
    n_nodes = model.node_count
    try:
        # sparse matrix solution
        delta_bar = sl.spsolve(sp.csr_matrix(K_bar), F_bar, sym_pos=True)
        delta = delta_bar
        # fill original displacement vector
        prev = 0
        for idx in index:
            gap = idx - prev
            if gap > 0:
                delta = np.insert(delta, prev, [0]*gap)
            prev = idx + 1
            if idx == index[-1] and idx != n_nodes - 1:
                delta = np.insert(delta, prev, [0]*(n_nodes*6 - prev))
        delta += Dvec
    except Exception as e:
        print(e)
        return None
    model.is_solved = True
    return delta
def get_test_matrix(self, test_data, shape, user_slice=None):
    num_users_all = shape[0]
    if user_slice:
        start, stop = user_slice
        stop = min(stop, num_users_all)
        num_users = stop - start
        coo_data = self._slice_test_data(test_data, start, stop)
    else:
        num_users = num_users_all
        coo_data = test_data

    user_coo, item_coo, fdbk_coo = coo_data
    num_items = shape[1]
    test_matrix = csr_matrix((fdbk_coo, (user_coo, item_coo)),
                             shape=(num_users, num_items),
                             dtype=fdbk_coo.dtype)
    return test_matrix, coo_data
def test_all_pairs_knn(self):
    counts = csr_matrix([[5, 1, 0, 9, 0, 0],
                         [0, 2, 1, 1, 0, 0],
                         [7, 0, 3, 0, 0, 0],
                         [1, 8, 0, 0, 0, 0],
                         [0, 0, 4, 4, 0, 0],
                         [0, 3, 0, 0, 0, 2],
                         [0, 0, 0, 0, 6, 0]], dtype=np.float64)
    counts = implicit.nearest_neighbours.tfidf_weight(counts).tocsr()

    # compute all neighbours using matrix dot product
    all_neighbours = counts.dot(counts.T).tocsr()
    K = 3
    knn = implicit.nearest_neighbours.all_pairs_knn(counts, K).tocsr()

    for rowid in range(counts.shape[0]):
        # make sure values match
        for colid, data in zip(knn[rowid].indices, knn[rowid].data):
            self.assertAlmostEqual(all_neighbours[rowid, colid], data)

        # make sure top K selected
        row = all_neighbours[rowid]
        self.assertEqual(set(knn[rowid].indices),
                         set(colid for colid, _ in
                             sorted(zip(row.indices, row.data), key=lambda x: -x[1])[:K]))
def _csrget(indices, indptr, data, row, col):
    """Fast lookup of a value in a scipy.sparse.csr_matrix format table.

    Parameters
    ----------
    indices, indptr, data : numpy arrays of int, int, float
        The CSR format data.
    row, col : int
        The matrix coordinates of the desired value.

    Returns
    -------
    dat: float
        The data value in the matrix.
    """
    start, end = indptr[row], indptr[row+1]
    for i in range(start, end):
        if indices[i] == col:
            return data[i]
    return 0.
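A short usage sketch for _csrget (the matrix values here are illustrative only): the three CSR arrays are taken straight from a scipy.sparse.csr_matrix, and missing entries come back as 0.

import numpy as np
from scipy.sparse import csr_matrix

m = csr_matrix(np.array([[0., 2., 0.],
                         [1., 0., 3.]]))
assert _csrget(m.indices, m.indptr, m.data, 1, 2) == 3.0  # stored entry
assert _csrget(m.indices, m.indptr, m.data, 0, 0) == 0.0  # structural zero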
def _represent_ZGate(self, basis, **options):
    """Represent this qubits in the computational basis (ZGate).
    """
    format = options.get('format', 'sympy')
    n = 1
    definite_state = 0
    for it in reversed(self.qubit_values):
        definite_state += n*it
        n = n*2
    result = [0]*(2**self.dimension)
    result[int(definite_state)] = 1

    if format == 'sympy':
        return Matrix(result)
    elif format == 'numpy':
        import numpy as np
        return np.matrix(result, dtype='complex').transpose()
    elif format == 'scipy.sparse':
        from scipy import sparse
        return sparse.csr_matrix(result, dtype='complex').transpose()
def nvecs(X, n, rank, do_flipsign=True, dtype=np.float):
    """
    Eigendecomposition of mode-n unfolding of a tensor
    """
    Xn = X.unfold(n)
    if issparse_mat(Xn):
        Xn = csr_matrix(Xn, dtype=dtype)
        Y = Xn.dot(Xn.T)
        _, U = eigsh(Y, rank, which='LM')
    else:
        Y = Xn.dot(Xn.T)
        N = Y.shape[0]
        _, U = eigh(Y, eigvals=(N - rank, N - 1))
        #_, U = eigsh(Y, rank, which='LM')
    # reverse order of eigenvectors such that eigenvalues are decreasing
    U = array(U[:, ::-1])
    # flip sign
    if do_flipsign:
        U = flipsign(U)
    return U
def get_item_representations(self, features=None):
    """
    Get the latent representations for items given model and features.

    Arguments
    ---------

    features: np.float32 csr_matrix of shape [n_items, n_item_features], optional
        Each row contains that item's weights over features.
        An identity matrix will be used if not supplied.

    Returns
    -------

    (item_biases, item_embeddings):
            (np.float32 array of shape n_items,
             np.float32 array of shape [n_items, num_components])
        Biases and latent representations for items.
    """

    self._check_initialized()

    if features is None:
        return self.item_biases, self.item_embeddings

    features = sp.csr_matrix(features, dtype=CYTHON_DTYPE)

    return features * self.item_biases, features * self.item_embeddings
def get_user_representations(self, features=None):
    """
    Get the latent representations for users given model and features.

    Arguments
    ---------

    features: np.float32 csr_matrix of shape [n_users, n_user_features], optional
        Each row contains that user's weights over features.
        An identity matrix will be used if not supplied.

    Returns
    -------

    (user_biases, user_embeddings):
            (np.float32 array of shape n_users,
             np.float32 array of shape [n_users, num_components])
        Biases and latent representations for users.
    """

    self._check_initialized()

    if features is None:
        return self.user_biases, self.user_embeddings

    features = sp.csr_matrix(features, dtype=CYTHON_DTYPE)

    return features * self.user_biases, features * self.user_embeddings
def mult(A, x, t=False):
    if sp.issparse(A):
        if t:
            return sp.csr_matrix(x).dot(A).transpose().todense().A[:, 0]
        return A.dot(sp.csr_matrix(x).transpose()).todense().A[:, 0]
    if t:
        return x.dot(A)
    return A.dot(x)
def dataset_to_file(dataset, ngrams, filename='dataset'):
    """Save a dataset to a file.

    Args:
        dataset (:class:`np.ndarray`): the dataset to save (built with :func:`dataset_tools.build_dataset`)
        ngrams (list of strings): the ngrams used to compute the features
        filename (string): the filename without extension (will be .npz)
    """
    num_samples, num_entries, num_features = dataset.shape
    # We reshape the ndarray from 3D to 2D in order to write it into a text file.
    # Each line of the file will correspond to one cited paper;
    # therefore, on each line there will be `num_entries` sets of features.
    dataset_sp = sparse.csr_matrix(dataset.reshape(num_samples*num_entries, num_features))
    np.savez(filename, num_entries=np.array([num_entries]), data=dataset_sp.data,
             indices=dataset_sp.indices, indptr=dataset_sp.indptr,
             shape=dataset_sp.shape, ngrams=ngrams)
def incidence_matrix(self):
    """
    'incidence_matrix' creates the branch-2-node incidence matrix

    :return: update self with self.A
    """
    # initialize incidence matrix terms
    a = []
    a_row = []
    a_col = []

    # cycle on branches (N1 and N2)
    for b, nodes in enumerate(self.nodes):
        # get nodes
        N1, N2 = nodes

        # detect connection to ground
        if N1 == 0:
            a.append(-1)
            a_row.append(N2 - 1)
            a_col.append(b)
        elif N2 == 0:
            a.append(1)
            a_row.append(N1 - 1)
            a_col.append(b)
        else:
            a.append(1)
            a_row.append(N1 - 1)
            a_col.append(b)
            a.append(-1)
            a_row.append(N2 - 1)
            a_col.append(b)

    # create incidence matrix
    self.A = csr_matrix((a, (a_row, a_col)))
def subsample_to_depth(m, seq_depth):
    if type(m) is csr_matrix:
        return subsample_to_depth_csr_upperTri(m, seq_depth)
    if type(m) is np.ndarray:
        return subsample_to_depth_array_upperTri(m, seq_depth)
def subsample_to_depth_csr_upperTri(m, seq_depth):
    depthm = m.sum()
    assert seq_depth <= depthm
    subsampling_prob = seq_depth / depthm
    vals = m.data
    num_elts = len(vals)
    m_subsampled_data = []  # np.random.binomial(value, subsampling_prob)
    elt = 0
    while elt < num_elts:
        m_subsampled_data.append(np.random.binomial(vals[elt], subsampling_prob, 1)[0])
        elt += 1
    return csr_matrix((m_subsampled_data, m.indices, m.indptr), dtype=int, shape=m.shape)
def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])
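The matching save routine is not part of this excerpt; a minimal sketch of a counterpart (hypothetical, but writing exactly the keys load_sparse_csr reads back):

import numpy as np

def save_sparse_csr(filename, matrix):
    # store the raw CSR arrays so the matrix can be rebuilt with csr_matrix((data, indices, indptr), shape=...)
    np.savez(filename, data=matrix.data, indices=matrix.indices,
             indptr=matrix.indptr, shape=matrix.shape)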
def filter_nodes(m, to_remove):
    if len(to_remove) == 0:
        return m
    nonzeros = m.nonzero()
    num_elts = len(nonzeros[0])
    r_idx = [i for i, x in enumerate(nonzeros[0]) if x not in to_remove]
    c_idx = [i for i, x in enumerate(nonzeros[1]) if x not in to_remove]
    keep = list(set(r_idx).union(set(c_idx)))
    coo_mat = m.tocoo()
    return csr_matrix((coo_mat.data[keep], (coo_mat.row[keep], coo_mat.col[keep])),
                      shape=m.get_shape(), dtype=float)
def _get_raw_context_matrix(self, sentences):
    """
    compute the raw context matrix with weighted counts
    it has an entry for every word in the vocabulary
    """
    # make the feature matrix
    featmat = lil_matrix((len(self.index2word), len(self.index2word)), dtype=float)
    for sentence_no, sentence in enumerate(sentences):
        if not sentence_no % self.progress:
            print("PROGRESS: at sentence #%i" % sentence_no)
        sentence = [word if word in self.word2index else None for word in sentence]
        # forward pass
        if self.forward:
            for i, word in enumerate(sentence[:-1]):
                if word:
                    # get all words in the forward window
                    wwords = sentence[i + 1:min(i + 1 + self.window, len(sentence))]
                    for j, w in enumerate(wwords, 1):
                        if w:
                            featmat[self.word2index[word], self.word2index[w]] += 1.  # /j
        # backwards pass
        if self.backward:
            sentence_back = sentence[::-1]
            for i, word in enumerate(sentence_back[:-1]):
                if word:
                    # get all words in the forward window of the backwards sentence
                    wwords = sentence_back[i + 1:min(i + 1 + self.window, len(sentence_back))]
                    for j, w in enumerate(wwords, 1):
                        if w:
                            featmat[self.word2index[word], self.word2index[w]] += 1.  # /j
    print("PROGRESS: through with all the sentences")
    self.featmat = csr_matrix(featmat)
def get_context_matrix(self, fill_diag=True, norm='count'):
    """
    for every word in the sentences, create a vector that contains the counts of its context words
    (weighted by the distance to it with a max distance of window)

    Inputs:
        - norm: if the feature matrix should be normalized to contain ones on the diagonal
                (--> average context vectors)
        - fill_diag: if diagonal of featmat should be filled with word counts

    Returns:
        - featmat: n_voc x n_voc sparse array with weighted context word counts for every word
    """
    featmat = deepcopy(self.featmat)
    # fill up the diagonals with the total counts of each word --> similarity matrix
    if fill_diag:
        featmat = lil_matrix(featmat)
        for i, word in enumerate(self.index2word):
            featmat[i, i] = self.wcounts[word]
        featmat = csr_matrix(featmat)
    assert ((featmat - featmat.transpose()).data**2).sum() < 2.220446049250313e-16, "featmat not symmetric"
    # possibly normalize by the max counts
    if norm == 'count':
        print("normalizing feature matrix by word count")
        normmat = lil_matrix(featmat.shape, dtype=float)
        normmat.setdiag([1. / self.wcounts[word] for word in self.index2word])
        featmat = csr_matrix(normmat) * featmat
    elif norm == 'max':
        print("normalizing feature matrix by max counts")
        normmat = lil_matrix(featmat.shape, dtype=float)
        normmat.setdiag([1. / v[0] if v[0] else 1. for v in featmat.max(axis=1).toarray()])
        featmat = csr_matrix(normmat) * featmat
    return featmat
def get_local_context_matrix(self, tokens, forward=True, backward=True):
    """
    compute a local context matrix. it has an entry for every token, even if it is not present in the vocabulary

    Inputs:
        - tokens: list of words

    Returns:
        - local_featmat: size len(set(tokens)) x n_vocab
        - tok_idx: {word: index} to map the words from the tokens list to an index of the featmat
    """
    # for every token we still only need one representation per document
    tok_idx = {word: i for i, word in enumerate(set(tokens))}
    featmat = lil_matrix((len(tok_idx), len(self.index2word)), dtype=float)
    # clean out context words we don't know
    known_tokens = [word if word in self.word2index else None for word in tokens]
    # forward pass
    if self.forward:
        for i, word in enumerate(tokens[:-1]):
            # get all words in the forward window
            wwords = known_tokens[i + 1:min(i + 1 + self.window, len(known_tokens))]
            for j, w in enumerate(wwords, 1):
                if w:
                    featmat[tok_idx[word], self.word2index[w]] += 1. / j
    # backwards pass
    if self.backward:
        tokens_back = tokens[::-1]
        known_tokens_back = known_tokens[::-1]
        for i, word in enumerate(tokens_back[:-1]):
            # get all words in the forward window of the backwards sentence, incl. word itself
            wwords = known_tokens_back[i + 1:min(i + 1 + self.window, len(known_tokens_back))]
            for j, w in enumerate(wwords, 1):
                if w:
                    featmat[tok_idx[word], self.word2index[w]] += 1. / j
    featmat = csr_matrix(featmat)
    # normalize matrix
    normmat = lil_matrix((featmat.shape[0], featmat.shape[0]), dtype=float)
    normmat.setdiag([1. / v[0] if v[0] else 1. for v in featmat.max(axis=1).toarray()])
    featmat = csr_matrix(normmat) * featmat
    return featmat, tok_idx
def dc_split(self, use_eigen_split=False):
    n = self.P.shape[0]
    if self.P.nnz == 0:  # P is zero
        P1, P2 = sp.csr_matrix((n, n)), sp.csr_matrix((n, n))
    if use_eigen_split:
        lmb, Q = LA.eigh(self.P.todense())
        P1 = sum([Q[:, i]*lmb[i]*Q[:, i].T for i in range(n) if lmb[i] > 0])
        P2 = sum([-Q[:, i]*lmb[i]*Q[:, i].T for i in range(n) if lmb[i] < 0])
        assert abs(np.sum(P1 - P2 - self.P)) < 1e-8
    else:
        lmb_min = np.min(LA.eigh(self.P.todense())[0])
        if lmb_min < 0:
            P1 = self.P + (1-lmb_min)*sp.identity(n)
            P2 = (1-lmb_min)*sp.identity(n)
        else:
            P1 = self.P
            P2 = sp.csr_matrix((n, n))
    f1 = QuadraticFunction(P1, self.q, self.r)
    f2 = QuadraticFunction(P2, sp.csc_matrix((n, 1)), 0)
    return (f1, f2)

# Returns the one-variable function when regarding f(x)
# as a quadratic expression in x[k].
# f is an instance of QuadraticFunction
# return value is an instance of OneVarQuadraticFunction
# TODO: speedup
def build_matrix(self, X, opt_y=None, weighting=None):
    if opt_y == None:
        if weighting == None:
            return xgb.DMatrix(csr_matrix(X), missing=-999.0)
        else:
            #scale weight
            sumtotal = float(X.shape[0])
            sumweights = np.sum(weighting)
            for s in range(0, len(weighting)):
                weighting[s] *= sumtotal/sumweights
            return xgb.DMatrix(csr_matrix(X), missing=-999.0, weight=weighting)
    else:
        if weighting == None:
            return xgb.DMatrix(csr_matrix(X), label=np.array(opt_y), missing=-999.0)
        else:
            sumtotal = float(X.shape[0])
            sumweights = np.sum(weighting)
            for s in range(0, len(weighting)):
                weighting[s] *= sumtotal/sumweights
            return xgb.DMatrix(csr_matrix(X), label=np.array(opt_y), missing=-999.0, weight=weighting)
def dot(X, Y):
    if sparse.isspmatrix(X) and sparse.isspmatrix(Y):
        return X * Y
    elif sparse.isspmatrix(X) or sparse.isspmatrix(Y):
        return sparse.csr_matrix(X) * sparse.csr_matrix(Y)
    return np.asmatrix(X) * np.asmatrix(Y)