Python scipy.sparse module: coo_matrix() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use scipy.sparse.coo_matrix().
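Before the project examples, here is a minimal standalone sketch of the triplet constructor that nearly all of them build on: coo_matrix((data, (row, col)), shape=...) places data[k] at position (row[k], col[k]), and duplicate coordinates are summed on conversion.

import numpy as np
from scipy.sparse import coo_matrix

row = np.array([0, 1, 1, 3])
col = np.array([0, 2, 2, 3])
data = np.array([4.0, 5.0, 7.0, 9.0])

# Duplicate coordinates (here (1, 2) appears twice) are summed.
A = coo_matrix((data, (row, col)), shape=(4, 4))
print(A.toarray())  # entry (1, 2) is 12.0
A_csr = A.tocsr()   # COO supports no fast slicing/arithmetic; convert to CSR for that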

Project: histwords | Author: williamleif
def make_ppmi_mat(old_mat, row_probs, col_probs, smooth, neg=1, normalize=False):
    prob_norm = old_mat.sum() + (old_mat.shape[0] * old_mat.shape[1]) * smooth
    old_mat = old_mat.tocoo()
    row_d = old_mat.row
    col_d = old_mat.col
    data_d = old_mat.data
    neg = np.log(neg)
    for i in range(len(old_mat.data)):
        if data_d[i] == 0.0:
            continue
        joint_prob = (data_d[i] + smooth) / prob_norm
        denom = row_probs[row_d[i], 0] * col_probs[0, col_d[i]]
        if denom == 0.0:
            data_d[i] = 0
            continue
        data_d[i] = np.log(joint_prob / denom)
        data_d[i] = max(data_d[i] - neg, 0)
        if normalize:
            data_d[i] /= -1*np.log(joint_prob)
    return coo_matrix((data_d, (row_d, col_d)))
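The function above shows a common COO pattern: call tocoo(), edit the parallel row/col/data arrays in place, and rebuild the matrix, all without ever densifying. A minimal sketch of the same idea (the shift-and-clip transform here is purely illustrative):

import numpy as np
from scipy.sparse import coo_matrix, random as sparse_random

M = sparse_random(5, 5, density=0.4, format='csr')  # any sparse format works
C = M.tocoo()
C.data = np.maximum(C.data - 0.5, 0.0)  # transform every stored value in place
# Rebuild from the (possibly modified) triplets; explicit zeros remain stored,
# just as in make_ppmi_mat above.
result = coo_matrix((C.data, (C.row, C.col)), shape=C.shape)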
Project: discretize | Author: simpeg
def _getEdgeP(self, xEdge, yEdge, zEdge):
        if self.dim == 2: raise Exception('Not implemented') # this should be a reordering of the face inner product?

        ind1, ind2, ind3 = [], [], []
        for ind in self._sortedCells:
            p = self._pointer(ind)
            w = self._levelWidth(p[-1])

            posX = [0, 0] if xEdge == 'eX0' else [w, 0] if xEdge == 'eX1' else [0, w] if xEdge == 'eX2' else [w, w]
            posY = [0, 0] if yEdge == 'eY0' else [w, 0] if yEdge == 'eY1' else [0, w] if yEdge == 'eY2' else [w, w]
            posZ = [0, 0] if zEdge == 'eZ0' else [w, 0] if zEdge == 'eZ1' else [0, w] if zEdge == 'eZ2' else [w, w]

            ind1.append( self._ex2i[self._index([ p[0]          , p[1] + posX[0], p[2] + posX[1], p[3]])]                         )
            ind2.append( self._ey2i[self._index([ p[0] + posY[0], p[1]          , p[2] + posY[1], p[3]])] + self.ntEx             )
            ind3.append( self._ez2i[self._index([ p[0] + posZ[0], p[1] + posZ[1], p[2]          , p[3]])] + self.ntEx + self.ntEy )

        IND = np.r_[ind1, ind2, ind3]

        PXXX = sp.coo_matrix((np.ones(self.dim*self.nC), (range(self.dim*self.nC), IND)), shape=(self.dim*self.nC, self.ntE)).tocsr()

        Re = self._deflationMatrix('E')

        return PXXX * Re
Project: pysapc | Author: bioinfocao
def matixToRowColDataArr(X):
    """
    Convert sparse affinity/similarity matrix to numpy array format (row_array,col_array,data_array)
    So cython update function can work efficiently on it.
    """
    # convert to coo format (from lil,csr,csc)
    if isinstance(X, coo_matrix):
        X_coo=X
    elif (isinstance(X, csr_matrix)) or (isinstance(X, lil_matrix)):
        X_coo=X.tocoo()
    else: # others like numpy matrix could be convert to coo matrix
        X_coo=coo_matrix(X)
    # Upcast matrix to a floating point format (if necessary)
    X_coo=X_coo.asfptype() 
    # get row_array,col_array,data_array in their correct data type (for cython to work)
    row_array, col_array, data_array = X_coo.row.astype(int), X_coo.col.astype(int), X_coo.data

    return row_array,col_array,data_array
Project: pysapc | Author: bioinfocao
def loadMatrix(data_file, dataCutoff=None):
    """
    Load similarity data file
    If dataCutoff is not None, all values (affinity/similarity) below it will be discarded.
    """
    #print('{0}, loading data'.format(datetime.now()))
    simi = pd.read_csv(data_file, sep='\t')
    samples=sorted(list(set(simi.row) | set(simi.col)))
    samplesInd={el:ind for ind,el in enumerate(samples)}
    row,col,data=simi.row.map(lambda x:samplesInd[x]),simi.col.map(lambda x:samplesInd[x]),simi.data
    if dataCutoff is not None:
        row_new,col_new,data_new=[],[],[]
        for r,c,d in zip(row,col,data):
            if d>dataCutoff:
                row_new.append(r)
                col_new.append(c)
                data_new.append(d)
        simi_mat=coo_matrix((data_new,(row_new,col_new)), shape=(len(samplesInd),len(samplesInd)))
    else:
        simi_mat=coo_matrix((data,(row,col)), shape=(len(samplesInd),len(samplesInd)))
    return simi_mat
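The cutoff loop in loadMatrix can also be written as a vectorized boolean mask over the triplet arrays, which is typically much faster on large inputs. A sketch, assuming row, col, and data are array-likes and n is the number of samples (the helper name is made up for illustration):

import numpy as np
from scipy.sparse import coo_matrix

def filtered_coo(row, col, data, n, cutoff):
    # Keep only triplets whose value exceeds the cutoff, without a Python loop.
    row, col, data = np.asarray(row), np.asarray(col), np.asarray(data)
    mask = data > cutoff
    return coo_matrix((data[mask], (row[mask], col[mask])), shape=(n, n))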
Project: SlidingWindowVideoTDA | Author: ctralie
def makeDelta1(R):
    """Make the delta1 coboundary matrix
    :param R: Edge list NEdges x 2. It is assumed that 
    there is at least one edge incident on every vertex
    """
    NEdges = R.shape[0]
    NVertices = int(np.max(R))+1
    #Make a list of edges for fast lookup
    Edges = []
    for i in range(NVertices):
        Edges.append({})
    for i in range(R.shape[0]):
        [a, b] = [int(R[i, 0]), int(R[i, 1])]
        Edges[a][b] = i
        Edges[b][a] = i    

    tic = time.time()
    (I, J, V) = get3CliquesBrute(Edges)
    toc = time.time()
    print("Elapsed time 3 cliques brute: %g"%(toc - tic))
    [I, J, V] = [a.flatten() for a in [I, J, V]]
    TriNum = len(I) // 3  # integer division; used as a matrix dimension below
    Delta1 = sparse.coo_matrix((V, (I, J)), shape = (TriNum, NEdges)).tocsr()

    return Delta1
Project: diamond | Author: stitchfix
def _create_main_design(self, **kwargs):
        r"""
        Create design matrix for main effects
        Keyword Args:
            * *df* (``DataFrame``). specify a new dataframe to create
                design matrix from
        Returns:
            array_like: design matrix in sparse CSR format

        """
        df = kwargs.get('df', self.train_df)
        df.reset_index(drop=True, inplace=True)
        df['row_index'] = df.index
        df['intercept'] = 1.0  # assume intercept is always included

        id_cols = ['row_index']

        melted_df = pd.melt(df[id_cols + self.main_effects], id_cols)
        melted_df = melted_df.merge(self.main_map, on='variable')
        melted_df['col_index'] = melted_df['main_idx']
        row = melted_df.row_index
        col = melted_df.col_index
        data = melted_df.value
        return sparse.coo_matrix((data, (row, col)),
                                 shape=(max(row) + 1, max(col) + 1)).tocsr()
Project: implicit | Author: benfred
def read_data(filename):
    """ Reads in the last.fm dataset, and returns a tuple of a pandas dataframe
    and a sparse matrix of artist/user/playcount """
    # read in triples of user/artist/playcount from the input dataset
    # get a model based off the input params
    start = time.time()
    logging.debug("reading data from %s", filename)
    data = pandas.read_table(filename,
                             usecols=[0, 2, 3],
                             names=['user', 'artist', 'plays'])

    # map each artist and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['artist'] = data['artist'].astype("category")

    # create a sparse matrix of all the users/plays
    plays = coo_matrix((data['plays'].astype(numpy.float32),
                       (data['artist'].cat.codes.copy(),
                        data['user'].cat.codes.copy())))

    logging.debug("read data file in %s", time.time() - start)
    return data, plays
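The snippet relies on a handy pandas idiom: casting a column to category assigns every distinct label an integer code (.cat.codes), which can be used directly as a COO row or column index. A small self-contained sketch of that mapping:

import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

df = pd.DataFrame({'user':   ['u1', 'u2', 'u1'],
                   'artist': ['a1', 'a1', 'a2'],
                   'plays':  [3, 1, 7]})
df['user'] = df['user'].astype('category')
df['artist'] = df['artist'].astype('category')

# Rows are artist codes, columns are user codes, values are play counts.
plays = coo_matrix((df['plays'].astype(np.float32),
                    (df['artist'].cat.codes, df['user'].cat.codes)))
print(plays.toarray())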
Project: implicit | Author: benfred
def bm25_weight(X, K1=100, B=0.8):
    """ Weighs each row of a sparse matrix X  by BM25 weighting """
    # calculate idf per term (user)
    X = coo_matrix(X)

    N = float(X.shape[0])
    idf = log(N / (1 + bincount(X.col)))

    # calculate length_norm per document (artist)
    row_sums = numpy.ravel(X.sum(axis=1))
    average_length = row_sums.mean()
    length_norm = (1.0 - B) + B * row_sums / average_length

    # weight matrix rows by bm25
    X.data = X.data * (K1 + 1.0) / (K1 * length_norm[X.row] + X.data) * idf[X.col]
    return X
Project: sequence-based-recommendations | Author: rdevooght
def prepare_model(self, dataset):
        '''Load the data from the training file into a format adapted for the KNN methods.
        '''
        filename = dataset.dirname + 'data/train_set_triplets'
        if os.path.isfile(filename + '.npy'):
            file_content = np.load(filename + '.npy')
        else:
            file_content = np.loadtxt(filename)
            np.save(filename, file_content)

        #self.user_item = ssp.coo_matrix((file_content[:,2], (file_content[:,0], file_content[:,1]))).tocsr()
        self.binary_user_item = ssp.coo_matrix((np.ones(file_content.shape[0]), (file_content[:,0], file_content[:,1]))).tocsr()

        del file_content

        self.n_items = self.binary_user_item.shape[1]
        self.n_users = self.binary_user_item.shape[0]
Project: sparseMF | Author: jeh0753
def GLGrid(train):
    ''' Function for obtaining the optimal GraphLab Factorization Recommender parameters for a given dataset.

    train: scipy.sparse.coo_matrix
        The dataset used for grid searching the best parameters.

    Returns: dict
        Dictionary of the best GraphLab parameters for the given dataset.
    '''    
    c = coo_matrix(train)
    sf = graphlab.SFrame({'row': c.row, 'col': c.col, 'data': c.data})
    sf_small = sf.dropna('data', how="all")
    folds = graphlab.cross_validation.KFold(sf_small, 3)
    params = dict([('target', 'data'), ('user_id', 'row'), ('item_id', 'col'), ('num_factors', [10, 20]), ('sgd_step_size', [0.02, 10])])
    job = graphlab.grid_search.create(folds, graphlab.factorization_recommender.create, params) 
    params = job.get_best_params() 
    return params
Project: sparseMF | Author: jeh0753
def prepare_input_data(self, X):
        """
        Check to make sure that the input matrix and its mask of missing
        values are valid. Returns X and missing mask.
        """
        #TODO - separate out safety checks in _preprocess_sparse as well, and include them here instead

        self._check_input(X)
        shape = X.shape
        coo = coo_matrix(X)
        row_id = coo.row
        col_id = coo.col
        self.missing_mask = row_id, col_id, shape
        self._check_max_rank(X)
        self._check_missing_value_mask()
        return X
Project: glmnet_py | Author: hanfang
def sparseDf(self, df, matrixType="csc"):
        """
        convert a pandas sparse df to numpy sparse array
        :param df: pandas sparse df
        :param matrixType: csc or csr
        :return: numpy sparse array
        """
        columns = df.columns
        dat, rows = map(list, zip(
            *[(df[col].sp_values - df[col].fill_value, df[col].sp_index.to_int_index().indices) for col in columns]))
        cols = [np.ones_like(a) * i for (i, a) in enumerate(dat)]
        datF, rowsF, colsF = np.concatenate(dat), np.concatenate(rows), np.concatenate(cols)
        arr = sparse.coo_matrix((datF, (rowsF, colsF)), df.shape, dtype=np.float64)
        if matrixType == "csc":
            return arr.tocsc()
        elif matrixType == "csr":
            return arr.tocsc()
        else:
            raise ValueError("Only accept csc or csr")
Project: SparkADMM | Author: yahoo
def solveSingle(self,inputDF,outputDict,rho,beta_target):
        I,J,V,Y=[],[],[],[]
        fd = {} # mapping feature names to consecutive integers, starting with 0
        for i,(id, x) in enumerate(inputDF.items()):
            l = outputDict.get(id)
            for k,v in x.items():
                I.append(i)
                J.append(k)
                V.append(v)
                upd(fd,k)
            Y.append(l)
        J = [fd[k] for k in J]  # materialize; map() returns a lazy iterator in Python 3
        X = sparse.coo_matrix((V,(I,J)),shape=(I[-1]+1,len(fd)))
        fd_reverse = [k for k,v in sorted(fd.items(), key = lambda t: t[1])]
        # y_new = y - X . beta_target
        # converting a proximal least square problem to a ridge regression
        ZmUl = np.array([beta_target.get(k,0) for k in fd_reverse])
        y_new = np.array(Y) - X * ZmUl
        ridge = Ridge(alpha =  rho , fit_intercept=False)
        ret = ridge.fit(X,y_new)
        #ret = self.lr.fit(X,y_new)
        # ordered list of feature names according to their integer ids in fd
        #raise ValueError('fd_reverse = %s \n X = %s \n J = %s \n I = %s \n V = %s \n Y = %s \n y_new = %s \n ret.coef_ = %s \n ZmUl = %s \n'\
        #            %(str(fd_reverse), str(X), str(J), str(I), str(V), str(Y), str(y_new), str(ret.coef_), str(ZmUl)))
        return dict(zip(fd_reverse, (ret.coef_ + ZmUl).tolist()))
Project: modl | Author: arthurmensch
def split(self, X):
        X = sp.coo_matrix(X)
        rng = np.random.RandomState(self.random_state)
        shape = X.shape
        n_data = len(X.data)
        n_train = int(self.train_size * n_data)

        for it in range(self.n_iter):
            ind = rng.permutation(n_data)
            train_ind = ind[:n_train]
            test_ind = ind[n_train:]
            X_tr = sp.coo_matrix((X.data[train_ind],
                                  (X.row[train_ind], X.col[train_ind])),
                                 shape=shape)
            X_te = sp.coo_matrix((X.data[test_ind],
                                  (X.row[test_ind], X.col[test_ind])),
                                 shape=shape)
            yield X_tr, X_te
Project: babusca | Author: georglind
def generate_hamiltonian(m, basis):
        """
        Generates the (sparse) Hamiltonian

        Parameters
        ----------
        basis : Basis object
            Full basis for this specific number sector.
        """
        nbas = basis.len

        Us = m.Us
        if m.W is not None:
            Us = m.W + np.diag(m.Us)

        HDi = np.arange(nbas)
        HD = NumberSector.onsite_hamiltonian(m.Es, basis.vs) \
            + NumberSector.interaction_hamiltonian(Us, basis.vs)
        Hki, Hkj, Hkv = NumberSector.hopping_hamiltonian(basis, m.hopping, basis.vs)

        return sparse.coo_matrix((Hkv, (Hki, Hkj)), shape=(nbas, nbas)).tocsr() \
            + sparse.coo_matrix((HD, (HDi, HDi)), shape=(nbas, nbas)).tocsr()
Project: babusca | Author: georglind
def creation_operator(i, basis0, basis1):
    """
    Create a boson on site <i>

    Parameters
    ----------
    i : int
        Site index
    basis0 : list
        Initial basis
    basis1 : list
        Final basis
    """
    index0 = np.arange(basis0.len)

    mbasis = np.copy(basis0.vs)
    mbasis[:, i] += 1
    index1 = basis1.index(mbasis)

    return sparse.coo_matrix((np.sqrt(mbasis[:, i]), (index1, index0)), shape=[basis1.len, basis0.len]).tocsr()
Project: AequilibraE | Author: AequilibraE
def reblocks_matrix(self, sparse_matrix):
        # Gets all non-zero coordinates and makes sure that they are considered
        froms = sparse_matrix.row
    tos = sparse_matrix.col
        data = sparse_matrix.data

        all_indices = np.hstack((froms, tos))
        indices = np.unique(all_indices)
        compact_shape = indices.shape[0]

        # Builds the hash
        matrix_hash = {}
        titles = []
        for i in range(compact_shape):
            matrix_hash[indices[i]] = i
            froms[froms == indices[i]] = matrix_hash[indices[i]]
            tos[tos == indices[i]] = matrix_hash[indices[i]]
            titles.append(indices[i])
        matrix = coo_matrix((data, (froms, tos)), shape=(compact_shape, compact_shape)).toarray().astype(np.float64)
        return matrix, matrix_hash, titles
Project: bear | Author: theeluwin
def drop_tolerance(A, t):
    """
    Drops entries of `A` whose absolute value is lower than `t`.

    Args:
        A (coo_matrix): Given coo matrix.
        t (float): Tolerance threshold.

    Returns:
        A coo matrix.
    """
    A = A.tocoo()
    row = []
    col = []
    data = []
    for idx, (i, j) in enumerate(zip(A.row, A.col)):
        value = A.data[idx]
        if -t < value < t:  # drop entries inside the tolerance band
            continue
        row.append(i)
        col.append(j)
        data.append(value)
    A = coo_matrix((data, (row, col)), shape=A.shape, dtype=A.dtype)
    del row, col, data
    return A
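As with the cutoff filter earlier, the per-entry loop can be replaced by one boolean mask over the triplet arrays. A vectorized sketch of the same drop-tolerance (the function name is illustrative):

import numpy as np
from scipy.sparse import coo_matrix

def drop_tolerance_vectorized(A, t):
    # Keep entries with |value| >= t; equivalent to the loop above.
    A = A.tocoo()
    mask = np.abs(A.data) >= t
    return coo_matrix((A.data[mask], (A.row[mask], A.col[mask])),
                      shape=A.shape, dtype=A.dtype)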
Project: bear | Author: theeluwin
def degree_reverse_rank_perm(A, reverse=False):
    """
    Computes permutation that sorts nodes by degree.

    Args:
        A (coo_matrix): Given coo matrix.
        reverse (bool): If True, sorts with descending order.

    Returns:
        A permutation of node indices; a mapping `(i -> j)` is denoted as `perm[i] = j`.
    """
    n, _ = A.shape
    degree = {i: 0 for i in range(n)}
    for i, j in zip(A.row, A.col):
        degree[j] += 1
    bottoms = sorted(degree, key=degree.get, reverse=reverse)
    perm = [0 for _ in range(n)]
    for i in range(n):
        perm[bottoms[i]] = i
    return perm
Project: bear | Author: theeluwin
def reorder_matrix(A, perm, fix_row=False, fix_col=False):
    """
    Reorders given coo matrix with given permutation. You can fix either row or column.

    Args:
        A (coo_matrix): Given coo matrix.
        perm (list): List of node indices denoting the permutation.
        fix_row (bool): If True, reorders column only.
        fix_col (bool): If True, reorders row only.

    Returns:
        A coo matrix.
    """
    A = A.tocoo()
    if not fix_row:
        row = [perm[i] for i in A.row]
    else:
        row = A.row
    if not fix_col:
        col = [perm[j] for j in A.col]
    else:
        col = A.col
    A = coo_matrix((A.data, (row, col)), shape=A.shape, dtype=A.dtype)
    del row, col
    return A
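Because A.row and A.col are numpy arrays, the per-entry list comprehensions can be replaced with fancy indexing into the permutation. A sketch of the same reordering (illustrative name):

import numpy as np
from scipy.sparse import coo_matrix

def reorder_matrix_vectorized(A, perm, fix_row=False, fix_col=False):
    # perm[i] = j moves old index i to new index j; p[A.row] applies it at once.
    A = A.tocoo()
    p = np.asarray(perm)
    row = A.row if fix_row else p[A.row]
    col = A.col if fix_col else p[A.col]
    return coo_matrix((A.data, (row, col)), shape=A.shape, dtype=A.dtype)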
Project: icing | Author: slipguru
def matrix_to_row_col_data(X):
    """Convert sparse affinity matrix to arrays.

    .. note:: Deprecated.
          It will be removed in icing 0.2. This is now done by check_array from
          scikit-learn.
    """
    # convert to coo format (from lil,csr,csc)
    if isinstance(X, coo_matrix):
        X_coo = X
    elif (isinstance(X, csr_matrix)) or (isinstance(X, lil_matrix)):
        X_coo = X.tocoo()
    else:  # others like numpy matrix could be convert to coo matrix
        X_coo = coo_matrix(X)
    # Upcast matrix to a floating point format (if necessary)
    X_coo = X_coo.asfptype()
    return X_coo.row.astype(int), X_coo.col.astype(int), X_coo.data
Project: NLP.py | Author: PythonOptimizers
def hess(self, *args, **kwargs):
        """Evaluate Lagrangian Hessian at (x, z)."""
        l_vals, l_rows, l_cols = super(SciPyAmplModel, self).hess(*args,
                                                                  **kwargs)

        # AMPL only returns the upper triangular part of the Hessian and
        # `scipy.coo_matrix` doesn't have a `symmetric` attribute, so we
        # need to copy the upper part of the matrix
        diag_idx = np.where(l_rows == l_cols)

        # strict upper triangle of H is obtained by switching rows and
        # cols indices and removing values on the diagonal.
        u_rows = np.delete(l_cols, diag_idx)  # creates a copy
        u_cols = np.delete(l_rows, diag_idx)
        u_vals = np.delete(l_vals, diag_idx)

        H = sp.coo_matrix((np.concatenate((l_vals, u_vals)),
                           (np.concatenate((l_rows, u_rows)),
                            np.concatenate((l_cols, u_cols)))),
                          shape=(self.nvar, self.nvar))
        return H
Project: NLP.py | Author: PythonOptimizers
def hess(self, *args, **kwargs):
            """Evaluate Lagrangian Hessian at (x, z)."""
            u_vals, u_rows, u_cols = super(SciPyAdolcModel,
                                           self).hess(*args, **kwargs)

            # ADOL-C only returns the upper triangular part of the Hessian and
            # `scipy.coo_matrix` doesn't have a `symmetric` attribute, so we
            # need to copy the upper part of the matrix
            diag_idx = np.where(u_rows == u_cols)

            l_rows = np.delete(u_cols, diag_idx)  # creates a copy
            l_cols = np.delete(u_rows, diag_idx)
            l_vals = np.delete(u_vals, diag_idx)

            H = sp.coo_matrix((np.concatenate((l_vals, u_vals)),
                               (np.concatenate((l_rows, u_rows)),
                                np.concatenate((l_cols, u_cols)))),
                              shape=(self.nvar, self.nvar))
            return H
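Both hess variants above use the same trick: a symmetric matrix stored as one triangle is expanded by mirroring the strictly off-diagonal triplets, so the diagonal is not counted twice. A standalone sketch, assuming the input triplets describe one triangle of a symmetric matrix (the helper name is made up):

import numpy as np
import scipy.sparse as sp

def symmetrize_triangle(vals, rows, cols, n):
    # Mirror the strictly off-diagonal entries (rows != cols) across the diagonal.
    off_diag = rows != cols
    return sp.coo_matrix(
        (np.concatenate((vals, vals[off_diag])),
         (np.concatenate((rows, cols[off_diag])),
          np.concatenate((cols, rows[off_diag])))),
        shape=(n, n))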
Project: gelearn | Author: lookatmoon
def construct_sparse_feature_matrix(dat, fea_map):
    num_row = 0
    row = []
    col = []
    val = []
    for doc_id, fea_dic in sorted(dat.items(), key=lambda x: x[0]): # keep the same order in iterating through the data set: 0,...,N-1
        for f, v in fea_dic.items():
            row.append( num_row )
            col.append( fea_map[f] )
            val.append( v )
        num_row += 1
    spmat = sp.coo_matrix((val, (row, col)), shape=(num_row, len(fea_map))).tocsr()
    return spmat

# construct K x N mask matrix: K=number of labeled features, N=number of data points
# ** we can skip the labeled documents, if any. needs further experiments **
Project: gelearn | Author: lookatmoon
def construct_feature_document_indicators(dat, labeled_features):
    if len(labeled_features) == 0:
        return sp.coo_matrix(([1.], ([0], [0])), shape=(1, len(dat))).tocsr()

    num_col = 0
    row = []
    col = []
    val = []
    for doc_id, fea_dic in sorted(dat.items(), key=lambda x: x[0]):
        num_row = 0
        for fea, label_dist in sorted(labeled_features.items(), key=lambda x: x[0]):
            if fea in fea_dic:
                row.append( num_row )
                col.append( num_col )
                val.append( 1. )
            num_row += 1
        num_col += 1
    spmat = sp.coo_matrix((val, (row, col)), shape=(len(labeled_features), len(dat))).tocsr()
    return spmat

# construct feature expected distribution: K x C matrix: K=number of labeled features, C=number of classes
Project: gelearn | Author: lookatmoon
def construct_label_document_indicators(dat, labeled_instances):
    if len(labeled_instances) == 0:
        return sp.coo_matrix(([1.], ([0], [0])), shape=(1, len(dat))).tocsr()

    num_row = 0
    num_col = 0
    row = []
    col = []
    val = []
    for doc_id, fea_dic in sorted(dat.items(), key=lambda x: x[0]):
        if doc_id in labeled_instances:
            row.append( num_row )
            col.append( num_col )
            val.append( 1. )
            num_row += 1
        num_col += 1
    spmat = sp.coo_matrix((val, (row, col)), shape=(len(labeled_instances), len(dat))).tocsr()
    return spmat

# construct L x C matrix: L=number of labeled documents, C=number of classes
Project: subreddit-scoring | Author: ririw
def load_recmat(self):
        assert self.complete()
        with self.output().open() as f:
            data = json.load(f)
        print('Data loaded')
        subreddit_remapping = {sid: ix for ix, sid in enumerate(tqdm(data['subreddit_ids'].values(), desc='remapping sub'))}
        author_remapping = {aid: ix for ix, aid in enumerate(tqdm(data['author_ids'].values(), desc='remapping author'))}

        i = np.asarray([author_remapping[auth] for sub, auth in data['edges']])
        j = np.asarray([subreddit_remapping[sub] for sub, auth in data['edges']])
        entries = np.ones(len(data['edges']))

        recmat = sp.coo_matrix((entries, (i, j)), shape=[len(author_remapping), len(subreddit_remapping)])

        sub_mapping = {sub: subreddit_remapping[ix] for sub, ix in data['subreddit_ids'].items()}
        auth_mapping = {auth: author_remapping[ix] for auth, ix in data['author_ids'].items()}
        return sp.csr_matrix(recmat), sub_mapping, auth_mapping
Project: SCaIP | Author: simonsfoundation
def contruct_ellipse_parallel(pars):

    Coor,cm,A_i,Vr,dims,dist,max_size,min_size,d=pars
    dist_cm = coo_matrix(np.hstack([Coor[c].reshape(-1, 1) - cm[k]
                                                for k, c in enumerate(['x', 'y', 'z'][:len(dims)])]))
    Vr.append(dist_cm.T * spdiags(A_i.toarray().squeeze(),
                                  0, d, d) * dist_cm / A_i.sum(axis=0))

    if np.sum(np.isnan(Vr)) > 0:
        raise Exception('You cannot pass empty (all zeros) components!')

    D, V = eig(Vr[-1])

    dkk = [np.min((max_size**2, np.max((min_size**2, dd.real)))) for dd in D]

    # search indexes for each component
    return np.sqrt(np.sum([(dist_cm * V[:, k])**2 / dkk[k] for k in range(len(dkk))], 0)) <= dist
#%% threshold_components
Project: Kaggler | Author: qqgeogor
def transform(self, X):
        """Encode categorical columns into sparse matrix with one-hot-encoding.

        Args:
            X (numpy.array): categorical columns to encode

        Returns:
            X_new (scipy.sparse.coo_matrix): sparse matrix encoding categorical
                                             variables into dummy variables
        """

        for col in range(X.shape[1]):
            X_col = self._transform_col(X[:, col], col)
            if X_col is not None:
                if col == 0:
                    X_new = X_col
                else:
                    X_new = sparse.hstack((X_new, X_col))

            logging.debug('{} --> {} features'.format(
                col, self.label_encoder.label_maxes[col])
            )

        return X_new
Project: indigo | Author: mbdriscoll
def Zpad(self, M, N, mode='center', dtype=np.dtype('complex64'), **kwargs):
        slc = []
        if mode == 'center':
            for m, n in zip(M, N):
                slc += [slice(m // 2 + int(np.ceil(-n / 2)),
                              m // 2 + int(np.ceil( n / 2))), ]
        elif mode == 'edge':
            for m, n in zip(M, N):
                slc.append(slice(n))
            pass
        x = np.arange( np.prod(M), dtype=int ).reshape(M, order='F')
        rows = x[slc].flatten(order='F')
        cols = np.arange(rows.size)
        ones = np.ones_like(cols)
        shape = np.prod(M), np.prod(N)
        M = spp.coo_matrix( (ones, (rows,cols)), shape=shape, dtype=dtype )
        return self.SpMatrix(M, **kwargs)
Project: indigo | Author: mbdriscoll
def interp_mat(m, N, width, table, coord, backend):

    ndim = coord.shape[0]

    if ndim == 1:
        _interp_mat = _interp1_mat
    elif ndim == 2:
        _interp_mat = _interp2_mat
    elif ndim == 3:
        _interp_mat = _interp3_mat
    else:
        raise ValueError('Number of dimensions can only be 1, 2 or 3, got %r',
                         ndim)

    row, col, ker = _interp_mat(m, N, width, table, coord)

    return sparse.coo_matrix((ker, (row, col)),
                             shape=(m, np.prod(N, dtype=int)))
Project: knowledge_linker | Author: glciampaglia
def test_closureap():
    """ Correctedness of all-pairs parallel closure. """
    np.random.seed(100)
    dt = DirTree('test', (2, 5, 10), root='test_parallel')
    N = 100
    thresh = 0.1
    A = sp.rand(N, N, thresh, 'csr')
    nnz = A.getnnz()
    sparsity = float(nnz) / N ** 2
    print('Number of nnz = {}, sparsity = {:g}'.format(nnz, sparsity))
    A = np.asarray(A.todense())
    clo.closureap(A, dt)
    coords = np.asarray(fromdirtree(dt, N), dtype=coo_dtype)
    coo = (coords['weight'], (coords['row'], coords['col']))
    B = np.asarray(sp.coo_matrix(coo, shape=(N, N)).todense())
    rows = []
    for row in range(N):
        r, _ = clo.cclosuress(A, row)
        rows.append(r)
    C = np.asarray(rows)
    assert np.allclose(B, C)
    # cleanup
    for logpath in glob('closure-*.log'):
        os.remove(logpath)
Project: fake_news | Author: bmassman
def multi_hot_encode(x: Sequence[str],
                     prefix: str) -> (coo_matrix, Dict[str, int]):
    """
    Return sparse matrix encoding categorical variables in x and dictionary
    mapping categorical variables to column numbers.
    Each record in x must be a single string with the categorical variables
    separated by a comma. The prefix prepends the categorical variable name
    to prevent collisions.
    """
    data = []
    i = []
    j = []
    col = count()
    dummy_col = defaultdict(lambda: next(col))
    for row, cat_vars in enumerate(x):
        for cat_var in cat_vars.split(','):
            prepended = f'{prefix}_{cat_var}'
            data.append(1)
            i.append(row)
            j.append(dummy_col[prepended])
    return coo_matrix((data, (i, j))), {v: k for k, v in dummy_col.items()}
Project: fake_news | Author: bmassman
def get_lshash(text: coo_matrix) -> List[str]:
    """
    Return list of cosine LSHs encoding text.
    """
    def cosine_LSH(vector, planes):
        """
        Return a single cosine LSH for a particular record and given planes.
        """
        sig = 0
        for plane in planes:
            sig <<= 1
            if vector.dot(plane) >= 0:
                sig |= 1
        return str(sig)

    bits = 512
    random_projections = np.random.randn(bits, text.shape[1])
    hashes = [cosine_LSH(text.getrow(idx), random_projections)
              for idx in range(text.shape[0])]
    return hashes
Project: keras-gcn | Author: tkipf
def load_data(path="data/cora/", dataset="cora"):
    """Load citation network dataset (cora only for now)"""
    print('Loading {} dataset...'.format(dataset))

    idx_features_labels = np.genfromtxt("{}{}.content".format(path, dataset), dtype=np.dtype(str))
    features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
    labels = encode_onehot(idx_features_labels[:, -1])

    # build graph
    idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
    idx_map = {j: i for i, j in enumerate(idx)}
    edges_unordered = np.genfromtxt("{}{}.cites".format(path, dataset), dtype=np.int32)
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                     dtype=np.int32).reshape(edges_unordered.shape)
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]), dtype=np.float32)

    # build symmetric adjacency matrix
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    print('Dataset has {} nodes, {} edges, {} features.'.format(adj.shape[0], edges.shape[0], features.shape[1]))

    return features.todense(), adj, labels
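The one-liner adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj) symmetrizes the directed adjacency matrix by taking the elementwise maximum of adj and adj.T, so an edge stored in either direction becomes undirected. A tiny demonstration:

import numpy as np
import scipy.sparse as sp

# Directed edges 0 -> 1 and 2 -> 1.
adj = sp.coo_matrix((np.ones(2), ([0, 2], [1, 1])), shape=(3, 3))
sym = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
print(sym.toarray())  # (0,1)/(1,0) and (1,2)/(2,1) are all set to 1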
Project: FingerNet | Author: felixTY
def label2mnt(mnt_s_out, mnt_w_out, mnt_h_out, mnt_o_out, thresh=0.5):
    mnt_s_out = np.squeeze(mnt_s_out)
    mnt_w_out = np.squeeze(mnt_w_out)
    mnt_h_out = np.squeeze(mnt_h_out)
    mnt_o_out = np.squeeze(mnt_o_out)
    assert len(mnt_s_out.shape)==2 and len(mnt_w_out.shape)==3 and len(mnt_h_out.shape)==3 and len(mnt_o_out.shape)==3 
    # get cls results
    mnt_sparse = sparse.coo_matrix(mnt_s_out>thresh)
    mnt_list = np.array(list(zip(mnt_sparse.row, mnt_sparse.col)), dtype=np.int32)
    if mnt_list.shape[0] == 0:
        return np.zeros((0, 4))
    # get regression results
    mnt_w_out = np.argmax(mnt_w_out, axis=-1)
    mnt_h_out = np.argmax(mnt_h_out, axis=-1)
    mnt_o_out = np.argmax(mnt_o_out, axis=-1) # TODO: use ori_highest_peak(np version)
    # get final mnt
    mnt_final = np.zeros((len(mnt_list), 4))
    mnt_final[:, 0] = mnt_sparse.col*8 + mnt_w_out[mnt_list[:,0], mnt_list[:,1]]
    mnt_final[:, 1] = mnt_sparse.row*8 + mnt_h_out[mnt_list[:,0], mnt_list[:,1]]
    mnt_final[:, 2] = (mnt_o_out[mnt_list[:,0], mnt_list[:,1]]*2-89.)/180*np.pi
    mnt_final[mnt_final[:, 2]<0.0, 2] = mnt_final[mnt_final[:, 2]<0.0, 2]+2*np.pi
    mnt_final[:, 3] = mnt_s_out[mnt_list[:,0], mnt_list[:, 1]]
    return mnt_final
Project: OpenMDAO | Author: OpenMDAO
def _build(self, num_rows, num_cols):
        """
        Allocate the matrix.

        Parameters
        ----------
        num_rows : int
            number of rows in the matrix.
        num_cols : int
            number of cols in the matrix.
        """
        data, rows, cols = self._build_sparse(num_rows, num_cols)

        metadata = self._metadata
        for key, (ind1, ind2, idxs, jac_type, factor) in iteritems(metadata):
            if idxs is None:
                metadata[key] = (slice(ind1, ind2), jac_type, factor)
            else:
                # store reverse indices to avoid copying subjac data during
                # update_submat.
                metadata[key] = (np.argsort(idxs) + ind1, jac_type, factor)

        self._matrix = coo_matrix((data, (rows, cols)),
                                  shape=(num_rows, num_cols))
Project: ranking | Author: wattlebird
def MasseyVector(self):
        """This function produces X'Wy
        """
        idx = self.itemlist
        table = self.table
        pair = self.pair
        j = np.ravel(pair)
        i = np.repeat(np.arange(table.shape[0], dtype=np.int32), 2, axis=0)
        data = np.array([[1,-1]],dtype=np.float32)
        data = np.ravel(np.repeat(data, table.shape[0], axis=0))
        X = coo_matrix((data, (i, j)), shape=(table.shape[0], len(idx)))
        X = X.tocsr()
        W = np.require(table.iloc[:,4].values, np.float32)
        y = table.iloc[:, 2].values - table.iloc[:, 3].values
        Wy = np.multiply(W, y)
        return X.T*Wy
Project: loompy | Author: linnarsson-lab
def __getitem__(self, thing: Any) -> sparse.coo_matrix:
        if type(thing) is slice or type(thing) is np.ndarray or type(thing) is int:
            gm = GraphManager(None, axis=self.axis)
            for key, g in self.items():
                # Slice the graph matrix properly without making it dense
                (a, b, w) = (g.row, g.col, g.data)
                indices = np.arange(g.shape[0])[thing]
                mask = np.logical_and(np.in1d(a, indices), np.in1d(b, indices))
                a = a[mask]
                b = b[mask]
                w = w[mask]
                d = dict(zip(np.sort(indices), np.arange(indices.shape[0])))
                a = np.array([d[x] for x in a])
                b = np.array([d[x] for x in b])
                gm[key] = sparse.coo_matrix((w, (a, b)), shape=(len(indices), len(indices)))
            return gm
        else:
            return self.__getattr__(thing)
Project: stupidmlp | Author: howonlee
def __init__(self, *args):
        ''' Initialization of the perceptron with given sizes.  '''

        self.shape = args
        n = len(args)
        self.bp_times = []

        # Build layers
        self.layers = []
        # Input layer (+1 unit for bias)
        self.layers.append(sci_sp.csc_matrix(np.ones(self.shape[0]+1)))
        # Hidden layer(s) + output layer
        for i in range(1,n):
            self.layers.append(sci_sp.csc_matrix(np.ones(self.shape[i])))

        # Build weights matrix (randomly)
        self.weights = []
        self.sparsifiers = []
        self.has_sparsified = False
        for i in range(n-1):
            new_weights = (2 * (npr.random((self.layers[i].size, self.layers[i+1].size))) - 1) * 0.00001
            self.weights.append(sci_sp.csc_matrix(new_weights))
            self.sparsifiers.append(sci_sp.coo_matrix(np.ones_like(new_weights)))
Project: stupidmlp | Author: howonlee
def sparsify(self):
        self.has_sparsified = True
        for i in range(len(self.weights)-1): # not the softmax layer
            # so the 50th percentile of existing weights above 0
            # leave it as np.percentile, not median, cuz experimentation
            thresh = np.percentile(
                               np.abs(
                                   self.weights[i].toarray()[np.abs(self.weights[i].toarray()) > 0]
                                ), 50
                              )
            # add to sparsifier
            # kill based upon sparsifier, but in the actual backprop
            new_sparsifier = self.sparsifiers[i].toarray()
            self.sparsifiers[i] = sci_sp.coo_matrix(np.logical_and(np.abs(self.weights[i].toarray()) > thresh, new_sparsifier))
            self.weights[i][np.abs(self.weights[i].toarray()) < thresh] = 0
            self.weights[i].eliminate_zeros()
项目:cellranger    作者:10XGenomics    | 项目源码 | 文件源码
def merge_nearest_neighbors(filenames, total_rows):
    """ Merge nearest neighbor adjacency matrix HDF files.
    Returns: A sparse adjacency matrix """
    nn = sp_sparse.coo_matrix((total_rows, total_rows))
    for filename in filenames:
        h5 = h5py.File(filename, 'r')
        nn += sp_sparse.coo_matrix((np.ones(len(h5['i'])),
                                    (h5['i'][:], h5['j'][:])),
                                   shape=nn.shape)
        h5.close()
    return nn
Project: cellranger | Author: 10XGenomics
def tocoo(self):
        if type(self.m) is not sp_sparse.coo_matrix:
            self.m = self.m.tocoo()
Project: MetaphoricChange | Author: Garrafao
def save_pkl_files(dsm_prefix, dsm, save_in_one_file=False):
    """
    Save the space to separate pkl files.
    :param dsm_prefix: prefix for the output files
    :param dsm: the Space object to save
    """

    # Save in a single file (for small spaces)
    if save_in_one_file:
        io_utils.save(dsm, dsm_prefix + '.pkl')

    # Save in multiple files: npz for the matrix and pkl for the other data members of Space
    else:
        mat = coo_matrix(dsm.cooccurrence_matrix.get_mat())
        np.savez_compressed(dsm_prefix + 'cooc.npz', data=mat.data, row=mat.row, col=mat.col, shape=mat.shape)

        with open(dsm_prefix + '_row2id.pkl', 'wb') as f_out:
            pickle.dump(dsm._row2id, f_out, 2)

        with open(dsm_prefix + '_id2row.pkl', 'wb') as f_out:
            pickle.dump(dsm._id2row, f_out, 2)

        with open(dsm_prefix + '_column2id.pkl', 'wb') as f_out:
            pickle.dump(dsm._column2id, f_out, 2)

        with open(dsm_prefix + '_id2column.pkl', 'wb') as f_out:
            pickle.dump(dsm._id2column, f_out, 2)
项目:MetaphoricChange    作者:Garrafao    | 项目源码 | 文件源码
def load_pkl_files(dsm_prefix):
    """
    Load the space from either a single pkl file or numerous files.
    :param dsm_prefix: prefix of the saved space files
    """

    # Check whether there is a single pickle file for the Space object
    if os.path.isfile(dsm_prefix + '.pkl'):
        return io_utils.load(dsm_prefix + '.pkl')

    # Load the multiple files: npz for the matrix and pkl for the other data members of Space
    with np.load(dsm_prefix + 'cooc.npz') as loader:
        coo = coo_matrix((loader['data'], (loader['row'], loader['col'])), shape=loader['shape'])

    cooccurrence_matrix = SparseMatrix(csr_matrix(coo))

    with open(dsm_prefix + '_row2id.pkl', 'rb') as f_in:
        row2id = pickle.load(f_in)

    with open(dsm_prefix + '_id2row.pkl', 'rb') as f_in:
        id2row = pickle.load(f_in)

    with open(dsm_prefix + '_column2id.pkl', 'rb') as f_in:
        column2id = pickle.load(f_in)

    with open(dsm_prefix + '_id2column.pkl', 'rb') as f_in:
        id2column = pickle.load(f_in)

    return Space(cooccurrence_matrix, id2row, id2column, row2id=row2id, column2id=column2id)