Python scipy.sparse 模块,spdiags() 实例源码

我们从Python开源项目中,提取了以下30个代码示例,用于说明如何使用scipy.sparse.spdiags()

项目:discretize    作者:simpeg    | 项目源码 | 文件源码
def edgeCurl(self):
        """The edgeCurl property."""
        if self.nCy > 1:
            raise NotImplementedError('Edge curl not yet implemented for '
                                      'nCy > 1')
        if getattr(self, '_edgeCurl', None) is None:
            # 1D Difference matricies
            dr = sp.spdiags((np.ones((self.nCx+1, 1))*[-1, 1]).T, [-1, 0],
                            self.nCx, self.nCx, format="csr")
            dz = sp.spdiags((np.ones((self.nCz+1, 1))*[-1, 1]).T, [0, 1],
                            self.nCz, self.nCz+1, format="csr")

            # 2D Difference matricies
            Dr = sp.kron(sp.identity(self.nNz), dr)
            Dz = -sp.kron(dz, sp.identity(self.nCx))

            A = self.area
            E = self.edge
            # Edge curl operator
            self._edgeCurl = (utils.sdiag(1/A)*sp.vstack((Dz, Dr)) *
                              utils.sdiag(E))
        return self._edgeCurl
项目:2016CCF_BDCI_Sougou    作者:coderSkyChen    | 项目源码 | 文件源码
def get_term_topic(self, X):
        n_features = X.shape[1]
        id2word = self.vocabulary_
        word2topic = {}

        with open('word_topic.txt', 'r') as f:
            for line in f:
                strs = line.decode('utf-8').strip('\n').split('\t')
                word2topic[strs[0]] = strs[2]

        topic = np.zeros((len(id2word),))

        for i, key in enumerate(id2word):
            if key in word2topic:
                topic[id2word[key]] = word2topic[key]
            else:
                print key

        topic = preprocessing.MinMaxScaler().fit_transform(topic)
        # topic = sp.spdiags(topic, diags=0, m=n_features,
        #                    n=n_features, format='csr')
        return topic
项目:2016CCF-sougou    作者:prozhuchen    | 项目源码 | 文件源码
def get_term_topic(self, X):
        n_features = X.shape[1]
        id2word = self.vocabulary_
        word2topic = {}

        with open('word_topic.txt', 'r') as f:
            for line in f:
                strs = line.decode('utf-8').strip('\n').split('\t')
                word2topic[strs[0]] = strs[2]

        topic = np.zeros((len(id2word),))

        for i, key in enumerate(id2word):
            if key in word2topic:
                topic[id2word[key]] = word2topic[key]
            else:
                print key

        topic = preprocessing.MinMaxScaler().fit_transform(topic)
        # topic = sp.spdiags(topic, diags=0, m=n_features,
        #                    n=n_features, format='csr')
        return topic
项目:scipy-lecture-notes-zh-CN    作者:jayleicn    | 项目源码 | 文件源码
def sakurai(n):
    """ Example taken from
        T. Sakurai, H. Tadano, Y. Inadomi and U. Nagashima
        A moment-based method for large-scale generalized eigenvalue problems
        Appl. Num. Anal. Comp. Math. Vol. 1 No. 2 (2004) """

    A = sparse.eye( n, n )
    d0 = array(r_[5,6*ones(n-2),5])
    d1 = -4*ones(n)
    d2 =  ones(n)
    B = sparse.spdiags([d2,d1,d0,d1,d2],[-2,-1,0,1,2],n,n)

    k = arange(1,n+1)
    w_ex = sort(1./(16.*pow(cos(0.5*k*pi/(n+1)),4))) # exact eigenvalues

    return A,B, w_ex
项目:NewtonMultigrid    作者:amergl    | 项目源码 | 文件源码
def specificJacobi(ndofs,gamma,u):
    factor=-(ndofs+1)*(ndofs+1)#-1/h**2
    size=ndofs*ndofs
    if ndofs == 1:
        return sparse.csc_matrix([gamma * u[0] * exp(u[0]) + gamma*exp(u[0]) -4*factor])
    diag=zeros(size)
    for j in range(ndofs):
        for i in range(ndofs):
            diag[j*ndofs+i]= gamma * u[j*ndofs+i] * exp(u[j*ndofs+i]) + gamma*exp(u[j*ndofs+i])
    diag+=-4*factor
    data=[diag,[factor]*size,[factor]*size,[factor]*size,[factor]*size]
    offset=[0,-1,1,-ndofs,ndofs]
    M = sparse.spdiags(data,offset,size,size,format='lil')
    for i in range(1,ndofs):
        M[i*ndofs,i*ndofs-1]=M[i*ndofs-1,i*ndofs]=0
    return sparse.csc_matrix(M)
项目:semihin    作者:HKUST-KnowComp    | 项目源码 | 文件源码
def variantWalk(self):
        tick = time.time()

        alpha = self.param['alpha']
        n = self.graph.shape[0]
        c = self.Y.shape[1]
        nf = self.param['normalize_factor']

        data = (self.graph.sum(axis=0) + nf * np.ones(n)).ravel()
        Di = sparse.spdiags(data,0,n,n).tocsc()
        S_iter = (Di - alpha * self.graph).tocsc()

        F = np.zeros((n, c))
        for i in range(c):
            F[:, i], info = slin.cg(S_iter, self.Y[:, i], tol=1e-10)

        toc = time.time()

        self.ElapsedTime = toc - tick
        self.PredictedProbs = F
项目:SCaIP    作者:simonsfoundation    | 项目源码 | 文件源码
def contruct_ellipse_parallel(pars):

    Coor,cm,A_i,Vr,dims,dist,max_size,min_size,d=pars
    dist_cm = coo_matrix(np.hstack([Coor[c].reshape(-1, 1) - cm[k]
                                                for k, c in enumerate(['x', 'y', 'z'][:len(dims)])]))
    Vr.append(dist_cm.T * spdiags(A_i.toarray().squeeze(),
                                  0, d, d) * dist_cm / A_i.sum(axis=0))

    if np.sum(np.isnan(Vr)) > 0:
        raise Exception('You cannot pass empty (all zeros) components!')

    D, V = eig(Vr[-1])

    dkk = [np.min((max_size**2, np.max((min_size**2, dd.real)))) for dd in D]

    # search indexes for each component
    return np.sqrt(np.sum([(dist_cm * V[:, k])**2 / dkk[k] for k in range(len(dkk))], 0)) <= dist
#%% threshold_components
项目:2016_CCFsougou    作者:dhdsjy    | 项目源码 | 文件源码
def get_term_topic(self, X):
        n_features = X.shape[1]
        id2word = self.vocabulary_
        word2topic = {}

        with open('word_topic.txt', 'r') as f:
            for line in f:
                strs = line.decode('utf-8').strip('\n').split('\t')
                word2topic[strs[0]] = strs[2]

        topic = np.zeros((len(id2word),))

        for i, key in enumerate(id2word):
            if key in word2topic:
                topic[id2word[key]] = word2topic[key]
            else:
                print key

        topic = preprocessing.MinMaxScaler().fit_transform(topic)
        # topic = sp.spdiags(topic, diags=0, m=n_features,
        #                    n=n_features, format='csr')
        return topic
项目:2016_CCFsougou2    作者:dhdsjy    | 项目源码 | 文件源码
def get_term_topic(self, X):
        n_features = X.shape[1]
        id2word = self.vocabulary_
        word2topic = {}

        with open('word_topic.txt', 'r') as f:
            for line in f:
                strs = line.decode('utf-8').strip('\n').split('\t')
                word2topic[strs[0]] = strs[2]

        topic = np.zeros((len(id2word),))

        for i, key in enumerate(id2word):
            if key in word2topic:
                topic[id2word[key]] = word2topic[key]
            else:
                print key

        topic = preprocessing.MinMaxScaler().fit_transform(topic)
        # topic = sp.spdiags(topic, diags=0, m=n_features,
        #                    n=n_features, format='csr')
        return topic
项目:Parallel-SGD    作者:angadgill    | 项目源码 | 文件源码
def fit(self, X, y=None):
        """Learn the idf vector (global term weights)

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        """
        if not sp.issparse(X):
            X = sp.csc_matrix(X)
        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            idf = np.log(float(n_samples) / df) + 1.0
            self._idf_diag = sp.spdiags(idf,
                                        diags=0, m=n_features, n=n_features)

        return self
项目:genomedisco    作者:kundajelab    | 项目源码 | 文件源码
def sqrtvc(m):
    mup=m
    mdown=mup.transpose()
    mdown.setdiag(0)
    mtogether=mup+mdown
    sums_sq=np.sqrt(mtogether.sum(axis=1)) 
    D_sq = sps.spdiags(1.0/sums_sq.flatten(), [0], mtogether.get_shape()[0], mtogether.get_shape()[1], format='csr')
    return sps.triu(D_sq.dot(mtogether.dot(D_sq)))
项目:genomedisco    作者:kundajelab    | 项目源码 | 文件源码
def hichip_add_diagonal(m):
    mup=m
    mdown=mup.transpose()
    mdown.setdiag(0)
    mtogether=mup+mdown
    sums=mtogether.sum(axis=1)
    max_sum=np.max(sums)
    to_add=1.0*max_sum-1.0*sums
    to_add_values=[]
    for i in range(m.shape[0]):
        to_add_values.append(to_add[i,0])
    mtogether.setdiag(np.array(to_add_values))
    D = sps.spdiags(1.0/sums.flatten(), [0], mtogether.get_shape()[0], mtogether.get_shape()[1], format='csr')
    return sps.triu(D.dot(mtogether))
项目:genomedisco    作者:kundajelab    | 项目源码 | 文件源码
def coverage_norm(m):
    mup=m
    mdown=mup.transpose()
    mdown.setdiag(0)
    mtogether=mup+mdown
    sums=mtogether.sum(axis=1)
    D = sps.spdiags(1.0/sums.flatten(), [0], mtogether.get_shape()[0], mtogether.get_shape()[1], format='csr')
    return sps.triu(D.dot(mtogether.dot(D)))

#assumes matrix is upper triangular
项目:discretize    作者:simpeg    | 项目源码 | 文件源码
def sdiag(h):
    """Sparse diagonal matrix"""
    if isinstance(h, Zero):
        return Zero()

    return sp.spdiags(mkvc(h), 0, h.size, h.size, format="csr")
项目:discretize    作者:simpeg    | 项目源码 | 文件源码
def ddx(n):
    """Define 1D derivatives, inner, this means we go from n+1 to n"""
    return sp.spdiags(
        (np.ones((n+1, 1))*[-1, 1]).T, [0, 1], n, n+1,
        format="csr"
    )
项目:discretize    作者:simpeg    | 项目源码 | 文件源码
def av(n):
    """Define 1D averaging operator from nodes to cell-centers."""
    return sp.spdiags(
        (0.5*np.ones((n+1, 1))*[1, 1]).T, [0, 1], n, n+1,
        format="csr"
    )
项目:NewtonMultigrid    作者:amergl    | 项目源码 | 文件源码
def __init__(self, A, omega, *args, **kwargs):
        """Initialization routine for the smoother

        Args:
            A (scipy.sparse.csc_matrix): sparse matrix A of the system to solve
            omega (float): a weighting factor
            *args: Variable length argument list
            **kwargs: Arbitrary keyword arguments
        """
        super(WeightedJacobi, self).__init__(A, *args, **kwargs)

        self.P = sp.spdiags(self.A.diagonal(), 0, self.A.shape[0], self.A.shape[1],
                            format='csc')
        # precompute inverse of the preconditioner for later usage
        self.Pinv = omega * spLA.inv(self.P)
项目:NewtonMultigrid    作者:amergl    | 项目源码 | 文件源码
def __get_partial_matrix(self):
        factor=1./(self.dx*self.dx)
        size=self.ndofs*self.ndofs
        diag=np.zeros(size)
        for j in range(self.ndofs):
            for i in range(self.ndofs):
                diag[j*self.ndofs+i]=4.*factor
        data=[diag,[-factor]*size,[-factor]*size,[-factor]*size,[-factor]*size]
        offset=[0,-1,1,-self.ndofs,self.ndofs]
        M=sp.spdiags(data,offset,size,size,format='lil')
        for i in range(1,self.ndofs):
            M[i*self.ndofs,i*self.ndofs-1]=M[i*self.ndofs-1,i*self.ndofs]=0
        return M
项目:NewtonMultigrid    作者:amergl    | 项目源码 | 文件源码
def __get_system_matrix(ndofs):
        """Helper routine to get the system matrix discretizing :math:`-Delta` with second order FD

        Args:
            ndofs (int): number of inner grid points (no boundaries!)
        Returns:
            scipy.sparse.csc_matrix: sparse system matrix A
                of size :attr:`ndofs` x :attr:`ndofs`
        """
        data = np.array([[2]*ndofs, [-1]*ndofs, [-1]*ndofs])
        diags = np.array([0, -1, 1])
        return sp.spdiags(data, diags, ndofs, ndofs, format='csc')
项目:NewtonMultigrid    作者:amergl    | 项目源码 | 文件源码
def __get_partial_matrix(self):
        factor=1./(self.dx*self.dx)
        size=self.ndofs*self.ndofs
        diag=np.zeros(size)
        for j in range(self.ndofs):
            for i in range(self.ndofs):
                diag[j*self.ndofs+i]=4.*factor
        data=[diag,[-factor]*size,[-factor]*size,[-factor]*size,[-factor]*size]
        offset=[0,-1,1,-self.ndofs,self.ndofs]
        M=sp.spdiags(data,offset,size,size,format='lil')
        for i in range(1,self.ndofs):
            M[i*self.ndofs,i*self.ndofs-1]=M[i*self.ndofs-1,i*self.ndofs]=0
        return M
项目:SCaIP    作者:simonsfoundation    | 项目源码 | 文件源码
def order_components(A, C):
    """Order components based on their maximum temporal value and size

    Parameters
    -----------
    A:   sparse matrix (d x K)
         spatial components
    C:   matrix or np.ndarray (K x T)
         temporal components

    Returns
    -------
    A_or:  np.ndarray
        ordered spatial components
    C_or:  np.ndarray
        ordered temporal components
    srt:   np.ndarray
        sorting mapping

    """
    A = np.array(A.todense())
    nA2 = np.sqrt(np.sum(A**2, axis=0))
    K = len(nA2)
    A = np.array(np.matrix(A) * spdiags(1 / nA2, 0, K, K))
    nA4 = np.sum(A**4, axis=0)**0.25
    C = np.array(spdiags(nA2, 0, K, K) * np.matrix(C))
    mC = np.ndarray.max(np.array(C), axis=1)
    srt = np.argsort(nA4 * mC)[::-1]
    A_or = A[:, srt] * spdiags(nA2[srt], 0, K, K)
    C_or = spdiags(1. / nA2[srt], 0, K, K) * (C[srt, :])

    return A_or, C_or, srt
项目:pylmnn    作者:johny-c    | 项目源码 | 文件源码
def sum_outer_products(X, weights, remove_zero=False):
    """Computes the sum of weighted outer products using a sparse weights matrix

    Parameters
    ----------
    X : array_like
        An array of data samples with shape (n_samples, n_features_in).
    weights : csr_matrix
        A sparse weights matrix (indicating target neighbors) with shape (n_samples, n_samples).
    remove_zero : bool
        Whether to remove rows and columns of the symmetrized weights matrix that are zero (default: False).

    Returns
    -------
    array_like
        An array with the sum of all weighted outer products with shape (n_features_in, n_features_in).

    """
    weights_sym = weights + weights.T
    if remove_zero:
        _, cols = weights_sym.nonzero()
        ind = np.unique(cols)
        weights_sym = weights_sym.tocsc()[:, ind].tocsr()[ind, :]
        X = X[ind]

    n = weights_sym.shape[0]
    diag = sparse.spdiags(weights_sym.sum(axis=0), 0, n, n)
    laplacian = diag.tocsr() - weights_sym
    sodw = X.T.dot(laplacian.dot(X))

    return sodw
项目:PyPardisoProject    作者:haasad    | 项目源码 | 文件源码
def create_test_A_b_small(matrix=False, sort_indices=True):
    """
    --- A ---
    scipy.sparse.csr.csr_matrix, float64
    matrix([[5, 1, 0, 0, 0],
            [0, 6, 2, 0, 0],
            [0, 0, 7, 3, 0],
            [0, 0, 0, 8, 4],
            [0, 0, 0, 0, 9]])
    --- b ---
    np.ndarray, float64
    array([[ 1],
           [ 4],
           [ 7],
           [10],
           [13]])
    or       
    array([[ 1,  2,  3],
           [ 4,  5,  6],
           [ 7,  8,  9],
           [10, 11, 12],
           [13, 14, 15]])

    """
    A = sp.spdiags(np.arange(10, dtype=np.float64).reshape(2,5), [1, 0], 5, 5, format='csr')
    if sort_indices:
        A.sort_indices()
    b = np.arange(1,16, dtype=np.float64).reshape(5,3)
    if matrix:
        return A, b
    else:
        return A, b[:,[0]]
项目:quality-content-synthesizer    作者:pratulyab    | 项目源码 | 文件源码
def fit(self, raw_documents, y=None):
        """Learn vocabulary and log-entropy from training set.
        Parameters
        ----------
        raw_documents : iterable
            an iterable which yields either str, unicode or file objects
        Returns
        -------
        self : LogEntropyVectorizer
        """
        X = super(LogEntropyVectorizer, self).fit_transform(raw_documents)

        n_samples, n_features = X.shape
        gf = np.ravel(X.sum(axis=0)) # count total number of each words

        if self.smooth_idf:
            n_samples += int(self.smooth_idf)
            gf += int(self.smooth_idf)

        P = (X * sp.spdiags(1./gf, diags=0, m=n_features, n=n_features)) # probability of word occurence
        p = P.data
        P.data = (p * np.log2(p) / np.log2(n_samples))
        g = 1 + np.ravel(P.sum(axis=0))
        f = np.log2(1 + X.data)
        X.data = f
        # global weights
        self._G = sp.spdiags(g, diags=0, m=n_features, n=n_features)
        return self
项目:discretize    作者:simpeg    | 项目源码 | 文件源码
def ddxCellGrad(n, bc):
    """
        Create 1D derivative operator from cell-centers to nodes this means we
        go from n to n+1

        For Cell-Centered **Dirichlet**, use a ghost point::

            (u_1 - u_g)/hf = grad

                u_g       u_1      u_2
                 *    |    *   |    *     ...
                      ^
                      0

            u_g = - u_1
            grad = 2*u1/dx
            negitive on the other side.

        For Cell-Centered **Neumann**, use a ghost point::

            (u_1 - u_g)/hf = 0

                u_g       u_1      u_2
                 *    |    *   |    *     ...

            u_g = u_1
            grad = 0;  put a zero in.

    """
    bc = checkBC(bc)

    D = sp.spdiags((np.ones((n+1, 1))*[-1, 1]).T, [-1, 0], n+1, n,
                   format="csr")
    # Set the first side
    if(bc[0] == 'dirichlet'):
        D[0, 0] = 2
    elif(bc[0] == 'neumann'):
        D[0, 0] = 0
    # Set the second side
    if(bc[1] == 'dirichlet'):
        D[-1, -1] = -2
    elif(bc[1] == 'neumann'):
        D[-1, -1] = 0
    return D
项目:PorousMediaLab    作者:biogeochemistry    | 项目源码 | 文件源码
def create_template_AL_AR(phi, diff_coef, adv_coef, bc_top_type, bc_bot_type, dt, dx, N):
    """ creates 2 matrices for transport equation AL and AR

    Args:
        phi (TYPE): vector of porosity(phi) or 1-phi
        diff_coef (float): diffusion coefficient
        adv_coef (float): advection coefficient
        bc_top_type (string): type of boundary condition
        bc_bot_type (string): type of boundary condition
        dt (float): time step
        dx (float): spatial step
        N (int): size of mesh

    Returns:
        array: AL and AR matrices
    """
    # TODO: error source somewhere in non constant
    # porosity profile. Maybe we also need d phi/dx
    s = phi * diff_coef * dt / dx / dx
    q = phi * adv_coef * dt / dx
    AL = spdiags([-s / 2 - q / 4, phi + s, -s / 2 + q / 4],
                 [-1, 0, 1], N, N, format='csr')  # .toarray()
    AR = spdiags([s / 2 + q / 4, phi - s, s / 2 - q / 4],
                 [-1, 0, 1], N, N, format='csr')  # .toarray()

    if bc_top_type in ['dirichlet', 'constant']:
        AL[0, 0] = phi[0]
        AL[0, 1] = 0
        AR[0, 0] = phi[0]
        AR[0, 1] = 0
    elif bc_top_type in ['neumann', 'flux']:
        AL[0, 0] = phi[0] + s[0]  # + adv_coef * s[0] * dx / diff_coef] - q[0] * adv_coef * dx / diff_coef] / 2
        AL[0, 1] = -s[0]
        AR[0, 0] = phi[0] - s[0]  # - adv_coef * s[0] * dx / diff_coef] + q[0] * adv_coef * dx / diff_coef] / 2
        AR[0, 1] = s[0]
    else:
        print('\nABORT!!!: Not correct top boundary condition type...')
        sys.exit()

    if bc_bot_type in ['dirichlet', 'constant']:
        AL[-1, -1] = phi[-1]
        AL[-1, -2] = 0
        AR[-1, -1] = phi[-1]
        AR[-1, -2] = 0
    elif bc_bot_type in ['neumann', 'flux']:
        AL[-1, -1] = phi[-1] + s[-1]
        AL[-1, -2] = -s[-1]  # / 2 - s[-1] / 2
        AR[-1, -1] = phi[-1] - s[-1]
        AR[-1, -2] = s[-1]  # / 2 + s[-1] / 2
    else:
        print('\nABORT!!!: Not correct bottom boundary condition type...')
        sys.exit()
    return AL, AR
项目:2016CCF_BDCI_Sougou    作者:coderSkyChen    | 项目源码 | 文件源码
def fit(self, X, y, termTopic=None):
        """Learn the idf vector (global term weights)

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        """
        # todo http://nlpr-web.ia.ac.cn/cip/proceedings/klchen.pdf
        # compute the normalized var
        if y is not None:
            aX = X
            m = len(np.unique(y))
            p = np.zeros((m, aX.shape[1]))

            for j in range(np.min(y), m + np.min(y)):
                w = aX[y == j, :]
                tij = np.sum(w, axis=0)
                lj = np.sum(tij)
                p[j - np.min(y), :] = tij * 1.0 / lj

            ave_p = np.sum(p, axis=0) * 1.0 / m

            new_var = np.sqrt(np.sqrt(np.sum((p - ave_p) ** 2, axis=0)) * 1.0 / np.sum(p, axis=0))

        if not sp.issparse(X):
            X = sp.csc_matrix(X)

        if self.use_idf:
            n_samples, n_features = X.shape

            df = _document_frequency(X)
            # the number of all words
            whole_df = np.sum(df)

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            idf = np.log(whole_df * 1.0 / df * 1.0)

            idf = idf * new_var

            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                        n=n_features, format='csr')

        return self
项目:2016CCF-sougou    作者:prozhuchen    | 项目源码 | 文件源码
def fit(self, X, y, termTopic=None):
        """Learn the idf vector (global term weights)

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        """
        # todo http://nlpr-web.ia.ac.cn/cip/proceedings/klchen.pdf
        # compute the normalized var
        if y is not None:
            aX = X
            m = len(np.unique(y))
            p = np.zeros((m, aX.shape[1]))

            for j in range(np.min(y), m + np.min(y)):
                w = aX[y == j, :]
                tij = np.sum(w, axis=0)
                lj = np.sum(tij)
                p[j - np.min(y), :] = tij * 1.0 / lj

            ave_p = np.sum(p, axis=0) * 1.0 / m

            new_var = np.sqrt(np.sqrt(np.sum((p - ave_p) ** 2, axis=0)) * 1.0 / np.sum(p, axis=0))

        if not sp.issparse(X):
            X = sp.csc_matrix(X)

        if self.use_idf:
            n_samples, n_features = X.shape

            df = _document_frequency(X)
            # the number of all words
            whole_df = np.sum(df)

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            idf = np.log(whole_df * 1.0 / df * 1.0)

            idf = idf * new_var

            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                        n=n_features, format='csr')

        return self
项目:2016_CCFsougou    作者:dhdsjy    | 项目源码 | 文件源码
def fit(self, X, y, termTopic=None):
        """Learn the idf vector (global term weights)

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        """
        # todo http://nlpr-web.ia.ac.cn/cip/proceedings/klchen.pdf
        # compute the normalized var
        if y is not None:
            aX = X
            m = len(np.unique(y))
            p = np.zeros((m, aX.shape[1]))

            for j in range(np.min(y), m + np.min(y)):
                w = aX[y == j, :]
                tij = np.sum(w, axis=0)
                lj = np.sum(tij)
                p[j - np.min(y), :] = tij * 1.0 / lj

            ave_p = np.sum(p, axis=0) * 1.0 / m

            new_var = np.sqrt(np.sqrt(np.sum((p - ave_p) ** 2, axis=0)) * 1.0 / np.sum(p, axis=0))

        if not sp.issparse(X):
            X = sp.csc_matrix(X)

        if self.use_idf:
            n_samples, n_features = X.shape

            df = _document_frequency(X)
            # the number of all words
            whole_df = np.sum(df)

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            idf = np.log(whole_df * 1.0 / df * 1.0)

            idf = idf * new_var

            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                        n=n_features, format='csr')

        return self
项目:2016_CCFsougou2    作者:dhdsjy    | 项目源码 | 文件源码
def fit(self, X, y, termTopic=None):
        """Learn the idf vector (global term weights)

        Parameters
        ----------
        X : sparse matrix, [n_samples, n_features]
            a matrix of term/token counts
        """
        # todo http://nlpr-web.ia.ac.cn/cip/proceedings/klchen.pdf
        # compute the normalized var
        if y is not None:
            aX = X
            m = len(np.unique(y))
            p = np.zeros((m, aX.shape[1]))

            for j in range(np.min(y), m + np.min(y)):
                w = aX[y == j, :]
                tij = np.sum(w, axis=0)
                lj = np.sum(tij)
                p[j - np.min(y), :] = tij * 1.0 / lj

            ave_p = np.sum(p, axis=0) * 1.0 / m

            new_var = np.sqrt(np.sqrt(np.sum((p - ave_p) ** 2, axis=0)) * 1.0 / np.sum(p, axis=0))

        if not sp.issparse(X):
            X = sp.csc_matrix(X)

        if self.use_idf:
            n_samples, n_features = X.shape

            df = _document_frequency(X)
            # the number of all words
            whole_df = np.sum(df)

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            idf = np.log(whole_df * 1.0 / df * 1.0)

            idf = idf * new_var

            self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                        n=n_features, format='csr')

        return self