def scipy2sparse(vec, eps=1e-9):
    """
    Convert a scipy.sparse vector into gensim document format (=list of 2-tuples).

    The input is converted to CSR and must have exactly one row (checked with
    an ``assert``). Each stored entry becomes an ``(int index, float value)``
    pair; entries whose absolute value is not greater than `eps` are dropped.
    """
    row = vec.tocsr()
    assert row.shape[0] == 1

    document = []
    for index, weight in zip(row.indices, row.data):
        # Skip values that are numerically indistinguishable from zero.
        if numpy.abs(weight) > eps:
            document.append((int(index), float(weight)))
    return document
def __init__(self, sparse, documents_columns=True):
    """
    Store `sparse` on ``self.sparse`` with documents laid out as columns.

    When `documents_columns` is truthy the matrix is kept as given and only
    converted to CSC. Otherwise it is converted to CSR and transposed, so
    that shape[1] is the number of documents (the original note on this line
    says ``len()`` needs that).
    """
    self.sparse = sparse.tocsc() if documents_columns else sparse.tocsr().T
def unitvec(vec):
    """
    Scale a vector to unit (Euclidean) length.

    The only exception is the zero vector, which is returned back unchanged.
    Output will be in the same format as input (i.e., gensim vector=>gensim
    vector, or numpy array=>numpy array, scipy.sparse=>scipy.sparse).

    Handling per input kind:

    * scipy.sparse: converted to CSR, then divided by its L2 norm.
    * numpy.ndarray: converted to a float array; the norm and the scaling are
      delegated to the `blas_nrm2` / `blas_scal` helpers defined elsewhere.
    * empty iterable, or an object that is not iterable at all: returned
      unchanged.
    * iterable whose first item is a 2-item tuple/list (gensim sparse
      format): returned as a new list of ``(termid, value / length)`` pairs.

    Raises:
        ValueError: if the input is a non-empty iterable whose first item is
            not a 2-item tuple/list.
        AssertionError: if a gensim-format vector has zero length, i.e. it
            contains only explicit zero entries.
    """
    if scipy.sparse.issparse(vec):
        # CSR keeps all stored values in the flat `.data` array.
        vec = vec.tocsr()
        veclen = numpy.sqrt(numpy.sum(vec.data ** 2))
        if veclen > 0.0:
            return vec / veclen
        return vec

    if isinstance(vec, numpy.ndarray):
        vec = numpy.asarray(vec, dtype=float)
        veclen = blas_nrm2(vec)
        if veclen > 0.0:
            return blas_scal(1.0 / veclen, vec)
        return vec

    try:
        first = next(iter(vec))  # is there at least one element?
    except (StopIteration, TypeError):
        # StopIteration: empty input; TypeError: not iterable at all.
        # Deliberately narrower than a bare `except`, so KeyboardInterrupt
        # and SystemExit are no longer swallowed here.
        return vec

    if isinstance(first, (tuple, list)) and len(first) == 2:  # gensim sparse format?
        length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec))
        assert length > 0.0, "sparse documents must not contain any explicit zero entries"
        if length != 1.0:
            return [(termid, val / length) for termid, val in vec]
        return list(vec)

    raise ValueError("unknown input type")
def scipy2sparse(vec, eps=1e-9):
    """
    Convert a scipy.sparse vector into gensim document format (=list of 2-tuples).

    `vec` is converted to CSR and asserted to have a single row. The result
    lists ``(int position, float value)`` for every stored entry whose
    magnitude exceeds `eps`.
    """
    csr = vec.tocsr()
    assert csr.shape[0] == 1

    stored = zip(csr.indices, csr.data)
    # Lazily drop near-zero entries before converting to plain Python types.
    significant = ((termid, weight) for termid, weight in stored if np.abs(weight) > eps)
    return [(int(termid), float(weight)) for termid, weight in significant]
def scipy2sparse(vec, eps=1e-9):
    """
    Convert a scipy.sparse vector into document format (=list of 2-tuples).

    The vector is converted to CSR and must consist of exactly one row (an
    ``assert`` enforces this). Stored entries with ``abs(value) <= eps`` are
    omitted; the rest are returned as ``(int, float)`` pairs.
    """
    single_row = vec.tocsr()
    assert single_row.shape[0] == 1

    def exceeds_eps(entry):
        # entry is an (index, value) pair taken from the CSR arrays
        return numpy.abs(entry[1]) > eps

    kept = filter(exceeds_eps, zip(single_row.indices, single_row.data))
    return [(int(column), float(value)) for column, value in kept]
def unitvec(vec, norm='l2'):
    """
    Scale a vector to unit length.

    The only exception is the zero vector, which is returned back unchanged.
    Output will be in the same format as input (i.e., gensim vector=>gensim
    vector, or np array=>np array, scipy.sparse=>scipy.sparse).

    `norm` selects the length used for scaling: ``'l1'`` (sum of absolute
    values) or ``'l2'`` (Euclidean length, the default).

    Handling per input kind:

    * scipy.sparse: converted to CSR, then divided by the chosen norm.
    * np.ndarray: converted to a float array; the L2 norm and the scaling are
      delegated to the `blas_nrm2` / `blas_scal` helpers defined elsewhere.
    * empty iterable, or an object that is not iterable at all: returned
      unchanged.
    * iterable whose first item is a 2-item tuple/list (gensim sparse
      format): the result of ``ret_normalized_vec(vec, length)`` is returned
      (helper defined elsewhere; presumably it divides every value by
      `length` -- confirm against its definition).

    Raises:
        ValueError: if `norm` is not ``'l1'`` or ``'l2'``, or if the input is
            a non-empty iterable whose first item is not a 2-item tuple/list.
        AssertionError: if a gensim-format vector has zero length, i.e. it
            contains only explicit zero entries.
    """
    if norm not in ('l1', 'l2'):
        raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'." % norm)

    if scipy.sparse.issparse(vec):
        # CSR keeps all stored values in the flat `.data` array.
        vec = vec.tocsr()
        if norm == 'l1':
            veclen = np.sum(np.abs(vec.data))
        else:  # 'l2' -- norm was validated above
            veclen = np.sqrt(np.sum(vec.data ** 2))
        if veclen > 0.0:
            return vec / veclen
        return vec

    if isinstance(vec, np.ndarray):
        vec = np.asarray(vec, dtype=float)
        if norm == 'l1':
            veclen = np.sum(np.abs(vec))
        else:  # 'l2'
            veclen = blas_nrm2(vec)
        if veclen > 0.0:
            return blas_scal(1.0 / veclen, vec)
        return vec

    try:
        first = next(iter(vec))  # is there at least one element?
    except (StopIteration, TypeError):
        # StopIteration: empty input; TypeError: not iterable at all.
        # Deliberately narrower than a bare `except`, so KeyboardInterrupt
        # and SystemExit are no longer swallowed here.
        return vec

    if isinstance(first, (tuple, list)) and len(first) == 2:  # gensim sparse format
        if norm == 'l1':
            length = float(sum(abs(val) for _, val in vec))
        else:  # 'l2'
            length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec))
        assert length > 0.0, "sparse documents must not contain any explicit zero entries"
        return ret_normalized_vec(vec, length)

    raise ValueError("unknown input type")
def unitvec(vec, norm='l2'):
    """
    Scale a vector to unit length.

    The only exception is the zero vector, which is returned back unchanged.
    Output will be in the same format as input.

    `norm` selects the length used for scaling: ``'l1'`` (sum of absolute
    values) or ``'l2'`` (Euclidean length, the default).

    Handling per input kind:

    * scipy.sparse: converted to CSR, then divided by the chosen norm.
    * numpy.ndarray: converted to a float array; the L2 norm and the scaling
      are delegated to the `blas_nrm2` / `blas_scal` helpers defined
      elsewhere.
    * empty iterable, or an object that is not iterable at all: returned
      unchanged.
    * iterable whose first item is a 2-item tuple/list: the result of
      ``ret_normalized_vec(vec, length)`` is returned (helper defined
      elsewhere; presumably it divides every value by `length` -- confirm
      against its definition).

    Raises:
        ValueError: if `norm` is not ``'l1'`` or ``'l2'``, or if the input is
            a non-empty iterable whose first item is not a 2-item tuple/list.
        AssertionError: if a list-of-pairs vector has zero length, i.e. it
            contains only explicit zero entries.
    """
    if norm not in ('l1', 'l2'):
        raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'." % norm)

    if scipy.sparse.issparse(vec):
        # CSR keeps all stored values in the flat `.data` array.
        vec = vec.tocsr()
        if norm == 'l1':
            veclen = numpy.sum(numpy.abs(vec.data))
        else:  # 'l2' -- norm was validated above
            veclen = numpy.sqrt(numpy.sum(vec.data ** 2))
        if veclen > 0.0:
            return vec / veclen
        return vec

    if isinstance(vec, numpy.ndarray):
        vec = numpy.asarray(vec, dtype=float)
        if norm == 'l1':
            veclen = numpy.sum(numpy.abs(vec))
        else:  # 'l2'
            veclen = blas_nrm2(vec)
        if veclen > 0.0:
            return blas_scal(1.0 / veclen, vec)
        return vec

    try:
        first = next(iter(vec))  # is there at least one element?
    except (StopIteration, TypeError):
        # StopIteration: empty input; TypeError: not iterable at all.
        # Deliberately narrower than a bare `except`, so KeyboardInterrupt
        # and SystemExit are no longer swallowed here.
        return vec

    if isinstance(first, (tuple, list)) and len(first) == 2:
        if norm == 'l1':
            length = float(sum(abs(val) for _, val in vec))
        else:  # 'l2'
            length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec))
        assert length > 0.0, "sparse documents must not contain any explicit zero entries"
        return ret_normalized_vec(vec, length)

    raise ValueError("unknown input type")