Python scipy.io module: mmread() example source code

The following 12 code examples, extracted from open-source Python projects, illustrate how to use scipy.io.mmread().
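Before the project examples, here is a minimal, self-contained round-trip sketch (the file name 'data.mtx' is only illustrative). For a sparse Matrix Market file, mmread() returns a scipy.sparse.coo_matrix; for a dense array file it returns a plain numpy.ndarray.

import numpy as np
from scipy.io import mmread, mmwrite
from scipy.sparse import coo_matrix, csr_matrix

# Write a small sparse matrix in Matrix Market format.
A = coo_matrix(np.array([[1.0, 0.0], [0.0, 2.0]]))
mmwrite('data.mtx', A)

# Read it back; convert the COO result to CSR for row slicing and arithmetic.
B = csr_matrix(mmread('data.mtx'))
assert (B.toarray() == A.toarray()).all()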

Project: corporadb    Author: nlesc-sherlock    | project source | file source
from pyspark.sql import Row
from pyspark.ml.clustering import LDA
from scipy.io import mmread, mmwrite
import numpy as np

# Assumes a SparkContext `sc`, a SparkSession `spark`, and the project's
# sparkToScipySparse() helper are defined elsewhere in the module.
def trainModel(docMatrix, savemodel, k, iterations=10, parallelization=16):
    data = mmread(docMatrix)
    rowRange = sc.parallelize(range(data.shape[0]), parallelization)
    dataSpark = spark.createDataFrame(rowRange
            .map(lambda i: Row(label=i, features=sparkToScipySparse(data.getrow(i)))))
    lda = LDA(k=k, maxIter=iterations)
    model = lda.fit(dataSpark)
    model.save(savemodel)

    topicMatrix = model.topicsMatrix().toArray()
    topicMatrix = topicMatrix.T
    topicMatrix = topicMatrix / topicMatrix.sum(axis=0)
    print('TODO: give wordXtopic.mtx a path')
    mmwrite('wordXtopic.mtx', topicMatrix)

    print('TODO: give docXtopic.mtx a path')
    docXTopics = model.transform(dataSpark)
    dxT = docXTopics.collect()
    dxT_v2 = np.array([dxtI['topicDistribution'] for dxtI in dxT])
    mmwrite('docXtopic.mtx', dxT_v2)

Project: corporadb    Author: nlesc-sherlock    | project source | file source
import numpy
from scipy.io import mmread

# Assumes CorporaDataSet and the module-level result keys (topic_word_key,
# topic_doc_key, doc_length_key, vocabulary_key, word_freq_key) are defined
# elsewhere in the project.
def get_data(setname):
    dataset = CorporaDataSet(setname)
#    topic_word_array = dataset.getWordsInTopicMatrix()
#    topic_doc_array = dataset.getDocsInTopicMatrix()
    topic_word_array = dataset.getDocsInTopicMatrix()
    topic_doc_array = dataset.getWordsInTopicMatrix().T
    doc_length_array = numpy.full([topic_doc_array.shape[0]], 1)
    vocabulary = dataset.loadVocabulary()[0].keys()
    print("topic word array shape: ", topic_word_array.shape)
    print("topic doc shape: ", topic_doc_array.shape)
    print("vocabulary: ", len(vocabulary))
    wordfreqs = mmread(setname + ".mtx").sum(1)
    word_freq_array = numpy.array(wordfreqs)[:, 0]

    return {topic_word_key: topic_word_array,
            topic_doc_key: topic_doc_array,
            doc_length_key: doc_length_array,
            vocabulary_key: vocabulary,
            word_freq_key: word_freq_array}
Project: cellranger    Author: 10XGenomics    | project source | file source
# Assumes os, pandas as pd, scipy.io as sp_io, cr_constants and GeneBCMatrix
# are imported/defined elsewhere in the module.
def load_mtx(genome_dir):
    barcodes_tsv = os.path.join(genome_dir, "barcodes.tsv")
    genes_tsv = os.path.join(genome_dir, "genes.tsv")
    matrix_mtx = os.path.join(genome_dir, "matrix.mtx")
    for filepath in [barcodes_tsv, genes_tsv, matrix_mtx]:
        if not os.path.exists(filepath):
            raise IOError("Required file not found: %s" % filepath)
    barcodes = pd.read_csv(barcodes_tsv, delimiter='\t', header=None, usecols=[0]).values.squeeze()
    genes = pd.read_csv(genes_tsv, delimiter='\t', header=None, usecols=[0]).values.squeeze()
    genes = [cr_constants.Gene(gene_id, None, None, None, None) for gene_id in genes]
    matrix = sp_io.mmread(matrix_mtx)
    gbm = GeneBCMatrix(genes, barcodes)
    gbm.m = matrix
    return gbm
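A typical call, assuming a cellranger per-genome matrix folder (the path is illustrative):

gbm = load_mtx('/path/to/filtered_gene_bc_matrices/GRCh38')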
Project: corporadb    Author: nlesc-sherlock    | project source | file source
def getWordsInTopicMatrix(self):
    wxt = mmread(self.setname + '_wordXtopic.mtx')
    return wxt
Project: corporadb    Author: nlesc-sherlock    | project source | file source
def getDocsInTopicMatrix(self):
    dxt = mmread(self.setname + '_docXtopic.mtx')
    return dxt.T
Project: bedrock-core    Author: Bedrock-py    | project source | file source
from scipy.io import mmread
from scipy.sparse import csc_matrix

def load_sparse_matrix(filepath):
    return csc_matrix(mmread(filepath))
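The csc_matrix() wrapper is the design choice to note here: for a sparse Matrix Market file, mmread() returns a coo_matrix, which supports neither indexing nor efficient arithmetic, so converting once to CSC makes downstream column slicing and matrix products cheap.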
Project: reuters-docsim    Author: sujitpal    | project source | file source
import numpy as np
from scipy import io

def load_vectors(filename, is_sparse=True):
    if is_sparse:
        return io.mmread(filename)
    else:
        return np.loadtxt(filename, delimiter=",")
Project: Spam-Message-Classifier-sklearn    Author: ZPdesu    | project source | file source
# A method of the project's classifier class; self.training_data is a
# pymongo collection. Assumes `import json`, `import numpy as np`, and
# `from scipy import io` at module level.
def import_training_data(self, word_vector_file, train_label_file):
    self.training_data.delete_many({})
    self.training_data.create_index('training_num')
    word_vector = io.mmread(word_vector_file)
    vector = np.array(word_vector.todense())
    with open(train_label_file, 'r') as f:
        label = json.load(f)

    num = len(label)
    for i in range(num):
        dic = {}
        dic['training_num'] = i
        dic['vector'] = list(vector[i])
        dic['label'] = int(label[i])
        self.training_data.insert_one(dic)
Project: scanpy    Author: theislab    | project source | file source
def _read_mtx(filename, return_dict=True, dtype='float32'):
    """Read mtx file.
    """
    from scipy.io import mmread
    # could be rewritten accounting for dtype to be more performant
    X = mmread(filename).astype(dtype)
    from scipy.sparse import csr_matrix
    X = csr_matrix(X)
    logg.m('... did not find row_names or col_names')
    if return_dict:
        return {'X': X}
    else:
        return AnnData(X)
Project: tsnetwork    Author: HanKruiger    | project source | file source
import graph_tool.all as gt
from scipy.io import mmread


def load_mm(mm_file):
    # mmread() returns a coo_matrix for sparse Matrix Market files, which
    # exposes the explicit entries as the arrays .row, .col and .data.
    adj = mmread(mm_file)

    # The adjacency matrix must be square.
    assert adj.shape[0] == adj.shape[1]

    # Initialize graph
    g = gt.Graph(directed=False)

    edge_weight = g.edge_properties["weight"] = g.new_edge_property("double")

    # Create vertex for every row/column
    g.add_vertex(adj.shape[0])

    print('[graph_io] Reading matrix market file with {0} explicit elements...'.format(len(adj.data)))

    # Loop over all explicit elements in the sparse matrix
    for iteration, (i, j, w) in enumerate(zip(adj.row, adj.col, adj.data)):
        # Skip self-edges.
        if i == j:
            continue

        # Add edge to the graph, if its 'symmetric partner' is not already there.
        # (Undirected graph, so g.edge(i, j) == g.edge(j, i))
        e = g.edge(i, j)
        if e is None:
            e = g.add_edge(i, j)
        edge_weight[e] = w

        # Print progress every 5% (guard against tiny matrices, where
        # int(0.05 * len(adj.data)) would be 0 and divide by zero).
        if iteration % max(1, int(0.05 * len(adj.data))) == 0:
            perc = 100 * iteration / len(adj.data)
            print('[graph_io] {0:.1f}%'.format(perc), end='\r')
    print('\n[graph_io] Done!')
    return g
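The loop above works because mmread() hands the matrix back in COO format, which stores each explicit entry as a (row, column, value) triple; .row, .col and .data are plain arrays that can be zipped directly, with no format conversion needed.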


Project: anndata    Author: theislab    | project source | file source
def read_mtx(filename, dtype='float32'):
    """Read `.mtx` file.

    Returns
    -------
    An :class:`~anndata.AnnData` object.
    """
    from scipy.io import mmread
    # could be rewritten accounting for dtype to be more performant
    X = mmread(filename).astype(dtype)
    from scipy.sparse import csr_matrix
    X = csr_matrix(X)
    return AnnData(X)
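Usage is a one-liner; assuming a Matrix Market file 'matrix.mtx' on disk:

adata = read_mtx('matrix.mtx')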
Project: loompy    Author: linnarsson-lab    | project source | file source
# From the loompy package; assumes os, numpy as np, scipy.io's mmread and
# loompy's create() are imported at module level.
def create_from_cellranger(indir: str, outdir: str = None, genome: str = None) -> None:
    """
    Create a .loom file from 10X Genomics cellranger output

    Args:
        indir (str):    path to the cellranger output folder (the one that contains 'outs')
        outdir (str):   output folder where the new loom file should be saved (defaults to indir)
        genome (str):   genome build to load (e.g. 'mm10'; if None, take the first genome found in the outs folder)

    Returns:
        Nothing, but writes a new .loom file named <sampleid>.loom in outdir.
    """
    if outdir is None:
        outdir = indir
    sampleid = os.path.split(os.path.abspath(indir))[-1]
    matrix_folder = os.path.join(indir, 'outs', 'filtered_gene_bc_matrices')
    if genome is None:
        genome = [f for f in os.listdir(matrix_folder) if not f.startswith(".")][0]
    matrix_folder = os.path.join(matrix_folder, genome)
    matrix = mmread(os.path.join(matrix_folder, "matrix.mtx")).astype("float32").todense()

    with open(os.path.join(matrix_folder, "genes.tsv"), "r") as f:
        lines = f.readlines()
    accession = np.array([x.split("\t")[0] for x in lines]).astype("str")
    gene = np.array([x.split("\t")[1].strip() for x in lines]).astype("str")
    with open(os.path.join(matrix_folder, "barcodes.tsv"), "r") as f:
        lines = f.readlines()
    cellids = np.array([sampleid + ":" + x.strip() for x in lines]).astype("str")

    col_attrs = {"CellID": cellids}
    row_attrs = {"Accession": accession, "Gene": gene}

    tsne_file = os.path.join(indir, "outs", "analysis", "tsne", "projection.csv")
    # In cellranger V2 the file moved one level deeper
    if not os.path.exists(tsne_file):
        tsne_file = os.path.join(indir, "outs", "analysis", "tsne", "2_components", "projection.csv")
    if os.path.exists(tsne_file):
        tsne = np.loadtxt(tsne_file, usecols=(1, 2), delimiter=',', skiprows=1)
        col_attrs["X"] = tsne[:, 0].astype('float32')
        col_attrs["Y"] = tsne[:, 1].astype('float32')

    clusters_file = os.path.join(indir, "outs", "analysis", "clustering", "graphclust", "clusters.csv")
    if os.path.exists(clusters_file):
        labels = np.loadtxt(clusters_file, usecols=(1, ), delimiter=',', skiprows=1)
        col_attrs["ClusterID"] = labels.astype('int') - 1

    create(os.path.join(outdir, sampleid + ".loom"), matrix, row_attrs, col_attrs, file_attrs={"Genome": genome})
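A typical invocation, assuming cellranger output under '/data/sample1' (the path is illustrative); the sample id is taken from the folder name, so this writes /data/sample1/sample1.loom:

create_from_cellranger('/data/sample1')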