The following 21 code examples, extracted from open source Python projects, illustrate how to use Bio.SeqIO.index().
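Before the project examples, here is a minimal sketch (not taken from any of the projects below) of the basic pattern they all share: SeqIO.index() builds a read-only, dict-like index over a sequence file and parses records lazily on access, rather than loading everything into memory as SeqIO.to_dict() does. The file name 'example.fasta' and the record ID 'seq_1' are placeholders for illustration.

from Bio import SeqIO

# Build a lazy, dict-like index of a FASTA file (records are parsed on access).
# 'example.fasta' and 'seq_1' are placeholder names, not files from the examples below.
records = SeqIO.index('example.fasta', 'fasta')

print(len(records))          # number of records in the file
print(list(records)[:5])     # iterating the index yields record IDs (keys)

record = records['seq_1']    # random access by record ID; raises KeyError if absent
print(record.id, len(record.seq))

records.close()              # close the underlying file handle when done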
def get_evm_pr(evm_path,ref_fa,out_path):
    '''this function gets all evm proteins, writes them to files and merges them together
    * evm_path: evm path that has gff file
    * ref_fa: reference fa file
    * out_path: path to save all temporary files and final protein files
    '''
    if os.path.exists(out_path):
        shutil.rmtree(out_path)
    os.mkdir(out_path)
    os.chdir(out_path)
    evm_gff = evm_path + '/evm.merge.gff'
    gff_df = pd.read_csv(evm_gff,sep='\t',header=None)
    dic = SeqIO.index(ref_fa,'fasta')
    cds_df = gff_df[gff_df[2].values=='CDS']
    cds_df = cds_df.reset_index(drop=True)
    cds_df['rna_id'] = cds_df[8].map(lambda x: x.split(';')[1][7:])
    scaffolds = list(set(cds_df[0].tolist()))
    for scaff in scaffolds:
        output_cds(scaff,cds_df,dic)
    # merge files
    fns = natsorted(glob.glob('*.fa'))
    sarge.run('cat {fns} > {out}'.format(fns=' '.join(fns),out='pr_merge.fa'))
    for f in fns:
        os.remove(f)
def add_gene_function(blast_db,evm_path):
    '''add gene symbol to gff file. the information is from the blast results
    '''
    blastp_fn = blast_db + '/blastp.txt'
    blast_df = pd.read_csv(blastp_fn,sep='\t',usecols=[0,1,2],names=['ref','query','per'])
    blast_df = blast_df[blast_df['per'].values>50]
    blast_df['rna'] = blast_df['ref'].map(lambda x: '.'.join(x.split('.')[-2:]))
    blast_df['pr'] = blast_df['query'].map(lambda x: x.split('|')[-1].split('_')[0])
    rna_pr_dic = blast_df.set_index('rna')['pr'].to_dict()
    evm_gff = evm_path + '/evm.merge.gff'
    gff_df = pd.read_csv(evm_gff,sep='\t',header=None)
    gff_df[8] = gff_df[8].map(lambda x: add_gene_name(x,rna_pr_dic))
    gff_df = gff_df[~gff_df[8].map(lambda x: 'gene=LORF2' in x)]
    gff_df.to_csv(blast_db + '/final.gff',sep='\t',index=False)
# add_gene_function(blast_db,evm_path)
#===============================================================================
# process the gmap results and exonerate results directly
#===============================================================================
#=============== 1. get all mapped geneid, rna_accession, pr_accession
def fa2embl(fa,embl,gff,path):
    if not os.path.exists(path):
        os.mkdir(path)
    os.chdir(path)
    df = pd.read_csv(gff,sep='\t',header=None,comment='#',usecols=[0,2])
    df = df[df[2].values=='gene']
    chroms = list(set(df[0].tolist()))
    dic = SeqIO.index(fa,'fasta')
    for s in chroms:
        SeqIO.write(dic[s],open('fa','w'),'fasta')
        sarge.run('grep \'{s}\' {gff} > gff'.format(s=s,gff=gff))
        sarge.run('/home/shangzhong/Installation/EMBOSS-6.6.0/bin/seqret \
        -sequence fa -feature -fformat gff -fopenfile1 gff -osformat2 embl \
        -auto -outseq {s}.embl'.format(s=s))
    fns = glob.glob('*.embl')
    sarge.run('cat {files} > {embl}'.format(files=' '.join(fns),embl=embl))
    # for f in fns:
    #     os.remove(f)
# fa2embl('/data/genome/hamster/ncbi_refseq/hamster.fa','hamster.embl','/data/genome/hamster/ncbi_refseq/hamster.gff','/data/shangzhong/Picr_assembly/Annotation/RATT/embl')
def testNewAminoAcid(self):
    self.setUpPhylotyper(aa=True)

    # Set up subtype files
    build_pipeline(self.subtype_options, self.configObj)

    # Save setup
    self.subtypeOptionsObj.save()

    # Check output files
    filepaths = self.subtypeOptionsObj.get_subtype_config(self.scheme)

    fasta = SeqIO.index(filepaths['alignment'][0], 'fasta')
    with open(filepaths['subtype']) as f:
        for i, l in enumerate(f):
            pass
    sd = SeqDict()
    sd.load(filepaths['lookup'])

    n = 91
    self.assertTrue(all([len(fasta) == n, i+1 == n, len(sd.seqs) == n]))
def testNewDNA(self):
    self.setUpPhylotyper(aa=False)

    # Set up subtype files
    build_pipeline(self.subtype_options, self.configObj)

    # Save setup
    self.subtypeOptionsObj.save()

    # Check output files
    filepaths = self.subtypeOptionsObj.get_subtype_config(self.scheme)

    fasta = SeqIO.index(filepaths['alignment'][0], 'fasta')
    with open(filepaths['subtype']) as f:
        for i, l in enumerate(f):
            pass
    sd = SeqDict()
    sd.load(filepaths['lookup'])

    n = 120
    self.assertTrue(all([len(fasta) == n, i+1 == n, len(sd.seqs) == n]))
def _load_strain_sequences(self, strain_gempro):
    """Load strain sequences from the orthology matrix into the base model for comparisons,
    and into the strain-specific model itself.
    """
    if self._orthology_matrix_has_sequences:
        # Load directly from the orthology matrix if it contains sequences
        strain_sequences = self.df_orthology_matrix[strain_gempro.id].to_dict()
    else:
        # Otherwise load from the genome file if the orthology matrix contains gene IDs
        # Load the genome FASTA file
        log.debug('{}: loading strain genome CDS file'.format(strain_gempro.genome_path))
        strain_sequences = SeqIO.index(strain_gempro.genome_path, 'fasta')

    for strain_gene in strain_gempro.genes:
        if strain_gene.functional:
            if self._orthology_matrix_has_sequences:
                strain_gene_key = strain_gene.id
            else:
                # Pull the gene ID of the strain from the orthology matrix
                strain_gene_key = self.df_orthology_matrix.loc[strain_gene.id, strain_gempro.id]
                log.debug('{}: original gene ID to be pulled from strain fasta file'.format(strain_gene_key))

            # Load into the base strain for comparisons
            ref_gene = self.reference_gempro.genes.get_by_id(strain_gene.id)
            new_id = '{}_{}'.format(strain_gene.id, strain_gempro.id)
            if ref_gene.protein.sequences.has_id(new_id):
                log.debug('{}: sequence already loaded into reference model'.format(new_id))
                continue
            ref_gene.protein.load_manual_sequence(seq=strain_sequences[strain_gene_key], ident=new_id,
                                                  set_as_representative=False)
            log.debug('{}: loaded sequence into reference model'.format(new_id))

            # Load into the strain GEM-PRO
            strain_gene.protein.load_manual_sequence(seq=strain_sequences[strain_gene_key], ident=new_id,
                                                     set_as_representative=True)
            log.debug('{}: loaded sequence into strain model'.format(new_id))
def build_strain_specific_models(self, save_models=False):
    """Using the orthologous genes matrix, create and modify the strain specific models based on
    whether orthologous genes exist.

    Also store the sequences directly in the reference GEM-PRO protein sequence attribute for the strains.
    """
    if len(self.df_orthology_matrix) == 0:
        raise RuntimeError('Empty orthology matrix')

    # Create an emptied copy of the reference GEM-PRO
    for strain_gempro in tqdm(self.strains):
        log.debug('{}: building strain specific model'.format(strain_gempro.id))

        # For each genome, load the metabolic model or genes from the reference GEM-PRO
        logging.disable(logging.WARNING)
        if self._empty_reference_gempro.model:
            strain_gempro.load_cobra_model(self._empty_reference_gempro.model)
        elif self._empty_reference_gempro.genes:
            strain_gempro.genes = [x.id for x in self._empty_reference_gempro.genes]
        logging.disable(logging.NOTSET)

        # Get a list of genes which do not have orthology in the strain
        not_in_strain = self.df_orthology_matrix[pd.isnull(self.df_orthology_matrix[strain_gempro.id])][strain_gempro.id].index.tolist()

        # Mark genes non-functional
        self._pare_down_model(strain_gempro=strain_gempro, genes_to_remove=not_in_strain)

        # Load sequences into the base and strain models
        self._load_strain_sequences(strain_gempro=strain_gempro)

        if save_models:
            cobra.io.save_json_model(model=strain_gempro.model,
                                     filename=op.join(self.model_dir, '{}.json'.format(strain_gempro.id)))
            strain_gempro.save_pickle(op.join(self.model_dir, '{}_gp.pckl'.format(strain_gempro.id)))

    log.info('Created {} new strain-specific models and loaded in sequences'.format(len(self.strains)))
def __init__(self, seq_table, records, max_dist, min_fold, threshold_pval, log=None):
    '''
    seq_table: pandas.DataFrame
        Samples on the columns; sequences on the rows
    records: index of Bio.Seq
        Indexed, unaligned input sequences. This could come from BioPython's
        SeqIO.to_dict or SeqIO.index.
    max_dist: float
        genetic distance cutoff above which a sequence will not be merged into an OTU
    min_fold: float
        Multiply the sequence's abundance by this fold to get the minimum abundance
        of an OTU for merging
    threshold_pval: float
        P-value below which a sequence will not be merged into an OTU
    log: filehandle
        Log file reporting the abundance, genetic, and distribution checks.
    '''
    self.seq_table = seq_table
    self.records = records
    self.max_dist = max_dist
    self.min_fold = min_fold
    self.threshold_pval = threshold_pval
    self.log = log

    # get a list of the names of the sequences in order of their (decreasing) abundance
    self.seq_abunds = self.seq_table.sum(axis=1).sort_values(ascending=False)

    # check that all sequence IDs in the table are in the fasta
    missing_ids = [seq_id for seq_id in self.seq_abunds.index if seq_id not in self.records]
    if len(missing_ids) > 0:
        raise RuntimeError("{} sequence IDs found in the sequence table but not in the fasta: {}".format(len(missing_ids), missing_ids))

    # initialize OTU information
    self.membership = {}
    self.otus = []
def _process_record(self, record_id):
    '''
    Process the next sequence: run the genetic, abundance, and distribution checks, either
    merging the sequence into an existing OTU or creating a new OTU.
    '''
    assert record_id in self.seq_table.index
    record = self.records[record_id]

    candidate = OTU(record.id, str(record.seq), self.seq_table.loc[record.id])

    if self.log is not None:
        print('seq', candidate.name, sep='\t', file=self.log)

    merged = False
    for otu in self.ga_matches(candidate):
        test_pval = candidate.distribution_pval(otu)

        if self.log is not None:
            print(candidate.name, 'distribution_check', otu.name, test_pval, sep='\t', file=self.log)

        if test_pval > self.threshold_pval:
            otu.absorb(candidate)
            self.membership[otu.name].append(candidate.name)
            merged = True
            break

    if not merged:
        # form own otu
        self.otus.append(candidate)
        self.membership[candidate.name] = [candidate.name]
def generate_otu_table(self):
    '''
    Process all the input sequences to make an OTU table.

    returns: pandas.DataFrame
        OTU table (which can also be found at instance.otu_table)
    '''
    for record_id in self.seq_abunds.index:
        self._process_record(record_id)

    self.otus.sort(key=lambda otu: otu.abundance, reverse=True)
    self.otu_table = pd.DataFrame([otu.counts for otu in self.otus], index=[otu.name for otu in self.otus])
    self.otu_table.columns = self.seq_table.columns

    return self.otu_table
def read_sequence_table(fn):
    '''
    Read in a table of sequences. Expect a header and the sequence IDs in the first column.
    Samples are on the columns.

    fn: filename (or handle)

    returns: pandas.DataFrame
    '''
    df = pd.read_table(fn, dtype={0: str}, header=0)
    df.index = df.iloc[:,0]
    df = df.iloc[:,1:].astype(int)
    return df
def call_otus(seq_table_fh, fasta_fh, output_fh, dist_crit, abund_crit, pval_crit, log=None, membership=None):
    '''
    Read in input files, call OTUs, and return output.

    seq_table_fh: filehandle
        sequence count table
    fasta_fh: filehandle or filename
        sequences fasta
    output_fh: filehandle
        place to write main output OTU table
    dist_crit, abund_crit, pval_crit: float
        threshold values for distance, abundance, and pvalue
    log, membership: filehandles
        places to write supplementary output
    '''
    # read in the sequences table
    seq_table = read_sequence_table(seq_table_fh)

    # set up the input fasta records
    records = SeqIO.index(fasta_fh, 'fasta')

    # generate the caller object
    caller = DBCaller(seq_table, records, dist_crit, abund_crit, pval_crit, log)
    caller.generate_otu_table()

    caller.write_otu_table(output_fh)

    if membership is not None:
        caller.write_membership(membership)
def readGenome(fasta):
    genome_dict = SeqIO.index(fasta, "fasta")
    print(len(genome_dict))
    return(genome_dict)
def augustus_prepare_hint(pasa,exonerate):
    '''
    '''
    dfs = []
    for g,t,feature in zip([pasa,exonerate],['E','P'],['exonpart','CDSpart']):
        df = pd.read_csv(g,sep='\t',header=None)
        df[2] = df[2].map(lambda x: feature)
        df[8] = df[8].map(lambda x: x+';grp='+re.search('(?<=ID=).+?(?=;)',x).group(0)+';src='+t)
        dfs.append(df)
    res = pd.concat(dfs)
    res.to_csv('hints.gff',sep='\t',index=False,header=None)
def gene_rna_pr_id(hamster_id,gmap_gff,out_fn):
    '''this function gets all gene, rna, and pr ids, including both refseq and gff information.
    * hamster_id: a file that has all ids in hamster.gff file
    * gmap_gff: gff results mapped using gmap
    * out_fn:
    '''
    # rna accession in gff file
    ham_id_df = pd.read_csv(hamster_id,sep='\t',header=0)
    ham_id_df = ham_id_df.astype('str')
    ham_id_df['TrAccess'] = ham_id_df['TrAccess'].map(lambda x: x.split('.')[0])
    ham_id_df['PrAccess'] = ham_id_df['PrAccess'].map(lambda x: x.split('.')[0])
    rna_gene_dic = ham_id_df.set_index('TrAccess')['GeneID'].to_dict()
    rna_pr_dic = ham_id_df.set_index('TrAccess')['PrAccess'].to_dict()
    #-------- read rna gff file
    rna_df = pd.read_csv(gmap_gff,sep='\t',header=None,comment='#')
    # add rna accession column
    rna_df['rna_ac'] = rna_df[8].map(lambda x: re.search('(?<=ID=).+?(?=\.)',x).group(0))
    mrna = list(set(rna_df['rna_ac'].tolist()))
    # new rna in refseq compared to gff
    new_ref_rna = list(set(mrna) - set(rna_gene_dic.keys()))
    # get geneid for new ref_rna gene id
    for r in new_ref_rna:
        handle = Entrez.efetch(db='nucleotide',id=r,rettype='gb',retmode='text').read()
        geneid = re.search('(?<=GeneID:).+?(?=\")',handle).group(0)
        try:
            p = re.search('(?<=protein_id=\").+?(?=\.)',handle).group(0)
        except:
            p = '-'
        rna_gene_dic[r] = geneid
        rna_pr_dic[r] = p
    # transfer dic to dataframe
    r_g_df = pd.DataFrame.from_dict(rna_gene_dic,'index')
    r_g_df.columns = ['geneid']
    r_p_df = pd.DataFrame.from_dict(rna_pr_dic,'index')
    r_p_df.columns = ['pr_ac']
    g_r_p_df = pd.concat([r_g_df,r_p_df],axis=1)
    g_r_p_df['rna_ac'] = g_r_p_df.index
    g_r_p_df[['geneid','rna_ac','pr_ac']].to_csv(out_fn,sep='\t',index=False)
def extract_long_reads():
    """Filter fastq to longest reads."""
    parser = argparse.ArgumentParser(description='Extract longest reads from a fastq.')
    parser.add_argument('input', help='Input .fastq file.')
    parser.add_argument('output', help='Output .fastq file.')
    parser.add_argument('longest', default=10, type=int,
                        help='Percentage of longest reads to partition.')
    parser.add_argument('--others', default=None,
                        help='Write all other reads to file.')
    args = parser.parse_args()

    record_dict = SeqIO.index(args.input, "fastq")
    ids = list(record_dict.keys())
    lengths = np.fromiter(
        (len(record_dict[i]) for i in ids),
        dtype=int, count=len(ids)
    )
    # np.argpartition and slicing require an integer count
    max_reads = int(len(ids) * (args.longest / 100))
    longest = np.argpartition(lengths, -max_reads)[-max_reads:]

    SeqIO.write(
        (record_dict[ids[i]] for i in longest),
        args.output, 'fastq'
    )

    if args.others is not None:
        longest = set(longest)
        SeqIO.write(
            (record_dict[ids[i]] for i in range(len(ids)) if i not in longest),
            args.others, 'fastq'
        )
def test_fragment_stats(self):
    """Test the gathering of fragment statistics."""
    top = path.dirname(__file__)
    bam = path.join(top, "data/test_bam_stats/stat_test.bam")
    ref = path.join(top, "data/test_bam_stats/stat_ref.fas")
    references = SeqIO.index(ref, format='fasta')
    chrom_lengths = {name: len(so) for name, so in six.iteritems(references)}
    res = stats.frag_coverage(bam, chrom_lengths, region=None, min_aqual=0, verbose=False)
    self.maxDiff = None
    target = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    self.assertEqual(list(res['frags_fwd']['seq_0']), target)
def _fasta_to_dict(filename):
    """SeqIO.index wrapper for fasta files"""
    return SeqIO.index(filename=filename, format="fasta")
def _clean_index(index):
    """Clean all elements from an indexed fasta"""
    index_clean = {}
    for key, value in index.items():
        index_clean[key] = _clean_seqrecord(value)
    return index_clean
def index_fasta(filename):
    """Create a fasta dict, with clean descriptions, key=id, value=seqrecord"""
    index = SeqIO.index(
        filename=filename,
        format="fasta"
    )
    return _clean_index(index)
def _filter_orthology_matrix(self, remove_strains_with_no_orthology=True,
                             remove_strains_with_no_differences=False,
                             remove_genes_not_in_base_model=True):
    """Filters the orthology matrix by removing genes not in our base model, and also removes strains
    from the analysis which have: 0 orthologous genes or no difference from the base strain.

    Args:
        remove_strains_with_no_orthology (bool): Remove strains which have no orthologous genes found
        remove_strains_with_no_differences (bool): Remove strains which have all the same genes as the base model.
            Default is False because since orthology is found using a PID cutoff, all genes may be present but
            differences may be on the sequence level.
        remove_genes_not_in_base_model (bool): Remove genes from the orthology matrix which are not present in our
            base model. This happens if we use a genome file for our model that has other genes in it.

    """
    if len(self.df_orthology_matrix) == 0:
        raise RuntimeError('Empty orthology matrix')

    initial_num_strains = len(self.strains)

    # Adding names to the row and column of the orthology matrix
    self.df_orthology_matrix = self.df_orthology_matrix.rename_axis('gene').rename_axis("strain", axis="columns")

    # Gene filtering (of the orthology matrix)
    if remove_genes_not_in_base_model:
        # Check for gene IDs that are in the model and not in the orthology matrix
        # This is probably because: the CDS FASTA file for the base strain did not contain the correct ID
        # for the gene and consequently was not included in the orthology matrix
        # Save these and report them
        reference_strain_gene_ids = [x.id for x in self.reference_gempro.genes]
        self.missing_in_orthology_matrix = [x for x in reference_strain_gene_ids if x not in self.df_orthology_matrix.index.tolist()]
        self.missing_in_reference_strain = [y for y in self.df_orthology_matrix.index.tolist() if y not in reference_strain_gene_ids]

        # Filter the matrix for genes within our base model only
        self.df_orthology_matrix = self.df_orthology_matrix[self.df_orthology_matrix.index.isin(reference_strain_gene_ids)]
        log.info('Filtered orthology matrix for genes present in base model')
        log.warning('{} genes are in your base model but not your orthology matrix, see the attribute "missing_in_orthology_matrix"'.format(len(self.missing_in_orthology_matrix)))
        log.warning('{} genes are in the orthology matrix but not your base model, see the attribute "missing_in_reference_strain"'.format(len(self.missing_in_reference_strain)))

    # Strain filtering
    for strain_gempro in self.strains.copy():
        if remove_strains_with_no_orthology:
            if strain_gempro.id not in self.df_orthology_matrix.columns:
                self.strains.remove(strain_gempro)
                log.info('{}: no orthologous genes found for this strain, removed from analysis.'.format(strain_gempro.id))
                continue
            elif self.df_orthology_matrix[strain_gempro.id].isnull().all():
                self.strains.remove(strain_gempro)
                log.info('{}: no orthologous genes found for this strain, removed from analysis.'.format(strain_gempro.id))
                continue

        if remove_strains_with_no_differences:
            not_in_strain = self.df_orthology_matrix[pd.isnull(self.df_orthology_matrix[strain_gempro.id])][strain_gempro.id].index.tolist()
            if len(not_in_strain) == 0:
                self.strains.remove(strain_gempro)
                log.info('{}: strain has no differences from the base, removed from analysis.'.format(strain_gempro.id))
                continue

    log.info('{} strains to be analyzed, {} strains removed'.format(len(self.strains), initial_num_strains - len(self.strains)))