The following 21 code examples, extracted from open source Python projects, illustrate how to use Bio.SeqIO.index().
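Before the project examples, here is a minimal sketch (not taken from any of the projects below) of the basic pattern they all share: SeqIO.index() builds a read-only, dict-like index over a sequence file and parses records lazily on access, rather than loading everything into memory as SeqIO.to_dict() does. The file name 'example.fasta' and the record ID 'seq_1' are placeholders for illustration.

from Bio import SeqIO

# Build a lazy, dict-like index of a FASTA file (records are parsed on access).
# 'example.fasta' and 'seq_1' are placeholder names, not files from the examples below.
records = SeqIO.index('example.fasta', 'fasta')

print(len(records))          # number of records in the file
print(list(records)[:5])     # iterating the index yields record IDs (keys)

record = records['seq_1']    # random access by record ID; raises KeyError if absent
print(record.id, len(record.seq))

records.close()              # close the underlying file handle when done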
def get_evm_pr(evm_path,ref_fa,out_path):
    '''this function gets all evm proteins, writes them to files and merges them together
    * evm_path: evm path that has gff file
    * ref_fa: reference fa file
    * out_path: path to save all temporary files and final protein files
    '''
    if os.path.exists(out_path):
        shutil.rmtree(out_path)
    os.mkdir(out_path)
    os.chdir(out_path)
    evm_gff = evm_path + '/evm.merge.gff'
    gff_df = pd.read_csv(evm_gff,sep='\t',header=None)
    dic = SeqIO.index(ref_fa,'fasta')
    cds_df = gff_df[gff_df[2].values=='CDS']
    cds_df = cds_df.reset_index(drop=True)
    cds_df['rna_id'] = cds_df[8].map(lambda x: x.split(';')[1][7:])
    scaffolds = list(set(cds_df[0].tolist()))
    for scaff in scaffolds:
        output_cds(scaff,cds_df,dic)
    # merge files
    fns = natsorted(glob.glob('*.fa'))
    sarge.run('cat {fns} > {out}'.format(fns=' '.join(fns),out='pr_merge.fa'))
    for f in fns:
        os.remove(f)
def add_gene_function(blast_db,evm_path):
    '''add gene symbol to gff file. the information is from the blast results
    '''
    blastp_fn = blast_db + '/blastp.txt'
    blast_df = pd.read_csv(blastp_fn,sep='\t',usecols=[0,1,2],names=['ref','query','per'])
    blast_df = blast_df[blast_df['per'].values>50]
    blast_df['rna'] = blast_df['ref'].map(lambda x: '.'.join(x.split('.')[-2:]))
    blast_df['pr'] = blast_df['query'].map(lambda x: x.split('|')[-1].split('_')[0])
    rna_pr_dic = blast_df.set_index('rna')['pr'].to_dict()
    evm_gff = evm_path + '/evm.merge.gff'
    gff_df = pd.read_csv(evm_gff,sep='\t',header=None)
    gff_df[8] = gff_df[8].map(lambda x: add_gene_name(x,rna_pr_dic))
    gff_df = gff_df[~gff_df[8].map(lambda x: 'gene=LORF2' in x)]
    gff_df.to_csv(blast_db + '/final.gff',sep='\t',index=False)
# add_gene_function(blast_db,evm_path)
#===============================================================================
# process the gmap results and exonerate results directly
#===============================================================================
#=============== 1. get all mapped geneid, rna_accession, pr_accession
def fa2embl(fa,embl,gff,path):
    if not os.path.exists(path):
        os.mkdir(path)
    os.chdir(path)
    df = pd.read_csv(gff,sep='\t',header=None,comment='#',usecols=[0,2])
    df = df[df[2].values=='gene']
    chroms = list(set(df[0].tolist()))
    dic = SeqIO.index(fa,'fasta')
    for s in chroms:
        SeqIO.write(dic[s],open('fa','w'),'fasta')
        sarge.run('grep \'{s}\' {gff} > gff'.format(s=s,gff=gff))
        sarge.run('/home/shangzhong/Installation/EMBOSS-6.6.0/bin/seqret \
        -sequence fa -feature -fformat gff -fopenfile1 gff -osformat2 embl \
        -auto -outseq {s}.embl'.format(s=s))
    fns = glob.glob('*.embl')
    sarge.run('cat {files} > {embl}'.format(files=' '.join(fns),embl=embl))
    # for f in fns:
    #     os.remove(f)
# fa2embl('/data/genome/hamster/ncbi_refseq/hamster.fa','hamster.embl','/data/genome/hamster/ncbi_refseq/hamster.gff','/data/shangzhong/Picr_assembly/Annotation/RATT/embl')
def testNewAminoAcid(self):
    self.setUpPhylotyper(aa=True)

    # Set up subtype files
    build_pipeline(self.subtype_options, self.configObj)

    # Save setup
    self.subtypeOptionsObj.save()

    # Check output files
    filepaths = self.subtypeOptionsObj.get_subtype_config(self.scheme)

    fasta = SeqIO.index(filepaths['alignment'][0], 'fasta')
    with open(filepaths['subtype']) as f:
        for i, l in enumerate(f):
            pass
    sd = SeqDict()
    sd.load(filepaths['lookup'])

    n = 91
    self.assertTrue(all([len(fasta) == n, i+1 == n, len(sd.seqs) == n]))
def testNewDNA(self):
    self.setUpPhylotyper(aa=False)

    # Set up subtype files
    build_pipeline(self.subtype_options, self.configObj)

    # Save setup
    self.subtypeOptionsObj.save()

    # Check output files
    filepaths = self.subtypeOptionsObj.get_subtype_config(self.scheme)

    fasta = SeqIO.index(filepaths['alignment'][0], 'fasta')
    with open(filepaths['subtype']) as f:
        for i, l in enumerate(f):
            pass
    sd = SeqDict()
    sd.load(filepaths['lookup'])

    n = 120
    self.assertTrue(all([len(fasta) == n, i+1 == n, len(sd.seqs) == n]))
def _load_strain_sequences(self, strain_gempro):
    """Load strain sequences from the orthology matrix into the base model for comparisons,
    and into the strain-specific model itself.
    """
    if self._orthology_matrix_has_sequences:
        # Load directly from the orthology matrix if it contains sequences
        strain_sequences = self.df_orthology_matrix[strain_gempro.id].to_dict()
    else:
        # Otherwise load from the genome file if the orthology matrix contains gene IDs
        # Load the genome FASTA file
        log.debug('{}: loading strain genome CDS file'.format(strain_gempro.genome_path))
        strain_sequences = SeqIO.index(strain_gempro.genome_path, 'fasta')

    for strain_gene in strain_gempro.genes:
        if strain_gene.functional:
            if self._orthology_matrix_has_sequences:
                strain_gene_key = strain_gene.id
            else:
                # Pull the gene ID of the strain from the orthology matrix
                strain_gene_key = self.df_orthology_matrix.loc[strain_gene.id, strain_gempro.id]
                log.debug('{}: original gene ID to be pulled from strain fasta file'.format(strain_gene_key))

            # Load into the base strain for comparisons
            ref_gene = self.reference_gempro.genes.get_by_id(strain_gene.id)
            new_id = '{}_{}'.format(strain_gene.id, strain_gempro.id)
            if ref_gene.protein.sequences.has_id(new_id):
                log.debug('{}: sequence already loaded into reference model'.format(new_id))
                continue
            ref_gene.protein.load_manual_sequence(seq=strain_sequences[strain_gene_key], ident=new_id,
                                                  set_as_representative=False)
            log.debug('{}: loaded sequence into reference model'.format(new_id))

            # Load into the strain GEM-PRO
            strain_gene.protein.load_manual_sequence(seq=strain_sequences[strain_gene_key], ident=new_id,
                                                     set_as_representative=True)
            log.debug('{}: loaded sequence into strain model'.format(new_id))
def build_strain_specific_models(self, save_models=False):
    """Using the orthologous genes matrix, create and modify the strain specific models based on
    whether orthologous genes exist.

    Also store the sequences directly in the reference GEM-PRO protein sequence attribute for the strains.
    """
    if len(self.df_orthology_matrix) == 0:
        raise RuntimeError('Empty orthology matrix')

    # Create an emptied copy of the reference GEM-PRO
    for strain_gempro in tqdm(self.strains):
        log.debug('{}: building strain specific model'.format(strain_gempro.id))

        # For each genome, load the metabolic model or genes from the reference GEM-PRO
        logging.disable(logging.WARNING)
        if self._empty_reference_gempro.model:
            strain_gempro.load_cobra_model(self._empty_reference_gempro.model)
        elif self._empty_reference_gempro.genes:
            strain_gempro.genes = [x.id for x in self._empty_reference_gempro.genes]
        logging.disable(logging.NOTSET)

        # Get a list of genes which do not have orthology in the strain
        not_in_strain = self.df_orthology_matrix[pd.isnull(self.df_orthology_matrix[strain_gempro.id])][strain_gempro.id].index.tolist()

        # Mark genes non-functional
        self._pare_down_model(strain_gempro=strain_gempro, genes_to_remove=not_in_strain)

        # Load sequences into the base and strain models
        self._load_strain_sequences(strain_gempro=strain_gempro)

        if save_models:
            cobra.io.save_json_model(model=strain_gempro.model,
                                     filename=op.join(self.model_dir, '{}.json'.format(strain_gempro.id)))
            strain_gempro.save_pickle(op.join(self.model_dir, '{}_gp.pckl'.format(strain_gempro.id)))

    log.info('Created {} new strain-specific models and loaded in sequences'.format(len(self.strains)))
def __init__(self, seq_table, records, max_dist, min_fold, threshold_pval, log=None):
    '''
    seq_table: pandas.DataFrame
        Samples on the columns; sequences on the rows
    records: index of Bio.Seq
        Indexed, unaligned input sequences. This could come from BioPython's
        SeqIO.to_dict or SeqIO.index.
    max_dist: float
        genetic distance cutoff above which a sequence will not be merged into an OTU
    min_fold: float
        Multiply the sequence's abundance by this fold to get the minimum abundance
        of an OTU for merging
    threshold_pval: float
        P-value below which a sequence will not be merged into an OTU
    log: filehandle
        Log file reporting the abundance, genetic, and distribution checks.
    '''
    self.seq_table = seq_table
    self.records = records
    self.max_dist = max_dist
    self.min_fold = min_fold
    self.threshold_pval = threshold_pval
    self.log = log

    # get a list of the names of the sequences in order of their (decreasing) abundance
    self.seq_abunds = self.seq_table.sum(axis=1).sort_values(ascending=False)

    # check that all sequence IDs in the table are in the fasta
    missing_ids = [seq_id for seq_id in self.seq_abunds.index if seq_id not in self.records]
    if len(missing_ids) > 0:
        raise RuntimeError("{} sequence IDs found in the sequence table but not in the fasta: {}".format(len(missing_ids), missing_ids))

    # initialize OTU information
    self.membership = {}
    self.otus = []
def _process_record(self, record_id):
    '''
    Process the next sequence: run the genetic, abundance, and distribution checks, either
    merging the sequence into an existing OTU or creating a new OTU.
    '''
    assert record_id in self.seq_table.index
    record = self.records[record_id]

    candidate = OTU(record.id, str(record.seq), self.seq_table.loc[record.id])

    if self.log is not None:
        print('seq', candidate.name, sep='\t', file=self.log)

    merged = False
    for otu in self.ga_matches(candidate):
        test_pval = candidate.distribution_pval(otu)

        if self.log is not None:
            print(candidate.name, 'distribution_check', otu.name, test_pval, sep='\t', file=self.log)

        if test_pval > self.threshold_pval:
            otu.absorb(candidate)
            self.membership[otu.name].append(candidate.name)
            merged = True
            break

    if not merged:
        # form own otu
        self.otus.append(candidate)
        self.membership[candidate.name] = [candidate.name]
def generate_otu_table(self):
    '''
    Process all the input sequences to make an OTU table.

    returns: pandas.DataFrame
        OTU table (which can also be found at instance.otu_table)
    '''
    for record_id in self.seq_abunds.index:
        self._process_record(record_id)

    self.otus.sort(key=lambda otu: otu.abundance, reverse=True)
    self.otu_table = pd.DataFrame([otu.counts for otu in self.otus], index=[otu.name for otu in self.otus])
    self.otu_table.columns = self.seq_table.columns

    return self.otu_table
def read_sequence_table(fn):
    '''
    Read in a table of sequences. Expect a header and the sequence IDs in the first column.
    Samples are on the columns.

    fn: filename (or handle)

    returns: pandas.DataFrame
    '''
    df = pd.read_table(fn, dtype={0: str}, header=0)
    df.index = df.iloc[:,0]
    df = df.iloc[:,1:].astype(int)
    return df
def call_otus(seq_table_fh, fasta_fh, output_fh, dist_crit, abund_crit, pval_crit, log=None, membership=None):
    '''
    Read in input files, call OTUs, and return output.

    seq_table_fh: filehandle
        sequence count table
    fasta_fh: filehandle or filename
        sequences fasta
    output_fh: filehandle
        place to write main output OTU table
    dist_crit, abund_crit, pval_crit: float
        threshold values for distance, abundance, and pvalue
    log, membership: filehandles
        places to write supplementary output
    '''
    # read in the sequences table
    seq_table = read_sequence_table(seq_table_fh)

    # set up the input fasta records
    records = SeqIO.index(fasta_fh, 'fasta')

    # generate the caller object
    caller = DBCaller(seq_table, records, dist_crit, abund_crit, pval_crit, log)
    caller.generate_otu_table()

    caller.write_otu_table(output_fh)

    if membership is not None:
        caller.write_membership(membership)
def readGenome(fasta):
    genome_dict = SeqIO.index(fasta, "fasta")
    print(len(genome_dict))
    return(genome_dict)
def augustus_prepare_hint(pasa,exonerate):
    '''
    '''
    dfs = []
    for g,t,feature in zip([pasa,exonerate],['E','P'],['exonpart','CDSpart']):
        df = pd.read_csv(g,sep='\t',header=None)
        df[2] = df[2].map(lambda x: feature)
        df[8] = df[8].map(lambda x: x+';grp='+re.search('(?<=ID=).+?(?=;)',x).group(0)+';src='+t)
        dfs.append(df)
    res = pd.concat(dfs)
    res.to_csv('hints.gff',sep='\t',index=False,header=None)
def gene_rna_pr_id(hamster_id,gmap_gff,out_fn):
    '''this function gets all gene, rna, and pr ids, including both refseq and gff information.
    * hamster_id: a file that has all ids in hamster.gff file
    * gmap_gff: gff results mapped using gmap
    * out_fn:
    '''
    # rna accession in gff file
    ham_id_df = pd.read_csv(hamster_id,sep='\t',header=0)
    ham_id_df = ham_id_df.astype('str')
    ham_id_df['TrAccess'] = ham_id_df['TrAccess'].map(lambda x: x.split('.')[0])
    ham_id_df['PrAccess'] = ham_id_df['PrAccess'].map(lambda x: x.split('.')[0])
    rna_gene_dic = ham_id_df.set_index('TrAccess')['GeneID'].to_dict()
    rna_pr_dic = ham_id_df.set_index('TrAccess')['PrAccess'].to_dict()
    #-------- read rna gff file
    rna_df = pd.read_csv(gmap_gff,sep='\t',header=None,comment='#')
    # add rna accession column
    rna_df['rna_ac'] = rna_df[8].map(lambda x: re.search('(?<=ID=).+?(?=\.)',x).group(0))
    mrna = list(set(rna_df['rna_ac'].tolist()))
    # new rna in refseq compared to gff
    new_ref_rna = list(set(mrna) - set(rna_gene_dic.keys()))
    # get geneid for new ref_rna gene id
    for r in new_ref_rna:
        handle = Entrez.efetch(db='nucleotide',id=r,rettype='gb',retmode='text').read()
        geneid = re.search('(?<=GeneID:).+?(?=\")',handle).group(0)
        try:
            p = re.search('(?<=protein_id=\").+?(?=\.)',handle).group(0)
        except:
            p = '-'
        rna_gene_dic[r] = geneid
        rna_pr_dic[r] = p
    # transfer dic to dataframe
    r_g_df = pd.DataFrame.from_dict(rna_gene_dic,'index')
    r_g_df.columns = ['geneid']
    r_p_df = pd.DataFrame.from_dict(rna_pr_dic,'index')
    r_p_df.columns = ['pr_ac']
    g_r_p_df = pd.concat([r_g_df,r_p_df],axis=1)
    g_r_p_df['rna_ac'] = g_r_p_df.index
    g_r_p_df[['geneid','rna_ac','pr_ac']].to_csv(out_fn,sep='\t',index=False)
def extract_long_reads():
    """Filter fastq to longest reads."""
    parser = argparse.ArgumentParser(description='Extract longest reads from a fastq.')
    parser.add_argument('input', help='Input .fastq file.')
    parser.add_argument('output', help='Output .fastq file.')
    parser.add_argument('longest', default=10, type=int,
                        help='Percentage of longest reads to partition.')
    parser.add_argument('--others', default=None,
                        help='Write all other reads to file.')
    args = parser.parse_args()

    record_dict = SeqIO.index(args.input, "fastq")
    ids = list(record_dict.keys())
    lengths = np.fromiter(
        (len(record_dict[i]) for i in ids),
        dtype=int, count=len(ids)
    )
    # np.argpartition and slicing require an integer count
    max_reads = int(len(ids) * (args.longest / 100))
    longest = np.argpartition(lengths, -max_reads)[-max_reads:]

    SeqIO.write(
        (record_dict[ids[i]] for i in longest),
        args.output, 'fastq'
    )

    if args.others is not None:
        longest = set(longest)
        SeqIO.write(
            (record_dict[ids[i]] for i in range(len(ids)) if i not in longest),
            args.others, 'fastq'
        )
def test_fragment_stats(self):
    """Test the gathering of fragment statistics."""
    top = path.dirname(__file__)
    bam = path.join(top, "data/test_bam_stats/stat_test.bam")
    ref = path.join(top, "data/test_bam_stats/stat_ref.fas")
    references = SeqIO.index(ref, format='fasta')
    chrom_lengths = {name: len(so) for name, so in six.iteritems(references)}
    res = stats.frag_coverage(bam, chrom_lengths, region=None, min_aqual=0, verbose=False)
    self.maxDiff = None
    target = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    self.assertEqual(list(res['frags_fwd']['seq_0']), target)
def _fasta_to_dict(filename):
    """SeqIO.index wrapper for fasta files"""
    return SeqIO.index(filename=filename, format="fasta")
def _clean_index(index):
    """Clean all elements from an indexed fasta"""
    index_clean = {}
    for key, value in index.items():
        index_clean[key] = _clean_seqrecord(value)
    return index_clean
def index_fasta(filename):
    """Create a fasta dict, with clean descriptions, key=id, value=seqrecord"""
    index = SeqIO.index(
        filename=filename,
        format="fasta"
    )
    return _clean_index(index)
def _filter_orthology_matrix(self, remove_strains_with_no_orthology=True,
                             remove_strains_with_no_differences=False,
                             remove_genes_not_in_base_model=True):
    """Filters the orthology matrix by removing genes not in our base model, and also removes strains
    from the analysis which have: 0 orthologous genes or no difference from the base strain.

    Args:
        remove_strains_with_no_orthology (bool): Remove strains which have no orthologous genes found
        remove_strains_with_no_differences (bool): Remove strains which have all the same genes as the base model.
            Default is False because since orthology is found using a PID cutoff, all genes may be present but
            differences may be on the sequence level.
        remove_genes_not_in_base_model (bool): Remove genes from the orthology matrix which are not present in our
            base model. This happens if we use a genome file for our model that has other genes in it.

    """
    if len(self.df_orthology_matrix) == 0:
        raise RuntimeError('Empty orthology matrix')

    initial_num_strains = len(self.strains)

    # Adding names to the row and column of the orthology matrix
    self.df_orthology_matrix = self.df_orthology_matrix.rename_axis('gene').rename_axis("strain", axis="columns")

    # Gene filtering (of the orthology matrix)
    if remove_genes_not_in_base_model:
        # Check for gene IDs that are in the model and not in the orthology matrix
        # This is probably because: the CDS FASTA file for the base strain did not contain the correct ID
        # for the gene and consequently was not included in the orthology matrix
        # Save these and report them
        reference_strain_gene_ids = [x.id for x in self.reference_gempro.genes]
        self.missing_in_orthology_matrix = [x for x in reference_strain_gene_ids if x not in self.df_orthology_matrix.index.tolist()]
        self.missing_in_reference_strain = [y for y in self.df_orthology_matrix.index.tolist() if y not in reference_strain_gene_ids]

        # Filter the matrix for genes within our base model only
        self.df_orthology_matrix = self.df_orthology_matrix[self.df_orthology_matrix.index.isin(reference_strain_gene_ids)]
        log.info('Filtered orthology matrix for genes present in base model')
        log.warning('{} genes are in your base model but not your orthology matrix, see the attribute "missing_in_orthology_matrix"'.format(len(self.missing_in_orthology_matrix)))
        log.warning('{} genes are in the orthology matrix but not your base model, see the attribute "missing_in_reference_strain"'.format(len(self.missing_in_reference_strain)))

    # Strain filtering
    for strain_gempro in self.strains.copy():
        if remove_strains_with_no_orthology:
            if strain_gempro.id not in self.df_orthology_matrix.columns:
                self.strains.remove(strain_gempro)
                log.info('{}: no orthologous genes found for this strain, removed from analysis.'.format(strain_gempro.id))
                continue
            elif self.df_orthology_matrix[strain_gempro.id].isnull().all():
                self.strains.remove(strain_gempro)
                log.info('{}: no orthologous genes found for this strain, removed from analysis.'.format(strain_gempro.id))
                continue

        if remove_strains_with_no_differences:
            not_in_strain = self.df_orthology_matrix[pd.isnull(self.df_orthology_matrix[strain_gempro.id])][strain_gempro.id].index.tolist()
            if len(not_in_strain) == 0:
                self.strains.remove(strain_gempro)
                log.info('{}: strain has no differences from the base, removed from analysis.'.format(strain_gempro.id))
                continue

    log.info('{} strains to be analyzed, {} strains removed'.format(len(self.strains), initial_num_strains - len(self.strains)))