我们从Python开源项目中,提取了以下17个代码示例,用于说明如何使用Bio.SeqIO.to_dict()。
def fasta_to_dict(fasta_filename): # not clear the use of biopython gains anything here with open(fasta_filename) as f: return {k: str(v.seq) for k,v in SeqIO.to_dict(SeqIO.parse(f,'fasta')).iteritems()}
def main(): records = SeqIO.to_dict(SeqIO.parse(open(sys.argv[1]), 'fasta')) reader = csv.DictReader(sys.stdin, dialect="excel-tab") clusters = list(reader) groups = set([c['group'] for c in clusters]) for group in groups: print "cluster%s\t%s-cluster%s" % (group, sys.argv[1], group) with open('%s-cluster%s' %(sys.argv[1], group), 'w') as fout: SeqIO.write([records[i['node']] for i in clusters if i['group'] == group], fout, 'fasta')
def __init__(self, seq_table, records, max_dist, min_fold, threshold_pval, log=None): ''' seq_table: pandas.DataFrame Samples on the columns; sequences on the rows records: index of Bio.Seq Indexed, unaligned input sequences. This could come from BioPython's SeqIO.to_dict or SeqIO.index. max_dist: float genetic distance cutoff above which a sequence will not be merged into an OTU min_fold: float Multiply the sequence's abundance by this fold to get the minimum abundance of an OTU for merging threshold_pval: float P-value below which a sequence will not be merged into an OTU log: filehandle Log file reporting the abundance, genetic, and distribution checks. ''' self.seq_table = seq_table self.records = records self.max_dist = max_dist self.min_fold = min_fold self.threshold_pval = threshold_pval self.log = log # get a list of the names of the sequences in order of their (decreasing) abundance self.seq_abunds = self.seq_table.sum(axis=1).sort_values(ascending=False) # check that all sequence IDs in the table are in the fasta missing_ids = [seq_id for seq_id in self.seq_abunds.index if seq_id not in self.records] if len(missing_ids) > 0: raise RuntimeError("{} sequence IDs found in the sequence table but not in the fasta: {}".format(len(missing_ids), missing_ids)) # initialize OTU information self.membership = {} self.otus = []
def extract_genes(output_folder, refs_folder, gff_folder, di): tp = 'gene' refs_files = sorted([join(refs_folder, f) for f in listdir(refs_folder) if isfile(join(refs_folder, f))]) gff_files = sorted([join(gff_folder, f) for f in listdir(gff_folder) if isfile(join(gff_folder, f))]) limit_info = dict( gff_type =['gene']) sequences = [] for ind in range(len(refs_files)): f_fasta = refs_files[ind] f_gff = gff_files[ind] nm = f_gff.split("/")[-1][:-4] record = SeqIO.to_dict(SeqIO.parse(f_fasta, "fasta")) new_record = {} p = re.compile("N\w_\w+\.\d") for it in record: new_id = p.findall(it)[0] new_record[new_id] = record[it] record = new_record in_handle = open(f_gff) for rec in GFF.parse(in_handle, limit_info=limit_info, base_dict=record): for f in rec.features: if f.qualifiers['ID'][0] in di.get(f_gff, []): subrecord = SeqRecord.SeqRecord(Seq.Seq(re.sub(r'[^ATGC]', 'A', str(f.extract(rec.seq)))), id=nm + "_" + rec.id + "_" + str(f.location), name=nm + "_" + str(f.location)+ "_" + tp, description= tp) sequences.append(subrecord) in_handle.close() with open(output_folder+'sc_genes_seq.fasta', "w") as output_handle: SeqIO.write(sequences, output_handle, "fasta")
def partial_genes_extract(contigs_path, outdir): contigs = SeqIO.to_dict(SeqIO.parse(contigs_path, "fasta")) partial_genes = [] with open(outdir+'prodigal_output.gff') as f: for rec in GFF.parse(f, base_dict=contigs): for feature in rec.features: extract_seq = str(feature.extract(rec.seq)) if feature.qualifiers['partial'][0] in ['11', '01', '10'] and len(extract_seq) > 110: subrecord = SeqRecord.SeqRecord(Seq.Seq(extract_seq), id='gene_'+str(len(partial_genes))+'_len_'+str(len(extract_seq)) + '_conf_'+str(feature.qualifiers['conf'][0]), description='patrial gene') partial_genes.append(subrecord) return partial_genes
def write_genes(genes_align, outdir): records = [] orfs = SeqIO.to_dict(SeqIO.parse(outdir+'ORFs.fasta', 'fasta')) for gene_cluster in genes_align: for orf in genes_align[gene_cluster][1]: if orfs[orf].seq is not None: records.append(SeqRecord.SeqRecord(seq=orfs[orf].seq, id='{0}_{1}'.format(gene_cluster, orf), name='{0}_{1}_conf={2}_len={3}'.format(gene_cluster, orf, genes_align[gene_cluster][0], len(orfs[orf])))) SeqIO.write(records, outdir+'results.fasta', 'fasta')
def add_EVM(whole_fasta_name, output_filename, output_merged_fasta_name): """ this module looks for genes that were not used in the consensus stage. usually are gene models without long reads support """ sys.stdout.write('\t###APPEND EVM NOT USED FROM CONTIGS BUILDING###\n') '''Adds the EVM records that are not present in the final contig evidence''' whole_fasta = open(whole_fasta_name, 'r') out_fasta_file = open(output_filename, 'r') outputMerged = open(output_merged_fasta_name, 'w') wholeDict = SeqIO.to_dict(SeqIO.parse(whole_fasta, 'fasta')) count = 0 dictOut = {} outFasta = SeqIO.parse(out_fasta_file, 'fasta') for record in outFasta: if record.id in dictOut: dictOut[str(record.id) + '_' + str(count)] = str(record.seq) count += 1 else: dictOut[record.id] = str(record.seq) for key in list(wholeDict.keys()): if 'evm' in key and key not in dictOut: ident = '>Gene' + str(count) + '_' + key outputMerged.write( ident + '\n' + str(wholeDict[key].seq) + '\n') count += 1 for key, element in list(dictOut.items()): ident = '>Gene' + str(count) + '_' + key outputMerged.write(ident + '\n' + str(element) + '\n') count += 1 whole_fasta.close() outFasta.close() outputMerged.close()
def fasta2Dict(fasta_filename): """ Prepare a dictionary of all the sequences that is used together with the fasta file to make single fasta files for the assembly """ fasta_file = open(fasta_filename, 'r') fasta_dict2 = SeqIO.to_dict(SeqIO.parse(fasta_file, 'fasta')) fasta_dict = {} for key, seq2 in list(fasta_dict2.items()): seq = str(seq2.seq).replace("N", "") fasta_dict[key] = seq del fasta_dict2[key] fasta_file.close() return fasta_dict
def dna_from_reference(chrom='9'): #reference_hg19 = '/dsde/data/deep/vqsr/Homo_sapiens_assembly19.fasta' reference_hg19 = '/Users/sam/vqsr_data/Homo_sapiens_assembly19.fasta' record_dict = SeqIO.to_dict(SeqIO.parse(reference_hg19, "fasta")) dna = str(record_dict[chrom].seq[10000000:75000000]) return dna
def dna_from_reference(chrom='21'): reference_hg19 = '/dsde/data/deep/vqsr/Homo_sapiens_assembly19.fasta' record_dict = SeqIO.to_dict(SeqIO.parse(reference_hg19, "fasta")) dna = str(record_dict[chrom].seq[35000000:37000000]) return dna
def main(): ''' Back translate aligned amino acid sequences to nucleotide sequences while maintaining the alignment ''' #creating a parser parser = argparse.ArgumentParser( formatter_class=argparse.RawDescriptionHelpFormatter, description='Translate amino acid sequences into nucleotide sequences while maintaining the amino acid alignment and the order of sequences.') #adding arguments parser.add_argument('-a', metavar='<aa_aln.fasta>', type=str, help='input amino acid alignment file') parser.add_argument('-n', metavar='<nuc_aln.fasta>', type=str, help='input unaligned/aligned nucleotide/codon sequence file') parser.add_argument('-o', metavar='<codon_aln.fasta>', type=str, help='output codon alignment file') args = parser.parse_args() #set up output file name if none is given if args.o is None: nuc_aln_file = "codon_aln.fasta" else: nuc_aln_file = args.o aa_aln_file = args.a nuc_seq_file = args.n aa_aln = AlignIO.read(aa_aln_file, "fasta") nuc_seq = open(nuc_seq_file) #read in codon file codon_dict = SeqIO.to_dict(SeqIO.parse(nuc_seq, "fasta")) #read in codon sequence file as a dictionary, key=sequence ID, value=sequence itself back_translate(aa_aln,codon_dict,nuc_aln_file,gencode)
def read_seq_records_dict(input_object, format='fasta'): """Read SeqRecord objects to a dictionary from a file in the specified format. :param input_object: A file object or a file name. :param format: Input format (fasta by default). :returns: An iterator of SeqRecord objects. :rtype: dict """ handle = input_object if type(handle) == str: handle = open(handle, "rU") return SeqIO.to_dict(SeqIO.parse(handle, format))
def main(gff_file, fasta_file): out_file = "%s.gb" % os.path.splitext(gff_file)[0] fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna)) gff_iter = GFF.parse(gff_file, fasta_input) SeqIO.write(_check_gff(_fix_ncbi_id(gff_iter)), out_file, "genbank")
def parse_contigs(outputAssembly, threshold_float, verbose): """ Parses the output from iAssembler, to a single FASTA file """ if verbose: sys.stderr.write('Executing: Parse assembled consensus\n') fname = outputAssembly + 'contig_member' fname_exists = os.path.isfile(fname) if fname_exists: # part all the files in the tmp assembly folder contigInfo = open(outputAssembly + 'contig_member', 'r') contigs = {} total_reads = 0 for line in contigInfo: line = line.strip() line = line.split('\t') # count the reads for each assembled Unitig read_number = len(line) - 1 for element in line: if 'evm' in element: contigs[line[0]] = [read_number, element] if line[0] not in contigs: contigs[line[0]] = [read_number] # sum all the reads whitin one cluster total_reads += read_number contigInfo.close() # calculate the number of reads considering the threshold threshold = total_reads * float(threshold_float) global count_sequences real_contigs = {} count_unitigs = 1 for key, element in list(contigs.items()): # to retrieve only supported assembly if len(element) == 2: if element[0] >= threshold and 'evm' in element[1]: real_contigs[key] = element[1] + ' ' + str( element[0]) + '_above_threshold_' + str(threshold) + ' loc_' + str(count_sequences) elif element[0] < threshold and 'evm' in element[1]: real_contigs[key] = element[1] + ' ' + str( element[0]) + '_below_threshold_' + str(threshold) + ' loc_' + str(count_sequences) elif len(element) == 1: if element[0] >= threshold: real_contigs[key] = 'Unitig' + str(count_sequences) + '_' + str(count_unitigs) + ' ' + str( element[0]) + '_above_threshold_' + str(threshold) + ' loc_' + str(count_sequences) count_unitigs += 1 # writes the outputs fileAssembly = outputAssembly + 'unigene_seq.new.fasta' contigSeq = open(fileAssembly, 'r') contigDict = SeqIO.to_dict(SeqIO.parse(contigSeq, 'fasta')) output_filename = outputAssembly[:-1] + '_assembled.fasta' outputFile = open(output_filename, 'w') for iden, write_iden in list(real_contigs.items()): if iden in contigDict: outputFile.write('>' + write_iden + '\n' + str(contigDict[iden].seq) + '\n') contigSeq.close() outputFile.close() return
def load_dna_and_chrom_label(args, only_labels=None): record_dict = SeqIO.to_dict(SeqIO.parse(args.reference_fasta, "fasta")) bed_dict, bed_labels = bed_file_labels_to_dict(args.bed_file) train_data = np.zeros(( args.samples, args.window_size, len(args.inputs) )) train_labels = np.zeros(( args.samples, len(bed_labels) )) idx_offset = (args.window_size/2) amiguity_codes = {'K':[0,0,0.5,0.5], 'M':[0.5,0.5,0,0], 'R':[0.5,0,0,0.5], 'Y':[0,0.5,0.5,0], 'S':[0,0.5,0,0.5], 'W':[0.5,0,0.5,0], 'B':[0,0.333,0.333,0.334], 'V':[0.333,0.333,0,0.334],'H':[0.333,0.333,0.334,0], 'D':[0.333,0,0.333,0.334], 'X':[0.25,0.25,0.25,0.25], 'N':[0.25,0.25,0.25,0.25]} count = 0 while count < args.samples: contig_key, pos = sample_from_bed(bed_dict, contig_key_prefix='chr') contig = record_dict[contig_key] record = contig[pos-idx_offset: pos+idx_offset] cur_label_key = bed_file_label(bed_dict, contig_key, pos) if only_labels and not cur_label_key in only_labels: continue train_labels[count, args.labels[cur_label_key]] = 1 for i,b in enumerate(record.seq): B=b.upper() if B in args.inputs.keys(): train_data[count, i, args.inputs[B]] = 1.0 elif B in amiguity_codes.keys(): train_data[count, i, :4] = amiguity_codes[B] else: print('Error! Unknown code:', b) return count += 1 print('Label:', bed_labels.keys(), 'label counts:', np.sum(train_labels, axis=0)) print('Train data shape:', train_data.shape, ' Training labels shape:', train_labels.shape) return (train_data, train_labels)
def extract_kmer_seq(kmer_bedfile, hg19_fa_file, outfile_kmers_seq): ''' open bed file to write the kmer sequence ''' with open(outfile_kmers_seq,'w') as of: pass wf = open(outfile_kmers_seq,'a+') # read names and postions from bed file records = SeqIO.to_dict(SeqIO.parse(open(hg19_fa_file), 'fasta')) with open(kmer_bedfile) as rf: for line in rf: name,start,stop,gene,val,strand,rem = bedline_split(line) start = start long_seq_record = records[name] long_seq = long_seq_record.seq alphabet = long_seq.alphabet short_seq = str(long_seq)[start-1:stop] short_seq_record = SeqRecord(Seq(short_seq, alphabet)) short_seq_record.seq.strip() ''' If the sequence is in reverse strand, rev complement it ''' if strand == "+": kmer_seq = short_seq_record.seq elif strand == "-": kmer_seq = short_seq_record.seq.reverse_complement() else: print("Strand information not found for line:\n {}".format(line)) #quit() wf.write("{}\t{}\t{}\t{}\t{}\t{}{}\n".format(name,start,stop,gene,kmer_seq,strand,rem)) rf.close() wf.close() of.close() file_chk = check_files(outfile_kmers_seq) return(file_chk)
def extract_kmer_seq(kmer_bedfile, hg19_fa_file, outfile_kmers_seq): ''' open bed file to write the kmer sequence ''' with open(outfile_kmers_seq,'w') as of: pass wf = open(outfile_kmers_seq,'a+') # read names and postions from bed file records = SeqIO.to_dict(SeqIO.parse(open(hg19_fa_file), 'fasta')) with open(kmer_bedfile) as rf: for line in rf: name,start,stop,gene,val,strand = bedline_split(line) long_seq_record = records[name] long_seq = long_seq_record.seq alphabet = long_seq.alphabet short_seq = str(long_seq)[start-1:stop] short_seq_record = SeqRecord(Seq(short_seq, alphabet)) short_seq_record.seq.strip() ''' If the sequence is in reverse strand, rev complement it #edit 14-march-2017 (to extract kmers strictly to half open bed format) ''' if strand == "+": kmer_seq = short_seq_record.seq #kmer created will be one base greater than the kmer size, trim kmer_seq = kmer_seq[:1] elif strand == "-": kmer_seq = short_seq_record.seq.reverse_complement() kmer_seq = kmer_seq[:-1] else: print("Strand information not found for line:\n {}".format(line)) quit() wf.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(name,start,stop,gene,kmer_seq,strand)) rf.close() wf.close() file_chk = check_files(outfile_kmers_seq) return(file_chk)