The following 50 code examples, extracted from open-source Python projects, illustrate how to use pysam.Samfile().
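Before the extracted examples, here is a minimal sketch of the open/iterate/write pattern that most of them build on. The path example.bam is a hypothetical placeholder, and pysam.Samfile is the legacy alias of pysam.AlignmentFile:

import pysam

# open an existing BAM for reading ('rb' = read, binary/BAM)
in_bam = pysam.Samfile("example.bam", "rb")

# create an output BAM that reuses the input header via template=
out_bam = pysam.Samfile("filtered.bam", "wb", template=in_bam)

# iterate over alignments and keep only mapped, non-duplicate reads
for read in in_bam:
    if read.is_unmapped or read.is_duplicate:
        continue
    out_bam.write(read)

out_bam.close()
in_bam.close()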
def filter_bam(input_file, output_file, downweight_number, ctot, gtoa):
    """
    Takes a bam file as input and weights the quality of the reads down.

    Need to ensure we write the header out :)
    Investigate pysam and look for a header, this should really help us
    understand how to get this bam filter working and writing the bam
    files directly back out to the terminal.
    """
    bam = pysam.Samfile(input_file, 'rb')
    bam_out = pysam.Samfile(output_file, 'wb', template=bam)
    for line in bam:
        change_bases_c = None
        change_bases_t = None
        seq = line.seq
        qual = line.qual
        if(ctot):
            change_bases_c = [check_c_2_t(nuc) and i < downweight_number for i, nuc in enumerate(seq)]
        if(gtoa):
            change_bases_t = [check_g_2_a(nuc) and (len(seq)-i) <= downweight_number for i, nuc in enumerate(seq)]
        new_qual = downweight_quality(qual, change_bases_c, change_bases_t)
        line.qual = new_qual
        bam_out.write(line)
def validate_bam_fasta_pairs(bam_path, fasta_sequences, genome):
    """
    Make sure that this BAM is actually aligned to this fasta. Every sequence should be the same length.
    Sequences can exist in the reference that do not exist in the BAM, but not the other way around.
    """
    handle = pysam.Samfile(bam_path, 'rb')
    bam_sequences = {(n, s) for n, s in zip(*[handle.references, handle.lengths])}
    difference = bam_sequences - fasta_sequences
    if len(difference) > 0:
        base_err = 'Error: BAM {} has the following sequence/length pairs not found in the {} fasta: {}.'
        err = base_err.format(bam_path, genome, ','.join(['-'.join(map(str, x)) for x in difference]))
        raise UserException(err)
    missing_seqs = fasta_sequences - bam_sequences
    if len(missing_seqs) > 0:
        base_msg = 'BAM {} does not have the following sequence/length pairs in its header: {}.'
        msg = base_msg.format(bam_path, ','.join(['-'.join(map(str, x)) for x in missing_seqs]))
        logger.warning(msg)
def main(bam):
    # get contig2size
    #faidx = FastaIndex(fasta)
    #contig2size = {c: faidx.id2stats[c][0] for c in faidx}
    sam = pysam.Samfile(bam)
    contig2size = {c: s for c, s in zip(sam.references, sam.lengths)}
    # estimate on largest contigs
    longest_contigs = sorted(contig2size, key=lambda x: contig2size[x], reverse=1)
    totdata = [0, 0]
    for c in longest_contigs[:25]:
        data = bam2matches(bam, regions=[c], mapq=10)
        print c, contig2size[c], 1.*data[0] / sum(data), sum(data)
        for i in range(2):
            totdata[i] += data[i]
    data = totdata
    print 1.*data[0] / sum(data), sum(data)
def test_make_split_read_bam_file(self):
    sorted_bam = path.join(TEST_DATA_DIR, 'sorted.bam')
    with pysam.Samfile(sorted_bam, 'rb') as samfile:
        for read in samfile:
            if not read.cigarstring:
                continue
            for breakpoint in (10, 50, 100):
                if breakpoint >= read.rlen:
                    continue
                for is_left_split in (True, False):
                    split_read = make_split_read(read, breakpoint, is_left_split)
                    cigar_items = list(Cigar(split_read.cigarstring).items())
                    clipped_item = cigar_items[0] if is_left_split else cigar_items[-1]
                    min_clip_len = breakpoint if is_left_split else read.rlen - breakpoint
                    # Can be longer if adjacent to another clip.
                    self.assertGreaterEqual(clipped_item[0], min_clip_len)
                    # Will be soft-clipped unless already hard-clipped.
                    self.assertIn(clipped_item[1], ('S', 'H'))
def chimeric_reads(bamfile, virus_contig, duplicates):
    igv_chimeric_file = os.path.splitext(args.bamfile)[0] + ".chimeric.igv.bam"
    if os.path.exists(igv_chimeric_file):
        return igv_chimeric_file
    with pysam.Samfile(args.bamfile, "rb") as in_handle, \
            pysam.Samfile(igv_chimeric_file, "wb", template=in_handle) as out_handle:
        for read in in_handle:
            try:
                chrom = in_handle.getrname(read.tid)
            except ValueError:
                continue
            if not is_chimeric_read(read, chrom, virus_contig):
                continue
            if read.qname in duplicates:
                continue
            out_handle.write(read)
    return igv_chimeric_file
def merge_by_key(bam_filenames, key_func, bam_out):
    file_cache = tk_cache.FileHandleCache(mode='rb', open_func=pysam.Samfile)
    total_reads = 0
    heap = []

    for bam_filename in bam_filenames:
        try:
            bam = file_cache.get(bam_filename)
            first_read = bam.next()
            heapq.heappush(heap, (key_func(first_read), first_read, bam_filename))
        except StopIteration:
            pass

    while len(heap) > 0:
        # Get the minimum item and write it to the bam.
        key, read, bam_filename = heapq.heappop(heap)
        bam = file_cache.get(bam_filename)
        bam_out.write(read)
        total_reads += 1

        # Get the next read from the source bam we just wrote from
        # If that BAM file is out of reads, then we leave that one out
        try:
            next_read = bam.next()
            heapq.heappush(heap, (key_func(next_read), next_read, bam_filename))
        except StopIteration:
            pass

    return total_reads
def create_bam_outfile(file_name, chrom_names, chrom_lengths, template=None, pgs=None, cos=None, rgs=None, replace_rg=False):
    """ Creates a bam file with given chromosome names and lengths.
    template is an existing bam file. If it is specified, chrom_names and chrom_lengths are ignored.
    pg is a dictionary specifying a 'PG' entry. ID field is required; PN/CL/PP/DS/VN fields are optional.
    rgs is a list of dicts specifying an 'RG' entry.
    If replace_rg is True, the existing 'RG' entry is overwritten.
    """
    if template:
        header = template.header
        if pgs is not None:
            for pg in pgs:
                if not header.has_key('PG'):
                    header['PG'] = []
                # add in the PP field based on previous PG entry
                if len(header['PG']) > 0:
                    pp = header['PG'][-1]['ID']
                    if pp is not None:
                        pg['PP'] = pp
                header['PG'].append(pg)
        if cos is not None:
            for co in cos:
                if not header.has_key('CO'):
                    header['CO'] = []
                header['CO'].append(co)
        if rgs is not None:
            if replace_rg and header.has_key('RG') and len(rgs) > 0:
                header['RG'] = []
            for rg in rgs:
                if not header.has_key('RG'):
                    header['RG'] = []
                header['RG'].append(rg)
        bam_file = pysam.Samfile(file_name, 'wb', header=header)
        tids = {name: n for (n, name) in enumerate(template.references)}
    else:
        header = {'SQ': [{'SN': chrom_names[n], 'LN': chrom_lengths[n]} for n in xrange(len(chrom_names))]}
        bam_file = pysam.Samfile(file_name, 'wb', header=header)
        tids = {chrom_names[n]: n for n in xrange(len(chrom_names))}
    return bam_file, tids
def create_bam_infile(file_name):
    bam_file = pysam.Samfile(file_name, 'rb')
    return bam_file
def create_sam_infile(file_name):
    sam_file = pysam.Samfile(file_name, 'r')
    return sam_file
def sort_by_name(file_name, sorted_prefix=None):
    """ Sorts a bam file by the read name, for paired-end
    """
    if sorted_prefix is None:
        sorted_prefix = file_name.replace('.bam', '') + '_namesorted'

    sorted_name = sorted_prefix + '.bam'
    # NOTE -- need to update our internal samtools in order to use pysam.sort
    #pysam.sort('-n', file_name, sorted_prefix)
    subprocess.check_call(['samtools', 'sort', '-n', file_name, sorted_prefix])

    return pysam.Samfile(sorted_name, 'rb')
def filter_multihits(filename, max_hits, verbose, tmp_dir):
    """
    Pre-processing function for cleaning up the input bam file.
    """
    if verbose:
        print_time_stamp('filtering multi-mapped up to %d hits.' % max_hits)
    multiread_set = set()
    subprocess.call("samtools view -h %s | awk '{if($2 !~ /_/ && $3 !~ /_/) {print}}' | samtools view -bS - > %s/filter_random.bam" % (filename, tmp_dir), shell=True)
    oldfile = pysam.Samfile(tmp_dir + '/filter_random.bam', 'rb')
    new_filename = os.path.abspath(tmp_dir + '/filter%d.bam' % max_hits)
    sorted_filename = os.path.abspath(tmp_dir + '/filter%d.sorted.bam' % max_hits)
    newfile = pysam.Samfile(new_filename, 'wb', template=oldfile)
    for aligned_read in oldfile:
        try:
            if aligned_read.opt("NH") < max_hits:
                newfile.write(aligned_read)
                if aligned_read.opt("NH") > 1:
                    multiread_set.add(aligned_read.qname)
        except KeyError:
            newfile.write(aligned_read)
    oldfile.close()
    newfile.close()
    sort_cmd = 'samtools sort -T %s/ -o %s %s' % (tmp_dir, sorted_filename, new_filename)
    index_cmd = 'samtools index %s' % sorted_filename
    subprocess.call(sort_cmd, shell=True)
    subprocess.call(index_cmd, shell=True)
    subprocess.call('rm %s/filter_random.bam %s' % (tmp_dir, new_filename), shell=True)
    pickle.dump(multiread_set, open(tmp_dir + '/multiread_set.pdata', 'wb'), -1)
    return (sorted_filename, multiread_set)
def peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=False, with_replicates=False, with_control=False, unstranded=False):
    """DOCSTRING
    Args:
    Returns:
    """
    # file handlers
    mbam = pysam.Samfile(os.path.join(out_dir, 'realigned.sorted.bam'), 'rb')
    ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'), 'rb')
    bam_dict = {'ubam.ip': [ubam], 'mbam.ip': [mbam]}
    if unique_only:
        ofile = open(os.path.join(out_dir, 'narrow_peaks.unique.bed'), 'w')
    else:
        ofile = open(os.path.join(out_dir, 'narrow_peaks.combined.bed'), 'w')

    # read in GTF
    gene_annot = read_gtf(gtf_fp)

    # iteratively call peaks in each gene
    peak_counter = 0
    for gene_name in tqdm(gene_annot):
        gene = gene_annot[gene_name]
        BED = call_gene_peak(bam_dict, gene,
                             unique_only=unique_only, with_control=with_control,
                             unstranded=unstranded)
        ofile.write(BED)
        #print BED
        peak_counter += len(BED.split('\n'))
    ofile.close()
    logger.info('called %i peaks' % peak_counter)
    return
def make_bam_handler_dict(ip_bam_list, con_bam_list):
    """DOCSTRING
    Args
    Returns
    """
    bam_dict = defaultdict(list)
    try:
        ubam_ip, mbam_ip = ip_bam_list
    except:
        ubam_ip = ip_bam_list[0]
        mbam_ip = None
    for bam_fn in ubam_ip.split(','):
        if not os.path.isfile(bam_fn):
            raise Exception('%s not found' % bam_fn)
        bam_dict['ubam.ip'].append(pysam.Samfile(bam_fn, 'rb'))
    if mbam_ip is not None:
        for bam_fn in mbam_ip.split(','):
            if not os.path.isfile(bam_fn):
                raise Exception('%s not found' % bam_fn)
            bam_dict['mbam.ip'].append(pysam.Samfile(bam_fn, 'rb'))

    if con_bam_list is None:
        return bam_dict

    try:
        ubam_con, mbam_con = con_bam_list
    except:
        ubam_con = con_bam_list[0]
        mbam_con = None
    for bam_fn in ubam_con.split(','):
        if not os.path.isfile(bam_fn):
            raise Exception('%s not found' % bam_fn)
        bam_dict['ubam.con'].append(pysam.Samfile(bam_fn, 'rb'))
    if mbam_con is not None:
        for bam_fn in mbam_con.split(','):
            if not os.path.isfile(bam_fn):
                raise Exception('%s not found' % bam_fn)
            bam_dict['mbam.con'].append(pysam.Samfile(bam_fn, 'rb'))
    return bam_dict
def main():
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input-file", action="store", type="string", dest="input_file", help="Alignment file in BAM format. BAM file should be sorted and indexed.")
    parser.add_option("-n", "--subset-num", action="store", type="int", dest="subset_num", help="Number of small BAM files")
    parser.add_option("-o", "--out-prefix", action="store", type="string", dest="output_prefix", help="Prefix of output BAM files. Output \"Prefix_num.bam\".")
    parser.add_option("-s", "--skip-unmap", action="store_true", dest="skip_unmap", help="Skip unmapped reads.")
    (options, args) = parser.parse_args()

    if not (options.input_file and options.subset_num and options.output_prefix):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file):
        print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n'
        sys.exit(0)

    samfile = pysam.Samfile(options.input_file, 'rb')
    sub_bam = {}
    count_bam = {}
    for i in range(0, options.subset_num):
        sub_bam[i] = pysam.Samfile(options.output_prefix + '_' + str(i) + '.bam', 'wb', template=samfile)
        count_bam[i] = 0

    total_alignment = 0
    print >>sys.stderr, "Dividing " + options.input_file + " ...",
    try:
        while(1):
            aligned_read = samfile.next()
            if aligned_read.is_unmapped and options.skip_unmap is True:
                continue
            total_alignment += 1
            tmp = randrange(0, options.subset_num)
            sub_bam[tmp].write(aligned_read)
            count_bam[tmp] += 1
    except StopIteration:
        print >>sys.stderr, "Done"

    for i in range(0, options.subset_num):
        print "%-55s%d" % (options.output_prefix + '_' + str(i) + '.bam', count_bam[i])
def main():
    usage = "%prog [options]" + '\n' + __doc__ + "\n"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-i", "--input", action="store", type="string", dest="input_file", help="Input BAM file")
    parser.add_option("-r", "--refgene", action="store", type="string", dest="refgene_bed", help="Reference gene model in BED format. Must be strandard 12-column BED file. [required]")
    parser.add_option("-q", "--mapq", action="store", type="int", dest="map_qual", default=30, help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default")
    parser.add_option("-n", "--frag-num", action="store", type="int", dest="fragment_num", default=3, help="Minimum number of fragment. default=%default")
    (options, args) = parser.parse_args()

    if not (options.input_file and options.refgene_bed):
        parser.print_help()
        sys.exit(0)
    if not os.path.exists(options.input_file + '.bai'):
        print >>sys.stderr, "cannot find index file of input BAM file"
        print >>sys.stderr, options.input_file + '.bai' + " does not exists"
        sys.exit(0)
    for file in (options.input_file, options.refgene_bed):
        if not os.path.exists(file):
            print >>sys.stderr, file + " does NOT exists" + '\n'
            sys.exit(0)

    print '\t'.join([str(i) for i in ("chrom", "tx_start", "tx_end", "symbol", "frag_count", "frag_mean", "frag_median", "frag_std")])
    for tmp in fragment_size(options.refgene_bed, pysam.Samfile(options.input_file), options.map_qual, options.fragment_num):
        print tmp
def __init__(self, inputFile):
    '''constructor. input could be bam or sam'''
    try:
        self.samfile = pysam.Samfile(inputFile, 'rb')
        if len(self.samfile.header) == 0:
            print >>sys.stderr, "BAM/SAM file has no header section. Exit!"
            sys.exit(1)
        self.bam_format = True
    except:
        self.samfile = pysam.Samfile(inputFile, 'r')
        if len(self.samfile.header) == 0:
            print >>sys.stderr, "BAM/SAM file has no header section. Exit!"
            sys.exit(1)
        self.bam_format = False
def fixmate(infile, outfile):
    inbam = pysam.Samfile(infile, 'rb')
    outbam = pysam.Samfile(outfile, 'wb', header=inbam.header, referencenames=inbam.references)
    qname = None
    nTotal = 0
    nFixed = 0
    count = 0
    reads = []
    gc.disable()
    for rseq in inbam.fetch(until_eof=True):
        nTotal += 1
        if qname is None or qname == rseq.qname:
            qname = rseq.qname
            reads.append(rseq)
        else:
            count = process(reads, inbam.getrname)
            if count > 0:
                for r in reads:
                    outbam.write(r)
                nFixed += count
            qname = rseq.qname
            del reads
            reads = [rseq]
        if nTotal % 200000 == 0:
            logger.info('%d read(s) fixed' % nTotal)
            gc.enable()
            gc.disable()
    count = process(reads, inbam.getrname)
    if count > 0:
        for r in reads:
            outbam.write(r)
        nFixed += count
    logger.info('%d read(s) processed' % nTotal)
    logger.info('%d read(s) fixed and written to file' % nFixed)
    inbam.close()
    outbam.close()
    return (nTotal, nFixed)
def genes_alignments(outdir):
    genes_align = {}
    align_file = pysam.Samfile(outdir + 'ORFs_on_genes_align.sam', 'rb')
    for rec in align_file:
        if rec.cigarstring is not None:
            gene_in_contig_len = int(re.findall(r'_len_(\d+)', rec.reference_name)[0])
            total_match = sum([i[1] for i in rec.cigar if i[0] == 0])
            conf = float(re.findall(r'_conf_([0-9]*\.[0-9]*)', rec.reference_name)[0])
            if gene_in_contig_len / total_match > 0.98:
                if genes_align.get(rec.reference_name) is None:
                    genes_align[rec.reference_name] = conf, set()
                genes_align[rec.reference_name][1].add(rec.qname)
    return genes_align
def bam_is_paired(bam_path, num_reads=20000, paired_cutoff=0.75):
    """
    Infers the paired-ness of a bam file.
    """
    sam = pysam.Samfile(bam_path)
    count = 0
    for rec in itertools.islice(sam, num_reads):
        if rec.is_paired:
            count += 1
    if tools.mathOps.format_ratio(count, num_reads) > 0.75:
        return True
    elif tools.mathOps.format_ratio(count, num_reads) < 1 - paired_cutoff:
        return False
    else:
        raise UserException("Unable to infer pairing from bamfile {}".format(bam_path))
def is_bam(path):
    """Checks if a path is a BAMfile"""
    try:
        pysam.Samfile(path)
    except IOError:
        raise RuntimeError('Path {} does not exist'.format(path))
    except ValueError:
        return False
    return True
def renamereads(inbamfn, outbamfn):
    inbam = pysam.Samfile(inbamfn, 'rb')
    outbam = pysam.Samfile(outbamfn, 'wb', template=inbam)

    paired = {}

    n = 0
    p = 0
    u = 0
    w = 0
    m = 0

    for read in inbam.fetch(until_eof=True):
        n += 1
        if read.is_paired:
            p += 1
            if read.qname in paired:
                uuid = paired[read.qname]
                del paired[read.qname]
                read.qname = uuid
                outbam.write(read)
                w += 1
                m += 1
            else:
                newname = str(uuid4())
                paired[read.qname] = newname
                read.qname = newname
                outbam.write(read)
                w += 1
        else:
            u += 1
            read.qname = str(uuid4())
            outbam.write(read)
            w += 1

    outbam.close()
    inbam.close()
def renamereads(inbamfn, outbamfn):
    inbam = pysam.Samfile(inbamfn, 'rb')
    outbam = pysam.Samfile(outbamfn, 'wb', template=inbam)

    paired = {}
    n = 0; p = 0; u = 0; w = 0; m = 0

    for read in inbam.fetch(until_eof=True):
        n += 1
        if read.is_paired:
            p += 1
            if read.qname in paired:
                uuid = paired[read.qname]
                del paired[read.qname]
                read.qname = uuid
                outbam.write(read)
                w += 1; m += 1
            else:
                newname = str(uuid4())
                paired[read.qname] = newname
                read.qname = newname
                outbam.write(read)
                w += 1
        else:
            u += 1
            read.qname = str(uuid4())
            outbam.write(read)
            w += 1

    outbam.close()
    inbam.close()
def removeReadsOverlappingHetRegion(inbamfn, bedfn, outbamfn, path):
    print "___ removing reads overlapping heterozygous region ___"
    inbamsorted = sub('.bam$', '.sorted', inbamfn)
    pysam.sort(inbamfn, inbamsorted)
    pysam.index(inbamsorted + '.bam')

    alignmentfile = pysam.AlignmentFile(inbamsorted + '.bam', "rb")
    outbam = pysam.Samfile(outbamfn, 'wb', template=alignmentfile)

    bedfile = open(bedfn, 'r')

    for bedline in bedfile:
        c = bedline.strip().split()

        if (len(c) == 3):
            chr2 = c[0]
            chr = c[0].strip("chr")
            start = int(c[1])
            end = int(c[2])
        else:
            continue

        try:
            readmappings = alignmentfile.fetch(chr2, start, end)
        except ValueError as e:
            print("problem fetching the read ")

        for shortread in readmappings:
            try:
                outbam.write(shortread)
            except ValueError as e:
                print("problem removing read :" + shortread.qname)

    outbamsorted = sub('.bam$', '.sorted', outbamfn)
    pysam.sort(outbamfn, outbamsorted)
    bamDiff(inbamsorted + '.bam', outbamsorted + '.bam', path)
    outbam.close()
def pileup(bamfile_name):
    samfile = pysam.Samfile(bamfile_name, "rb")
    output_pileup = open("test.cov", "w")
    sum_cov = 0
    for pos_pileup in samfile.pileup():
        sum_cov += pos_pileup.n
        if not pos_pileup.pos % 1e3:
            av_cov = int(sum_cov/1e3)
            sum_cov = 0
            print(pos_pileup.tid, pos_pileup.pos, av_cov, file=output_pileup)
def buildcov(self):
    """
    Builds Barcode counts of the reads aligned to the genome
    input : bam_file and bed_file
    output : Returns bed file with Chr, start, end, barcode, no of occurrences, strand
    """
    before = datetime.now()
    barcode_pattern = re.compile(r'BX:(A-Z)?:[ACGT]*-[0-9]')
    self.__LOGGER.info("WARNING: Dropping Reads Alignments that are not barcoded")
    output_bed = open(self.barcode_bed, "wb")
    with open(self.bed_file) as regions:
        with pysam.Samfile(self.obam_file, "rb") as samfile:
            for line in regions:
                chr, start, end, info = line.rstrip('\n').rstrip('\r').split("\t")
                b_dict = {}
                try:
                    readsinregion = samfile.fetch(chr, int(start), int(end))
                    for read in readsinregion:
                        tags = dict(read.tags)
                        if 'BX' in tags.keys():
                            if tags['BX'] in b_dict:
                                b_dict[tags['BX']] += 1
                            else:
                                b_dict[tags['BX']] = 1
                except:
                    self.__LOGGER.info("Skipping region : {0}:{1}:{2}".format(chr, start, end))
                    continue
                for k, v in b_dict.items():
                    output_bed.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(chr, start, end, info, k, v))
    regions.close()
    samfile.close()
    output_bed.close()
    after = datetime.now()
    self.__LOGGER.info('Computed Barcode profiles for the vcf regions completed in {0}'.format(str(after - before)))
def load_samfile(sam_file):
    """To load an indexed bam file"""
    ftype = sam_file.split(".")[-1]
    if ftype != "bam" and ftype != "sam":
        print("Error: file type need suffix of bam or sam.")
        sys.exit(1)
    # print("Loading a %s" %ftype + " with file name: %s" %sam_file)
    if ftype == "bam":
        samfile = pysam.Samfile(sam_file, "rb")
    else:
        samfile = pysam.Samfile(sam_file, "r")
    return samfile
def run(self):
    try:
        proc_name = self.name
        # Open the bams
        nBams = []  # nonTrim Bams
        tBams = []  # trim Bams
        for i in self.args.bam:
            nBams.append(pysam.Samfile(i))
        for i in self.args.pacBam:
            tBams.append(pysam.Samfile(i))
        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                # Poison pill means shutdown
                logging.info('Thread %s: Exiting\n' % proc_name)
                self.task_queue.task_done()
                break
            try:
                answer = next_task(nBams, tBams)
            except Exception as e:
                logging.error("Exception raised in task %s" % (str(e)))
                self.task_queue.task_done()
                self.result_queue.put("Failure - UNK - %s" % str(e))
                logging.info("fail in groupid=%s" % next_task.data.name)
                continue
            self.task_queue.task_done()
            self.result_queue.put(answer)
        return
    except Exception as e:
        logging.error("Consumer %s Died\nERROR: %s" % (self.name, e))
        return
def mapTails(bam, args):
    if bam.endswith('.bam'):
        bam = pysam.Samfile(bam, 'rb')
    elif bam.endswith('.sam'):
        bam = pysam.Samfile(bam)
    else:
        logging.error("Cannot open input file! %s" % (bam))
        exit(1)

    logging.info("Extracting tails")
    tailfastq = tempfile.NamedTemporaryFile(suffix=".fastq", delete=False, dir=args.temp)
    tailfastq.close()
    tailfastq = tailfastq.name
    r, t, m = extractTails(bam, outFq=tailfastq, minLength=args.minTail)

    logging.info("Parsed %d reads" % (r))
    logging.info("Found %d tails" % (t))
    logging.info("%d reads had double tails" % (m))
    if t == 0:
        logging.info("No tails -- Exiting")
        exit(0)

    tailmap = tempfile.NamedTemporaryFile(suffix=".sam", delete=False, dir=args.temp)
    tailmap.close()
    tailmap = tailmap.name
    callBlasr(tailfastq, args.reference, args.params, args.nproc, tailmap)
    bam.close()  # reset
    return tailmap
def __init__(self, task_queue, result_queue, bamName, referenceName, honH5):  # *args, **kwargs):
    multiprocessing.Process.__init__(self)
    self.task_queue = task_queue
    self.result_queue = result_queue
    self.bam = pysam.Samfile(bamName)
    if referenceName is not None:
        self.reference = pysam.Fastafile(referenceName)
    else:
        self.reference = None
    self.honH5 = honH5
    #self.args = args
    #self.kwargs = kwargs
def __call__(self, bam, reference, honH5):
    """
    Takes a pysam.Samfile
    """
    logging.info("Starting %s" % (self.name))
    size = self.end - self.start
    regName = "%s:%d-%d" % (self.chrom, self.start, self.end)
    logging.debug("Making container for %s (%s %d bp)" % (self.groupName, regName, size))

    logging.debug("Parsing bam")
    st = max(0, self.start)
    readCount = bam.count(self.chrom, st, self.end)
    reads = bam.fetch(self.chrom, st, self.end)
    if readCount == 0:
        logging.warning("No reads found in %s" % self.groupName)
        self.failed = True
        self.errMessage = "No reads found in %s" % self.groupName
        return
    else:
        logging.info("%d reads to parse in %s" % (readCount, self.groupName))

    myData = self.countErrors(reads, self.start, size, self.args)

    # request loc on honH5 and flush results
    with honH5.acquireH5('a') as h5dat:
        out = h5dat.create_group(self.groupName)

        out.attrs["reference"] = self.chrom
        out.attrs["start"] = self.start
        out.attrs["end"] = self.end

        if size < CHUNKSHAPE[1]:
            chunk = (CHUNKSHAPE[0], size-1)
        else:
            chunk = CHUNKSHAPE
        container = out.create_dataset("data", data=myData, \
                                       chunks=chunk, compression="gzip")
        h5dat.flush()
def __init__(self, threadidx, options, config, startline, endline, names):
    multiprocessing.Process.__init__(self)

    # Initializing variables
    self.threadidx = threadidx
    self.options = options
    self.config = config
    self.startline = startline
    self.endline = endline
    self.names = names

    # Initializing reads directory
    self.reads = dict()

    # Connecting to BAM file
    self.samfile = pysam.Samfile(options.input, "rb")

    # Connecting to transcript database file
    if not config['transcript_db'] is None:
        self.enstdb = pysam.Tabixfile(config['transcript_db'])
    else:
        self.enstdb = None

    # Initializing output files
    if int(options.threads) > 1:
        if config['outputs']['regions']:
            self.out_targets = open(options.output + '_regions_tmp_' + str(threadidx) + '.txt', 'w')
        if config['outputs']['profiles']:
            self.out_profiles = open(options.output + '_profiles_tmp_' + str(threadidx) + '.txt', 'w')
            if not config['transcript_db'] is None:
                self.out_poor = open(options.output + '_poor_tmp_' + str(threadidx) + '.txt', 'w')
    else:
        if config['outputs']['regions']:
            self.out_targets = open(options.output + '_regions.txt', 'w')
        if config['outputs']['profiles']:
            self.out_profiles = open(options.output + '_profiles.txt', 'w')
            if not config['transcript_db'] is None:
                self.out_poor = open(options.output + '_poor.txt', 'w')

    # Checking if target is fail or pass
def igv_reads(bamfile, virus_contig):
    igv_chimeric_file = os.path.splitext(args.bamfile)[0] + ".chimeric.igv.bam"
    if os.path.exists(igv_chimeric_file):
        return igv_chimeric_file
    with pysam.Samfile(args.bamfile, "rb") as in_handle, \
            pysam.Samfile(igv_chimeric_file, "wb", template=in_handle) as out_handle:
        contig = in_handle.gettid(args.virus_contig)
        for read in in_handle:
            if skip_read(read, contig, args.virus_contig, in_handle, False):
                continue
            chrom = in_handle.getrname(read.tid)
            out_handle.write(read)
    return igv_chimeric_file
def keep_chimeric_reads(bamfile, virus_contig):
    chimeric_file = os.path.splitext(args.bamfile)[0] + ".chimeric.bam"
    if os.path.exists(chimeric_file):
        return chimeric_file
    with pysam.Samfile(args.bamfile, "rb") as in_handle, \
            pysam.Samfile(chimeric_file, "wb", template=in_handle) as out_handle:
        contig = in_handle.gettid(args.virus_contig)
        for read in in_handle:
            if skip_read(read, contig, args.virus_contig, in_handle, True):
                continue
            chrom = in_handle.getrname(read.tid)
            out_handle.write(read)
    return chimeric_file
def get_duplicates(bam_file):
    s = set()
    with pysam.Samfile(bam_file, "rb") as in_handle:
        for read in in_handle:
            if read.is_duplicate:
                s.update([read.qname])
    return s
def concatenate_and_fix_bams(out_bamfile, bamfiles, drop_tags=None):
    """Concatenate bams with different references and populate tags.
    Assumes that the input bams have completely non-overlapping reference entries.
    No sorting assumed. Output reads are in the same order as input.
    Tags are stripped from the header (which is assumed to look like an AugmentedFastqHeader)
    and are stored as bam tags. Tags in drop_tags are completely dropped.
    Args:
    - drop_tags: Set of tag names to ignore. If None, then all tags will be included.
    """
    template_bam = pysam.Samfile(bamfiles[0])

    header = template_bam.header
    for bam_fn in bamfiles[1:]:
        bam = pysam.Samfile(bam_fn)
        header['SQ'].extend(bam.header['SQ'])

    refname_to_tid = {ref_head['SN']: idx for (idx, ref_head) in enumerate(header['SQ'])}

    bam_out = pysam.Samfile(out_bamfile, "wb", header=header)

    for bam_fn in bamfiles:
        bam = pysam.Samfile(bam_fn)
        for rec in bam:
            if rec.is_unmapped and rec.mate_is_unmapped:
                rec.reference_id = -1
                rec.next_reference_id = -1
            elif rec.is_unmapped and not rec.mate_is_unmapped:
                rec.next_reference_id = refname_to_tid[bam.references[rec.next_reference_id]]
                rec.reference_id = rec.next_reference_id
            elif not rec.is_unmapped and rec.mate_is_unmapped:
                rec.reference_id = refname_to_tid[bam.references[rec.reference_id]]
                rec.next_reference_id = rec.reference_id
            else:
                rec.reference_id = refname_to_tid[bam.references[rec.reference_id]]
                rec.next_reference_id = refname_to_tid[bam.references[rec.next_reference_id]]

            # Pull fields out of qname, and make tags, writing out to bam_out
            qname_split = rec.qname.split(cr_fastq.AugmentedFastqHeader.TAG_SEP)
            rec.qname = qname_split[0]

            tags = rec.tags
            for i in range(1, len(qname_split), 2):
                tag = (qname_split[i], qname_split[i+1])
                # Don't add empty tags
                if len(tag[1]) > 0 and (drop_tags is None or not tag[0] in drop_tags):
                    tags.append(tag)

            rec.tags = tags
            bam_out.write(rec)
def realigner(out_dir, tmp_dir, winsize=50, unstranded=False):
    """DOCSTRING
    Args:
    Returns:
    """
    # file handlers
    mbam = pysam.Samfile(os.path.join(tmp_dir, 'multi.sorted.bam'), 'rb')
    ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'), 'rb')
    obam = pysam.Samfile(os.path.join(out_dir, 'realigned.bam'), 'wb', template=mbam)
    chr_list = [x['SN'] for x in ubam.header['SQ']]

    # construct the mread_dict; this will be needed throughout
    mread_dict = defaultdict(list)
    for alignment in mbam:
        mread_dict[alignment.qname].append(alignment)

    # keep a record of processed reads
    processed_mreads = set()

    # iterate through all mreads
    for read_qname in mread_dict:
        if read_qname in processed_mreads:
            continue

        ## construct the fully-connected subgraph for each read
        read_to_locations, processed_mreads = \
            construct_subgraph(mbam, read_qname, mread_dict, processed_mreads, chr_list,
                               winsize=winsize, unstranded=unstranded)
        subgraph = set()
        for read in read_to_locations:
            _ = map(subgraph.add, read_to_locations[read].keys())
        subgraph = list(subgraph)

        ## build the BIT tracks
        node_track, multi_reads_weights = \
            construct_BIT_track(subgraph, read_to_locations, ubam, unstranded)

        ## run EM
        multi_reads_weights = \
            run_EM(node_track, multi_reads_weights, w=winsize)

        ## write to obam
        for read in multi_reads_weights:
            for node in multi_reads_weights[read]:
                alignment = read_to_locations[read][node]
                score = round(multi_reads_weights[read][node][0], 3)
                alignment.set_tag('AS', score)
                alignment.set_tag('PG', 'CLAM')
                obam.write(alignment)

    # sort the final output
    logger.info('sorting output')
    obam.close()
    ubam.close()
    mbam.close()
    obam_sorted_fn = os.path.join(out_dir, 'realigned.sorted.bam')
    pysam.sort('-o', obam_sorted_fn, os.path.join(out_dir, 'realigned.bam'))
    pysam.index(obam_sorted_fn)
    os.remove(os.path.join(out_dir, 'realigned.bam'))
    return
def peakcaller(tmp_dir, out_dir, gtf_fp, unique_only=False, with_replicates=False, with_control=False, unstranded=False):
    """DOCSTRING
    Args:
    Returns:
    """
    # file handlers
    mbam = pysam.Samfile(os.path.join(out_dir, 'realigned.sorted.bam'), 'rb')
    ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'), 'rb')
    bam_dict = {'ubam.ip': [ubam], 'mbam.ip': [mbam]}
    if unique_only:
        ofile = open(os.path.join(out_dir, 'narrow_peaks.unique.bed'), 'w')
    else:
        ofile = open(os.path.join(out_dir, 'narrow_peaks.combined.bed'), 'w')

    # read in GTF
    logger.info('read gtf from "%s" ' % gtf_fp)
    gene_annot = read_gtf(gtf_fp)

    # fetch tag counts in each gene
    ## gene_name: { 'ip': bin_interval_ip, 'con': bin_interval_con }
    logger.info('reading gene counts')
    gene_count = defaultdict(dict)
    for gene_name in tqdm(gene_annot):
        gene = gene_annot[gene_name]
        bin_interval_ip, bin_interval_con = \
            gene_to_count(bam_dict, gene, unique_only=unique_only, with_control=with_control, unstranded=unstranded)
        if bin_interval_ip is None:
            ## if no IP reads in this gene
            continue
        gene_count[gene_name]['ip'] = bin_interval_ip
        gene_count[gene_name]['con'] = bin_interval_con

    # estimate global overdispersion param. for each dataset
    logger.info('estimating dispersion parameter')
    alpha_ip_vec = estim_dispersion_param(len(bam_dict['ubam.ip']), gene_count, 'ip')
    if with_control:
        alpha_con_vec = estim_dispersion_param(len(bam_dict['ubam.con']), gene_count, 'con')
    else:
        alpha_con_vec = np.asarray(alpha_ip_vec)

    # perform statistical test
    logger.info('calling peaks')
    peak_counter = 0
    for gene_name in gene_count:
        gene = gene_annot[gene_name]
        BED = test_gene_bin(gene, gene_count[gene_name]['ip'], gene_count[gene_name]['con'], alpha_ip_vec, alpha_con_vec)
        ofile.write(BED)
        peak_counter += len(BED.split('\n'))
    ofile.close()
    logger.info('called %i peaks' % peak_counter)
    return
def genebody_coverage(bam, position_list):
    '''
    position_list is dict returned from genebody_percentile
    position is 1-based genome coordinate
    '''
    samfile = pysam.Samfile(bam, "rb")
    aggreagated_cvg = collections.defaultdict(int)

    gene_finished = 0
    for chrom, strand, positions in position_list.values():
        coverage = {}
        for i in positions:
            coverage[i] = 0.0
        chrom_start = positions[0]-1
        if chrom_start < 0:
            chrom_start = 0
        chrom_end = positions[-1]
        try:
            samfile.pileup(chrom, 1, 2)
        except:
            continue

        for pileupcolumn in samfile.pileup(chrom, chrom_start, chrom_end, truncate=True):
            ref_pos = pileupcolumn.pos+1
            if ref_pos not in positions:
                continue
            if pileupcolumn.n == 0:
                coverage[ref_pos] = 0
                continue
            cover_read = 0
            for pileupread in pileupcolumn.pileups:
                if pileupread.is_del:
                    continue
                if pileupread.alignment.is_qcfail:
                    continue
                if pileupread.alignment.is_secondary:
                    continue
                if pileupread.alignment.is_unmapped:
                    continue
                if pileupread.alignment.is_duplicate:
                    continue
                cover_read += 1
            coverage[ref_pos] = cover_read
        tmp = [coverage[k] for k in sorted(coverage)]
        if strand == '-':
            tmp = tmp[::-1]
        for i in range(0, len(tmp)):
            aggreagated_cvg[i] += tmp[i]
        gene_finished += 1

        if gene_finished % 100 == 0:
            print >>sys.stderr, "\t%d transcripts finished\r" % (gene_finished),
    return aggreagated_cvg
def read_bam(bam_fn, precursors, clean=True):
    """
    read bam file and perform realignment of hits
    """
    bam_fn = _sam_to_bam(bam_fn)
    bam_fn = _bam_sort(bam_fn)
    mode = "r" if bam_fn.endswith("sam") else "rb"
    handle = pysam.Samfile(bam_fn, mode)
    reads = defaultdict(hits)
    for line in handle:
        if line.reference_id < 0:
            logger.debug("Sequence not mapped: %s" % line.reference_id)
            continue
        query_name = line.query_name
        if query_name not in reads and line.query_sequence == None:
            continue
        if line.query_sequence and line.query_sequence.find("N") > -1:
            continue
        if query_name not in reads:
            reads[query_name].set_sequence(line.query_sequence)
            reads[query_name].counts = _get_freq(query_name)
        if line.is_reverse:
            logger.debug("Sequence is reverse: %s" % line.query_name)
            continue
        chrom = handle.getrname(line.reference_id)
        # print "%s %s %s %s" % (line.query_name, line.reference_start, line.query_sequence, chrom)
        cigar = line.cigartuples
        iso = isomir()
        iso.align = line
        iso.set_pos(line.reference_start, len(reads[query_name].sequence))
        logger.debug("READ::From BAM start %s end %s" % (iso.start, iso.end))
        if len(precursors[chrom]) < line.reference_start + len(reads[query_name].sequence):
            continue
        iso.subs, iso.add, iso.cigar = filter.tune(reads[query_name].sequence, precursors[chrom],
                                                   line.reference_start, cigar)
        logger.debug("READ::After tune start %s end %s" % (iso.start, iso.end))
        if len(iso.subs) < 2:
            reads[query_name].set_precursor(chrom, iso)

    logger.info("Hits: %s" % len(reads))
    if clean:
        reads = filter.clean_hits(reads)
        logger.info("Hits after clean: %s" % len(reads))
    return reads
def main(argv):
    args = parse_args(argv)
    print args

    if args.input == "-" and sys.stdin.isatty():
        sys.stderr.write("STDIN is a terminal, terminating!\n")
        return 1
    elif sys.stdout.isatty():
        sys.stderr.write("STDOUT is a terminal, terminating!\n")
        return 1

    with pysam.Samfile(args.input, "rb") as infile:
        with pysam.Samfile("-", "wb", template=infile) as outfile:
            curpos = None
            curvariants = {}
            for (read_num, read) in enumerate(infile):
                if curpos and ((read.tid, read.pos) != curpos):
                    # Sort order is defined as ascending 'tid's and positions
                    if curpos > (read.tid, read.pos) and not read.is_unmapped:
                        sys.stderr.write("ERROR: Input file does not appear "
                                         "to be sorted by coordinates at "
                                         "record %i, aborting ...\n" % (read_num,))
                        return 1

                    _flush_buffer(outfile, curvariants, args.remove_duplicates)
                    curpos = None

                is_filtered = read.flag & _FILTERED_FLAGS
                is_collapsed = read.qname.startswith("M_")
                if is_filtered or not (read.qual and is_collapsed):
                    outfile.write(read)
                    continue

                curpos = (read.tid, read.pos)
                nkey = (read.is_reverse, read.pos, read.alen)
                if nkey in curvariants:
                    curvariants[nkey][0].append(read)
                    curvariants[nkey][1] += 1
                else:
                    curvariants[nkey] = [[read], 1]

            _flush_buffer(outfile, curvariants, args.remove_duplicates)
    return 0
def setup_hints(job, input_file_ids):
    """
    Generates hints for a given genome with a list of BAMs. Will add annotation if it exists.
    """
    # RNA-seq hints
    filtered_bam_file_ids = {'BAM': collections.defaultdict(list), 'INTRONBAM': collections.defaultdict(list)}
    for dtype, bam_dict in input_file_ids['bams'].iteritems():
        if len(bam_dict) == 0:
            continue
        # Since BAMs are valid, we can assume that they all share the same header
        bam_file_id, bai_file_id = bam_dict.values()[0]
        bam_path = job.fileStore.readGlobalFile(bam_file_id)
        sam_handle = pysam.Samfile(bam_path)
        # triple disk usage to deal with name sorted bam
        disk_usage = tools.toilInterface.find_total_disk_usage([bam_file_id, bai_file_id]) * 3
        # generate reference grouping that will be used downstream until final cat step
        grouped_references = [tuple(x) for x in group_references(sam_handle)]
        for original_path, (bam_file_id, bai_file_id) in bam_dict.iteritems():
            for reference_subset in grouped_references:
                j = job.addChildJobFn(namesort_bam, bam_file_id, bai_file_id, reference_subset, disk_usage,
                                      disk=disk_usage, cores=4, memory='16G')
                filtered_bam_file_ids[dtype][reference_subset].append(j.rv())

    # IsoSeq hints
    iso_seq_hints_file_ids = []
    iso_seq_file_ids = input_file_ids['iso_seq_bams']
    if len(iso_seq_file_ids) > 0:
        for bam_file_id, bai_file_id in iso_seq_file_ids:
            disk_usage = tools.toilInterface.find_total_disk_usage([bam_file_id, bai_file_id])
            j = job.addChildJobFn(generate_iso_seq_hints, bam_file_id, bai_file_id, disk=disk_usage)
            iso_seq_hints_file_ids.append(j.rv())

    # protein hints
    if input_file_ids['protein_fasta'] is not None:
        disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids['protein_fasta'])
        j = job.addChildJobFn(generate_protein_hints, input_file_ids['protein_fasta'],
                              input_file_ids['genome_fasta'], disk=disk_usage)
        protein_hints_file_id = j.rv()
    else:
        protein_hints_file_id = None

    # annotation hints
    if input_file_ids['annotation'] is not None:
        disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids['annotation'])
        j = job.addChildJobFn(generate_annotation_hints, input_file_ids['annotation'], disk=disk_usage)
        annotation_hints_file_id = j.rv()
    else:
        annotation_hints_file_id = None

    return job.addFollowOnJobFn(merge_bams, filtered_bam_file_ids, annotation_hints_file_id,
                                iso_seq_hints_file_ids, protein_hints_file_id).rv()
def namesort_bam(job, bam_file_id, bai_file_id, reference_subset, disk_usage, num_reads=50 ** 6):
    """
    Slices out the reference subset from a BAM, name sorts that subset, then chunks the resulting reads up
    for processing by filterBam.
    """
    def write_bam(r, ns_handle):
        """Write to the path, returns file ID"""
        outf = tools.fileOps.get_tmp_toil_file()
        outf_h = pysam.Samfile(outf, 'wb', template=ns_handle)
        for rec in r:
            outf_h.write(rec)
        outf_h.close()
        return job.fileStore.writeGlobalFile(outf)

    bam_path = job.fileStore.readGlobalFile(bam_file_id)
    is_paired = bam_is_paired(bam_path)
    job.fileStore.readGlobalFile(bai_file_id, bam_path + '.bai')
    name_sorted = tools.fileOps.get_tmp_toil_file(suffix='name_sorted.bam')
    cmd = [['samtools', 'view', '-b', bam_path] + list(reference_subset),
           ['sambamba', 'sort', '-t', '4', '-m', '15G', '-o', '/dev/stdout', '-n', '/dev/stdin']]
    tools.procOps.run_proc(cmd, stdout=name_sorted)
    ns_handle = pysam.Samfile(name_sorted)
    # this group may come up empty -- check to see if we have at least one mapped read
    try:
        _ = ns_handle.next()
    except StopIteration:
        return None
    # reset file handle to start
    ns_handle = pysam.Samfile(name_sorted)
    filtered_file_ids = []
    r = []
    for qname, reads in itertools.groupby(ns_handle, lambda x: x.qname):
        r.extend(list(reads))
        if len(r) >= num_reads:
            file_id = write_bam(r, ns_handle)
            j = job.addChildJobFn(filter_bam, file_id, is_paired, disk='4G', memory='2G')
            filtered_file_ids.append(j.rv())
            r = []
    # do the last bin, if its non-empty
    if len(r) > 0:
        file_id = write_bam(r, ns_handle)
        j = job.addChildJobFn(filter_bam, file_id, is_paired, disk='4G', memory='2G')
        filtered_file_ids.append(j.rv())
    return job.addFollowOnJobFn(merge_filtered_bams, filtered_file_ids, disk=disk_usage, memory='16G').rv()
def test(argv):
    numpy.seterr(all="ignore")
    args = parseArgs(argv)
    setupLogging(True)  # keep debug on.. you're testing!
    logging.critical(("Running HSpots.py directly implements testing mode. "
                      "If you're trying to run the full, actual program, use "
                      "Honey.py spots"))

    bam = pysam.Samfile(args.bam)
    reference = pysam.Fastafile(args.reference)
    try:
        if bam.header["HD"]["SO"] != "coordinate":
            logging.warning("BAM is not sorted by coordinates! Performance may be slower")
    except KeyError:
        logging.warning("Assuming BAM is sorted by coordinate. Be sure this is correct")
    logging.info("Running in test mode")

    #do what you will.. from here
    #spot = SpotResult(chrom='11', start=2215290, end=2215798, svtype="DEL", size=208)
    #spot = SpotResult(chrom='22', start=45964261, end=45965596, svtype="DEL", size=-1)
    # This is what I need to start with
    #spot = SpotResult(chrom="22", start=45963975, end=45964532, svtype="DEL", size=57)
    fh = open("honeymissing.bed")
    for line in fh.readlines():
        data = line.strip().split('\t')
        spot = SpotResult(chrom=data[0], start=int(data[1]), end=int(data[2]), \
                          size=int(data[3].split('=')[-1]), svtype="DEL")
        j = SpotCaller('group', spot.chrom, spot.start, spot.end, args)
        if j.supportingReadsFilter(spot, bam, args):
            consen = ConsensusCaller(spot, args)
            consen(bam, reference, 'none')
            for i in consen.newSpots:
                i.tags["seqmade"] = True
                print i
            if len(consen.newSpots) == 0:
                spot.tags["noseq"] = True
                print str(spot)
        else:
            spot.tags["filtfail"] = True
            print str(spot)

    #done with test code
    logging.info("Finished testing")