我们从Python开源项目中,提取了以下13个代码示例,用于说明如何使用pysam.sort()。
def merge_reads(bams, merged_output_dir): """ Merge bam files, can take odd numbers of reads. """ try: os.mkdir(merged_output_dir) except OSError: pass bam_list = get_bam_pairs(bams) for sample_id, bams in bam_list.items(): header = get_header(bams[0]) bam_out = os.path.join(merged_output_dir, os.path.basename(sample_id)) + '.bam' out = pysam.AlignmentFile(bam_out, 'wb', header=header) for bam in bams: sam = pysam.AlignmentFile(bam, 'rb') for read in sam: out.write(read) out.close() pysam.sort(bam_out, 'tmp') os.rename('tmp.bam',bam_out) pysam.index(bam_out)
def sort(file_name, sorted_prefix=None): """ Sorts and indexes the bam file given by file_name. """ if sorted_prefix is None: sorted_prefix = file_name.replace('.bam', '') + '_sorted' sorted_name = sorted_prefix + ".bam" subprocess.check_call(['samtools','sort', '-o', sorted_name, file_name])
def sort_and_index(file_name, sorted_prefix=None): """ Sorts and indexes the bam file given by file_name. """ if sorted_prefix is None: sorted_prefix = file_name.replace('.bam', '') + '_sorted' sorted_name = sorted_prefix + '.bam' subprocess.check_call(['samtools','sort', '-o', sorted_name, file_name]) pysam.index(sorted_name)
def sort_by_name(file_name, sorted_prefix=None): """ Sorts a bam file by the read name, for paired-end """ if sorted_prefix is None: sorted_prefix = file_name.replace('.bam', '') + '_namesorted' sorted_name = sorted_prefix + '.bam' # NOTE -- need to update our internal samtools in order to use pysam.sort #pysam.sort('-n', file_name, sorted_prefix) subprocess.check_call(['samtools', 'sort', '-n', file_name, sorted_prefix]) return pysam.Samfile(sorted_name, 'rb')
def _save_dict(bed, out_fname, val_index=None): """Save data from dict to BED file.""" sites = pybedtools.BedTool( _iter_bed_dict(bed, val_index=val_index) ).saveas() sites1 = sites.sort().saveas(out_fname) return sites1
def find_roi_bam(chromosome_event): chr,event = chromosome_event .split("_") roi,sortbyname,sortbyCoord, hetsnp = init_file_names(chr, event, tmpbams_path, haplotype_path) exonsinroibed = "/".join([haplotype_path, event + "_exons_in_roi_"+ chr +'.bed']) success = True try: if not terminating.is_set(): roisort = sub('.bam$', '.sorted', roi) if(os.path.isfile(exonsinroibed)): cmd=" ".join(["sort -u", exonsinroibed, "-o", exonsinroibed]); runCommand(cmd) extractPairedReadfromROI(sortbyname, exonsinroibed, roi) removeIfEmpty(tmpbams_path,ntpath.basename(roi)) pysam.sort(roi ,roisort ) pysam.index(roisort+'.bam') os.remove(roi) except (KeyboardInterrupt): logger.error('Exception Crtl+C pressed in the child process in find_roi_bam for chr ' +chr + event) terminating.set() success=False return except Exception as e: logger.exception("Exception in find_roi_bam %s" ,e ) terminating.set() success=False return if(success): logger.debug("find_roi_bam complete successfully for "+chr + event) return
def removeReadsOverlappingHetRegion(inbamfn, bedfn,outbamfn,path): print "___ removing reads overlapping heterozygous region ___" inbamsorted = sub('.bam$','.sorted',inbamfn) pysam.sort(inbamfn, inbamsorted) pysam.index(inbamsorted+'.bam') alignmentfile = pysam.AlignmentFile(inbamsorted+'.bam', "rb" ) outbam = pysam.Samfile(outbamfn, 'wb', template=alignmentfile ) bedfile = open(bedfn, 'r') for bedline in bedfile: c = bedline.strip().split() if (len(c) == 3 ): chr2 = c[0] chr = c[0].strip("chr") start = int(c[1]) end = int(c[2]) else : continue try: readmappings = alignmentfile.fetch(chr2, start, end) except ValueError as e: print("problem fetching the read ") for shortread in readmappings: try: outbam.write(shortread) except ValueError as e: print ("problem removing read :" + shortread.qname) outbamsorted = sub('.bam$','.sorted',outbamfn) pysam.sort(outbamfn, outbamsorted) bamDiff(inbamsorted+'.bam', outbamsorted +'.bam', path ) outbam.close()
def find_roi_amp(chromosome_event): chr, event = chromosome_event.split("_") roi, sortbyname, sortbyCoord, hetsnp = init_file_names(chr, event, tmpbams_path, haplotype_path) exonsinroibed = "/".join([haplotype_path, event + "_exons_in_roi_" + chr + '.bed']) success = True try: if not terminating.is_set(): roisort = sub('.bam$', '.sorted', roi) if (os.path.isfile(exonsinroibed)): cmd = " ".join(["sort -u", exonsinroibed, "-o", exonsinroibed]); runCommand(cmd) extractPairedReadfromROI(sortbyname, exonsinroibed, roi) removeIfEmpty(tmpbams_path, ntpath.basename(roi)) pysam.sort(roi, roisort) pysam.index(roisort + '.bam') os.remove(roi) except (KeyboardInterrupt): logger.error('Exception Crtl+C pressed in the child process in find_roi_bam for chr ' + chr + event) terminating.set() success = False return except Exception as e: logger.exception("Exception in find_roi_bam %s", e) terminating.set() success = False return if (success): logger.debug("find_roi_bam complete successfully for " + chr + event) return
def _sort_index(unsorted, output_bam): pysam.sort("-o", output_bam, "-m", PYSAM_SORT_MEM, unsorted) os.unlink(unsorted) pysam.index(output_bam)
def realigner(out_dir, tmp_dir, winsize=50, unstranded=False): """DOCSTRING Args: Returns: """ # file handlers mbam = pysam.Samfile(os.path.join(tmp_dir, 'multi.sorted.bam'),'rb') ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'),'rb') obam = pysam.Samfile(os.path.join(out_dir, 'realigned.bam'), 'wb', template = mbam) chr_list=[x['SN'] for x in ubam.header['SQ']] # construct the mread_dict; this will be needed throughout mread_dict = defaultdict(list) for alignment in mbam: mread_dict[alignment.qname].append(alignment) # keep a record of processed reads processed_mreads = set() # iterate through all mreads for read_qname in mread_dict: if read_qname in processed_mreads: continue ## construct the fully-connected subgraph for each read read_to_locations, processed_mreads = \ construct_subgraph(mbam, read_qname, mread_dict, processed_mreads, chr_list, winsize=winsize, unstranded=unstranded) subgraph = set() for read in read_to_locations: _ = map(subgraph.add, read_to_locations[read].keys()) subgraph = list(subgraph) ## build the BIT tracks node_track, multi_reads_weights = \ construct_BIT_track(subgraph, read_to_locations, ubam, unstranded) ## run EM multi_reads_weights = \ run_EM(node_track, multi_reads_weights, w=winsize) ## write to obam for read in multi_reads_weights: for node in multi_reads_weights[read]: alignment = read_to_locations[read][node] score = round(multi_reads_weights[read][node][0], 3) alignment.set_tag('AS', score) alignment.set_tag('PG', 'CLAM') obam.write(alignment) # sort the final output logger.info('sorting output') obam.close() ubam.close() mbam.close() obam_sorted_fn = os.path.join(out_dir, 'realigned.sorted.bam') pysam.sort('-o', obam_sorted_fn, os.path.join(out_dir, 'realigned.bam')) pysam.index(obam_sorted_fn) os.remove(os.path.join(out_dir, 'realigned.bam')) return
def filter_bam_maxtags(obam_fn, ibam_fn, max_tags=1): """DOCSTRING Args Returns """ assert max_tags>0 # prepare files ibam = pysam.Samfile(ibam_fn, 'rb') obam = pysam.Samfile(obam_fn, 'wb', template=ibam) # init collapse_dict = defaultdict(list) chr_list=[x['SN'] for x in ibam.header['SQ']] input_counter = 0 output_counter = 0 for chr in chr_list: # empty stack for each new chromosome stack = [] last_pos = -1 for read in ibam.fetch(chr): input_counter += 1 if not (input_counter % (5*(10**6)) ): logger.debug('collapsed %i alignments'%input_counter) if read.positions[0] > last_pos: new_alignment_list, collapse_dict = collapse_stack(stack, collapse_dict, max_tags) output_counter += len(new_alignment_list) last_pos = read.positions[0] stack = [read] for new_alignment in new_alignment_list: new_alignment.query_sequence = '*' new_alignment.query_qualities = '0' _ = obam.write(new_alignment) else: stack.append(read) new_alignment_list, collapse_dict = collapse_stack(stack, collapse_dict, max_tags) output_counter += len(new_alignment_list) last_pos = read.positions[0] for new_alignment in new_alignment_list: new_alignment.query_sequence = '*' new_alignment.query_qualities = '0' _ = obam.write(new_alignment) ibam.close() obam.close() #os.rename(obam_fn, ibam_fn) #pysam.sort(obam_fn) pysam.index(obam_fn) logger.info('Input = %s; Output = %s; Redundancy = %.2f'%(input_counter,output_counter, 1-float(output_counter)/input_counter)) return
def initialize(results_path,haplotype_path,cancer_dir_path): try: event_list=['gain','loss'] gaincnv = params.GetGainCNV() losscnv = params.GetLossCNV() logger.debug(' --- Initializing input files --- ') vcf_path = bamhelp.GetVCF() exons_path = bamhelp.GetExons() reference_path = bamhelp.GetRef() vpath, vcf = os.path.split(vcf_path) phasedvcf = "/".join([results_path, sub('.vcf$', '_phased.vcf.gz', vcf)]) vcftobed = "/".join([results_path, sub('.vcf$', '.bed', vcf)]) hap1vcf = "/".join([results_path,"hap1_het.vcf"]) hap2vcf = "/".join([results_path, "hap2_het.vcf"]) hap1vcffiltered = "/".join([results_path, "hap1_het_filtered"]) hap2vcffiltered = "/".join([results_path, "hap2_het_filtered"]) hap1vcffilteredtobed = "/".join([results_path, "hap1_het_filtered.bed"]) hap2vcffilteredtobed = "/".join([results_path, "hap2_het_filtered.bed"]) phased_bed = "/".join([results_path, "PHASED.BED"]) phaseVCF(vcf_path, phasedvcf) getVCFHaplotypes(phasedvcf, hap1vcf, hap2vcf) thinVCF(hap1vcf, hap1vcffiltered) thinVCF(hap2vcf, hap2vcffiltered) convertvcftobed(hap1vcffiltered+".recode.vcf", hap1vcffilteredtobed) convertvcftobed(hap2vcffiltered+".recode.vcf", hap2vcffilteredtobed) cmd1 = """sed -i 's/$/\thap1/' """+ hap1vcffilteredtobed cmd2 = """sed -i 's/$/\thap2/' """+ hap2vcffilteredtobed cmd3 = "cat " + hap1vcffilteredtobed + " " + hap2vcffilteredtobed + " > " + 'tmp.bed' cmd4 = "sort -V -k1,1 -k2,2 tmp.bed > " + phased_bed runCommand(cmd1) runCommand(cmd2) runCommand(cmd3) runCommand(cmd4) os.remove('tmp.bed') for event in event_list: roibed = "/".join([haplotype_path, event + "_roi.bed"]) exonsinroibed = "/".join([haplotype_path, event + "_exons_in_roi.bed"]) nonhetbed = "/".join([haplotype_path, event + "_non_het.bed"]) hetbed = "/".join([haplotype_path, event + "_het.bed"]) hetsnpbed = "/".join([haplotype_path, event + "_het_snp.bed"]) if(locals()[event + 'cnv']): intersectBed( exons_path, locals()[event + 'cnv'], exonsinroibed, wa=True) intersectBed(phased_bed, exonsinroibed, hetsnpbed, wa=True) splitBed(exonsinroibed, event+'_exons_in_roi_') splitBed(hetsnpbed, event+'_het_snp_') except: logger.exception("Initialization error !") raise logger.debug("--- initialization complete ---") return
def initialize_amp(results_path, haplotype_path, cancer_dir_path): try: event_list = ['gain', 'loss'] gaincnv = params.GetGainCNV() losscnv = params.GetLossCNV() logger.debug(' --- Initializing input files --- ') vcf_path = bamhelp.GetVCF() exons_path = bamhelp.GetExons() reference_path = bamhelp.GetRef() vpath, vcf = os.path.split(vcf_path) phasedvcf = "/".join([results_path, sub('.vcf$', '_phased.vcf.gz', vcf)]) vcftobed = "/".join([results_path, sub('.vcf$', '.bed', vcf)]) hap1vcf = "/".join([results_path, "hap1_het.vcf"]) hap2vcf = "/".join([results_path, "hap2_het.vcf"]) hap1vcffiltered = "/".join([results_path, "hap1_het_filtered"]) hap2vcffiltered = "/".join([results_path, "hap2_het_filtered"]) hap1vcffilteredtobed = "/".join([results_path, "hap1_het_filtered.bed"]) hap2vcffilteredtobed = "/".join([results_path, "hap2_het_filtered.bed"]) phased_bed = "/".join([results_path, "PHASED.BED"]) phaseVCF(vcf_path, phasedvcf) getVCFHaplotypes(phasedvcf, hap1vcf, hap2vcf) thinVCF(hap1vcf, hap1vcffiltered) thinVCF(hap2vcf, hap2vcffiltered) convertvcftobed(hap1vcffiltered + ".recode.vcf", hap1vcffilteredtobed) convertvcftobed(hap2vcffiltered + ".recode.vcf", hap2vcffilteredtobed) cmd1 = """sed -i 's/$/\thap1/' """ + hap1vcffilteredtobed cmd2 = """sed -i 's/$/\thap2/' """ + hap2vcffilteredtobed cmd3 = "cat " + hap1vcffilteredtobed + " " + hap2vcffilteredtobed + " > " + 'tmp.bed' cmd4 = "sort -V -k1,1 -k2,2 tmp.bed > " + phased_bed runCommand(cmd1) runCommand(cmd2) runCommand(cmd3) runCommand(cmd4) os.remove('tmp.bed') for event in event_list: roibed = "/".join([haplotype_path, event + "_roi.bed"]) exonsinroibed = "/".join([haplotype_path, event + "_exons_in_roi.bed"]) nonhetbed = "/".join([haplotype_path, event + "_non_het.bed"]) hetbed = "/".join([haplotype_path, event + "_het.bed"]) hetsnpbed = "/".join([haplotype_path, event + "_het_snp.bed"]) if (locals()[event + 'cnv']): intersectBed(exons_path, locals()[event + 'cnv'], exonsinroibed, wa=True) intersectBed(phased_bed, exonsinroibed, hetsnpbed, wa=True) splitBed(exonsinroibed, event + '_exons_in_roi_') splitBed(hetsnpbed, event + '_het_snp_') except: logger.exception("Initialization error !") raise logger.debug("--- initialization complete ---") return