Python pysam 模块，sort() 实例源码

我们从Python开源项目中，提取了以下13个代码示例，用于说明如何使用pysam.sort()。

项目：NGaDNAP 作者：theboocock | 项目源码 | 文件源码

def merge_reads(bams, merged_output_dir):
    """
        Merge bam files, can take odd numbers of reads. 
    """
    try:
        os.mkdir(merged_output_dir)
    except OSError:
        pass

    bam_list = get_bam_pairs(bams)
    for sample_id, bams in bam_list.items():
        header = get_header(bams[0])
        bam_out = os.path.join(merged_output_dir, os.path.basename(sample_id)) + '.bam'
        out = pysam.AlignmentFile(bam_out, 'wb', header=header)
        for bam in bams:
            sam = pysam.AlignmentFile(bam, 'rb')
            for read in sam:
                out.write(read)
        out.close()
        pysam.sort(bam_out, 'tmp')
        os.rename('tmp.bam',bam_out)
        pysam.index(bam_out)

项目：cellranger 作者：10XGenomics | 项目源码 | 文件源码

def sort(file_name, sorted_prefix=None):
    """ Sorts and indexes the bam file given by file_name.
    """
    if sorted_prefix is None:
        sorted_prefix = file_name.replace('.bam', '') + '_sorted'

    sorted_name = sorted_prefix + ".bam"
    subprocess.check_call(['samtools','sort', '-o', sorted_name, file_name])

项目：cellranger 作者：10XGenomics | 项目源码 | 文件源码

def sort_and_index(file_name, sorted_prefix=None):
    """ Sorts and indexes the bam file given by file_name.
    """
    if sorted_prefix is None:
        sorted_prefix = file_name.replace('.bam', '') + '_sorted'

    sorted_name = sorted_prefix + '.bam'
    subprocess.check_call(['samtools','sort', '-o', sorted_name, file_name])
    pysam.index(sorted_name)

项目：cellranger 作者：10XGenomics | 项目源码 | 文件源码

def sort_by_name(file_name, sorted_prefix=None):
    """ Sorts a bam file by the read name, for paired-end
    """
    if sorted_prefix is None:
        sorted_prefix = file_name.replace('.bam', '') + '_namesorted'

    sorted_name = sorted_prefix + '.bam'
    # NOTE -- need to update our internal samtools in order to use pysam.sort
    #pysam.sort('-n', file_name, sorted_prefix)
    subprocess.check_call(['samtools', 'sort', '-n', file_name, sorted_prefix])

    return pysam.Samfile(sorted_name, 'rb')

项目：iCount 作者：tomazc | 项目源码 | 文件源码

def _save_dict(bed, out_fname, val_index=None):
    """Save data from dict to BED file."""
    sites = pybedtools.BedTool(
        _iter_bed_dict(bed, val_index=val_index)
    ).saveas()
    sites1 = sites.sort().saveas(out_fname)
    return sites1

项目：bamgineer 作者：pughlab | 项目源码 | 文件源码

def find_roi_bam(chromosome_event):
    chr,event = chromosome_event .split("_")
    roi,sortbyname,sortbyCoord, hetsnp = init_file_names(chr, event, tmpbams_path, haplotype_path)
    exonsinroibed = "/".join([haplotype_path,   event + "_exons_in_roi_"+ chr +'.bed'])
    success = True
    try:
        if not terminating.is_set():
            roisort = sub('.bam$', '.sorted', roi)
            if(os.path.isfile(exonsinroibed)):

                 cmd=" ".join(["sort -u", exonsinroibed, "-o", exonsinroibed]); runCommand(cmd)
                 extractPairedReadfromROI(sortbyname, exonsinroibed, roi)
                 removeIfEmpty(tmpbams_path,ntpath.basename(roi))
                 pysam.sort(roi ,roisort )
                 pysam.index(roisort+'.bam')
                 os.remove(roi)

    except (KeyboardInterrupt):
        logger.error('Exception Crtl+C pressed in the child process  in find_roi_bam for chr ' +chr + event)
        terminating.set()
        success=False
        return
    except Exception as e:   
        logger.exception("Exception in find_roi_bam %s" ,e )
        terminating.set()
        success=False
        return
    if(success):
        logger.debug("find_roi_bam complete successfully for "+chr + event) 
    return

项目：bamgineer 作者：pughlab | 项目源码 | 文件源码

def removeReadsOverlappingHetRegion(inbamfn, bedfn,outbamfn,path):
    print "___ removing reads overlapping heterozygous region ___"
    inbamsorted =  sub('.bam$','.sorted',inbamfn)
    pysam.sort(inbamfn, inbamsorted)
    pysam.index(inbamsorted+'.bam')

    alignmentfile = pysam.AlignmentFile(inbamsorted+'.bam', "rb" )
    outbam = pysam.Samfile(outbamfn, 'wb', template=alignmentfile )

    bedfile = open(bedfn, 'r')

    for bedline in bedfile:
        c = bedline.strip().split()

        if (len(c) == 3 ):
            chr2 = c[0]
            chr = c[0].strip("chr")
            start = int(c[1])
            end   = int(c[2])
        else :
            continue

        try:
            readmappings = alignmentfile.fetch(chr2, start, end)
        except  ValueError as e:
            print("problem fetching the read ")

        for shortread in readmappings:
            try:
                outbam.write(shortread)
            except ValueError as e:
                print ("problem removing read :" + shortread.qname)
    outbamsorted =  sub('.bam$','.sorted',outbamfn)            
    pysam.sort(outbamfn, outbamsorted)
    bamDiff(inbamsorted+'.bam', outbamsorted +'.bam', path )
    outbam.close()

项目：bamgineer 作者：pughlab | 项目源码 | 文件源码

def find_roi_amp(chromosome_event):
    chr, event = chromosome_event.split("_")
    roi, sortbyname, sortbyCoord, hetsnp = init_file_names(chr, event, tmpbams_path, haplotype_path)
    exonsinroibed = "/".join([haplotype_path, event + "_exons_in_roi_" + chr + '.bed'])
    success = True
    try:
        if not terminating.is_set():
            roisort = sub('.bam$', '.sorted', roi)
            if (os.path.isfile(exonsinroibed)):
                cmd = " ".join(["sort -u", exonsinroibed, "-o", exonsinroibed]);
                runCommand(cmd)
                extractPairedReadfromROI(sortbyname, exonsinroibed, roi)
                removeIfEmpty(tmpbams_path, ntpath.basename(roi))
                pysam.sort(roi, roisort)
                pysam.index(roisort + '.bam')
                os.remove(roi)

    except (KeyboardInterrupt):
        logger.error('Exception Crtl+C pressed in the child process  in find_roi_bam for chr ' + chr + event)
        terminating.set()
        success = False
        return
    except Exception as e:
        logger.exception("Exception in find_roi_bam %s", e)
        terminating.set()
        success = False
        return
    if (success):
        logger.debug("find_roi_bam complete successfully for " + chr + event)
    return

项目：clrsvsim 作者：color | 项目源码 | 文件源码

def _sort_index(unsorted, output_bam):
    pysam.sort("-o", output_bam, "-m", PYSAM_SORT_MEM, unsorted)
    os.unlink(unsorted)
    pysam.index(output_bam)

项目：CLAM 作者：Xinglab | 项目源码 | 文件源码

def realigner(out_dir, tmp_dir, winsize=50, unstranded=False):
    """DOCSTRING
    Args:
    Returns:
    """
    # file handlers
    mbam = pysam.Samfile(os.path.join(tmp_dir, 'multi.sorted.bam'),'rb')
    ubam = pysam.Samfile(os.path.join(tmp_dir, 'unique.sorted.bam'),'rb')
    obam = pysam.Samfile(os.path.join(out_dir, 'realigned.bam'), 'wb', template = mbam)
    chr_list=[x['SN'] for x in ubam.header['SQ']]

    # construct the mread_dict; this will be needed throughout
    mread_dict = defaultdict(list)
    for alignment in mbam:
        mread_dict[alignment.qname].append(alignment)

    # keep a record of processed reads
    processed_mreads = set()

    # iterate through all mreads
    for read_qname in mread_dict:
        if read_qname in processed_mreads:
            continue

        ## construct the fully-connected subgraph for each read
        read_to_locations, processed_mreads = \
            construct_subgraph(mbam, read_qname, mread_dict, processed_mreads, chr_list, winsize=winsize, unstranded=unstranded)
        subgraph = set()
        for read in read_to_locations:
            _ = map(subgraph.add, read_to_locations[read].keys())
        subgraph = list(subgraph)

        ## build the BIT tracks
        node_track, multi_reads_weights = \
            construct_BIT_track(subgraph, read_to_locations, ubam, unstranded)

        ## run EM
        multi_reads_weights = \
            run_EM(node_track, multi_reads_weights, w=winsize)

        ## write to obam
        for read in multi_reads_weights:
            for node in multi_reads_weights[read]:
                alignment = read_to_locations[read][node]
                score = round(multi_reads_weights[read][node][0], 3)
                alignment.set_tag('AS', score)
                alignment.set_tag('PG', 'CLAM')
                obam.write(alignment)
    # sort the final output
    logger.info('sorting output')
    obam.close()
    ubam.close()
    mbam.close()
    obam_sorted_fn = os.path.join(out_dir, 'realigned.sorted.bam')
    pysam.sort('-o', obam_sorted_fn, os.path.join(out_dir, 'realigned.bam'))
    pysam.index(obam_sorted_fn)
    os.remove(os.path.join(out_dir, 'realigned.bam'))
    return

项目：CLAM 作者：Xinglab | 项目源码 | 文件源码

def filter_bam_maxtags(obam_fn, ibam_fn, max_tags=1):
    """DOCSTRING
    Args
    Returns
    """
    assert max_tags>0
    # prepare files
    ibam = pysam.Samfile(ibam_fn, 'rb')
    obam = pysam.Samfile(obam_fn, 'wb', template=ibam)
    # init 
    collapse_dict = defaultdict(list)
    chr_list=[x['SN'] for x in ibam.header['SQ']]
    input_counter = 0
    output_counter = 0

    for chr in chr_list:
        # empty stack for each new chromosome
        stack = []
        last_pos = -1
        for read in ibam.fetch(chr):
            input_counter += 1
            if not (input_counter % (5*(10**6)) ):
                logger.debug('collapsed %i alignments'%input_counter)
            if read.positions[0] > last_pos:
                new_alignment_list, collapse_dict = collapse_stack(stack, collapse_dict, max_tags)
                output_counter += len(new_alignment_list)
                last_pos = read.positions[0]
                stack = [read]
                for new_alignment in new_alignment_list:
                    new_alignment.query_sequence = '*'
                    new_alignment.query_qualities = '0'
                    _ = obam.write(new_alignment)
            else:
                stack.append(read)
        new_alignment_list, collapse_dict = collapse_stack(stack, collapse_dict, max_tags)
        output_counter += len(new_alignment_list)
        last_pos = read.positions[0]
        for new_alignment in new_alignment_list:
            new_alignment.query_sequence = '*'
            new_alignment.query_qualities = '0'
            _ = obam.write(new_alignment)
    ibam.close()
    obam.close()
    #os.rename(obam_fn, ibam_fn)
    #pysam.sort(obam_fn)
    pysam.index(obam_fn)
    logger.info('Input = %s; Output = %s; Redundancy = %.2f'%(input_counter,output_counter, 1-float(output_counter)/input_counter))
    return

项目：bamgineer 作者：pughlab | 项目源码 | 文件源码

def initialize(results_path,haplotype_path,cancer_dir_path):

    try:
        event_list=['gain','loss']
        gaincnv = params.GetGainCNV()
        losscnv = params.GetLossCNV()
        logger.debug(' --- Initializing input files  --- ')
        vcf_path = bamhelp.GetVCF()
        exons_path = bamhelp.GetExons()
        reference_path = bamhelp.GetRef()
        vpath, vcf = os.path.split(vcf_path)
        phasedvcf = "/".join([results_path, sub('.vcf$', '_phased.vcf.gz', vcf)])
        vcftobed =  "/".join([results_path, sub('.vcf$', '.bed', vcf)])

        hap1vcf = "/".join([results_path,"hap1_het.vcf"])
        hap2vcf = "/".join([results_path, "hap2_het.vcf"])
        hap1vcffiltered = "/".join([results_path, "hap1_het_filtered"])
        hap2vcffiltered = "/".join([results_path, "hap2_het_filtered"])
        hap1vcffilteredtobed = "/".join([results_path, "hap1_het_filtered.bed"])
        hap2vcffilteredtobed = "/".join([results_path, "hap2_het_filtered.bed"])
        phased_bed =  "/".join([results_path, "PHASED.BED"])

        phaseVCF(vcf_path, phasedvcf)
        getVCFHaplotypes(phasedvcf, hap1vcf, hap2vcf)
        thinVCF(hap1vcf, hap1vcffiltered)
        thinVCF(hap2vcf, hap2vcffiltered)
        convertvcftobed(hap1vcffiltered+".recode.vcf", hap1vcffilteredtobed)
        convertvcftobed(hap2vcffiltered+".recode.vcf", hap2vcffilteredtobed)

        cmd1 = """sed -i 's/$/\thap1/' """+ hap1vcffilteredtobed
        cmd2 = """sed -i 's/$/\thap2/' """+ hap2vcffilteredtobed
        cmd3 = "cat " + hap1vcffilteredtobed + " " + hap2vcffilteredtobed + " > " + 'tmp.bed'
        cmd4 = "sort -V -k1,1 -k2,2 tmp.bed > " + phased_bed  

        runCommand(cmd1)
        runCommand(cmd2)
        runCommand(cmd3)
        runCommand(cmd4)
        os.remove('tmp.bed')  

        for  event in event_list: 
            roibed = "/".join([haplotype_path,  event + "_roi.bed"])
            exonsinroibed = "/".join([haplotype_path,   event + "_exons_in_roi.bed"])
            nonhetbed = "/".join([haplotype_path, event + "_non_het.bed"])
            hetbed = "/".join([haplotype_path, event + "_het.bed"])
            hetsnpbed = "/".join([haplotype_path,  event + "_het_snp.bed"])

            if(locals()[event + 'cnv']):
                intersectBed( exons_path, locals()[event + 'cnv'], exonsinroibed, wa=True)
                intersectBed(phased_bed, exonsinroibed, hetsnpbed, wa=True)
                splitBed(exonsinroibed, event+'_exons_in_roi_')
                splitBed(hetsnpbed, event+'_het_snp_')

    except:  
        logger.exception("Initialization error !")
        raise
    logger.debug("--- initialization complete ---")    
    return

项目：bamgineer 作者：pughlab | 项目源码 | 文件源码

def initialize_amp(results_path, haplotype_path, cancer_dir_path):
    try:
        event_list = ['gain', 'loss']
        gaincnv = params.GetGainCNV()
        losscnv = params.GetLossCNV()
        logger.debug(' --- Initializing input files  --- ')
        vcf_path = bamhelp.GetVCF()
        exons_path = bamhelp.GetExons()
        reference_path = bamhelp.GetRef()
        vpath, vcf = os.path.split(vcf_path)
        phasedvcf = "/".join([results_path, sub('.vcf$', '_phased.vcf.gz', vcf)])
        vcftobed = "/".join([results_path, sub('.vcf$', '.bed', vcf)])

        hap1vcf = "/".join([results_path, "hap1_het.vcf"])
        hap2vcf = "/".join([results_path, "hap2_het.vcf"])
        hap1vcffiltered = "/".join([results_path, "hap1_het_filtered"])
        hap2vcffiltered = "/".join([results_path, "hap2_het_filtered"])
        hap1vcffilteredtobed = "/".join([results_path, "hap1_het_filtered.bed"])
        hap2vcffilteredtobed = "/".join([results_path, "hap2_het_filtered.bed"])
        phased_bed = "/".join([results_path, "PHASED.BED"])

        phaseVCF(vcf_path, phasedvcf)
        getVCFHaplotypes(phasedvcf, hap1vcf, hap2vcf)
        thinVCF(hap1vcf, hap1vcffiltered)
        thinVCF(hap2vcf, hap2vcffiltered)
        convertvcftobed(hap1vcffiltered + ".recode.vcf", hap1vcffilteredtobed)
        convertvcftobed(hap2vcffiltered + ".recode.vcf", hap2vcffilteredtobed)

        cmd1 = """sed -i 's/$/\thap1/' """ + hap1vcffilteredtobed
        cmd2 = """sed -i 's/$/\thap2/' """ + hap2vcffilteredtobed
        cmd3 = "cat " + hap1vcffilteredtobed + " " + hap2vcffilteredtobed + " > " + 'tmp.bed'
        cmd4 = "sort -V -k1,1 -k2,2 tmp.bed > " + phased_bed

        runCommand(cmd1)
        runCommand(cmd2)
        runCommand(cmd3)
        runCommand(cmd4)
        os.remove('tmp.bed')

        for event in event_list:
            roibed = "/".join([haplotype_path, event + "_roi.bed"])
            exonsinroibed = "/".join([haplotype_path, event + "_exons_in_roi.bed"])
            nonhetbed = "/".join([haplotype_path, event + "_non_het.bed"])
            hetbed = "/".join([haplotype_path, event + "_het.bed"])
            hetsnpbed = "/".join([haplotype_path, event + "_het_snp.bed"])

            if (locals()[event + 'cnv']):
                intersectBed(exons_path, locals()[event + 'cnv'], exonsinroibed, wa=True)
                intersectBed(phased_bed, exonsinroibed, hetsnpbed, wa=True)
                splitBed(exonsinroibed, event + '_exons_in_roi_')
                splitBed(hetsnpbed, event + '_het_snp_')

    except:
        logger.exception("Initialization error !")
        raise
    logger.debug("--- initialization complete ---")
    return