The following 12 code examples, extracted from open-source Python projects, illustrate how to use pysam.Tabixfile().
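pysam.Tabixfile is the older spelling of pysam.TabixFile, a reader for bgzip-compressed, tabix-indexed text files. As a minimal sketch of the basic workflow (the file name "regions.bed" is hypothetical), indexing and querying typically look like this:

import pysam

# Compress (bgzip) and index a coordinate-sorted BED-like file;
# tabix_index returns the name of the compressed file ("regions.bed.gz").
fname = pysam.tabix_index('regions.bed', preset='bed', force=True)

tbx = pysam.Tabixfile(fname)
# fetch() takes 0-based, half-open coordinates and yields raw text lines
for line in tbx.fetch('chr1', 999, 2000):
    print(line.split('\t'))
tbx.close()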
import pandas as pd
import pysam

def main(opts):
    # read in INDEL mutations
    indels = pd.read_csv(opts['input'], sep='\t')

    # pysam tabix uses 1-based coordinates
    pysam.tabix_index(opts['blacklist'], force=True,
                      seq_col=0, start_col=1, end_col=2)

    # query black list to find INDELs with no hits
    non_coding_ixs, coding_ixs = [], []
    black_list = pysam.Tabixfile(opts['blacklist'])
    for i, row in indels.iterrows():
        result = black_list.fetch(reference=row['Chromosome'],
                                  start=row['Start_Position'],
                                  end=row['End_Position'])
        if not list(result):
            non_coding_ixs.append(i)
        else:
            coding_ixs.append(i)
    black_list.close()

    # save non-coding indels (.loc replaces the long-removed DataFrame.ix)
    indels.loc[non_coding_ixs, :].to_csv(opts['output'], sep='\t', index=False)
    indels.loc[coding_ixs, :].to_csv(opts['blacklist_output'], sep='\t', index=False)
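Note the two coordinate conventions in play above: tabix_index's seq_col/start_col/end_col describe the columns of the file being indexed (treated as 1-based by default; pysam.tabix_index also accepts a zerobased flag), while Tabixfile.fetch() takes 0-based, half-open start/end. A small sketch, with a hypothetical file name:

import pysam

# Index a 1-based, tab-separated file: chrom in column 0, start in 1, end in 2.
# An uncompressed input is bgzipped to 'blacklist.txt.gz' as a side effect.
pysam.tabix_index('blacklist.txt', force=True, seq_col=0, start_col=1, end_col=2)

# fetch() start/end are 0-based half-open, so 1-based position 100 is queried as:
tbx = pysam.Tabixfile('blacklist.txt.gz')
hits = list(tbx.fetch('chr1', 99, 100))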
import pysam

def create_tabix_infile(file_name):
    return pysam.Tabixfile(file_name)
def __init__(self, vcffile=None):
    self.vcffile = vcffile
    self.filename = os.path.splitext(os.path.basename(str(vcffile)))[0]
    self.dbnfsp_reader = pysam.Tabixfile(settings.dbnsfp, encoding='utf-8')
    self.header = self.dbnfsp_reader.header.__next__().decode('utf8').strip().split('\t')
    # create folder dbnfsp if it doesn't exist
    if not os.path.exists('dbnfsp'):
        os.makedirs('dbnfsp')
def __prep_tabix(self, path):
    # cache one Tabixfile reader per path
    if path not in self.readers:
        self.readers[path] = Tabixfile(path)
    return self.readers[path]
def __init__(self, options, genelist, transcriptlist):
    self.options = options
    # Opening tabix file representing the Ensembl database
    self.tabixfile = pysam.Tabixfile(options.args['ensembl'])
    self.proteinSeqs = dict()
    self.exonSeqs = dict()
    self.genelist = genelist
    self.transcriptlist = transcriptlist

# Find transcripts overlapping with a variant
def __init__(self, options):
    # Opening tabix file representing the dbSNP database
    self.tabixfile = pysam.Tabixfile(options.args['dbsnp'])

# Annotating a variant based on dbSNP data
def __init__(self, threadidx, options, config, startline, endline, names):
    multiprocessing.Process.__init__(self)

    # Initializing variables
    self.threadidx = threadidx
    self.options = options
    self.config = config
    self.startline = startline
    self.endline = endline
    self.names = names

    # Initializing reads directory
    self.reads = dict()

    # Connecting to BAM file
    self.samfile = pysam.Samfile(options.input, "rb")

    # Connecting to transcript database file
    if config['transcript_db'] is not None:
        self.enstdb = pysam.Tabixfile(config['transcript_db'])
    else:
        self.enstdb = None

    # Initializing output files
    if int(options.threads) > 1:
        if config['outputs']['regions']:
            self.out_targets = open(options.output + '_regions_tmp_' + str(threadidx) + '.txt', 'w')
        if config['outputs']['profiles']:
            self.out_profiles = open(options.output + '_profiles_tmp_' + str(threadidx) + '.txt', 'w')
            if config['transcript_db'] is not None:
                self.out_poor = open(options.output + '_poor_tmp_' + str(threadidx) + '.txt', 'w')
    else:
        if config['outputs']['regions']:
            self.out_targets = open(options.output + '_regions.txt', 'w')
        if config['outputs']['profiles']:
            self.out_profiles = open(options.output + '_profiles.txt', 'w')
            if config['transcript_db'] is not None:
                self.out_poor = open(options.output + '_poor.txt', 'w')

# Checking if target is fail or pass
def __init__(self, options):
    self.options = options
    # Opening tabix file representing the Ensembl database
    self.tabixfile = pysam.Tabixfile(options.args['ensembl'])
    self.proteinSeqs = dict()
    self.exonSeqs = dict()

# Finding transcripts that overlap with a variant
def __init__(self, options):
    # Opening tabix file representing the dbSNP database
    self.tabixfile = pysam.Tabixfile(options.args['dbsnp'])

# Annotating a variant based on dbSNP data
def __init__(self, vcffile=None):
    self.vcffile = vcffile
    self.filename = os.path.splitext(os.path.basename(str(vcffile)))[0]

    # create folder merge if it doesn't exist
    if not os.path.exists('merge'):
        os.makedirs('merge')
    # enter inside folder
    os.chdir('merge')

    self.annotation_files = OrderedDict()

    pysam.tabix_index('../snpeff/snpeff.output.vcf', preset='vcf')
    self.annotation_files['snpeff'] = {
        'info': 'EFF',
        'file': pysam.Tabixfile('../snpeff/snpeff.output.vcf.gz', 'r', encoding="utf-8")
    }

    pysam.tabix_index('../vep/vep.output.sorted.vcf', preset='vcf')
    self.annotation_files['vep'] = {
        'info': 'CSQ',
        'file': pysam.Tabixfile('../vep/vep.output.sorted.vcf.gz', 'r', encoding="utf-8")
    }

    pysam.tabix_index('../snpsift/snpsift.final.vcf', preset='vcf')
    self.annotation_files['vartype'] = {
        'info': 'VARTYPE,SNP,MNP,INS,DEL,MIXED,HOM,HET',
        'file': pysam.Tabixfile('../snpsift/snpsift.final.vcf.gz', 'r', encoding="utf-8")
    }

    pysam.tabix_index('../decipher/hi_predictions.vcf', preset='vcf')
    self.annotation_files['decipher'] = {
        'info': 'HI_PREDICTIONS',
        'file': pysam.Tabixfile('../decipher/hi_predictions.vcf.gz', 'r', encoding="utf-8")
    }

    pysam.tabix_index('../pynnotator/pynnotator.vcf', preset='vcf')
    # genomes1k dbsnp clinvar esp6500 ensembl_phen ensembl_clin
    self.pynnotator_tags = ['genomes1k', 'dbsnp', 'clinvar', 'esp6500', 'ensembl_phen', 'ensembl_clin']
    self.annotation_files['pynnotator'] = {
        'info': 'ALL',
        'file': pysam.Tabixfile('../pynnotator/pynnotator.vcf.gz', 'r', encoding="utf-8")
    }

    pysam.tabix_index('../func_pred/func_pred_sorted.vcf', preset='vcf')
    self.annotation_files['dbnfsp'] = {
        'info': 'dbNSFP_SIFT_score,dbNSFP_SIFT_converted_rankscore,dbNSFP_SIFT_pred,dbNSFP_Uniprot_acc_Polyphen2,dbNSFP_Uniprot_id_Polyphen2,dbNSFP_Uniprot_aapos_Polyphen2,dbNSFP_Polyphen2_HDIV_score,dbNSFP_Polyphen2_HDIV_rankscore,dbNSFP_Polyphen2_HDIV_pred,dbNSFP_Polyphen2_HVAR_score,dbNSFP_Polyphen2_HVAR_rankscore,dbNSFP_Polyphen2_HVAR_pred,dbNSFP_LRT_score,dbNSFP_LRT_converted_rankscore,dbNSFP_LRT_pred,dbNSFP_LRT_Omega,dbNSFP_MutationTaster_score,dbNSFP_MutationTaster_converted_rankscore,dbNSFP_MutationTaster_pred,dbNSFP_MutationTaster_model,dbNSFP_MutationTaster_AAE,dbNSFP_MutationAssessor_UniprotID,dbNSFP_MutationAssessor_variant,dbNSFP_MutationAssessor_score,dbNSFP_MutationAssessor_rankscore,dbNSFP_MutationAssessor_pred,dbNSFP_FATHMM_score,dbNSFP_FATHMM_converted_rankscore,dbNSFP_FATHMM_pred,dbNSFP_PROVEAN_score,dbNSFP_PROVEAN_converted_rankscore,dbNSFP_PROVEAN_pred,dbNSFP_Transcript_id_VEST3,dbNSFP_Transcript_var_VEST3,dbNSFP_VEST3_score,dbNSFP_VEST3_rankscore,dbNSFP_MetaSVM_score,dbNSFP_MetaSVM_rankscore,dbNSFP_MetaSVM_pred,dbNSFP_MetaLR_score,dbNSFP_MetaLR_rankscore,dbNSFP_MetaLR_pred,dbNSFP_Reliability_index,dbNSFP_M-CAP_score,dbNSFP_M-CAP_rankscore,dbNSFP_M-CAP_pred,dbNSFP_REVEL_score,dbNSFP_REVEL_rankscore,dbNSFP_MutPred_score,dbNSFP_MutPred_rankscore,dbNSFP_MutPred_protID,dbNSFP_MutPred_AAchange,dbNSFP_MutPred_Top5features,dbNSFP_CADD_raw,dbNSFP_CADD_raw_rankscore,dbNSFP_CADD_phred,dbNSFP_DANN_score,dbNSFP_DANN_rankscore,dbNSFP_fathmm-MKL_coding_score,dbNSFP_fathmm-MKL_coding_rankscore,dbNSFP_fathmm-MKL_coding_pred,dbNSFP_fathmm-MKL_coding_group,dbNSFP_Eigen_coding_or_noncoding,dbNSFP_Eigen-raw,dbNSFP_Eigen-phred,dbNSFP_Eigen-PC-raw,dbNSFP_Eigen-PC-phred,dbNSFP_Eigen-PC-raw_rankscore,dbNSFP_GenoCanyon_score,dbNSFP_GenoCanyon_score_rankscore,dbNSFP_integrated_fitCons_score,dbNSFP_integrated_fitCons_rankscore,dbNSFP_integrated_confidence_value,dbNSFP_GM12878_fitCons_score,dbNSFP_GM12878_fitCons_rankscore,dbNSFP_GM12878_confidence_value,dbNSFP_H1-hESC_fitCons_score,dbNSFP_H1-hESC_fitCons_rankscore,dbNSFP_H1-hESC_confidence_value,dbNSFP_HUVEC_fitCons_score,dbNSFP_HUVEC_fitCons_rankscore,dbNSFP_clinvar_rs,dbNSFP_clinvar_clnsig,dbNSFP_clinvar_trait,dbNSFP_clinvar_golden_stars',
        'file': pysam.Tabixfile('../func_pred/func_pred_sorted.vcf.gz', 'r', encoding="utf-8")
    }

    self.dbsnp = pysam.Tabixfile(settings.dbsnp, 'r', encoding="utf-8")
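Not part of the project: a sketch of how a map like annotation_files above might be consumed, assuming each registered 'info' value lists the INFO keys contributed by that source (the function name is hypothetical):

def fetch_info(annotation, chrom, pos):
    # pysam fetch coordinates are 0-based half-open; VCF POS is 1-based
    for rec in annotation['file'].fetch(chrom, pos - 1, pos):
        info = rec.split('\t')[7]
        # keep only the INFO entries this annotation source is registered for
        wanted = set(annotation['info'].split(','))
        for entry in info.split(';'):
            if entry.split('=')[0] in wanted:
                yield entry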
def annotate(self, out_prefix):
    # print 'Hello'
    # print self.dbnfsp_reader
    # header is at:
    # 24 SIFT_score: SIFT score (SIFTori).
    # 105 HUVEC_confidence_value: 0 - highly significant scores (approx. p<.003); 1 - significant scores
    # 188 clinvar_rs: rs number from the clinvar data set
    # 191 clinvar_golden_stars: ClinVar Review Status summary.
    # print 'input', vcffile, out_prefix, dbnsfp
    dbnfsp_reader = pysam.Tabixfile(settings.dbnsfp, 'r', encoding='utf-8')

    # keep only the last (tab-delimited) header line
    for item in dbnfsp_reader.header:
        header = item.strip().split('\t')
    # header = dbnfsp_reader.header.next().strip().split('\t')

    vcffile = 'dbnfsp/part.%s.vcf' % (out_prefix)
    vcf_reader = open('%s' % (vcffile))
    vcf_writer = open('dbnfsp/dbnfsp.%s.vcf' % (out_prefix), 'w', encoding="utf-8")

    for line in vcf_reader:
        if line.startswith('#'):
            if line.startswith('#CHROM'):
                # dbnfsp_header (the dbNSFP INFO header lines) is defined elsewhere in the class
                vcf_writer.writelines(dbnfsp_header)
            vcf_writer.writelines(line)
        else:
            variant = line.split('\t')
            variant[0] = variant[0].replace('chr', '')
            index = '%s-%s' % (variant[0], variant[1])
            # print index
            try:
                records = dbnfsp_reader.fetch(variant[0], int(variant[1]) - 1, int(variant[1]))
            except ValueError:
                # contig not present in the index
                records = []
            for record in records:
                ann = record.strip().split('\t')
                ispresent = False
                if variant[3] == ann[2]:
                    alts = variant[4].split(',')
                    alts_ann = ann[3].split(',')
                    # compare ALT
                    for alt in alts:
                        if alt in alts_ann:
                            ispresent = True
                if ispresent:
                    new_ann = []
                    for k, item in enumerate(header):
                        idx = k
                        if ann[idx] != '.':
                            new_ann.append('dbNSFP_%s=%s' % (item, ann[idx].replace(';', '|')))
                    variant[7] = '%s;%s' % (variant[7], ";".join(new_ann))
            vcf_writer.writelines("\t".join(variant))
def tabix_file(self, record, reference_tabix, columns=None, fallback='unknown'):
    """
    Retrieve columns from a tabix file (i.e. a gzipped and tabix-indexed tsv file)
    :param record: query VCF record
    :param reference_tabix: instance of Tabixfile
    :param columns: names of columns to retrieve
    :param fallback: fallback value
    :return: dict of {column: value}
    """
    default = {x: fallback for x in columns}
    if record.CHROM.startswith('chr') and not any(
            [x.startswith('chr') for x in reference_tabix.contigs]
    ):
        chrom = record.CHROM.replace('chr', '')
    else:
        chrom = record.CHROM

    # adjust range (pysam's ranges are 0-based, half-open)
    start, end = record.POS - 1, record.POS

    # we ONLY consider the LAST line of any header field. This MUST be tab-delimited
    if not hasattr(self, "__tabix_header_{0}".format(reference_tabix)):
        setattr(self, "__tabix_header_{0}".format(reference_tabix),
                [x for x in reference_tabix.header])
    tabix_header = getattr(
        self, "__tabix_header_{0}".format(reference_tabix)
    )[-1].strip().split("\t")

    try:
        ref_iter = reference_tabix.fetch(chrom, start=start, end=end)
    except ValueError:
        return default

    ref_piter = PeekableIterator(ref_iter)
    if ref_piter.peek() is None:
        return default

    for ref_rec in ref_piter:
        row = ref_rec.split('\t')
        assert len(row) == len(tabix_header), \
            "{0} {1}".format(len(row), len(tabix_header))
        # check POS, REF, ALT ~ only fetch value if all three are the same
        # with their record counterpart
        if str(record.POS) == row[1] and record.REF == row[2] and \
                ','.join([str(x) for x in record.ALT]) == row[3]:
            for colname in columns:
                default[colname] = collapse_values(row[tabix_header.index(colname)])
        else:
            if ref_piter.peek() is None:
                return default
    return default
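This example assumes two helpers not shown in the snippet, PeekableIterator and collapse_values. A minimal sketch of the former, under the assumption that peek() returns the next item without consuming it and None once the underlying iterator is exhausted (which matches how it is used above, since tabix records are never None):

class PeekableIterator:
    def __init__(self, it):
        self._it = iter(it)
        self._cache = None

    def peek(self):
        # fill the one-item lookahead cache; None signals exhaustion
        if self._cache is None:
            self._cache = next(self._it, None)
        return self._cache

    def __iter__(self):
        return self

    def __next__(self):
        # serve the cached item first, then fall through to the iterator
        if self._cache is not None:
            item, self._cache = self._cache, None
            return item
        item = next(self._it, None)
        if item is None:
            raise StopIteration
        return item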