The following 50 code examples, extracted from open-source Python projects, illustrate how to use itertools.izip().
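Before the project examples, here is a minimal sketch (not taken from any of the projects below) of what itertools.izip() does: in Python 2 it pairs up elements from several iterables lazily, like the built-in zip() but without building an intermediate list; it stops at the shortest input, and in Python 3 it was removed because zip() itself became lazy.

import itertools

names = ['a', 'b', 'c']
values = [1, 2, 3]

# izip yields tuples one at a time instead of materializing a list like zip() does in Python 2
for name, value in itertools.izip(names, values):
    print '%s=%d' % (name, value)   # a=1, b=2, c=3

# izip stops at the shortest iterable; izip_longest pads the shorter one instead
print list(itertools.izip([1, 2, 3], ['x', 'y']))                         # [(1, 'x'), (2, 'y')]
print list(itertools.izip_longest([1, 2, 3], ['x', 'y'], fillvalue=None))  # [(1, 'x'), (2, 'y'), (3, None)]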
def visualize(self, vis, colored=True):
    try:
        tids = set(self.ids)
    except:
        return vis

    for hid, hbox in izip(self.ids, self.bboxes):
        cv2.rectangle(vis, (hbox[0], hbox[1]), (hbox[2], hbox[3]), (0,255,0), 1)

    vis = super(BoundingBoxKLT, self).viz(vis, colored=colored)

    # for tid, pts in self.tm_.tracks.iteritems():
    #     if tid not in tids: continue
    #     cv2.polylines(vis, [np.vstack(pts.items).astype(np.int32)[-4:]], False,
    #                   (0,255,0), thickness=1)
    #     tl, br = np.int32(pts.latest_item)-2, np.int32(pts.latest_item)+2
    #     cv2.rectangle(vis, (tl[0], tl[1]), (br[0], br[1]), (0,255,0), -1)

    # OpenCVKLT.draw_tracks(self, vis, colored=colored, max_track_length=10)
    return vis
def pipe_weighted_edgelist_to_convert(matrix, bin_filename, weight_filename):
    """ Pipe a weighted edgelist (COO sparse matrix) to Louvain's convert utility """
    raise ValueError('Unsupported method at the moment')

    devnull = open(os.devnull, 'w')

    proc = subprocess.Popen([LOUVAIN_CONVERT_BINPATH,
                             '-i', '/dev/stdin',
                             '-o', bin_filename,
                             '-w', weight_filename,
                             ],
                            stdin=subprocess.PIPE,
                            stdout=devnull,
                            stderr=devnull)

    # Stream text triplets to 'convert'
    for ijx in itertools.izip(matrix.row, matrix.col, matrix.data):
        proc.stdin.write('%d\t%d\t%f\n' % ijx)

    proc.stdin.close()
    proc.wait()
    devnull.close()
def pipe_unweighted_edgelist_to_convert(matrix, bin_filename):
    """ Pipe an unweighted edgelist (COO sparse matrix) to Louvain's convert utility """
    devnull = open(os.devnull, 'w')

    proc = subprocess.Popen([LOUVAIN_CONVERT_BINPATH,
                             '-i', '/dev/stdin',
                             '-o', bin_filename,
                             ],
                            stdin=subprocess.PIPE,
                            stdout=devnull,
                            stderr=devnull)

    # Stream text triplets to 'convert'
    for ij in itertools.izip(matrix.row, matrix.col):
        proc.stdin.write('%d\t%d\n' % ij)

    proc.stdin.close()
    proc.wait()
    devnull.close()
def numpy_groupby(values, keys):
    """ Group a collection of numpy arrays by key arrays.
        Yields (key_tuple, view_tuple) where key_tuple is the key grouped on and
        view_tuple is a tuple of views into the value arrays.
          values: tuple of arrays to group
          keys: tuple of sorted, numeric arrays to group by """

    if len(values) == 0:
        return
    if len(values[0]) == 0:
        return

    for key_array in keys:
        assert len(key_array) == len(keys[0])
    for value_array in values:
        assert len(value_array) == len(keys[0])

    # The indices where any of the keys differ from the previous key become group boundaries
    key_change_indices = np.logical_or.reduce(tuple(np.concatenate(([1], np.diff(key))) != 0 for key in keys))
    group_starts = np.flatnonzero(key_change_indices)
    group_ends = np.roll(group_starts, -1)
    group_ends[-1] = len(keys[0])

    for group_start, group_end in itertools.izip(group_starts, group_ends):
        yield tuple(key[group_start] for key in keys), tuple(value[group_start:group_end] for value in values)
def load(group):
    gene_ids = list(getattr(group, cr_constants.H5_GENE_IDS_ATTR).read())

    if hasattr(group, cr_constants.H5_GENE_NAMES_ATTR):
        gene_names = list(getattr(group, cr_constants.H5_GENE_NAMES_ATTR).read())
    else:
        gene_names = gene_ids

    assert len(gene_ids) == len(gene_names)
    genes = [cr_constants.Gene(id, name, None, None, None)
             for id, name in itertools.izip(gene_ids, gene_names)]

    bcs = list(getattr(group, cr_constants.H5_BCS_ATTR).read())
    matrix = GeneBCMatrix(genes, bcs)

    shape = getattr(group, cr_constants.H5_MATRIX_SHAPE_ATTR).read()
    data = getattr(group, cr_constants.H5_MATRIX_DATA_ATTR).read()
    indices = getattr(group, cr_constants.H5_MATRIX_INDICES_ATTR).read()
    indptr = getattr(group, cr_constants.H5_MATRIX_INDPTR_ATTR).read()

    # quick check to make sure indptr increases monotonically (to catch overflow bugs)
    assert np.all(np.diff(indptr) >= 0)

    matrix.m = sp_sparse.csc_matrix((data, indices, indptr), shape=shape)

    return matrix
def vdj_filter_barcodes_cb(self, cell_barcodes, barcodes, counts,
                           total_read_pairs, assemblable_read_pairs, recovered_cells):
    self._get_metric_attr('vdj_filtered_bcs').set_value(len(cell_barcodes))

    cell_barcodes = set(cell_barcodes)
    cell_read_pairs = 0
    barcoded_read_pairs = 0
    for bc, count in itertools.izip(barcodes, counts):
        if bc in cell_barcodes:
            cell_read_pairs += count
        barcoded_read_pairs += count

    self._get_metric_attr('vdj_filtered_bcs_cum_frac').set_value(cell_read_pairs, barcoded_read_pairs)
    self._get_metric_attr('vdj_total_raw_read_pairs_per_filtered_bc').set_value(total_read_pairs, len(cell_barcodes))
    self._get_metric_attr('vdj_assemblable_read_pairs_per_filtered_bc').set_value(assemblable_read_pairs, len(cell_barcodes))
    self._get_metric_attr('vdj_sequencing_efficiency').set_value(assemblable_read_pairs, total_read_pairs)
    self._get_metric_attr('vdj_filtered_bcs_relative_difference_from_recovered_cells').set_value(len(cell_barcodes) - recovered_cells, recovered_cells)
def split(args):
    assert len(args.read1s) == len(args.read2s)

    chunks = []

    # Determine the number of buckets required to achieve
    # the given chunk size.
    chunks_per_gem_group = {}
    with open(args.reads_summary) as f:
        reads_summary = json.load(f)
        for gg in args.gem_groups:
            readpairs = reads_summary['%d_total_reads_per_gem_group' % gg]
            chunks_per_gem_group[str(gg)] = max(2,
                                                int(math.ceil(float(readpairs) /
                                                              args.readpairs_per_chunk)))

    for fastq1, fastq2 in itertools.izip(args.read1s, args.read2s):
        chunks.append({
            'read1s_chunk': fastq1,
            'read2s_chunk': fastq2,
            'chunks_per_gem_group': chunks_per_gem_group,
        })

    return {'chunks': chunks}
def iteritems(self, every_k_frames=1):
    for rgb_im, depth_im, mask_im, loc in \
            izip(self.rgb.iteritems(every_k_frames=every_k_frames),
                 self.depth.iteritems(every_k_frames=every_k_frames),
                 self.mask.iteritems(every_k_frames=every_k_frames),
                 self.locations[::every_k_frames]):

        rgb = np.zeros(shape=UWRGBDObjectDataset.default_rgb_shape, dtype=np.uint8)
        depth = np.zeros(shape=UWRGBDObjectDataset.default_depth_shape, dtype=np.uint16)
        mask = np.zeros(shape=UWRGBDObjectDataset.default_depth_shape, dtype=np.uint8)

        rgb[loc[1]:loc[1]+rgb_im.shape[0], loc[0]:loc[0]+rgb_im.shape[1]] = rgb_im
        depth[loc[1]:loc[1]+depth_im.shape[0], loc[0]:loc[0]+depth_im.shape[1]] = depth_im
        mask[loc[1]:loc[1]+mask_im.shape[0], loc[0]:loc[0]+mask_im.shape[1]] = mask_im

        # Only a single bbox per image
        yield AttrDict(img=rgb, depth=depth, mask=mask,
                       bbox=[AttrDict(
                           coords=np.float32([loc[0], loc[1],
                                              loc[0]+mask_im.shape[1],
                                              loc[1]+mask_im.shape[0]]),
                           target=self.target,
                           category=UWRGBDDataset.get_category_name(self.target),
                           instance=self.instance)])
def iteritems(self, every_k_frames=1):
    index = 0  # , bbox, pose
    bbox, pose = None, None
    for rgb_im, depth_im, instance, label in izip(islice(self._ims, 0, None, every_k_frames),
                                                  islice(self._depths, 0, None, every_k_frames),
                                                  islice(self._instances, 0, None, every_k_frames),
                                                  islice(self._labels, 0, None, every_k_frames)):
        index += every_k_frames
        yield self._process_items(index, rgb_im, depth_im, instance, label, bbox, pose)

# def iterinds(self, inds):
#     for index, rgb_im, depth_im, bbox, pose in izip(inds,
#                                                     self.rgb.iterinds(inds),
#                                                     self.depth.iterinds(inds),
#                                                     [self.bboxes[ind] for ind in inds],
#                                                     [self.poses[ind] for ind in inds]):
#         yield self._process_items(index, rgb_im, depth_im, bbox, pose)
def iter_keys_values(self, keys, inds=None, verbose=False):
    for key in keys:
        if key not in self.keys_:
            raise RuntimeError('Key %s not found in dataset. keys: %s' % (key, self.keys_))

    idx, ii = 0, 0
    total_chunks = len(self.meta_file_.chunks)
    inds = np.sort(inds) if inds is not None else None

    for chunk_idx, chunk in enumerate(progressbar(self.meta_file_.chunks, size=total_chunks, verbose=verbose)):
        data = AttrDict.load(self.get_chunk_filename(chunk_idx))

        # if inds is None:
        items = (data[key] for key in keys)
        for item in izip(*items):
            yield item
        # else:
        #     for i, item in enumerate(data[key]):
        #         if inds[ii] == idx + i:
        #             yield item
        #             ii += 1
        #             if ii >= len(inds): break
        #     idx += len(data[key])
def _records_protocol_v1(self, ifile):
    reader = csv.reader(ifile, dialect=CsvDialect)

    try:
        fieldnames = reader.next()
    except StopIteration:
        return

    mv_fieldnames = {name: name[len('__mv_'):] for name in fieldnames if name.startswith('__mv_')}

    if len(mv_fieldnames) == 0:
        for values in reader:
            yield OrderedDict(izip(fieldnames, values))
        return

    for values in reader:
        record = OrderedDict()
        for fieldname, value in izip(fieldnames, values):
            if fieldname.startswith('__mv_'):
                if len(value) > 0:
                    record[mv_fieldnames[fieldname]] = self._decode_list(value)
            elif fieldname not in record:
                record[fieldname] = value
        yield record
def gen_values(self, n, reversed=False, shuffled=False, gen_dupes=False):
    if reversed:
        keys = xrange(n-1, -1, -1)
    else:
        keys = xrange(n)

    if shuffled:
        keys = list(keys)
        r = random.Random(1234827)
        r.shuffle(keys)

    if gen_dupes:
        return itertools.chain(
            itertools.izip(keys, xrange(0, 2*n, 2)),
            itertools.islice(itertools.izip(keys, xrange(0, 2*n, 2)), 10, None),
        )
    else:
        return itertools.izip(keys, xrange(0, 2*n, 2))
def constant_time_compare(val1, val2):
    """Returns True if the two strings are equal, False otherwise.

    The time taken is independent of the number of characters that match.  Do
    not use this function for anything else than comparison with known
    length targets.

    This should be implemented in C in order to get it completely right.
    """
    if _builtin_constant_time_compare is not None:
        return _builtin_constant_time_compare(val1, val2)
    len_eq = len(val1) == len(val2)
    if len_eq:
        result = 0
        left = val1
    else:
        result = 1
        left = val2
    for x, y in izip(bytearray(left), bytearray(val2)):
        result |= x ^ y
    return result == 0
def adadelta(parameters, gradients, rho=0.95, eps=1e-6):
    """
    Reference: ADADELTA: An Adaptive Learning Rate Method,
               Zeiler 2012. https://arxiv.org/abs/1212.5701
    Adapted from the Adadelta implementation from Tensorflow.
    """
    accum = [theano.shared(numpy.zeros(p.get_value().shape, floatX)) for p in parameters]
    accum_updates = [theano.shared(numpy.zeros(p.get_value().shape, floatX)) for p in parameters]

    new_accum = [rho * g0 + (1.0 - rho) * (g**2) for g0, g in izip(accum, gradients)]
    updates = [tensor.sqrt(d0 + eps) / tensor.sqrt(g0 + eps) * g
               for d0, g0, g in izip(accum_updates, new_accum, gradients)]
    new_accum_updates = [rho * d0 + (1.0 - rho) * (d**2) for d0, d in izip(accum_updates, updates)]

    accum_ = zip(accum, new_accum)
    accum_updates_ = zip(accum_updates, new_accum_updates)
    parameters_ = [(p, (p - d)) for p, d in izip(parameters, updates)]

    return accum_ + accum_updates_ + parameters_
def safeStringComparison(s1, s2):
    """
    Performs a string comparison in constant time.
    This should prevent side-channel (timing) attacks on passwords etc.

    :param s1: First string to compare
    :type s1: string | unicode
    :param s2: Second string to compare
    :type s2: string | unicode
    :return: True if both strings are equal, False otherwise
    :rtype: bool
    """
    isOkay = True
    if type(s1) != type(s2):
        isOkay = False  # We have a unicode/str mix-up here
    if len(s1) != len(s2):
        isOkay = False
    for x, y in izip(s1, s2):
        if x != y:
            isOkay = False
    return isOkay
def infecting_node(infected_vec, infecting_vec, node_vec):
    '''
    Returns a vector of nodes of infecting events.

    Arguments:
    infecting_vec - vector of infecting event ids
    infected_vec - vector of event ids
    node_vec - vector of infected node ids
    '''
    infecting_node_vec = []
    eventid_to_node = {}
    for (evid, inf_evid, nodeid) in izip(infected_vec, infecting_vec, node_vec):
        eventid_to_node[int(evid)] = nodeid
        infecting_node_vec.append(eventid_to_node[int(inf_evid)])
    infecting_node_vec = np.array(infecting_node_vec).flatten()
    return (infecting_node_vec, eventid_to_node)
def kf_sim(sim):
    """
    Process each simulation trial generated with
    :func:`setup_random_test` with a Kalman filter and return the
    posterior state estimates and error covariances.
    """
    post = defaultdict(dict)
    for l in range(sim['L']):
        x_hat_l, P_l = kalman_filter(sim[l]['y'],
                                     sim['H'],
                                     sim['R'],
                                     sim['F'],
                                     sim['Q'],
                                     sim['mu'],
                                     sim['PI'])
        post[l]['x_hat'] = x_hat_l
        if l == 0:
            post['P'] = P_l
        post[l]['error'] = []
        for x_i, x_hat_i in izip(sim[l]['x'], post[l]['x_hat']):
            post[l]['error'].append(x_hat_i - x_i)
    return post
def sqrt_kf_sim(sim):
    """
    Process each simulation trial generated with
    :func:`setup_random_test` with a Kalman filter and return the
    posterior state estimates and error covariances.
    """
    post = defaultdict(dict)
    for l in range(sim['L']):
        x_hat_l, P_sqrt_l = sqrt_kalman_filter(sim[l]['y'],
                                               sim['H'],
                                               sim['R_sqrt'],
                                               sim['F'],
                                               sim['Q_sqrt'],
                                               sim['mu'],
                                               sim['PI_sqrt'])
        post[l]['x_hat'] = x_hat_l
        if l == 0:
            post['P'] = [NP.matmul(x, x.T) for x in P_sqrt_l]
        post[l]['error'] = []
        for x_i, x_hat_i in izip(sim[l]['x'], post[l]['x_hat']):
            post[l]['error'].append(x_hat_i - x_i)
    return post
def _copy_from(self, curs, nrecs, srec, copykw):
    f = StringIO()
    for i, c in izip(xrange(nrecs), cycle(string.ascii_letters)):
        l = c * srec
        f.write("%s\t%s\n" % (i, l))

    f.seek(0)
    curs.copy_from(MinimalRead(f), "tcopy", **copykw)

    curs.execute("select count(*) from tcopy")
    self.assertEqual(nrecs, curs.fetchone()[0])

    curs.execute("select data from tcopy where id < %s order by id",
                 (len(string.ascii_letters),))
    for i, (l,) in enumerate(curs):
        self.assertEqual(l, string.ascii_letters[i] * srec)
def __init__(self, event_funcs):
    super(_WrappedFunctionListener, self).__init__()

    if len(event_funcs) % 2 != 0:
        raise InvalidArgumentException(
            u"event_funcs must contain an even number of items, "
            u"in the form (event_name_1, func_1, event_name_2, func_2, ...)")

    for event_name, func in itertools.izip(event_funcs[::2], event_funcs[1::2]):
        if not event_name.startswith("on_"):
            event_name = "on_" + event_name
        if event_name not in AbstractListener.EVENTS:
            raise InvalidArgumentException(u"unknown event {}".format(event_name))

        def event_handler_factory(func):
            def event_handler(ctx, f=func):
                return f(ctx)
            return event_handler

        setattr(self, event_name, event_handler_factory(func))
def compare(tokens, trans_tokens):
    pairs = []
    same_len = len(tokens) == len(trans_tokens)
    consecutive = False
    for a, b in zip(tokens, trans_tokens):
        if a != b:
            if consecutive and not same_len:
                break
            if not consecutive:
                consecutive = True
                pairs.append((a, b))
        else:
            consecutive = False
    return pairs
def article_to_pairs(arg):
    article, direction = arg
    pairs = []
    if 'text' not in article:
        return []

    sents = sent_tokenize(article['text'], language='norwegian')
    translations = translate(sents, direction)
    for sent, trans in zip(sents, translations):
        trans_tokens = tokenize(trans)
        tokens = tokenize(sent)
        pairs += compare(tokens, trans_tokens)

    del article
    del sents
    del translations
    return pairs
def filter2(criterion, key_list, other_list):
    """
    Filter two lists of corresponding items based on some function of the first list.
    """
    # Make the output lists
    out1 = []
    out2 = []

    for key_val, other_val in itertools.izip(key_list, other_list):
        # Pair up the items
        if criterion(key_val):
            # The key passed the filter, so take both.
            out1.append(key_val)
            out2.append(other_val)

    return out1, out2
def save_images(nifti_files, anat, roi_dict, out_dir, **kwargs):
    '''Saves multiple nifti images using multiprocessing.

    Uses `multiprocessing`.

    Args:
        nifti_files (list): list of nifti file paths.
        anat (nipy.core.api.image.image.Image): anatomical image.
        roi_dict (dict): dictionary of cluster dictionaries.
        out_dir (str): output directory path.
        **kwargs: extra keyword arguments.

    '''
    p = mp.Pool(30)
    idx = [int(f.split('/')[-1].split('.')[0]) for f in nifti_files]
    args_iter = itertools.izip(nifti_files,
                               itertools.repeat(anat),
                               [roi_dict[i] for i in idx],
                               [path.join(out_dir, '%d.png' % i) for i in idx],
                               idx)

    p.map(save_helper, args_iter)
    p.close()
    p.join()
def DoRenaming(options, deps):
    """Copy and rename files given in options.renaming_sources and update deps."""
    src_files = list(itertools.chain.from_iterable(
        build_utils.ParseGnList(f) for f in options.renaming_sources))

    dest_files = list(itertools.chain.from_iterable(
        build_utils.ParseGnList(f) for f in options.renaming_destinations))

    if len(src_files) != len(dest_files):
        print('Renaming source and destination files do not match.')
        sys.exit(-1)

    for src, dest in itertools.izip(src_files, dest_files):
        if os.path.isdir(src):
            print('Renaming a directory is not supported.')
            sys.exit(-1)
        else:
            CopyFile(src, os.path.join(options.dest, dest), deps)
def pbkdf2_bin(data, salt, iterations=1000, keylen=24, hashfunc=None):
    """Returns a binary digest for the PBKDF2 hash algorithm of `data`
    with the given `salt`.  It iterates `iterations` times and produces a
    key of `keylen` bytes.  By default SHA-1 is used as hash function,
    a different hashlib `hashfunc` can be provided.
    """
    hashfunc = hashfunc or hashlib.sha1
    mac = hmac.new(data, None, hashfunc)

    def _pseudorandom(x, mac=mac):
        h = mac.copy()
        h.update(x)
        return map(ord, h.digest())

    buf = []
    for block in xrange(1, -(-keylen // mac.digest_size) + 1):
        rv = u = _pseudorandom(salt + _pack_int(block))
        for i in xrange(iterations - 1):
            u = _pseudorandom(''.join(map(chr, u)))
            rv = starmap(xor, izip(rv, u))
        buf.extend(rv)
    return ''.join(map(chr, buf))[:keylen]
def encode_to_proto(self):
    p = HistogramProto()
    p.min = float(self.min)
    p.max = float(self.max)
    p.num = float(self.num)
    p.sum = float(self.sum)
    p.sum_squares = float(self.sum_squares)

    bucket_limits = []
    buckets = []
    for i, (end, count) in enumerate(izip(self.bucket_limits, self.buckets)):
        if (i == len(self.bucket_limits) - 1 or
                count > 0.0 or self.buckets[i + 1] > 0.0):
            bucket_limits.append(float(end))
            buckets.append(float(count))

    p.bucket_limit.extend(bucket_limits)
    p.bucket.extend(buckets)
    return p
def entity_to_gmsh(self, e, dim, lc, gmshself=True):
    # do not duplicate entity in gmsh
    i = self.entities[dim].index(e)
    gmsh_e = self.gmsh_entities[dim][i]
    if gmsh_e is not None:
        return gmsh_e

    if dim == 0:
        # create Point
        e = e + tuple(0. for i in range(3 - self.dim))
        gmsh_e = py4gmsh.Point(e, lc)
        self.gmsh_entities[0][i] = gmsh_e
        #print gmsh_e, e
        return gmsh_e

    # dim>0: recursively generate facets and entity itself
    facets = _facets(e)
    facets = [self.entity_to_gmsh(f, dim-1, lc) for f in facets]
    orient = _orientations(dim-1)
    loop = FacetLoop[dim-1]([o+s for o, s in izip(orient, facets)])
    if gmshself:
        gmsh_e = Entity[dim](loop)
        self.gmsh_entities[dim][i] = gmsh_e
        #print gmsh_e, e
        return gmsh_e
def __setitem__(self, query_filter, value):
    """Add a new filter by setting it on all subqueries.

    If any of the setting operations raise an exception, the ones
    that succeeded are undone and the exception is propagated
    upward.

    Args:
      query_filter: a string of the form "property operand".
      value: the value that the given property is compared against.
    """
    saved_items = []
    for index, query in enumerate(self.__bound_queries):
        saved_items.append(query.get(query_filter, None))
        try:
            query[query_filter] = value
        except:
            for q, old_value in itertools.izip(self.__bound_queries[:index], saved_items):
                if old_value is not None:
                    q[query_filter] = old_value
                else:
                    del q[query_filter]
            raise
def scan(self):
    with open(self.__filename) as f:
        fields = f.readline().strip().split()
        result = {}
        for (name, format), value in itertools.izip(self.FIELDS, fields):
            result[name] = format(value)
        return result
def write_genome_fasta(self, out_fasta_fn):
    if len(self.genomes) > 1:
        with open(out_fasta_fn, 'w') as f:
            for genome_prefix, in_fasta_fn in itertools.izip(self.genome_prefixes, self.in_fasta_fns):
                with open(in_fasta_fn, 'r') as g:
                    for line in g:
                        line = line.strip()
                        if line.startswith('>'):
                            line = '>' + genome_prefix + '_' + line[1:]
                        f.write(line + '\n')
    else:
        cr_utils.copy(self.in_fasta_fns[0], out_fasta_fn)
def write_genome_gtf(self, out_gtf_fn):
    with open(out_gtf_fn, 'wb') as f:
        writer = csv.writer(f, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar='')
        for genome_prefix, in_gtf_fn in itertools.izip(self.genome_prefixes, self.in_gtf_fns):
            if len(self.genomes) > 1:
                prefix_func = lambda s: '%s_%s' % (genome_prefix, s)
            else:
                prefix_func = lambda s: s

            transcript_to_chrom = {}
            cross_chrom_transcripts = set()
            for row, is_comment, properties in self.gtf_reader_iter(in_gtf_fn):
                if is_comment:
                    writer.writerow(row)
                    continue

                chrom = prefix_func(row[0])
                row[0] = chrom

                if 'transcript_id' in properties:
                    properties['transcript_id'] = prefix_func(properties['transcript_id'])

                    curr_tx = properties['transcript_id']
                    if curr_tx in transcript_to_chrom and transcript_to_chrom[curr_tx] != chrom:
                        # ignore recurrences of a transcript on different chromosomes - it will break the STAR index
                        cross_chrom_transcripts.add(curr_tx)
                        continue
                    transcript_to_chrom[curr_tx] = chrom

                if 'gene_id' in properties:
                    properties['gene_id'] = prefix_func(properties['gene_id'])
                if 'gene_name' in properties:
                    properties['gene_name'] = prefix_func(properties['gene_name'])

                row[8] = self.format_properties_dict(properties)
                writer.writerow(row)

            print "WARNING: The following transcripts appear on multiple chromosomes in the GTF:"
            print '\n'.join(list(cross_chrom_transcripts)) + '\n'
            print "This can indicate a problem with the reference or annotations. Only the first chromosome will be counted."
def report(self):
    d = {str(k): int(v) for k, v in itertools.izip(xrange(0, 1 + self.max_value), self.counts)}
    d[">%d" % self.max_value] = int(self.counts[-1])
    return d
def get_nonzero(self):
    i_array, j_array = self.m.nonzero()
    return [(self.genes[i], self.bcs[j], self.m[i, j])
            for i, j in itertools.izip(i_array, j_array)]
def load_genes_from_h5_group(group):
    """ Load just the genes from an h5 """
    gene_ids = list(getattr(group, cr_constants.H5_GENE_IDS_ATTR).read())

    if hasattr(group, cr_constants.H5_GENE_NAMES_ATTR):
        gene_names = list(getattr(group, cr_constants.H5_GENE_NAMES_ATTR).read())
    else:
        gene_names = gene_ids

    assert len(gene_ids) == len(gene_names)
    genes = [cr_constants.Gene(id, name, None, None, None)
             for id, name in itertools.izip(gene_ids, gene_names)]

    return genes
def build_from_mol_counter(molecule_counter, subsample_rate=1.0, subsample_result=None):
    """ Construct a GeneBCMatrices object from a MoleculeCounter.
        Args: subsample_result (dict) - Return some metrics results into this dict.
    """
    # Reconstruct all barcode sequences in the original matrices
    barcode_whitelist = cr_utils.load_barcode_whitelist(molecule_counter.get_barcode_whitelist())
    barcode_length = molecule_counter.get_barcode_length() or len(barcode_whitelist[0])

    gem_groups = molecule_counter.get_gem_groups()
    barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist, gem_groups)

    # Reconstruct Gene tuples from the molecule info ref columns
    gene_ids = molecule_counter.get_ref_column('gene_ids')
    genome_ids = molecule_counter.get_ref_column('genome_ids')
    gene_names = molecule_counter.get_ref_column('gene_names')
    gene_tuples = [cr_constants.Gene(gid, gname, None, None, None)
                   for (gid, gname) in itertools.izip(gene_ids, gene_names)]
    genes = cr_utils.split_genes_by_genomes(gene_tuples, genome_ids)

    matrices = GeneBCMatrices(genome_ids, genes, barcode_seqs)

    # Track results of subsampling
    reads = 0

    for mol in molecule_counter.get_molecule_iter(barcode_length, subsample_rate=subsample_rate):
        matrices.add(mol.genome, mol.gene_id, mol.barcode)
        reads += mol.reads

    if subsample_result is not None:
        subsample_result['mapped_reads'] = reads

    return matrices
def get_molecule_iter(self, barcode_length, subsample_rate=1.0):
    """ Return an iterator on Molecule tuples """
    assert subsample_rate >= 0 and subsample_rate <= 1.0

    # Store the previous compressed barcode so we don't have to decompress every single row
    prev_compressed_bc = None
    prev_gem_group = None
    prev_bc = None

    # Load the molecule data
    mol_barcodes = self.get_column('barcode')
    mol_gem_groups = self.get_column('gem_group')
    mol_genome_ints = self.get_column('genome')
    mol_gene_ints = self.get_column('gene')
    mol_reads = self.get_column('reads')

    gene_ids = self.get_ref_column('gene_ids')
    genome_ids = self.get_ref_column('genome_ids')

    if subsample_rate < 1.0:
        mol_reads = np.random.binomial(mol_reads, subsample_rate)

    for compressed_bc, gem_group, genome_int, gene_int, reads in itertools.izip(mol_barcodes,
                                                                                mol_gem_groups,
                                                                                mol_genome_ints,
                                                                                mol_gene_ints,
                                                                                mol_reads):
        if reads == 0:
            continue

        # Decompress the cell barcode if necessary
        if compressed_bc == prev_compressed_bc and gem_group == prev_gem_group:
            bc = prev_bc
        else:
            bc = cr_utils.format_barcode_seq(self.decompress_barcode_seq(compressed_bc, barcode_length=barcode_length),
                                             gem_group)

        yield Molecule(barcode=bc,
                       genome=genome_ids[genome_int],
                       gene_id=gene_ids[gene_int],
                       reads=reads)