The following 50 code examples, extracted from open-source Python projects, illustrate how to use h5py.File().
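Before the project examples, here is a minimal, self-contained sketch of the core h5py.File() pattern most of the snippets below build on: opening a file with a mode string ('w' to create/truncate, 'r' to read), creating datasets, attaching attributes, and using the file as a context manager so it is closed automatically. The file name example.h5 and the dataset/attribute names are placeholders chosen for illustration, not taken from any of the projects below.

import numpy as np
import h5py

# Write: mode 'w' creates the file (truncating it if it already exists).
with h5py.File('example.h5', 'w') as f:
    f.create_dataset('data', data=np.arange(10), compression='gzip')  # store an array under the key 'data'
    f.attrs['description'] = 'toy dataset'                            # metadata lives in .attrs

# Read: mode 'r' opens the file read-only; datasets slice like numpy arrays.
with h5py.File('example.h5', 'r') as f:
    data = f['data'][:]            # read the whole dataset into memory
    desc = f.attrs['description']

print(data, desc)

The project examples show the same calls in context: parallel (MPI-driver) writes, chunked and compressed datasets, reading columns into pandas, and so on.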

def allocate(self, shape, data_dtype=None):
    if data_dtype is None:
        data_dtype = self.data_dtype

    if self._parallel_write:
        self.my_file = h5py.File(self.file_name, mode='w', driver='mpio', comm=comm)
        self.my_file.create_dataset(self.h5_key, dtype=data_dtype, shape=shape)
    else:
        self.my_file = h5py.File(self.file_name, mode='w')
        if self.is_master:
            if self.compression != '':
                self.my_file.create_dataset(self.h5_key, dtype=data_dtype, shape=shape,
                                            compression=self.compression, chunks=True)
            else:
                self.my_file.create_dataset(self.h5_key, dtype=data_dtype, shape=shape, chunks=True)

    self.my_file.close()
    self._read_from_header()

def test_patch_for_similarities(params, extension):
    file_out_suff = params.get('data', 'file_out_suff')
    template_file = file_out_suff + '.templates%s.hdf5' % extension
    if os.path.exists(template_file):
        try:
            myfile = h5py.File(template_file, 'r', libver='latest')
            version = myfile.get('version')[0].decode('ascii')
            myfile.close()
        except Exception:
            version = None
    else:
        raise Exception('No templates found! Check suffix?')

    if version is not None:
        if (StrictVersion(version) >= StrictVersion('0.6.0')):
            return True
    else:
        print_and_log(["Version is below 0.6.0"], 'debug', logger)
        return False

def test_validating(self):
    #mpi_launch('fitting', self.file_name, 2, 0, 'False')

    a, b = os.path.splitext(os.path.basename(self.file_name))
    file_name, ext = os.path.splitext(self.file_name)
    file_out = os.path.join(os.path.abspath(file_name), a)
    result_name = os.path.join(file_name, 'injected')

    spikes = {}
    result = h5py.File(os.path.join(result_name, '%s.result.hdf5' % a))
    for key in result.get('spiketimes').keys():
        spikes[key] = result.get('spiketimes/%s' % key)[:]

    juxta_file = file_out + '.juxta.dat'

    f = numpy.memmap(juxta_file, shape=(self.length, 1),
                     dtype=self.parser.get('validating', 'juxta_dtype'), mode='w+')
    f[spikes['temp_9']] = 100
    del f

    mpi_launch('validating', self.file_name, 2, 0, 'False')

def report(self, summary_json_paths, barcode_summary_h5_path, recovered_cells, cell_bc_seqs):
    assert len(cell_bc_seqs) == len(self.matrices)
    barcode_summary_h5 = h5.File(barcode_summary_h5_path, 'r')

    d = {}
    d.update(self._report_genome_agnostic_metrics(
        summary_json_paths, barcode_summary_h5, recovered_cells, cell_bc_seqs))

    # Compute genome-specific metrics
    for i, (genome, matrix) in enumerate(self.matrices.iteritems()):
        for key, value in matrix.report(genome,
                                        barcode_summary_h5,
                                        recovered_cells,
                                        cell_bc_seqs=cell_bc_seqs[i],
                                        ).iteritems():
            key = '_'.join([genome, key])
            d[key] = value
    return d

def write_data_frame(fn, df):
    ''' Write the pandas dataframe object to an HDF5 file. Each column is written as a single 1D dataset
        at the top level of the HDF5 file, using the native pandas datatype '''

    # Always write a fresh file -- the 'w' argument to h5py.File is supposed to truncate an existing file,
    # but it doesn't appear to work correctly
    if os.path.exists(fn):
        os.remove(fn)

    f = h5py.File(fn, "w")

    # To preserve column order, write columns to an attribute
    column_names = np.array(list(df.columns))
    f.attrs.create("column_names", column_names)

    for col in df.columns:
        write_data_column(f, df[col])

    f.close()

def read_data_frame(fn, query_cols=[]):
    ''' Load a pandas DataFrame from an HDF5 file. If a column list is specified, only load the matching columns '''

    with h5py.File(fn, 'r') as f:
        column_names = f.attrs.get("column_names")
        column_names = get_column_intersection(column_names, query_cols)

        df = p.DataFrame()

        # Add the columns progressively to save memory
        for name in column_names:
            ds = f[name]
            if has_levels(ds):
                indices = ds[:]
                uniques = get_levels(ds)
                # This method of constructing of Categorical avoids copying the indices array
                # which saves memory for big datasets
                df[name] = p.Categorical(indices, categories=uniques, ordered=False, fastpath=True)
            else:
                df[name] = p.Series(ds[:])

        return df

def read_data_frame_indexed_no_concat(fn, tabix_queries, query_cols=[], coords=True):
    ''' Read rows from the HDF5 data frame that match each tabix query in the queries list.
    A tabix query is in the form ('chr1', 100, 200). query_cols is a list of columns you want to return.
    If coords is True, then it will return coordinates regardless of query_cols.
    If coords is False, it will only return the columns specified in query_cols.
    Returns a list of pandas DataFrames, one for each query. '''

    f = h5py.File(fn, 'r')

    # read the index
    tabix_index = read_tabix_index(f)

    dfs = []
    for q in tabix_queries:
        r = _read_data_frame_indexed_sub(f, tabix_index, q, query_cols=query_cols, coords=coords)
        dfs.append(r)

    f.close()

    # Return the union of the queries
    return dfs

def check_filters(fast5_file, min_length, min_mean_qual, min_qual_window, window_size):
    try:
        hdf5_file = h5py.File(fast5_file, 'r')
        names = get_hdf5_names(hdf5_file)
        basecall_location = get_best_fastq_hdf5_location(hdf5_file, names)
        if basecall_location:
            fastq_str = hdf5_file[basecall_location].value
            try:
                parts = fastq_str.split(b'\n')
                seq, quals = parts[1], parts[3]
            except IndexError:
                fastq_str, seq, quals = '', '', ''
            if not fastq_str or not seq:
                return False, 0
            if min_mean_qual and get_mean_qscore(quals) < min_mean_qual:
                return False, 0
            if min_length and len(seq) < min_length:
                return False, 0
            if min_qual_window and get_min_window_qscore(quals, window_size) < min_qual_window:
                return False, 0
            return True, len(seq)
    except (IOError, RuntimeError):
        pass
    return False, 0

def min_window_qual_and_length(fast5_file, window_size):
    try:
        hdf5_file = h5py.File(fast5_file, 'r')
        names = get_hdf5_names(hdf5_file)
        basecall_location = get_best_fastq_hdf5_location(hdf5_file, names)
        if basecall_location:
            fastq_str = hdf5_file[basecall_location].value
            try:
                parts = fastq_str.split(b'\n')
                seq, quals = parts[1], parts[3]
                return get_min_window_qscore(quals, window_size), len(seq), fast5_file
            except IndexError:
                pass
    except (IOError, RuntimeError):
        pass
    return 0.0, 0, fast5_file

def save_h5_data_label_normal(h5_filename, data, label, normal,
                              data_dtype='float32', label_dtype='uint8', normal_dtype='float32'):
    h5_fout = h5py.File(h5_filename)
    h5_fout.create_dataset(
        'data', data=data,
        compression='gzip', compression_opts=4,
        dtype=data_dtype)
    h5_fout.create_dataset(
        'normal', data=normal,
        compression='gzip', compression_opts=4,
        dtype=normal_dtype)
    h5_fout.create_dataset(
        'label', data=label,
        compression='gzip', compression_opts=1,
        dtype=label_dtype)
    h5_fout.close()


# Write numpy array data and label to h5_filename

def main():
    parser = generate_parser()
    args = parser.parse_args()
    infile1 = h5py.File(args.input1, 'r')
    infile2 = h5py.File(args.input2, 'r')
    resolutions = numpy.intersect1d(infile1['resolutions'][...], infile2['resolutions'][...])
    # intersect the chromosome lists from both input files
    chroms = numpy.intersect1d(infile1['chromosomes'][...], infile2['chromosomes'][...])
    data1 = load_data(infile1, chroms, resolutions)
    data2 = load_data(infile2, chroms, resolutions)
    infile1.close()
    infile2.close()
    results = {}
    results[(args.input1.split('/')[-1].strip('.quasar'),
             args.input2.split('/')[-1].strip('.quasar'))] = correlate_samples(data1, data2)

    for resolution in data1.keys():
        for chromo in chroms:
            plt.scatter(data1[resolution][chromo][1].flatten(),
                        data2[resolution][chromo][1].flatten(),
                        alpha=0.1, color='red')
            plt.show()
            plt.savefig(args.output + '.res' + str(resolution) + '.chr' + chromo + '.pdf')

def fill_hdf5_with_sparse_by_chunk(mym1, mym2, fname, chunksize):
    start1 = 0
    end1 = 0
    n = mym1.shape[0]

    f = h5py.File(fname, 'w')
    m1hdf5 = f.create_dataset('m1', shape=(n, n), dtype='float')
    m2hdf5 = f.create_dataset('m2', shape=(n, n), dtype='float')

    while end1 < n:
        end1 = np.min([n, (start1 + chunksize)])
        print 'start1: ' + str(start1)
        if (end1 - start1) == 1:
            m1hdf5[start1, :] = mym1[start1, :].toarray()
            m2hdf5[start1, :] = mym2[start1, :].toarray()
        else:
            m1hdf5[start1:end1, :] = mym1[start1:end1, :].toarray()
            m2hdf5[start1:end1, :] = mym2[start1:end1, :].toarray()
        start1 = end1

    print 'sum of 1'
    print m1hdf5[:, :].sum()
    print m2hdf5[:, :].sum()
    f.close()

def __init__(self, data=None, info=None, dtype=None, file=None, copy=False, **kwargs):
    object.__init__(self)
    #self._infoOwned = False
    self._isHDF = False

    if file is not None:
        self._data = None
        self.readFile(file, **kwargs)
        if kwargs.get("readAllData", True) and self._data is None:
            raise Exception("File read failed: %s" % file)
    else:
        self._info = info
        if (hasattr(data, 'implements') and data.implements('MetaArray')):
            self._info = data._info
            self._data = data.asarray()
        elif isinstance(data, tuple):  ## create empty array with specified shape
            self._data = np.empty(data, dtype=dtype)
        else:
            self._data = np.array(data, dtype=dtype, copy=copy)

    ## run sanity checks on info structure
    self.checkInfo()

def transpose(self, *args):
    if len(args) == 1 and hasattr(args[0], '__iter__'):
        order = args[0]
    else:
        order = args

    order = [self._interpretAxis(ax) for ax in order]
    infoOrder = order + list(range(len(order), len(self._info)))
    info = [self._info[i] for i in infoOrder]
    order = order + list(range(len(order), self.ndim))

    try:
        if self._isHDF:
            return MetaArray(np.array(self._data).transpose(order), info=info)
        else:
            return MetaArray(self._data.transpose(order), info=info)
    except:
        print(order)
        raise

#### File I/O Routines

def export(self, fileName=None):
    if not HAVE_HDF5:
        raise RuntimeError("This exporter requires the h5py package, "
                           "but it was not importable.")

    if not isinstance(self.item, PlotItem):
        raise Exception("Must have a PlotItem selected for HDF5 export.")

    if fileName is None:
        self.fileSaveDialog(filter=["*.h5", "*.hdf", "*.hd5"])
        return

    dsname = self.params['Name']
    fd = h5py.File(fileName, 'a')  # forces append to file... 'w' doesn't seem to "delete/overwrite"
    data = []

    appendAllX = self.params['columnMode'] == '(x,y) per plot'
    for i, c in enumerate(self.item.curves):
        d = c.getData()
        if appendAllX or i == 0:
            data.append(d[0])
        data.append(d[1])

    fdata = numpy.array(data).astype('double')
    dset = fd.create_dataset(dsname, data=fdata)
    fd.close()

def __load_page_data(self):
    self.__clearRows()
    if hasattr(self, "selectChan"):
        with hp.File(self.file_name, "r") as f:
            sampling_rate = f["analogs"][self.selectChan]["sampling_rate"].value
            start_time = f["analogs"][self.selectChan]["start_time"].value
            start_point = sampling_rate * self.row_num * self.current_page
            end_point = sampling_rate * self.row_num * (self.current_page + 1)
            self.page_data = f["analogs"][self.selectChan]["data"][start_point:end_point]
            self.sigma = np.median(np.abs(self.page_data) / 0.6745)
            Thr = self.thresholds[self.selectChan] * self.sigma
            self.sampling_rate = sampling_rate
            self.row_wins_rois = [0] * self.row_num
            for i in range(self.row_num):
                start_point = i * sampling_rate
                end_point = (i + 1) * sampling_rate
                if self.page_data[start_point:end_point].size:
                    ys = self.page_data[start_point:end_point]
                    xs = np.arange(ys.size)
                    line = MultiLine(np.array([xs]), np.array([ys]), "w")
                    self.row_wins[i].addItem(line)
                    self.row_wins_rois[i] = pg.InfiniteLine(pos=Thr, angle=0, movable=False)
                    self.row_wins_rois[i].setZValue(10)
                    self.row_wins[i].addItem(self.row_wins_rois[i])

def __load_waveforms(self, selectChan, file_name):
    spk_startswith = "spike_{0}".format(selectChan)
    with hp.File(file_name, "r") as f:
        times = list()
        waveforms = list()
        for chn_unit in f["spikes"].keys():
            if chn_unit.startswith(spk_startswith):
                tep_time = f["spikes"][chn_unit]["times"].value
                waveform = f["spikes"][chn_unit]["waveforms"].value
                times.append(tep_time)
                waveforms.append(waveform)
        if times:
            times = np.hstack(times)
            waveforms = np.vstack(waveforms)
            sort_index = np.argsort(times)
            waveforms = waveforms[sort_index]
            return waveforms
        else:
            return None

def h5_io(filename, spike_to_load, analog_to_load):
    spikes = dict()
    analogs = dict()
    events = dict()
    comments = dict()
    with hp.File(filename, 'r') as f:
        for key in f.keys():
            if key == 'events':
                events['times'] = f[key]['times'].value
                events['labels'] = f[key]['labels'].value
            elif key == 'comments':
                comments['times'] = f[key]['times'].value
                comments['labels'] = f[key]['labels'].value
            elif key == 'spikes':
                for tem_key in f[key].keys():
                    if tem_key in spike_to_load:
                        spikes[tem_key] = f[key][tem_key]['times'].value
            elif key == 'analogs':
                for tem_key in f[key].keys():
                    if tem_key in analog_to_load:
                        analogs[tem_key] = dict()
                        analogs[tem_key]['data'] = f[key][tem_key]['data'].value
                        analogs[tem_key]['sampling_rate'] = f[key][tem_key]['sampling_rate'].value
                        analogs[tem_key]['start_time'] = f[key][tem_key]['start_time'].value
    return events, comments, spikes, analogs

def gen_tracking_db(database, tracking_stats):
    """Generate TrackingDataset structure.

    Parameters
    ----------
    database : h5py.File
        HDF5 file object
    tracking_stats : dictionary
        the dictionary that contains TrackingDataset's stats

    Returns
    -------
    database : h5py.File
        HDF5 file object with multiple groups
    """
    primary_list = tracking_stats["primary_list"]

    for pc in primary_list:
        if pc not in database:
            database.create_group(pc)
            print "[MESSAGE] Primary group %s is created" % (pc)

    print "[MESSAGE] TrackingDataset HDF5 structure is generated."

def gen_caltech256_db(database, caltech256_stats):
    """Generate Caltech-256 structure.

    Parameters
    ----------
    database : h5py.File
        HDF5 file object
    caltech256_stats : dictionary
        the dictionary that contains Caltech-256's stats

    Returns
    -------
    database : h5py.File
        HDF5 file object with multiple groups
    """
    caltech256_list = caltech256_stats["caltech256_list"]

    for class_name in caltech256_list:
        if class_name not in database:
            database.create_group(class_name)
            print "[MESSAGE] Class %s is created" % (class_name)

    print "[MESSAGE] Caltech-256 HDF5 structure is generated."

def gen_ucf50_db(database, ucf50_stats):
    """Generate UCF50 structure.

    Parameters
    ----------
    database : h5py.File
        HDF5 file object
    ucf50_stats : dictionary
        the dictionary that contains UCF50's stats

    Returns
    -------
    database : h5py.File
        HDF5 file object with multiple groups
    """
    ucf50_list = ucf50_stats["ucf50_list"]

    for category in ucf50_list:
        if category not in database:
            database.create_group(category)
            print "[MESSAGE] Category %s is created" % (category)

    print "[MESSAGE] UCF-50 HDF5 structure is generated."

def time_hdf5():
    data_path = create_hdf5(BATCH_SIZE * NSTEPS)
    f = h5py.File(data_path)
    durs = []
    for step in tqdm.trange(NSTEPS, desc='running hdf5'):
        start_time = time.time()
        arr = f['data'][BATCH_SIZE * step: BATCH_SIZE * (step + 1)]
        read_time = time.time()
        arr = copy.deepcopy(arr)
        copy_time = time.time()
        durs.append(['hdf5 read', step, read_time - start_time])
        durs.append(['hdf5 copy', step, copy_time - read_time])
    f.close()
    os.remove(data_path)
    durs = pandas.DataFrame(durs, columns=['kind', 'stepno', 'dur'])
    return durs

def mean_variance_normalisation(h5f, mvn_h5f, vad=None):
    """Do mean variance normalization. Optionally use a vad.

    Parameters:
    ----------
    h5f: str. h5features file name
    mvn_h5f: str, h5features output name
    """
    dset = h5py.File(h5f).keys()[0]
    if vad is not None:
        raise NotImplementedError
    else:
        data = h5py.File(h5f)[dset]['features'][:]
        features = data
    epsilon = np.finfo(data.dtype).eps
    mean = np.mean(data)
    std = np.std(data)
    mvn_features = (features - mean) / (std + epsilon)
    shutil.copy(h5f, mvn_h5f)
    h5py.File(mvn_h5f)[dset]['features'][:] = mvn_features

def h5features_feats2stackedfeats(fb_h5f, stackedfb_h5f, nframes=7):
    """Create stacked features version of h5features file

    Parameters:
    ----------
    fb_h5f: str. h5features file name
    stackedfb_h5f: str, h5features output name
    """
    dset_name = h5py.File(fb_h5f).keys()[0]
    files = h5py.File(fb_h5f)[dset_name]['items']

    def aux(f):
        return stack_fbanks(h5features.read(fb_h5f, from_item=f)[1][f],
                            nframes=nframes)

    def time_f(f):
        return h5features.read(fb_h5f, from_item=f)[0][f]

    h5features_compute(files, stackedfb_h5f, featfunc=aux,
                       timefunc=time_f)

def load_data(name='ac3', N=-1, prefix=None, gold=False):
    '''Load data
    '''
    if not 'mri' in name:
        if gold:
            filename = '~/compresso/data/' + name + '/gold/' + name + '_gold.h5'
        else:
            filename = '~/compresso/data/' + name + '/rhoana/' + name + '_rhoana.h5'

        with h5py.File(os.path.expanduser(filename), 'r') as hf:
            output = np.array(hf['main'], dtype=np.uint64)
    else:
        filename = '~/compresso/data/MRI/' + name + '.h5'

        with h5py.File(os.path.expanduser(filename), 'r') as hf:
            output = np.array(hf['main'], dtype=np.uint64)

    if (not N == -1):
        output = output[0:N, :, :]

    return output

def write_hdf5(file, data, label_class, label_bbox, label_landmarks):
    # transform to np array
    data_arr = np.array(data, dtype=np.float32)
    # print data_arr.shape
    # if no swapaxes, transpose to num * channel * width * height ???
    # data_arr = data_arr.transpose(0, 3, 2, 1)
    label_class_arr = np.array(label_class, dtype=np.float32)
    label_bbox_arr = np.array(label_bbox, dtype=np.float32)
    label_landmarks_arr = np.array(label_landmarks, dtype=np.float32)
    with h5py.File(file, 'w') as f:
        f['data'] = data_arr
        f['label_class'] = label_class_arr
        f['label_bbox'] = label_bbox_arr
        f['label_landmarks'] = label_landmarks_arr

# list_file format:
#   image_path | label_class | label_boundingbox(4) | label_landmarks(10)

def main():
    parser = argparse.ArgumentParser(description="""
    python add_attr_to_hdf5.py file.hdf5 attr_name attr_value

    Add an attribute to an HDF5 file.
    """)
    parser.add_argument('filepath')
    parser.add_argument('attr_name')
    parser.add_argument('attr_value')
    #parser.add_argument('-o', '--options', default='yo',
    #                    help="Some option", type='str')
    #parser.add_argument('-u', '--useless', action='store_true',
    #                    help='Another useless option')
    args = parser.parse_args()

    with h5py.File(args.filepath) as f:
        f.attrs[args.attr_name] = args.attr_value

def dump(self, target):
    """Serializes MPArray to :code:`h5py.Group`. Recover using :func:`~load`.

    :param target: :code:`h5py.Group` the instance should be saved to or
        path to h5 file (it's then serialized to /)

    """
    if isinstance(target, str):
        import h5py
        with h5py.File(target, 'w') as outfile:
            return self.dump(outfile)

    for prop in ('ranks', 'shape'):
        # these are only saved for convenience
        target.attrs[prop] = str(getattr(self, prop))

    # these are actually used in MPArray.load
    target.attrs['len'] = len(self)
    target.attrs['canonical_form'] = self.canonical_form

    for site, lten in enumerate(self._lt):
        target[str(site)] = lten

def test_dump_and_load(tmpdir, dtype):
    mpa = factory.random_mpa(5, [(4,), (2, 3), (1,), (4,), (4, 3)], (4, 7, 1, 3), dtype=dtype)
    mpa.canonicalize(left=1, right=3)

    with h5.File(str(tmpdir / 'dump_load_test.h5'), 'w') as buf:
        newgroup = buf.create_group('mpa')
        mpa.dump(newgroup)

    with h5.File(str(tmpdir / 'dump_load_test.h5'), 'r') as buf:
        mpa_loaded = mp.MPArray.load(buf['mpa'])

    assert_mpa_identical(mpa, mpa_loaded)

    mpa.dump(str(tmpdir / 'dump_load_test_str.h5'))
    mpa_loaded = mp.MPArray.load(str(tmpdir / 'dump_load_test_str.h5'))
    assert_mpa_identical(mpa, mpa_loaded)


###############################################################################
#                          Algebraic operations                               #
###############################################################################

def average_models(best, L=6, model_dir='', model_name='ra.h5'):
    print '... merging'
    print '{} {:d}-{:d}'.format(model_dir, best - L / 2, best + L / 2)
    params = {}
    side_info = {}
    attrs = {}
    for i in xrange(max(best - L / 2, 0), best + L / 2):
        with h5py.File(osp.join(model_dir, model_name + '.' + str(i)), 'r') as f:
            for k, v in f.attrs.items():
                attrs[k] = v
            for p in f.keys():
                if '#' not in p:
                    side_info[p] = f[p][...]
                elif p in params:
                    params[p] += np.array(f[p]).astype('float32') / L
                else:
                    params[p] = np.array(f[p]).astype('float32') / L
    with h5py.File(osp.join(model_dir, model_name + '.merge'), 'w') as f:
        for p in params.keys():
            f[p] = params[p]
        for s in side_info.keys():
            f[s] = side_info[s]
        for k, v in attrs.items():
            f.attrs[k] = v

def save_h5(filename, **kwargs):
    '''Save data to an hdf5 file.

    Parameters
    ----------
    filename : str
        Path to the file

    kwargs
        key-value pairs of data

    See Also
    --------
    load_h5
    '''
    with h5py.File(filename, 'w') as hf:
        hf.update(kwargs)

def save_as_hdf5_acc(g, outHDF5):
    NumAcc = len(g.accessions)
    log.info("Writing into HDF5 file acc wise")
    h5file = h5py.File(outHDF5, 'w')
    NumSNPs = len(g.snps)
    h5file.create_dataset('accessions', data=g.accessions, shape=(NumAcc,))
    h5file.create_dataset('positions', data=g.positions, shape=(NumSNPs,), dtype='i4')
    h5file['positions'].attrs['chrs'] = g.chrs
    h5file['positions'].attrs['chr_regions'] = g.chr_regions
    h5file.create_dataset('snps', shape=(NumSNPs, NumAcc), dtype='int8',
                          compression="gzip", chunks=((NumSNPs, 1)))
    for i in range(NumAcc):
        h5file['snps'][:, i] = np.array(g.snps)[:, i]
        if (i + 1) % 10 == 0:
            log.info("written SNP info for %s accessions", i + 1)
    h5file['snps'].attrs['data_format'] = g.data_format
    h5file['snps'].attrs['num_snps'] = NumSNPs
    h5file['snps'].attrs['num_accessions'] = NumAcc
    h5file.close()

def get_1000G_snps(sumstats, out_file):
    sf = np.loadtxt(sumstats, dtype=str, skiprows=1)
    h5f = h5py.File('ref/Misc/1000G_SNP_info.h5', 'r')
    rf = h5f['snp_chr'][:]
    h5f.close()
    ind1 = np.in1d(sf[:, 1], rf[:, 2])
    ind2 = np.in1d(rf[:, 2], sf[:, 1])
    sf1 = sf[ind1]
    rf1 = rf[ind2]
    ### check order ###
    if sum(sf1[:, 1] == rf1[:, 2]) == len(rf1[:, 2]):
        print 'Good!'
    else:
        print 'Shit happens, sorting sf1 to have the same order as rf1'
        O1 = np.argsort(sf1[:, 1])
        O2 = np.argsort(rf1[:, 2])
        O3 = np.argsort(O2)
        sf1 = sf1[O1][O3]
    out = ['hg19chrc snpid a1 a2 bp or p' + '\n']
    for i in range(len(sf1[:, 1])):
        out.append(sf1[:, 0][i] + ' ' + sf1[:, 1][i] + ' ' + sf1[:, 2][i] + ' ' + sf1[:, 3][i] + ' ' +
                   rf1[:, 1][i] + ' ' + sf1[:, 5][i] + ' ' + sf1[:, 6][i] + '\n')
    ff = open(out_file, "w")
    ff.writelines(out)
    ff.close()

def load_weights(params, path, num_conv):
    print 'Loading gan weights from ' + path
    with h5py.File(path, 'r') as hdf5:
        params['skipthought2image'] = theano.shared(np.copy(hdf5['skipthought2image']))
        params['skipthought2image-bias'] = theano.shared(np.copy(hdf5['skipthought2image-bias']))

        for i in xrange(num_conv):
            params['W_conv{}'.format(i)] = theano.shared(np.copy(hdf5['W_conv{}'.format(i)]))
            params['b_conv{}'.format(i)] = theano.shared(np.copy(hdf5['b_conv{}'.format(i)]))

            # Flip w,h axes
            params['W_conv{}'.format(i)] = params['W_conv{}'.format(i)][:, :, ::-1, ::-1]

            w = np.abs(np.copy(hdf5['W_conv{}'.format(i)]))
            print 'W_conv{}'.format(i), np.min(w), np.mean(w), np.max(w)
            b = np.abs(np.copy(hdf5['b_conv{}'.format(i)]))
            print 'b_conv{}'.format(i), np.min(b), np.mean(b), np.max(b)

    return params

def _load_sentences_embeddings(self):
    # load the test sentences and the expected LM embeddings
    with open(os.path.join(FIXTURES, 'sentences.json')) as fin:
        sentences = json.load(fin)

    # the expected embeddings
    expected_lm_embeddings = []
    for k in range(len(sentences)):
        embed_fname = os.path.join(
            FIXTURES, 'lm_embeddings_{}.hdf5'.format(k)
        )
        expected_lm_embeddings.append([])
        with h5py.File(embed_fname, 'r') as fin:
            for i in range(10):
                sent_embeds = fin['%s' % i][...]
                sent_embeds_concat = numpy.concatenate(
                    (sent_embeds[0, :, :], sent_embeds[1, :, :]),
                    axis=-1
                )
                expected_lm_embeddings[-1].append(sent_embeds_concat)

    return sentences, expected_lm_embeddings

def test_read_hdf5_format_file(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    vocab.add_token_to_namespace("word2")
    embeddings_filename = self.TEST_DIR + "embeddings.hdf5"
    embeddings = numpy.random.rand(vocab.get_vocab_size(), 5)
    with h5py.File(embeddings_filename, 'w') as fout:
        _ = fout.create_dataset(
            'embedding', embeddings.shape, dtype='float32', data=embeddings
        )

    params = Params({
        'pretrained_file': embeddings_filename,
        'embedding_dim': 5,
    })
    embedding_layer = Embedding.from_params(vocab, params)
    assert numpy.allclose(embedding_layer.weight.data.numpy(), embeddings)

def test_read_hdf5_raises_on_invalid_shape(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("word")
    embeddings_filename = self.TEST_DIR + "embeddings.hdf5"
    embeddings = numpy.random.rand(vocab.get_vocab_size(), 10)
    with h5py.File(embeddings_filename, 'w') as fout:
        _ = fout.create_dataset(
            'embedding', embeddings.shape, dtype='float32', data=embeddings
        )

    params = Params({
        'pretrained_file': embeddings_filename,
        'embedding_dim': 5,
    })
    with pytest.raises(ConfigurationError):
        _ = Embedding.from_params(vocab, params)

def _read_pretrained_hdf5_format_embedding_file(embeddings_filename: str, # pylint: disable=invalid-name
                                                embedding_dim: int,
                                                vocab: Vocabulary,
                                                namespace: str = "tokens") -> torch.FloatTensor:
    """
    Reads from a hdf5 formatted file. The embedding matrix is assumed to
    be keyed by 'embedding' and of size ``(num_tokens, embedding_dim)``.
    """
    with h5py.File(embeddings_filename, 'r') as fin:
        embeddings = fin['embedding'][...]

    if list(embeddings.shape) != [vocab.get_vocab_size(namespace), embedding_dim]:
        raise ConfigurationError(
                "Read shape {0} embeddings from the file, but expected {1}".format(
                        list(embeddings.shape), [vocab.get_vocab_size(namespace), embedding_dim]))

    return torch.FloatTensor(embeddings)

def load_grid8(return_imsize=True):
    """Load grid 8x8.

    Parameters
    ----------
    return_imsize : bool
        return a tuple with grid size if True

    Returns
    -------
    db : h5py.File
        a HDF5 file object
    imsize : tuple (optional)
        grid size
    """
    file_path = os.path.join(rlvision.RLVISION_DATA,
                             "HDF5", "gridworld_8.hdf5")
    if not os.path.isfile(file_path):
        raise ValueError("The dataset %s is not existed!" % (file_path))

    if return_imsize is True:
        return h5py.File(file_path, mode="r"), (8, 8)
    else:
        return h5py.File(file_path, mode="r")

def encoder(args, model):
    latent_dim = args.latent_dim
    data, charset = load_dataset(args.data, split=False)

    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=latent_dim)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    x_latent = model.encoder.predict(data)
    if args.save_h5:
        h5f = h5py.File(args.save_h5, 'w')
        h5f.create_dataset('charset', data=charset)
        h5f.create_dataset('latent_vectors', data=x_latent)
        h5f.close()
    else:
        np.savetxt(sys.stdout, x_latent, delimiter='\t')

def main():
    args = get_arguments()
    model = MoleculeVAE()
    data, data_test, charset = load_dataset(args.data)

    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=args.latent_dim)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    x_latent = model.encoder.predict(data)
    if not args.visualize:
        if not args.save_h5:
            np.savetxt(sys.stdout, x_latent, delimiter='\t')
        else:
            h5f = h5py.File(args.save_h5, 'w')
            h5f.create_dataset('charset', data=charset)
            h5f.create_dataset('latent_vectors', data=x_latent)
            h5f.close()
    else:
        visualize_latent_rep(args, model, x_latent)

def fetch_data_one(self, dataitem, cycle):
    self.h5 = mrT.File(self.filename, 'r')
    try:
        data = self.h5[self.cycle_header + str(cycle)]['SE_DATASET'][dataitem]
    except ValueError:
        try:
            data = self.h5[self.cycle_header + str(cycle)].attrs.get(dataitem, None)
        except TypeError:
            data = self.h5[self.cycle_header + str(cycle)][dataitem]
    try:
        while data.shape[0] < 2:
            data = data[0]
    except (IndexError, AttributeError):
        pass
    self.h5.close()
    return data

def fromh5(path, datapath=None, dataslice=None, asnumpy=True, preptrain=None):
    """
    Opens a hdf5 file at path, loads in the dataset at datapath, and returns dataset
    as a numpy array.
    """
    # Check if path exists (thanks Lukas!)
    assert os.path.exists(path), "Path {} does not exist.".format(path)
    # Init file
    h5file = h5.File(path)
    # Init dataset
    h5dataset = h5file[datapath] if datapath is not None else h5file.values()[0]
    # Slice dataset
    h5dataset = h5dataset[dataslice] if dataslice is not None else h5dataset
    # Convert to numpy if required
    h5dataset = np.asarray(h5dataset) if asnumpy else h5dataset
    # Apply preptrain
    h5dataset = preptrain(h5dataset) if preptrain is not None else h5dataset
    # Close file
    h5file.close()
    # Return
    return h5dataset

def __check_valid_key__(self, key):
    file = h5py.File(self.file_name)
    all_fields = []
    file.visit(all_fields.append)
    if not key in all_fields:
        print_and_log(['The key %s can not be found in the dataset! Keys found are:' % key,
                       ", ".join(all_fields)], 'error', logger)
        sys.exit(1)
    file.close()

def _open(self, mode='r'):
    if mode in ['r+', 'w'] and self._parallel_write:
        self.my_file = h5py.File(self.file_name, mode=mode, driver='mpio', comm=comm)
    else:
        self.my_file = h5py.File(self.file_name, mode=mode)

    self.data = self.my_file.get(self.h5_key)

def set_streams(self, stream_mode):
    if stream_mode == 'single-file':
        sources = []
        to_write = []
        count = 0
        params = self.get_description()
        my_file = h5py.File(self.file_name)
        all_matches = [re.findall('\d+', u) for u in my_file.keys()]
        all_streams = []
        for m in all_matches:
            if len(m) > 0:
                all_streams += [int(m[0])]

        idx = numpy.argsort(all_streams)

        for i in xrange(len(all_streams)):
            params['h5_key'] = my_file.keys()[idx[i]]
            new_data = type(self)(self.file_name, params)
            sources += [new_data]
            to_write += ['We found the datafile %s with t_start %d and duration %d'
                         % (new_data.file_name, new_data.t_start, new_data.duration)]

        print_and_log(to_write, 'debug', logger)
        return sources

    elif stream_mode == 'multi-files':
        return H5File.set_streams(stream_mode)