The following 21 code examples, extracted from open-source Python projects, illustrate how to use h5py.special_dtype().
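Before the individual examples, here is a minimal, self-contained sketch of the three uses of special_dtype() that recur below: variable-length strings, variable-length (ragged) numeric arrays, and object references. The file name example.h5 and the dataset names are arbitrary placeholders. On h5py 2.10 and later, h5py.string_dtype() and h5py.vlen_dtype() are the preferred spellings, but special_dtype() remains supported.

# Minimal sketch: the common special_dtype() variants used in the examples below.
import numpy as np
import h5py

str_dt = h5py.special_dtype(vlen=str)                     # variable-length UTF-8 strings
vlen_int_dt = h5py.special_dtype(vlen=np.dtype('int32'))  # ragged int32 arrays
ref_dt = h5py.special_dtype(ref=h5py.Reference)           # HDF5 object references

with h5py.File('example.h5', 'w') as f:
    words = f.create_dataset('words', (3,), dtype=str_dt)
    words[:] = ['alpha', 'beta', 'gamma']

    ragged = f.create_dataset('ragged', (2,), dtype=vlen_int_dt)
    ragged[0] = np.arange(3, dtype='int32')   # rows may have different lengths
    ragged[1] = np.arange(5, dtype='int32')

    refs = f.create_dataset('refs', (1,), dtype=ref_dt)
    refs[0] = words.ref                       # store a reference to the 'words' dataset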
def create_levels(ds, levels):
    # Create a dataset in the LEVEL_GROUP
    # and store as native numpy / h5py types
    level_grp = ds.file.get(LEVEL_GROUP)
    if level_grp is None:
        # Create a LEVEL_GROUP
        level_grp = ds.file.create_group(LEVEL_GROUP)
    ds_name = ds.name.split("/")[-1]
    dt = h5py.special_dtype(vlen=str)
    level_grp.create_dataset(ds_name, shape=[len(levels)], maxshape=(None,),
                             dtype=dt, data=levels,
                             compression=COMPRESSION, chunks=(CHUNK_SIZE,))
def text_to_h5py_dataset(text_path, dst_path):
    # The simplest is to load everything to memory first.
    # If memory becomes an issue, this code can be optimized.
    words = []
    with open(text_path, 'r') as src:
        for line in src:
            words.extend(line.strip().split())

    with h5py.File(dst_path, 'w') as dst:
        dtype = h5py.special_dtype(vlen=bytes)
        table = dst.create_dataset('words', (len(words),), dtype=dtype)
        table[:] = words

        dst.attrs['split'] = H5PYDataset.create_split_array({
            'train': {'words': (0, len(words))}
        })
def add_words_ids_to_squad(h5_file, vocab):
    """Digitizes text with a vocabulary.

    Also saves the vocabulary into the hdf5 file.
    """
    with h5py.File(h5_file, 'a') as dst:
        unicode_dtype = h5py.special_dtype(vlen=unicode)
        dst.create_dataset('text_ids', (dst['text'].shape[0],), 'int64')
        dst.create_dataset('vocab_words', (vocab.size(),), unicode_dtype)
        dst.create_dataset('vocab_freqs', (vocab.size(),), 'int64')
        dst['text_ids'][:] = map(vocab.word_to_id, dst['text'][:])
        dst['vocab_words'][:] = vocab.words
        dst['vocab_freqs'][:] = vocab.frequencies


### SNLI ###
def add_text_h5(fid, path, data):
    """Add text data (UTF-8) to the given path in the HDF5 file with handle fid.

    Arguments:
    - fid is the file handle to the HDF5 file
    - path is the base path inside the HDF5 file
    - data is the text data as a string
    """
    dset = fid.create_dataset(name=path, shape=(1,),
                              dtype=h5py.special_dtype(vlen=str),
                              data=data, compression="gzip")
def load_pretrained():
    #glove_vec = ["glove_wiki_50","glove_wiki_150","glove_wiki_300"]
    glove_vec = ["glove_wiki_300"]
    #glove_vec = ["glove_wiki_50"]
    filename = 'glove_pretrained.h5'
    #import tensorflow as tf
    #sess = tf.InteractiveSession()

    features, words = load_h5py('glove_wiki_300', filename=root + glove_vec_fold + filename)
    filename = 'glove.h5'
    features = normalize(np.array(features), axis=1, norm='l2')
    with h5py.File(root + glove_vec_fold + filename, "w") as hf:
        hf.create_dataset(glove_vec[0], data=features)
        string_dt = h5py.special_dtype(vlen=str)
        hf.create_dataset(glove_vec[0] + "_words", data=words, dtype=string_dt)

    for vec in glove_vec:
        data, words = load_h5py(vec, filename=root + glove_vec_fold + "glove.h5")
        print(data.shape, words.shape)
        time.sleep(5)
def create_partition_function(self, f_w2v, f_h5):
    print("Building the partition function")

    # Load the model from disk
    M = load_w2vec()
    words = M.wv.index2word

    ZT = []
    INPUT_ITR = tqdm.tqdm(words)

    # Compute the partition function for each word
    for w in INPUT_ITR:
        UE = self.energy(M.wv.syn0, M[w])
        z = compute_partition_stats(UE)
        ZT.append(z)

    # Save the partition function to disk
    # (special care needed for h5py unicode strings)
    dt = h5py.special_dtype(vlen=unicode)
    with h5py.File(f_h5, 'w') as h5:
        h5.create_dataset("words", (len(words),),
                          dtype=dt,
                          data=[w.encode('utf8') for w in words])
        h5.attrs['vocab_N'] = len(words)
        h5['Z'] = ZT
def _precompute(self, Xy_generator, cache):
    with h5py.File(cache, mode='w') as fp:
        # initialize with a fixed number of sequences
        n_sequences = 1000

        y = fp.create_dataset(
            'y', shape=(n_sequences, ),
            dtype=h5py.special_dtype(vlen=bytes),
            maxshape=(None, ))

        for i, (X_, y_) in enumerate(Xy_generator):
            if i == 0:
                _, n_samples, n_features = X_.shape
                X = fp.create_dataset(
                    'X', dtype=X_.dtype, compression='gzip',
                    shape=(n_sequences, n_samples, n_features),
                    chunks=(1, n_samples, n_features),
                    maxshape=(None, n_samples, n_features))

            # increase number of sequences on demand
            if i == n_sequences:
                n_sequences = int(n_sequences * 1.1)
                y.resize(n_sequences, axis=0)
                X.resize(n_sequences, axis=0)

            # store current X, y in file
            y[i] = y_
            X[i] = X_

        # resize file to exactly match the number of sequences
        y.resize(i, axis=0)
        X.resize(i, axis=0)
def save_h5py(arrays, string_arrs, names, filename="glove.h5"):
    with h5py.File(filename, "w") as hf:
        for i in range(len(arrays)):
            hf.create_dataset(names[i], data=arrays[i])
            string_dt = h5py.special_dtype(vlen=str)
            hf.create_dataset(names[i] + "_words", data=string_arrs[i], dtype=string_dt)
    return True
def to_hdf5(self, hf, df, **kwargs):
    """
    Add datasets to a group for an HDF5 file handler
    """
    if self.dielectronic:
        grp_name = '/'.join([self.element, self.ion_name, 'dielectronic', self.filetype])
    else:
        grp_name = '/'.join([self.element, self.ion_name, self.filetype])
    if grp_name not in hf:
        grp = hf.create_group(grp_name)
        grp.attrs['chianti_version'] = df.meta['chianti_version']
        grp.attrs['footer'] = df.meta['footer']
    else:
        grp = hf[grp_name]
    hf['/'.join([self.element, self.ion_name])].attrs['element'] = self.element
    hf['/'.join([self.element, self.ion_name])].attrs['ion'] = self.ion_name
    for name in df.colnames:
        col = df[name]
        if type(col) == u.Quantity:
            data = col.value
        else:
            data = col.data
        if '<U' in data.dtype.str:
            numchar = data.dtype.str[2:]
            data = data.astype('|S{}'.format(numchar))
        if name in grp:
            ds = grp[name]
        else:
            if data.dtype == np.dtype('O'):
                ragged_dtype = h5py.special_dtype(vlen=np.dtype('float64'))
                ds = grp.create_dataset(name, data=data, dtype=ragged_dtype)
            else:
                ds = grp.create_dataset(name, data=data, dtype=data.dtype)
        if col.unit is None:
            ds.attrs['unit'] = 'SKIP'
        else:
            ds.attrs['unit'] = col.unit.to_string()
        ds.attrs['description'] = df.meta['descriptions'][name]
def export_data_h5(vocabulary, embedding_matrix, output='embedding.h5'):
    f = h5py.File(output, "w")
    compress_option = dict(compression="gzip", compression_opts=9, shuffle=True)
    words_flatten = '\n'.join(vocabulary)
    f.attrs['vocab_len'] = len(vocabulary)
    print len(vocabulary)
    dt = h5py.special_dtype(vlen=str)
    _dset_vocab = f.create_dataset('words_flatten', (1, ), dtype=dt, **compress_option)
    _dset_vocab[...] = [words_flatten]
    _dset = f.create_dataset('embedding', embedding_matrix.shape,
                             dtype=embedding_matrix.dtype, **compress_option)
    _dset[...] = embedding_matrix
    f.flush()
    f.close()
def cli_render(input, output, size):
    '''Render a JSONlines dataset to numpy arrays, saved in an HDF5 file.
    '''
    chars = []
    images = []
    for line in input:
        datum = json.loads(line)
        chars.append(datum['target'])
        images.append(render(
            [np.array(s) for s in datum['strokes']], size))
    vocab = list(sorted(set(chars)))
    char_to_index = {ch: y for y, ch in enumerate(vocab)}
    with h5py.File(output, 'a') as f:
        str_dt = h5py.special_dtype(vlen=str)
        f.require_dataset(
            'vocab', (len(vocab),), dtype=str_dt
        )[...] = vocab
        f.require_dataset(
            'x', shape=(len(images), size, size), dtype=np.float32
        )[...] = np.array(images)
        f.require_dataset(
            'y', shape=(len(chars),), dtype=np.int
        )[...] = np.array([char_to_index[ch] for ch in chars])
def _save_hdf5(self, buffer_list):
    """
    :param buffer_list:
    :return:
    """
    file_name = strftime("%Y-%m-%d-%H:%M:%S", gmtime())
    output_path = os.path.join(self.data_store_path, file_name)
    h5file = h5py.File(output_path, 'w', chunk=True)
    dt_vlen = h5py.special_dtype(vlen=str)
    dt_arr = np.dtype((dt_vlen, (self.sent_max_len,)))
    h5raw = h5file.create_dataset('rawdata', (len(buffer_list),), dtype=dt_arr)
    for i in range(len(buffer_list)):
        h5raw[i] = np.array(buffer_list[i], dtype=object)
    h5file.flush()
    h5file.close()
def hdf_create(self, output_path, filecnt, channel, image_arr, shape_arr, lable_arr, name_arr):
    h5file = h5py.File(output_path, mode='w')
    dtype = h5py.special_dtype(vlen=np.dtype('uint8'))
    hdf_features = h5file.create_dataset('image_features', (filecnt,), dtype=dtype)
    hdf_shapes = h5file.create_dataset('image_features_shapes', (filecnt, channel), dtype='int32')
    hdf_labels = h5file.create_dataset('targets', (filecnt,), dtype='S240')
    hdf_names = h5file.create_dataset('names', (filecnt,), dtype='S240')

    # Attach shape annotations and scales
    hdf_features.dims.create_scale(hdf_shapes, 'shapes')
    hdf_features.dims[0].attach_scale(hdf_shapes)

    hdf_shapes_labels = h5file.create_dataset('image_features_shapes_labels', (3,), dtype='S7')
    hdf_shapes_labels[...] = ['channel'.encode('utf8'),
                              'height'.encode('utf8'),
                              'width'.encode('utf8')]
    hdf_features.dims.create_scale(hdf_shapes_labels, 'shape_labels')
    hdf_features.dims[0].attach_scale(hdf_shapes_labels)

    # Add axis annotations
    hdf_features.dims[0].label = 'batch'

    for i in range(len(image_arr)):
        hdf_features[i] = image_arr[i]
        hdf_shapes[i] = shape_arr[i]
        hdf_labels[i] = lable_arr[i]
        hdf_names[i] = name_arr[i]

    h5file.flush()
    h5file.close()
def write_psites(tpsites, psites_number, filename):
    with h5py.File(filename, "w") as fout:
        ds = h5py.special_dtype(vlen=str)
        dt = h5py.special_dtype(vlen=np.dtype("int32"))
        fout.create_dataset("transcript_ids", data=tpsites.keys(), dtype=ds)
        fout.create_dataset("p_sites", data=tpsites.values(), dtype=dt, compression="gzip")
        fout.create_dataset("psites_number", data=psites_number, dtype="int32")
    return None
def dump_h5_var(filename, prefix, prefix_shape, data):
    '''
    Dumps variable-length data to a new dataset or appends to the existing dataset
    '''
    h5f = h5py.File(filename, 'a')
    ds = h5f.get(prefix)
    ds_shp = h5f.get(prefix_shape)
    if not ds:
        var_dt = h5py.special_dtype(vlen=np.dtype(data[0].dtype))
        ds = h5f.create_dataset(prefix, shape=(len(data),), maxshape=(None,), dtype=var_dt)
        dim = len(data[0].shape)
        ds_shp = h5f.create_dataset(prefix_shape, shape=(len(data), dim),
                                    maxshape=(None, dim), dtype=np.int64)
        offset = 0
        offset_shp = 0
    else:
        offset = len(ds)
        offset_shp = len(ds)
        ds.resize(len(ds) + len(data), axis=0)
        ds_shp.resize(len(ds_shp) + len(data), axis=0)

    for i in range(len(data)):
        ds[offset+i] = data[i].flatten()
        ds_shp[offset_shp+i] = data[i].shape

    h5f.close()
def export_data_h5(vocabulary, embedding_matrix, output='embedding.h5'):
    f = h5py.File(output, "w")
    compress_option = dict(compression="gzip", compression_opts=9, shuffle=True)
    words_flatten = '\n'.join(vocabulary)
    f.attrs['vocab_len'] = len(vocabulary)
    dt = h5py.special_dtype(vlen=str)
    _dset_vocab = f.create_dataset('words_flatten', (1, ), dtype=dt, **compress_option)
    _dset_vocab[...] = [words_flatten]
    _dset = f.create_dataset('embedding', embedding_matrix.shape,
                             dtype=embedding_matrix.dtype, **compress_option)
    _dset[...] = embedding_matrix
    f.flush()
    f.close()
def add_word_ids_to_snli(h5_file, vocab):
    with h5py.File(h5_file, 'a') as dst:
        N = len(dst['sentence1'])
        assert len(dst['sentence2']) == N

        dst.create_dataset('vocab_words', (vocab.size(),), h5py.special_dtype(vlen=unicode))
        dst.create_dataset('vocab_freqs', (vocab.size(),), 'int64')
        dst['vocab_words'][:] = vocab.words
        dst['vocab_freqs'][:] = vocab.frequencies

        dtype = h5py.special_dtype(vlen=np.dtype('int32'))
        sentence1_ds = dst.create_dataset('sentence1_ids', (N, ), dtype=dtype)
        sentence2_ds = dst.create_dataset('sentence2_ids', (N, ), dtype=dtype)

        ### h5py nonsense ###
        sentence1_ds_shapes = dst.create_dataset('sentence1_ids_shapes', (N, 1), dtype=("int"))
        sentence2_ds_shapes = dst.create_dataset('sentence2_ids_shapes', (N, 1), dtype=("int"))
        ds_shape_labels = dst.create_dataset('ds_ids_shape_labels', (1, ), dtype=("S20"))
        ### h5py nonsense ###

        sentence1_ds[:] = np.array([[vocab.word_to_id(w) for w in s] for s in dst['sentence1'][:]])
        sentence2_ds[:] = np.array([[vocab.word_to_id(w) for w in s] for s in dst['sentence2'][:]])

        ### h5py nonsense ###
        sentence1_ds_shapes[:] = np.array([np.array(x).shape for x in dst['sentence1'][:]])
        sentence2_ds_shapes[:] = np.array([np.array(x).shape for x in dst['sentence2'][:]])
        ds_shape_labels[:] = np.array(['sentence_len'])

        sentence1_ds.dims.create_scale(sentence1_ds_shapes, 'shapes')
        sentence1_ds.dims[0].attach_scale(sentence1_ds_shapes)
        sentence1_ds.dims.create_scale(ds_shape_labels, 'shape_labels')
        sentence1_ds.dims[0].attach_scale(ds_shape_labels)

        sentence2_ds.dims.create_scale(sentence2_ds_shapes, 'shapes')
        sentence2_ds.dims[0].attach_scale(sentence2_ds_shapes)
        sentence2_ds.dims.create_scale(ds_shape_labels, 'shape_labels')
        sentence2_ds.dims[0].attach_scale(ds_shape_labels)
        ### h5py nonsense ###

        dst.attrs['split'] = H5PYDataset.create_split_array({
            'all': {
                'sentence1': (0, N),
                'sentence2': (0, N),
                'sentence1_ids': (0, N),
                'sentence2_ids': (0, N),
                'label': (0, N),
                'text': (0, len(dst['text']))
            }
        })
def processNMostCommon(N=3, wavdirpath=PATH_TRAIN_IN_16KWAVS,
                       xmlpicklepath=PATH_TRAIN_OUT_XMLPICKLEFILE,
                       todirrootpath=PATH_TRAIN_OUT_HDF5):
    global spectrogramWindowLength
    if not os.path.exists(todirrootpath):
        os.makedirs(todirrootpath)

    spectrogramHeight = 200

    f = h5py.File(os.path.join(todirrootpath, "data_top{}_nozero.hdf5".format(N)), "w")
    dsetX = f.create_dataset('X', (0, 1, spectrogramHeight, spectrogramWindowLength),
                             maxshape=(None, 1, spectrogramHeight, spectrogramWindowLength))
    dsety = f.create_dataset('y', (0, N), maxshape=(None, N))
    dsetMediaId = f.create_dataset('MediaId', (0, 1), maxshape=(None, 1))
    dsetClassId = f.create_dataset('ClassId', (0, 1), maxshape=(None, 1),
                                   dtype=h5py.special_dtype(vlen=unicode))

    import pickle
    df = pd.read_pickle(xmlpicklepath)  # read the metadata
    # if we would like to keep recordings with a given quality then we can do it here by uncommenting the next line
    #df = filterByQuality(df, 0, 3)
    df["OFGS"] = df.apply(mergeOFGS, axis=1)  # merge Order, Family, Genus, Species
    df_mc = getMostCommon(df, N)  # get N most common classes from the dataset
    df = None  # let GC free up some memory
    print("Metadata loaded")

    # Shuffle rows
    df_mc = df_mc.iloc[np.random.permutation(len(df_mc))]
    df_mc.reset_index(drop=True, inplace=True)

    (lb, binaryLabels) = getOneHotClassId(df_mc)  # generate one-hot labels
    pickle.dump(lb, open(os.path.join(todirrootpath, "labelBinarizer_top{}.pickle".format(N)), 'wb'))

    # process the selected files of top N classes and save the data into HDF5
    fileRanges = np.hstack((np.arange(0, len(df_mc), 30), len(df_mc)))
    for i in range(len(fileRanges)-1):
        tempSG = wavsToSpectrogramByList(wavdirpath,
                                         df_mc.FileName[fileRanges[i]: fileRanges[i+1]],
                                         dontFilter=False)
        X, y, fn, cIds = spectrogramListToT4(tempSG, \
            binaryLabels[fileRanges[i]: fileRanges[i+1]], \
            filenames=df_mc.MediaId[fileRanges[i]: fileRanges[i+1]].values, N=spectrogramWindowLength, \
            classIds=df_mc.ClassId[fileRanges[i]: fileRanges[i+1]].values)  # convert to t4
        pre_len = dsetX.shape[0]
        add_len = X.shape[0]
        dsetX.resize(pre_len + add_len, axis=0)
        dsety.resize(pre_len + add_len, axis=0)
        dsetMediaId.resize(pre_len + add_len, axis=0)
        dsetClassId.resize(pre_len + add_len, axis=0)
        dsetX[pre_len:pre_len+add_len, :, :, :] = X
        dsety[pre_len:pre_len+add_len, :] = y
        dsetMediaId[pre_len:pre_len+add_len, :] = np.transpose([[int(i) for i in fn]])
        dsetClassId[pre_len:pre_len+add_len, :] = np.transpose([[s.encode('utf8') for s in cIds]])
        f.flush()

    f.close()
    return (X, y, fn)  # return last batch for debug purposes
def get_mat_test_metadata():
    test_f = h5py.File(test_mat_metadata_file, 'w')
    f = h5py.File(train_mat_metadata_file)
    refs, ds = f['#refs#'], f['digitStruct']
    t_ds = test_f.create_group('digitStruct')
    ref_dtype = h5py.special_dtype(ref=h5py.Reference)
    t_refs = test_f.create_group('#refs#')
    data_idx = 0

    def create_t_real_data(ref):
        nonlocal data_idx
        real = refs[ref]
        if isinstance(real, h5py.Group):
            created_group = t_refs.create_group('data_%s' % data_idx)
            data_idx += 1
            attrs = 'label top left width height'.split()
            for attr in attrs:
                reshaped = real[attr].value.reshape(-1)
                data_count = reshaped.shape[0]
                if isinstance(reshaped[0], h5py.Reference):
                    t_real_attr = created_group.create_dataset(attr, shape=(data_count, 1), dtype=ref_dtype)
                    for i in range(data_count):
                        t_real_attr[i, 0] = create_t_real_data(reshaped[i])
                else:
                    created_group.create_dataset(attr, data=real[attr].value)
                    data_idx += 1
            return created_group.ref
        else:
            t_real = t_refs.create_dataset('data_%s' % data_idx, data=real.value)
            data_idx += 1
            return t_real.ref

    def create_t_element(t_group, name, ref_group, data_count):
        reshaped = ref_group[name].value.reshape(-1)
        data_count = reshaped.shape[0] if data_count is None else data_count
        created_dataset = t_group.create_dataset(name, (data_count, 1), dtype=ref_dtype)
        for i in range(data_count):
            created_dataset[i, 0] = create_t_real_data(reshaped[i])

    create_t_element(t_ds, 'name', ds, test_data_count)
    create_t_element(t_ds, 'bbox', ds, test_data_count)
    test_f.close()
    return test_mat_metadata_file
def _main(args):
    voc_path = os.path.expanduser(args.path_to_voc)
    train_ids = get_ids(voc_path, train_set)
    val_ids = get_ids(voc_path, val_set)
    test_ids = get_ids(voc_path, test_set)
    train_ids_2007 = get_ids(voc_path, sets_from_2007)
    total_train_ids = len(train_ids) + len(train_ids_2007)

    # Create HDF5 dataset structure
    print('Creating HDF5 dataset structure.')
    fname = os.path.join(voc_path, 'pascal_voc_07_12.hdf5')
    voc_h5file = h5py.File(fname, 'w')
    uint8_dt = h5py.special_dtype(
        vlen=np.dtype('uint8'))  # variable length uint8
    vlen_int_dt = h5py.special_dtype(
        vlen=np.dtype(int))  # variable length default int
    train_group = voc_h5file.create_group('train')
    val_group = voc_h5file.create_group('val')
    test_group = voc_h5file.create_group('test')

    # store class list for reference class ids as csv fixed-length numpy string
    voc_h5file.attrs['classes'] = np.string_(str.join(',', classes))

    # store images as variable length uint8 arrays
    train_images = train_group.create_dataset(
        'images', shape=(total_train_ids, ), dtype=uint8_dt)
    val_images = val_group.create_dataset(
        'images', shape=(len(val_ids), ), dtype=uint8_dt)
    test_images = test_group.create_dataset(
        'images', shape=(len(test_ids), ), dtype=uint8_dt)

    # store boxes as class_id, xmin, ymin, xmax, ymax
    train_boxes = train_group.create_dataset(
        'boxes', shape=(total_train_ids, ), dtype=vlen_int_dt)
    val_boxes = val_group.create_dataset(
        'boxes', shape=(len(val_ids), ), dtype=vlen_int_dt)
    test_boxes = test_group.create_dataset(
        'boxes', shape=(len(test_ids), ), dtype=vlen_int_dt)

    # process all ids and add to datasets
    print('Processing Pascal VOC 2007 datasets for training set.')
    last_2007 = add_to_dataset(voc_path, '2007', train_ids_2007, train_images, train_boxes)
    print('Processing Pascal VOC 2012 training set.')
    add_to_dataset(
        voc_path, '2012', train_ids, train_images, train_boxes, start=last_2007 + 1)
    print('Processing Pascal VOC 2012 val set.')
    add_to_dataset(voc_path, '2012', val_ids, val_images, val_boxes)
    print('Processing Pascal VOC 2007 test set.')
    add_to_dataset(voc_path, '2007', test_ids, test_images, test_boxes)

    print('Closing HDF5 file.')
    voc_h5file.close()
    print('Done.')
def convert_dtype(srcdt, ctx):
    """ Return a dtype based on input dtype, converting any Reference types from
        h5py style to h5pyd and vice-versa.
    """
    msg = "convert dtype: {}, type: {}, len: {}".format(srcdt, type(srcdt), len(srcdt))
    logging.info(msg)
    if ctx["verbose"]:
        print(msg)

    if len(srcdt) > 0:
        fields = []
        for name in srcdt.fields:
            item = srcdt.fields[name]
            # item is a tuple of dtype and integer offset
            field_dt = convert_dtype(item[0], ctx)
            fields.append((name, field_dt))
        tgt_dt = np.dtype(fields)
    else:
        # check if this is a "special dtype"
        if srcdt.metadata and 'ref' in srcdt.metadata:
            ref = srcdt.metadata['ref']
            if is_reference(ref):
                if is_h5py(ctx['fout']):
                    tgt_dt = h5py.special_dtype(ref=h5py.Reference)
                else:
                    tgt_dt = h5pyd.special_dtype(ref=h5pyd.Reference)
            elif is_regionreference(ref):
                if is_h5py(ctx['fout']):
                    tgt_dt = h5py.special_dtype(ref=h5py.RegionReference)
                else:
                    tgt_dt = h5pyd.special_dtype(ref=h5pyd.RegionReference)
            else:
                msg = "Unexpected ref type: {}".format(srcdt)
                logging.error(msg)
                raise TypeError(msg)
        elif srcdt.metadata and 'vlen' in srcdt.metadata:
            src_vlen = srcdt.metadata['vlen']
            tgt_base = convert_dtype(src_vlen, ctx)
            if is_h5py(ctx['fout']):
                tgt_dt = h5py.special_dtype(vlen=tgt_base)
            else:
                tgt_dt = h5pyd.special_dtype(vlen=tgt_base)
        else:
            tgt_dt = srcdt
    return tgt_dt

#----------------------------------------------------------------------------------