We extracted the following 24 code examples from open-source Python projects to demonstrate how to use tables.openFile().
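Note before the examples: tables.openFile() is the PyTables 2.x spelling; PyTables 3.x renamed the call to tables.open_file() and kept the camelCase name only as a deprecated alias for a while (several examples below check tables.__version__ for exactly this reason). The following minimal sketch shows that version-agnostic pattern; the file name 'data.h5' is only a placeholder for illustration, not part of any example.

import tables

# Pick the opener that matches the installed PyTables major version.
# PyTables 2.x exposes tables.openFile(); 3.x renamed it to tables.open_file().
if tables.__version__.startswith('2'):
    open_hdf5 = tables.openFile
else:
    open_hdf5 = tables.open_file

# 'data.h5' is a placeholder path used only for illustration.
h5 = open_hdf5('data.h5', mode='r')
try:
    print(h5.root)   # inspect the root group of the file
finally:
    h5.close()       # always release the file handle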
def get_tfidf_model(self):
    with open(self.data_path + 'tfidf.pkl', 'rb') as pkl_file:
        vectorizer = cPickle.load(pkl_file)
    vectorizer.tokenizer = self.tokenize
    with tables.openFile(self.data_path + 'tfidf_keys.hdf', 'r') as f:
        keys = f.root.keys.read()
    with tables.openFile(self.data_path + 'tfidf_values.hdf', 'r') as f:
        values = f.root.values.read()
    vectorizer.vocabulary_ = dict(zip(keys, values))
    with tables.openFile(self.data_path + 'tfidf_stop_words.hdf', 'r') as f:
        vectorizer.stop_words_ = set(f.root.stop_words.read())
    return vectorizer
def is_mini(ltable):
    '''
    Check to see if this is an external file, and re-open 'a'
    '''
    from tables import openFile
    from re import compile
    # Das_t is always in an external file
    Das_tRE = compile("(/Experiment_g/Receivers_g/Das_g_.*)/Das_t")
    ltablepath = ltable._v_pathname
    if Das_tRE.match(ltablepath):
        ltablefile = ltable._v_file
        if ltablefile.mode != 'a':
            filename = ltablefile.filename
            ltablefile.close()
            mini = openFile(filename, 'a')
            ltable = mini.get_node(ltablepath)
            add_reference(ltablepath, ltable)

    return ltable
def write_tables():
    import tables
    dtype = np.dtype("S7,f4,f4,f4,f4,i4")
    t0 = time()
    sarray = np.fromiter(((str(i), float(i), float(2*i), None, float(4*i), i)
                          for i in xrange(N)), dtype, count=N)
    t1 = time() - t0
    print "Created sarray with %d rows in %.3fs" % (N, t1)
    t0 = time()
    h5f = tables.openFile("market.h5", "w")
    table = h5f.createTable(h5f.root, "market", dtype)
    table.append(sarray)
    h5f.close()
    t1 = time() - t0
    print "[PyTables] Stored %d rows in %.3fs" % (N, t1)
def write_tables2():
    import tables
    dtype = np.dtype("S7,f4,f4,f4,f4,i4")
    # t0 = time()
    # sarray = np.fromiter(((str(i), float(i), float(2*i), None, float(4*i), i)
    #                       for i in xrange(N)), dtype, count=N)
    # t1 = time() - t0
    # print "Created sarray with %d rows in %.3fs" % (N, t1)
    t0 = time()
    h5f = tables.openFile("market.h5", "w")
    table = h5f.createTable(h5f.root, "market", dtype)
    count = 10000
    for j in xrange(count, N, count):
        sarray = np.fromiter(((str(i), float(i), float(2*i), None, float(4*i), i)
                              for i in xrange(j)), dtype)
        table.append(sarray)
    h5f.close()
    t1 = time() - t0
    print "[PyTables] Stored %d rows in %.3fs" % (N, t1)
def merge_all_files_into_pytables(file_dir, file_out):
    """
    process each file into pytables
    """
    start = None
    start = datetime.datetime.now()
    out_h5 = tables.openFile(file_out,
                             mode="w",
                             title="bars",
                             filters=tables.Filters(complevel=9,
                                                    complib='zlib'))
    table = None
    for file_in in glob.glob(file_dir + "/*.gz"):
        gzip_file = gzip.open(file_in)
        expected_header = ["dt", "sid", "open", "high", "low", "close",
                           "volume"]
        csv_reader = csv.DictReader(gzip_file)
        header = csv_reader.fieldnames
        if header != expected_header:
            logging.warn("expected header %s\n" % (expected_header))
            logging.warn("header_found %s" % (header))
            return

        for current_date, rows in parse_csv(csv_reader):
            table = out_h5.createTable("/TD", "date_" + current_date,
                                       OHLCTableDescription,
                                       expectedrows=len(rows),
                                       createparents=True)
            table.append(rows)
            table.flush()
    if table is not None:
        table.flush()
    end = datetime.datetime.now()
    diff = (end - start).seconds
    logging.debug("finished it took %d." % (diff))
def get_length(path):
    if tables.__version__[0] == '2':
        target_table = tables.openFile(path, 'r')
        target_index = target_table.getNode('/indices')
    else:
        target_table = tables.open_file(path, 'r')
        target_index = target_table.get_node('/indices')
    return target_index.shape[0]
def synchronized_open_file(*args, **kwargs):
    if tables.__version__[0] == '2':
        tbf = tables.openFile(*args, **kwargs)
    else:
        tbf = tables.open_file(*args, **kwargs)
    return tbf
def _f_open(self, args):
    if not self._opened:
        self._filename = args["filename"]
        self._title = args["title"]
        if self._title is None or not isinstance(self._title, basestring):
            self._title = strftime("PicoTape-%Y%m%d-%H%M%S")
        self._limit = args["limit"]
        self._overwrite = args["overwrite"]
        if self._filename is not None:
            self._fhandle = None
            error = "OK"
            try:
                if not os.path.exists(os.path.dirname(self._filename)):
                    error = "Path to %s not found" % self._filename
                elif not self._overwrite and os.path.exists(self._filename):
                    error = "File %s exists" % self._filename
                else:
                    self._fhandle = tb.openFile(self._filename,
                                                title=self._title, mode="w")
            except Exception as ex:
                self._fhandle = None
                error = ex.message
            if self._fhandle is not None:
                self._opened = True
            self._readq.put(error)
        else:
            self._memstore = True
            self._opened = True
            self._readq.put("OK")
        self._stats = args["stats"] and not self._memstore
def open_h5_file_read(h5filename):
    """
    Open an existing H5 in read mode.
    Same function as in hdf5_utils, here so we avoid one import
    """
    return tables.openFile(h5filename, mode='r')
def safe_hdf(array, name):
    if os.path.isfile(name + '.hdf') and not args.overwrite:
        logger.warning("Not saving %s, already exists." % (name + '.hdf'))
    else:
        if os.path.isfile(name + '.hdf'):
            logger.info("Overwriting %s." % (name + '.hdf'))
        else:
            logger.info("Saving to %s." % (name + '.hdf'))
        with tables.openFile(name + '.hdf', 'w') as f:
            atom = tables.Atom.from_dtype(array.dtype)
            filters = tables.Filters(complib='blosc', complevel=5)
            ds = f.createCArray(f.root, name.replace('.', ''), atom,
                                array.shape, filters=filters)
            ds[:] = array
def open_h5_file_read(h5filename):
    """
    Open an existing H5 in read mode.
    """
    return tables.openFile(h5filename, mode='r')
def open_h5_file_append(h5filename):
    """
    Open an existing H5 in append mode.
    """
    return tables.openFile(h5filename, mode='a')

################################################ MAIN #####################################
def process_filelist_train(filelist=None, testsongs=None, tmpfilename=None):
    """
    Main function, process all files in the list (as long as their track_id
    is not in testsongs)
    INPUT
       filelist     - a list of song files
       testsongs    - set of track ID that we should not use
       tmpfilename  - where to save our processed features
    """
    # sanity check
    for arg in locals().values():
        assert not arg is None, 'process_filelist_train, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file', tmpfilename, 'already exists.'
        return
    # dimension fixed (12-dimensional timbre vector)
    ndim = 12
    finaldim = 90
    # create outputfile
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/", 'data', 'TMP FILE FOR ARTIST RECOGNITION')
    output.createEArray(group, 'feats', tables.Float64Atom(shape=()), (0, finaldim), '',
                        expectedrows=len(filelist))
    output.createEArray(group, 'artist_id', tables.StringAtom(18, shape=()), (0,), '',
                        expectedrows=len(filelist))
    # iterate over files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        # verbose
        if cnt_f % 50000 == 0:
            print 'training... checking file #', cnt_f
        # check what file/song is this
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)
        track_id = GETTERS.get_track_id(h5)
        if track_id in testsongs:  # just in case, but should not be necessary
            print 'Found test track_id during training? weird.', track_id
            h5.close()
            continue
        # extract features, then close file
        processed_feats = compute_features(h5)
        h5.close()
        if processed_feats is None:
            continue
        # save features to tmp file
        output.root.data.artist_id.append(np.array([artist_id]))
        output.root.data.feats.append(processed_feats)
    # we're done, close output
    output.close()
    return
def train(nthreads, maindir, output, testsongs, trainsongs=None):
    """
    Main function to do the training
    Do the main pass with the number of given threads.
    Then, reads the tmp files, creates the main output, deletes the tmpfiles.
    INPUT
      - nthreads    - number of threads to use
      - maindir     - dir of the MSD, where to find song files
      - output      - main model, contains everything to perform KNN
      - testsongs   - set of songs to ignore
      - trainsongs  - list of songs to use for training (FASTER)
    RETURN
      - nothing :)
    """
    # sanity checks
    if os.path.isfile(output):
        print 'ERROR: file', output, 'already exists.'
        return
    # initial time
    t1 = time.time()
    # do main pass
    tmpfiles = process_filelist_train_main_pass(nthreads, maindir, testsongs,
                                                trainsongs=trainsongs)
    if tmpfiles is None:
        print 'Something went wrong, tmpfiles are None'
        return
    # intermediate time
    t2 = time.time()
    stimelen = str(datetime.timedelta(seconds=t2 - t1))
    print 'Main pass done after', stimelen; sys.stdout.flush()
    # find approximate number of rows per tmpfiles
    h5 = tables.openFile(tmpfiles[0], 'r')
    nrows = h5.root.data.artist_id.shape[0] * len(tmpfiles)
    h5.close()
    # create output
    output = tables.openFile(output, mode='a')
    group = output.createGroup("/", 'data', 'KNN MODEL FILE FOR ARTIST RECOGNITION')
    output.createEArray(group, 'feats', tables.Float64Atom(shape=()), (0, 90), 'feats',
                        expectedrows=nrows)
    output.createEArray(group, 'artist_id', tables.StringAtom(18, shape=()), (0,), 'artist_id',
                        expectedrows=nrows)
    # aggregate temp files
    for tmpf in tmpfiles:
        h5 = tables.openFile(tmpf)
        output.root.data.artist_id.append(h5.root.data.artist_id[:])
        output.root.data.feats.append(h5.root.data.feats[:])
        h5.close()
        # delete tmp file
        os.remove(tmpf)
    # close output
    output.close()
    # final time
    t3 = time.time()
    stimelen = str(datetime.timedelta(seconds=t3 - t1))
    print 'Whole training done after', stimelen
    # done
    return
def process_filelist_test(filelist=None, model=None, tmpfilename=None, K=1):
    """
    Main function, process all files in the list (as long as their track_id
    is not in testsongs)
    INPUT
       filelist     - a list of song files
       model        - h5 file containing feats and artist_id for all train songs
       tmpfilename  - where to save our processed features
       K            - K-nn parameter (default=1)
    """
    # sanity check
    for arg in locals().values():
        assert not arg is None, 'process_filelist_train, missing an argument, something still None'
    if os.path.isfile(tmpfilename):
        print 'ERROR: file', tmpfilename, 'already exists.'
        return
    if not os.path.isfile(model):
        print 'ERROR: model', model, 'does not exist.'
        return
    # dimension fixed (12-dimensional timbre vector)
    ndim = 12
    finaldim = 90
    # create kdtree
    h5model = tables.openFile(model, mode='r')
    assert h5model.root.data.feats.shape[1] == finaldim, 'inconsistency in final dim'
    kd = ANN.kdtree(h5model.root.data.feats)
    # create outputfile
    output = tables.openFile(tmpfilename, mode='a')
    group = output.createGroup("/", 'data', 'TMP FILE FOR ARTIST RECOGNITION')
    output.createEArray(group, 'artist_id_real', tables.StringAtom(18, shape=()), (0,), '',
                        expectedrows=len(filelist))
    output.createEArray(group, 'artist_id_pred', tables.StringAtom(18, shape=()), (0,), '',
                        expectedrows=len(filelist))
    # iterate over files
    cnt_f = 0
    for f in filelist:
        cnt_f += 1
        # verbose
        if cnt_f % 50000 == 0:
            print 'training... checking file #', cnt_f
        # check what file/song is this
        h5 = GETTERS.open_h5_file_read(f)
        artist_id = GETTERS.get_artist_id(h5)
        track_id = GETTERS.get_track_id(h5)
        if track_id in testsongs:  # just in case, but should not be necessary
            print 'Found test track_id during training? weird.', track_id
            h5.close()
            continue
        # extract features, then close file
        processed_feats = compute_features(h5)
        h5.close()
        if processed_feats is None:
            continue
        # do prediction
        artist_id_pred = do_prediction(processed_feats, kd, h5model, K)
        # save features to tmp file
        output.root.data.artist_id_real.append(np.array([artist_id]))
        output.root.data.artist_id_pred.append(np.array([artist_id_pred]))
    # we're done, close output
    output.close()
    return
def test(nthreads, model, testsongs, K):
    """
    Main function to do the testing
    Do the main pass with the number of given threads.
    Then, reads the tmp files, creates the main output, deletes the tmpfiles.
    INPUT
      - nthreads   - number of threads to use
      - model      - h5 files containing feats and artist_id for all train songs
      - testsongs  - set of songs to ignore
      - K          - K-nn parameter
    RETURN
      - nothing :)
    """
    # initial time
    t1 = time.time()
    # do main pass
    tmpfiles = process_filelist_test_main_pass(nthreads, model, testsongs, K)
    if tmpfiles is None:
        print 'Something went wrong, tmpfiles are None'
        return
    # intermediate time
    t2 = time.time()
    stimelen = str(datetime.timedelta(seconds=t2 - t1))
    print 'Main pass done after', stimelen; sys.stdout.flush()
    # aggregate temp files
    artist_id_found = 0
    total_predictions = 0
    for tmpf in tmpfiles:
        h5 = tables.openFile(tmpf)
        for k in range(h5.root.data.artist_id_real.shape[0]):
            total_predictions += 1
            if h5.root.data.artist_id_real[k] == h5.root.data.artist_id_pred[k]:
                artist_id_found += 1
        h5.close()
        # delete tmp file
        os.remove(tmpf)
    # final time
    t3 = time.time()
    stimelen = str(datetime.timedelta(seconds=t3 - t1))
    print 'Whole testing done after', stimelen
    # results
    print 'We found the right artist_id', artist_id_found, 'times out of', total_predictions, 'predictions.'
    print 'e.g., accuracy is:', artist_id_found * 1. / total_predictions
    # done
    return
def test(nthreads, model, testsongs, npicks, winsize, finaldim, K, typecompress):
    """
    Main function to do the testing
    Do the main pass with the number of given threads.
    Then, reads the tmp files, computes the score, deletes the tmpfiles.
    INPUT
      - nthreads     - number of threads to use
      - model        - h5 files containing feats and year for all train songs
      - testsongs    - songs to test on
      - npicks       - number of samples to pick per song
      - winsize      - window size (in beats) of a sample
      - finaldim     - final dimension of the sample, something like 5?
      - K            - K-nn parameter
      - typecompress - feature type, one of: 'picks', 'corrcoeff', 'cov'
    RETURN
      - nothing
    """
    # initial time
    t1 = time.time()
    # do main pass
    tmpfiles = process_filelist_test_main_pass(nthreads, model, testsongs,
                                               npicks, winsize, finaldim, K,
                                               typecompress)
    if tmpfiles is None:
        print 'Something went wrong, tmpfiles are None'
        return
    # intermediate time
    t2 = time.time()
    stimelen = str(datetime.timedelta(seconds=t2 - t1))
    print 'Main pass done after', stimelen; sys.stdout.flush()
    # aggregate temp files
    year_real = []
    year_pred = []
    for tmpf in tmpfiles:
        h5 = tables.openFile(tmpf)
        year_real.extend(h5.root.data.year_real[:])
        year_pred.extend(h5.root.data.year_pred[:])
        h5.close()
        # delete tmp file
        os.remove(tmpf)
    # result
    BENCHMARK.evaluate(year_real, year_pred, verbose=1)
    # final time
    t3 = time.time()
    stimelen = str(datetime.timedelta(seconds=t3 - t1))
    print 'Whole testing done after', stimelen
    # done
    return
def create_aggregate_file(h5filename, title='H5 Aggregate File', force=False,
                          expectedrows=1000, complevel=1, summaryfile=False):
    """
    Create a new HDF5 file for all songs.
    It will contain everything that is in regular song files.
    Tables are created empty.
    If force=False, refuse to overwrite an existing file
    and raise a ValueError in that case.
    If summaryfile=True, creates a summary file, i.e. no arrays.
    Other optional param is the H5 file.
    DETAILS
    - if you create a very large file, try to approximate correctly
      the number of data points (songs); it speeds things up with arrays
      (by setting the chunking correctly).
    - we set the compression level to 1 by default, which uses the ZLIB
      library; to disable compression, set it to 0.
    Sets up the groups, each containing a table 'songs' with one row:
    - metadata
    - analysis
    """
    # check if file exists
    if not force:
        if os.path.exists(h5filename):
            raise ValueError('file exists, can not create HDF5 song file')
    # summary file? change title
    if summaryfile:
        title = 'H5 Summary File'
    # create the H5 file
    h5 = tables.openFile(h5filename, mode='w', title='H5 Song File')
    # set filter level
    h5.filters = tables.Filters(complevel=complevel, complib='zlib')
    # setup the groups and tables
    # group metadata
    group = h5.createGroup("/", 'metadata', 'metadata about the song')
    table = h5.createTable(group, 'songs', DESC.SongMetaData, 'table of metadata for one song',
                           expectedrows=expectedrows)
    # group analysis
    group = h5.createGroup("/", 'analysis', 'Echo Nest analysis of the song')
    table = h5.createTable(group, 'songs', DESC.SongAnalysis, 'table of Echo Nest analysis for one song',
                           expectedrows=expectedrows)
    # group musicbrainz
    group = h5.createGroup("/", 'musicbrainz', 'data about the song coming from MusicBrainz')
    table = h5.createTable(group, 'songs', DESC.SongMusicBrainz, 'table of data coming from MusicBrainz',
                           expectedrows=expectedrows)
    # create arrays
    if not summaryfile:
        create_all_arrays(h5, expectedrows=expectedrows)
    # close it, done
    h5.close()