We extracted the following 50 code examples from open-source Python projects to illustrate how to use dill.dump().
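Before the extracted examples, here is a minimal sketch of the pattern they all share: dill.dump() serializes an object to a file handle opened in binary mode, and dill.load() reads it back. The filename and the sample object below are illustrative placeholders, not taken from any of the projects.

import dill

# dill extends pickle, so it can also serialize objects that plain pickle
# rejects, such as lambdas and locally defined functions.
state = {"step": 42, "transform": lambda x: x ** 2}

# Serialize: the target file must be opened in binary write mode ('wb').
with open("checkpoint.dill", "wb") as f:
    dill.dump(state, f)

# Deserialize later, possibly in another process.
with open("checkpoint.dill", "rb") as f:
    restored = dill.load(f)

print(restored["transform"](7))   # -> 49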
def saga_cv_cache(*args):
    arghash = sha1(repr(args).encode('utf-8')).hexdigest()
    fn = "res/baseline_linear_{}.dill".format(arghash)

    try:
        with open(fn, 'rb') as f:
            out = dill.load(f)
            logging.info("Loaded cached version.")
    except FileNotFoundError:
        logging.info("Computing...")
        out = saga_cv(*args)
        with open(fn, 'wb') as f:
            dill.dump(out, f)

    return out
def save(self, path):
    """Save model to a pickle located at `path`"""
    with tempfile.TemporaryDirectory() as td:
        U.save_state(os.path.join(td, "model"))
        arc_name = os.path.join(td, "packed.zip")
        with zipfile.ZipFile(arc_name, 'w') as zipf:
            for root, dirs, files in os.walk(td):
                for fname in files:
                    file_path = os.path.join(root, fname)
                    if file_path != arc_name:
                        zipf.write(file_path, os.path.relpath(file_path, td))
        with open(arc_name, "rb") as f:
            model_data = f.read()
    with open(path, "wb") as f:
        dill.dump((model_data), f)
def test_chain_dump_obj():
    walker1 = SimpleNamespace(_dump_obj=lambda *_: "walker1")
    walker2 = SimpleNamespace(_dump_obj=lambda *_: "walker2")
    tmp_file = br.TempFile()
    tmp_file.write("outfile results")
    chain = SimpleNamespace(walkers=[walker1, walker2], outfile=tmp_file.path, cold_heat=0.1, hot_heat=0.2,
                            step_counter=20, best_score_ever_seen=100, _dump_obj=mcmcmc._Chain._dump_obj)
    dump = chain._dump_obj(chain)
    assert dump["walkers"] == ["walker1", "walker2"]
    assert dump["cold_heat"] == 0.1
    assert dump["hot_heat"] == 0.2
    assert dump["step_count"] == 20
    assert dump["best_score"] == 100
    assert dump["results"] == "outfile results"
def test_chain_apply_dump(capsys):
    walker1 = SimpleNamespace(_apply_dump=lambda *_: print("Applying dump to walker1"))
    walker2 = SimpleNamespace(_apply_dump=lambda *_: print("Applying dump to walker2"))
    tmp_file = br.TempFile()
    chain = SimpleNamespace(walkers=[walker1, walker2], outfile=tmp_file.path, cold_heat=None, hot_heat=None,
                            step_counter=None, best_score_ever_seen=None, _apply_dump=mcmcmc._Chain._apply_dump)
    var_dict = {"walkers": [None, None], "cold_heat": 0.1, "hot_heat": 0.2, "step_count": 20, "best_score": 100,
                "results": "Some results"}
    chain._apply_dump(chain, var_dict)
    assert chain.walkers == [walker1, walker2]
    out, err = capsys.readouterr()
    assert out == "Applying dump to walker1\nApplying dump to walker2\n"
    assert chain.cold_heat == 0.1
    assert chain.hot_heat == 0.2
    assert chain.step_counter == 20
    assert chain.best_score_ever_seen == 100
    assert tmp_file.read() == "Some results"
def test_mcmcmc_resume(capsys):
    mc_obj = SimpleNamespace(dumpfile="does_not_exist", resume=mcmcmc.MCMCMC.resume)
    assert mc_obj.resume(mc_obj) is False

    tmp_file = br.TempFile(byte_mode=True)
    dill.dump(["a", "b", "c"], tmp_file)
    mc_obj.dumpfile = tmp_file.path
    chain1 = SimpleNamespace(_apply_dump=lambda *_: print("applying chain1"))
    chain2 = SimpleNamespace(_apply_dump=lambda *_: print("applying chain2"))
    chain3 = SimpleNamespace(_apply_dump=lambda *_: print("applying chain3"))
    mc_obj.chains = [chain1, chain2, chain3]
    mc_obj.run = lambda *_: print("Running")
    assert mc_obj.resume(mc_obj) is True
    out, err = capsys.readouterr()
    assert out == "applying chain1\napplying chain2\napplying chain3\nRunning\n", print(out)
def run_preprocessing(runner, outdir):
    t = runner.preprocess()
    print '\nFINISHED preprocessing. Output directory:'
    print " ", os.path.abspath(outdir)

    resultjson = {}
    for field in t.outputfields:
        if field == 'pdbstring':
            with open(os.path.join(outdir, 'prep.pdb'), 'w') as outfile:
                print >> outfile, t.getoutput('pdbstring')
        else:
            resultjson[field] = t.getoutput(field)

    with open(os.path.join(outdir, 'prep.json'), 'w') as outfile:
        json.dump(resultjson, outfile)

    with open(os.path.join(outdir, 'workflow_state.dill'), 'w') as outfile:
        dill.dump(runner, outfile)
def dill_words(num_words, fname="words.dill"):
    fname = os.path.join(os.path.dirname(os.path.realpath(__file__)), fname)
    try:
        if os.path.isfile(fname):
            words = dill.load(open(fname, "rb"))
            if(len(words) < ip_handling.iutils.get_ipv6_word_possibilities()):
                os.remove(fname)
                raise Exception  # go into except block to reload words
            return words
        else:
            words = load_words(num_words)
            if(len(words) < ip_handling.iutils.get_ipv6_word_possibilities()):
                raise Exception  # go into except block to reload words
            dill.dump(words, open(fname, "wb"))
            return words
    except:
        try:
            words = load_words(num_words)
            if(len(words) < ip_handling.iutils.get_ipv6_word_possibilities()):
                raise Exception  # go into except block to reload words
            dill.dump(words, open(fname, "wb"))
            return words
        except:
            return load_words(num_words)
def save(network, sess, filename=None):
    """Save the variables contained by a network to disk."""
    to_save = collections.defaultdict(dict)
    variables = snt.get_variables_in_module(network)

    for v in variables:
        split = v.name.split(":")[0].split("/")
        module_name = split[-2]
        variable_name = split[-1]
        to_save[module_name][variable_name] = v.eval(sess)

    if filename:
        with open(filename, "wb") as f:
            pickle.dump(to_save, f)

    return to_save
def main(prepare, use, do, get, params, debug):
    if get is not None:
        do = get
    if prepare is not None and use in ['ht', 'yjb', 'yh', 'gf', 'xq']:
        user = easytrader.use(use, debug)
        user.prepare(prepare)
        with open(ACCOUNT_OBJECT_FILE, 'wb') as f:
            dill.dump(user, f)
    if do is not None:
        with open(ACCOUNT_OBJECT_FILE, 'rb') as f:
            user = dill.load(f)
        if len(params) > 0:
            result = getattr(user, do)(*params)
        else:
            result = getattr(user, do)
        json_result = json.dumps(result, indent=4, ensure_ascii=False, sort_keys=True)
        click.echo(json_result)
def persist(self, X, y, thesaurus):
    """
    Save the data and the processed thesaurus.

    Parameters
    ----------
    X: sparse matrix
        The train data: Will be compressed.
    y: sparse matrix
        The label data: Will be compressed.
    thesaurus: ThesaurusReader
        ThesaurusReader object: Will be pickled.
    """
    print('Persisting features to disk')
    self._delete_old_files()
    self._save(self._persist_name('X'), X)
    self._save(self._persist_name('y'), y)
    with open(self._persist_name('TR'), mode='wb') as f:
        pickle.dump(thesaurus, f)
def save(network, sess, filename=None):
    """Save the variables contained by a network to disk."""
    to_save = collections.defaultdict(dict)
    variables = nn.get_variables_in_module(network)

    for v in variables:
        split = v.name.split(":")[0].split("/")
        module_name = split[-2]
        variable_name = split[-1]
        to_save[module_name][variable_name] = v.eval(sess)

    if filename:
        with open(filename, "wb") as f:
            pickle.dump(to_save, f)

    return to_save
def restore(file_name="dump.bin"):
    return pickle.load(open(file_name, 'rb'))

# class Encoding:
#     pass

# @extension
# class Math: # WOOOT? just
# import math as Math
#     def __getattr__(self, attr):
#         import sys
#         import math
#         # ruby method_missing !!!
#         import inspect
#         for name, obj in inspect.getmembers(sys.modules['math']):
#             if name==attr: return obj
#         return False
def save_vector_cache(vectors, vector_out_file, filetype='', **kwargs):
    logging.info("Saving {} vectors to cache {}".format(len(vectors), vector_out_file))

    if (vector_out_file.endswith('.dill') or filetype == 'dill'):
        with open(vector_out_file, 'wb') as data_file:
            dill.dump(vectors, data_file, protocol=kwargs.get('dill_protocol', 3))
    elif (vector_out_file.endswith('.joblib') or filetype == 'joblib'):
        joblib.dump(vectors, vector_out_file, compress=kwargs.get('joblib_compression', 3),
                    protocol=kwargs.get('joblib_protocol', 3))
    elif (vector_out_file.endswith('.sqlite') or filetype == 'sqlite'):
        autocommit = kwargs.pop('autocommit', True)
        if (isinstance(vectors, SqliteDict)):
            vectors.commit()
        else:
            with SqliteDict(vector_out_file, autocommit=autocommit) as data_file:
                for key, value in vectors.items():
                    data_file[key] = value
                if (not autocommit):
                    data_file.commit()
    else:
        raise NotImplementedError
def save(self, path):
    """Save model to a pickle located at `path`"""
    with tempfile.TemporaryDirectory() as td:
        U.save_state(os.path.join(td, "model"))
        arc_name = os.path.join(td, "packed.zip")
        with zipfile.ZipFile(arc_name, 'w') as zipf:
            for root, dirs, files in os.walk(td):
                for fname in files:
                    file_path = os.path.join(root, fname)
                    if file_path != arc_name:
                        zipf.write(file_path, os.path.relpath(file_path, td))
        with open(arc_name, "rb") as f:
            model_data = f.read()
    with open(path, "wb") as f:
        dill.dump((model_data, self._act_params), f)
def __MergeData(cls, InputDir, OutputDir, mode):
    """"""
    if(mode == 'train'):
        ActionDataFile = '%s/train_2016_v2.csv' % InputDir
        OutputFile = '%s/train.pkl' % OutputDir
    else:
        ActionDataFile = '%s/sample_submission.csv' % InputDir
        OutputFile = '%s/test.pkl' % OutputDir
    print(OutputFile)
    PropertyDataFile = '%s/properties_2016.csv' % InputDir

    ## load
    ActionData = pd.read_csv(ActionDataFile, parse_dates=['transactiondate'])
    PropertyData = pd.read_csv(PropertyDataFile)

    ## left join
    MergedData = ActionData.merge(PropertyData, how='left', on='parcelid')

    ## output into pkl file
    if (os.path.exists(OutputDir) == False):
        os.makedirs(OutputDir)
    with open(OutputFile, 'wb') as o_file:
        pickle.dump(MergedData, o_file, -1)
    o_file.close()

    return

## split rawcensustractandblock into census, tract and block
def __ParseCTB(cls, InputDir, OutputDir, mode):
    """"""
    if(mode == 'train'):
        InputFile = '%s/train.pkl' % InputDir
        OutputFile = '%s/train.pkl' % OutputDir
    else:
        InputFile = '%s/test.pkl' % InputDir
        OutputFile = '%s/test.pkl' % OutputDir

    ## load
    with open(InputFile, 'rb') as i_file:
        df_data = pickle.load(i_file)
    i_file.close()

    ## extract census, tract and block identifies
    df_data['rawcensustractandblock'] = (df_data['rawcensustractandblock'] * 1000).astype(np.float64).astype(np.int64)
    df_data['fipsid'] = ((df_data['rawcensustractandblock'] / 10000000).astype(np.int64)).astype(str)
    df_data['tractandblock'] = df_data['rawcensustractandblock'] % 10000000
    df_data['tractid'] = ((df_data['tractandblock'] / 10).astype(np.int64)).astype(str)
    df_data['blockid'] = ((df_data['tractandblock'] % 10).astype(np.int64)).astype(str)
    df_data.drop(['fips', 'rawcensustractandblock', 'tractandblock'], axis=1, inplace=True)

    ## output into pkl file
    if (os.path.exists(OutputDir) == False):
        os.makedirs(OutputDir)
    with open(OutputFile, 'wb') as o_file:
        pickle.dump(df_data, o_file, -1)
    o_file.close()

    return
def __SplitData(cls, InputDir, OutputDir, mode):
    """"""
    if(mode == 'train'):
        InputFileData = '%s/train.pkl' % InputDir
    else:
        InputFileData = '%s/test.pkl' % InputDir
    InputFileFeatMap = '%s/featmap.pkl' % InputDir

    ## load
    with open(InputFileData, 'rb') as i_file:
        df_data = pickle.load(i_file)
    i_file.close()
    with open(InputFileFeatMap, 'rb') as i_file:
        d_feat = pickle.load(i_file)
    i_file.close()

    if (os.path.exists(OutputDir) == False):
        os.makedirs(OutputDir)
    with open('%s/featmap.pkl' % OutputDir, 'wb') as o_file:
        pickle.dump(d_feat, o_file, -1)
    o_file.close()

    ## output into individual pkl files
    for i in range(12):
        month = i + 1
        df_MonthData = df_data[(df_data['transactiondate'].dt.month == month)]
        with open('%s/%s.pkl' % (OutputDir, month), 'wb') as o_file:
            pickle.dump(df_MonthData, o_file, -1)
        o_file.close()

    return

## launch single task
def run(self, tasks, MonthsOfTest):
    """"""
    print('\nLoad data ...')
    start = time.time()

    ## load train
    with open('%s/1.pkl' % self._InputDir, 'rb') as i_file:
        self.TrainData = pickle.load(i_file)
    i_file.close()
    for i in range(2, MonthsOfTest[0]):
        with open('%s/%s.pkl' % (self._InputDir, i), 'rb') as i_file:
            df_tmp = pickle.load(i_file)
            self.TrainData = pd.concat([self.TrainData, df_tmp], ignore_index=True)
        i_file.close()

    ## load test
    with open('%s/%s.pkl' % (self._InputDir, MonthsOfTest[0]), 'rb') as i_file:
        self.TestData = pickle.load(i_file)
    i_file.close()
    for i in MonthsOfTest[1:]:
        with open('%s/%s.pkl' % (self._InputDir, i), 'rb') as i_file:
            df_tmp = pickle.load(i_file)
            self.TestData = pd.concat([self.TestData, df_tmp], ignore_index=True)
        i_file.close()

    end = time.time()
    print('Load data done, time consumed %ds ...' % (end - start))

    ## tasks for l2 test
    print('\nLaunch task ...')
    start = time.time()
    for task in tasks:
        self.__LaunchTask(task, MonthsOfTest)
    end = time.time()

    if (os.path.exists(self._OutputDir) == False):
        os.makedirs(self._OutputDir)
    with open('%s/train.pkl' % self._OutputDir, 'wb') as o_file:
        pickle.dump(self.TrainData, o_file, -1)
    o_file.close()
    with open('%s/test.pkl' % self._OutputDir, 'wb') as o_file:
        pickle.dump(self.TestData, o_file, -1)
    o_file.close()

    print('All tasks done, time consumed %ds ...' % (end - start))
def train(self):
    """"""
    print('size before truncated outliers is %d ' % len(self.TrainData))
    self.TrainData = self.TrainData[(self.TrainData['logerror'] > self._low) & (self.TrainData['logerror'] < self._up)]
    print('size after truncated outliers is %d ' % len(self.TrainData))

    X = self.TrainData.drop(self._l_drop_cols, axis=1)
    Y = self.TrainData['logerror']
    self._l_train_columns = X.columns
    FeatCols = list(self._l_train_columns)

    etr = ExtraTreesRegressor(
        n_estimators=self._iter,
        criterion='mse',
        max_features=int(math.sqrt(len(FeatCols))),
        max_depth=self._depth,
        n_jobs=2,
        random_state=2017,
        verbose=True
    )

    self._model = etr.fit(X, Y)

    ## evaluate on valid data
    self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,
                                                        datetime.now().strftime('%Y%m%d-%H:%M:%S'))
    with open(self._f_eval_train_model, 'wb') as o_file:
        pickle.dump(self._model, o_file, -1)
    o_file.close()

    self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]],
                               ignore_index=True)  ## ignore_index will reset the index or index will be overlaped

    return
def train(self):
    """"""
    start = time.time()

    print('size before truncated outliers is %d ' % len(self.TrainData))
    TrainData = self.TrainData[(self.TrainData['logerror'] > self._low) & (self.TrainData['logerror'] < self._up)]
    print('size after truncated outliers is %d ' % len(self.TrainData))

    TrainData['longitude'] -= -118600000
    TrainData['latitude'] -= 34220000

    #extra_tr = pd.read_hdf(path_or_buf='%s/p21/eval_train.hdf' % self.InputDir, key='train')
    #self.TrainData = pd.concat([self.TrainData, extra_tr.drop('parcelid', axis= 1)], axis = 1)

    X = self.TrainData.drop(self._l_drop_cols, axis=1)
    Y = self.TrainData['logerror']
    self._l_train_columns = X.columns
    X = X.values.astype(np.float32, copy=False)

    lr = LassoLars(alpha=self._lr_alpha, max_iter=self._lr_iter, verbose=True)
    self._model = lr.fit(X, Y)
    end = time.time()

    print('Training iterates %d, time consumed %d ' % (self._model.n_iter_, (end - start)))

    self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,
                                                        datetime.now().strftime('%Y%m%d-%H:%M:%S'))
    #with open(self._f_eval_train_model, 'wb') as o_file:
    #    pickle.dump(self._model, o_file, -1)
    #o_file.close()

    #self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]],
    #                           ignore_index=True)  ## ignore_index will reset the index or index will be overlaped

    return
def train(self):
    """"""
    start = time.time()

    extra_tr = pd.read_hdf(path_or_buf='%s/p21/eval_train.hdf' % self.InputDir, key='train')

    print('size before truncated outliers is %d ' % len(self.TrainData))
    self.TrainData = self.TrainData[(self.TrainData['logerror'] > self._low) & (self.TrainData['logerror'] < self._up)]
    #self.TrainData = self.TrainData.join(extra_tr, on='parcelid', how= 'left')
    self.TrainData = pd.concat([self.TrainData, extra_tr.drop('parcelid', axis=1)], axis=1)
    print('size after truncated outliers is %d ' % len(self.TrainData))

    X = self.TrainData.drop(self._l_drop_cols, axis=1)
    Y = self.TrainData['logerror']
    self._l_train_columns = X.columns
    X = X.values.astype(np.float32, copy=False)

    lr = Lasso(alpha=self._lr_alpha, max_iter=self._lr_iter, tol=1e-4, random_state=2017, selection=self._lr_sel)
    self._model = lr.fit(X, Y)
    end = time.time()

    print('Training iterates %d, time consumed %d ' % (self._model.n_iter_, (end - start)))

    self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,
                                                        datetime.now().strftime('%Y%m%d-%H:%M:%S'))
    with open(self._f_eval_train_model, 'wb') as o_file:
        pickle.dump(self._model, o_file, -1)
    o_file.close()

    #self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]],
    #                           ignore_index=True)  ## ignore_index will reset the index or index will be overlaped

    return
def train(self):
    """"""
    start = time.time()

    print('size before truncated outliers is %d ' % len(self.TrainData))
    TrainData = self.TrainData[(self.TrainData['logerror'] > self._low) & (self.TrainData['logerror'] < self._up)]
    print('size after truncated outliers is %d ' % len(TrainData))

    X = TrainData.drop(self._l_drop_cols, axis=1)
    Y = TrainData['logerror']
    self._l_train_columns = X.columns
    X = X.values.astype(np.float32, copy=False)

    rr = Ridge(alpha=self._alpha, max_iter=self._iter, solver='svd')
    self._model = rr.fit(X, Y)
    end = time.time()

    print('time consumed %d ' % ((end - start)))

    self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,
                                                        datetime.now().strftime('%Y%m%d-%H:%M:%S'))
    with open(self._f_eval_train_model, 'wb') as o_file:
        pickle.dump(self._model, o_file, -1)
    o_file.close()

    self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]],
                               ignore_index=True)  ## ignore_index will reset the index or index will be overlaped

    return
def train(self):
    """"""
    print('size before truncated outliers is %d ' % len(self.TrainData))
    #TrainData = self.TrainData[(self.TrainData['logerror'] > -0.4) & (self.TrainData['logerror'] < 0.418)]
    TrainData = self.TrainData
    print('size after truncated outliers is %d ' % len(TrainData))

    print('train data size %d' % len(TrainData))
    #self.__ExtraEncode()

    X = TrainData.drop(self._l_drop_cols, axis=1)
    Y = TrainData['logerror']
    l_train_columns = X.columns

    cols = []
    for col in l_train_columns:
        for cc in self._l_cate_cols:
            if (col.startswith('%s_' % cc)):
                cols.append(col)
                break

    tmp_cols = set(cols)
    if(len(tmp_cols) != len(cols)):
        print('!!!! cols duplicated .')
    self._l_train_columns = list(tmp_cols)
    X = scipy.sparse.csr_matrix(X[self._l_train_columns])

    self._model = als.FMRegression(n_iter=self._iter, init_stdev=0.1, rank=self._rank,
                                   l2_reg_w=self._reg_w, l2_reg_V=self._reg_v)
    self._model.fit(X, Y)

    print('training done.')

    self._f_eval_train_model = '{0}/{1}_{2}.pkl'.format(self.OutputDir, self.__class__.__name__,
                                                        datetime.now().strftime('%Y%m%d-%H:%M:%S'))
    with open(self._f_eval_train_model, 'wb') as o_file:
        pickle.dump(self._model, o_file, -1)
    o_file.close()

    self.TrainData = pd.concat([self.TrainData, self.ValidData[self.TrainData.columns]],
                               ignore_index=True)  ## ignore_index will reset the index or index will be overlaped

    return
def wrap(self, func):
    def wrapped_func(*args):
        gpu_hash, gpu_ip, ws_port = self.connector.contact_server()
        if (gpu_hash is None or gpu_ip is None or ws_port is None):
            return

        source = get_source(func)
        params = get_local_vars(source, 4)

        uploads = {}
        uploads['function'] = func
        uploads['variables'] = args
        uploads['env'] = params
        with open('uploads.pkl', 'wb') as file:
            dill.dump(uploads, file)

        self.connector.upload_params_decorator(gpu_ip, gpu_hash)

        outUrl = self.connector.stream_output(gpu_ip, gpu_hash, ws_port)
        if outUrl is None:
            color_print('computation failed')
            return

        result = self.connector.get_return_object(outUrl)
        return result

    return wrapped_func
def run_in_cloud(cell, connector, namespace):
    local_vars = get_local_vars(cell, namespace)
    imports, unused_vars = find_required_imports(cell, local_vars)
    for var in unused_vars:
        del local_vars[var]

    uploads = {}
    uploads['cell'] = cell
    uploads['env'] = local_vars
    uploads['imports'] = imports
    with open('uploads.pkl', 'wb') as file:
        dill.dump(uploads, file)

    server_info = connector.contact_server()
    if (server_info is None):
        return
    gpu_hash, gpu_ip, ws_port = server_info

    connector.upload_params_magic(gpu_ip, gpu_hash)

    outUrl = connector.stream_output(gpu_ip, gpu_hash, ws_port)
    if outUrl is None:
        return

    result = connector.get_return_object(outUrl)
    return result
def Save(self, name_file):
    """
    name_file: name of the file without extension. The .bms extension is added by the function.
    """
    with open(name_file + '.bms', 'wb') as file:
        model = dill.dump(self, file)
def save_session(fname=None, session=None, pickleProto=4):
    import dill as pickle
    if fname is None:
        fname = conf.session
        if not fname:
            conf.session = fname = utils.get_temp_file(keep=True)
            log_interactive.info("Use [%s] as session file" % fname)

    if session is None:
        session = builtins.__dict__["scapy_session"]

    to_be_saved = session.copy()
    for k in list(to_be_saved.keys()):
        if k in ["__builtins__", "In", "Out", "conf"] or k.startswith("_") or \
                (hasattr(to_be_saved[k], "__module__") and str(to_be_saved[k].__module__).startswith('IPython')):
            del(to_be_saved[k])
            continue
        if type(to_be_saved[k]) in [type, types.ModuleType, types.MethodType]:
            log_interactive.info("[%s] (%s) can't be saved." % (k, type(to_be_saved[k])))
            del(to_be_saved[k])

    try:
        os.rename(fname, fname + ".bak")
    except OSError:
        pass

    f = gzip.open(fname, "wb")
    for i in to_be_saved.keys():
        #d = {i: to_be_saved[i]}
        #pickle.dump(d, f, pickleProto)
        pickle.dump(to_be_saved, f, pickleProto)
    f.close()
def save_object(fname, obj):
    import dill as pickle
    pickle.dump(obj, gzip.open(fname, "wb"))
def saga_score_struct_cache(*args):
    arghash = sha1(repr(("score_struct",) + args).encode('utf-8')).hexdigest()
    fn = "res/baseline_linear_{}.dill".format(arghash)

    try:
        with open(fn, 'rb') as f:
            out = dill.load(f)
            logging.info("Loaded cached version.")
    except FileNotFoundError:
        logging.info("Computing...")
        out = saga_score_struct(*args)
        with open(fn, 'wb') as f:
            dill.dump(out, f)

    return out
def saga_decision_function(dataset, k, link_alpha, prop_alpha, l1_ratio):
    fn = cache_fname("linear_val_df", (dataset, k, link_alpha, prop_alpha, l1_ratio))
    if os.path.exists(fn):
        logging.info("Loading {}".format(fn))
        with open(fn, "rb") as f:
            return dill.load(f)

    ds = 'erule' if dataset == 'cdcp' else 'ukp-essays'  # sorry
    path = os.path.join("data", "process", ds, "folds", "{}", "{}")

    # sorry again: get val docs
    n_folds = 5 if dataset == 'ukp' else 3
    load, ids = get_dataset_loader(dataset, "train")
    for k_, (_, val) in enumerate(KFold(n_folds).split(ids)):
        if k_ == k:
            break
    val_docs = list(load(ids[val]))

    X_tr_link, y_tr_link = load_csr(path.format(k, 'train.npz'), return_y=True)
    X_te_link, y_te_link = load_csr(path.format(k, 'val.npz'), return_y=True)

    X_tr_prop, y_tr_prop = load_csr(path.format(k, 'prop-train.npz'), return_y=True)
    X_te_prop, y_te_prop = load_csr(path.format(k, 'prop-val.npz'), return_y=True)

    baseline = BaselineStruct(link_alpha, prop_alpha, l1_ratio)
    baseline.fit(X_tr_link, y_tr_link, X_tr_prop, y_tr_prop)

    Y_marg = baseline.decision_function(X_te_link, X_te_prop, val_docs)

    with open(fn, "wb") as f:
        logging.info("Saving {}".format(fn))
        dill.dump((Y_marg, baseline), f)

    return Y_marg, baseline
def linear_cv_score(dataset, alpha, l1_ratio, constraints):
    fn = cache_fname("linear_cv_score", (dataset, alpha, l1_ratio, constraints))
    if os.path.exists(fn):
        logging.info("Loading {}".format(fn))
        with open(fn, "rb") as f:
            return dill.load(f)

    load, ids = get_dataset_loader(dataset, split="train")
    n_folds = 5 if dataset == 'ukp' else 3

    scores = []
    for k, (tr, val) in enumerate(KFold(n_folds).split(ids)):
        Y_marg, bl = saga_decision_function(dataset, k, alpha, alpha, l1_ratio)

        val_docs = list(load(ids[val]))
        Y_true = [doc.label for doc in val_docs]
        Y_pred = bl.fast_decode(Y_marg, val_docs, constraints)

        scores.append(bl._score(Y_true, Y_pred))

    with open(fn, "wb") as f:
        logging.info("Saving {}".format(fn))
        dill.dump(scores, f)

    return scores
def write_dill(self, file_):
    """
    Serialize a computation to a file or file-like object

    :param file_: If string, writes to a file
    :type file_: File-like object, or string
    """
    node_serialize = nx.get_node_attributes(self.dag, _AN_TAG)
    if all(serialize for name, serialize in six.iteritems(node_serialize)):
        obj = self
    else:
        obj = self.copy()
        for name, tags in six.iteritems(node_serialize):
            if _T_SERIALIZE not in tags:
                obj._set_uninitialized(name)
    if isinstance(file_, six.string_types):
        with open(file_, 'wb') as f:
            dill.dump(obj, f)
    else:
        dill.dump(obj, file_)
def save(self, folder):
    """Save object and return corresponding files."""
    if not os.path.exists(folder):
        os.makedirs(folder)
    files = []

    # annoy objects can't be pickled, so save these separately
    for k, v in self._annoy_objects.items():
        annoy_filepath = os.path.join(folder, '{}.ann'.format(k))
        v._ann_obj.save(annoy_filepath)
        files.append(annoy_filepath)

    pickle_filepath = os.path.join(folder, 'object.pickle')
    with open(pickle_filepath, 'wb') as handle:
        dill.dump(self, handle)
    files.append(pickle_filepath)

    # write entity types
    enttypes = self.get_entity_types()
    info_file = os.path.join(folder, 'entity_info.json')
    with open(info_file, 'w') as handle:
        json.dump(enttypes, handle)
    files.append(info_file)

    return files
def _write_args(self, input_filename):
    # serialize args to file
    if self._pass_op_args():
        with open(input_filename, 'wb') as f:
            arg_dict = ({'args': self.op_args, 'kwargs': self.op_kwargs})
            if self.use_dill:
                dill.dump(arg_dict, f)
            else:
                pickle.dump(arg_dict, f)
def _generate_python_code(self):
    if self.use_dill:
        pickling_library = 'dill'
    else:
        pickling_library = 'pickle'
    fn = self.python_callable

    # dont try to read pickle if we didnt pass anything
    if self._pass_op_args():
        load_args_line = 'with open(sys.argv[1], "rb") as f: arg_dict = {}.load(f)'.format(pickling_library)
    else:
        load_args_line = 'arg_dict = {"args": [], "kwargs": {}}'

    # no indents in original code so we can accept any type of indents in the original function
    # we deserialize args, call function, serialize result if necessary
    return dedent("""\
        import {pickling_library}
        import sys
        {load_args_code}
        args = arg_dict["args"]
        kwargs = arg_dict["kwargs"]
        with open(sys.argv[3], 'r') as f:
            virtualenv_string_args = list(map(lambda x: x.strip(), list(f)))
        {python_callable_lines}
        res = {python_callable_name}(*args, **kwargs)
        with open(sys.argv[2], 'wb') as f:
            res is not None and {pickling_library}.dump(res, f)
        """).format(
        load_args_code=load_args_line,
        python_callable_lines=dedent(inspect.getsource(fn)),
        python_callable_name=fn.__name__,
        pickling_library=pickling_library)

    self.log.info("Done.")
def save(self, path):
    """ Save the model.

    Parameters
    ----------
    path : str
        a full path to a file where a model will be saved to
    """
    if self.estimator is not None:
        # dump expects a writable file object, so open the target path in binary mode
        with open(path, 'wb') as f:
            pickle.dump(self.estimator, f)
    else:
        raise ValueError("Scikit-learn estimator does not exist. Check your config for 'estimator'.")
def main():
    '''
    Beginning on START_DATE, step forward hourly, training on last hour's
    NLDAS FORA dataset with transformers in a 2-layer hierarchical ensemble,
    training on the last hour of data and making out-of-training-sample
    predictions for the current hour. Makes a dill dump file for each hour
    run. Runs for NSTEPS hour steps.
    '''
    date = START_DATE
    add_hour = datetime.timedelta(hours=1)
    get_file_name = lambda date: date.isoformat().replace(':', '_').replace('-', '_') + '.dill'
    scalers = zip(('MinMaxScaler', 'RobustScaler', 'StandardScaler', 'None'),
                  (minmax, robust, standard, None))
    estimators = zip(('LinearRegression', ), (linear, ))
    init_func = partial(ensemble_init_func,
                        pca=pca,
                        scalers=scalers,
                        n_components=n_components,
                        estimators=estimators,
                        preamble=preamble,
                        log=log,
                        minmax_bounds=minmax_bounds,
                        summary='Flatten, Subset, Drop NaN Rows, Get Y Data, Difference X in Time')
    for step in range(NSTEPS):
        last_hour_data = sampler(date, X_time_steps=X_TIME_STEPS)
        date += add_hour
        this_hour_data = sampler(date, X_time_steps=X_TIME_STEPS)
        current_file = get_file_name(date)
        out = train_model_on_models(last_hour_data, this_hour_data, init_func)
        dill.dump(out, open(current_file, 'wb'))
        print('Dumped to:', current_file)
        l2, t2, models, preds, models2, preds2 = out
        layer_1_scores = [model._score for _, model in models]
        layer_2_scores = [model._score for _, model in models2]
        print('Scores in layer 1 models:', layer_1_scores)
        print('Scores in layer 2 models:', layer_2_scores)
    return last_hour_data, this_hour_data, models, preds, models2, preds2
def dump(self, file, protocol=None, byref=None, fmode=None, recurse=None):
    '''pickle (dill) an object to a file'''
    getattr(self, '_close', lambda: [])()
    return dill.dump(self, file, protocol=protocol, byref=byref, fmode=fmode, recurse=recurse)
def predict_to_pickle(prediction, fname_base):
    '''Dump a prediction y data'''
    mkdir_p(fname_base)
    fname = fname_base + '.xr'
    with open(fname, 'wb') as f:
        return dill.dump(prediction, f)
def gen_brown_dataset(output_folder, num=None):
    sentences = brown.sents()
    if num:
        if num > len(sentences):
            num = len(sentences)
        sentences = sentences[:num]

    (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict = \
        gen_dataset(sentences)

    if output_folder:
        np.save(os.path.join(output_folder, 'X_train.npy'), X_train)
        np.save(os.path.join(output_folder, 'X_test.npy'), X_test)
        np.save(os.path.join(output_folder, 'y_train.npy'), y_train)
        np.save(os.path.join(output_folder, 'y_test.npy'), y_test)
        np.save(os.path.join(output_folder, 'K_train.npy'), K_train)
        np.save(os.path.join(output_folder, 'K_test.npy'), K_test)

        with open(os.path.join(output_folder, 'gen_param_dict.pkl'), 'w') as f:
            cPickle.dump(param_dict, f)
def train_brown_lemmatizer(output_folder):
    obs_set = np.load(os.path.join(output_folder, 'X_train.npy'))
    out_set = np.load(os.path.join(output_folder, 'y_train.npy'))
    count_set = np.load(os.path.join(output_folder, 'K_train.npy'))

    nn_param_set = train_lemmatizer(
        obs_set,
        out_set,
        count_set,
        window_size=[2, 2],
        positive_samples_only=True,
        batch_size=128,
        param_scale=0.01,
        num_epochs=4000,
        step_size=0.001,
        l2_lambda=0.1)

    if output_folder:
        with open(os.path.join(output_folder, 'nn_param_dict.pkl'), 'w') as f:
            dill.dump(nn_param_set, f)
def save(textdata, fname):
    with open(fname, 'wb') as fout:
        dill.dump(textdata, fout)
def write(obj, fn):
    import dill
    with open(fn, "wb") as f:
        dill.dump(obj, f)