The following 50 code examples, extracted from open-source Python projects, show how sklearn.externals.joblib.dump() is used in practice.
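Before turning to the project code, here is a minimal, self-contained sketch of the basic dump/load round trip that all of the examples below build on. It is not taken from any of the listed projects; the LogisticRegression estimator, the iris data, and the "model.pkl" filename are illustrative assumptions.

from sklearn.externals import joblib  # bundled joblib (older scikit-learn versions)
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Fit a small model on toy data (placeholder estimator and data).
X, y = load_iris(return_X_y=True)
clf = LogisticRegression().fit(X, y)

# Persist the fitted estimator to disk; compress=3 trades speed for file size.
joblib.dump(clf, "model.pkl", compress=3)

# Later (or in another process), restore it and use it as before.
restored = joblib.load("model.pkl")
print(restored.predict(X[:5]))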
def trainModel(featureCount, imageCount, save):
    clf = RandomForestRegressor(n_estimators=1, n_jobs=-1)
    features = generateFeatures(featureCount)
    for image in range(0, imageCount):
        print "Image " + str(image)
        train(clf, features, image)
    clf = clf.fit(X, Y)
    model = (clf, features)
    if save:
        joblib.dump(model, "model.pkl")
    return model
def load_trained_model(self, classifier):
    filename = '{}.pkl'.format(classifier.__name__.lower())
    path = os.path.join(self.data_path, filename)
    # palliative: this outputs a model too large for joblib
    if classifier.__name__ == 'MonthlySubquotaLimitClassifier':
        model = classifier()
        model.fit(self.dataset)
    else:
        if os.path.isfile(path):
            model = joblib.load(path)
        else:
            model = classifier()
            model.fit(self.dataset)
            joblib.dump(model, path)
    return model
def make_check_point(self):
    num, last_checkpoints = self.load_current_checkpoints()
    if self.best_val_acc > last_checkpoints['best_val_acc']:
        best_val_acc = self.best_val_acc
        best_params = self.best_params
    else:
        best_val_acc = last_checkpoints['best_val_acc']
        best_params = last_checkpoints['best_params']

    checkpoints = {
        'model': self.model,
        'epoch': self.epoch,
        'best_params': best_params,
        'best_val_acc': best_val_acc,
        'loss_history': self.loss_history,
        'train_acc_history': self.train_acc_history,
        'val_acc_history': self.val_acc_history}

    name = 'check_' + str(num + 1)
    os.mkdir(os.path.join(self.path_checkpoints, name))
    joblib.dump(checkpoints, os.path.join(
        self.path_checkpoints, name, name + '.pkl'))
def __init__(self, clf, scaler, pf_df, data_folder=""):
    model_file_name = "banana.pkl"
    scaler_file_name = "banana_scaler.pkl"
    list_file_name = "banana_list.txt"
    def_file_path = "../../models/"
    self.data_folder = data_folder
    if not data_folder:
        model_file = os.path.join(os.path.dirname(__file__), def_file_path) + model_file_name
        scaler_file = os.path.join(os.path.dirname(__file__), def_file_path) + scaler_file_name
        list_file = os.path.join(os.path.dirname(__file__), def_file_path) + list_file_name
    else:
        model_file = self.data_folder + model_file_name
        scaler_file = self.data_folder + scaler_file_name
        list_file = self.data_folder + list_file_name
    joblib.dump(clf, model_file)
    joblib.dump(scaler, scaler_file)
    with open(list_file, "w") as f:
        f.write(" ".join(pf_df.columns.tolist()))
def get_cache_file(model_id, index, cache_dir='', suffix='csv'):
    # Identify index trick.
    # If sum of first 20 index, recognize as the same index.
    if index is None:
        raise IOError
    if len(index) < 20:
        sum_index = sum(index)
    else:
        sum_index = sum(index[:20])
    return "{0}{1}_{2}.{3}".format(cache_dir, model_id, sum_index, suffix)

##def saving_fit(learner, X, y, index):
##    import os
##    pkl_file = "{0}_{1}_{2}.pkl".format(learner.id, min(index), max(index))
##    try:
##        learner = joblib.load(pkl_file)
##        print("**** learner is loaded from {0} ****".format(pkl_file))
##    except IOError:
##        learner.fit(X, y)
##        joblib.dump(learner, pkl_file)
##    return learner
def KmeansWrapper(true_k, data, load=False):
    from sklearn.externals import joblib

    modelName = 'doc_cluster.%s.plk' % true_k
    if load:
        km = joblib.load(modelName)
        labels = km.labels_
    else:
        km = KMeans(n_clusters=true_k,
                    init='k-means++',
                    # max_iter=1000,
                    n_init=10,
                    n_jobs=-1,
                    random_state=0,
                    verbose=0)

        km.fit_predict(data)
        labels = km.labels_
        joblib.dump(km, modelName)

    return labels, km.cluster_centers_
def init_state(indata, test=False):
    close = indata['close'].values
    diff = np.diff(close)
    diff = np.insert(diff, 0, 0)
    sma15 = SMA(indata, timeperiod=15)
    sma60 = SMA(indata, timeperiod=60)
    rsi = RSI(indata, timeperiod=14)
    atr = ATR(indata, timeperiod=14)

    #--- Preprocess data
    xdata = np.column_stack((close, diff, sma15, close-sma15, sma15-sma60, rsi, atr))
    xdata = np.nan_to_num(xdata)
    if test == False:
        scaler = preprocessing.StandardScaler()
        xdata = np.expand_dims(scaler.fit_transform(xdata), axis=1)
        joblib.dump(scaler, 'data/scaler.pkl')
    elif test == True:
        scaler = joblib.load('data/scaler.pkl')
        xdata = np.expand_dims(scaler.fit_transform(xdata), axis=1)
    state = xdata[0:1, 0:1, :]

    return state, xdata, close

#Take Action
def persist_pipelines(pipelines):
    Path('models').mkdir(exist_ok=True)
    fp_fmt = 'models/{}-{:%y-%m-%d}.pkl'
    now = dt.datetime.now()
    for pipe in pipelines:
        print(utils.pipeline_name(pipe))
        fp_name = fp_fmt.format(utils.pipeline_name(pipe), now)
        joblib.dump(pipe, fp_name)
        # Pickle fails to work on RandomForestRegressor
        # with open(fp_name, 'wb') as fp:
        #     pickle.dump(pipe, fp)
def _vectorize_chunk(dsid_dir, k, pars, pretend=False):
    """ Extract features on a chunk of files """
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.externals import joblib

    filenames = pars['filenames_abs']
    chunk_size = pars['chunk_size']
    n_samples = pars['n_samples']

    mslice = slice(k*chunk_size, min((k+1)*chunk_size, n_samples))

    hash_opts = {key: vals for key, vals in pars.items()
                 if key in ['stop_words', 'n_features', 'analyser', 'ngram_range']}
    hash_opts['alternate_sign'] = False
    fe = HashingVectorizer(input='content', norm=None, **hash_opts)

    if pretend:
        return fe

    fset_new = fe.transform(_read_file(fname) for fname in filenames[mslice])
    fset_new.eliminate_zeros()

    joblib.dump(fset_new, str(dsid_dir / 'features-{:05}'.format(k)))
def dump_classifier(self):
    """
    This function ...
    :return:
    """

    # Determine the path to the pickle file
    classifier_path = os.path.join(self.classification_mode_path, "classifier.pkl")

    # Inform the user
    self.log.info("Writing the classifier to " + classifier_path)

    # Serialize and dump the classifier
    joblib.dump(self.vector_classifier, classifier_path)

# -----------------------------------------------------------------
def generate_LR_model(file_name):
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|connectionType_.*|telecomsOperator_.*|sitesetID_.*|positionType_.*|gender_.*|haveBaby_.*|age_scaled')
    train_np = selected_train_df.as_matrix()
    y = train_np[:,0]
    X = train_np[:,1:]
    print 'Train Logistic Regression Model...'
    start_time = datetime.datetime.now()
    clf = linear_model.LogisticRegression(penalty='l2', C=1.0, solver='sag', n_jobs=-1, tol=1e-6, max_iter=200)  #, class_weight='balanced')
    clf.fit(X, y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: '
    print (end_time-start_time).seconds
    print 'Save Model...'
    joblib.dump(clf, 'LR.model')
    return clf
def test():
    iris = load_iris()
    #print iris
    #print iris['target'].shape
    gbdt = GradientBoostingRegressor(n_estimators=1000, max_depth=4)
    gbdt.fit(iris.data[:120], iris.target[:120])
    # Save GBDT Model
    joblib.dump(gbdt, 'GBDT.model')

    predict = gbdt.predict(iris.data[:120])
    total_err = 0
    for i in range(len(predict)):
        print predict[i], iris.target[i]
        err = predict[i] - iris.target[i]
        total_err += err * err
    print 'Training Error: %f' % (total_err / len(predict))

    pred = gbdt.predict(iris.data[120:])
    error = 0
    for i in range(len(pred)):
        print pred[i], iris.target[i+120]
        err = pred[i] - iris.target[i+120]
        error += err * err
    print 'Test Error: %f' % (error / len(pred))
def generate_GBDT_model(file_name):
    train_df = read_from_file(file_name)
    # feature 18
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby|hometown|residence')
    train_np = selected_train_df.as_matrix()
    y = train_np[:,0]
    X = train_np[:,1:]
    print 'Train Gradient Boosting Regression Model...'
    start_time = datetime.datetime.now()
    gbdt = GradientBoostingRegressor(n_estimators=120, max_depth=10)  #, class_weight='balanced')
    gbdt.fit(X, y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: '
    print (end_time - start_time).seconds
    print 'Save Model...'
    joblib.dump(gbdt, 'GBDT.model')
    return gbdt
def generate_XGB_model(train_df):
    train_df.drop(['conversionTime'], axis=1, inplace=True)
    print 'Train And Fix Missing App Count Value...'
    train_df, xgb_appcount = train_model_for_appcounts(train_df)
    joblib.dump(xgb_appcount, 'XGB_missing.model')
    '''print 'Train And Fix Missing Age Value...'
    train_df, xgb_age = train_model_for_age(train_df)
    joblib.dump(xgb_age, 'XGB_age.model')'''
    train_df.drop(['marriageStatus','haveBaby','sitesetID', 'positionType'], axis=1, inplace=True)
    print 'Done'
    print train_df.info()
    print train_df.describe()
    print train_df.isnull().sum()

    train_np = train_df.as_matrix()
    y = train_np[:,0]
    X = train_np[:,1:]
    print 'Train Xgboost Model...'
    start_time = datetime.datetime.now()
    xbg_clf = XGBRegressor(n_estimators=100, max_depth=6, objective="binary:logistic", silent=False)
    xbg_clf.fit(X, y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: %d' % ((end_time - start_time).seconds)
    model_df = pd.DataFrame({'columns': list(train_df.columns)[1:], 'values': xbg_clf.feature_importances_})
    print model_df
    return xbg_clf
def xgb_model_select(train_file_name):
    train_df = merge_features_to_use(train_file_name)
    train_df.drop(['conversionTime'], axis=1, inplace=True)
    print 'Train And Fix Missing App Count Value...'
    train_df, xgb_appcount = train_model_for_appcounts(train_df)
    joblib.dump(xgb_appcount, 'XGB_missing.model')
    print train_df.info()
    print train_df.describe()
    print train_df.isnull().sum()

    train_np = train_df.as_matrix()
    y = train_np[:,0]
    X = train_np[:,1:]
    print 'Select Model...'
    start_time = datetime.datetime.now()
    xgb_clf = xgb.XGBRegressor()
    parameters = {'n_estimators': [120, 100, 140],
                  'max_depth': [3, 5, 7, 9],
                  'gamma': [0.1, 0.3, 0.5, 0.7],
                  'min_child_weight': [1, 3, 5, 7],
                  }
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print 'Select Done..., Time Cost: %d' % ((end_time - start_time).seconds)
def generate_RF_model(file_name):
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby|hometown|residence')
    train_np = selected_train_df.as_matrix()
    y = train_np[:,0]
    X = train_np[:,1:]
    print 'Train Random Forest Regression Model...'
    start_time = datetime.datetime.now()
    rf = RandomForestRegressor(n_estimators=25, n_jobs=-1)  #, class_weight='balanced')
    rf.fit(X, y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: '
    print (end_time-start_time).seconds
    print 'Save Model...'
    joblib.dump(rf, 'RF.model')
    return rf
def load_data(test=False):
    fname = FTEST if test else FTRAIN
    df = pd.read_csv(fname)
    cols = df.columns[:-1]

    df['Image'] = df['Image'].apply(lambda im: np.fromstring(im, sep=' ') / 255.0)
    df = df.dropna()

    X = np.vstack(df['Image'])
    X = X.reshape(-1, IMAGE_SIZE, IMAGE_SIZE, 1)

    if not test:
        # y = (df[cols].values - 48) / 48.0
        y = df[cols].values / 96.0
        X, y = shuffle(X, y)
        joblib.dump(cols, 'data/cols.pkl', compress=3)
    else:
        y = None

    return X, y
def Dump(model, fnameMODEL, fnameWeight):
    if str(type(model)).find("sklearn.") == -1:
        from keras.models import Sequential
        from keras.layers.core import Dense, Dropout, Activation
        from keras.optimizers import SGD
        json_string = model.to_json()
        fm = open(fnameMODEL + ".json", "w")
        fm.write(json_string)
        fm.close()
        model.save_weights(fnameWeight + ".hdf5", overwrite=True)
    else:
        from sklearn.externals import joblib

        def ensure_dir(f):
            d = os.path.dirname(f)
            if not os.path.exists(d):
                os.makedirs(d)

        ensure_dir('./skmodel/')
        joblib.dump(model, "./skmodel/" + fnameMODEL + ".pkl", compress=3)
def train_svms():
    if not os.path.isfile('models/fine_tune.model.index'):
        print('models/fine_tune.model doesn\'t exist.')
        return

    net = create_alexnet()
    model = tflearn.DNN(net)
    model.load('models/fine_tune.model')

    train_file_dir = 'svm_train/'
    flist = os.listdir(train_file_dir)
    svms = []
    for train_file in flist:
        if "pkl" in train_file:
            continue
        X, Y = generate_single_svm_train_data(train_file_dir + train_file)
        train_features = []
        for i in X:
            feats = model.predict([i])
            train_features.append(feats[0])
        print("feature dimension of fitting: {}".format(np.shape(train_features)))
        clf = svm.LinearSVC()
        clf.fit(train_features, Y)
        svms.append(clf)
    joblib.dump(svms, 'models/train_svm.model')
def train(self, training_data, trees=100, rf_out=None):
    # Use CNN to extract features
    self.cnn.set_intermediate(self.feature_layer)
    features = self.extract_features(training_data)

    # Create random forest
    self.rf = RandomForestClassifier(n_estimators=trees, class_weight='balanced_subsample')
    X_train = features['y_pred']              # inputs to train the random forest
    y_train = np.asarray(features['y_true'])  # ground truth for random forest

    print "Training RF..."
    self.rf.fit(X_train, y_train)

    if rf_out:
        joblib.dump(self.rf, rf_out)

    return self.rf, X_train, y_train
def train_model(data, with_mac=True):
    global without_mac_clf, mac_clf
    df = pd.DataFrame.from_dict(data)
    y = df.pop("location")
    features = [f for f in df.columns if f != 'mac']
    df = df.rename(columns=dict(zip(features, [POWER_SLAVE_PREFIX + f for f in features])))
    model_name = MODEL_MAC_NAME if with_mac else MODEL_NAME
    if with_mac:
        df = df.apply(LabelEncoder().fit_transform)
    else:
        df.drop("mac", axis=1, inplace=True)
    clf = DecisionTreeClassifier()
    clf.fit(df, y)
    joblib.dump(clf, model_name)
    if with_mac and mac_clf is None:
        mac_clf = clf
    if not with_mac and without_mac_clf is None:
        without_mac_clf = clf
    export_graphviz(clf, feature_names=list(df.columns), class_names=y.unique(),
                    filled=True, rounded=True, out_file='model.dot')
    os.system("dot -Tpng model.dot -o model.png")
def trainClassifier(foldername, classifierName):
    model = cv2.ml.KNearest_create()
    features = []
    labels = []
    os.chdir(foldername)
    for filename in glob.iglob('*.png'):
        features.append(cv2.imread((filename), -1))
        labels.append(filename[0])
    list_hog_fd = []
    for feature in features:
        fd = hog(feature.reshape((27, 35)), orientations=9, pixels_per_cell=(9, 7),
                 cells_per_block=(1, 1), visualise=False)
        list_hog_fd.append(fd)
    hog_features = np.array(list_hog_fd, 'float64')
    os.chdir("..")
    clf = LinearSVC()
    clf.fit(hog_features, labels)
    joblib.dump(clf, classifierName, compress=3)
    os.chdir("..")
def learn(fName, features, nRows=-1):
    with open('bin/train.bin', 'r') as f:
        train = np.load(f)

    x = np.mat(train[:nRows, timbreVector[features[0]]]).reshape(nRows, 1)
    y = np.mat(train[:nRows, timbreVector[features[1]]]).reshape(nRows, 1)
    z = np.mat(train[:nRows, timbreVector[features[2]]]).reshape(nRows, 1)
    X = np.concatenate((x, y, z), axis=1)
    Y = train[:nRows, 0] % minYear
    clf = svm.SVC(verbose=3)
    clf.fit(X, Y)
    print "[SUCCESS] Fitted training data to SVM (kernel: rbf)."
    print "[STARTED] Dumping classifier."
    joblib.dump(clf, 'bin/%s' % fName)
    print "[SUCCESS] Dumped to ", fName
def train(self, training_set, training_target, fea_index):
    clf = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=30, class_weight="balanced")
    clf = clf.fit(training_set, training_target)

    class_names = np.unique([str(i) for i in training_target])
    feature_names = [attr_list[i] for i in fea_index]

    dot_data = tree.export_graphviz(clf, out_file=None,
                                    feature_names=feature_names,
                                    class_names=class_names,
                                    filled=True, rounded=True,
                                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("output/tree-vis.pdf")
    joblib.dump(clf, 'output/CART.pkl')
def saveDebugStateAtBatch(self, name, batchID, LPchunk=None, SS=None,
                          SSchunk=None, hmodel=None, Dchunk=None):
    if self.outputParams['debugBatch'] == batchID:
        debugLap = self.outputParams['debugLap']
        debugLapBuffer = self.outputParams['debugLapBuffer']
        if self.lapFrac < 1:
            joblib.dump(dict(Dchunk=Dchunk),
                        os.path.join(self.task_output_path, 'Debug-Data.dump'))
        belowWindow = self.lapFrac < debugLap - debugLapBuffer
        aboveWindow = self.lapFrac > debugLap + debugLapBuffer
        if belowWindow or aboveWindow:
            return
        filename = 'DebugLap%04.0f-%s.dump' % (np.ceil(self.lapFrac), name)
        SaveVars = dict(LP=LPchunk, SS=SS, hmodel=hmodel,
                        SSchunk=SSchunk, lapFrac=self.lapFrac)
        joblib.dump(SaveVars, os.path.join(self.task_output_path, filename))
        if self.lapFrac < 1:
            joblib.dump(dict(Dchunk=Dchunk),
                        os.path.join(self.task_output_path, 'Debug-Data.dump'))
def pca(dataMat, n):
    print "Start to do PCA..."
    newData, meanVal = zeroMean(dataMat)
    # covMat = np.cov(newData, rowvar=0)
    # eigVals, eigVects = np.linalg.eig(np.mat(covMat))
    # joblib.dump(eigVals, './features/PCA/eigVals_train_%s.eig' % m, compress=3)
    # joblib.dump(eigVects, './features/PCA/eigVects_train_%s.eig' % m, compress=3)
    eigVals = joblib.load('./features/PCA/eigVals_train_%s.eig' % m)
    eigVects = joblib.load('./features/PCA/eigVects_train_%s.eig' % m)
    eigValIndice = np.argsort(eigVals)
    n_eigValIndice = eigValIndice[-1:-(n+1):-1]
    n_eigVect = eigVects[:, n_eigValIndice]
    # joblib.dump(n_eigVect, './features/PCA/n_eigVects_train_%s_%s.eig' % (m, n))
    lowDDataMat = newData * n_eigVect
    return lowDDataMat
def pca(dataMat, n):
    print "Start to do PCA..."
    t1 = time.time()
    newData, meanVal = zeroMean(dataMat)
    covMat = np.cov(newData, rowvar=0)
    eigVals, eigVects = np.linalg.eig(np.mat(covMat))  # compute eigenvalues and eigenvectors
    joblib.dump(eigVals, './features/PCA/%s/eigVals_train_%s.eig' % (m, m), compress=3)
    joblib.dump(eigVects, './features/PCA/%s/eigVects_train_%s.eig' % (m, m), compress=3)
    # eigVals = joblib.load('./features/PCA/%s/eigVals_train_%s.eig' % (m, m))
    # eigVects = joblib.load('./features/PCA/%s/eigVects_train_%s.eig' % (m, m))
    eigValIndice = np.argsort(eigVals)            # sort the eigenvalues
    n_eigValIndice = eigValIndice[-1:-(n+1):-1]   # take the n largest eigenvalues
    n_eigVect = eigVects[:, n_eigValIndice]       # take the corresponding n eigenvectors
    joblib.dump(n_eigVect, './features/PCA/%s/n_eigVects_train_%s_%s.eig' % (m, m, n))
    lowDDataMat = newData * n_eigVect             # project data onto the low-dimensional space
    # reconMat = (lowDDataMat * n_eigVect.T) + meanVal
    t2 = time.time()
    print "PCA takes %f seconds" % (t2-t1)
    return lowDDataMat
def getFeat(Data, mode):  # compute and save feature values
    num = 0
    for data in Data:
        image = np.reshape(data[0], (200, 200, 3))
        gray = rgb2gray(image) / 255.0                 # convert the image to grayscale
        fd = hog(gray, orientations, pixels_per_cell, cells_per_block, block_norm, visualize, normalize)
        fd = np.concatenate((fd, data[1]))             # append the label at the end of the array
        filename = list(data[2])
        fd_name = filename[0].split('.')[0] + '.feat'  # set the file name
        if mode == 'train':
            fd_path = os.path.join('./features/train/', fd_name)
        else:
            fd_path = os.path.join('./features/test/', fd_name)
        joblib.dump(fd, fd_path, compress=3)           # save the feature data locally
        num += 1
        print "%d saving: %s." % (num, fd_name)
def train_logistic():
    df = pd.read_csv(config.activations_path)
    df, y, classes = encode(df)
    X_train, X_test, y_train, y_test = train_test_split(df.values, y, test_size=0.2, random_state=17)

    params = {'C': [10, 2, .9, .4, .1],
              'tol': [0.0001, 0.001, 0.0005]}
    log_reg = LogisticRegression(solver='lbfgs', multi_class='multinomial', class_weight='balanced')
    clf = GridSearchCV(log_reg, params, scoring='neg_log_loss', refit=True, cv=3, n_jobs=-1)
    clf.fit(X_train, y_train)

    print("best params: " + str(clf.best_params_))
    print("Accuracy: ", accuracy_score(y_test, clf.predict(X_test)))

    setattr(clf, '__classes', classes)
    # save results for further use
    joblib.dump(clf, config.get_novelty_detection_model_path())
def persist(self, model_dir):
    # type: (Text) -> Dict[Text, Any]
    """Persist this model into the passed directory.
    Returns the metadata necessary to load the model again."""

    from sklearn.externals import joblib

    if self.ent_tagger:
        model_file_name = os.path.join(model_dir, "crf_model.pkl")

        joblib.dump(self.ent_tagger, model_file_name)
        return {"entity_extractor_crf": {"model_file": "crf_model.pkl",
                                         "crf_features": self.crf_features,
                                         "BILOU_flag": self.BILOU_flag,
                                         "version": 1}}
    else:
        return {"entity_extractor_crf": None}
def newKMeansModel(vectorFile, outputFile, numClusters):
    # https://stackoverflow.com/questions/43476869/doc2vec-sentence-clustering
    model = Doc2Vec.load("Models\\" + vectorFile)
    docVecs = model.docvecs.doctag_syn0
    km = KMeans(n_clusters=numClusters)
    print("Starting")
    km.fit(docVecs)
    print("Fitting Data")
    joblib.dump(km, outputFile)
def newDBSCANModel(vectorFile, outputFile):
    model = Doc2Vec.load("Models\\" + vectorFile)
    vecs = []
    for doc in range(0, len(model.docvecs)):
        doc_vec = model.docvecs[doc]
        # print doc_vec
        vecs.append(doc_vec.reshape((1, 300)))

    doc_vecs = np.array(vecs, dtype='float')  # TSNE expects float type values
    # print doc_vecs
    docs = []
    for i in doc_vecs:
        docs.append(i[0])

    db = DBSCAN(eps=0.03, algorithm="brute", metric='cosine').fit(docs)
    joblib.dump(db, outputFile)

    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    clusters = db.labels_.tolist()

    cluster_info = {'labels': model.docvecs.offset2doctag,
                    "index, wordcount and repeated words": [model.docvecs.doctags[x] for x in model.docvecs.offset2doctag],
                    'clusters': clusters}
    sentenceDF = pd.DataFrame(cluster_info, index=[clusters],
                              columns=['labels', "index, wordcount and repeated words", 'clusters'])
    print(sentenceDF)
    sentenceDF.to_csv("DBSCAN.csv")

    print('Estimated number of clusters: %d' % n_clusters_)
def run_model(ms, i_fold):
    model = ModelVW(ms.name(), i_fold)
    prms = model_params_dict[ms.model_params]
    if not prms.has_key("interaction"):
        prms["interaction"] = vw_inter_list[ms.feature_set]
    model.set_params(prms)
    model.set_data(ms.feature_set, i_fold)  # special
    model.train()
    pred = model.predict()
    train_pred = model.predict_train()
    model.dump()
    model.dump_pred(pred, "pred.pkl")
    return pred, train_pred
def create_model(self, training_articles):
    model = OneVsRestClassifier(svm.SVC(probability=True))
    features = []
    labels = []

    i = 0
    for article in training_articles:
        print("Generating features for article " + str(i) + "...")
        google_cloud_response = self.analyze_text_google_cloud(article["article"])
        relevant_entities = self.get_relevant_entities(google_cloud_response["entities"],
                                                       article["market"]["entities"],
                                                       article["market"]["wikipedia_urls"])

        # Only count this article if a relevant entity is present
        if relevant_entities:
            article_features = self.article_features(relevant_entities, article["market"],
                                                     google_cloud_response, article["article"])
            features.append(article_features)
            labels.append(article["label"])
        else:
            print("Skipping article " + str(i) + "...")

        i = i + 1

    print("Performing feature scaling...")
    scaler = preprocessing.StandardScaler().fit(features)
    features_scaled = scaler.transform(features)

    print("Fitting model...")
    model.fit(features_scaled, labels)

    print("Saving model...")
    joblib.dump(scaler, "data_analysis/caler.pkl")
    joblib.dump(model, "data_analysis/model.pkl")

    print("Done!")

# For use in prod
def train():
    DataTrain = loadPybrainData()
    fnn = buildNet()
    trainer = BackpropTrainer(fnn, dataset=DataTrain, momentum=0.05, verbose=True, weightdecay=0.005)
    trainer.trainUntilConvergence(maxEpochs=500)
    joblib.dump(fnn, PKL)
    return fnn
def printfile(X, filename):
    joblib.dump((X), filename)
def use_SVM(X_data, y_data):
    p_gamma = 0.1
    p_C = 10
    svm = SVC(kernel='rbf', random_state=0, gamma=p_gamma, C=p_C, probability=True)
    svm.fit(X_data, y_data)
    joblib.dump(svm, "./sklearn_model/svm_trainval1_{param1}_{param2}".format(param1=p_gamma, param2=p_C))
    return svm
def save(self, path):
    """
    Persist the model itself and its classes with joblib and pickle.

    Parameters
    ----------
    path: string
        The location of the persistence directory where model and classes will be stored.

    Return
    ----------
    None
    """
    joblib.dump(self.model, path + 'tree.pkl')
    joblib.dump(self.classes, path + 'classes.pkl')
def save(self):
    home_dir = self.__home_dir(self.field_manager.app_id)
    if not os.path.isdir(home_dir):
        print("making directory for app {}...".format(self.field_manager.app_id))
        os.mkdir(home_dir)

    path_fieldm = os.path.join(home_dir, self.FIELD_MANAGER_FILE)
    with open(path_fieldm, mode="w", encoding="utf-8") as fm:
        serialized = self.field_manager.to_dict()
        json.dump(serialized, fm, indent=2)

    if self.model:
        joblib.dump(self.model, os.path.join(home_dir, self.MODEL_FILE))
def save(self, filebase):
    # re-train best model on full data set
    self.model_.fit(self.data, self.data[LABEL].values)
    ts = datetime.now().strftime('%Y%m%d_%H%M%S')
    # logging wrappers don't serialize
    del self.logger
    joblib.dump(self, '{0}/model_{1}.pkl'.format(filebase, ts))
def train(estimator, feats_train, labels_train, weights_train, model='model.pkl'):
    '''
    Train and evaluate (using k-fold cross validation) the generated machine
    learning model for severity classification.
    @param estimator: the ML estimator to use
    @param feats_train: the training features
    @param labels_train: labels for the training data
    @return estimator: trained estimator (model)
    '''
    estimator = estimator.fit(feats_train, labels_train, sample_weight=weights_train)
    if model is not None:
        joblib.dump(estimator, cfg.PATH_RESOURCES + model)
    return estimator
def save(self, location="brain"):
    """
    Pickle the brain
    """
    if self._trained:
        joblib.dump(self.lobe, location + ".pickle")
        logger.info('Brain %s saved', location + '.pickle')
    else:
        return logger.error('Brain is not trained yet! Nothing to save...')
def getFeat(TrainData, TestData):
    for data in TestData:
        image = np.reshape(data[0].T, (32, 32, 3))
        gray = rgb2gray(image) / 255.0
        fd = hog(gray, 9, [8, 8], [2, 2], 'L2-Hys', False, True)
        fd = np.concatenate((fd, data[1]))
        filename = list(data[2])
        fd_name = filename[0].split('.')[0] + '.feat'
        fd_path = os.path.join('./data/features/test/', fd_name)
        joblib.dump(fd, fd_path)
    print "Test features are extracted and saved."
    for data in TrainData:
        image = np.reshape(data[0].T, (32, 32, 3))
        gray = rgb2gray(image) / 255.0
        fd = hog(gray, 9, [8, 8], [2, 2], 'L2-Hys', False, True)
        fd = np.concatenate((fd, data[1]))
        filename = list(data[2])
        fd_name = filename[0].split('.')[0] + '.feat'
        fd_path = os.path.join('./data/features/train/', fd_name)
        joblib.dump(fd, fd_path)
    print "Train features are extracted and saved."
def save_pkl(obj, path, log_description=None, logger=None,
             logging_level=logging.INFO, verbose_start=True,
             verbose_end=True, end_in_new_line=True, log_prefix="..."):
    if log_description is None:
        log_description = "Pickling to " + (path)

    with open(path, "wb") as fp, \
            SimpleTimer(log_description, logger, logging_level, verbose_start,
                        verbose_end, end_in_new_line, log_prefix):
        cPickle.dump(obj, fp, protocol=cPickle.HIGHEST_PROTOCOL)
def save_joblib_pkl(obj, path, log_description=None, logger=None,
                    logging_level=logging.INFO, verbose_start=True,
                    verbose_end=True, end_in_new_line=True, log_prefix="..."):
    try:
        from sklearn.externals import joblib
    except ImportError:
        raise ImportError("This function requires sklearn module. "
                          "You can install it via "
                          "\"pip install scikit-learn\".")

    if log_description is None:
        log_description = "Pickling to " + (path)

    with SimpleTimer(log_description, logger, logging_level, verbose_start,
                     verbose_end, end_in_new_line, log_prefix):
        joblib.dump(obj, path)
def save(self, filename):
    """
    Saves trained model to filename.

    :param filename: Name of file to save model as.
    """
    joblib.dump(self.svc, filename)
def save_corpus(out_prefix, X, terms, doc_ids, classes=None):
    """
    Save a pre-processed scikit-learn corpus and associated metadata using Joblib.
    """
    matrix_outpath = "%s.pkl" % out_prefix
    joblib.dump((X, terms, doc_ids, classes), matrix_outpath)
def save_term_rankings(out_path, term_rankings, labels=None):
    """
    Save a list of multiple term rankings using Joblib.
    """
    # no labels? generate some standard ones
    if labels is None:
        labels = []
        for i in range(len(term_rankings)):
            labels.append("C%02d" % (i+1))
    joblib.dump((term_rankings, labels), out_path)