Python sklearn.externals.joblib module: dump() code examples

We extracted the following 50 code examples from open-source Python projects to illustrate how to use sklearn.externals.joblib.dump().
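
Before diving into the project excerpts, here is a minimal, self-contained sketch of the dump/load round trip. The estimator, data, and file name below are illustrative and not taken from any of the listed projects. Note that sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23; on recent versions the standalone joblib package is imported directly.

# Minimal dump/load round trip (illustrative sketch, not from the projects below)
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib  # on scikit-learn >= 0.23 use: import joblib

X, y = load_iris(return_X_y=True)
clf = RandomForestClassifier(n_estimators=10).fit(X, y)

# Persist the fitted estimator; compress trades dump/load speed for a smaller file
joblib.dump(clf, "model.pkl", compress=3)

# Restore it later and predict as usual
clf_restored = joblib.load("model.pkl")
print(clf_restored.predict(X[:5]))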

Project: libskeletal | Author: bobbybee
def trainModel(featureCount, imageCount, save):
    clf = RandomForestRegressor(n_estimators=1, n_jobs=-1)

    features = generateFeatures(featureCount)

    for image in range(0, imageCount):
        print "Image " + str(image)
        train(clf, features, image)

    clf = clf.fit(X, Y)
    model = (clf, features)

    if save:
        joblib.dump(model, "model.pkl")

    return model
Project: rosie | Author: datasciencebr
def load_trained_model(self, classifier):
        filename = '{}.pkl'.format(classifier.__name__.lower())
        path = os.path.join(self.data_path, filename)

        # palliative: this outputs a model too large for joblib
        if classifier.__name__ == 'MonthlySubquotaLimitClassifier':
            model = classifier()
            model.fit(self.dataset)

        else:
            if os.path.isfile(path):
                model = joblib.load(path)
            else:
                model = classifier()
                model.fit(self.dataset)
                joblib.dump(model, path)

        return model
Project: Dense-Net | Author: achyudhk
def make_check_point(self):

        num, last_checkpoints = self.load_current_checkpoints()

        if self.best_val_acc > last_checkpoints['best_val_acc']:
            best_val_acc = self.best_val_acc
            best_params = self.best_params
        else:
            best_val_acc = last_checkpoints['best_val_acc']
            best_params = last_checkpoints['best_params']

        checkpoints = {
            'model': self.model,
            'epoch': self.epoch,
            'best_params': best_params,
            'best_val_acc': best_val_acc,
            'loss_history': self.loss_history,
            'train_acc_history': self.train_acc_history,
            'val_acc_history': self.val_acc_history}

        name = 'check_' + str(num + 1)
        os.mkdir(os.path.join(self.path_checkpoints, name))
        joblib.dump(checkpoints, os.path.join(
            self.path_checkpoints, name, name + '.pkl'))
Project: elephant_sense | Author: chakki-works
def __init__(self, clf, scaler, pf_df, data_folder=""):
        model_file_name = "banana.pkl"
        scaler_file_name = "banana_scaler.pkl"
        list_file_name = "banana_list.txt"

        def_file_path = "../../models/"
        self.data_folder = data_folder

        if not data_folder:
            model_file = os.path.join(os.path.dirname(__file__), def_file_path) + model_file_name
            scaler_file = os.path.join(os.path.dirname(__file__), def_file_path) + scaler_file_name
            list_file = os.path.join(os.path.dirname(__file__), def_file_path) + list_file_name
        else:
            model_file = self.data_folder + model_file_name
            scaler_file = self.data_folder + scaler_file_name
            list_file = self.data_folder + list_file_name


        joblib.dump(clf, model_file)
        joblib.dump(scaler, scaler_file)

        with open(list_file, "w") as f:
            f.write(" ".join(pf_df.columns.tolist()))
Project: stacked_generalization | Author: fukatani
def get_cache_file(model_id, index, cache_dir='', suffix='csv'):
    # Index-identification trick:
    # if the sums of the first 20 index values match, the indices are treated as the same.
    if index is None:
        raise IOError
    if len(index) < 20:
        sum_index = sum(index)
    else:
        sum_index = sum(index[:20])
    return "{0}{1}_{2}.{3}".format(cache_dir,
                                   model_id,
                                   sum_index,
                                   suffix)

##def saving_fit(learner, X, y, index):
##    import os
##    pkl_file = "{0}_{1}_{2}.pkl".format(learner.id, min(index), max(index))
##    try:
##        learner = joblib.load(pkl_file)
##        print("**** learner is loaded from {0} ****".format(pkl_file))
##    except IOError:
##        learner.fit(X, y)
##        joblib.dump(learner, pkl_file)
##    return learner
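
The commented-out saving_fit helper above sketches a common load-or-fit caching pattern around joblib. A runnable version of the same idea is shown below; the function name fit_or_load and the cache_path argument are hypothetical, not part of the project.

import os
from sklearn.externals import joblib

def fit_or_load(learner, X, y, cache_path):
    # Reuse a previously fitted learner if a cached pickle exists; otherwise fit and cache it
    if os.path.isfile(cache_path):
        learner = joblib.load(cache_path)
        print("**** learner is loaded from {0} ****".format(cache_path))
    else:
        learner.fit(X, y)
        joblib.dump(learner, cache_path)
    return learner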
Project: hugo_similar_posts | Author: elbaulp
def KmeansWrapper(true_k, data, load=False):
    from sklearn.externals import joblib

    modelName = 'doc_cluster.%s.plk' % true_k

    if load:
        km = joblib.load(modelName)
        labels = km.labels_
    else:
        km = KMeans(n_clusters=true_k,
                    init='k-means++',
                    # max_iter=1000,
                    n_init=10,
                    n_jobs=-1,
                    random_state=0,
                    verbose=0)
        km.fit_predict(data)
        labels = km.labels_
        joblib.dump(km,  modelName)

    return labels, km.cluster_centers_
Project: sl-quant | Author: danielzak
def init_state(indata, test=False):
    close = indata['close'].values
    diff = np.diff(close)
    diff = np.insert(diff, 0, 0)
    sma15 = SMA(indata, timeperiod=15)
    sma60 = SMA(indata, timeperiod=60)
    rsi = RSI(indata, timeperiod=14)
    atr = ATR(indata, timeperiod=14)

    #--- Preprocess data
    xdata = np.column_stack((close, diff, sma15, close-sma15, sma15-sma60, rsi, atr))

    xdata = np.nan_to_num(xdata)
    if test == False:
        scaler = preprocessing.StandardScaler()
        xdata = np.expand_dims(scaler.fit_transform(xdata), axis=1)
        joblib.dump(scaler, 'data/scaler.pkl')
    elif test == True:
        scaler = joblib.load('data/scaler.pkl')
        # use transform() here so the saved scaler's training statistics are applied
        xdata = np.expand_dims(scaler.transform(xdata), axis=1)
    state = xdata[0:1, 0:1, :]

    return state, xdata, close

#Take Action
Project: time_series_modeling | Author: rheineke
def persist_pipelines(pipelines):
    Path('models').mkdir(exist_ok=True)
    fp_fmt = 'models/{}-{:%y-%m-%d}.pkl'
    now = dt.datetime.now()
    for pipe in pipelines:
        print(utils.pipeline_name(pipe))
        fp_name = fp_fmt.format(utils.pipeline_name(pipe), now)
        joblib.dump(pipe, fp_name)
        # Pickle fails to work on RandomForestRegressor
        # with open(fp_name, 'wb') as fp:
        #     pickle.dump(pipe, fp)
Project: FreeDiscovery | Author: FreeDiscovery
def _vectorize_chunk(dsid_dir, k, pars, pretend=False):
    """ Extract features on a chunk of files """
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.externals import joblib

    filenames = pars['filenames_abs']
    chunk_size = pars['chunk_size']
    n_samples = pars['n_samples']

    mslice = slice(k*chunk_size, min((k+1)*chunk_size, n_samples))

    hash_opts = {key: vals for key, vals in pars.items()
                 if key in ['stop_words', 'n_features',
                            'analyser', 'ngram_range']}
    hash_opts['alternate_sign'] = False
    fe = HashingVectorizer(input='content', norm=None, **hash_opts)
    if pretend:
        return fe
    fset_new = fe.transform(_read_file(fname) for fname in filenames[mslice])

    fset_new.eliminate_zeros()

    joblib.dump(fset_new, str(dsid_dir / 'features-{:05}'.format(k)))
Project: CAAPR | Author: Stargrazer82301
def dump_classifier(self):

        """
        This function ...
        :return:
        """

        # Determine the path to the pickle file
        classifier_path = os.path.join(self.classification_mode_path, "classifier.pkl")

        # Inform the user
        self.log.info("Writing the classifier to " + classifier_path)

        # Serialize and dump the classifier
        joblib.dump(self.vector_classifier, classifier_path)

    # -----------------------------------------------------------------
Project: CAAPR | Author: Stargrazer82301
def dump_classifier(self):

        """
        This function ...
        :return:
        """

        # Determine the path to the pickle file
        classifier_path = os.path.join(self.classification_mode_path, "classifier.pkl")

        # Inform the user
        self.log.info("Writing the classifier to " + classifier_path)

        # Serialize and dump the classifier
        joblib.dump(self.vector_classifier, classifier_path)

    # -----------------------------------------------------------------
Project: tpai_comp | Author: luuuyi
def generate_LR_model(file_name):
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|connectionType_.*|telecomsOperator_.*|sitesetID_.*|positionType_.*|gender_.*|haveBaby_.*|age_scaled')
    train_np = selected_train_df.as_matrix()
    y = train_np[:,0]
    X = train_np[:,1:]
    print 'Train Logistic Regression Model...'
    start_time  = datetime.datetime.now()
    clf = linear_model.LogisticRegression(penalty='l2',C=1.0,solver='sag',n_jobs=-1, tol=1e-6, max_iter=200)#, class_weight='balanced')
    clf.fit(X,y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: '
    print (end_time-start_time).seconds

    print 'Save Model...'
    joblib.dump(clf, 'LR.model')
    return clf
Project: tpai_comp | Author: luuuyi
def test():
    iris = load_iris()  
    #print iris
    #print iris['target'].shape  
    gbdt=GradientBoostingRegressor(n_estimators=1000, max_depth=4) 
    gbdt.fit(iris.data[:120],iris.target[:120])

    #Save GBDT Model
    joblib.dump(gbdt, 'GBDT.model') 

    predict = gbdt.predict(iris.data[:120])
    total_err = 0
    for i in range(len(predict)):
        print predict[i],iris.target[i]
        err = predict[i] - iris.target[i]
        total_err += err * err
    print 'Training Error: %f' % (total_err / len(predict))

    pred = gbdt.predict(iris.data[120:])
    error = 0
    for i in range(len(pred)):
        print pred[i],iris.target[i+120]
        err = pred[i] - iris.target[i+120]
        error += err * err
    print 'Test Error: %f' % (error / len(pred))
Project: tpai_comp | Author: luuuyi
def generate_GBDT_model(file_name):
    train_df = read_from_file(file_name)
    # feature 18
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby|hometown|residence')
    train_np = selected_train_df.as_matrix()
    y = train_np[:,0]
    X = train_np[:,1:]
    print 'Train Gradient Boosting Regression Model...'
    start_time  = datetime.datetime.now()
    gbdt = GradientBoostingRegressor(n_estimators=120, max_depth=10) #, class_weight='balanced')
    gbdt.fit(X,y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: '
    print (end_time - start_time).seconds

    print 'Save Model...'
    joblib.dump(gbdt, 'GBDT.model')
    return gbdt
Project: tpai_comp | Author: luuuyi
def generate_XGB_model(train_df):
    train_df.drop(['conversionTime'], axis=1, inplace=True)
    print 'Train And Fix Missing App Count Value...'
    train_df, xgb_appcount = train_model_for_appcounts(train_df)
    joblib.dump(xgb_appcount, 'XGB_missing.model')
    '''print 'Train And Fix Missing Age Value...'
    train_df, xgb_age = train_model_for_age(train_df)
    joblib.dump(xgb_age, 'XGB_age.model')'''
    train_df.drop(['marriageStatus','haveBaby','sitesetID', 'positionType'], axis=1, inplace=True)
    print 'Done'
    print train_df.info()
    print train_df.describe()
    print train_df.isnull().sum()
    train_np = train_df.as_matrix()
    y = train_np[:,0]
    X = train_np[:,1:]
    print 'Train Xgboost Model...'
    start_time  = datetime.datetime.now()
    xbg_clf = XGBRegressor(n_estimators=100, max_depth=6, objective="binary:logistic", silent=False)
    xbg_clf.fit(X,y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: %d' % ((end_time - start_time).seconds)
    model_df = pd.DataFrame({'columns':list(train_df.columns)[1:], 'values':xbg_clf.feature_importances_})
    print model_df
    return xbg_clf
Project: tpai_comp | Author: luuuyi
def xgb_model_select(train_file_name):  
    train_df = merge_features_to_use(train_file_name)
    train_df.drop(['conversionTime'], axis=1, inplace=True)
    print 'Train And Fix Missing App Count Value...'
    train_df, xgb_appcount = train_model_for_appcounts(train_df)
    joblib.dump(xgb_appcount, 'XGB_missing.model')
    print train_df.info()
    print train_df.describe()
    print train_df.isnull().sum()
    train_np = train_df.as_matrix()
    y = train_np[:,0]
    X = train_np[:,1:]

    print 'Select Model...'
    start_time  = datetime.datetime.now()
    xgb_clf = xgb.XGBRegressor() 
    parameters = {'n_estimators': [120, 100, 140], 'max_depth':[3,5,7,9], 'gamma':[0.1,0.3,0.5,0.7], 'min_child_weight':[1,3,5,7], }
    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=10, n_jobs=-1)
    print("parameters:")
    pprint.pprint(parameters)
    grid_search.fit(X, y)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters=grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    end_time = datetime.datetime.now()
    print 'Select Done..., Time Cost: %d' % ((end_time - start_time).seconds)
Project: tpai_comp | Author: luuuyi
def generate_RF_model(file_name):
    train_df = read_from_file(file_name)
    selected_train_df = train_df.filter(regex='label|creativeID|positionID|connectionType|telecomsOperator|adID|camgaignID|advertiserID|appID|appPlatform|sitesetID|positionType|age|gender|education|marriageStatus|haveBaby|hometown|residence')
    train_np = selected_train_df.as_matrix()
    y = train_np[:,0]
    X = train_np[:,1:]
    print 'Train Random Forest Regression Model...'
    start_time  = datetime.datetime.now()
    rf = RandomForestRegressor(n_estimators=25, n_jobs=-1)#, class_weight='balanced')
    rf.fit(X,y)
    end_time = datetime.datetime.now()
    print 'Training Done..., Time Cost: '
    print (end_time-start_time).seconds

    print 'Save Model...'
    joblib.dump(rf, 'RF.model')
    return rf
Project: facial-keypoints-detection | Author: saber1988
def load_data(test=False):
    fname = FTEST if test else FTRAIN
    df = pd.read_csv(fname)

    cols = df.columns[:-1]

    df['Image'] = df['Image'].apply(lambda im: np.fromstring(im, sep=' ') / 255.0)
    df = df.dropna()

    X = np.vstack(df['Image'])
    X = X.reshape(-1, IMAGE_SIZE, IMAGE_SIZE, 1)
    if not test:
        # y = (df[cols].values -48) / 48.0
        y = df[cols].values / 96.0
        X, y = shuffle(X, y)
        joblib.dump(cols, 'data/cols.pkl', compress=3)

    else:
        y = None
    return X, y
Project: HappyCat | Author: sparktsao
def Dump(model,fnameMODEL,fnameWeight):
    if str(type(model)).find("sklearn.")==-1:
        from keras.models import Sequential
        from keras.layers.core import Dense, Dropout, Activation
        from keras.optimizers import SGD
        json_string = model.to_json()
        fm = open(fnameMODEL+".json","w")
        fm.write(json_string)
        fm.close()

        model.save_weights(fnameWeight+".hdf5",overwrite=True)
    else:
        from sklearn.externals import joblib
        def ensure_dir(f):
            d = os.path.dirname(f)
            if not os.path.exists(d):
                os.makedirs(d)
        ensure_dir('./skmodel/')
        joblib.dump(model, "./skmodel/"+fnameMODEL+".pkl",compress=3)
Project: rcnn-with-tflearn | Author: Redoblue
def train_svms():
    if not os.path.isfile('models/fine_tune.model.index'):
        print('models/fine_tune.model doesn\'t exist.')
        return

    net = create_alexnet()
    model = tflearn.DNN(net)
    model.load('models/fine_tune.model')

    train_file_dir = 'svm_train/'
    flist = os.listdir(train_file_dir)
    svms = []
    for train_file in flist:
        if "pkl" in train_file:
            continue
        X, Y = generate_single_svm_train_data(train_file_dir + train_file)
        train_features = []
        for i in X:
            feats = model.predict([i])
            train_features.append(feats[0])
        print("feature dimension of fitting: {}".format(np.shape(train_features)))
        clf = svm.LinearSVC()
        clf.fit(train_features, Y)
        svms.append(clf)
    joblib.dump(svms, 'models/train_svm.model')
Project: qtim_ROP | Author: QTIM-Lab
def train(self, training_data, trees=100,rf_out=None):

        # Use CNN to extract features
        self.cnn.set_intermediate(self.feature_layer)
        features = self.extract_features(training_data)

        # Create random forest
        self.rf = RandomForestClassifier(n_estimators=trees, class_weight='balanced_subsample')
        X_train = features['y_pred']  # inputs to train the random forest
        y_train = np.asarray(features['y_true'])  # ground truth for random forest

        print "Training RF..."
        self.rf.fit(X_train, y_train)

        if rf_out:
            joblib.dump(self.rf, rf_out)

        return self.rf, X_train, y_train
Project: whereareyou | Author: futurice
def train_model(data, with_mac=True):
    global without_mac_clf, mac_clf
    df = pd.DataFrame.from_dict(data)
    y = df.pop("location")
    features = [f for f in df.columns if f != 'mac']  # compare by value, not identity
    df = df.rename(columns=dict(zip(features, [POWER_SLAVE_PREFIX + f for f in features])))
    model_name = MODEL_MAC_NAME if with_mac else MODEL_NAME
    if with_mac:
        df = df.apply(LabelEncoder().fit_transform)
    else:
        df.drop("mac", axis=1, inplace=True)
    clf = DecisionTreeClassifier()
    clf.fit(df, y)
    joblib.dump(clf, model_name)
    if with_mac and mac_clf is None:
        mac_clf = clf
    if not with_mac and without_mac_clf is None:
        without_mac_clf = clf
    export_graphviz(clf, feature_names=list(df.columns), class_names=y.unique(), filled=True, rounded=True, out_file='model.dot')
    os.system("dot -Tpng model.dot -o model.png")
Project: SBB4-damage-tracker | Author: whorn
def trainClassifier(foldername,classifierName):
    model = cv2.ml.KNearest_create()
    features = []
    labels = []
    os.chdir(foldername)
    for filename in glob.iglob('*.png'):
        features.append(cv2.imread((filename),-1))
        labels.append(filename[0])
    list_hog_fd = []
    for feature in features:
        fd = hog(feature.reshape((27, 35)), orientations=9, pixels_per_cell=(9, 7), cells_per_block=(1, 1), visualise=False)
        list_hog_fd.append(fd)
    hog_features = np.array(list_hog_fd, 'float64')
    os.chdir("..")
    clf = LinearSVC()
    clf.fit(hog_features, labels)
    joblib.dump(clf,classifierName, compress=3)
    os.chdir("..")
Project: Machine-Learning-Projects | Author: poke19962008
def learn(fName, features, nRows=-1):
    with open('bin/train.bin', 'r') as f:
        train = np.load(f)

        x = np.mat(train[:nRows,timbreVector[features[0]]]).reshape(nRows,1)
        y = np.mat(train[:nRows,timbreVector[features[1]]]).reshape(nRows,1)
        z = np.mat(train[:nRows,timbreVector[features[2]]]).reshape(nRows,1)

        X = np.concatenate((x, y, z), axis=1)
        Y = train[:nRows,0] % minYear

        clf = svm.SVC(verbose=3)
        clf.fit(X, Y)
        print "[SUCCESS] Fitted training data to SVM (kernel: rbf)."

        print "[STARTED] Dumping classifier."
        joblib.dump(clf, 'bin/%s'%fName)
        print "[SUCCESS] Dumped to ", fName
Project: kdd99-scikit | Author: PENGZhaoqing
def train(self, training_set, training_target, fea_index):

        clf = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=30, class_weight="balanced")
        clf = clf.fit(training_set, training_target)

        class_names = np.unique([str(i) for i in training_target])
        feature_names = [attr_list[i] for i in fea_index]

        dot_data = tree.export_graphviz(clf, out_file=None,
                                        feature_names=feature_names,
                                        class_names=class_names,
                                        filled=True, rounded=True,
                                        special_characters=True)

        graph = pydotplus.graph_from_dot_data(dot_data)
        graph.write_pdf("output/tree-vis.pdf")
        joblib.dump(clf, 'output/CART.pkl')
Project: bnpy | Author: bnpy
def saveDebugStateAtBatch(self, name, batchID, LPchunk=None, SS=None,
                              SSchunk=None, hmodel=None,
                              Dchunk=None):
        if self.outputParams['debugBatch'] == batchID:
            debugLap = self.outputParams['debugLap']
            debugLapBuffer = self.outputParams['debugLapBuffer']
            if self.lapFrac < 1:
                joblib.dump(dict(Dchunk=Dchunk),
                            os.path.join(self.task_output_path, 'Debug-Data.dump'))
            belowWindow = self.lapFrac < debugLap - debugLapBuffer
            aboveWindow = self.lapFrac > debugLap + debugLapBuffer
            if belowWindow or aboveWindow:
                return
            filename = 'DebugLap%04.0f-%s.dump' % (np.ceil(self.lapFrac), name)
            SaveVars = dict(LP=LPchunk, SS=SS, hmodel=hmodel,
                            SSchunk=SSchunk,
                            lapFrac=self.lapFrac)
            joblib.dump(SaveVars, os.path.join(self.task_output_path, filename))
            if self.lapFrac < 1:
                joblib.dump(dict(Dchunk=Dchunk),
                            os.path.join(self.task_output_path, 'Debug-Data.dump'))
Project: SVM-classification-localization | Author: HandsomeHans
def pca(dataMat,n):
    print "Start to do PCA..."
    newData,meanVal=zeroMean(dataMat)

#    covMat=np.cov(newData,rowvar=0)
#    eigVals,eigVects=np.linalg.eig(np.mat(covMat))
#    joblib.dump(eigVals,'./features/PCA/eigVals_train_%s.eig' %m,compress=3)
#    joblib.dump(eigVects,'./features/PCA/eigVects_train_%s.eig' %m,compress=3)

    eigVals = joblib.load('./features/PCA/eigVals_train_%s.eig' %m)
    eigVects = joblib.load('./features/PCA/eigVects_train_%s.eig' %m)

    eigValIndice=np.argsort(eigVals)
    n_eigValIndice=eigValIndice[-1:-(n+1):-1]
    n_eigVect=eigVects[:,n_eigValIndice]
#    joblib.dump(n_eigVect,'./features/PCA/n_eigVects_train_%s_%s.eig' %(m,n))
    lowDDataMat=newData*n_eigVect
    return lowDDataMat
Project: SVM-classification-localization | Author: HandsomeHans
def pca(dataMat,n):   
    print "Start to do PCA..."   
    t1 = time.time()   
    newData,meanVal=zeroMean(dataMat)   
    covMat=np.cov(newData,rowvar=0)   
    eigVals,eigVects=np.linalg.eig(np.mat(covMat)) # compute eigenvalues and eigenvectors
    joblib.dump(eigVals,'./features/PCA/%s/eigVals_train_%s.eig' %(m,m),compress=3)    
    joblib.dump(eigVects,'./features/PCA/%s/eigVects_train_%s.eig' %(m,m),compress=3)  
    # eigVals = joblib.load('./features/PCA/%s/eigVals_train_%s.eig' %(m,m))  
    # eigVects = joblib.load('./features/PCA/%s/eigVects_train_%s.eig' %(m,m))   
    eigValIndice=np.argsort(eigVals) # sort the eigenvalues
    n_eigValIndice=eigValIndice[-1:-(n+1):-1] # indices of the top n eigenvalues
    n_eigVect=eigVects[:,n_eigValIndice] # take the corresponding n eigenvectors
    joblib.dump(n_eigVect,'./features/PCA/%s/n_eigVects_train_%s_%s.eig' %(m,m,n))    
    lowDDataMat=newData*n_eigVect # project the data onto the low-dimensional space
    # reconMat=(lowDDataMat*n_eigVect.T)+meanVal   
    t2 = time.time()   
    print "PCA takes %f seconds" %(t2-t1)   
    return lowDDataMat
Project: SVM-classification-localization | Author: HandsomeHans
def getFeat(Data,mode): # compute and save feature values
    num = 0  
    for data in Data:  
        image = np.reshape(data[0], (200, 200, 3)) 
        gray = rgb2gray(image)/255.0 # convert the image to grayscale
        fd = hog(gray, orientations, pixels_per_cell, cells_per_block, block_norm, visualize, normalize)  
        fd = np.concatenate((fd, data[1])) # append the label at the end of the array
        filename = list(data[2])  
        fd_name = filename[0].split('.')[0]+'.feat' # set file name  
        if mode == 'train':  
            fd_path = os.path.join('./features/train/', fd_name)  
        else:  
            fd_path = os.path.join('./features/test/', fd_name)  
        joblib.dump(fd, fd_path,compress=3) # save the feature vector to local disk
        num += 1  
        print "%d saving: %s." %(num,fd_name)
Project: keras-transfer-learning-for-oxford102 | Author: Arsey
def train_logistic():
    df = pd.read_csv(config.activations_path)
    df, y, classes = encode(df)

    X_train, X_test, y_train, y_test = train_test_split(df.values, y, test_size=0.2, random_state=17)

    params = {'C': [10, 2, .9, .4, .1], 'tol': [0.0001, 0.001, 0.0005]}
    log_reg = LogisticRegression(solver='lbfgs', multi_class='multinomial', class_weight='balanced')
    clf = GridSearchCV(log_reg, params, scoring='neg_log_loss', refit=True, cv=3, n_jobs=-1)
    clf.fit(X_train, y_train)

    print("best params: " + str(clf.best_params_))
    print("Accuracy: ", accuracy_score(y_test, clf.predict(X_test)))

    setattr(clf, '__classes', classes)
    # save results for further using
    joblib.dump(clf, config.get_novelty_detection_model_path())
Project: rasa_nlu | Author: RasaHQ
def persist(self, model_dir):
        # type: (Text) -> Dict[Text, Any]
        """Persist this model into the passed directory.

        Returns the metadata necessary to load the model again."""

        from sklearn.externals import joblib

        if self.ent_tagger:
            model_file_name = os.path.join(model_dir, "crf_model.pkl")

            joblib.dump(self.ent_tagger, model_file_name)
            return {"entity_extractor_crf": {"model_file": "crf_model.pkl",
                                             "crf_features": self.crf_features,
                                             "BILOU_flag": self.BILOU_flag,
                                             "version": 1}}
        else:
            return {"entity_extractor_crf": None}
Project: EchoBurst | Author: TyJK
def newKMeansModel(vectorFile, outputFile, numClusters):
    # https://stackoverflow.com/questions/43476869/doc2vec-sentence-clustering

    model = Doc2Vec.load("Models\\" + vectorFile)
    docVecs = model.docvecs.doctag_syn0
    km = KMeans(n_clusters=numClusters)
    print("Starting")
    km.fit(docVecs)
    print("Fitting Data")
    joblib.dump(km, outputFile)
Project: EchoBurst | Author: TyJK
def newDBSCANModel(vectorFile, outputFile):
    model = Doc2Vec.load("Models\\" + vectorFile)
    vecs = []
    for doc in range(0, len(model.docvecs)):
        doc_vec = model.docvecs[doc]
        # print doc_vec
        vecs.append(doc_vec.reshape((1, 300)))

    doc_vecs = np.array(vecs, dtype='float')  # TSNE expects float type values

    # print doc_vecs
    docs = []
    for i in doc_vecs:
        docs.append(i[0])
    db = DBSCAN(eps=0.03, algorithm="brute", metric='cosine').fit(docs)
    joblib.dump(db, outputFile)


    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    clusters = db.labels_.tolist()
    cluster_info = {'labels': model.docvecs.offset2doctag,
                    "index, wordcount and repeated words": [model.docvecs.doctags[x] for x in
                                                            model.docvecs.offset2doctag],
                    'clusters': clusters}
    sentenceDF = pd.DataFrame(cluster_info, index=[clusters],
                              columns=['labels', "index, wordcount and repeated words", 'clusters'])
    print(sentenceDF)
    sentenceDF.to_csv("DBSCAN.csv")

    print('Estimated number of clusters: %d' % n_clusters_)
Project: kaggle-prudential-sample | Author: threecourse
def run_model(ms, i_fold):

    model = ModelVW(ms.name(), i_fold)

    prms = model_params_dict[ms.model_params]

    if not prms.has_key("interaction"):
        prms["interaction"] = vw_inter_list[ms.feature_set]

    model.set_params(prms)
    model.set_data(ms.feature_set, i_fold)  # special

    model.train()

    pred = model.predict()
    train_pred = model.predict_train()

    model.dump()
    model.dump_pred(pred, "pred.pkl")

    return pred, train_pred
Project: Rasa_NLU_Chi | Author: crownpku
def persist(self, model_dir):
        # type: (Text) -> Dict[Text, Any]
        """Persist this model into the passed directory.

        Returns the metadata necessary to load the model again."""

        from sklearn.externals import joblib

        if self.ent_tagger:
            model_file_name = os.path.join(model_dir, "crf_model.pkl")

            joblib.dump(self.ent_tagger, model_file_name)
            return {"entity_extractor_crf": {"model_file": "crf_model.pkl",
                                             "crf_features": self.crf_features,
                                             "BILOU_flag": self.BILOU_flag,
                                             "version": 1}}
        else:
            return {"entity_extractor_crf": None}
Project: algo-trading-pipeline | Author: NeuralKnot
def create_model(self, training_articles):
        model = OneVsRestClassifier(svm.SVC(probability=True))

        features = []
        labels = []
        i = 0
        for article in training_articles:
            print("Generating features for article " + str(i) + "...")
            google_cloud_response = self.analyze_text_google_cloud(article["article"])
            relevant_entities = self.get_relevant_entities(google_cloud_response["entities"], article["market"]["entities"], article["market"]["wikipedia_urls"])

            # Only count this article if a relevant entity is present
            if relevant_entities:
                article_features = self.article_features(relevant_entities, article["market"], google_cloud_response, article["article"])
                features.append(article_features)
                labels.append(article["label"])
            else:
                print("Skipping article " + str(i) + "...")

            i = i + 1

        print("Performing feature scaling...")
        scaler = preprocessing.StandardScaler().fit(features)
        features_scaled = scaler.transform(features)

        print("Fitting model...")
        model.fit(features_scaled, labels)

        print("Saving model...")
        joblib.dump(scaler, "data_analysis/caler.pkl")
        joblib.dump(model, "data_analysis/model.pkl")

        print("Done!")

    # For use in prod
Project: Verification-code-crack | Author: weixianglin
def train():
    DataTrain=loadPybrainData()
    fnn=buildNet()
    trainer=BackpropTrainer(fnn,dataset=DataTrain,momentum=0.05,verbose=True,weightdecay=0.005)
    trainer.trainUntilConvergence(maxEpochs=500)
    joblib.dump(fnn,PKL)
    return fnn
Project: ensemble_amazon | Author: kaz-Anova
def printfile(X, filename):

    joblib.dump((X), filename)
Project: EmotiW-2017-Audio-video-Emotion-Recognition | Author: xujinchang
def use_SVM(X_data,y_data):
    p_gamma = 0.1
    p_C = 10
    svm = SVC(kernel = 'rbf',random_state=0, gamma=p_gamma ,C=p_C, probability=True)
    svm.fit(X_data,y_data)
    joblib.dump(svm,"./sklearn_model/svm_trainval1_{param1}_{param2}".format(param1 = p_gamma,param2 = p_C))
    return svm
Project: a-cadmci | Author: florez87
def save(self, path):
        """
        Persist the model itself and it's classes with joblib and pickle.

        Parameters
        ----------
        path: string
            The location of the persistence directory where model and classes will be stored.

        Return
        ----------
        None
        """
        joblib.dump(self.model, path + 'tree.pkl')
        joblib.dump(self.classes, path + 'classes.pkl')
Project: karura | Author: icoxfog417
def save(self):
        home_dir = self.__home_dir(self.field_manager.app_id)
        if not os.path.isdir(home_dir):
            print("making directory for app {}...".format(self.field_manager.app_id))
            os.mkdir(home_dir)

        path_fieldm = os.path.join(home_dir, self.FIELD_MANAGER_FILE)
        with open(path_fieldm, mode="w", encoding="utf-8") as fm:
            serialized = self.field_manager.to_dict()
            json.dump(serialized, fm, indent=2)

        if self.model:
            joblib.dump(self.model, os.path.join(home_dir, self.MODEL_FILE))
Project: probablyPOTUS | Author: jjardel
def save(self, filebase):

        # re-train best model on full data set
        self.model_.fit(self.data, self.data[LABEL].values)

        ts = datetime.now().strftime('%Y%m%d_%H%M%S')

        # logging wrappers don't serialize
        del self.logger

        joblib.dump(self,'{0}/model_{1}.pkl'.format(filebase, ts))
Project: rdocChallenge | Author: Elyne
def train(estimator, feats_train, labels_train, weights_train, model='model.pkl'):
    '''
    Train and Evaluate (using k-fold cross validation) the generated machine learning model for severity classification
    @param estimator: the ML estimator to use
    @param feats_train: feats_train: the training features
    @param labels_train: labels for training data
    @return estimator: trained estimator (model)
    '''
    estimator = estimator.fit(feats_train, labels_train, sample_weight=weights_train)
    if model is not None:
        joblib.dump(estimator, cfg.PATH_RESOURCES+model)
    return estimator
Project: marconibot | Author: s4w3d0ff
def save(self, location="brain"):
        """ Pickle the brain """
        if self._trained:
            joblib.dump(self.lobe, location + ".pickle")
            logger.info('Brain %s saved', location + '.pickle')
        else:
            return logger.error('Brain is not trained yet! Nothing to save...')
Project: - | Author: YoPatapon
def getFeat(TrainData, TestData):
    for data in TestData:
        image = np.reshape(data[0].T, (32, 32, 3))
        gray = rgb2gray(image)/255.0
        fd = hog(gray, 9, [8, 8], [2, 2], 'L2-Hys', False, True)
        fd = np.concatenate((fd, data[1]))
        filename = list(data[2])
        fd_name = filename[0].split('.')[0]+'.feat'
        fd_path = os.path.join('./data/features/test/', fd_name)
        joblib.dump(fd, fd_path)
    print "Test features are extracted and saved."
    for data in TrainData:
        image = np.reshape(data[0].T, (32, 32, 3))
        gray = rgb2gray(image)/255.0
        fd = hog(gray, 9, [8, 8], [2, 2], 'L2-Hys', False, True)
        fd = np.concatenate((fd, data[1]))
        filename = list(data[2])
        fd_name = filename[0].split('.')[0]+'.feat'
        fd_path = os.path.join('./data/features/train/', fd_name)
        joblib.dump(fd, fd_path)
    print "Train features are extracted and saved."
Project: serialtime | Author: ianlini
def save_pkl(obj, path, log_description=None, logger=None,
             logging_level=logging.INFO, verbose_start=True,
             verbose_end=True, end_in_new_line=True, log_prefix="..."):
    if log_description is None:
        log_description = "Pickling to " + (path)
    with open(path, "wb") as fp, \
            SimpleTimer(log_description, logger, logging_level, verbose_start,
                        verbose_end, end_in_new_line, log_prefix):
        cPickle.dump(obj, fp, protocol=cPickle.HIGHEST_PROTOCOL)
Project: serialtime | Author: ianlini
def save_joblib_pkl(obj, path, log_description=None, logger=None,
                    logging_level=logging.INFO, verbose_start=True,
                    verbose_end=True, end_in_new_line=True, log_prefix="..."):
    try:
        from sklearn.externals import joblib
    except ImportError:
        raise ImportError("This function requires sklearn module. "
                          "You can install it via "
                          "\"pip install scikit-learn\".")
    if log_description is None:
        log_description = "Pickling to " + (path)
    with SimpleTimer(log_description, logger, logging_level, verbose_start,
                     verbose_end, end_in_new_line, log_prefix):
        joblib.dump(obj, path)
Project: eigenfish | Author: sethdp
def save(self, filename):
        """
        Saves trained model to filename.

        :param filename: Name of file to save model as.
        """
        joblib.dump(self.svc, filename)
Project: topic-ensemble | Author: derekgreene
def save_corpus( out_prefix, X, terms, doc_ids, classes = None ):
    """
    Save a pre-processed scikit-learn corpus and associated metadata using Joblib.
    """
    matrix_outpath = "%s.pkl" % out_prefix 
    joblib.dump((X,terms,doc_ids,classes), matrix_outpath )
Project: topic-ensemble | Author: derekgreene
def save_term_rankings( out_path, term_rankings, labels = None ):
    """
    Save a list of multiple term rankings using Joblib.
    """
    # no labels? generate some standard ones
    if labels is None:
        labels = []
        for i in range( len(term_rankings) ):
            labels.append( "C%02d" % (i+1) )
    joblib.dump((term_rankings,labels), out_path )