The following 44 code examples, extracted from open-source Python projects, illustrate how to use sklearn.preprocessing.Imputer().
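Before the project examples, here is a minimal sketch of the typical fit/transform workflow on a toy matrix (the data below is made up purely for illustration). Note that Imputer was deprecated in scikit-learn 0.20 and removed in 0.22 in favour of sklearn.impute.SimpleImputer, so this snippet assumes an older scikit-learn release:

import numpy as np
from sklearn.preprocessing import Imputer

# Hypothetical toy matrix with one missing entry (illustration only).
X = np.array([[1.0, 2.0],
              [np.nan, 3.0],
              [7.0, 6.0]])

# Replace NaNs with the per-column mean; axis=0 imputes along columns.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_imputed = imp.fit_transform(X)

# The column means learned during fit are available as imp.statistics_,
# and imp.transform() can now be applied to new data with the same columns.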
def setUpClass(self):
    """
    Set up the unit test by loading the dataset and training a model.
    """
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    scikit_model = Imputer(strategy='most_frequent', axis=0)
    scikit_data['data'][1, 8] = np.NaN

    input_data = scikit_data['data'][:, 8].reshape(-1, 1)
    scikit_model.fit(input_data, scikit_data['target'])

    # Save the data and the model
    self.scikit_data = scikit_data
    self.scikit_model = scikit_model
def gen_features(train, y, test):
    for c in ['active', 'alco', 'smoke']:
        le = preprocessing.LabelEncoder()
        le.fit(train[c].values.tolist() + test[c].values.tolist())
        train[c] = le.transform(train[c])
        test[c] = le.transform(test[c])

    train['ap_dif'] = train.ap_hi - train.ap_lo
    test['ap_dif'] = test.ap_hi - test.ap_lo

    h = train['height'] / 100
    train['BWI'] = train['weight'] / (h * h)
    h = test['height'] / 100
    test['BWI'] = test['weight'] / (h * h)

    imp = preprocessing.Imputer()
    train = imp.fit_transform(train)
    test = imp.transform(test)
    return train, y, test
def FeatureCombination(Df, s='', num_feature=2):
    feature_set = []
    for c in Df.columns:
        if c.startswith(s):
            feature_set.append(c)
    print('combining', len(feature_set), 'features')
    data = Df[feature_set].values

    for c in Df.columns:
        if Df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(Df[c].values))
            Df[c] = lbl.transform(list(Df[c].values))

    imp = preprocessing.Imputer()
    data = imp.fit_transform(data)
    data = preprocessing.scale(data)
    pca = PCA(num_feature)
    pca.fit(data)
    print('explained_variance_ratio_:', pca.explained_variance_ratio_)
    trans = pca.transform(data)
    for i in range(0, num_feature):
        Df[s + '_%d' % (i + 1)] = trans[:, i]
    Df.drop(feature_set, 1, inplace=True)
    return Df
def fit(self, scenario: ASlibScenario, config: Configuration):
    '''
    fit imputer to ASlib scenario data

    Arguments
    ---------
    scenario: data.aslib_scenario.ASlibScenario
        ASlib Scenario with all data in pandas
    config: ConfigSpace.Configuration
        configuration
    '''
    self.imputer = Imputer(strategy=config.get("imputer_strategy"))
    self.imputer.fit(scenario.feature_data.values)
    self.active = True
def __init__(self, max_iter=10, initial_strategy='mean', tol=1e-3, f_model="RandomForest"):
    self.max_iter = max_iter
    self.initial_strategy = initial_strategy
    self.initial_imputer = Imputer(strategy=initial_strategy)
    self.tol = tol
    self.f_model = f_model
def build_model_random_forest(df, features, categorical_features, target, split=0.70):
    print "using %d features (%d columns) on %d rows and target %s. Split %f." % (
        len(features), len(df.columns), len(df), target, split)

    df['is_train'] = np.random.uniform(0, 1, len(df)) <= split
    train, test = df[df['is_train'] == True], df[df['is_train'] == False]

    # one_hot_encoding because it doesn't work in pipeline for some reason
    # for f in categorical_features:
    #     dummies = pd.get_dummies(df[f], prefix=f)
    #     for dummy in dummies.columns:
    #         df[dummy] = dummies[dummy]
    #         features.append(dummy)
    #     df = df.drop(f, 1)
    #     features.remove(f)

    clf = Pipeline([
        ("imputer", Imputer(strategy="mean", axis=0)),
        ('feature_selection', SelectKBest(k=5)),
        ("forest", RandomForestClassifier())])

    clf.fit(train[features], train[target])
    score = clf.score(test[features], test[target])

    predicted = clf.predict(test[features])
    cm = confusion_matrix(test[target], predicted)
    print "Random Forest score: %f" % score
    print "confusion_matrix : \n%s" % cm
    return clf
def make_predictions_random_forest(df, features, target, split=0.70):
    print "using %d features (%d columns) on %d rows and target %s. Split %f." % (
        len(features), len(df.columns), len(df), target, split)
    # print "unused features: ", '\n\t\t'.join([f for f in df.columns if f not in features])
    # print "columns: ", '\n\t\t'.join(df.columns)

    df['is_train'] = np.random.uniform(0, 1, len(df)) <= split
    train, test = df[df['is_train'] == True], df[df['is_train'] == False]

    clf = Pipeline([
        ("imputer", Imputer(strategy="mean", axis=0)),
        ('feature_selection', SelectKBest(k=200)),
        ("forest", RandomForestClassifier(
            min_samples_leaf=1, min_samples_split=10, n_estimators=60,
            max_depth=None, criterion='gini'))])

    clf.fit(train[features], train[target])
    score = clf.score(test[features], test[target])

    predicted = clf.predict(test[features])
    cm = confusion_matrix(test[target], predicted)
    # print classification_report(test[target], predicted)
    return score, cm


# Utility function to report best scores
def preprocess_data(X_train, X_test):
    """
    Impute missing values.
    """
    # Impute using the mean of every column for now. However,
    # I would've liked to impute 'F5' using mode instead.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

    train_xform = imp.fit_transform(X_train)
    X_train = pd.DataFrame(train_xform, columns=X_train.columns)

    test_xform = imp.transform(X_test)
    X_test = pd.DataFrame(test_xform, columns=X_test.columns)

    return X_train, X_test
def imputerLabelEncoder_train(X, y):
    imputer = preprocessing.Imputer()
    X = imputer.fit_transform(X)

    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)
    return X, y, imputer, le
def build_classifier(base_clf=svm.SVC()):
    # The imputer is for "use_taxonomy", and shouldn't affect if it's False.
    # TODO: should also try with other imputer strategies
    return pipeline.make_pipeline(preprocessing.Imputer(strategy='most_frequent'),
                                  preprocessing.StandardScaler(),
                                  base_clf)


# noinspection PyPep8Naming
def fit(self, X, y=None):
    self.imp = Imputer(strategy=self.strategy)
    self.imp.fit(X)
    self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
    return self
def test_grid_search_allows_nans():
    # Test dcv.GridSearchCV with Imputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    dcv.GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
def test_conversion_bad_inputs(self):
    # Error on converting an untrained model
    with self.assertRaises(Exception):
        model = Imputer()
        spec = converter.convert(model, 'data', 'out')

    # Check the expected class during conversion.
    with self.assertRaises(Exception):
        from sklearn.linear_model import LinearRegression
        model = LinearRegression()
        spec = converter.convert(model, 'data', 'out')
def test_conversion_boston(self):
    from sklearn.datasets import load_boston

    scikit_data = load_boston()
    sh = scikit_data.data.shape

    rn.seed(0)
    missing_value_indices = [(rn.randint(sh[0]), rn.randint(sh[1]))
                             for k in range(sh[0])]

    for strategy in ["mean", "median", "most_frequent"]:
        for missing_value in [0, 'NaN', -999]:
            X = np.array(scikit_data.data).copy()
            for i, j in missing_value_indices:
                X[i, j] = missing_value

            model = Imputer(missing_values=missing_value, strategy=strategy)
            model = model.fit(X)
            tr_X = model.transform(X.copy())

            spec = converter.convert(model, scikit_data.feature_names, 'out')

            input_data = [dict(zip(scikit_data.feature_names, row)) for row in X]
            output_data = [{"out": row} for row in tr_X]

            result = evaluate_transformer(spec, input_data, output_data)
            assert result["num_errors"] == 0
def __init__(self, strategy_categorical="most_frequent", strategy_numerical="median", categorical=None):
    """
    An Imputer that can apply a different strategy for both categorical data and numerical data.

    :param strategy_categorical: "mean", "median" or "most_frequent"
    :param strategy_numerical: "mean", "median" or "most_frequent"
    :param categorical: A boolean mask for the categorical columns of a dataset
    """
    if categorical is None:
        categorical = []
    self.strategy_categorical = strategy_categorical
    self.strategy_numerical = strategy_numerical
    self.cat_imputer = Imputer(strategy=strategy_categorical)
    self.num_imputer = Imputer(strategy=strategy_numerical)
    self.categorical = categorical
    self._update_indices()
def remove_nan(x):
    """remove NaN values from data vectors"""
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    x_clean = imp.fit_transform(x)
    return x_clean
def mean_shift(location, location_callback, bandwidth=None):
    """Returns one or more clusters of a set of points, using a mean
    shift algorithm.
    The result is sorted with the first value being the largest cluster.

    Kwargs:
        bandwidth (float): If bandwidth is None, a value is detected
            automatically from the input using estimate_bandwidth.

    Returns:
        A list of NamedTuples (see get_cluster_named_tuple for a definition
        of the tuple).
    """
    pts = location._tuple_points()
    if not pts:
        return None

    X = np.array(pts).reshape((len(pts), len(pts[0])))
    if np.any(np.isnan(X)) or not np.all(np.isfinite(X)):
        return None
    X = Imputer().fit_transform(X)
    X = X.astype(np.float32)

    if not bandwidth:
        bandwidth = estimate_bandwidth(X, quantile=0.3)

    ms = MeanShift(bandwidth=bandwidth or None, bin_seeding=False).fit(X)

    clusters = []
    for cluster_id, cluster_centre in enumerate(ms.cluster_centers_):
        locations = []
        for j, label in enumerate(ms.labels_):
            if not label == cluster_id:
                continue
            locations.append(location.locations[j])
        if not locations:
            continue
        clusters.append(cluster_named_tuple()(label=cluster_id,
                                              centroid=Point(cluster_centre),
                                              location=location_callback(locations)))

    return clusters
def GetFeatures(frame):
    # convert data to float
    arr = np.array(frame, dtype=np.float)

    # fill missing values
    from sklearn.preprocessing import Imputer
    imputer = Imputer(strategy='mean')
    arr = imputer.fit_transform(arr)

    # normalize the entire data
    from sklearn.preprocessing import scale
    arr = scale(arr)
    return arr


# =================================================
def impute_and_scale(df, scaling=None):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """
    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)
    # print(mat.shape)

    if scaling is None:
        return pd.DataFrame(mat, columns=df.columns)

    # Scaling data
    if scaling == 'maxabs':
        # Normalizing -1 to 1
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        # Scaling to [0,1]
        scaler = MinMaxScaler()
    else:
        # Standard normalization
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)
    # print(mat.shape)
    df = pd.DataFrame(mat, columns=df.columns)

    return df
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """
    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)

    return df
def data_handlemissing(dataframe, pipeline):
    try:
        if pipeline['options']['type'] == "dropcolumns":
            thresh = pipeline['options']['thresh']
            if thresh == -1:
                dataframe.dropna(axis=1, how="all", inplace=True)
            elif thresh == 0:
                dataframe.dropna(axis=1, how="any", inplace=True)
            elif thresh > 0:
                dataframe.dropna(axis=1, thresh=thresh, inplace=True)
        elif pipeline['options']['type'] == "droprows":
            thresh = pipeline['options']['thresh']
            if thresh == -1:
                dataframe.dropna(axis=0, how="all", inplace=True)
            elif thresh == 0:
                dataframe.dropna(axis=0, how="any", inplace=True)
            elif thresh > 0:
                dataframe.dropna(axis=0, thresh=thresh)
        elif pipeline['options']['type'] == "fillmissing":
            strategy = pipeline['options']['strategy']
            imp = Imputer(missing_values='NaN', strategy=strategy, axis=0)
            array = imp.fit_transform(dataframe.values)
            dataframe = pandas.DataFrame(array, columns=dataframe.columns)

        return dataframe
    except Exception as e:
        raise Exception("data_handlemissing: " + str(e))
def imputer_transform(data):
    imputer = Imputer()
    imputer.fit(data)
    return imputer.transform(data)
def imputator(features):
    """Fill in missing values with mean of the remaining samples

    Keyword arguments:
    features -- feature matrix
    """
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(features)
    return imp.transform(features)
def impute_data(self, x):
    """Imputes data set containing Nan values"""
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    return imp.fit_transform(x)
def test_grid_search_allows_nans():
    # Test GridSearchCV with Imputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
def test_permutation_test_score_allow_nans():
    # Check that permutation_test_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] / 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    permutation_test_score(p, X, y, cv=5)
def test_cross_val_score_allow_nans():
    # Check that cross_val_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] / 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cross_val_score(p, X, y, cv=5)
def test_cross_val_score_allow_nans():
    # Check that cross_val_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] / 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cval.cross_val_score(p, X, y, cv=5)
def test_permutation_test_score_allow_nans():
    # Check that permutation_test_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] / 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cval.permutation_test_score(p, X, y, cv=5)
def convert(model, input_features, output_features):
    """Convert an Imputer model to the protobuf spec.

    Parameters
    ----------
    model: Imputer
        A fitted Imputer model.

    input_features: str
        Name of the input column.

    output_features: str
        Name of the output column.

    Returns
    -------
    model_spec: An object of type Model_pb.
        Protobuf representation of the model
    """
    if not(_HAS_SKLEARN):
        raise RuntimeError('scikit-learn not found. scikit-learn conversion API is disabled.')

    # Set the interface params.
    spec = _Model_pb2.Model()
    spec.specificationVersion = SPECIFICATION_VERSION

    assert len(input_features) == 1
    assert isinstance(input_features[0][1], datatypes.Array)

    # feature name in and out are the same here
    spec = set_transform_interface_params(spec, input_features, output_features)

    # Test the scikit-learn model
    _sklearn_util.check_expected_type(model, Imputer)
    _sklearn_util.check_fitted(model, lambda m: hasattr(m, 'statistics_'))

    if model.axis != 0:
        raise ValueError("Imputation is only supported along axis = 0.")

    # The imputer in our framework only works on single columns, so
    # we need to translate that over. The easiest way to do that is to
    # put it in a nested pipeline with a feature extractor and a

    tr_spec = spec.imputer

    for v in model.statistics_:
        tr_spec.imputedDoubleArray.vector.append(v)

    try:
        tr_spec.replaceDoubleValue = float(model.missing_values)
    except ValueError:
        raise ValueError("Only scalar values or NAN as missing_values "
                         "in _imputer are supported.")

    return _MLModel(spec)
def impute_values(features, dt, sentinel):
    r"""Impute values for a given data type. The *median* strategy is applied
    for floating point values, and the *most frequent* strategy is applied for
    integer or Boolean values.

    Parameters
    ----------
    features : pandas.DataFrame
        Dataframe containing the features for imputation.
    dt : str
        The values ``'float64'``, ``'int64'``, or ``'bool'``.
    sentinel : float
        The number to be imputed for NaN values.

    Returns
    -------
    imputed_features : numpy array
        The features after imputation.

    Raises
    ------
    TypeError
        Data type ``dt`` is invalid for imputation.

    References
    ----------
    You can find more information on feature imputation here [IMP]_.

    .. [IMP] http://scikit-learn.org/stable/modules/preprocessing.html#imputation

    """
    try:
        nfeatures = features.shape[1]
    except:
        features = features.values.reshape(-1, 1)

    if dt == 'float64':
        imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    elif dt == 'int64' or dt == 'bool':
        imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    else:
        raise TypeError("Data Type %s is invalid for imputation" % dt)

    imputed = imp.fit_transform(features)
    if imputed.shape[1] == 0:
        nans = np.isnan(features)
        features[nans] = sentinel
        imputed_features = features
    else:
        imputed_features = imputed
    return imputed_features


#
# Function get_numerical_features
#
def prepareDataForClassification(dataset):
    """
    generates categorical output column, attach to dataframe
    label the categories and split into train and test
    """
    le = preprocessing.LabelEncoder()

    dataset['UpDown'] = dataset['Return_Out']
    dataset.UpDown[dataset.UpDown >= 0] = 'Up'
    dataset.UpDown[dataset.UpDown < 0] = 'Down'
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)

    features = dataset.columns[1:-1]
    X = dataset[features]
    y = dataset.UpDown
    # print y
    print X.shape
    # print y.shape
    # for i in range(len(X.columns)):
    #     print X.columns[i]
    # X.to_csv("X.csv", sep='\t', encoding='utf-8')
    # y.to_csv("y.csv", sep='\t', encoding='utf-8')
    # print X.iloc[2:5, 78:84]
    # X = X.fillna(X.mean())
    # print X.iloc[2:5, 78:84]
    # print X.index

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    # print X_train.iloc[2:5, 78:84]
    X_train = X_train.fillna(X_train.mean())
    X_test = X_test.fillna(X_test.mean())
    # print X_train.iloc[2:5, 78:84]
    # X_train.to_csv("X_train.csv", sep='\t', encoding='utf-8')
    # y_train.to_csv("y_train.csv", sep='\t', encoding='utf-8')

    # imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    # imp.fit(X_train)
    # # imp.fit(y_train)
    # imp.fit(X_test)
    # # imp.fit(y_test)
    # X_train = imp.fit_transform(X_train)
    # # y_train = imp.fit_transform(y_train)
    # X_test = imp.fit_transform(X_test)
    # # y_test = imp.fit_transform(y_test)

    # imp = Imputer(missing_values=0, strategy='mean', axis=0)
    # imp.fit(X_train)
    # # imp.fit(y_train)
    # imp.fit(X_test)
    # # imp.fit(y_test)
    # X_train = imp.fit_transform(X_train)
    # # y_train = imp.fit_transform(y_train)
    # X_test = imp.fit_transform(X_test)
    # y

    return X_train, y_train, X_test, y_test
def prepareDataForClassification(dataset):
    """
    generates categorical output column, attach to dataframe
    label the categories and split into train and test
    """
    le = preprocessing.LabelEncoder()

    dataset['UpDown'] = dataset['Return_Out']
    dataset.UpDown[dataset.UpDown >= 0] = 'Up'
    dataset.UpDown[dataset.UpDown < 0] = 'Down'
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)

    features = dataset.columns[1:-1]
    X = dataset[features]
    y = dataset.UpDown
    # print X.shape
    # print y.shape
    # print X.index

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(X_train)
    # imp.fit(y_train)
    imp.fit(X_test)
    # imp.fit(y_test)
    X_train = imp.fit_transform(X_train)
    # y_train = imp.fit_transform(y_train)
    X_test = imp.fit_transform(X_test)
    # y_test = imp.fit_transform(y_test)

    imp = Imputer(missing_values=0, strategy='mean', axis=0)
    imp.fit(X_train)
    # imp.fit(y_train)
    imp.fit(X_test)
    # imp.fit(y_test)
    X_train = imp.fit_transform(X_train)
    # y_train = imp.fit_transform(y_train)
    X_test = imp.fit_transform(X_test)
    # y

    return X_train, y_train, X_test, y_test
def affinity_propagation(location, location_callback):
    """Returns one or more clusters of a set of points, using an affinity
    propagation algorithm.
    The result is sorted with the first value being the largest cluster.

    Returns:
        A list of NamedTuples (see get_cluster_named_tuple for a definition
        of the tuple).
    """
    pts = location._tuple_points()
    if not pts:
        return None

    X = np.array(pts).reshape((len(pts), len(pts[0])))
    if np.any(np.isnan(X)) or not np.all(np.isfinite(X)):
        return None
    X = Imputer().fit_transform(X)
    X = X.astype(np.float32)

    afkwargs = {
        'damping': 0.5,
        'convergence_iter': 15,
        'max_iter': 200,
        'copy': True,
        'preference': None,
        'affinity': 'euclidean',
        'verbose': False,
    }
    af = AffinityPropagation(**afkwargs).fit(X)
    cluster_centers_indices = af.cluster_centers_indices_

    clusters = []
    for cluster_id, cluster_centre in enumerate(af.cluster_centers_):
        locations = []
        for j, label in enumerate(af.labels_):
            if not label == cluster_id:
                continue
            locations.append(location.locations[j])
        if not locations:
            continue
        clusters.append(cluster_named_tuple()(label=cluster_id,
                                              centroid=Point(cluster_centre),
                                              location=location_callback(locations)))

    return clusters
def model(self):
    # cname = sys._getframe().f_code.co_name
    cname = 'keras'
    train, y, test = self.train_, self.y_, self.test_

    np.random.seed(1234)

    train.drop('id', axis=1, inplace=True)
    test.drop('id', axis=1, inplace=True)

    from sklearn import pipeline
    pipe = pipeline.make_pipeline(preprocessing.Imputer(),
                                  preprocessing.RobustScaler())

    train = pipe.fit_transform(train)
    test = pipe.transform(test)

    self.input_dims_ = train.shape[1]

    def build_model():
        return self.build_keras_model()

    batch_size = self.batch_size_

    build_model().summary(line_length=120)

    ss = model_selection.StratifiedKFold(n_splits=self.num_splits_,
                                         random_state=11,
                                         shuffle=True)
    scores = list()
    model_path = self.temp_name('keras_mlp_weights')
    v, z = self.v_, self.z_
    v[cname] = 0
    z[cname] = 0
    for n, (itrain, ival) in enumerate(ss.split(train, y)):
        xtrain, xval = train[itrain], train[ival]
        ytrain, yval = y[itrain], y[ival]
        model = build_model()
        model.fit(
            xtrain, ytrain,
            batch_size=batch_size,
            epochs=10000,
            validation_data=(xval, yval),
            verbose=0,
            callbacks=build_keras_fit_callbacks(model_path),
            shuffle=True
        )
        model.load_weights(model_path)
        p = model.predict(xval)
        v.loc[ival, cname] += p.ravel()
        score = metrics.log_loss(y[ival], p)
        if score != score:
            raise Exception('NaN score!!!')
        print(cname, 'fold %d: ' % (n + 1), score, self.now())
        scores.append(score)
        z[cname] += model.predict(test).ravel()
        del model
        for i in range(3):
            gc.collect(i)

    print('scores:', scores, np.mean(scores), np.std(scores))
    self.drop_temp(model_path)
    cv = np.mean(scores)
    z[cname] /= self.num_splits_
    z['y'] = z[cname]
    return cv, None
def gensim_classifier():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # split all sentences to list of words
    sentences = []
    for tweet in tweet_list:
        temp_doc = tweet.split()
        sentences.append(temp_doc)

    # parameters for model
    num_features = 100
    min_word_count = 1
    num_workers = 4
    context = 2
    downsampling = 1e-3

    # Initialize and train the model
    w2v_model = Word2Vec(sentences, workers=num_workers,
                         size=num_features, min_count=min_word_count,
                         window=context, sample=downsampling, seed=1)

    index_value, train_set, test_set = train_test_split(0.80, sentences)
    train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
    test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)
    train_vector = Imputer().fit_transform(train_vector)
    test_vector = Imputer().fit_transform(test_vector)

    # train model and predict
    model = LinearSVC()
    classifier_fitted = OneVsRestClassifier(model).fit(train_vector, label_list[:index_value])
    result = classifier_fitted.predict(test_vector)

    # output result to csv
    create_directory('data')
    result.tofile("data/w2v_linsvc.csv", sep=',')

    # store the model to mmap-able files
    create_directory('model')
    joblib.dump(model, 'model/%s.pkl' % 'w2v_linsvc')

    # evaluation
    label_score = classifier_fitted.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(label_list, classes=class_list)

    evaluate(binarise_result, binarise_labels[index_value:], label_score, 'w2v_linsvc')