我们从Python开源项目中,提取了以下13个代码示例,用于说明如何使用 sklearn.cross_validation 模块(注意:它是一个模块,而非可直接调用的函数)。
def train_and_calibrate_cv(model, X_tr, y_tr, cv=5):
    """Build out-of-fold predictions, a full-data model, and a calibrator.

    Fits a clone of ``model`` on each of ``cv`` stratified folds and collects
    out-of-fold predicted probabilities, then refits a clone on the full
    training data and derives a probability-calibration function from the
    out-of-fold predictions.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict_proba``.
    X_tr : array-like, shape (n_samples, n_features)
        Training features.
    y_tr : array-like, shape (n_samples,)
        Binary training labels.
    cv : int
        Number of stratified folds (default 5).

    Returns
    -------
    (model_copy, calib_func)
        The full-data fitted clone and the calibration function.
    """
    y_pred_xval = np.zeros(len(y_tr))
    skf = cross_validation.StratifiedKFold(y_tr, n_folds=cv, shuffle=True)
    for i, (train, test) in enumerate(skf, start=1):
        print("training fold {} of {}".format(i, cv))
        X_train_xval = np.array(X_tr)[train, :]
        X_test_xval = np.array(X_tr)[test, :]
        y_train_xval = np.array(y_tr)[train]
        # We could also copy the model first and then fit it
        model_copy = clone(model)
        model_copy.fit(X_train_xval, y_train_xval)
        # BUG FIX: predict with the fold-fitted clone (``model_copy``), not
        # the original, never-fitted ``model``.
        y_pred_xval[test] = model_copy.predict_proba(X_test_xval)[:, 1]
    print("training full model")
    model_copy = clone(model)
    model_copy.fit(X_tr, y_tr)
    print("calibrating function")
    calib_func = prob_calibration_function(y_tr, y_pred_xval)
    return model_copy, calib_func
def _cv_len(cv, X, y):
    """Return the number of CV splits, across sklearn versions.

    sklearn-0.17 partition iterators support ``len(cv)``; sklearn-0.18
    cross-validators expose ``get_n_splits(X, y)`` instead. The module-level
    ``SK18`` flag selects the right call.

    Parameters
    ----------
    cv : `sklearn.cross_validation._PartitionIterator` or \
         `sklearn.model_selection.BaseCrossValidator`
        The cv object whose split count is wanted.
    X : pd.DataFrame or np.ndarray, shape (n_samples, n_features)
        Data being fit in the grid search.
    y : np.ndarray, shape (n_samples,)
        Target being fit in the grid search.

    Returns
    -------
    int
    """
    if SK18:
        return cv.get_n_splits(X, y)
    return len(cv)
def best_shape_clustering(mols, nb_layers, k_range=range(3, 20), train_ratio=0.8, cluster_key='shape_cid'):
    """Select the cluster count k that maximizes the silhouette score.

    Computes a temporal-shape descriptor for each molecule, clusters the
    shapes for every k in ``k_range`` (centroids fit on a random training
    subset, labels assigned to the full set), and keeps the clustering
    with the best silhouette score. The winning labels are stored on
    ``mols`` under ``cluster_key``.

    Parameters
    ----------
    mols : pd.DataFrame-like with a 'dynamic' column
    nb_layers : int
        Passed through to ``temporal_shape``.
    k_range : iterable of int
        Candidate cluster counts (default 3..19).
    train_ratio : float
        Fraction of samples used to fit the centroids.
    cluster_key : str
        Column name under which the labels are stored.

    Returns
    -------
    (mols, centroids)
    """
    from sklearn.cross_validation import train_test_split
    from sklearn.metrics import silhouette_score

    shape_df = mols['dynamic'].apply(lambda x: temporal_shape(x, nb_layers))
    train_idx, test_idx = train_test_split(shape_df.index.values, train_size=train_ratio)
    train_mat = np.array(list(shape_df[shape_df.index.isin(train_idx)].values))
    full_mat = np.array(list(shape_df.values))

    centroids = None
    labels = None
    # BUG FIX: silhouette scores lie in [-1, 1]; starting the running best
    # at 0 could leave ``labels``/``centroids`` as None whenever every
    # candidate k scores <= 0. Start at -inf so the best k always wins.
    best_score = -np.inf
    for k in k_range:
        res = cluster_shapes(train_mat, full_mat, k)
        score = silhouette_score(full_mat, res[1])
        if score > best_score:
            centroids = res[0]
            labels = res[1]
            best_score = score
    mols[cluster_key] = labels
    return mols, centroids
def splitValidateModel(self, visualizePredictions = False):
    """Evaluate a k-NN classifier with a single train/test split.

    Loads features from ``self.featureFile``, holds out
    ``1 - self.percentSplit`` of the samples, fits a distance-weighted
    k-NN classifier and prints a classification report and confusion
    matrix. Optionally visualizes the predictions.
    """
    (label_vector, input_vector) = loadData(self.featureFile)
    sample_indices = range(0, len(input_vector))
    holdout_fraction = 1.0 - self.percentSplit
    split_result = cross_validation.train_test_split(
        input_vector, label_vector, sample_indices, test_size=holdout_fraction)
    (trainData, testData, trainLabels, expectedLabels,
     trainIndices, testIndices) = split_result

    knn = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
    knn.fit(trainData, trainLabels)
    predictedLabels = knn.predict(testData)

    print("Classification report for classifier %s:\n%s\n" % (
        'k-NearestNeighbour',
        metrics.classification_report(expectedLabels, predictedLabels)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(expectedLabels, predictedLabels))
    print('Split Validation training :: Done.\n')

    if visualizePredictions:
        self.__visualizePredictedDataset__(input_vector, testIndices, predictedLabels, expectedLabels)
def trainLimited(self, featureFile, n_datapoints): (label_vector, input_vector) = loadData(featureFile) trainData, testData, trainLabels, testLabels = \ cross_validation.train_test_split(input_vector, label_vector, test_size=(0)) n_totalrows = int((len(label_vector)/n_datapoints)) for n in range(0, n_totalrows): limited_label_vector = trainLabels[0: (n+1) * n_datapoints] limited_input_vector = trainData[0: (n+1) * n_datapoints] kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance') kNNClassifier.fit(limited_input_vector, limited_label_vector) scores = cross_validation.cross_val_score(kNNClassifier, limited_input_vector, limited_label_vector, cv = 5) print '%f on %d datapoints' % ((sum(scores) / len(scores)), len(limited_label_vector))
def _set_cv(cv, X, y, classifier):
    """Build a CV object appropriate for the installed sklearn version.

    Returns either a `sklearn.cross_validation._PartitionIterator`
    (sklearn-0.17) or a `sklearn.model_selection.BaseCrossValidator`
    (sklearn-0.18), because ``check_cv`` changed its signature between
    the two releases.

    Parameters
    ----------
    cv : int, `_PartitionIterator` or `BaseCrossValidator`
        The CV object or int to check. If an int, will be converted into
        the appropriate class of crossvalidator.
    X : pd.DataFrame or np.ndarray, shape (n_samples, n_features)
        The dataframe or np.ndarray being fit in the grid search.
    y : np.ndarray, shape (n_samples,)
        The target being fit in the grid search.
    classifier : bool
        Whether the estimator being fit is a classifier.

    Returns
    -------
    `_PartitionIterator` or `BaseCrossValidator`
    """
    if SK18:
        # sklearn >= 0.18 dropped X from check_cv's signature.
        return check_cv(cv, y, classifier)
    return check_cv(cv, X, y, classifier)
def shuffle_data(data, labels):
    """Return ``data`` and ``labels`` shuffled in unison (fixed seed 42).

    Uses ``train_test_split`` with ``test_size=0.0`` purely for its
    deterministic shuffling; the (empty) test halves are discarded.
    """
    shuffled_data, _, shuffled_labels, _ = sklearn.cross_validation.train_test_split(
        data, labels, test_size=0.0, random_state=42)
    return shuffled_data, shuffled_labels
def crossValidateModel(self):
    """Score a distance-weighted k-NN classifier with 5-fold CV.

    Loads features from ``self.featureFile`` and prints the per-fold
    scores plus their average.
    """
    (label_vector, input_vector) = loadData(self.featureFile)
    n_folds = 5
    classifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
    scores = cross_validation.cross_val_score(
        classifier, input_vector, label_vector, cv = n_folds)
    print("\n----- k-fold Cross Validation -----")
    print(scores)
    print("Average: ", sum(scores) / len(scores))
def trainLimitedMLP(self, featureFile, n_datapoints): (label_vector, input_vector) = self.__loadData__(featureFile) n_totalrows = int((len(label_vector)/n_datapoints)) k=[] for n in range(0, n_totalrows): trainData, testData, trainLabels, testLabels = \ cross_validation.train_test_split(input_vector, label_vector, test_size=(0.2)) limited_label_vector = trainLabels[0: (n+1) * n_datapoints] limited_input_vector = trainData[0: (n+1) * n_datapoints] average = [] for a in range(0,5): _, maxVal = self.trainMLPWithData(limited_input_vector, limited_label_vector, 1000) average.append(maxVal) averageMaxVal = sum(average) / len(average) print 'Total Average Value: %s \n\n' % (averageMaxVal) average = [] k.append(averageMaxVal) print('Limited MLP training result -------------') for i in range (0,len(k)): print '%f on %d datapoints' % (k[i], n_datapoints * (i+1)) print '------------------------------------------'
def _cross_val(data, est, cv, n_jobs):
    """Helper to compute cross validation (mean score across folds)."""
    try:
        # sklearn >= 0.18
        from sklearn.model_selection import cross_val_score
    except ImportError:
        # XXX support sklearn < 0.18
        from sklearn.cross_validation import cross_val_score
    fold_scores = cross_val_score(est, data, cv=cv, n_jobs=n_jobs,
                                  scoring=_gaussian_loglik_scorer)
    return np.mean(fold_scores)
def _set_cv(cv, estimator=None, X=None, y=None):
    """Set the default CV depending on whether clf is classifier/regressor.

    Parameters
    ----------
    cv : int, str or cross-validation object
        If an int, the number of folds; if a str, the name of a CV class
        in ``sklearn.model_selection`` (or ``sklearn.cross_validation``
        on older sklearn); otherwise an already-constructed CV object.
    estimator : str or estimator, optional
        Either the literal 'classifier'/'regressor', or an estimator whose
        kind is detected with ``is_classifier``.
    X : array-like, optional
        Data being fitted (used to materialize the splits).
    y : array-like, optional
        Target (used for stratification and to size the folds).

    Returns
    -------
    cv : cross-validation object
    cv_splits : list of (train, test) index arrays
        Splits extracted eagerly so they can be reused at predict time.

    Raises
    ------
    ValueError
        If ``cv`` names an unknown CV class, or a fold has no train epochs.
    NotImplementedError
        If a str ``cv`` other than KFold/LeaveOneOut is used on old sklearn.
    """
    # Detect whether classification or regression
    if estimator in ['classifier', 'regressor']:
        est_is_classifier = estimator == 'classifier'
    else:
        est_is_classifier = is_classifier(estimator)
    # Setup CV
    if check_version('sklearn', '0.18'):
        from sklearn import model_selection as models
        from sklearn.model_selection import (check_cv, StratifiedKFold, KFold)
        # BUG FIX: ``np.int`` (a deprecated alias of plain ``int``) was
        # removed in numpy >= 1.24; ``np.integer`` matches numpy integer
        # scalars while ``int`` covers Python ints, so behaviour is
        # preserved on old numpy and restored on new numpy.
        if isinstance(cv, (int, np.integer)):
            XFold = StratifiedKFold if est_is_classifier else KFold
            cv = XFold(n_splits=cv)
        elif isinstance(cv, str):
            if not hasattr(models, cv):
                raise ValueError('Unknown cross-validation')
            cv = getattr(models, cv)
            cv = cv()
        cv = check_cv(cv=cv, y=y, classifier=est_is_classifier)
    else:
        from sklearn import cross_validation as models
        from sklearn.cross_validation import (check_cv, StratifiedKFold, KFold)
        if isinstance(cv, (int, np.integer)):
            if est_is_classifier:
                cv = StratifiedKFold(y=y, n_folds=cv)
            else:
                cv = KFold(n=len(y), n_folds=cv)
        elif isinstance(cv, str):
            if not hasattr(models, cv):
                raise ValueError('Unknown cross-validation')
            cv = getattr(models, cv)
            if cv.__name__ not in ['KFold', 'LeaveOneOut']:
                # NOTE(review): '.017' in this message looks like a typo for
                # '0.17'; message left unchanged to preserve behaviour.
                raise NotImplementedError('CV cannot be defined with str'
                                          ' for sklearn < .017.')
            cv = cv(len(y))
        cv = check_cv(cv=cv, X=X, y=y, classifier=est_is_classifier)
    # Extract train and test set to retrieve them at predict time
    if hasattr(cv, 'split'):
        cv_splits = [(train, test) for train, test in
                     cv.split(X=np.zeros_like(y), y=y)]
    else:
        # XXX support sklearn.cross_validation cv
        cv_splits = [(train, test) for train, test in cv]
    if not np.all([len(train) for train, _ in cv_splits]):
        raise ValueError('Some folds do not have any train epochs.')
    return cv, cv_splits
def trainMLPWithData(self, input_vector, label_vector, printSteps = 250):
    """Train a one-hidden-layer MLP (TensorFlow v1 graph API) and return it.

    Splits the data 70/30, builds a 10-in / 10-hidden / 8-out network with
    dropout, trains with RMSProp for 10000 steps, and records test accuracy
    at every step. Stores the session on ``self.trainedModel``.

    Parameters: printSteps controls how often progress is printed.
    Returns: (self.trainedModel, best_accuracy) — the tf.Session and the
    maximum test accuracy seen across all steps.
    """
    percent_split = 0.7
    trX, teX, trY, teY = cross_validation.train_test_split(input_vector, label_vector, test_size=(1.0-percent_split), random_state=0)
    # Fixed network dimensions: 10 input features, 8 output classes.
    n_inputs = 10
    n_outputs = 8
    X = tf.placeholder("float", [None, n_inputs])
    Y = tf.placeholder("float", [None, n_outputs])
    w_h = tf.Variable(tf.random_normal([n_inputs, 10], stddev=0.01))
    w_o = tf.Variable(tf.random_normal([10, n_outputs], stddev=0.01))
    p_keep_input = tf.placeholder("float")
    p_keep_hidden = tf.placeholder("float")
    # NOTE(review): X is rebound to the dropout OUTPUT tensor here, and that
    # same tensor is later used as the feed_dict key — feeding it overrides
    # its computation, so the input dropout is presumably never applied;
    # confirm intent before relying on p_keep_input.
    X = tf.nn.dropout(X, p_keep_input)
    h = tf.nn.relu(tf.matmul(X, w_h))
    h = tf.nn.dropout(h, p_keep_hidden)
    py_x = tf.matmul(h, w_o)
    # NOTE(review): learnRate is unused; the optimizer below hard-codes 0.001.
    learnRate = 0.01
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(py_x, Y))
    train_step = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cost)
    #train_step = tf.train.GradientDescentOptimizer(learnRate).minimize(cost)
    # Add accuracy checking nodes
    # NOTE(review): accuracy compares against the constant teY rather than
    # the Y placeholder; equivalent here because Y is always fed teY when
    # this node is evaluated, but inconsistent — verify before reuse.
    tf_correct_prediction = tf.equal(tf.argmax(py_x,1), tf.argmax(teY,1))
    tf_accuracy = tf.reduce_mean(tf.cast(tf_correct_prediction, "float"))
    # Init variables
    init = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init)
    # k accumulates the per-step test accuracy.
    k=[]
    for i in range(10000):
        # One optimization step on the full training split, with dropout on.
        sess.run(train_step, feed_dict={X: trX, Y: trY, p_keep_input: 0.8, p_keep_hidden: 0.5})
        # Evaluate test accuracy with dropout disabled (keep prob 1.0).
        result = sess.run(tf_accuracy, feed_dict={X: teX, Y: teY, p_keep_input: 1.0, p_keep_hidden: 1.0})
        # Save data
        k.append(result)
        if (i % printSteps == 0):
            print("Run {},{}".format(i,result))
    k=np.array(k)
    print("Max accuracy: {}".format(k.max()))
    print(('MLP training with %s datapoints :: Done \n\n') % (len(input_vector)))
    self.trainedModel = sess
    return (self.trainedModel, k.max())
def rand_forest_train(self):
    """Train tree/forest/boosting classifiers and score unlabelled data.

    Reads labelled samples from 'names.csv', evaluates a decision tree,
    a random forest and a gradient-boosting classifier on a 25% holdout,
    prints accuracy and classification reports, then predicts on
    'values.csv' with the random forest. Fitted models are stored on
    ``self.dtc`` / ``self.rfc`` / ``self.gbc``.
    """
    # Load labelled training data.
    users = pd.read_csv('names.csv')
    # Feature columns and binary target.
    X = users[['similarity', 'platform', 'reputation', 'entropy']]
    y = users['human_or_machine']

    # Hold out 25% of the samples for evaluation.
    from sklearn.cross_validation import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

    # One-hot/vectorize the per-row feature dicts.
    from sklearn.feature_extraction import DictVectorizer
    vec = DictVectorizer(sparse=False)
    # BUG FIX: the canonical orient is 'records'; the abbreviated 'record'
    # was deprecated and removed in pandas 2.0.
    X_train = vec.fit_transform(X_train.to_dict(orient='records'))
    X_test = vec.transform(X_test.to_dict(orient='records'))

    # Single decision tree.
    from sklearn.tree import DecisionTreeClassifier
    dtc = DecisionTreeClassifier()
    dtc.fit(X_train, y_train)
    dtc_y_pred = dtc.predict(X_test)

    # Random forest ensemble.
    from sklearn.ensemble import RandomForestClassifier
    rfc = RandomForestClassifier()
    rfc.fit(X_train, y_train)
    rfc_y_pred = rfc.predict(X_test)

    # Gradient boosting ensemble.
    from sklearn.ensemble import GradientBoostingClassifier
    gbc = GradientBoostingClassifier()
    gbc.fit(X_train, y_train)
    gbc_y_pred = gbc.predict(X_test)

    from sklearn.metrics import classification_report
    # Accuracy plus precision/recall/F1 for each model.
    # FIX: the original print labels were mojibake ('??????...'); replaced
    # with descriptive English labels.
    print("Decision tree accuracy:", dtc.score(X_test, y_test))
    print(classification_report(dtc_y_pred, y_test))
    print("Random forest accuracy:", rfc.score(X_test, y_test))
    print(classification_report(rfc_y_pred, y_test))
    print("Gradient boosting accuracy:", gbc.score(X_test, y_test))
    print(classification_report(gbc_y_pred, y_test))

    # Score unlabelled data with the random forest.
    users = pd.read_csv('values.csv')
    X = users[['similarity', 'platform', 'reputation', 'entropy']]
    X = vec.transform(X.to_dict(orient='records'))
    print(rfc.predict(X))

    self.dtc = dtc
    self.rfc = rfc
    self.gbc = gbc