Python sklearn 模块,cross_validation() 实例源码

我们从Python开源项目中,提取了以下13个代码示例,用于说明如何使用sklearn.cross_validation()

项目:introspective    作者:numeristical    | 项目源码 | 文件源码
def train_and_calibrate_cv(model, X_tr, y_tr, cv=5):
    """Build out-of-fold probability predictions with stratified CV,
    fit a calibration function on them, then refit the model on the
    full training set.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict_proba``
        The model to train and calibrate. It is cloned per fold.
    X_tr : array-like, shape (n_samples, n_features)
        Training features.
    y_tr : array-like, shape (n_samples,)
        Binary target; positive-class probabilities are taken from
        column 1 of ``predict_proba``.
    cv : int, optional (default=5)
        Number of stratified folds.

    Returns
    -------
    (model, calib_func)
        The model refit on all of ``X_tr``/``y_tr`` and the calibration
        function produced by ``prob_calibration_function``.
    """
    y_pred_xval = np.zeros(len(y_tr))
    skf = cross_validation.StratifiedKFold(y_tr, n_folds=cv, shuffle=True)
    # Convert once, outside the fold loop.
    X_arr = np.array(X_tr)
    y_arr = np.array(y_tr)
    for i, (train, test) in enumerate(skf, start=1):
        print("training fold {} of {}".format(i, cv))
        # Clone so each fold trains a fresh, independent model.
        model_copy = clone(model)
        model_copy.fit(X_arr[train, :], y_arr[train])
        # BUG FIX: predict with the fold model that was just fit;
        # the original called predict_proba on the unfitted `model`.
        y_pred_xval[test] = model_copy.predict_proba(X_arr[test, :])[:, 1]
    print("training full model")
    model_copy = clone(model)
    model_copy.fit(X_tr, y_tr)
    print("calibrating function")
    calib_func = prob_calibration_function(y_tr, y_pred_xval)
    return model_copy, calib_func
项目:skutil    作者:tgsmith61591    | 项目源码 | 文件源码
def _cv_len(cv, X, y):
    """Return the number of folds in ``cv``, agnostic of whether
    sklearn-0.17 or sklearn-0.18 is installed.

    Parameters
    ----------

    cv : `sklearn.cross_validation._PartitionIterator` or `sklearn.model_selection.BaseCrossValidator`
        The cross-validation object whose length is wanted. Under
        sklearn-0.17 this is simply ``len(cv)``; under sklearn-0.18
        it must be asked via ``cv.get_n_splits(X, y)``.

    X : pd.DataFrame or np.ndarray, shape(n_samples, n_features)
        The dataframe or np.ndarray being fit in the grid search.

    y : np.ndarray, shape(n_samples,)
        The target being fit in the grid search.

    Returns
    -------

    int
        The number of CV splits.
    """
    # SK18 is a module-level flag for the installed sklearn version.
    if SK18:
        return cv.get_n_splits(X, y)
    return len(cv)
项目:sptgraph    作者:epfl-lts2    | 项目源码 | 文件源码
def best_shape_clustering(mols, nb_layers, k_range=range(3, 20), train_ratio=0.8, cluster_key='shape_cid'):
    """Cluster temporal shapes for several values of k and keep the
    clustering with the best silhouette score.

    Parameters
    ----------
    mols : DataFrame-like with a 'dynamic' column
        The molecules; ``cluster_key`` is written back with the labels.
    nb_layers : int
        Passed through to ``temporal_shape``.
    k_range : iterable of int, optional
        Candidate numbers of clusters to try.
    train_ratio : float, optional
        Fraction of shapes used to fit the centroids.
    cluster_key : str, optional
        Column name under which labels are stored in ``mols``.

    Returns
    -------
    (mols, centroids)
        ``mols`` with the label column set, and the winning centroids.
    """
    from sklearn.cross_validation import train_test_split
    from sklearn.metrics import silhouette_score

    shape_df = mols['dynamic'].apply(lambda x: temporal_shape(x, nb_layers))
    train_idx, test_idx = train_test_split(shape_df.index.values, train_size=train_ratio)

    train_mat = np.array(list(shape_df[shape_df.index.isin(train_idx)].values))
    full_mat = np.array(list(shape_df.values))

    centroids = None
    labels = None
    # BUG FIX: silhouette scores lie in [-1, 1]. The original seeded
    # best_score with 0, so a run where every k scored <= 0 returned
    # labels=None and centroids=None. Seed with None and always accept
    # the first result instead.
    best_score = None
    for k in k_range:
        res = cluster_shapes(train_mat, full_mat, k)
        score = silhouette_score(full_mat, res[1])
        if best_score is None or score > best_score:
            centroids = res[0]
            labels = res[1]
            best_score = score

    mols[cluster_key] = labels
    return mols, centroids
项目:static-gesture-recognition    作者:windmark    | 项目源码 | 文件源码
def splitValidateModel(self, visualizePredictions = False):
    """Validate a k-NN classifier with a single train/test split.

    Loads the feature file, splits it according to ``self.percentSplit``,
    fits a distance-weighted k-NN model, and prints a classification
    report plus a confusion matrix. Optionally visualizes predictions.
    """
    (label_vector, input_vector) = loadData(self.featureFile)

    # Keep original sample indices so predictions can be mapped back.
    sample_indices = range(0, len(input_vector))
    split_result = cross_validation.train_test_split(
        input_vector, label_vector, sample_indices,
        test_size=(1.0 - self.percentSplit))
    trainData, testData, trainLabels, expectedLabels, trainIndices, testIndices = split_result

    classifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
    classifier.fit(trainData, trainLabels)
    predictedLabels = classifier.predict(testData)

    print("Classification report for classifier %s:\n%s\n"
          % ('k-NearestNeighbour', metrics.classification_report(expectedLabels, predictedLabels)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(expectedLabels, predictedLabels))
    print('Split Validation training :: Done.\n')

    if visualizePredictions:
        self.__visualizePredictedDataset__(input_vector, testIndices, predictedLabels, expectedLabels)
项目:static-gesture-recognition    作者:windmark    | 项目源码 | 文件源码
def trainLimited(self, featureFile, n_datapoints):
    """Measure k-NN cross-validation accuracy on growing data subsets.

    Loads the feature file, then for each multiple of ``n_datapoints``
    fits a distance-weighted k-NN classifier on that prefix of the data
    and prints its 5-fold CV accuracy (learning-curve style).

    Parameters
    ----------
    featureFile : str
        Path handed to ``loadData``.
    n_datapoints : int
        Step size of the subset growth.
    """
    (label_vector, input_vector) = loadData(featureFile)

    # test_size=0: keep everything in the train partition (the split is
    # used only to shuffle the samples).
    trainData, testData, trainLabels, testLabels = \
        cross_validation.train_test_split(input_vector, label_vector, test_size=(0))

    n_totalrows = int((len(label_vector)/n_datapoints))
    for n in range(0, n_totalrows):
        limited_label_vector = trainLabels[0: (n+1) * n_datapoints]
        limited_input_vector = trainData[0: (n+1) * n_datapoints]

        kNNClassifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
        kNNClassifier.fit(limited_input_vector, limited_label_vector)

        scores = cross_validation.cross_val_score(kNNClassifier, limited_input_vector, limited_label_vector, cv = 5)
        # FIX: use the print() function — the original Python-2 print
        # statement is a SyntaxError under Python 3, and the rest of
        # this method's siblings already use print().
        print('%f on %d datapoints' % ((sum(scores) / len(scores)), len(limited_label_vector)))
项目:skutil    作者:tgsmith61591    | 项目源码 | 文件源码
def _set_cv(cv, X, y, classifier):
    """Build a CV object appropriate for the installed sklearn version.

    Returns either a `sklearn.cross_validation._PartitionIterator`
    (sklearn-0.17) or a `sklearn.model_selection.BaseCrossValidator`
    (sklearn-0.18), delegating to the matching ``check_cv`` signature.

    Parameters
    ----------

    cv : int, `_PartitionIterator` or `BaseCrossValidator`
        The CV object or int to check. If an int, will be converted
        into the appropriate class of crossvalidator.

    X : pd.DataFrame or np.ndarray, shape(n_samples, n_features)
        The dataframe or np.ndarray being fit in the grid search.

    y : np.ndarray, shape(n_samples,)
        The target being fit in the grid search.

    classifier : bool
        Whether the estimator being fit is a classifier

    Returns
    -------

    `_PartitionIterator` or `BaseCrossValidator`
    """
    # sklearn-0.18 dropped X from the check_cv signature.
    if SK18:
        return check_cv(cv, y, classifier)
    return check_cv(cv, X, y, classifier)
项目:cifar100-datasets-caffe    作者:bikong2    | 项目源码 | 文件源码
def shuffle_data(data, labels):
    """Shuffle ``data`` and ``labels`` in unison.

    Uses train_test_split with test_size=0.0 purely for its shuffling
    side effect; the fixed random_state makes the order reproducible.
    """
    shuffled_data, _, shuffled_labels, _ = sklearn.cross_validation.train_test_split(
        data, labels, test_size=0.0, random_state=42)
    return shuffled_data, shuffled_labels
项目:static-gesture-recognition    作者:windmark    | 项目源码 | 文件源码
def crossValidateModel(self):
    """Run 5-fold cross-validation of a distance-weighted k-NN
    classifier on the feature file and print the per-fold scores
    plus their average."""
    (label_vector, input_vector) = loadData(self.featureFile)
    n_folds = 5

    classifier = neighbors.KNeighborsClassifier(self.n_neighbors, weights='distance')
    fold_scores = cross_validation.cross_val_score(classifier, input_vector, label_vector, cv = n_folds)

    print("\n----- k-fold Cross Validation -----")
    print(fold_scores)
    print("Average: ", sum(fold_scores) / len(fold_scores))
项目:static-gesture-recognition    作者:windmark    | 项目源码 | 文件源码
def trainLimitedMLP(self, featureFile, n_datapoints):
    """Train MLPs on growing data subsets and report accuracy per size.

    For each multiple of ``n_datapoints``, draws a fresh 80/20 split,
    trains the MLP five times on that prefix of the training data via
    ``self.trainMLPWithData``, and records the mean of the five max
    accuracies. Prints a summary at the end (learning-curve style).

    Parameters
    ----------
    featureFile : str
        Path handed to ``self.__loadData__``.
    n_datapoints : int
        Step size of the subset growth.
    """
    (label_vector, input_vector) = self.__loadData__(featureFile)

    n_totalrows = int((len(label_vector)/n_datapoints))
    k = []
    for n in range(0, n_totalrows):
        # Re-split every iteration so each subset sees a fresh shuffle.
        trainData, testData, trainLabels, testLabels = \
            cross_validation.train_test_split(input_vector, label_vector, test_size=(0.2))

        limited_label_vector = trainLabels[0: (n+1) * n_datapoints]
        limited_input_vector = trainData[0: (n+1) * n_datapoints]

        # Average the best accuracy over 5 independent training runs to
        # smooth out initialization noise.
        average = []
        for a in range(0, 5):
            _, maxVal = self.trainMLPWithData(limited_input_vector, limited_label_vector, 1000)
            average.append(maxVal)

        averageMaxVal = sum(average) / len(average)
        # FIX: print() function — the original Python-2 print statements
        # in this method are SyntaxErrors under Python 3, while sibling
        # methods in the file already use print().
        print('Total Average Value: %s \n\n' % (averageMaxVal))
        average = []
        k.append(averageMaxVal)

    print('Limited MLP training result -------------')
    for i in range(0, len(k)):
        print('%f on %d datapoints' % (k[i], n_datapoints * (i+1)))
    print('------------------------------------------')
项目:decoding_challenge_cortana_2016_3rd    作者:kingjr    | 项目源码 | 文件源码
def _cross_val(data, est, cv, n_jobs):
    """Helper to compute cross validation."""
    try:
        from sklearn.model_selection import cross_val_score
    except ImportError:
        # XXX support sklearn < 0.18
        from sklearn.cross_validation import cross_val_score
    fold_scores = cross_val_score(est, data, cv=cv, n_jobs=n_jobs,
                                  scoring=_gaussian_loglik_scorer)
    return np.mean(fold_scores)
项目:pyglmnet    作者:glm-tools    | 项目源码 | 文件源码
def _set_cv(cv, estimator=None, X=None, y=None):
        """Set the default CV depending on whether clf
           is classifier/regressor.

        Builds an sklearn CV object from ``cv`` (int, string name, or an
        existing CV object) using whichever API the installed sklearn
        provides (>= 0.18 uses ``model_selection``; older versions use
        ``cross_validation``), then materializes the (train, test) index
        pairs so they can be reused at predict time.

        Returns the tuple ``(cv, cv_splits)``.
        """
        # Detect whether classification or regression
        if estimator in ['classifier', 'regressor']:
            est_is_classifier = estimator == 'classifier'
        else:
            est_is_classifier = is_classifier(estimator)
        # Setup CV
        if check_version('sklearn', '0.18'):
            # sklearn >= 0.18: model_selection API (n_splits, cv.split()).
            from sklearn import model_selection as models
            from sklearn.model_selection import (check_cv,
                                                 StratifiedKFold, KFold)
            # NOTE(review): `np.int` is removed in numpy >= 1.24; this
            # isinstance check would raise AttributeError there — confirm
            # the supported numpy range.
            if isinstance(cv, (int, np.int)):
                # Stratify folds only for classifiers.
                XFold = StratifiedKFold if est_is_classifier else KFold
                cv = XFold(n_splits=cv)
            elif isinstance(cv, str):
                # A string names a CV class inside sklearn.model_selection.
                if not hasattr(models, cv):
                    raise ValueError('Unknown cross-validation')
                cv = getattr(models, cv)
                cv = cv()
            cv = check_cv(cv=cv, y=y, classifier=est_is_classifier)
        else:
            # sklearn < 0.18: legacy cross_validation API (n_folds,
            # iterable CV objects that take y / n at construction).
            from sklearn import cross_validation as models
            from sklearn.cross_validation import (check_cv,
                                                  StratifiedKFold, KFold)
            if isinstance(cv, (int, np.int)):
                if est_is_classifier:
                    cv = StratifiedKFold(y=y, n_folds=cv)
                else:
                    cv = KFold(n=len(y), n_folds=cv)
            elif isinstance(cv, str):
                if not hasattr(models, cv):
                    raise ValueError('Unknown cross-validation')
                cv = getattr(models, cv)
                # Only these two legacy classes can be built from len(y)
                # alone.
                if cv.__name__ not in ['KFold', 'LeaveOneOut']:
                    raise NotImplementedError('CV cannot be defined with str'
                                              ' for sklearn < .017.')
                cv = cv(len(y))
            cv = check_cv(cv=cv, X=X, y=y, classifier=est_is_classifier)

        # Extract train and test set to retrieve them at predict time
        if hasattr(cv, 'split'):
            # New API: splits come from cv.split(); a dummy X of the same
            # length as y is sufficient since only indices are produced.
            cv_splits = [(train, test) for train, test in
                         cv.split(X=np.zeros_like(y), y=y)]
        else:
            # XXX support sklearn.cross_validation cv
            cv_splits = [(train, test) for train, test in cv]

        # Every fold must contribute at least one training sample.
        if not np.all([len(train) for train, _ in cv_splits]):
            raise ValueError('Some folds do not have any train epochs.')

        return cv, cv_splits
项目:static-gesture-recognition    作者:windmark    | 项目源码 | 文件源码
def trainMLPWithData(self, input_vector, label_vector, printSteps = 250):
    """Train a one-hidden-layer MLP (TensorFlow 1.x graph API) on the
    given feature vectors.

    Splits the data 70/30, trains with RMSProp for 10000 steps while
    tracking test accuracy, prints progress every ``printSteps`` steps,
    and returns ``(session, max_accuracy)``. The live session is also
    stored on ``self.trainedModel``.
    """
    percent_split = 0.7
    trX, teX, trY, teY = cross_validation.train_test_split(input_vector, 
              label_vector, test_size=(1.0-percent_split), random_state=0)

    # Fixed network dimensions: 10 input features, 8 output classes.
    # NOTE(review): assumes labels are already one-hot with 8 columns —
    # confirm against the caller.
    n_inputs = 10
    n_outputs = 8

    X = tf.placeholder("float", [None, n_inputs])
    Y = tf.placeholder("float", [None, n_outputs])

    # One hidden layer of 10 units; small random init.
    w_h = tf.Variable(tf.random_normal([n_inputs, 10], stddev=0.01))
    w_o = tf.Variable(tf.random_normal([10, n_outputs], stddev=0.01))

    # Dropout keep-probabilities are fed at run time (0.8/0.5 while
    # training, 1.0 while evaluating).
    p_keep_input = tf.placeholder("float")
    p_keep_hidden = tf.placeholder("float")
    # NOTE(review): X is rebound to the dropout output here; feed_dict
    # later feeds values into this non-placeholder tensor, which TF1
    # permits — confirm this was intentional rather than a shadowing bug.
    X = tf.nn.dropout(X, p_keep_input)
    h = tf.nn.relu(tf.matmul(X, w_h))
    h = tf.nn.dropout(h, p_keep_hidden)
    py_x = tf.matmul(h, w_o)

    # NOTE(review): learnRate is unused — the optimizer below hard-codes
    # 0.001; only the commented-out GradientDescent variant used it.
    learnRate = 0.01
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(py_x, Y))
    train_step = tf.train.RMSPropOptimizer(0.001, 0.9).minimize(cost)
    #train_step = tf.train.GradientDescentOptimizer(learnRate).minimize(cost)

    # Add accuracy checking nodes
    tf_correct_prediction = tf.equal(tf.argmax(py_x,1), tf.argmax(teY,1))
    tf_accuracy = tf.reduce_mean(tf.cast(tf_correct_prediction, "float"))

    # Init variables
    init = tf.initialize_all_variables()

    sess = tf.Session()
    sess.run(init)

    # k accumulates the test accuracy after every training step.
    k=[]
    for i in range(10000):
        sess.run(train_step, feed_dict={X: trX, Y: trY, p_keep_input: 0.8, p_keep_hidden: 0.5})
        result = sess.run(tf_accuracy, feed_dict={X: teX, Y: teY, p_keep_input: 1.0, p_keep_hidden: 1.0})
        # Save data
        k.append(result)
        if (i % printSteps == 0):
          print("Run {},{}".format(i,result))

    k=np.array(k)
    print("Max accuracy: {}".format(k.max()))
    print(('MLP training with %s datapoints :: Done \n\n') % (len(input_vector)))

    # Keep the session alive so the trained weights remain usable.
    self.trainedModel = sess
    return (self.trainedModel, k.max())
项目:SinaWeiboSpider    作者:SuperSaiyanSSS    | 项目源码 | 文件源码
def rand_forest_train(self):
    """Train and compare three tree-based classifiers on user features.

    Loads labelled samples from 'names.csv', trains a decision tree, a
    random forest and a gradient-boosting classifier, prints accuracy
    and classification reports for each, predicts on 'values.csv' with
    the random forest, and stores the fitted models on ``self``.

    Side effects: reads 'names.csv' and 'values.csv' from the working
    directory; sets ``self.dtc``, ``self.rfc`` and ``self.gbc``.
    """
    # Load the labelled user samples.
    users = pd.read_csv('names.csv')
    # Features: similarity, platform, reputation, entropy; target:
    # whether the account is human or machine.
    X = users[['similarity', 'platform', 'reputation', 'entropy']]
    y = users['human_or_machine']

    # Hold out 25% of the samples for evaluation.
    from sklearn.cross_validation import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

    # Vectorize the per-row feature dicts into a dense matrix.
    from sklearn.feature_extraction import DictVectorizer
    vec = DictVectorizer(sparse=False)
    # BUG FIX: the canonical orient value is 'records'; the abbreviated
    # 'record' is rejected by modern pandas versions.
    X_train = vec.fit_transform(X_train.to_dict(orient='records'))
    X_test = vec.transform(X_test.to_dict(orient='records'))

    # Baseline: single decision tree.
    from sklearn.tree import DecisionTreeClassifier
    dtc = DecisionTreeClassifier()
    dtc.fit(X_train, y_train)
    dtc_y_pred = dtc.predict(X_test)

    # Ensemble: random forest.
    from sklearn.ensemble import RandomForestClassifier
    rfc = RandomForestClassifier()
    rfc.fit(X_train, y_train)
    rfc_y_pred = rfc.predict(X_test)

    # Ensemble: gradient boosting.
    from sklearn.ensemble import GradientBoostingClassifier
    gbc = GradientBoostingClassifier()
    gbc.fit(X_train, y_train)
    gbc_y_pred = gbc.predict(X_test)

    from sklearn.metrics import classification_report
    # Accuracy plus precision/recall/F1 for each model. (The garbled
    # '?' label strings are preserved from the original source.)
    print("??????????", dtc.score(X_test, y_test))
    print(classification_report(dtc_y_pred, y_test))

    print("????????????", rfc.score(X_test, y_test))
    print(classification_report(rfc_y_pred, y_test))

    print("????????????", gbc.score(X_test, y_test))
    print(classification_report(gbc_y_pred, y_test))

    # Predict labels for the unlabelled samples with the random forest.
    users = pd.read_csv('values.csv')
    X = users[['similarity', 'platform', 'reputation', 'entropy']]
    X = vec.transform(X.to_dict(orient='records'))
    print(rfc.predict(X))

    # Keep the fitted models for later use.
    self.dtc = dtc
    self.rfc = rfc
    self.gbc = gbc