Python sklearn 模块,neighbors() 实例源码


项目:django-corenlp    作者:arunchaganty    | 项目源码 | 文件源码
def k_nearest_approx(self, vec, k):
    """Get the k nearest neighbors of a vector (in terms of cosine similarity).

    The LSH forest is created on first use through ``self._init_lsh_forest()``
    and cached on ``self.lshf`` for later calls.

    :param (np.array) vec: query vector
    :param (int) k: number of top neighbors to return

    :return (list[tuple[str, float]]): a list of (word, cosine similarity) pairs, in descending order
    """
    if not hasattr(self, 'lshf'):
        self.lshf = self._init_lsh_forest()

    # TODO(kelvin): make this inner product score, to be consistent with k_nearest
    distances, neighbors = self.lshf.kneighbors(vec, n_neighbors=k, return_distance=True)
    # Convert the returned distances into similarity scores (1 - distance).
    scores = np.subtract(1, distances)
    # NOTE(review): np.squeeze removes every length-1 axis, so with k == 1 both
    # arguments become 0-d arrays -- confirm score_map copes with that.
    nbr_score_pairs = self.score_map(np.squeeze(neighbors), np.squeeze(scores))

    # Highest score first.
    return sorted(nbr_score_pairs.items(), key=lambda x: x[1], reverse=True)
项目:django-corenlp    作者:arunchaganty    | 项目源码 | 文件源码
def k_nearest(self, vec, k):
    """Get the k nearest neighbors of a vector (in terms of highest inner products).

    Scores come from ``self.inner_products(vec)``, whose ``items()`` are
    sorted by score and truncated to the first ``k`` entries.

    :param (np.array) vec: query vector
    :param (int) k: number of top neighbors to return

    :return (list[tuple[str, float]]): a list of (word, score) pairs, in descending order
    """
    nbr_score_pairs = self.inner_products(vec)
    # Sort by score, best first, then keep the top k.
    return sorted(nbr_score_pairs.items(), key=lambda x: x[1], reverse=True)[:k]
项目:django-corenlp    作者:arunchaganty    | 项目源码 | 文件源码
def _init_lsh_forest(self):
    """Create the LSH forest used for nearest neighbor search.

    :return: a new ``sklearn.neighbors.LSHForest`` built with default arguments
    """
    # Imported inside the function, so sklearn is only loaded when this runs.
    from sklearn.neighbors import LSHForest

    # NOTE(review): no fit() call is visible in this block -- confirm the
    # forest gets fitted before it is queried.
    return LSHForest()
项目:django-corenlp    作者:arunchaganty    | 项目源码 | 文件源码
def k_nearest(self, vec, k):
    """Get the k nearest neighbors of a vector (in terms of highest inner products).

    Scores come from ``self.inner_products(vec)``, whose ``items()`` are
    sorted by score and truncated to the first ``k`` entries.

    :param (np.array) vec: query vector
    :param (int) k: number of top neighbors to return

    :return (list[tuple[str, float]]): a list of (word, score) pairs, in descending order
    """
    nbr_score_pairs = self.inner_products(vec)
    # Sort by score, best first, then keep the top k.
    return sorted(nbr_score_pairs.items(), key=lambda x: x[1], reverse=True)[:k]
项目:django-corenlp    作者:arunchaganty    | 项目源码 | 文件源码
def _init_lsh_forest(self):
    """Create the LSH forest used for nearest neighbor search.

    :return: a new ``sklearn.neighbors.LSHForest`` built with default arguments
    """
    # Imported inside the function, so sklearn is only loaded when this runs.
    from sklearn.neighbors import LSHForest

    # NOTE(review): no fit() call is visible in this block -- confirm the
    # forest gets fitted before it is queried.
    return LSHForest()
项目:ac_pysmac    作者:belkhir-nacim    | 项目源码 | 文件源码
def choose_classifier(classifier,  # which classifier to use
                      # parameters for the tree based classifiers
                      trees_n_estimators=None, trees_criterion=None,
                      trees_max_features=None, trees_max_depth=None,
                      # the ones for k-nearest-neighbors
                      knn_n_neighbors=None, knn_weights=None):
    """Fit the selected classifier and return its negated test score.

    :param classifier: one of 'random_forest', 'extra_trees' or
        'k_nearest_neighbors'
    :param trees_n_estimators: ``n_estimators`` for the tree ensembles
    :param trees_criterion: ``criterion`` for the tree ensembles
    :param trees_max_features: ``max_features`` for the tree ensembles
    :param trees_max_depth: ``max_depth`` for the tree ensembles
    :param knn_n_neighbors: ``n_neighbors`` for the k-nearest-neighbors model
    :param knn_weights: ``weights`` for the k-nearest-neighbors model
    :return: ``-predictor.score(X_test, Y_test)``; negated so that minimizing
        the return value maximizes the score
    :raises ValueError: if ``classifier`` is not one of the supported names
    """
    # note that possibly inactive variables have to be optional
    # as ac_pysmac does not assign a value for inactive variables
    # during the minimization phase

    # Hyperparameters are passed by keyword: positionally, the third and fourth
    # constructor slots of the tree ensembles are not max_features/max_depth.
    if classifier == 'random_forest':
        predictor = sklearn.ensemble.RandomForestClassifier(
            n_estimators=trees_n_estimators, criterion=trees_criterion,
            max_features=trees_max_features, max_depth=trees_max_depth)
    elif classifier == 'extra_trees':
        predictor = sklearn.ensemble.ExtraTreesClassifier(
            n_estimators=trees_n_estimators, criterion=trees_criterion,
            max_features=trees_max_features, max_depth=trees_max_depth)
    elif classifier == 'k_nearest_neighbors':
        predictor = sklearn.neighbors.KNeighborsClassifier(
            n_neighbors=knn_n_neighbors, weights=knn_weights)
    else:
        raise ValueError('unknown classifier: %r' % (classifier,))

    # NOTE(review): the fit call was rebuilt from a dangling ", Y_train)"
    # fragment; X_train is assumed to be the module-level training matrix that
    # pairs with Y_train -- confirm against the full file.
    predictor.fit(X_train, Y_train)
    return -predictor.score(X_test, Y_test)

# defining all the parameters with respective defaults.
项目:healthcareai-py    作者:HealthCatalyst    | 项目源码 | 文件源码
def knn(self,
        scoring_metric='roc_auc',
        hyperparameter_grid=None,
        randomized_search=True,
        number_iteration_samples=10):
    """
    A light wrapper for Sklearn's knn classifier that performs randomized search over an overridable default
    hyperparameter grid.

    Args:
        scoring_metric (str): Any sklearn scoring metric appropriate for classification
        hyperparameter_grid (dict): hyperparameters by name
        randomized_search (bool): True for randomized search (default)
        number_iteration_samples (int): Number of models to train during the randomized search for exploring the
            hyperparameter space. More may lead to a better model, but will take longer.

    Returns:
        Whatever ``self._create_trained_supervised_model`` returns for the configured algorithm.
    """
    # NOTE(review): the parameter list, the docstring delimiters and the
    # trailing get_algorithm arguments were missing from this block and have
    # been reconstructed from the documented parameter names. The defaults for
    # scoring_metric and number_iteration_samples, and the positional order
    # expected by get_algorithm, are assumptions -- confirm against upstream.
    if hyperparameter_grid is None:
        # Default search space: 5..25 neighbors with both weighting schemes.
        neighbors = list(range(5, 26))
        hyperparameter_grid = {'n_neighbors': neighbors, 'weights': ['uniform', 'distance']}
        number_iteration_samples = 10

        print('KNN Grid: {}'.format(hyperparameter_grid))
    algorithm = get_algorithm(KNeighborsClassifier,
                              scoring_metric,
                              hyperparameter_grid,
                              randomized_search,
                              number_iteration_samples=number_iteration_samples)

    trained_supervised_model = self._create_trained_supervised_model(algorithm)

    return trained_supervised_model
项目:simsearch    作者:chrisjmccormick    | 项目源码 | 文件源码
def findEps(ssearch):
    """
    Find a good epsilon value to use.

    Computes each point's cosine distance to its nearest neighbor other than
    itself, histograms those distances into 16 bins, and prints the
    cumulative percentage of points falling below each bin's upper edge.

    :param ssearch: object whose ``index.index`` holds the vectors to analyse
    :return: None; results are printed and drawn on a matplotlib histogram
    """
    # Calculate nearest neighbors

    # Create a nearest neighbors model--we need 2 nearest neighbors since the
    # nearest neighbor to a point is going to be itself.
    nbrs_model = NearestNeighbors(n_neighbors=2, algorithm='brute', metric='cosine').fit(ssearch.index.index)

    t0 = time.time()

    # Find nearest neighbors (only the distances are used below).
    distances, _ = nbrs_model.kneighbors(ssearch.index.index)

    elapsed = time.time() - t0

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Took %.2f seconds' % elapsed)

    # Column 1 is the second-nearest neighbor, i.e. the first one that is
    # not the query point itself.
    distances = [d[1] for d in distances]

    # Histogram the nearest neighbor distances.

    import matplotlib.pyplot as plt

    counts, bins, patches = plt.hist(distances, bins=16)
    plt.title("Nearest neighbor distances")

    print('\n%d bins:' % len(counts))

    countAcc = 0
    num_points = len(ssearch.index.index)

    # bins has one more entry than counts; bins[1:] are the upper bin edges.
    for count, upper_edge in zip(counts, bins[1:]):
        countAcc += count

        # Calculate the percentage of values which fall below the upper limit
        # of this bin.
        prcnt = float(countAcc) / float(num_points) * 100.0

        print('  %.2f%% < %.2f' % (prcnt, upper_edge))
项目:simsearch    作者:chrisjmccormick    | 项目源码 | 文件源码
def findMinPts(ssearch, eps):
    """
    Find a good value for MinPts.

    Computes all pair-wise cosine distances, counts for every point how many
    points lie closer than ``eps``, histograms those counts into 60 bins and
    prints the bin edges.

    :param ssearch: object whose ``index.index`` holds the vectors to analyse
    :param eps: distance threshold; a pair counts as neighbors when its
        distance is strictly below this value
    :return: None; results are printed and drawn on a matplotlib histogram
    """
    # Count neighbors within threshold

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Calculating pair-wise distances...')
    # Calculate pair-wise cosine distance for all documents.
    t0 = time.time()

    DD = sklearn.metrics.pairwise.cosine_distances(ssearch.index.index)

    elapsed = time.time() - t0

    print('    Took %.2f seconds' % elapsed)

    print('Counting number of neighbors...')

    t0 = time.time()

    # Number of neighbors for each point: entries per row that are below eps.
    # Each row also contains the point's distance to itself, so that entry is
    # counted whenever it is below eps.
    # Assumes DD is a dense NumPy array, as cosine_distances returns.
    numNeighbors = (DD < eps).sum(axis=1)

    elapsed = time.time() - t0

    print('    Took %.2f seconds' % elapsed)

    # Histogram the neighbor counts.

    import matplotlib.pyplot as plt

    counts, bins, patches = plt.hist(numNeighbors, bins=60)
    plt.title("Number of neighbors")
    plt.xlabel("Number of neighbors")

    print('\n%d bins:' % (len(bins) - 1))
    binsStr = ''.join('  %0.2f' % b for b in bins)

    print(binsStr)