Python sklearn.cluster module: DBSCAN example source code

The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.cluster.DBSCAN.
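
A minimal, self-contained sketch of the basic call pattern follows; the synthetic data and the eps/min_samples values are illustrative assumptions, not taken from any of the projects below.

import numpy as np
from sklearn.cluster import DBSCAN

# Two dense blobs plus a few scattered outliers (purely illustrative data).
rng = np.random.RandomState(0)
X = np.vstack([rng.normal(0.0, 0.2, size=(50, 2)),
               rng.normal(3.0, 0.2, size=(50, 2)),
               rng.uniform(-2.0, 5.0, size=(5, 2))])

db = DBSCAN(eps=0.5, min_samples=5).fit(X)
labels = db.labels_                      # the label -1 marks noise points
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print("clusters: %d, noise points: %d" % (n_clusters, list(labels).count(-1)))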

Project: eclipse2017 | Author: google
def cluster_points(coordinates, eps, min_samples, n_jobs=1):
    """Given coordinates, function returns the number of clusters in the
    set of coordinates and a list of integer labels corresponding to
    the input coordinate list

    Arguments:
      coordinates: a sequence of (lat, lon) tuples
      eps: the cluster size in radial degrees
      min_samples: the size of the smallest cluster
      n_jobs: number of CPUs to use to compute the clusters
    Returns:
      n_clusters: number of clusters
      labels: the labels of the clusters
    """

    db = DBSCAN(eps=eps,
                min_samples=min_samples,
                n_jobs=n_jobs).fit(coordinates)

    labels = db.labels_
    # Noise points are labelled -1; exclude that pseudo-cluster from the count.
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    return n_clusters, labels
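
A possible usage sketch for cluster_points; the coordinates and parameter values are made up for illustration, and DBSCAN is assumed to be imported from sklearn.cluster as the excerpt implies.

coords = [(37.700, -122.400), (37.701, -122.401), (37.702, -122.399),
          (40.700, -74.000)]
n_clusters, labels = cluster_points(coords, eps=0.05, min_samples=2)
# The three nearby points form one cluster; the distant point is labelled -1 (noise).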
Project: iFruitFly | Author: AdnanMuhib
def dbFun(_x,_original_vals, f):
    db = DBSCAN(eps=0.3, min_samples=20).fit(_x)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True

    labels = db.labels_
    #print(labels)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    #gettingCharacteristics(_x, core_samples_mask, labels, n_clusters_,
    #_original_vals)
    print("Wait plotting clusters.....")
    plotCluster(_x, labels, core_samples_mask, n_clusters_, f)
    return

##############################################################################################
# Plotting the cluster after the result of DBSCAN
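
plotCluster itself is not included in these excerpts. A minimal sketch of that kind of plot, assuming matplotlib, 2-D NumPy inputs, and the hypothetical name plot_cluster_sketch (it follows the usual scikit-learn DBSCAN demo style, not necessarily the project's own implementation):

import numpy as np
import matplotlib.pyplot as plt

def plot_cluster_sketch(X, labels, core_samples_mask, n_clusters_):
    # Draw core samples large and border samples small; noise (-1) is black.
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            col = 'k'
        class_member_mask = (labels == k)
        core = X[class_member_mask & core_samples_mask]
        border = X[class_member_mask & ~core_samples_mask]
        plt.plot(core[:, 0], core[:, 1], 'o', markerfacecolor=col, markersize=10)
        plt.plot(border[:, 0], border[:, 1], 'o', markerfacecolor=col, markersize=4)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()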
Project: base_function | Author: Rockyzsu
def dbscan(fig):
    global X_iris, geo
    ax = fig.add_subplot(geo + 5, projection='3d', title='dbscan')
    dbscan = cluster.DBSCAN()
    dbscan.fit(X_iris)
    res = dbscan.labels_
    core = dbscan.core_sample_indices_
    print repr(core)
    size = [5 if i not in core else 40 for i in range(len(X_iris))]
    print repr(size)
    for n, i in enumerate(X_iris):
        ax.scatter(*i[: 3], s=size[n], c='bgrcmyk'[res[n] % 7],
                   alpha=0.8, marker='o')

    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    ax.set_zlabel('Z Label')
    return res
Project: image-segmentation | Author: alexlouden
def cluster_dbscan(self, image_cols):
        print 'DBSCAN'
        # TODO handle outliers/noise
        # Look at different metrics?

        db = DBSCAN(eps=self.params.epsilon, min_samples=10, metric='euclidean')
        db.fit(image_cols)

        # from IPython import embed; embed(); import ipdb; ipdb.set_trace()
        self.number_of_clusters = np.max(db.labels_) + 1
        # Ignore -1 cluster, it's noise

        print 'number of clusters', self.number_of_clusters

        # Clusters
        centers = np.zeros((self.number_of_clusters, 3))
        for i in range(0, self.number_of_clusters):
            cluster_points = image_cols[db.labels_ == i]
            cluster_mean = np.mean(cluster_points, axis=0)
            centers[i, :] = cluster_mean

        return centers
Project: icing | Author: slipguru
def train(self, data, sample_weight=None):
        """
        :type data: pyspark.RDD
        :param data: (key, k-dim vector like)
        Train the model using a (key, vector) RDD
        """
        parts = KDPartitioner(data, self.max_partitions)
        self.data = data
        self.bounding_boxes = parts.bounding_boxes
        self.expanded_boxes = {}
        self._create_neighborhoods()
        # repartition data set on the partition label
        self.data = self.data.map(lambda ((k, p), v): (p, (k, v))) \
            .partitionBy(len(parts.partitions)) \
            .map(lambda (p, (k, v)): ((k, p), v))
        # create parameters for sklearn DBSCAN
        params = self.dbscan_params or {
            'eps': self.eps,
            'min_samples': self.min_samples,
            'metric': self.metric}
        # perform dbscan on each part
        self.data = self.data.mapPartitions(
            lambda iterable: dbscan_partition(iterable, params, sample_weight))
        self.data.cache()
        self._remap_cluster_ids()
Project: pyhiro | Author: wanweiwei07
def __init__(self, ompath, density = 4.0):
        """

        :param ompath: path of the mesh template

        author: weiwei
        date: 20170711
        """

        cadtemp = CADTemp.CADTemp(ompath = ompath, density = density)

        self.objnp = pg.packpandanp(cadtemp.objtrimesh.vertices,
                               cadtemp.objtrimesh.face_normals,
                               cadtemp.objtrimesh.faces,
                               name='')
        self.temppnt = cadtemp.pcdtemp

        self.kinect = PyKinectRuntime.PyKinectRuntime(PyKinectV2.FrameSourceTypes_Depth)
        self.dbscan = DBSCAN(eps=50, min_samples=100, n_jobs=-1)
        self.randsac = linear_model.RANSACRegressor(linear_model.LinearRegression(), residual_threshold = 15)
        self.tablepnt = []
        self.objectpnt = []
Project: scikit-discovery | Author: MITHaystack
def process(self, obj_data):
        ''' 
        Run DBScan on data. Stores result in data wrapper

        @param obj_data: Data wrapper to be processed
        '''

        epsilon = self.ap_paramList[0]()
        min_points = self.ap_paramList[1]()

        results = dict()


        for label, data in obj_data.getIterator():
            results[label] = DBSCAN(eps=epsilon, min_samples = min_points).fit_predict(data.loc[:,self.column_names])

        obj_data.addResult(self.str_description, results)
Project: yellowbrick | Author: DistrictDataLabs
def test_clusterer_enforcement(self):
        """
        Assert that only clustering estimators can be passed to cluster viz
        """
        nomodels = [
            SVC, SVR, Ridge, RidgeCV, LinearRegression, RandomForestClassifier
        ]

        for nomodel in nomodels:
            with self.assertRaises(YellowbrickTypeError):
                visualizer = ClusteringScoreVisualizer(nomodel())

        models = [
            KMeans, MiniBatchKMeans, AffinityPropagation, MeanShift, DBSCAN, Birch
        ]

        for model in models:
            try:
                visualizer = ClusteringScoreVisualizer(model())
            except YellowbrickTypeError:
                self.fail("could not pass clustering estimator to visualizer")
Project: artorithmia | Author: alichtner
def fit(self, model, n_clusters=5):
        """
        Fits clusters to the feature set using a KMeans or DBSCAN model.

        Input:  model (str) clustering algorithm to use: 'kmeans' or 'DBSCAN'
                n_clusters (int) number of clusters to use during clustering
        Output: None
        """
        self.n_clusters = n_clusters
        scaler = StandardScaler()
        self.features = scaler.fit_transform(self.features)

        if model == 'kmeans':
            self.model = KMeans(self.n_clusters)
        elif model == 'DBSCAN':
            self.model = DBSCAN(eps=0.3, min_samples = 3)
        self.cluster_fit = self.model.fit(self.features)
        print ('-- Running clustering on {} piece collection --'
               .format(self.n_artworks))
Project: EchoBurst | Author: TyJK
def newDBSCANModel(vectorFile, outputFile):
    model = Doc2Vec.load("Models\\" + vectorFile)
    vecs = []
    for doc in range(0, len(model.docvecs)):
        doc_vec = model.docvecs[doc]
        # print doc_vec
        vecs.append(doc_vec.reshape((1, 300)))

    doc_vecs = np.array(vecs, dtype='float')  # TSNE expects float type values

    # print doc_vecs
    docs = []
    for i in doc_vecs:
        docs.append(i[0])
    db = DBSCAN(eps=0.03, algorithm="brute", metric='cosine').fit(docs)
    joblib.dump(db, outputFile)


    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    clusters = db.labels_.tolist()
    cluster_info = {'labels': model.docvecs.offset2doctag,
                    "index, wordcount and repeated words": [model.docvecs.doctags[x] for x in
                                                            model.docvecs.offset2doctag],
                    'clusters': clusters}
    sentenceDF = pd.DataFrame(cluster_info, index=[clusters],
                              columns=['labels', "index, wordcount and repeated words", 'clusters'])
    print(sentenceDF)
    sentenceDF.to_csv("DBSCAN.csv")

    print('Estimated number of clusters: %d' % n_clusters_)
Project: pypardis | Author: bwoneill
def dbscan_partition(iterable, params):
    """
    :type iterable: iter
    :param iterable: iterator yielding ((key, partition), vector)
    :type params: dict
    :param params: dictionary containing sklearn DBSCAN parameters
    :rtype: iter
    :return: ((key, cluster_id), v)
    Performs a DBSCAN on a given partition of the data
    """
    # read iterable into local memory
    data = list(iterable)
    (key, part), vector = data[0]
    x = np.array([v for (_, __), v in data])
    y = np.array([k for (k, _), __ in data])
    # perform DBSCAN
    model = skc.DBSCAN(**params)
    c = model.fit_predict(x)
    cores = set(model.core_sample_indices_)
    # yield (key, cluster_id), non-core samples labeled with *
    for i in xrange(len(c)):
        flag = '' if i in cores else '*'
        yield (y[i], '%i:%i%s' % (part, c[i], flag))
Project: SUPPA | Author: comprna
def DBSCAN_cluster(psi_matrix, eventid_lst, dist, minpts, metric):

    # Setting logging preferences
    logger = logging.getLogger(__name__)

    # The "cosine" metric works only with the "brute" algorithm
    if metric == "cosine":
        alg = 'brute'
    else:
        alg = 'auto'

    try:
        db = DBSCAN(eps=dist, min_samples=minpts, metric=metric, algorithm=alg).fit(psi_matrix)
        labels = db.labels_
    except:
        logger.error("Unknown error: {}".format(sys.exc_info()))
        sys.exit(1)

    eventid_labels_dict = {k: v for k, v in zip(eventid_lst, labels)}

    return eventid_labels_dict, labels
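
A hedged usage sketch for DBSCAN_cluster; the PSI matrix shape, event IDs, and parameter values below are illustrative assumptions:

import numpy as np

psi_matrix = np.random.rand(40, 4)                  # 40 events x 4 conditions (made up)
eventid_lst = ['event_%d' % i for i in range(40)]
eventid_labels_dict, labels = DBSCAN_cluster(psi_matrix, eventid_lst,
                                             dist=0.2, minpts=3, metric='euclidean')
# labels[i] is the cluster id assigned to eventid_lst[i]; -1 means noise.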
Project: SUPPA | Author: comprna
def cluster_analysis(dpsi, psivec, sig_threshold, dpsi_threshold, eps, minpts, metric, indexes, clustering,
                     separation, output):

    path = os.path.dirname(os.path.realpath(dpsi))
    os.chdir(path)

    psi_matrix, eventid_lst = process_cluster_input(dpsi, psivec, sig_threshold, dpsi_threshold, indexes)

    if(clustering=="DBSCAN"):
        eventid_labels_dict, labels = DBSCAN_cluster(psi_matrix, eventid_lst, eps, minpts, metric)
        #eventid_labels_dict contains the cluster label for each event

        write_averaged_cluster_output(psi_matrix, eventid_lst, eventid_labels_dict, output)
        calculate_cluster_scores(psi_matrix, labels, output)

    else:
        #OPTICS
        points_list = create_points_list(psi_matrix, eventid_lst) #Transform the points on psi_matrix to Points from optics.py
        optics = Optics(points_list, eps, minpts)  # Maximum radius to be considered, cluster size >= 2 points
        optics.run()  # run the algorithm
        clusters = optics.cluster(separation)  # minimum threshold for clustering (upper limit to separate the clusters)
        eventid_labels_dict, labels = generate_labels(clusters, eventid_lst)
        write_averaged_cluster_output(psi_matrix, eventid_lst, eventid_labels_dict, output)
        calculate_cluster_scores(psi_matrix, labels, output)
Project: extract | Author: dblalock
def makeDBScan(X=None, k=-1):
    return cluster.DBSCAN(eps=.2)
Project: ananke | Author: beiko-lab
def sts_matrix_generator(ind, slope_matrix):
    """Work-horse function. Computes the short time-series (STS) distance for
    an index, ind of the slope matrix.

    Parameters
    ----------
    ind: int
        The index of the slope matrix that is being computed.
    slope_matrix: np.matrix
        The slope matrix.

    Returns
    -------
        (ind, dists): ind is the index and dists is a np.matrix containing the
                      STS distances
    """
    mx = slope_matrix[ind, :]
    mv = slope_matrix[ind:, :]
    mx_rep = np.vstack((mx,)*mv.shape[0])
    diff = mx_rep - mv
    diff = np.square(diff)
    sts_squared = diff.sum(axis=1)
    dists = np.sqrt(sts_squared)
    return (ind, dists)

#  DBSCAN from scikit learn
Project: ananke | Author: beiko-lab
def cluster_dbscan(matrix, distance_measure="sts", eps=1):
    """Clusters the distance matrix for a given epsilon value, if distance
    measure is sts. Other distance measures are: [‘cityblock’, ‘cosine’, 
    ‘euclidean’, ‘l1’, ‘l2’, ‘manhattan’, ‘braycurtis’, ‘canberra’, 
    ‘chebyshev’, ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, 
    ‘mahalanobis’, ‘matching’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, 
    ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’]

    Parameters
    ----------
    matrix: np.matrix
        The input matrix. If distance measure is sts, this should be the sts
        distance matrix. If other distance, this should be the time-series
        matrix of size ngenes x nsamples.
    distance_measure: str
        The distance measure, default is sts, short time-series distance.
        Any distance measure available in scikit-learn is available here.
        Note: multiple time-series are NOT supported for distances other than
        "sts".

    Returns
    -------
    cluster_labels: list of int
        A list of size ngenes that defines cluster membership.
    """
    if (distance_measure == "sts"):
        dbs = DBSCAN(eps=eps, metric='precomputed', min_samples=2)
    else:
        dbs = DBSCAN(eps=eps, metric=distance_measure, min_samples=2)
    cluster_labels = dbs.fit_predict(matrix)
    return cluster_labels
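
A usage sketch for the "sts" branch of cluster_dbscan, assuming an STS distance matrix has already been assembled from sts_matrix_generator (the tiny matrix below is illustrative):

import numpy as np

sts_dists = np.array([[0.0, 0.1, 2.0],
                      [0.1, 0.0, 2.1],
                      [2.0, 2.1, 0.0]])
labels = cluster_dbscan(sts_dists, distance_measure="sts", eps=0.5)
# With metric='precomputed', DBSCAN reads the entries as pairwise distances, so the
# first two rows form one cluster and the third is noise (label -1).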
Project: iFruitFly | Author: AdnanMuhib
def dbFun( _x,_original_vals, f):
    db = DBSCAN(eps=0.3, min_samples=20).fit(_x)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True

    labels = db.labels_
    #print(labels)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    #gettingCharacteristics(_x, core_samples_mask, labels, n_clusters_,
    #_original_vals)
    print("Wait plotting clusters.....")
    plotCluster(_x, labels, core_samples_mask, n_clusters_, f)
    return
Project: iFruitFly | Author: AdnanMuhib
def dbFun( _x,_original_vals, f):
    db = DBSCAN(eps=0.3, min_samples=20).fit(_x)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True

    labels = db.labels_
    #print(labels)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    #gettingCharacteristics(_x, core_samples_mask, labels, n_clusters_,
    #_original_vals)
    print("Wait plotting clusters.....")
    plotCluster(_x, labels, core_samples_mask, n_clusters_, f)
    return
Project: iFruitFly | Author: AdnanMuhib
def demo_printing_picture(anomaly_file, prefix, rgb_directory, pre_prefix, dir, file_name):
    #clusters = webDemo.main(anomaly_file,
    #"D:\\ifruitly_junk\\results\\result.jpg")
    clusters = v_demo(anomaly_file, prefix, pre_prefix, file_name, dir)
    return

##############################################################################################
# Running the DBSCAN for output
Project: lol-category | Author: vonum
def db_scan(data, eps, min_samples, metric):
  dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit(data)
  print 'DBSCAN'
  print metrics.silhouette_score(data, dbscan.labels_)
  print collections.Counter(dbscan.labels_)
  reduced_data = reduce_with_pca(data)
  plot_2d_data(reduced_data, dbscan.labels_)
Project: dmon-adp | Author: igabriel85
def sdbscanTrain(self, settings, mname, data):
        '''
        :param data: -> dataframe with data
        :param settings: -> settings dictionary
        :param mname: -> name of serialized clusterer
        :return: -> clusterer
        :example settings: -> {eps:0.9, min_samples:10, metric:'euclidean',
        algorithm:'auto', leaf_size:30, p:0.2, n_jobs:1}
        '''
        for k, v in settings.iteritems():
            logger.info('[%s] : [INFO] SDBSCAN %s set to %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v)
            print "SDBSCAN %s set to %s" % (k, v)
        sdata = StandardScaler().fit_transform(data)
        try:
            db = DBSCAN(eps=float(settings['eps']), min_samples=int(settings['min_samples']), metric=settings['metric'],
                        algorithm=settings['algorithm'], leaf_size=int(settings['leaf_size']), p=float(settings['p']),
                        n_jobs=int(settings['n_jobs'])).fit(sdata)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Cannot instantiate sDBSCAN with %s and %s',
                           datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print "Error while instantiating sDBSCAN with %s and %s" % (type(inst), inst.args)
            sys.exit(1)
        labels = db.labels_
        print labels
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print 'Estimated number of clusters: %d' % n_clusters_
        self.__serializemodel(db, 'sdbscan', mname)
        return db
Project: image-segmentation | Author: alexlouden
def __init__(self, image, colour_space='hsv', cluster_method='ward', scale=None, num_clusters=None, quantile=None):
        self.image = image
        self.colour_space = colour_space
        self.cluster_method = cluster_method

        self.params = Parameters()

        # Scaling colour space
        if scale is None:
            self.params.scale = (1, 1, 1)
        else:
            # TODO validate 3 float tuple
            self.params.scale = scale

        # K-means param
        if num_clusters is None:
            self.params.num_clusters = 8
        else:
            # TODO validate
            self.params.num_clusters = int(num_clusters)

        # Mean-shift param
        if quantile is None:
            self.params.quantile = 0.1
        else:
            self.params.quantile = float(quantile)

        # DBSCAN param
        # if epsilon is None:
        self.params.epsilon = 255*0.1

        # Log
        h, w = self.image.shape[:2]
        msg = 'Clustering a {}x{} image: cluster_method={} colour_space={} num_clusters={} quantile={}'.format(
            w, h, cluster_method, colour_space, num_clusters, quantile
        )
        print msg
Project: FreeDiscovery | Author: FreeDiscovery
def dbscan(self, n_clusters=None, eps=0.5, min_samples=10,
               algorithm='auto', leaf_size=30):
        """
        Perform DBSCAN clustering

        This can also be used for Duplicate Detection (when eps is set to a small value).

        Parameters
        ----------
        n_clusters : int
            number of clusters # not used just present for compatibility
        lsi_components : int
            apply LSA before the clustering algorithm
        eps : float
            The maximum distance between two samples for them to be considered
             as in the same neighborhood.
        min_samples : int
            The number of samples (or total weight) in a neighborhood
            for a point to be considered as a core point.
            This includes the point itself.
        """
        from sklearn.cluster import DBSCAN
        pars = {'is_hierarchical': False, "metric": self.metric}

        km = DBSCAN(eps=eps, min_samples=min_samples, algorithm=algorithm,
                    leaf_size=leaf_size)

        return self._cluster_func(n_clusters, km, pars)
Project: Particle-Picking-Cryo-EM | Author: hqythu
def main():
    centers = get_list('out_center.txt')
    labels = get_list('142-label.txt')
    judge(centers, labels)
    n_class = int(len(centers) * 0.18)
    est = KMeans(n_clusters=n_class, max_iter=1000)
    est.fit(centers)
    new_list = []
    for x, y in est.cluster_centers_:
        min_num = 10000
        min_x = -1
        min_y = -1
        for x_, y_ in centers:
            dist = distance(x, y, x_, y_)
            if (dist < min_num) or (min_x == -1):
                min_num = dist
                min_x = x_
                min_y = y_
        new_list.append([min_x, min_y])
    judge(new_list, labels)
    judge(est.cluster_centers_, labels)

    # db = DBSCAN(eps=0.3, min_samples=180).fit(centers)
    # print(db.core_sample_indices_)
    # judge(new_list, labels)
    # print(est.cluster_centers_)
    # save_list('result.txt', est.cluster_centers_)
    # af = AffinityPropagation(preference=180).fit(centers)
    # judge(af.cluster_centers_, labels)
Project: icing | Author: slipguru
def dbscan_partition(iterable, params, sample_weight=None):
    """
    :type iterable: iter
    :param iterable: iterator yielding ((key, partition), vector)
    :type params: dict
    :param params: dictionary containing sklearn DBSCAN parameters
    :rtype: iter
    :return: ((key, cluster_id), v)
    Performs a DBSCAN on a given partition of the data
    """
    # read iterable into local memory
    data = list(iterable)
    (key, part), vector = data[0]
    x = np.array([v for (_, __), v in data])
    y = np.array([k for (k, _), __ in data])
    # perform DBSCAN
    model = skc.DBSCAN(**params)
    # import sys
    # print(model, file=sys.stderr)
    weights = [sample_weight[k[0]] for k in x]
    c = model.fit_predict(x, sample_weight=weights)
    cores = set(model.core_sample_indices_)
    # yield (key, cluster_id), non-core samples labeled with *
    for i in xrange(len(c)):
        flag = '' if i in cores else '*'
        yield (y[i], '%i:%i%s' % (part, c[i], flag))
Project: Vision-based-parking-lot-availability-OpenCV | Author: Saar1312
def dbscan(points,eps,min_samples):
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(points) # eps=5 min_samples = 80

    # Labeling pixels by cluster
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # Creating list of clusters
    return [points[labels == i] for i in xrange(n_clusters_)]
Project: textcatvis | Author: cod3licious
def cluster_texts(textdict, eps=0.45, min_samples=3):
    """
    cluster the given texts

    Input:
        textdict: dictionary with {docid: text}
    Returns:
        doccats: dictionary with {docid: cluster_id}
    """
    doc_ids = list(textdict.keys())
    # transform texts into length normalized kpca features
    ft = FeatureTransform(norm='max', weight=True, renorm='length', norm_num=False)
    docfeats = ft.texts2features(textdict)
    X, featurenames = features2mat(docfeats, doc_ids)
    e_lkpca = KernelPCA(n_components=250, kernel='linear')
    X = e_lkpca.fit_transform(X)
    xnorm = np.linalg.norm(X, axis=1)
    X = X/xnorm.reshape(X.shape[0], 1)
    # compute cosine similarity
    D = 1. - linear_kernel(X)
    # and cluster with dbscan
    clst = DBSCAN(eps=eps, metric='precomputed', min_samples=min_samples)
    y_pred = clst.fit_predict(D)
    return {did: y_pred[i] for i, did in enumerate(doc_ids)}
Project: pyhiro | Author: wanweiwei07
def getRotMat(verts):
    """

    find the table and do calibration

    :param verts: see depthToXYZ
    :return:

    author: weiwei
    date: 20170711
    """

    cutverts = []
    for vert in verts:
        if vert[0] < 700.0 and vert[0] > -700.0:
            if vert[1] < 200.0 and vert[1] > -600.0:
                if vert[2] < -1000.0 and vert[2] > -1500.0:
                    cutverts.append([vert[0], vert[1], vert[2]])

    # clustering using DBSCAN
    X = np.array(cutverts)
    db = DBSCAN(eps=20, min_samples = 100, n_jobs = -1).fit(X)
    print db.labels_
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    returnvertslist = []
    unique_labels = set(labels)
    for k in unique_labels:
        class_member_mask = (labels == k)
        print class_member_mask, core_samples_mask
        xyzlist = X[class_member_mask & core_samples_mask]
        print xyzlist
        returnvertslist.append(xyzlist.tolist())

    return returnvertslist
    # return verts
Project: pyhiro | Author: wanweiwei07
def __init__(self):
        """
        Kinect interface

        author: weiwei
        date: 20170715
        """

        self.kinect = PyKinectRuntime.PyKinectRuntime(PyKinectV2.FrameSourceTypes_Depth)
        self.dbscan = DBSCAN(eps=50, min_samples=100, n_jobs=-1)
        self.randsac = linear_model.RANSACRegressor(linear_model.LinearRegression(), residual_threshold = 15)
Project: MasterDegree | Author: Waszker
def _get_dbscan(parameters):
    if parameters is None:
        parameters = {
        }
    return DBSCAN(**parameters)
Project: Snakepit | Author: K4lium
def clusterMalwareNames(malwareNames):
    # strictly lexical clustering over malware-names
    wordCount = {}
    # create a distance matrix
    matrix = np.zeros((len(malwareNames), len(malwareNames)))
    for i in range(len(malwareNames)):
        for j in range(len(malwareNames)):
            if matrix[i, j] == 0.0:        
                matrix[i, j] = computeSimilarity(malwareNames[i], malwareNames[j])
                matrix[j, i] = matrix[i, j]

    # Scikit-Learn's DBSCAN implementation to cluster the malware-names
    clust = DBSCAN(eps=0.1, min_samples=5, metric="precomputed")
    clust.fit(matrix)    

    preds = clust.labels_
    clabels = np.unique(preds)

    # create Word-Count Map
    for i in range(clabels.shape[0]):
        if clabels[i] < 0:
            continue

        cmem_ids = np.where(preds == clabels[i])[0]
        cmembers = []

        for cmem_id in cmem_ids:
            cmembers.append(malwareNames[cmem_id])

        wordCount[", ".join(uniqueList(cmembers))] = len(cmem_ids)
    return wordCount
Project: ML-note | Author: JasonK93
def test_DBSCAN(*data):
    '''
    test the DBSCAN method
    :param data:  train, target
    :return: None
    '''
    X,labels_true=data
    clst=cluster.DBSCAN()
    predicted_labels=clst.fit_predict(X)
    print("ARI:%s"% adjusted_rand_score(labels_true,predicted_labels))
    print("Core sample num:{0}".format(len(clst.core_sample_indices_)))
Project: ML-note | Author: JasonK93
def test_DBSCAN_epsilon(*data):
    '''
    test the score with different eps
    :param data:  train, target
    :return: None
    '''
    X,labels_true=data
    epsilons=np.logspace(-1,1.5)
    ARIs=[]
    Core_nums=[]
    for epsilon in epsilons:
        clst=cluster.DBSCAN(eps=epsilon)
        predicted_labels=clst.fit_predict(X)
        ARIs.append( adjusted_rand_score(labels_true,predicted_labels))
        Core_nums.append(len(clst.core_sample_indices_))

    ## graph
    fig=plt.figure()
    ax=fig.add_subplot(1,2,1)
    ax.plot(epsilons,ARIs,marker='+')
    ax.set_xscale('log')
    ax.set_xlabel(r"$\epsilon$")
    ax.set_ylim(0,1)
    ax.set_ylabel('ARI')

    ax=fig.add_subplot(1,2,2)
    ax.plot(epsilons,Core_nums,marker='o')
    ax.set_xscale('log')
    ax.set_xlabel(r"$\epsilon$")
    ax.set_ylabel('Core_Nums')

    fig.suptitle("DBSCAN")
    plt.show()
Project: ML-note | Author: JasonK93
def test_DBSCAN_min_samples(*data):
    '''
    test the score with different min_samples
    :param data:  train, target
    :return:  None
    '''
    X,labels_true=data
    min_samples=range(1,100)
    ARIs=[]
    Core_nums=[]
    for num in min_samples:
        clst=cluster.DBSCAN(min_samples=num)
        predicted_labels=clst.fit_predict(X)
        ARIs.append( adjusted_rand_score(labels_true,predicted_labels))
        Core_nums.append(len(clst.core_sample_indices_))

    ## graph
    fig=plt.figure()
    ax=fig.add_subplot(1,2,1)
    ax.plot(min_samples,ARIs,marker='+')
    ax.set_xlabel( "min_samples")
    ax.set_ylim(0,1)
    ax.set_ylabel('ARI')

    ax=fig.add_subplot(1,2,2)
    ax.plot(min_samples,Core_nums,marker='o')
    ax.set_xlabel( "min_samples")
    ax.set_ylabel('Core_Nums')

    fig.suptitle("DBSCAN")
    plt.show()
Project: simsearch | Author: chrisjmccormick
def runClustering(ssearch, eps, min_samples):
    """
    Run DBSCAN with the determined eps and MinPts values.
    """
    print('Clustering all documents with DBSCAN, eps=%0.2f min_samples=%d' % (eps, min_samples))

    # Initialize DBSCAN with parameters.
    # I forgot to use cosine at first!
    db = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine', algorithm='brute')

    # Time this step.
    t0 = time.time()

    # Cluster the LSI vectors.     
    db.fit(ssearch.index.index)

    # Calculate the elapsed time (in seconds)
    elapsed = (time.time() - t0)
    print("  done in %.3fsec" % elapsed)

    # Get the set of unique IDs.
    cluster_ids = set(db.labels_)

    # Show the number of clusters (don't include noise label)
    print('Number of clusters (excluding "noise"): %d' % (len(cluster_ids) - 1))  

    # For each of the clusters...    
    for cluster_id in cluster_ids:

            # Get the list of all doc IDs belonging to this cluster.
            cluster_doc_ids = []
            for doc_id in range(0, len(db.labels_)):            
                if db.labels_[doc_id] == cluster_id:
                    cluster_doc_ids.append(doc_id)

            # Get the top words in this cluster
            top_words = ssearch.getTopWordsInCluster(cluster_doc_ids)

            print('  Cluster %d: (%d docs) %s' % (cluster_id, len(cluster_doc_ids), " ".join(top_words)))
Project: simsearch | Author: chrisjmccormick
def main():   
    """
    Entry point for the script.
    """

    ###########################################################################
    # Load the corpus
    ###########################################################################

    # Load the pre-built corpus.
    print('Loading the saved SimSearch and corpus...')
    (ksearch, ssearch) = SimSearch.load(save_dir='./mhc_corpus/')

    print '    %d documents.' % len(ssearch.index.index)

    # Step 1: Run a technique to find a good 'eps' value.
    #findEps(ssearch)
    #eps = 0.5
    eps = 0.44

    # Step 2: Run a technique to find a good 'MinPts' value.    
    # TODO - This took ~17 min. on my desktop!
    #findMinPts(ssearch, eps)
    #min_samples = 8
    min_samples = 4

    # Step 3: Run DBSCAN
    runClustering(ssearch, eps, min_samples)
Project: geolife | Author: xuzhongyou
def dbscan(userid,X):

    db = DBSCAN(eps=0.15,min_samples=4).fit(X)
    # print db.labels_     zeros_like
    core_samples_mask = np.zeros_like(db.labels_,dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    lables = db.labels_ 
    labels_list = list(lables)
    # print labels_list.count(-1)
    out_user.setdefault(userid,0)
    out_user[userid] = labels_list.count(-1)
    print out_user

    # print labels_list.index(-1)
    print lables
    n_clusters_ = len(set(lables)) -(1 if -1 in lables else 0)
    unique_lables = set(lables)
    cols = plt.cm.Spectral(np.linspace(0,1,len(unique_lables)))
    # center_points = []
    for k,col in zip(unique_lables,cols):
        if k == -1:
            col = 'k'
        class_member_mask = (lables == k)
        k_x = X[class_member_mask & core_samples_mask]
        plt.plot(k_x[:,0],k_x[:,1],'o',markerfacecolor = col,
            markeredgecolor = 'k' , markersize = 5)
        center_points.append([np.mean(k_x[:,1]),np.mean(k_x[:,0])])
    plt.title('DBSCAN :Estimated number of clusters: %d' % n_clusters_)
    # plt.show()
Project: cartographer | Author: pablodecm
def __init__(self, filterer=PCA(n_components=2),
                 coverer=HyperRectangleCoverer(),
                 clusterer=DBSCAN(),
                 params=None):
        self.filterer = filterer
        self.coverer = coverer
        self.clusterer = clusterer
        if params is not None:
            self.set_params(**params)
Project: Parallel-SGD | Author: angadgill
def set_random_state(estimator, random_state=0):
    """Set random state of an estimator if it has the `random_state` param.

    Classes for which random_state is deprecated are ignored. Currently DBSCAN
    is one such class.
    """

    if isinstance(estimator, DBSCAN):
        return

    if "random_state" in estimator.get_params():
        estimator.set_params(random_state=random_state)
Project: pypardis | Author: bwoneill
def train(self, data):
        """
        :type data: pyspark.RDD
        :param data: (key, k-dim vector like)
        Train the model using a (key, vector) RDD
        """
        parts = KDPartitioner(data, self.max_partitions)
        self.data = data
        self.bounding_boxes = parts.bounding_boxes
        self.expanded_boxes = {}
        self._create_neighborhoods()
        # repartition data set on the partition label
        self.data = self.data.map(lambda ((k, p), v): (p, (k, v))) \
            .partitionBy(len(parts.partitions)) \
            .map(lambda (p, (k, v)): ((k, p), v))
        # create parameters for sklearn DBSCAN
        params = {'eps': self.eps, 'min_samples': self.min_samples,
                  'metric': self.metric}
        # perform dbscan on each part
        self.data = self.data.mapPartitions(
            lambda iterable: dbscan_partition(iterable, params))
        self.data.cache()
        self._remap_cluster_ids()
Project: pypardis | Author: bwoneill
def assignments(self):
        """
        :rtype: list
        :return: list of (key, cluster_id)
        Retrieve the results of the DBSCAN
        """
        return self.result.collect()
Project: extract | Author: dblalock
def makeClusterers(X, k=2):
    return [('MiniBatchKMeans', makeKMeans(X, k)),
            ('AffinityPropagation', makeAffinityProp()),
            ('MeanShift', makeMeanShift(X)),
            ('SpectralClustering', makeSpectral(X, k)),
            ('Ward', makeWard(X, k)),
            ('AgglomerativeAvg', makeAvgLinkage(X, k)),
            ('AgglomerativeMax', makeMaxLinkage(X, k)),
            ('AgglomerativeWard', makeWardLinkage(X, k)),
            ('DBSCAN', makeDBScan())]
Project: SnapStitch | Author: avikj
def cluster(X, eps=1, min_pts=30, algorithm='DBSCAN', n_clusters=10):
  if algorithm == 'DBSCAN':
    cluster_result = DBSCAN(eps=eps, min_samples=min_pts).fit(X)
  elif algorithm == 'KMeans':
    cluster_result = KMeans(n_clusters=n_clusters).fit(X)
  labels = cluster_result.labels_
  return labels
Project: TrackToTrip | Author: ruipgil
def update_location_centroid(point, cluster, max_distance, min_samples):
    """ Updates the centroid of a location cluster with another point

    Args:
        point (:obj:`Point`): Point to add to the cluster
        cluster (:obj:`list` of :obj:`Point`): Location cluster
        max_distance (float): Max neighbour distance
        min_samples (int): Minimum number of samples
    Returns:
        (:obj:`Point`, :obj:`list` of :obj:`Point`): Tuple with the location centroid
            and new point cluster (given cluster + given point)
    """
    cluster.append(point)
    points = [p.gen2arr() for p in cluster]

    # Estimates the epsilon
    eps = estimate_meters_to_deg(max_distance, precision=6)

    p_cluster = DBSCAN(eps=eps, min_samples=min_samples)
    p_cluster.fit(points)

    clusters = {}
    for i, label in enumerate(p_cluster.labels_):
        if label in clusters.keys():
            clusters[label].append(points[i])
        else:
            clusters[label] = [points[i]]

    centroids = []
    biggest_centroid_l = -float("inf")
    biggest_centroid = None

    for label, n_cluster in clusters.items():
        centroid = compute_centroid(n_cluster)
        centroids.append(centroid)

        if label >= 0 and len(n_cluster) >= biggest_centroid_l:
            biggest_centroid_l = len(n_cluster)
            biggest_centroid = centroid

    if biggest_centroid is None:
        biggest_centroid = compute_centroid(points)

    return biggest_centroid, cluster
Project: tianchi_power | Author: lvniqi
def classify_user():
    new_df_log_scaled = get_scaled_user()
    c = DBSCAN(eps=90,min_samples=50,metric='manhattan').fit(new_df_log_scaled.T)
    pd.value_counts(c.labels_)
    d = c.labels_
    types = pd.DataFrame(d,index=new_df_log_scaled.columns)[0]
    types[types == -1] = 2
    return types
Project: dmon-adp | Author: igabriel85
def detect(self, method, model, data):
        '''
        :param method: -> method name
        :param model: -> trained clusterer
        :param data: -> dataframe with data
        :return: -> dictionary that contains the list of anomalous timestamps
        '''
        smodel = self.__loadClusterModel(method, model)
        anomalieslist = []
        if not smodel:
            dpredict = 0
        else:
            if data.shape[0]:
                if isinstance(smodel, IsolationForest):
                    print "Detected IsolationForest model"
                    print "Contamination -> %s" % smodel.contamination
                    print "Max_Features -> %s" % smodel.max_features
                    print "Max_Samples -> %s" % smodel.max_samples_
                    print "Threashold -> %s " % smodel.threshold_
                    try:
                        dpredict = smodel.predict(data)
                        print "IsolationForest Prediction Array -> %s" %str(dpredict)
                    except Exception as inst:
                        logger.error('[%s] : [ERROR] Error while fitting isolationforest model to event with %s and %s',
                             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
                        dpredict = 0

                elif isinstance(smodel, DBSCAN):
                    print "Detected DBSCAN model"
                    print "Leaf_zise -> %s" % smodel.leaf_size
                    print "Algorithm -> %s" % smodel.algorithm
                    print "EPS -> %s" % smodel.eps
                    print "Min_Samples -> %s" % smodel.min_samples
                    print "N_jobs -> %s" % smodel.n_jobs
                    try:
                        dpredict = smodel.fit_predict(data)
                    except Exception as inst:
                        logger.error('[%s] : [ERROR] Error while fitting sDBSCAN model to event with %s and %s',
                                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
                                     inst.args)
                        dpredict = 0
            else:
                dpredict = 0
                logger.warning('[%s] : [WARN] Dataframe empty with shape (%s,%s)',
                             datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(data.shape[0]),
                             str(data.shape[1]))
                print "Empty dataframe received with shape (%s,%s)" % (str(data.shape[0]),
                             str(data.shape[1]))
            print "dpredict type is %s" % (type(dpredict))
            if type(dpredict) is not int:
                anomalyarray = np.argwhere(dpredict == -1)
                for an in anomalyarray:
                    anomalies = {}
                    anomalies['utc'] = int(data.iloc[an[0]]['key'])
                    anomalies['hutc'] = ut2hum(int(data.iloc[an[0]]['key']))
                    anomalieslist.append(anomalies)
        anomaliesDict = {}
        anomaliesDict['anomalies'] = anomalieslist
        logger.info('[%s] : [INFO] Detected anomalies with model %s using method %s are -> %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), model, method, str(anomaliesDict))
        return anomaliesDict
Project: information-extraction-PT | Author: davidsbatista
def main():

    """
    compute_embeddings_vectors()
    print "Reading embedding vectors"
    with open('triples_vectors.pkl', 'r') as in_file:
        triples = pickle.load(in_file)
    vectors = []
    for t in triples:
        vectors.append(t.vector)
    """

    text = []
    triples = []
    with open('triples.csv', 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        for t in reader:
            e1, e1_type, rel, e2, e2_type = t[0], t[1], t[2], t[3], t[4]
            t = Triple(e1, e1_type, rel, e2, e2_type)
            text.append(rel)
            triples.append(t)

    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(text)

    print "Clustering"
    dbscan = DBSCAN(eps=0.4, min_samples=15, metric='cosine', algorithm='brute',
                    leaf_size=30, p=None, n_jobs=1)
    labels = dbscan.fit_predict(tfidf_matrix)
    with open('triples_labels.txt', 'w') as out_file:
        for l in labels:
            out_file.write(str(l) + '\n')

    print "Reading cluster labels"
    labels = []
    with open('triples_labels.txt', 'r') as in_file:
        for label in in_file:
            labels.append(int(label.strip()))

    for i in range(len(triples)):
        triples[i].label = labels[i]

    clusters = dict()
    for t in triples:
        try:
            clusters[t.label] += 1
        except KeyError:
            clusters[t.label] = 1

    print clusters
    exit(-1)
    # print len(clusters)

    # top-terms for each cluster
    for x in range(-1, len(clusters)):
        print x, len(clusters[x])
        for t in triples:
            if t.label == str(x):
                print t.rel
        print
        print
Project: meleedb-segment | Author: sashahashi
def detect_match_chunks(self, max_error=.06):
        percent = cv2.imread("assets/pct.png")
        corr_series = []

        for (time, scene) in self.sample_frames(interval=self.polling_interval):
            cv2.imwrite("scene.png", scene)
            scene = cv2.imread("scene.png")

            scaled_percent = cv2.resize(
                percent, (0, 0), fx=self.scale, fy=self.scale)
            scaled_percent = cv2.Canny(scaled_percent, 50, 200)

            percent_corrs = []
            for port_number, roi in enumerate(self.ports):
                if roi is not None:
                    scene_roi = scene[roi.top:(roi.top + roi.height), roi.left:(roi.left + roi.width)]
                    scene_roi = cv2.Canny(scene_roi, 50, 200)

                    corr_map = cv2.matchTemplate(scene_roi, scaled_percent, cv2.TM_CCOEFF_NORMED)
                    _, max_corr, _, max_loc = cv2.minMaxLoc(corr_map)
                    percent_corrs.append(max_corr)

            point = [time, max(percent_corrs)]
            corr_series.append(point)

        corr_series = np.array(corr_series)

        medians = pd.rolling_median(corr_series[:, 1], self.min_gap //
                                    self.polling_interval, center=True)[2:-2]

        clusters = DBSCAN(eps=0.03, min_samples=10).fit(medians.reshape(-1, 1))

        dataframe = list(zip(corr_series[:, 0][2:-2], medians, clusters.labels_))

        labels = list(set(x[2] for x in dataframe))
        cluster_means = [sum(cluster) / len(cluster) for cluster in [[x[1] for x in dataframe if x[2] == label] for label in labels]]
        cluster_means = list(zip(labels, cluster_means))

        game_label = max(cluster_means, key=lambda x: x[1])[0]
        game_groups = [(k, list(v)) for k, v in groupby(dataframe, lambda pt: pt[2])]
        games = [[v[0][0], v[-1][0]] for k, v in game_groups if k == game_label]

        return games
Project: meleedb-segment | Author: sashahashi
def __detect_match_chunks(self, max_error=.04):
        percent = cv2.imread("assets/pct.png")
        corr_series = []

        for (time, scene) in spaced_frames(self, interval=self.polling_interval):
            cv2.imwrite("scene.png", scene)
            scene = cv2.imread("scene.png")

            scaled_percent = cv2.resize(
                percent, (0, 0), fx=self.scale, fy=self.scale)
            scaled_percent = cv2.Canny(scaled_percent, 50, 200)

            percent_corrs = []
            for port_number, roi in enumerate(self.ports):
                if roi is not None:
                    scene_roi = scene[roi.top:roi.bottom, roi.left:roi.right]
                    scene_roi = cv2.Canny(scene_roi, 50, 200)

                    corr_map = cv2.matchTemplate(
                        scene_roi, scaled_percent, cv2.TM_CCOEFF_NORMED)
                    _, max_corr, _, max_loc = cv2.minMaxLoc(corr_map)
                    percent_corrs.append(max_corr)

            point = [time, max(percent_corrs)]
            corr_series.append(point)

        corr_series = np.array(corr_series)

        def moving_average(series, n=5):
            return np.convolve(series, np.ones((n,)) / n, mode='valid')

        medians = rolling_median(corr_series[:, 1], self.min_gap // self.polling_interval, center=True)[2:-2]
        clusters = DBSCAN(eps=0.05, min_samples=10).fit(medians.reshape(-1, 1))

        centers = kmeans.cluster_centers_
        points = zip([time + (self.min_gap / 2)
                      for time, corr in corr_series], kmeans.labels_)

        # Throw out the lowest cluster
        groups = [(k, list(v))
                  for k, v in groupby(points, lambda pt: centers[pt[1]] > max(min(centers), .2))]
        games = [[v[0][0], v[-1][0]] for k, v in groups if k]

        return games
Project: icing | Author: slipguru
def define_clusts(similarity_matrix, threshold=0.05, max_iter=200,
                  method='ap'):
    """Define clusters given the similarity matrix and the threshold."""
    n, labels = connected_components(similarity_matrix, directed=False)
    prev_max_clust = 0
    print("connected components: %d" % n)
    clusters = labels.copy()

    if method == 'dbscan':
        ap = DBSCAN(metric='precomputed', min_samples=1, eps=.2, n_jobs=-1)
    if method == 'ap':
        ap = AffinityPropagation(affinity='precomputed', max_iter=max_iter,
                                 preference='median')

    for i in range(n):
        idxs = np.where(labels == i)[0]
        if idxs.shape[0] > 1:
            sm = similarity_matrix[idxs][:, idxs]
            sm += sm.T + scipy.sparse.eye(sm.shape[0])

            # Hierarchical clustering
            if method == 'hc':
                dists = squareform(1 - sm.toarray())
                links = fastcluster.linkage(dists, method='ward')
                try:
                    clusters_ = fcluster(links, threshold, 'distance')
                except ValueError as err:
                    logging.critical(err)
                    clusters_ = np.zeros(1, dtype=int)

            # DBSCAN
            elif method == 'dbscan':
                db = ap.fit(1. - sm.toarray())
                # Number of clusters in labels, ignoring noise if present.
                clusters_ = db.labels_
                # n_clusters_ = len(set(clusters_)) - int(0 in clusters_)

            # AffinityPropagation
            # ap = AffinityPropagation(affinity='precomputed')
            elif method == 'ap':
                db = ap.fit(sm)
                clusters_ = db.labels_
            else:
                raise ValueError("clustering method %s unknown" % method)

            if np.min(clusters_) == 0:
                clusters_ += 1
            clusters_ += prev_max_clust
            clusters[idxs] = clusters_
            prev_max_clust = max(clusters_)
        else:  # connected component contains just 1 element
            prev_max_clust += 1
            clusters[idxs] = prev_max_clust
    return np.array(extra.flatten(clusters))