The following 50 code examples, extracted from open-source Python projects, illustrate how to use sklearn.cluster.DBSCAN.
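Before the project examples, here is a minimal, self-contained sketch of typical DBSCAN usage; the toy data and the eps/min_samples values are illustrative assumptions and do not come from any of the projects below.

import numpy as np
from sklearn.cluster import DBSCAN

# Two small, well-separated point groups plus one outlier (toy data, assumed values).
X = np.array([[1.0, 1.1], [1.2, 0.9], [0.9, 1.0],
              [8.0, 8.1], [8.2, 7.9], [7.9, 8.0],
              [25.0, 25.0]])

# eps and min_samples are illustrative choices for this toy data.
db = DBSCAN(eps=0.5, min_samples=2).fit(X)

labels = db.labels_                      # -1 marks noise points
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print("labels:", labels)
print("clusters (excluding noise):", n_clusters)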
def cluster_points(coordinates, eps, min_samples, n_jobs=1):
    """Given coordinates, function returns the number of clusters in the set
    of coordinates and a list of integer labels corresponding to the input
    coordinate list

    Arguments:
        coordinates: a sequence of (lat, lon) tuples
        eps: the cluster size in radial degrees
        min_samples: the size of the smallest cluster
        n_jobs: number of CPUs to use to compute the clusters

    Returns:
        n_clusters: number of clusters
        labels: the labels of the clusters
    """
    db = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=n_jobs).fit(coordinates)
    return db
def dbFun(_x, _original_vals, f):
    db = DBSCAN(eps=0.3, min_samples=20).fit(_x)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    # print(labels)
    # Subtract the noise label (-1) from the cluster count if it is present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    # gettingCharacteristics(_x, core_samples_mask, labels, n_clusters_,
    #                        _original_vals)
    print("Wait plotting clusters.....")
    plotCluster(_x, labels, core_samples_mask, n_clusters_, f)
    return

##############################################################################
# Plotting the cluster after the result of DBSCAN
def dbscan(fig):
    global X_iris, geo
    ax = fig.add_subplot(geo + 5, projection='3d', title='dbscan')
    dbscan = cluster.DBSCAN()
    dbscan.fit(X_iris)
    res = dbscan.labels_
    core = dbscan.core_sample_indices_
    print repr(core)
    size = [5 if i not in core else 40 for i in range(len(X_iris))]
    print repr(size)
    for n, i in enumerate(X_iris):
        ax.scatter(*i[:3], s=size[n], c='bgrcmyk'[res[n] % 7],
                   alpha=0.8, marker='o')
    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    ax.set_zlabel('Z Label')
    return res
def cluster_dbscan(self, image_cols):
    print 'DBSCAN'
    # TODO handle outliers/noise
    # Look at different metrics?
    db = DBSCAN(eps=self.params.epsilon, min_samples=10, metric='euclidean')
    db.fit(image_cols)
    # from IPython import embed; embed(); import ipdb; ipdb.set_trace()
    self.number_of_clusters = np.max(db.labels_) + 1  # Ignore -1 cluster, it's noise
    print 'number of clusters', self.number_of_clusters

    # Clusters
    centers = np.zeros((self.number_of_clusters, 3))
    for i in range(0, self.number_of_clusters):
        cluster_points = image_cols[db.labels_ == i]
        cluster_mean = np.mean(cluster_points, axis=0)
        centers[i, :] = cluster_mean

    return centers
def train(self, data, sample_weight=None):
    """
    :type data: pyspark.RDD
    :param data: (key, k-dim vector like)
    Train the model using a (key, vector) RDD
    """
    parts = KDPartitioner(data, self.max_partitions)
    self.data = data
    self.bounding_boxes = parts.bounding_boxes
    self.expanded_boxes = {}
    self._create_neighborhoods()
    # repartition data set on the partition label
    self.data = self.data.map(lambda ((k, p), v): (p, (k, v))) \
        .partitionBy(len(parts.partitions)) \
        .map(lambda (p, (k, v)): ((k, p), v))
    # create parameters for sklearn DBSCAN
    params = self.dbscan_params or {
        'eps': self.eps,
        'min_samples': self.min_samples,
        'metric': self.metric}
    # perform dbscan on each part
    self.data = self.data.mapPartitions(
        lambda iterable: dbscan_partition(iterable, params, sample_weight))
    self.data.cache()
    self._remap_cluster_ids()
def __init__(self, ompath, density=4.0):
    """
    :param ompath: path of the mesh template

    author: weiwei
    date: 20170711
    """
    cadtemp = CADTemp.CADTemp(ompath=ompath, density=density)
    self.objnp = pg.packpandanp(cadtemp.objtrimesh.vertices,
                                cadtemp.objtrimesh.face_normals,
                                cadtemp.objtrimesh.faces,
                                name='')
    self.temppnt = cadtemp.pcdtemp
    self.kinect = PyKinectRuntime.PyKinectRuntime(PyKinectV2.FrameSourceTypes_Depth)
    self.dbscan = DBSCAN(eps=50, min_samples=100, n_jobs=-1)
    self.randsac = linear_model.RANSACRegressor(linear_model.LinearRegression(),
                                                residual_threshold=15)
    self.tablepnt = []
    self.objectpnt = []
def process(self, obj_data):
    '''
    Run DBScan on data. Stores result in data wrapper

    @param obj_data: Data wrapper to be processed
    '''
    epsilon = self.ap_paramList[0]()
    min_points = self.ap_paramList[1]()

    results = dict()
    for label, data in obj_data.getIterator():
        results[label] = DBSCAN(eps=epsilon, min_samples=min_points).fit_predict(
            data.loc[:, self.column_names])

    obj_data.addResult(self.str_description, results)
def test_clusterer_enforcement(self):
    """
    Assert that only clustering estimators can be passed to cluster viz
    """
    nomodels = [
        SVC, SVR, Ridge, RidgeCV, LinearRegression, RandomForestClassifier
    ]
    for nomodel in nomodels:
        with self.assertRaises(YellowbrickTypeError):
            visualizer = ClusteringScoreVisualizer(nomodel())

    models = [
        KMeans, MiniBatchKMeans, AffinityPropagation, MeanShift, DBSCAN, Birch
    ]
    for model in models:
        try:
            visualizer = ClusteringScoreVisualizer(model())
        except YellowbrickTypeError:
            self.fail("could not pass clustering estimator to visualizer")
def fit(self, model, n_clusters=5):
    """
    Fits clusters to the feature set using a Kmeans model.

    Input: n_clusters (int) number of clusters to use during clustering
    Output: None
    """
    self.n_clusters = n_clusters
    scaler = StandardScaler()
    self.features = scaler.fit_transform(self.features)
    if model == 'kmeans':
        self.model = KMeans(self.n_clusters)
    elif model == 'DBSCAN':
        self.model = DBSCAN(eps=0.3, min_samples=3)
    self.cluster_fit = self.model.fit(self.features)
    print('-- Running clustering on {} piece collection --'
          .format(self.n_artworks))
def newDBSCANModel(vectorFile, outputFile):
    model = Doc2Vec.load("Models\\" + vectorFile)
    vecs = []
    for doc in range(0, len(model.docvecs)):
        doc_vec = model.docvecs[doc]
        # print doc_vec
        vecs.append(doc_vec.reshape((1, 300)))
    doc_vecs = np.array(vecs, dtype='float')  # TSNE expects float type values
    # print doc_vecs
    docs = []
    for i in doc_vecs:
        docs.append(i[0])
    db = DBSCAN(eps=0.03, algorithm="brute", metric='cosine').fit(docs)
    joblib.dump(db, outputFile)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    clusters = db.labels_.tolist()
    cluster_info = {'labels': model.docvecs.offset2doctag,
                    "index, wordcount and repeated words": [model.docvecs.doctags[x] for x in model.docvecs.offset2doctag],
                    'clusters': clusters}
    sentenceDF = pd.DataFrame(cluster_info, index=[clusters],
                              columns=['labels', "index, wordcount and repeated words", 'clusters'])
    print(sentenceDF)
    sentenceDF.to_csv("DBSCAN.csv")
    print('Estimated number of clusters: %d' % n_clusters_)
def dbscan_partition(iterable, params):
    """
    :type iterable: iter
    :param iterable: iterator yielding ((key, partition), vector)
    :type params: dict
    :param params: dictionary containing sklearn DBSCAN parameters
    :rtype: iter
    :return: ((key, cluster_id), v)
    Performs a DBSCAN on a given partition of the data
    """
    # read iterable into local memory
    data = list(iterable)
    (key, part), vector = data[0]
    x = np.array([v for (_, __), v in data])
    y = np.array([k for (k, _), __ in data])
    # perform DBSCAN
    model = skc.DBSCAN(**params)
    c = model.fit_predict(x)
    cores = set(model.core_sample_indices_)
    # yield (key, cluster_id), non-core samples labeled with *
    for i in xrange(len(c)):
        flag = '' if i in cores else '*'
        yield (y[i], '%i:%i%s' % (part, c[i], flag))
def DBSCAN_cluster(psi_matrix, eventid_lst, dist, minpts, metric):
    # Setting logging preferences
    logger = logging.getLogger(__name__)

    # The "cosine" metric only works with the "brute" algorithm
    if metric == "cosine":
        alg = 'brute'
    else:
        alg = 'auto'

    try:
        db = DBSCAN(eps=dist, min_samples=minpts, metric=metric, algorithm=alg).fit(psi_matrix)
        labels = db.labels_
    except:
        logger.error("Unknown error: {}".format(sys.exc_info()))
        sys.exit(1)

    eventid_labels_dict = {k: v for k, v in zip(eventid_lst, labels)}

    return eventid_labels_dict, labels
def cluster_analysis(dpsi, psivec, sig_threshold, dpsi_threshold, eps, minpts, metric,
                     indexes, clustering, separation, output):
    path = os.path.dirname(os.path.realpath(dpsi))
    os.chdir(path)
    psi_matrix, eventid_lst = process_cluster_input(dpsi, psivec, sig_threshold,
                                                    dpsi_threshold, indexes)
    if clustering == "DBSCAN":
        eventid_labels_dict, labels = DBSCAN_cluster(psi_matrix, eventid_lst, eps, minpts, metric)
        # eventid_labels_dict are the labels of the clustering for each event
        write_averaged_cluster_output(psi_matrix, eventid_lst, eventid_labels_dict, output)
        calculate_cluster_scores(psi_matrix, labels, output)
    else:
        # OPTICS
        points_list = create_points_list(psi_matrix, eventid_lst)  # Transform the points on psi_matrix to Points from optics.py
        optics = Optics(points_list, eps, minpts)  # Maximum radius to be considered, cluster size >= 2 points
        optics.run()  # run the algorithm
        clusters = optics.cluster(separation)  # minimum threshold for clustering (upper limit to separate the clusters)
        eventid_labels_dict, labels = generate_labels(clusters, eventid_lst)
        write_averaged_cluster_output(psi_matrix, eventid_lst, eventid_labels_dict, output)
        calculate_cluster_scores(psi_matrix, labels, output)
def makeDBScan(X=None, k=-1):
    return cluster.DBSCAN(eps=.2)
def sts_matrix_generator(ind, slope_matrix):
    """Work-horse function. Computes the short time-series (STS) distance
    for an index, ind of the slope matrix.

    Parameters
    ----------
    ind: int
        The index of the slope matrix that is being computed.
    slope_matrix: np.matrix
        The slope matrix.

    Returns
    -------
    (ind, dists): ind is the index and dists is a np.matrix containing the
        STS distances
    """
    mx = slope_matrix[ind, :]
    mv = slope_matrix[ind:, :]
    mx_rep = np.vstack((mx,) * mv.shape[0])
    diff = mx_rep - mv
    diff = np.square(diff)
    sts_squared = diff.sum(axis=1)
    dists = np.sqrt(sts_squared)
    return (ind, dists)


# DBSCAN from scikit learn
def cluster_dbscan(matrix, distance_measure="sts", eps=1):
    """Clusters the distance matrix for a given epsilon value, if distance
    measure is sts. Other distance measures are: ['cityblock', 'cosine',
    'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra',
    'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
    'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao',
    'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'].

    Parameters
    ----------
    matrix: np.matrix
        The input matrix. If distance measure is sts, this should be the sts
        distance matrix. If other distance, this should be the time-series
        matrix of size ngenes x nsamples.
    distance_measure: str
        The distance measure, default is sts, short time-series distance.
        Any distance measure available in scikit-learn is available here.
        Note: multiple time-series is NOT supported for distances other
        than "sts".

    Returns
    -------
    cluster_labels: list of int
        A list of size ngenes that defines cluster membership.
    """
    if distance_measure == "sts":
        dbs = DBSCAN(eps=eps, metric='precomputed', min_samples=2)
    else:
        dbs = DBSCAN(eps=eps, metric=distance_measure, min_samples=2)

    cluster_labels = dbs.fit_predict(matrix)
    return cluster_labels
def dbFun(_x, _original_vals, f):
    db = DBSCAN(eps=0.3, min_samples=20).fit(_x)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    # print(labels)
    # Subtract the noise label (-1) from the cluster count if it is present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    # gettingCharacteristics(_x, core_samples_mask, labels, n_clusters_,
    #                        _original_vals)
    print("Wait plotting clusters.....")
    plotCluster(_x, labels, core_samples_mask, n_clusters_, f)
    return
def demo_printing_picture(anomaly_file, prefix, rgb_directory, pre_prefix, dir, file_name):
    # clusters = webDemo.main(anomaly_file,
    #                         "D:\\ifruitly_junk\\results\\result.jpg")
    clusters = v_demo(anomaly_file, prefix, pre_prefix, file_name, dir)
    return

##############################################################################
# Running the DBSCAN for output
def db_scan(data, eps, min_samples, metric):
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit(data)
    print 'DBSCAN'
    print metrics.silhouette_score(data, dbscan.labels_)
    print collections.Counter(dbscan.labels_)
    reduced_data = reduce_with_pca(data)
    plot_2d_data(reduced_data, dbscan.labels_)
def sdbscanTrain(self, settings, mname, data):
    '''
    :param data: -> dataframe with data
    :param settings: -> settings dictionary
    :param mname: -> name of serialized clusterer
    :return: -> clusterer
    :example settings: -> {eps: 0.9, min_samples: 10, metric: 'euclidean',
                           algorithm: 'auto', leaf_size: 30, p: 0.2, n_jobs: 1}
    '''
    for k, v in settings.iteritems():
        logger.info('[%s] : [INFO] SDBSCAN %s set to %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v)
        print "SDBSCAN %s set to %s" % (k, v)
    sdata = StandardScaler().fit_transform(data)
    try:
        db = DBSCAN(eps=float(settings['eps']), min_samples=int(settings['min_samples']),
                    metric=settings['metric'], algorithm=settings['algorithm'],
                    leaf_size=int(settings['leaf_size']), p=float(settings['p']),
                    n_jobs=int(settings['n_jobs'])).fit(sdata)
    except Exception as inst:
        logger.error('[%s] : [ERROR] Cannot instantiate sDBSCAN with %s and %s',
                     datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                     type(inst), inst.args)
        print "Error while instantiating sDBSCAN with %s and %s" % (type(inst), inst.args)
        sys.exit(1)
    labels = db.labels_
    print labels
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print 'Estimated number of clusters: %d' % n_clusters_
    self.__serializemodel(db, 'sdbscan', mname)
    return db
def __init__(self, image, colour_space='hsv', cluster_method='ward', scale=None,
             num_clusters=None, quantile=None):
    self.image = image
    self.colour_space = colour_space
    self.cluster_method = cluster_method
    self.params = Parameters()

    # Scaling colour space
    if scale is None:
        self.params.scale = (1, 1, 1)
    else:
        # TODO validate 3 float tuple
        self.params.scale = scale

    # K-means param
    if num_clusters is None:
        self.params.num_clusters = 8
    else:
        # TODO validate
        self.params.num_clusters = int(num_clusters)

    # Mean-shift param
    if quantile is None:
        self.params.quantile = 0.1
    else:
        self.params.quantile = float(quantile)

    # DBSCAN param
    # if epsilon is None:
    self.params.epsilon = 255 * 0.1

    # Log
    h, w = self.image.shape[:2]
    msg = 'Clustering a {}x{} image: cluster_method={} colour_space={} num_clusters={} quantile={}'.format(
        w, h, cluster_method, colour_space, num_clusters, quantile)
    print msg
def dbscan(self, n_clusters=None, eps=0.5, min_samples=10, algorithm='auto',
           leaf_size=30):
    """
    Perform DBSCAN clustering

    This can also be used for Duplicate Detection (when ep

    Parameters
    ----------
    n_clusters : int
        number of clusters # not used just present for compatibility
    lsi_components : int
        apply LSA before the clustering algorithm
    eps : float
        The maximum distance between two samples for them to be considered
        as in the same neighborhood.
    min_samples : int
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.
    """
    from sklearn.cluster import DBSCAN
    pars = {'is_hierarchical': False, "metric": self.metric}
    km = DBSCAN(eps=eps, min_samples=min_samples, algorithm=algorithm,
                leaf_size=leaf_size)

    return self._cluster_func(n_clusters, km, pars)
def main():
    centers = get_list('out_center.txt')
    labels = get_list('142-label.txt')
    judge(centers, labels)

    n_class = int(len(centers) * 0.18)
    est = KMeans(n_clusters=n_class, max_iter=1000)
    est.fit(centers)
    new_list = []
    for x, y in est.cluster_centers_:
        min_num = 10000
        min_x = -1
        min_y = -1
        for x_, y_ in centers:
            dist = distance(x, y, x_, y_)
            if (dist < min_num) or (min_x == -1):
                min_num = dist
                min_x = x_
                min_y = y_
        new_list.append([min_x, min_y])
    judge(new_list, labels)
    judge(est.cluster_centers_, labels)

    # db = DBSCAN(eps=0.3, min_samples=180).fit(centers)
    # print(db.core_sample_indices_)
    # judge(new_list, labels)
    # print(est.cluster_centers_)
    # save_list('result.txt', est.cluster_centers_)
    # af = AffinityPropagation(preference=180).fit(centers)
    # judge(af.cluster_centers_, labels)
def dbscan_partition(iterable, params, sample_weight=None):
    """
    :type iterable: iter
    :param iterable: iterator yielding ((key, partition), vector)
    :type params: dict
    :param params: dictionary containing sklearn DBSCAN parameters
    :rtype: iter
    :return: ((key, cluster_id), v)
    Performs a DBSCAN on a given partition of the data
    """
    # read iterable into local memory
    data = list(iterable)
    (key, part), vector = data[0]
    x = np.array([v for (_, __), v in data])
    y = np.array([k for (k, _), __ in data])
    # perform DBSCAN
    model = skc.DBSCAN(**params)
    # import sys
    # print(model, file=sys.stderr)
    weights = [sample_weight[k[0]] for k in x]
    c = model.fit_predict(x, sample_weight=weights)
    cores = set(model.core_sample_indices_)
    # yield (key, cluster_id), non-core samples labeled with *
    for i in xrange(len(c)):
        flag = '' if i in cores else '*'
        yield (y[i], '%i:%i%s' % (part, c[i], flag))
def dbscan(points, eps, min_samples):
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(points)  # eps=5 min_samples = 80

    # Labeling pixels by cluster
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # Creating list of clusters
    return [points[labels == i] for i in xrange(n_clusters_)]
def cluster_texts(textdict, eps=0.45, min_samples=3):
    """
    cluster the given texts

    Input:
        textdict: dictionary with {docid: text}
    Returns:
        doccats: dictionary with {docid: cluster_id}
    """
    doc_ids = list(textdict.keys())
    # transform texts into length normalized kpca features
    ft = FeatureTransform(norm='max', weight=True, renorm='length', norm_num=False)
    docfeats = ft.texts2features(textdict)
    X, featurenames = features2mat(docfeats, doc_ids)
    e_lkpca = KernelPCA(n_components=250, kernel='linear')
    X = e_lkpca.fit_transform(X)
    xnorm = np.linalg.norm(X, axis=1)
    X = X / xnorm.reshape(X.shape[0], 1)
    # compute cosine similarity
    D = 1. - linear_kernel(X)
    # and cluster with dbscan
    clst = DBSCAN(eps=eps, metric='precomputed', min_samples=min_samples)
    y_pred = clst.fit_predict(D)
    return {did: y_pred[i] for i, did in enumerate(doc_ids)}
def getRotMat(verts):
    """
    find the table and do calibration

    :param verts: see depthToXYZ
    :return:

    author: weiwei
    date: 20170711
    """
    cutverts = []
    for vert in verts:
        if vert[0] < 700.0 and vert[0] > -700.0:
            if vert[1] < 200.0 and vert[1] > -600.0:
                if vert[2] < -1000.0 and vert[2] > -1500.0:
                    cutverts.append([vert[0], vert[1], vert[2]])

    # clustering using DBSCAN
    X = np.array(cutverts)
    db = DBSCAN(eps=20, min_samples=100, n_jobs=-1).fit(X)
    print db.labels_
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    returnvertslist = []
    unique_labels = set(labels)
    for k in unique_labels:
        class_member_mask = (labels == k)
        print class_member_mask, core_samples_mask
        xyzlist = X[class_member_mask & core_samples_mask]
        print xyzlist
        returnvertslist.append(xyzlist.tolist())
    return returnvertslist
    # return verts
def __init__(self):
    """
    Kinect interface

    author: weiwei
    date: 20170715
    """
    self.kinect = PyKinectRuntime.PyKinectRuntime(PyKinectV2.FrameSourceTypes_Depth)
    self.dbscan = DBSCAN(eps=50, min_samples=100, n_jobs=-1)
    self.randsac = linear_model.RANSACRegressor(linear_model.LinearRegression(),
                                                residual_threshold=15)
def _get_dbscan(parameters):
    if parameters is None:
        parameters = {}
    return DBSCAN(**parameters)
def clusterMalwareNames(malwareNames):
    # strictly lexical clustering over malware-names
    wordCount = {}

    # create a distance matrix
    matrix = np.zeros((len(malwareNames), len(malwareNames)))
    for i in range(len(malwareNames)):
        for j in range(len(malwareNames)):
            if matrix[i, j] == 0.0:
                matrix[i, j] = computeSimilarity(malwareNames[i], malwareNames[j])
                matrix[j, i] = matrix[i, j]

    # Scikit-Learn's DBSCAN implementation to cluster the malware-names
    clust = DBSCAN(eps=0.1, min_samples=5, metric="precomputed")
    clust.fit(matrix)

    preds = clust.labels_
    clabels = np.unique(preds)

    # create Word-Count Map
    for i in range(clabels.shape[0]):
        if clabels[i] < 0:
            continue

        cmem_ids = np.where(preds == clabels[i])[0]
        cmembers = []

        for cmem_id in cmem_ids:
            cmembers.append(malwareNames[cmem_id])

        wordCount[", ".join(uniqueList(cmembers))] = len(cmem_ids)
    return wordCount
def test_DBSCAN(*data):
    '''
    test the DBSCAN method

    :param data: train, target
    :return: None
    '''
    X, labels_true = data
    clst = cluster.DBSCAN()
    predicted_labels = clst.fit_predict(X)
    print("ARI:%s" % adjusted_rand_score(labels_true, predicted_labels))
    print("Core sample num:{0}".format(len(clst.core_sample_indices_)))
def test_DBSCAN_epsilon(*data):
    '''
    test the score with different eps

    :param data: train, target
    :return: None
    '''
    X, labels_true = data
    epsilons = np.logspace(-1, 1.5)
    ARIs = []
    Core_nums = []
    for epsilon in epsilons:
        clst = cluster.DBSCAN(eps=epsilon)
        predicted_labels = clst.fit_predict(X)
        ARIs.append(adjusted_rand_score(labels_true, predicted_labels))
        Core_nums.append(len(clst.core_sample_indices_))

    ## graph
    fig = plt.figure()
    ax = fig.add_subplot(1, 2, 1)
    ax.plot(epsilons, ARIs, marker='+')
    ax.set_xscale('log')
    ax.set_xlabel(r"$\epsilon$")
    ax.set_ylim(0, 1)
    ax.set_ylabel('ARI')

    ax = fig.add_subplot(1, 2, 2)
    ax.plot(epsilons, Core_nums, marker='o')
    ax.set_xscale('log')
    ax.set_xlabel(r"$\epsilon$")
    ax.set_ylabel('Core_Nums')

    fig.suptitle("DBSCAN")
    plt.show()
def test_DBSCAN_min_samples(*data):
    '''
    test the score with different min_samples

    :param data: train, target
    :return: None
    '''
    X, labels_true = data
    min_samples = range(1, 100)
    ARIs = []
    Core_nums = []
    for num in min_samples:
        clst = cluster.DBSCAN(min_samples=num)
        predicted_labels = clst.fit_predict(X)
        ARIs.append(adjusted_rand_score(labels_true, predicted_labels))
        Core_nums.append(len(clst.core_sample_indices_))

    ## graph
    fig = plt.figure()
    ax = fig.add_subplot(1, 2, 1)
    ax.plot(min_samples, ARIs, marker='+')
    ax.set_xlabel("min_samples")
    ax.set_ylim(0, 1)
    ax.set_ylabel('ARI')

    ax = fig.add_subplot(1, 2, 2)
    ax.plot(min_samples, Core_nums, marker='o')
    ax.set_xlabel("min_samples")
    ax.set_ylabel('Core_Nums')

    fig.suptitle("DBSCAN")
    plt.show()
def runClustering(ssearch, eps, min_samples):
    """
    Run DBSCAN with the determined eps and MinPts values.
    """
    print('Clustering all documents with DBSCAN, eps=%0.2f min_samples=%d' % (eps, min_samples))

    # Initialize DBSCAN with parameters.
    # I forgot to use cosine at first!
    db = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine', algorithm='brute')

    # Time this step.
    t0 = time.time()

    # Cluster the LSI vectors.
    db.fit(ssearch.index.index)

    # Calculate the elapsed time (in seconds)
    elapsed = (time.time() - t0)
    print(" done in %.3fsec" % elapsed)

    # Get the set of unique IDs.
    cluster_ids = set(db.labels_)

    # Show the number of clusters (don't include noise label)
    print('Number of clusters (excluding "noise"): %d' % (len(cluster_ids) - 1))

    # For each of the clusters...
    for cluster_id in cluster_ids:
        # Get the list of all doc IDs belonging to this cluster.
        cluster_doc_ids = []
        for doc_id in range(0, len(db.labels_)):
            if db.labels_[doc_id] == cluster_id:
                cluster_doc_ids.append(doc_id)

        # Get the top words in this cluster
        top_words = ssearch.getTopWordsInCluster(cluster_doc_ids)

        print(' Cluster %d: (%d docs) %s' % (cluster_id, len(cluster_doc_ids), " ".join(top_words)))
def main():
    """
    Entry point for the script.
    """
    ###########################################################################
    # Load the corpus
    ###########################################################################

    # Load the pre-built corpus.
    print('Loading the saved SimSearch and corpus...')
    (ksearch, ssearch) = SimSearch.load(save_dir='./mhc_corpus/')

    print ' %d documents.' % len(ssearch.index.index)

    # Step 1: Run a technique to find a good 'eps' value.
    #findEps(ssearch)
    #eps = 0.5
    eps = 0.44

    # Step 2: Run a technique to find a good 'MinPts' value.
    # TODO - This took ~17 min. on my desktop!
    #findMinPts(ssearch, eps)
    #min_samples = 8
    min_samples = 4

    # Step 3: Run DBSCAN
    runClustering(ssearch, eps, min_samples)
def dbscan(userid, X):
    db = DBSCAN(eps=0.15, min_samples=4).fit(X)
    # print db.labels_ zeros_like
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    lables = db.labels_
    labels_list = list(lables)
    # print labels_list.count(-1)
    out_user.setdefault(userid, 0)
    out_user[userid] = labels_list.count(-1)
    print out_user
    # print labels_list.index(-1)
    print lables
    n_clusters_ = len(set(lables)) - (1 if -1 in lables else 0)
    unique_lables = set(lables)
    cols = plt.cm.Spectral(np.linspace(0, 1, len(unique_lables)))
    # center_points = []
    for k, col in zip(unique_lables, cols):
        if k == -1:
            col = 'k'
        class_member_mask = (lables == k)
        k_x = X[class_member_mask & core_samples_mask]
        plt.plot(k_x[:, 0], k_x[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=5)
        center_points.append([np.mean(k_x[:, 1]), np.mean(k_x[:, 0])])
    plt.title('DBSCAN :Estimated number of clusters: %d' % n_clusters_)
    # plt.show()
def __init__(self, filterer=PCA(n_components=2),
             coverer=HyperRectangleCoverer(),
             clusterer=DBSCAN(),
             params=None):
    self.filterer = filterer
    self.coverer = coverer
    self.clusterer = clusterer
    if params is not None:
        self.set_params(**params)
def set_random_state(estimator, random_state=0):
    """Set random state of an estimator if it has the `random_state` param.

    Classes for whom random_state is deprecated are ignored. Currently DBSCAN
    is one such class.
    """
    if isinstance(estimator, DBSCAN):
        return

    if "random_state" in estimator.get_params():
        estimator.set_params(random_state=random_state)
def train(self, data):
    """
    :type data: pyspark.RDD
    :param data: (key, k-dim vector like)
    Train the model using a (key, vector) RDD
    """
    parts = KDPartitioner(data, self.max_partitions)
    self.data = data
    self.bounding_boxes = parts.bounding_boxes
    self.expanded_boxes = {}
    self._create_neighborhoods()
    # repartition data set on the partition label
    self.data = self.data.map(lambda ((k, p), v): (p, (k, v))) \
        .partitionBy(len(parts.partitions)) \
        .map(lambda (p, (k, v)): ((k, p), v))
    # create parameters for sklearn DBSCAN
    params = {'eps': self.eps, 'min_samples': self.min_samples,
              'metric': self.metric}
    # perform dbscan on each part
    self.data = self.data.mapPartitions(
        lambda iterable: dbscan_partition(iterable, params))
    self.data.cache()
    self._remap_cluster_ids()
def assignments(self):
    """
    :rtype: list
    :return: list of (key, cluster_id)
    Retrieve the results of the DBSCAN
    """
    return self.result.collect()
def makeClusterers(X, k=2):
    return [('MiniBatchKMeans', makeKMeans(X, k)),
            ('AffinityPropagation', makeAffinityProp()),
            ('MeanShift', makeMeanShift(X)),
            ('SpectralClustering', makeSpectral(X, k)),
            ('Ward', makeWard(X, k)),
            ('AgglomerativeAvg', makeAvgLinkage(X, k)),
            ('AgglomerativeMax', makeMaxLinkage(X, k)),
            ('AgglomerativeWard', makeWardLinkage(X, k)),
            ('DBSCAN', makeDBScan())]
def cluster(X, eps=1, min_pts=30, algorithm='DBSCAN', n_clusters=10):
    if algorithm == 'DBSCAN':
        cluster_result = DBSCAN(eps=eps, min_samples=min_pts).fit(X)
    elif algorithm == 'KMeans':
        # KMeans must be fitted before its labels_ attribute is available.
        cluster_result = KMeans(n_clusters=n_clusters).fit(X)
    labels = cluster_result.labels_
    return labels
def update_location_centroid(point, cluster, max_distance, min_samples):
    """ Updates the centroid of a location cluster with another point

    Args:
        point (:obj:`Point`): Point to add to the cluster
        cluster (:obj:`list` of :obj:`Point`): Location cluster
        max_distance (float): Max neighbour distance
        min_samples (int): Minimum number of samples

    Returns:
        (:obj:`Point`, :obj:`list` of :obj:`Point`): Tuple with the location centroid
            and new point cluster (given cluster + given point)
    """
    cluster.append(point)
    points = [p.gen2arr() for p in cluster]

    # Estimates the epsilon
    eps = estimate_meters_to_deg(max_distance, precision=6)

    p_cluster = DBSCAN(eps=eps, min_samples=min_samples)
    p_cluster.fit(points)

    clusters = {}
    for i, label in enumerate(p_cluster.labels_):
        if label in clusters.keys():
            clusters[label].append(points[i])
        else:
            clusters[label] = [points[i]]

    centroids = []
    biggest_centroid_l = -float("inf")
    biggest_centroid = None

    for label, n_cluster in clusters.items():
        centroid = compute_centroid(n_cluster)
        centroids.append(centroid)

        if label >= 0 and len(n_cluster) >= biggest_centroid_l:
            biggest_centroid_l = len(n_cluster)
            biggest_centroid = centroid

    if biggest_centroid is None:
        biggest_centroid = compute_centroid(points)

    return biggest_centroid, cluster
def classify_user():
    new_df_log_scaled = get_scaled_user()
    c = DBSCAN(eps=90, min_samples=50, metric='manhattan').fit(new_df_log_scaled.T)
    pd.value_counts(c.labels_)
    d = c.labels_
    types = pd.DataFrame(d, index=new_df_log_scaled.columns)[0]
    types[types == -1] = 2
    return types
def detect(self, method, model, data):
    '''
    :param method: -> method name
    :param model: -> trained clusterer
    :param data: -> dataframe with data
    :return: -> dictionary that contains the list of anomalous timestamps
    '''
    smodel = self.__loadClusterModel(method, model)
    anomalieslist = []
    if not smodel:
        dpredict = 0
    else:
        if data.shape[0]:
            if isinstance(smodel, IsolationForest):
                print "Detected IsolationForest model"
                print "Contamination -> %s" % smodel.contamination
                print "Max_Features -> %s" % smodel.max_features
                print "Max_Samples -> %s" % smodel.max_samples_
                print "Threshold -> %s " % smodel.threshold_
                try:
                    dpredict = smodel.predict(data)
                    print "IsolationForest Prediction Array -> %s" % str(dpredict)
                except Exception as inst:
                    logger.error('[%s] : [ERROR] Error while fitting isolationforest model to event with %s and %s',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                                 type(inst), inst.args)
                    dpredict = 0
            elif isinstance(smodel, DBSCAN):
                print "Detected DBSCAN model"
                print "Leaf_size -> %s" % smodel.leaf_size
                print "Algorithm -> %s" % smodel.algorithm
                print "EPS -> %s" % smodel.eps
                print "Min_Samples -> %s" % smodel.min_samples
                print "N_jobs -> %s" % smodel.n_jobs
                try:
                    dpredict = smodel.fit_predict(data)
                except Exception as inst:
                    logger.error('[%s] : [ERROR] Error while fitting sDBSCAN model to event with %s and %s',
                                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                                 type(inst), inst.args)
                    dpredict = 0
        else:
            dpredict = 0
            logger.warning('[%s] : [WARN] Dataframe empty with shape (%s,%s)',
                           datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                           str(data.shape[0]), str(data.shape[1]))
            print "Empty dataframe received with shape (%s,%s)" % (str(data.shape[0]), str(data.shape[1]))
        print "dpredict type is %s" % (type(dpredict))
    if type(dpredict) is not int:
        anomalyarray = np.argwhere(dpredict == -1)
        for an in anomalyarray:
            anomalies = {}
            anomalies['utc'] = int(data.iloc[an[0]]['key'])
            anomalies['hutc'] = ut2hum(int(data.iloc[an[0]]['key']))
            anomalieslist.append(anomalies)
    anomaliesDict = {}
    anomaliesDict['anomalies'] = anomalieslist
    logger.info('[%s] : [INFO] Detected anomalies with model %s using method %s are -> %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                model, method, str(anomaliesDict))
    return anomaliesDict
def main():
    """
    compute_embeddings_vectors()
    print "Reading embedding vectors"
    with open('triples_vectors.pkl', 'r') as in_file:
        triples = pickle.load(in_file)
    vectors = []
    for t in triples:
        vectors.append(t.vector)
    """
    text = []
    triples = []
    with open('triples.csv', 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        for t in reader:
            e1, e1_type, rel, e2, e2_type = t[0], t[1], t[2], t[3], t[4]
            t = Triple(e1, e1_type, rel, e2, e2_type)
            text.append(rel)
            triples.append(t)

    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(text)

    print "Clustering"
    dbscan = DBSCAN(eps=0.4, min_samples=15, metric='cosine', algorithm='brute',
                    leaf_size=30, p=None, n_jobs=1)
    labels = dbscan.fit_predict(tfidf_matrix)
    with open('triples_labels.txt', 'w') as out_file:
        for l in labels:
            out_file.write(str(l) + '\n')

    print "Reading cluster labels"
    labels = []
    with open('triples_labels.txt', 'r') as in_file:
        for label in in_file:
            labels.append(int(label.strip()))

    for i in range(len(triples)):
        triples[i].label = labels[i]

    clusters = dict()
    for t in triples:
        try:
            clusters[t.label] += 1
        except KeyError:
            clusters[t.label] = 1

    print clusters
    exit(-1)

    # print len(clusters)
    # top-terms for each cluster
    for x in range(-1, len(clusters)):
        print x, len(clusters[x])
        for t in triples:
            if t.label == str(x):
                print t.rel
        print
        print
def detect_match_chunks(self, max_error=.06):
    percent = cv2.imread("assets/pct.png")
    corr_series = []

    for (time, scene) in self.sample_frames(interval=self.polling_interval):
        cv2.imwrite("scene.png", scene)
        scene = cv2.imread("scene.png")

        scaled_percent = cv2.resize(
            percent, (0, 0), fx=self.scale, fy=self.scale)
        scaled_percent = cv2.Canny(scaled_percent, 50, 200)

        percent_corrs = []
        for port_number, roi in enumerate(self.ports):
            if roi is not None:
                scene_roi = scene[roi.top:(roi.top + roi.height),
                                  roi.left:(roi.left + roi.width)]
                scene_roi = cv2.Canny(scene_roi, 50, 200)

                corr_map = cv2.matchTemplate(scene_roi, scaled_percent,
                                             cv2.TM_CCOEFF_NORMED)
                _, max_corr, _, max_loc = cv2.minMaxLoc(corr_map)
                percent_corrs.append(max_corr)

        point = [time, max(percent_corrs)]
        corr_series.append(point)

    corr_series = np.array(corr_series)

    medians = pd.rolling_median(corr_series[:, 1],
                                self.min_gap // self.polling_interval,
                                center=True)[2:-2]

    clusters = DBSCAN(eps=0.03, min_samples=10).fit(medians.reshape(-1, 1))

    dataframe = list(zip(corr_series[:, 0][2:-2], medians, clusters.labels_))

    labels = list(set(x[2] for x in dataframe))
    cluster_means = [sum(cluster) / len(cluster) for cluster in
                     [[x[1] for x in dataframe if x[2] == label] for label in labels]]
    cluster_means = list(zip(labels, cluster_means))

    game_label = max(cluster_means, key=lambda x: x[1])[0]
    game_groups = [(k, list(v)) for k, v in groupby(dataframe, lambda pt: pt[2])]
    games = [[v[0][0], v[-1][0]] for k, v in game_groups if k == game_label]

    return games
def __detect_match_chunks(self, max_error=.04):
    percent = cv2.imread("assets/pct.png")
    corr_series = []

    for (time, scene) in spaced_frames(self, interval=self.polling_interval):
        cv2.imwrite("scene.png", scene)
        scene = cv2.imread("scene.png")

        scaled_percent = cv2.resize(
            percent, (0, 0), fx=self.scale, fy=self.scale)
        scaled_percent = cv2.Canny(scaled_percent, 50, 200)

        percent_corrs = []
        for port_number, roi in enumerate(self.ports):
            if roi is not None:
                scene_roi = scene[roi.top:roi.bottom, roi.left:roi.right]
                scene_roi = cv2.Canny(scene_roi, 50, 200)

                corr_map = cv2.matchTemplate(
                    scene_roi, scaled_percent, cv2.TM_CCOEFF_NORMED)
                _, max_corr, _, max_loc = cv2.minMaxLoc(corr_map)
                percent_corrs.append(max_corr)

        point = [time, max(percent_corrs)]
        corr_series.append(point)

    corr_series = np.array(corr_series)

    def moving_average(series, n=5):
        return np.convolve(series, np.ones((n,)) / n, mode='valid')

    medians = rolling_median(corr_series[:, 1],
                             self.min_gap // self.polling_interval,
                             center=True)[2:-2]

    clusters = DBSCAN(eps=0.05, min_samples=10).fit(medians.reshape(-1, 1))

    centers = kmeans.cluster_centers_
    points = zip([time + (self.min_gap / 2) for time, corr in corr_series],
                 kmeans.labels_)

    # Throw out the lowest cluster
    groups = [(k, list(v)) for k, v in groupby(
        points, lambda pt: centers[pt[1]] > max(min(centers), .2))]
    games = [[v[0][0], v[-1][0]] for k, v in groups if k]

    return games
def define_clusts(similarity_matrix, threshold=0.05, max_iter=200, method='ap'):
    """Define clusters given the similarity matrix and the threshold."""
    n, labels = connected_components(similarity_matrix, directed=False)
    prev_max_clust = 0
    print("connected components: %d" % n)
    clusters = labels.copy()

    if method == 'dbscan':
        ap = DBSCAN(metric='precomputed', min_samples=1, eps=.2, n_jobs=-1)
    if method == 'ap':
        ap = AffinityPropagation(affinity='precomputed', max_iter=max_iter,
                                 preference='median')

    for i in range(n):
        idxs = np.where(labels == i)[0]
        if idxs.shape[0] > 1:
            sm = similarity_matrix[idxs][:, idxs]
            sm += sm.T + scipy.sparse.eye(sm.shape[0])

            # Hierarchical clustering
            if method == 'hc':
                dists = squareform(1 - sm.toarray())
                links = fastcluster.linkage(dists, method='ward')
                try:
                    clusters_ = fcluster(links, threshold, 'distance')
                except ValueError as err:
                    logging.critical(err)
                    clusters_ = np.zeros(1, dtype=int)

            # DBSCAN
            elif method == 'dbscan':
                db = ap.fit(1. - sm.toarray())
                # Number of clusters in labels, ignoring noise if present.
                clusters_ = db.labels_
                # n_clusters_ = len(set(clusters_)) - int(0 in clusters_)

            # AffinityPropagation
            # ap = AffinityPropagation(affinity='precomputed')
            elif method == 'ap':
                db = ap.fit(sm)
                clusters_ = db.labels_
            else:
                raise ValueError("clustering method %s unknown" % method)

            if np.min(clusters_) == 0:
                clusters_ += 1
            clusters_ += prev_max_clust
            clusters[idxs] = clusters_
            prev_max_clust = max(clusters_)
        else:  # connected component contains just 1 element
            prev_max_clust += 1
            clusters[idxs] = prev_max_clust

    return np.array(extra.flatten(clusters))