我们从 Python 开源项目中，提取了以下 6 个代码示例，用于说明如何使用 sklearn.cluster.FeatureAgglomeration()。
def test_linkage_misc():
    """Run miscellaneous checks on ``linkage_tree`` and related estimators."""
    random_state = np.random.RandomState(42)
    X = random_state.normal(size=(5, 5))

    # An unknown linkage name must raise ValueError, both through the
    # estimator and through the tree-building function.
    bad_estimator = AgglomerativeClustering(linkage='foo')
    assert_raises(ValueError, bad_estimator.fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')

    # A (4, 4) connectivity matrix for the 5-sample X must raise ValueError.
    wrong_connectivity = np.ones((4, 4))
    assert_raises(ValueError, linkage_tree, X,
                  connectivity=wrong_connectivity)

    # Smoke test: FeatureAgglomeration with default parameters fits X.
    FeatureAgglomeration().fit(X)

    # A precomputed cosine distance matrix must produce the same first
    # output as asking for the "cosine" affinity on the raw data.
    distances = cosine_distances(X)
    from_precomputed = linkage_tree(distances, affinity="precomputed")
    from_name = linkage_tree(X, affinity="cosine")
    assert_array_equal(from_precomputed[0], from_name[0])

    # Passing the distance function itself as affinity must match passing
    # the equivalent affinity name.
    from_callable = linkage_tree(X, affinity=manhattan_distances)
    from_name = linkage_tree(X, affinity="manhattan")
    assert_array_equal(from_callable[0], from_name[0])
def test_ward_agglomeration():
    """Check FeatureAgglomeration on a simple grid-connected problem.

    Fits a 5-cluster ``FeatureAgglomeration`` on random data of shape
    (50, 100) using the connectivity graph of a 10 x 10 grid, then checks
    that:

    * exactly 5 distinct labels are produced,
    * ``transform`` yields 5 columns,
    * a row of the ``inverse_transform`` output holds 5 distinct values,
    * transforming the reconstructed data reproduces the reduced data,
    * fitting on zero samples raises ``ValueError``.
    """
    rng = np.random.RandomState(0)
    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated and
    # later removed from NumPy, so it raises AttributeError there.
    mask = np.ones([10, 10], dtype=bool)
    X = rng.randn(50, 100)
    # Only the mask's shape is used; it defines the grid of the 100 features.
    connectivity = grid_to_graph(*mask.shape)

    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)

    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def feature_agglomeration(df, number_of_clusters=None):
    """Reduce the columns of ``df`` with ``FeatureAgglomeration``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified.
    number_of_clusters : int, optional
        Number of feature clusters to build. When omitted it is
        ``int(df.shape[1] / 1.2)``, computed from the frame passed in.

    Returns
    -------
    pandas.DataFrame
        The ``fit_transform`` output wrapped in a new frame. When ``df``
        has a ``'Call Outcome'`` column, rows containing missing values
        are dropped before fitting, so the result may have fewer rows
        than ``df``.
    """
    if number_of_clusters is None:
        # Resolved per call. A default expression written in the signature
        # is evaluated once, when the function is defined, against some
        # outer ``df`` (or fails with NameError) instead of the argument.
        number_of_clusters = int(df.shape[1] / 1.2)

    # Todo: find optimal number of clusters for the feature clustering
    agglomerated_features = FeatureAgglomeration(n_clusters=number_of_clusters)

    if 'Call Outcome' in df.columns:
        complete_rows = df.dropna()
        # NOTE(review): the target column is still part of the clustered
        # features here, as in the previous version -- confirm intended.
        # ``y`` is taken from the same filtered rows so that it always has
        # one entry per sample passed as X.
        res = agglomerated_features.fit_transform(
            np.array(complete_rows.values),
            y=complete_rows['Call Outcome'].values,
        )
    else:
        res = agglomerated_features.fit_transform(np.array(df.values))

    return pd.DataFrame(data=res)
def dendrogram(df, number_of_clusters=None):
    """Show a clustered heatmap of the column correlations of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are correlated (after casting to float) and
        drawn with ``seaborn.clustermap``.
    number_of_clusters : int, optional
        Number of colors in the side-bar palette. When omitted it is
        ``int(df.shape[1] / 1.2)``, computed from the frame passed in.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    if number_of_clusters is None:
        # Resolved per call. A default expression written in the signature
        # is evaluated once, when the function is defined, against some
        # outer ``df`` (or fails with NameError) instead of the argument.
        number_of_clusters = int(df.shape[1] / 1.2)

    # Create a custom palette to identify the networks
    network_pal = sns.cubehelix_palette(number_of_clusters,
                                        light=.9, dark=.1, reverse=True,
                                        start=1, rot=-2)
    # zip() stops at the shorter input, so columns beyond the palette
    # length get no entry in the lookup table.
    network_lut = dict(zip(map(str, df.columns), network_pal))

    # Convert the palette to vectors that will be drawn on the side of the
    # matrix
    networks = df.columns.get_level_values(None)
    network_colors = pd.Series(networks, index=df.columns).map(network_lut)

    sns.set(font="monospace")

    # Create custom colormap
    cmap = sns.diverging_palette(h_neg=210, h_pos=350, s=90, l=30,
                                 as_cmap=True)
    cg = sns.clustermap(df.astype(float).corr(), cmap=cmap, linewidths=.5,
                        row_colors=network_colors,
                        col_colors=network_colors)

    # Keep row labels horizontal and column labels vertical.
    plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    plt.show()
def feature_agglomeration(df):
    """Agglomerate the columns of ``df`` and return the reduced table.

    The number of clusters is ``int(df.shape[1] / 1.2)``. When a
    ``'SalePrice'`` column is present, rows with missing values are
    dropped before fitting. Otherwise all rows are used. The fitted
    labels, children and leaf count are printed, the labels are handed to
    ``HousePrices.dendrogram``, and the transformed data is returned as a
    new ``pandas.DataFrame``.
    """
    frame = df.copy()

    # Todo: find optimal number of clusters for the feature clustering
    n_clusters = int(frame.shape[1] / 1.2)

    from sklearn.cluster import FeatureAgglomeration
    agglomerator = FeatureAgglomeration(n_clusters=n_clusters)

    has_sale_price = (frame.columns == 'SalePrice').any()
    if has_sale_price:
        complete = frame.dropna()
        features = np.reshape(np.array(complete.values), complete.shape)
        reduced = agglomerator.fit_transform(features,
                                             y=frame.SalePrice.values)
    else:
        features = np.reshape(np.array(frame.values), frame.shape)
        reduced = agglomerator.fit_transform(features)

    # Todo: in case of adding values using df.loc[], remember mask is only
    # possible for a single feature at a time.
    report = (
        ('labels:', 'labels_'),
        ('Children:', 'children_'),
        ('number of leaves in the hierarchical tree:', 'n_leaves_'),
    )
    for caption, attribute in report:
        # Attributes are read one at a time, right before each print.
        print(caption + str(getattr(agglomerator, attribute)))

    HousePrices.dendrogram(frame, n_clusters, agglomerator.labels_)

    return pd.DataFrame(data=reduced)
def data_compression(fmri_masked, mask_img, mask_np, output_size):
    """Reduce masked fMRI data with connectivity-constrained Ward clustering.

    Parameters
    ----------
    fmri_masked : array_like
        Data matrix to reduce. It is passed unchanged to
        ``FeatureAgglomeration.fit`` and ``transform``.
        NOTE(review): the previous docstring described a (voxels,
        timepoints) matrix; since the connectivity graph is built over
        the mask's voxels, one column per masked voxel looks like the
        required layout -- confirm against callers.
    mask_img : object
        Not used by this function. Kept so existing callers keep working.
    mask_np : numpy.ndarray
        Mask array. Its first three dimension sizes define the voxel
        grid, and the array itself is passed as ``mask`` to
        ``grid_to_graph``.
    output_size : int
        Number of clusters the data is reduced to.

    Returns
    -------
    dict
        ``{'data': reduced_data, 'labels': labels}``, where ``labels`` is
        the fitted ``labels_`` attribute and ``reduced_data`` is the
        output of ``transform``.
    """
    from sklearn.cluster import FeatureAgglomeration
    from sklearn.feature_extraction import image

    # Build the connectivity graph of the voxel grid, restricted by the mask.
    shape = mask_np.shape
    connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                       n_z=shape[2], mask=mask_np)

    # Perform Ward clustering
    ward = FeatureAgglomeration(n_clusters=output_size,
                                connectivity=connectivity, linkage='ward')
    ward.fit(fmri_masked)

    labels = ward.labels_
    data_reduced = ward.transform(fmri_masked)
    return {'data': data_reduced, 'labels': labels}