我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用sklearn.manifold.TSNE。
def tsne_cluster_cuisine(df, sublist):
    """Scatter-plot a 2-D t-SNE embedding of recipes, coloured by cuisine.

    Rows of ``df`` whose ``'cuisine'`` value appears in ``sublist`` are stacked
    in ``sublist`` order, embedded from their pairwise cosine distances, and
    drawn with one colour and legend entry per cuisine.

    Args:
        df: DataFrame with ``'cuisine'`` and ``'recipeName'`` columns; every
            other column is used as a feature.
        sublist: cuisines to include, in plotting order.
    """
    per_cuisine = [df[df['cuisine'] == name] for name in sublist]

    # boundaries[k]:boundaries[k + 1] is the row range of the k-th cuisine.
    df_sub = per_cuisine[0]
    boundaries = [0, df_sub.shape[0]]
    for frame in per_cuisine[1:]:
        df_sub = pd.concat([df_sub, frame], axis=0, ignore_index=True)
        boundaries.append(df_sub.shape[0])

    features = df_sub.drop(['cuisine', 'recipeName'], axis=1)
    print("%s %s" % (features.shape, boundaries))

    cosine_dist = squareform(pdist(features, metric='cosine'))
    embedding = TSNE(metric='precomputed').fit_transform(cosine_dist)

    palette = sns.color_palette("hls", len(sublist))
    plt.figure(figsize=(10, 10))
    spans = zip(boundaries[:-1], boundaries[1:])
    for (start, stop), colour, name in zip(spans, palette, sublist):
        plt.scatter(embedding[start:stop, 0], embedding[start:stop, 1],
                    c=colour, label=name)
    plt.legend()


# interactive plot with bokeh; set up for four categories, with color palette;
# pass in df for either ingredient or flavor
def word_cloud(word_embedding_matrix, vocab, s, save_file='scatter.png'):
    """Show an annotated 2-D t-SNE scatter of the words in ``s``.

    The whole ``word_embedding_matrix`` is embedded; the rows selected by
    ``vocab[word]`` for each word in ``s`` are then plotted and labelled.

    Args:
        word_embedding_matrix: matrix handed to ``TSNE.fit_transform``.
        vocab: lookup from each entry of ``s`` to its row index.
        s: the words to display.
        save_file: currently unused (the ``savefig`` call is commented out).
    """
    row_ids = [vocab[word] for word in s]

    # Note that the following line might use a good chunk of RAM
    embedded = TSNE(n_components=2, random_state=0).fit_transform(
        word_embedding_matrix)
    points = embedded[np.array(row_ids)]
    xs, ys = points[:, 0], points[:, 1]

    plt.subplots_adjust(bottom=0.1)
    plt.scatter(xs, ys, marker='o', cmap=plt.get_cmap('Spectral'))
    for label, x, y in zip(s, xs, ys):
        plt.annotate(
            label,
            xy=(x, y),
            xytext=(-20, 20),
            textcoords='offset points',
            ha='right',
            va='bottom',
            fontsize=20,
            # bbox=dict(boxstyle='round,pad=1.', fc='yellow', alpha=0.5),
            arrowprops=dict(arrowstyle='<-', connectionstyle='arc3,rad=0'),
        )
    plt.show()
    # plt.savefig(save_file)
def plot_tsne(z_mu, classes, name):
    """Save t-SNE plots of latent means, adding one class at a time.

    For each of the first 10 class columns the matching points are added to
    the same figure and a snapshot is written to
    ``./vae_results/<name>_embedding_<class>.png``; the finished figure is
    written to ``./vae_results/<name>_embedding.png``.

    Args:
        z_mu: object exposing ``.data.cpu().numpy()`` (presumably a torch
            Variable/tensor of latent means -- confirm with caller).
        classes: same kind of object; column ``ic`` equals 1 for the rows
            belonging to class ``ic``.
        name: prefix used in the output file names.
    """
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from sklearn.manifold import TSNE

    z_states = z_mu.data.cpu().numpy()
    z_embed = TSNE(n_components=2, random_state=0).fit_transform(z_states)
    classes = classes.data.cpu().numpy()

    fig = plt.figure()
    try:
        for ic in range(10):
            ind_class = classes[:, ic] == 1
            color = plt.cm.Set1(ic)
            plt.scatter(z_embed[ind_class, 0], z_embed[ind_class, 1],
                        s=10, color=color)
            plt.title("Latent Variable T-SNE per Class")
            fig.savefig('./vae_results/'+str(name)+'_embedding_'+str(ic)+'.png')
        fig.savefig('./vae_results/'+str(name)+'_embedding.png')
    finally:
        # pyplot keeps every figure alive until closed; without this each
        # call leaks one figure.
        plt.close(fig)
def compute_bulk_smushing(self):
    """Return a 2-D t-SNE embedding of the per-plate median expression.

    A CSV cache is reused when it covers exactly the current set of plates;
    otherwise the embedding is recomputed and the cache is rewritten.
    """
    by_plate = self.genes.groupby(self.cell_metadata[self.SAMPLE_MAPPING])

    if os.path.exists(self.bulk_smushed_cache_file):
        cached = pd.read_csv(self.bulk_smushed_cache_file, names=[0, 1],
                             header=0, index_col=0)
        # The cache is only valid while the set of plates is unchanged.
        if set(by_plate.groups) == set(cached.index):
            return cached

    # Cache missing or stale: compute a fresh projection.
    plate_medians = by_plate.median()
    embedder = TSNE(random_state=0, perplexity=10, metric='cosine')
    projection = pd.DataFrame(embedder.fit_transform(plate_medians),
                              index=plate_medians.index)
    projection.to_csv(self.bulk_smushed_cache_file)
    return projection
def compute_cell_smushing(self):
    """Return ``{plate_name: DataFrame}`` of per-plate 2-D cell embeddings.

    Embeddings already present in the pickle cache are reused; only plates
    missing from it are computed, after which the cache is rewritten.
    """
    by_plate = self.genes.groupby(self.cell_metadata[self.SAMPLE_MAPPING])

    if os.path.exists(self.cell_smushed_cache_file):
        embeddings = pd.read_pickle(self.cell_smushed_cache_file)
        missing = set(by_plate.groups) - set(embeddings)
        if not missing:
            # Nothing to add: the cached version is complete.
            return embeddings
    else:
        embeddings = {}

    for plate_name, plate_genes in by_plate:
        if plate_name in embeddings:
            continue
        embedder = TSNE(metric='cosine', random_state=0)
        embeddings[plate_name] = pd.DataFrame(
            embedder.fit_transform(plate_genes), index=plate_genes.index)

    pd.to_pickle(embeddings, self.cell_smushed_cache_file)
    return embeddings
def gen(self):
    """Restore the latest checkpoint and plot a t-SNE view of the embeddings.

    The first ``self.viz_words`` embedding rows are reduced to 2-D and drawn
    as points annotated with ``self.train_text.int_to_vocab[idx]``.
    """
    embedding_op, _ = self.embedding()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, tf.train.latest_checkpoint('.'))
        embedding = sess.run(embedding_op)

        # Keep only the rows that will be visualised.
        data = embedding[:self.viz_words, :]

        # Reduce them to two dimensions.
        reducer = TSNE(n_components=2, init='pca', random_state=0)
        embed_tsne = reducer.fit_transform(data)

        # Draw every word as an annotated point.
        plt.subplots(figsize=(10, 10))
        for idx in range(self.viz_words):
            x, y = embed_tsne[idx, :]
            plt.scatter(x, y, color='steelblue')
            plt.annotate(self.train_text.int_to_vocab[idx], (x, y), alpha=0.7)
        plt.show()
def main():
    """Plot a labelled 2-D t-SNE embedding of a vocabulary's word vectors.

    Reads the language model and image database named on the command line,
    looks up one vector per label of ``args.space``, embeds them and shows an
    annotated scatter plot.
    """
    args = parse_args()
    print('Called with args:')
    print(args)

    lang_db = get_language_model(args.lang_name)
    imdb = get_imdb(args.imdb_name)

    # Words of the requested space and their feature vectors.
    vocabulary = imdb.get_labels(args.space)
    wv = [lang_db.word_vector(w) for w in vocabulary]

    tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(wv)

    plt.scatter(Y[:, 0], Y[:, 1])
    for label, x, y in zip(vocabulary, Y[:, 0], Y[:, 1]):
        plt.annotate(label, xy=(x, y), xytext=(0, 0),
                     textcoords='offset points')
    plt.show()
def main(we_file='glove_model_50.npz', w2i_file='glove_word2idx_50.json'):
    """Plot a t-SNE view of a few country/nationality GloVe embeddings.

    Args:
        we_file: ``.npz`` archive whose ``arr_0`` and ``arr_1`` arrays are
            averaged (``arr_1`` transposed) into the embedding matrix.
        w2i_file: JSON file mapping each word to its row index.
    """
    words = ['japan', 'japanese', 'england', 'english', 'australia',
             'australian', 'china', 'chinese', 'italy', 'italian', 'french',
             'france', 'spain', 'spanish']

    with open(w2i_file) as f:
        word2idx = json.load(f)

    npz = np.load(we_file)
    W = npz['arr_0']
    V = npz['arr_1']
    We = (W + V.T) / 2

    idx = [word2idx[w] for w in words]
    # The full matrix is embedded and only then subset to the chosen words
    # (fitting on ``We[idx]`` directly was left disabled by the author).
    Z = TSNE().fit_transform(We)
    Z = Z[idx]

    plt.scatter(Z[:, 0], Z[:, 1])
    for word, (x, y) in zip(words, Z[:, :2]):
        # Label passed positionally: the ``s=`` keyword was renamed to
        # ``text`` and no longer exists in current matplotlib.
        plt.annotate(word, xy=(x, y))
    plt.show()
def take_action(self, parsed_args):
    """Load a data set, embed its features with t-SNE and plot the result.

    Features are flattened to one row per instance. When there are more than
    50 feature columns they are first reduced with PCA.

    Args:
        parsed_args: namespace providing ``input`` (an object with
            ``exists()``), ``perplexity`` and ``learning_rate``.

    Raises:
        IOError: if ``parsed_args.input`` does not exist.
    """
    if not parsed_args.input.exists():
        raise IOError("failed to open data set at {}".format(parsed_args.input))

    data_set = load(parsed_args.input)
    features = np.reshape(data_set.features, [data_set.num_instances, -1])

    if features.shape[1] > 50:
        self.log.info("applying PCA")
        # PCA cannot extract more components than min(n_samples, n_features);
        # a fixed 200 fails for inputs with 51..199 columns or < 200 rows.
        n_components = min(200, features.shape[0], features.shape[1])
        pca = PCA(n_components=n_components)
        pca.fit(features)
        features = pca.transform(features)

    self.log.info("computing T-SNE embedding")
    tsne = TSNE(perplexity=parsed_args.perplexity,
                learning_rate=parsed_args.learning_rate,
                verbose=self.app_args.verbose_level)
    embedding = tsne.fit_transform(features)

    self.log.info("plotting embedding")
    self.plot_with_labels(data_set, embedding)
def _plot_proto_symbol_space(coordinates, target_names, name, args):
    """Plot a labelled 2-D t-SNE view of ``coordinates``.

    Each sample gets its own rainbow colour, an annotation and a legend entry
    taken from ``target_names``. The figure is saved as ``<name>.pdf`` inside
    ``args.output_dir`` when that is set, and shown interactively otherwise.
    """
    # Reduce to 2D so that we can plot it.
    points = TSNE().fit_transform(coordinates)
    n_samples = points.shape[0]
    xs, ys = points[:, 0], points[:, 1]
    colors = cm.rainbow(np.linspace(0, 1, n_samples))

    fig = plt.figure(1)
    plt.clf()
    ax = fig.add_subplot(111)

    dots = []
    for idx in range(n_samples):
        (dot,) = ax.plot(xs[idx], ys[idx], "o", c=colors[idx], markersize=15)[:1]
        dots.append(dot)
        ax.annotate(target_names[idx], xy=(xs[idx], ys[idx]))

    lgd = ax.legend(dots, target_names, ncol=4, numpoints=1,
                    loc='upper center', bbox_to_anchor=(0.5, -0.1))
    ax.grid('on')

    if args.output_dir is None:
        plt.show()
        return

    path = os.path.join(args.output_dir, name + '.pdf')
    print('Saved plot to file "%s"' % path)
    fig.savefig(path, bbox_extra_artists=(lgd,), bbox_inches='tight')
def embed_or_load_cache(codes, gen, r_idx, batch_size, save_path):
    """Return the 2-D t-SNE embedding for ``r_idx``, using a CSV cache.

    The cache lives at ``<save_path>/tsne_plots/embedded_points_rNN.csv`` and
    holds a ``#x,y`` header followed by one ``x,y`` pair per line.

    Args:
        codes, gen, batch_size: forwarded to ``generate_codes_by_r`` when the
            cache is missing.
        r_idx: index used in the cache file name and passed on as well.
        save_path: directory that contains the ``tsne_plots`` folder.

    Returns:
        A float32 array read from the cache, or the array returned by
        ``TSNE.fit_transform`` when the embedding is freshly computed.
    """
    cache_fp = os.path.join(save_path, 'tsne_plots',
                            'embedded_points_r%02d.csv' % (r_idx,))

    if os.path.isfile(cache_fp):
        # Context manager so the handle is closed; blank lines (e.g. a
        # trailing newline) are skipped instead of breaking the unpacking.
        with open(cache_fp) as f:
            rows = [line.strip().split(",")
                    for line in f.readlines()[1:] if line.strip()]
        vals = [(float(x), float(y)) for (x, y) in rows]
        return np.array(vals, dtype=np.float32)

    codes_r = generate_codes_by_r(gen, codes, r_idx, batch_size)
    print(codes_r.shape)
    print("Embedding %s via TSNE..." % (str(codes_r.shape),))
    tsne = TSNE(perplexity=40, n_iter=10000, learning_rate=4000, verbose=True)
    codes_r_2d = tsne.fit_transform(codes_r.astype(np.float64))
    print("shape after embedding: %s" % (str(codes_r_2d.shape),))

    with open(cache_fp, "w") as f:
        f.write("#x,y\n")
        for i in range(codes_r.shape[0]):
            f.write("%.6f,%.6f\n" % (codes_r_2d[i, 0], codes_r_2d[i, 1]))
    return codes_r_2d
def plotInputData(X, Y, title, data_len):
    """Embed ``X`` with t-SNE and plot its two halves as two classes.

    The first ``data_len // 2`` embedded rows are drawn as green circles and
    the remaining rows as blue triangles; the legend names them "Melanoma"
    and "Benign" in that order.

    Args:
        X: samples passed to ``TSNE.fit_transform``.
        Y: unused by the current plot.
        title: figure title.
        data_len: row count used to locate the split point.
    """
    time_start = time.time()
    X = TSNE(n_components=2, verbose=1, perplexity=40,
             n_iter=300).fit_transform(X)
    print("After Reduction Data Shape : {0}".format(X.shape))
    print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

    # Floor division keeps the split point an int on Python 3 as well;
    # ``data_len / 2`` would be a float there and is not a valid slice index.
    half = data_len // 2

    # Main scatter plot and plot annotation
    f, ax = plt.subplots(figsize=(7, 7))
    ax.scatter(X[:half, 0] * 10, X[:half, 1] * 10,
               marker='o', color='green', s=30, alpha=0.5)
    ax.scatter(X[half:, 0] * 10, X[half:, 1] * 10,
               marker='^', color='blue', s=30, alpha=0.5)
    plt.legend(["Melanoma", "Benign"], loc='upper right')
    plt.title(title)
    plt.ylabel('Y')
    plt.xlabel('X')
    plt.show()
def main():
    """Run ``checkSimilarity`` on the word-level word2vec model."""
    # Alternative model: "../data/word2vec/character.model"
    checkSimilarity("../data/word2vec_new/word.model", "?")

    # Disabled t-SNE visualisation of the first 1000 embeddings:
    # character_wv_file = '../data/word2vec/character_model.txt'
    # word_wv_file = '../data/word2vec/word_model.txt'
    #
    # embeddings_file = word_wv_file
    # wv, vocabulary = load_embeddings(embeddings_file)
    #
    # tsne = TSNE(n_components=2, random_state=0)
    # np.set_printoptions(suppress=True)
    # Y = tsne.fit_transform(wv[:1000, :])
    #
    # plt.scatter(Y[:, 0], Y[:, 1])
    # for label, x, y in zip(vocabulary, Y[:, 0], Y[:, 1]):
    #     plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    # plt.show()
def __init__(self, ax=None, decompose='svd', decompose_by=50, classes=None,
             colors=None, colormap=None, **kwargs):
    """
    Set up the TSNE visualizer.

    ``ax`` and any extra keyword arguments go to the parent initializer; the
    same keyword dict is also handed to ``make_transformer`` together with
    ``decompose`` and ``decompose_by``.
    """
    super(TSNEVisualizer, self).__init__(ax=ax, **kwargs)

    # State describing the data being visualized.
    self.classes_, self.n_instances_ = classes, 0

    # Appearance options.
    # TODO: Only colors currently works to select the colors of classes.
    self.colors, self.colormap = colors, colormap

    # The transformer that produces the projection.
    self.transformer_ = self.make_transformer(decompose, decompose_by, kwargs)
def finalize(self, **kwargs):
    """
    Finish the drawing: set the title, drop the axis ticks and, when classes
    are set, shrink the axes to make room for a legend to their right.
    """
    title = "TSNE Projection of {} Documents".format(self.n_instances_)
    self.set_title(title)

    # Tick positions carry no information in a t-SNE projection.
    self.ax.set_yticks([])
    self.ax.set_xticks([])

    if not self.classes_:
        return

    # Narrow the axes to 80% width and put the legend outside the box.
    box = self.ax.get_position()
    narrowed = [box.x0, box.y0, box.width * 0.8, box.height]
    self.ax.set_position(narrowed)
    self.ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
def tSNE_model(Vt, aid_dict):
    """Embed the columns of ``Vt`` with t-SNE and attach anime metadata.

    The 2-D coordinates are written to ``results\\tsne_svd.csv`` before the
    metadata columns are added.

    Args:
        Vt: matrix whose columns are embedded (it is transposed before the
            fit), presumably the right factor of an SVD -- confirm.
        aid_dict: maps column index ``0..Vt.shape[1]-1`` to an anime id.

    Returns:
        DataFrame with columns ``x``, ``y``, ``anime_name``, ``anime_id`` and
        ``rating``.
    """
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
    tsne_V = tsne_model.fit_transform(np.transpose(Vt))

    # Put data in a pandas dataframe and save it:
    tsne_df = pd.DataFrame(tsne_V, columns=['x', 'y'])
    tsne_df.to_csv('results\\tsne_svd.csv')

    # Get anime names; the connection is closed even if the query fails.
    con = sqlite3.connect('user_anime_data.db')
    try:
        cur = con.cursor()
        rows = cur.execute('SELECT Anime, Name, Score FROM animeData').fetchall()
    finally:
        con.close()
    anime_data = {row[0]: (row[1], row[2]) for row in rows}

    anime_ids = [aid_dict[col] for col in range(Vt.shape[1])]
    tsne_df['anime_name'] = [anime_data[aid][0] for aid in anime_ids]
    tsne_df['anime_id'] = anime_ids
    tsne_df['rating'] = [anime_data[aid][1] for aid in anime_ids]
    return tsne_df


# Plotting the data:
def computeTSNEProjectionOfLatentSpace(X, encoder, display=True):
    """Project the encoder's latent codes of ``X`` to 2-D with t-SNE.

    With ``display`` true the images in ``X`` are drawn at their embedded
    positions and nothing is returned; otherwise the embedding is returned.
    """
    # Compute latent space representation
    print("Computing latent space projection...")
    latent = encoder.predict(X)

    # Compute t-SNE embedding of latent space
    print("Computing t-SNE embedding...")
    X_tsne = manifold.TSNE(n_components=2, init='pca',
                           random_state=0).fit_transform(latent)

    if not display:
        return X_tsne

    # Plot images according to t-sne embedding
    print("Plotting t-SNE visualization...")
    fig, ax = plt.subplots()
    imscatter(X_tsne[:, 0], X_tsne[:, 1], imageData=X, ax=ax, zoom=0.15)
    plt.show()


# Show dataset images with T-sne projection of pixel space
def computeTSNEProjectionOfPixelSpace(X, display=True):
    """Project the raw pixels of ``X`` to 2-D with t-SNE.

    Each image is flattened to ``imageSize * imageSize * 3`` values first.
    With ``display`` true the images are drawn at their embedded positions
    and nothing is returned; otherwise the embedding is returned.
    """
    # Compute t-SNE embedding of pixel space
    print("Computing t-SNE embedding...")
    flat_pixels = X.reshape([-1, imageSize*imageSize*3])
    X_tsne = manifold.TSNE(n_components=2, init='pca',
                           random_state=0).fit_transform(flat_pixels)

    if not display:
        return X_tsne

    # Plot images according to t-sne embedding
    print("Plotting t-SNE visualization...")
    fig, ax = plt.subplots()
    imscatter(X_tsne[:, 0], X_tsne[:, 1], imageData=X, ax=ax, zoom=0.15)
    plt.show()


# Reconstructions for samples in dataset
def plot_tsne(images, X, filename):
    """Save a t-SNE plot of ``X`` with ``images`` drawn as thumbnails.

    Args:
        images: one image per row of ``X``.
        X: feature matrix to embed.
        filename: destination passed to ``plt.savefig``.
    """

    def imscatter(x, y, images, ax=None, zoom=1.0):
        """Place each image at its (x, y) position; return the artists."""
        target = plt.gca() if ax is None else ax
        x, y = np.atleast_1d(x, y)
        artists = []
        for x0, y0, img0 in zip(x, y, images):
            thumb = OffsetImage(img0, zoom=zoom)
            box = AnnotationBbox(thumb, (x0, y0), xycoords='data', frameon=False)
            artists.append(target.add_artist(box))
        target.update_datalim(np.column_stack([x, y]))
        target.autoscale()
        return artists

    def plot_embedding(X, imgs, title=None):
        """Min-max scale ``X`` per column and draw dots plus thumbnails."""
        lo, hi = np.min(X, 0), np.max(X, 0)
        X = (X - lo) / (hi - lo)

        plt.figure()
        ax = plt.subplot(111)
        for row in range(X.shape[0]):
            plt.text(X[row, 0], X[row, 1], ".",
                     fontdict={'weight': 'bold', 'size': 9})
        if hasattr(offsetbox, 'AnnotationBbox'):
            imscatter(X[:, 0], X[:, 1], imgs, zoom=0.1, ax=ax)
        plt.xticks([])
        plt.yticks([])
        if title is not None:
            plt.title(title)

    print("Computing t-SNE embedding")
    embedder = manifold.TSNE(n_components=2, init='pca', random_state=0)
    plot_embedding(embedder.fit_transform(X), images,
                   "t-SNE embedding of images")
    plt.savefig(filename, bbox_inches='tight')


# Driver
def plot(self, filename="./corpus/model/blog.png"):
    """Save an annotated t-SNE scatter of the first 500 embeddings.

    Point ``i`` is labelled with the key of ``self.dictionary`` whose value
    is ``i``.
    """
    plot_only = 500
    reducer = TSNE(perplexity=30, n_components=2, init="pca", n_iter=5000)
    coords = reducer.fit_transform(self.final_embeddings[:plot_only, :])

    # Invert word -> id into id -> word.
    id_to_word = dict(zip(self.dictionary.values(), self.dictionary.keys()))
    labels = [id_to_word[word_id] for word_id in range(plot_only)]

    plt.figure(figsize=(18, 18))
    for row, label in enumerate(labels):
        x, y = coords[row, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2),
                     textcoords="offset points", ha="right", va="bottom")
    plt.savefig(filename)
    print("Scatter plot was saved to", filename)
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Save a scatter plot with one annotated point per label.

    Args:
        low_dim_embs: array with at least ``len(labels)`` rows of (x, y).
        labels: text for each point, in row order.
        filename: destination passed to ``plt.savefig``.
    """
    n_rows = low_dim_embs.shape[0]
    assert n_rows >= len(labels), "More labels than embeddings"

    plt.figure(figsize=(18, 18))  # in inches
    for row, text in enumerate(labels):
        x, y = low_dim_embs[row, :]
        plt.scatter(x, y)
        plt.annotate(text,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.savefig(filename)


# TSNE (the rest of this comment was lost to mis-encoding)
def plot_embedding2D(node_pos, node_colors=None, di_graph=None):
    """Draw node embeddings in 2-D, optionally with the graph's edges.

    Embeddings with more than two columns are reduced with t-SNE first.

    Args:
        node_pos: array of shape (node_num, embedding_dimension).
        node_colors: optional per-node colours (list or array).
        di_graph: optional networkx graph; when given, drawing is delegated
            to networkx instead of a plain scatter plot.
    """
    node_num, embedding_dimension = node_pos.shape
    if embedding_dimension > 2:
        print("Embedding dimension greater than 2, use tSNE to reduce it to 2")
        node_pos = TSNE(n_components=2).fit_transform(node_pos)

    if di_graph is None:
        # plot using plt scatter
        plt.scatter(node_pos[:, 0], node_pos[:, 1], c=node_colors)
        return

    # plot using networkx with edge structure
    pos = {i: node_pos[i, :] for i in range(node_num)}

    # ``if node_colors:`` raises for multi-element NumPy arrays, so decide
    # "colours were supplied" by None-ness and length instead.
    if node_colors is None:
        has_colors = False
    else:
        try:
            has_colors = len(node_colors) > 0
        except TypeError:
            has_colors = bool(node_colors)

    if has_colors:
        nx.draw_networkx_nodes(di_graph, pos, node_color=node_colors,
                               width=0.1, node_size=100, arrows=False,
                               alpha=0.8, font_size=5)
    else:
        nx.draw_networkx(di_graph, pos, node_color=node_colors,
                         width=0.1, node_size=300, arrows=False,
                         alpha=0.8, font_size=12)
def main():
    """Embed image features with t-SNE, plot one series per target name.

    Features and labels come from ``getXY(args.image_dir)``. Each target name
    is also written, one per line, to ``label.txt``; the plot is saved as
    ``.//10crop1.png``.
    """
    # Opened first, as before, so the file is truncated even when loading
    # fails; the context manager guarantees it is closed on every path.
    with open('label.txt', 'w') as f:
        X, target_names, y = getXY(args.image_dir)
        # ``np.asfarray`` no longer exists in NumPy 2.0; this is the
        # documented replacement.
        X = np.asarray(X, dtype=float)
        colors = cm.gnuplot2(np.linspace(0, 1, len(target_names)))

        # A PCA pre-reduction step is currently disabled.
        X_pca = X
        tsne = TSNE(n_components=2, init='random', random_state=0)
        X_r = tsne.fit_transform(X_pca)

        for i, (c, target_name) in enumerate(zip(colors, target_names)):
            plt.scatter(X_r[y[i], 0], X_r[y[i], 1], c=c, label=str(i+1))
            f.write(target_name+'\n')

        plt.legend()
        plt.savefig("{}/10crop1.png".format('./'))
def twoDB(Xtrain, Ytrain):
    """PCA-transform ``Xtrain``, run the default 2-D t-SNE via
    ``tsne_divide`` and pickle the result.

    The file ``packet_tsneBinfo<reduce_number>`` receives, in order, the
    embedding, ``Xtrain`` and ``Ytrain``.
    """
    reduced = PCA().fit_transform(Xtrain)
    Z = tsne_divide(TSNE(), reduced, len(Ytrain))

    # Save the embedding together with the original data
    out_name = "packet_tsneBinfo"+str(reduce_number)
    with open(out_name, "wb") as f:
        for payload in (Z, Xtrain, Ytrain):
            pickle.dump(payload, f)
def threeDB(Xtrain, Ytrain):
    """PCA-transform ``Xtrain``, run a 3-D t-SNE via ``tsne_divide`` and
    pickle the result.

    The file ``packet_3DtsneBinfo<reduce_number>`` receives, in order, the
    embedding, ``Xtrain`` and ``Ytrain``.
    """
    reduced = PCA().fit_transform(Xtrain)
    Z = tsne_divide(TSNE(n_components=3), reduced, len(Ytrain))

    # Save the embedding together with the original data
    out_name = "packet_3DtsneBinfo"+str(reduce_number)
    with open(out_name, "wb") as f:
        for payload in (Z, Xtrain, Ytrain):
            pickle.dump(payload, f)
def main(flags):
    """Plot a t-SNE view of the first ``flags.plot_num`` pickled embeddings.

    ``flags.emb_file`` must hold a pickled mapping of word -> vector.
    Returns 0.
    """
    with open(flags.emb_file, 'rb') as f:
        emb_dict = pickle.load(f)

    # Split the mapping into parallel lists of words and vectors.
    words, final_embeddings = [], []
    for word, vector in emb_dict.items():
        words.append(word)
        final_embeddings.append(vector)

    limit = flags.plot_num
    reducer = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    low_dim_embs = reducer.fit_transform(final_embeddings[:limit])
    plot_with_labels(low_dim_embs, words[:limit])
    return 0
def plot_bokeh(df, sublist, filename):
    """Render an interactive bokeh t-SNE plot of recipes, coloured by cuisine.

    Rows of ``df`` for the cuisines in ``sublist`` are stacked in order,
    embedded from their pairwise cosine distances and written to
    ``filename``; hovering a point shows its cuisine and recipe name.

    Args:
        df: DataFrame with ``'cuisine'`` and ``'recipeName'`` columns; every
            other column is used as a feature.
        sublist: cuisines to include; only four colours are defined.
        filename: output HTML file for bokeh.
    """
    # boundaries[k]:boundaries[k + 1] is the row range of the k-th cuisine.
    df_sub = df[df['cuisine'] == sublist[0]]
    boundaries = [0, df_sub.shape[0]]
    for name in sublist[1:]:
        extra = df[df['cuisine'] == name]
        df_sub = pd.concat([df_sub, extra], axis=0, ignore_index=True)
        boundaries.append(df_sub.shape[0])

    features = df_sub.drop(['cuisine', 'recipeName'], axis=1)
    print("%s %s" % (features.shape, boundaries))

    cosine_dist = squareform(pdist(features, metric='cosine'))
    embedding = TSNE(metric='precomputed').fit_transform(cosine_dist)

    # cannot use seaborn palette for bokeh
    palette = ['red', 'green', 'blue', 'yellow']
    colors = [palette[k]
              for k in range(len(sublist))
              for _ in range(boundaries[k+1] - boundaries[k])]

    # plot with bokeh
    output_file(filename)
    source = ColumnDataSource(data=dict(
        x=embedding[:, 0],
        y=embedding[:, 1],
        cuisine=df_sub['cuisine'],
        recipe=df_sub['recipeName'],
    ))
    hover = HoverTool(tooltips=[("cuisine", "@cuisine"),
                                ("recipe", "@recipe")])
    p = figure(plot_width=1000, plot_height=1000, tools=[hover],
               title="flavor clustering")
    p.circle('x', 'y', size=10, source=source, fill_color=colors)
    show(p)
def plot_tsne(doc_codes, doc_labels, classes_to_visual, save_file):
    """Plot a 2-D t-SNE view of document codes, one marker per class.

    Args:
        doc_codes: dict ``doc -> code`` or a sequence of codes.
        doc_labels: dict ``doc -> label`` or a sequence of labels; the dict
            form is used only when both arguments are dicts, and then only
            documents whose label is in ``classes_to_visual`` are kept.
        classes_to_visual: classes to draw (duplicates are dropped).
        save_file: destination; written in EPS format at 2000 dpi.
    """
    # markers = ["D", "p", "*", "s", "d", "8", "^", "H", "v", ">", "<", "h", "|"]
    markers = ["o", "v", "8", "s", "p", "*", "h", "H", "+", "x", "D"]
    plt.rc('legend', **{'fontsize': 30})

    classes_to_visual = list(set(classes_to_visual))
    # Repeat the marker list until there is one marker per class.
    while len(markers) < len(classes_to_visual):
        markers += markers

    if isinstance(doc_codes, dict) and isinstance(doc_labels, dict):
        kept = [(code, doc_labels[doc]) for doc, code in doc_codes.items()
                if doc_labels[doc] in classes_to_visual]
        codes, labels = zip(*kept)
    else:
        codes, labels = doc_codes, doc_labels

    X = np.r_[list(codes)]
    reducer = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    np.set_printoptions(suppress=True)
    X = reducer.fit_transform(X)

    plt.figure(figsize=(10, 10), facecolor='white')
    for marker_idx, cls in enumerate(classes_to_visual):
        members = np.array(labels) == cls
        plt.plot(X[members, 0], X[members, 1], linestyle='None', alpha=1,
                 marker=markers[marker_idx], markersize=10, label=cls)
    plt.legend(loc='upper right', shadow=True)
    # plt.title("tsne")
    # plt.savefig(save_file)
    plt.savefig(save_file, format='eps', dpi=2000)
    plt.show()
def plot_tsne_3d(doc_codes, doc_labels, classes_to_visual, save_file,
                 maker_size=None, opaque=None):
    """Plot a 3-D t-SNE view of document codes, one marker/colour per class.

    Args:
        doc_codes: dict ``doc -> code`` or a sequence of codes.
        doc_labels: dict ``doc -> label`` or a sequence of labels; the dict
            form is used only when both arguments are dicts.
        classes_to_visual: indexable sequence of classes to draw.
        save_file: destination passed to ``plt.savefig``.
        maker_size: optional per-class point sizes (default 20).
        opaque: optional per-class alpha values (default 1).
    """
    markers = ["D", "p", "*", "s", "d", "8", "^", "H", "v", ">", "<", "h", "|"]
    plt.rc('legend', **{'fontsize': 20})
    colors = ['r', 'b', 'g', 'c', 'm', 'y', 'k']

    C = len(classes_to_visual)
    # Repeat both lists until each has at least one entry per class.
    while len(markers) < C:
        markers += markers
    while len(colors) < C:
        colors += colors

    # Not read below; kept so unhashable class labels still fail here.
    class_ids = dict(zip(classes_to_visual, range(C)))

    if isinstance(doc_codes, dict) and isinstance(doc_labels, dict):
        kept = [(code, doc_labels[doc]) for doc, code in doc_codes.items()
                if doc_labels[doc] in classes_to_visual]
        codes, labels = zip(*kept)
    else:
        codes, labels = doc_codes, doc_labels

    X = np.r_[list(codes)]
    reducer = TSNE(perplexity=30, n_components=3, init='pca', n_iter=5000)
    np.set_printoptions(suppress=True)
    X = reducer.fit_transform(X)

    fig = plt.figure(figsize=(10, 10), facecolor='white')
    ax = fig.add_subplot(111, projection='3d')

    # The legend function doesn't support the type returned by a 3D scatter,
    # so a "dummy" 2-D line with the same look is created for each class and
    # those are put in the legend instead.
    scatter_proxy = []
    for i in range(C):
        cls = classes_to_visual[i]
        members = np.array(labels) == cls
        ax.scatter(X[members, 0], X[members, 1], X[members, 2],
                   c=colors[i],
                   alpha=opaque[i] if opaque else 1,
                   s=maker_size[i] if maker_size else 20,
                   marker=markers[i],
                   label=cls)
        proxy = mpl.lines.Line2D([0], [0], linestyle="none", c=colors[i],
                                 marker=markers[i], label=cls)
        scatter_proxy.append(proxy)
    ax.legend(scatter_proxy, classes_to_visual, numpoints=1)
    plt.savefig(save_file)
    plt.show()
def DBN_plot_tsne(doc_codes, doc_labels, classes_to_visual, save_file):
    """Plot a 2-D t-SNE view of codes, one marker per class.

    Args:
        doc_codes: sequence of codes to embed.
        doc_labels: label of each code, in the same order.
        classes_to_visual: dict mapping class label -> legend text.
        save_file: destination passed to ``plt.savefig``.
    """
    markers = ["o", "v", "8", "s", "p", "*", "h", "H", "+", "x", "D"]
    # Repeat the marker list until there is one marker per class.
    while len(markers) < len(classes_to_visual):
        markers += markers

    X = np.r_[list(doc_codes)]
    reducer = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    np.set_printoptions(suppress=True)
    X = reducer.fit_transform(X)

    plt.figure(figsize=(10, 10), facecolor='white')
    for marker_idx, cls in enumerate(classes_to_visual.keys()):
        members = np.array(doc_labels) == cls
        # idx = get_indices(labels, c)
        plt.plot(X[members, 0], X[members, 1], linestyle='None', alpha=0.6,
                 marker=markers[marker_idx], markersize=6,
                 label=classes_to_visual[cls])
    plt.legend(loc='upper center', shadow=True)
    plt.title("tsne")
    plt.savefig(save_file)
    plt.show()
def reuters_visualize_tsne(doc_codes, doc_labels, classes_to_visual, save_file):
    """
    Plot a 2-D t-SNE view of multi-label documents, one marker per class.

    Only documents sharing at least one label with ``classes_to_visual`` are
    embedded.

    @param doc_codes: dict mapping document -> code vector.
    @param doc_labels: dict mapping document -> iterable of labels.
    @param classes_to_visual: dict mapping class label -> legend text.
    @param save_file: destination passed to ``plt.savefig``.
    """
    # markers = ["p", "s", "h", "H", "+", "x", "D"]
    markers = ["o", "v", "8", "s", "p", "*", "h", "H", "+", "x", "D"]
    # Repeat the marker list until there is one marker per class.
    while len(markers) < len(classes_to_visual):
        markers += markers

    wanted = set(classes_to_visual.keys())
    kept = [(code, doc_labels[doc]) for doc, code in doc_codes.items()
            if wanted.intersection(set(doc_labels[doc]))]
    codes, labels = zip(*kept)

    X = np.r_[list(codes)]
    reducer = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    np.set_printoptions(suppress=True)
    X = reducer.fit_transform(X)

    plt.figure(figsize=(10, 10), facecolor='white')
    for marker_idx, cls in enumerate(classes_to_visual.keys()):
        members = get_indices(labels, cls)
        plt.plot(X[members, 0], X[members, 1], linestyle='None', alpha=0.6,
                 marker=markers[marker_idx], markersize=6,
                 label=classes_to_visual[cls])
    plt.legend(loc='upper center', shadow=True)
    plt.title("tsne")
    plt.savefig(save_file)
    plt.show()
def fit_tsne(values):
    """Return a 2-D t-SNE embedding of ``values``, or None when empty.

    Args:
        values: sequence (list or array) of feature vectors, or None.

    Returns:
        The array produced by ``TSNE.fit_transform``, or ``None`` when
        ``values`` is ``None`` or has no rows.
    """
    # ``if not values`` raises "truth value is ambiguous" for NumPy arrays;
    # test emptiness explicitly so both lists and arrays are accepted.
    if values is None or len(values) == 0:
        return None

    start = time.time()
    mat = np.array(values)
    model = TSNE(n_components=2, random_state=0, learning_rate=150, init='pca')
    fitted = model.fit_transform(mat)
    print("FIT TSNE TOOK %s" % (time.time() - start))
    return fitted
def save_visualization_to_image(self, inputs, outputs, folder_path_for_result_image):
    """Embed the flattened ``inputs`` with t-SNE and save the plot.

    The drawing itself is delegated to ``self._tsne_plot_embedding``; the
    image is written as ``t-SNE.png`` inside
    ``folder_path_for_result_image``.
    """
    print("Computing t-SNE embedding")
    flattened = np.array([state.reshape(-1, ) for state in inputs])
    embedder = manifold.TSNE(n_components=2, init='pca', random_state=0)
    embedded = embedder.fit_transform(flattened)

    image_path = os.path.join(folder_path_for_result_image, "t-SNE.png")
    self._tsne_plot_embedding(x=embedded, y=outputs, inputs=inputs,
                              path_result_image=image_path)
def visualize_latent_rep(args, model, x_latent):
    """Show scatter plots of a latent representation before/after t-SNE.

    When ``args.use_pca`` is set the data is first reduced with PCA and the
    first two PCA columns are shown; the t-SNE projection is always shown.

    Args:
        args: namespace with ``use_pca``, ``pca_components``,
            ``tsne_components``, ``tsne_perplexity``, ``tsne_lr`` and
            ``tsne_iterations``.
        model: unused here.
        x_latent: latent vectors, one per row.
    """
    settings = (args.use_pca, args.pca_components, args.tsne_components,
                args.tsne_perplexity, args.tsne_lr)
    print("pca_on=%r pca_comp=%d tsne_comp=%d tsne_perplexity=%f tsne_lr=%f" % settings)

    if args.use_pca:
        x_latent = PCA(n_components=args.pca_components).fit_transform(x_latent)
        figure(figsize=(6, 6))
        scatter(x_latent[:, 0], x_latent[:, 1], marker='.')
        show()

    reducer = TSNE(n_components=args.tsne_components,
                   perplexity=args.tsne_perplexity,
                   learning_rate=args.tsne_lr,
                   n_iter=args.tsne_iterations,
                   verbose=4)
    x_latent_proj = reducer.fit_transform(x_latent)
    del x_latent  # drop the local reference before plotting

    figure(figsize=(6, 6))
    scatter(x_latent_proj[:, 0], x_latent_proj[:, 1], marker='.')
    show()
def tSNE_pairwise(D):
    """
    Embed a precomputed pairwise distance matrix ``D`` in 2-D with t-SNE.

    From clustering_on_transcript_compatibility_counts, see github for MIT license
    """
    embedder = manifold.TSNE(
        n_components=2,
        random_state=0,
        metric='precomputed',
        n_iter=2000,
        verbose=1,
    )
    return embedder.fit_transform(D)


# Plot function with Zeisel's colors corresponding to labels
def do_embedding(self, event=None):
    """Fit the embedding method chosen in the UI to the converted frames.

    The fitted coordinates are stored in ``self.embed`` / ``self.embed_plot``,
    the histogram and plot are refreshed, and on the first run the classes
    frame is added.
    """
    if self.parent.converted is None:
        #self.conversion.convert_frames()
        # FIXME For debugging
        self.parent.converted = np.load(self.parent.output_folder+'/converted.npy')
    converted = self.parent.converted

    method_ind = self.method.currentIndex()
    print('Doing %s' % self.method.currentText())

    # Combo-box index -> constructor; built lazily so only the selected
    # embedder is instantiated.
    factories = {
        0: lambda: manifold.SpectralEmbedding(n_components=4, n_jobs=-1),
        1: lambda: manifold.Isomap(n_components=4, n_jobs=-1),
        2: lambda: manifold.LocallyLinearEmbedding(
            n_components=4, n_jobs=-1, n_neighbors=20, method='modified'),
        3: lambda: manifold.LocallyLinearEmbedding(
            n_components=4, n_jobs=-1, n_neighbors=20, method='hessian',
            eigen_solver='dense'),
        4: lambda: manifold.MDS(n_components=4, n_jobs=-1),
        5: lambda: manifold.TSNE(n_components=3, init='pca'),
    }
    make_embedder = factories.get(method_ind)
    if make_embedder is not None:
        self.embedder = make_embedder()

    self.embedder.fit(converted)
    self.embed = self.embedder.embedding_
    self.embed_plot = self.embed
    self.gen_hist()
    self.plot_embedding()

    if not self.embedded:
        self.add_classes_frame()
    self.embedded = True
def visualize2D(model, layerID, inputData, labels, withTime = False):
    """Save a t-SNE scatter of one layer's outputs, coloured by label.

    Args:
        model: Keras model whose layer output is inspected.
        layerID: index into ``model.layers``.
        inputData: batch fed to the model's first layer.
        labels: one numeric label per output row (used for colour).
        withTime: when true each row's entries are themselves iterable and
            are flattened one level further.
    """
    print("\n Generating output distribution for layer {}".format(layerID))

    layer_fn = K.function([model.layers[0].input],
                          [model.layers[layerID].output])
    outputs = layer_fn([inputData])

    # One flat feature list per sample.
    rows = []
    for batch in outputs:
        for line in batch:
            if withTime:
                rows.append([deep for val in line for deep in val])
            else:
                rows.append([val for val in line])
    npvalues = np.array(rows)

    reducer = TSNE(n_components = 2, random_state = 0)
    # reducer = PCA(n_components = 2)
    points = reducer.fit_transform(npvalues)

    # Append the labels as a third column used for colouring.
    label_col = np.zeros((len(labels), 1))
    for i in range(len(labels)):
        label_col[i][0] = labels[i]
    points = np.hstack((points, label_col))

    frame = pd.DataFrame(points, columns = ('a', 'b', 'c'))
    axes = frame.plot.scatter(x = 'a', y = 'b', c = 'c', cmap = 'plasma')
    axes.get_figure().savefig('{}/{}'.format(cc.cfg['plots']['dir'],SCATTER_NAME))
    print(" ...done")
def index(request):
    """Render the conceptualiser page for the current user.

    Redirects to ``URL_PREFIX + '/'`` when the session has no ``'model'``
    entry. Otherwise lists the user's lexicons, each given a ``size``
    attribute holding its word count, plus the available reduction methods.
    """
    if 'model' not in request.session:
        return HttpResponseRedirect(URL_PREFIX + '/')

    template = loader.get_template('conceptualiser.html')

    lexicons = []
    for lexicon in Lexicon.objects.all().filter(author=request.user):
        lexicon.size = Word.objects.all().filter(lexicon=lexicon.id).count()
        lexicons.append(lexicon)

    context = {
        'STATIC_URL': STATIC_URL,
        'lexicons': lexicons,
        'methods': ["PCA", "TSNE", "MDS"],
    }
    return HttpResponse(template.render(context, request))
def generate_tsne(self, path="glove/model/model", size=(100, 100),
                  word_count=1000, embeddings=None):
    """Embed the word vectors with t-SNE and plot them with labels.

    Args:
        path: forwarded to ``_plot_with_labels``.
        size: forwarded to ``_plot_with_labels``.
        word_count: number of entries of ``self.words`` used as labels.
        embeddings: mapping whose values are the vectors; defaults to
            ``self.embeddings``.

    Returns:
        Whatever ``_plot_with_labels`` returns.
    """
    if embeddings is None:
        embeddings = self.embeddings

    from sklearn.manifold import TSNE

    vectors = numpy.asarray(list(embeddings.values()))
    low_dim_embs = TSNE(perplexity=30, n_components=2, init='pca',
                        n_iter=5000).fit_transform(vectors)
    return _plot_with_labels(low_dim_embs, self.words[:word_count], path, size)
def display_tsne(train_x, train_y, label_map=None):
    """
    t-distributed Stochastic Neighbor Embedding (t-SNE) visualization [1].

    [1]: Maaten, L., Hinton, G. (2008). Visualizing Data using t-SNE.
            JMLR 9(Nov):2579--2605.

    Args:
        train_x: 2d numpy array (batch, features) of samples
        train_y: labels for the samples; used to mask rows of the embedding
        label_map: a dict of labelled (str(int), string) key, value pairs;
            defaults to mapping every unique label to its own string form

    Raises:
        ValueError: if ``label_map`` is given but is not a dict.
    """
    # Validate before fitting so a bad argument fails immediately instead of
    # after the expensive t-SNE run.
    if label_map is not None and not isinstance(label_map, dict):
        raise ValueError('label_map most be a dict of a key'
                         ' mapping to its true label')

    tsne = TSNE(n_components=2, random_state=0)
    x_transform = tsne.fit_transform(train_x)
    y_unique = np.unique(train_y)
    if label_map is None:
        label_map = {str(i): str(i) for i in y_unique}

    colours = plt.cm.rainbow(np.linspace(0, 1, len(y_unique)))
    plt.figure()
    for index, cl in enumerate(y_unique):
        members = train_y == cl
        plt.scatter(x=x_transform[members, 0],
                    y=x_transform[members, 1],
                    s=100,
                    c=colours[index],
                    marker='o',
                    edgecolors='none',
                    label=label_map[str(cl)])
    plt.xlabel('X in t-SNE')
    plt.ylabel('Y in t-SNE')
    plt.legend(loc='upper right')
    plt.title('t-SNE visualization')
    # ``block`` must be given by keyword on current matplotlib.
    plt.show(block=False)
def main():
    """Show an annotated 2-D t-SNE scatter of the country data."""
    countries = dictdata(getCountrydict())
    coords = TSNE(n_components=2).fit_transform(countries.getData())

    hidden, graph = plt.subplots()
    graph.scatter(coords[:, 0], coords[:, 1], s=1)
    for row, country in enumerate(countries.getName()):
        position = (coords[row, 0], coords[row, 1])
        graph.annotate(country, xy=position, size=10)
    plt.show()
def plot_mfi(self, outputfile='embeddings.pdf', nb_clusters=8, weights='NA'):
    """Save a t-SNE word map of the most frequent items, coloured by cluster.

    Words of ``self.mfi`` that exist in ``self.w2v_model`` are embedded in
    2-D, clustered with Ward agglomerative clustering, and drawn as text.

    Args:
        outputfile: destination passed to ``savefig``.
        nb_clusters: number of clusters used for colouring.
        weights: unused here.
    """
    # Keep labels aligned with the embedded rows: words missing from the
    # model are skipped for BOTH, otherwise every later label would be drawn
    # at another word's position.
    labels = [w for w in self.mfi if w in self.w2v_model]
    X = np.asarray([self.w2v_model[w] for w in labels], dtype='float32')

    # dimension reduction:
    coor = TSNE(n_components=2).fit_transform(X)

    plt.clf()
    sns.set_style('dark')
    # ``sns.plt`` was only seaborn's handle on matplotlib.pyplot and has been
    # dropped from seaborn; use pyplot directly.
    plt.rcParams['axes.linewidth'] = 0.4
    fig, ax1 = plt.subplots()

    # first plot slices (invisible markers, they only set the data limits):
    x1, x2 = coor[:, 0], coor[:, 1]
    ax1.scatter(x1, x2, 100, edgecolors='none', facecolors='none')

    # clustering on top (add some colouring):
    clustering = AgglomerativeClustering(linkage='ward',
                                         affinity='euclidean',
                                         n_clusters=nb_clusters)
    clustering.fit(coor)

    # ``spectral`` was renamed ``nipy_spectral``; fall back to the old name
    # on matplotlib versions that predate the rename.
    cmap = getattr(plt.cm, 'nipy_spectral', None) or plt.cm.spectral

    # add names:
    for x, y, name, cluster_label in zip(x1, x2, labels, clustering.labels_):
        ax1.text(x, y, name, ha='center', va="center",
                 color=cmap(cluster_label / 10.),
                 fontdict={'family': 'Arial', 'size': 8})

    # control aesthetics:
    ax1.set_xlabel('')
    ax1.set_ylabel('')
    ax1.set_xticklabels([])
    ax1.set_xticks([])
    ax1.set_yticklabels([])
    ax1.set_yticks([])
    plt.savefig(outputfile, bbox_inches=0)
def main(_):
    """Train a word2vec model, save it, and plot a t-SNE of its embeddings.

    After training, the checkpoint, the embedding matrix and both vocabulary
    dictionaries are written to disk, every embedding is reduced to 2-D and
    plotted with its word, and an interactive shell is started when
    ``FLAGS.interactive`` is set.
    """
    if not FLAGS.train_data or not FLAGS.save_path:
        print("--train_data and --save_path must be specified.")
        sys.exit(1)

    opts = Options()
    with tf.Graph().as_default(), tf.Session() as session:
        model = Word2Vec(opts, session)
        for _ in xrange(opts.epochs_to_train):
            model.train()  # Process one epoch

        # Perform a final save.
        model.saver.save(session,
                         os.path.join(opts.save_path, opts.name+".model.base.ckpt"),
                         global_step=model.global_step)
        model.nearby(['Switzerland'])

        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
        plot_only = len(model._id2word)
        final_embeddings = model._emb.eval(session)
        print(final_embeddings)

        # Each output file is opened in a ``with`` block so it is flushed and
        # closed right after the dump instead of being left open.
        with open("embeddings/"+opts.name+".emb.base.pkl", "wb") as out:
            pkl.dump(final_embeddings, out)
        with open("dicts/"+opts.name+".w2i.base.pkl", "wb") as out:
            pkl.dump(model._word2id, out)
        with open("dicts/"+opts.name+".i2w.base.pkl", "wb") as out:
            pkl.dump(model._id2word, out)

        low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
        print(low_dim_embs)
        labels = [model._id2word[i] for i in xrange(plot_only)]
        plot_with_labels(low_dim_embs, labels,
                         "plots/"+opts.name+".tsne.base.png")

        if FLAGS.interactive:
            # E.g.,
            # [0]: model.analogy('france', 'paris', 'russia')
            # [1]: model.nearby(['proton', 'elephant', 'maxwell'])
            _start_shell(locals())
def calculate_tsne(self):
    """Run the SVD step, then store a 2-D t-SNE of ``self.data_vectors``.

    ``self.method`` selects the scikit-learn implementation (``SKLEARN``) or
    the ``MATTENS_TSNE`` function; the result goes to ``self.tsne_vectors``.
    """
    self._perform_svd()

    if self.method != SKLEARN:
        self.tsne_vectors = MATTENS_TSNE(
            self.data_vectors,
            no_dims=2,
            initial_dims=self.data_vectors.shape[1],
            perplexity=40.0)
        return

    embedder = TSNE(n_components=2, perplexity=40, verbose=2)
    self.tsne_vectors = embedder.fit_transform(self.data_vectors)
def main():
    """Show a t-SNE scatter of the module-level ``X`` with word labels.

    The first ``D`` embedded rows are annotated with ``index_word_map[i]``.
    """
    tsne = TSNE(perplexity=40)
    Z = tsne.fit_transform(X)
    plt.scatter(Z[:, 0], Z[:, 1])
    # ``range`` instead of the Python-2-only ``xrange``.
    for i in range(D):
        # Label passed positionally: the ``s=`` keyword was renamed to
        # ``text`` and no longer exists in current matplotlib.
        plt.annotate(index_word_map[i], xy=(Z[i, 0], Z[i, 1]))
    plt.show()
def main():
    """Show the donut data set, then its t-SNE embedding, coloured by label."""
    X, Y = get_donut_data()

    def show_points(points):
        # Same scatter styling for the raw data and for the embedding.
        plt.scatter(points[:, 0], points[:, 1], s=100, c=Y, alpha=0.5)
        plt.show()

    show_points(X)
    show_points(TSNE(perplexity=40).fit_transform(X))
def main():
    """Show a t-SNE embedding of the first 1000 Kaggle MNIST training rows."""
    Xtrain, Ytrain, _, _ = getKaggleMNIST()

    sample_size = 1000
    subset, subset_labels = Xtrain[:sample_size], Ytrain[:sample_size]

    embedded = TSNE().fit_transform(subset)
    plt.scatter(embedded[:, 0], embedded[:, 1], s=100, c=subset_labels,
                alpha=0.5)
    plt.show()
def main():
    """Compare t-SNE and PCA views of DBN features on Kaggle MNIST.

    Shows, in order: t-SNE of the first 600 DBN outputs, t-SNE of the first
    600 raw rows, and PCA of all DBN outputs.
    """
    Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()

    dbn = DBN([1000, 750, 500], UnsupervisedModel=AutoEncoder)
    # dbn = DBN([1000, 750, 500, 10])
    output = dbn.fit(Xtrain, pretrain_epochs=2)
    print("output.shape %s" % (output.shape,))

    def show_points(points, colours, title):
        # Shared styling for the three scatter plots.
        plt.scatter(points[:, 0], points[:, 1], s=100, c=colours, alpha=0.5)
        plt.title(title)
        plt.show()

    # sample before using t-SNE because it requires lots of RAM
    sample_size = 600
    sample_labels = Ytrain[:sample_size]
    tsne = TSNE()
    show_points(tsne.fit_transform(output[:sample_size]), sample_labels,
                "t-SNE visualization")

    # t-SNE on raw data
    show_points(tsne.fit_transform(Xtrain[:sample_size]), sample_labels,
                "t-SNE visualization")

    show_points(PCA().fit_transform(output), Ytrain, "PCA visualization")
def main():
    """Show the XOR data set, then its t-SNE embedding, coloured by label."""
    X, Y = get_xor_data()

    def show_points(points):
        # Same scatter styling for the raw data and for the embedding.
        plt.scatter(points[:, 0], points[:, 1], s=100, c=Y, alpha=0.5)
        plt.show()

    show_points(X)
    show_points(TSNE(perplexity=40).fit_transform(X))
def main():
    """Embed audio embeddings with t-SNE and write them out as CSV.

    Every embedding of the first ``EXAMPLES_SIZE_LIMIT`` clips is reduced to
    ``N_COMPONENTS`` dimensions. Each output row holds the coordinates
    followed by the class ids of the clip the embedding came from.
    """
    # Context managers so both pickle files are closed after loading.
    with open(AUDIO_EMBEDDINGS_DICT, 'rb') as fh:
        audio_embeddings_dict = cPickle.load(fh)
    with open(AUDIO_LABEL_INDICES_DICT, 'rb') as fh:
        audio_label_indices_dict = cPickle.load(fh)

    X = []
    ids = []
    # ``list(...)`` first: dict key views cannot be sliced on Python 3.
    for k in list(audio_embeddings_dict)[:EXAMPLES_SIZE_LIMIT]:
        for embedding in audio_embeddings_dict[k]:
            X.append(embedding)
            ids.append(audio_label_indices_dict[k])

    # Apply t-SNE
    tsne = TSNE(n_components=N_COMPONENTS, perplexity=PERPLEXITY,
                learning_rate=LEARNING_RATE, n_iter=N_ITER)
    Xtransformed = tsne.fit_transform(X)

    # Save the embeddings along with the list of class IDs associated with
    # the clip from which each was taken. The header is built from
    # N_COMPONENTS so it is defined for any dimensionality (identical to the
    # previous hard-coded headers for 2 and 3).
    header = ",".join("dim%d" % (d + 1) for d in range(N_COMPONENTS)) + ",labels"
    output_lines = [header]
    for coords, clip_ids in zip(Xtransformed, ids):
        output_lines.append(",".join([str(j) for j in coords]) +
                            "," + ",".join([str(k) for k in clip_ids]))

    output_file_contents = "\n".join(output_lines)
    with open(OUTPUT_FILENAME, 'w') as fh:
        fh.write(output_file_contents)