The following 50 code examples, extracted from open-source Python projects, illustrate how to use seaborn.set().
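Before the extracted examples, here is a minimal sketch (not taken from any of the projects below; the style, context, and font_scale values are arbitrary illustrations) showing that seaborn.set() configures the global matplotlib theme once, before any plots are drawn:

import matplotlib.pyplot as plt
import seaborn as sns

# Apply a global theme; later plotting calls pick these settings up automatically.
sns.set(style="whitegrid", context="notebook", font_scale=1.2)
plt.plot([0, 1, 2], [0, 1, 4])
plt.show()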
def visualize_results(self):
    # Visualize logistic curve using seaborn
    sns.set(style="darkgrid")
    sns.regplot(x="pageviews_cumsum", y="is_conversion", data=self.df,
                logistic=True, n_boot=500, y_jitter=.01,
                scatter_kws={"s": 60})
    sns.set(font_scale=1.3)
    # Recent seaborn versions no longer expose pyplot as sns.plt; use matplotlib.pyplot directly.
    plt.title('Logistic Regression Curve')
    plt.ylabel('Conversion probability')
    plt.xlabel('Cumulative sum of pageviews')
    plt.subplots_adjust(right=0.93, top=0.90, left=0.10, bottom=0.10)
    plt.show()
def outlier_identification(self, model, x_train, y_train):
    # Split the training data into an extra set of test
    x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
    print('\nOutlier shapes')
    print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
    model.fit(x_train_split, y_train_split)
    y_predicted = model.predict(x_test_split)
    residuals = np.absolute(y_predicted - y_test_split)
    rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
    outliers_mask = residuals >= rmse_pred_vs_actual
    outliers_mask = np.concatenate([np.zeros((np.shape(y_train_split)[0],), dtype=bool), outliers_mask])
    not_an_outlier = outliers_mask == 0
    # Resample the training set from split, since the set was randomly split
    x_out = np.insert(x_train_split, np.shape(x_train_split)[0], x_test_split, axis=0)
    y_out = np.insert(y_train_split, np.shape(y_train_split)[0], y_test_split, axis=0)
    return x_out[not_an_outlier, ], y_out[not_an_outlier, ]
def predicted_vs_actual_y_xgb(self, xgb, best_nrounds, xgb_params, x_train_split, x_test_split,
                              y_train_split, y_test_split, title_name):
    # Split the training data into an extra set of test
    # x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
    dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
    dtest_split = xgb.DMatrix(x_test_split)
    print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
    gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
    y_predicted = gbdt.predict(dtest_split)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, y_predicted, s=20)
    rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
    plt.xlabel('Actual y')
    plt.ylabel('Predicted y')
    plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
    plt.tight_layout()
def __init__(self, parent):
    fig = Figure(figsize=(4, 4), dpi=100, tight_layout=True)
    super(DefaultGraph, self).__init__(fig)
    self.setParent(parent)
    sns.set(style="dark")

    for index, s in zip(range(9), np.linspace(0, 3, 10)):
        axes = fig.add_subplot(3, 3, index + 1)
        x, y = np.random.randn(2, 50)
        cmap = sns.cubehelix_palette(start=s, light=1, as_cmap=True)
        sns.kdeplot(x, y, cmap=cmap, shade=True, cut=5, ax=axes)
        axes.set_xlim(-3, 3)
        axes.set_ylim(-3, 3)
        axes.set_xticks([])
        axes.set_yticks([])
    fig.suptitle("Activity Browser", y=0.5, fontsize=30, backgroundcolor=(1, 1, 1, 0.5))

    self.setSizePolicy(QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Expanding)
    self.updateGeometry()
def plot_correlation_fig(data):
    """
    Creates a correlation heat map for all columns in user data.

    Parameters
    ----------
    data: Pandas DataFrame
        User data file as a Pandas DataFrame

    Returns
    -------
    Matplotlib Figure object.
    """
    sns.set(context='talk', style='white')
    fig = plt.figure()
    sns.heatmap(data.corr(), vmin=-1, vmax=1)
    plt.tight_layout()
    return fig
def plot_count_fig(tasks):
    """
    Create count plot, as a 2-row x 3-col bar plot of data points for each k in each covar.

    Parameters
    ----------
    tasks: list(dict)

    Returns
    -------
    Matplotlib Figure object.
    """
    sns.set(context='talk', style='whitegrid')
    df = pd.DataFrame(filter_dict_list_by_keys(tasks, ['k', 'covar_type', 'covar_tied']))
    df = df.loc[:, ['k', 'covar_type', 'covar_tied', 'bic', 'aic']]
    df['covar_type'] = [x.capitalize() for x in df['covar_type']]
    df['covar_tied'] = [['Untied', 'Tied'][x] for x in df['covar_tied']]
    f = sns.factorplot(x='k', kind='count', col='covar_type', row='covar_tied', data=df,
                       row_order=['Tied', 'Untied'], col_order=['Full', 'Diag', 'Spher'],
                       legend=True, legend_out=True, palette='Blues_d')
    f.set_titles("{col_name}-{row_name}")
    f.set_xlabels("Num. of Clusters (K)")
    return f.fig
def generateRawPlot(test):
    # set figure size
    plt.figure(figsize=(15, 6))
    handles = []

    # draw plot
    for raw in test:
        label = raw.pop(0)
        xAxis = range(len(raw))
        yAxis = [float(i) for i in raw]
        handle, = plt.plot(xAxis, yAxis, label=label)
        handles.append(handle)

    # put axis labels
    plt.xlabel("operations")
    plt.ylabel("time (s)")
    plt.legend(handles=handles)
def generateMassPlot(test):
    # set figure size
    plt.figure(figsize=(15, 6))
    handles = []

    # draw plot
    for raw in test:
        label = raw.pop(0)
        yAxis = [i / (len(raw)) for i in range(len(raw) + 1)]
        values = sorted([float(i) for i in raw])
        xAxis = [0] + values
        handle, = plt.plot(xAxis, yAxis, label=label)
        handles.append(handle)

    # put axis labels
    plt.xlabel("time (s)")
    plt.ylabel("probability of completion")
    plt.legend(handles=handles)
def cor_df(data, cols=None, xticklabels=False, yticklabels=False, close=True):
    '''
    Compute a correlation matrix and draw it as a heatmap.

    Arguments:
        data: input DataFrame
        cols: list of column names to use; defaults to all columns of data
        close: whether to close the figure after drawing
    Returns:
        corrmat: correlation matrix as a DataFrame
        fig: heatmap figure object
    '''
    if cols is None:
        cols = list(data.columns)
    corrmat = data[cols].corr()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    sns.set(context='paper', font='monospace')
    sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax,
                xticklabels=xticklabels, yticklabels=yticklabels)
    ax.set_title('Heatmap of Correlation Matrix')
    if close:
        plt.close('all')
    return corrmat, fig


# Distribution
def benchmark_spark(ratings, factors, iterations=5):
    conf = (SparkConf()
            .setAppName("implicit_benchmark")
            .setMaster('local[*]')
            .set('spark.driver.memory', '16G'))
    context = SparkContext(conf=conf)
    spark = SparkSession(context)

    times = {}
    try:
        ratings = convert_sparse_to_dataframe(spark, context, ratings)
        for rank in factors:
            als = ALS(rank=rank, maxIter=iterations, alpha=1, implicitPrefs=True,
                      userCol="row", itemCol="col", ratingCol="data")
            start = time.time()
            als.fit(ratings)
            elapsed = time.time() - start
            times[rank] = elapsed / iterations
            print("spark. factors=%i took %.3f" % (rank, elapsed / iterations))
    finally:
        spark.stop()
    return times
def generate_speed_graph(data, filename="als_speed.png",
                         keys=['gpu', 'cg2', 'cg3', 'cholesky'],
                         labels=None, colours=None):
    labels = labels or {}
    colours = colours or {}

    seaborn.set()
    fig, ax = plt.subplots()
    factors = data['factors']
    for key in keys:
        ax.plot(factors, data[key],
                color=colours.get(key, COLOURS.get(key)),
                marker='o', markersize=6)
        ax.text(factors[-1] + 5, data[key][-1], labels.get(key, LABELS[key]), fontsize=10)

    ax.set_ylabel("Seconds per Iteration")
    ax.set_xlabel("Factors")
    plt.savefig(filename, bbox_inches='tight', dpi=300)
def find_n_most_similar_articles(self):
    """
    Find the n most similar articles with the highest similarity score for each article in the DataFrame.
    :return:
    """
    # Iterate over each article in DataFrame
    for index, row in self.df_article_vectors.iterrows():
        # Get the similarity scores of the current article compared to all other articles
        similarity_scores = self.similarity_score_dict[index]
        # Find the highest similarity scores in the similarity_score_dict until we have found the n most similar.
        for i in range(0, self.n_most_similar):
            # Find most similar article, i.e. with highest cosine similarity. Note: if Euclidean distance, then min!
            most_similar_article_index = max(similarity_scores, key=similarity_scores.get)
            most_similar_article_score = similarity_scores[most_similar_article_index]
            del similarity_scores[most_similar_article_index]
            # Find corresponding title and set it as most similar article i in DataFrame
            title = self.df_article_vectors.loc[most_similar_article_index]['title'].encode('utf-8')
            title_plus_score = "{} ({:.2f})".format(title, most_similar_article_score)
            self.df_article_vectors.set_value(index, 'most_similar_' + str(i + 1), title_plus_score)
def tokenize(text):
    """
    Tokenizes sequences of text and stems the tokens.
    :param text: String to tokenize
    :return: List with stemmed tokens
    """
    tokens = nl.WhitespaceTokenizer().tokenize(text)
    tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
    stems = []
    stemmer = SnowballStemmer("english")
    for token in tokens:
        token = stemmer.stem(token)
        if token != "":
            stems.append(token)
    return stems
def outlier_prediction(x_train, y_train):
    # Use built-in isolation forest or use predicted vs. actual
    # Compute squared residuals of every point
    # Make a threshold criteria for inclusion
    # The prediction returns 1 if sample point is inlier. If outlier prediction returns -1
    rng = np.random.RandomState(42)
    clf_all_features = IsolationForest(max_samples=100, random_state=rng)
    clf_all_features.fit(x_train)

    # Predict if a particular sample is an outlier using all features for higher dimensional data set.
    y_pred_train = clf_all_features.predict(x_train)

    # Exclude suggested outlier samples for improvement of prediction power/score
    # (list comprehension instead of map() so the boolean mask also works under Python 3)
    outlier_map_out_train = np.array([x == 1 for x in y_pred_train])
    x_train_modified = x_train[outlier_map_out_train, ]
    y_train_modified = y_train[outlier_map_out_train, ]

    return x_train_modified, y_train_modified
def drop_variable(self, df):
    # if HousePrices._is_one_hot_encoder:
    #     # Drop all categorical feature helping columns ('Num')
    #     # Todo: is it defined when importing data set? _feature_names_num
    #     for feature_name in HousePrices._feature_names_num:
    #         df = df.drop([feature_name], axis=1)

    # is_with_feature_agglomeration = 0
    # if is_with_feature_agglomeration:
    #     print(df.shape)
    #     df = HousePrices.feature_agglomeration(df)
    #     print(df.shape)

    # df = df.drop(['Fireplaces'], axis=1)

    df = df.drop(['Id'], axis=1)

    if not any(tuple(df.columns == 'SalePrice')):
        # All feature variable names occurring in the test data are assigned to the public
        # variable df_test_all_feature_var_names.
        self.df_test_all_feature_var_names = df.columns
    return df
def predicted_vs_actual_sale_price(self, x_train, y_train, title_name):
    # Split the training data into an extra set of test
    x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
    print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
    lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
                            0.3, 0.6, 1], max_iter=50000, cv=10)
    # lasso = RidgeCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
    #                         0.3, 0.6, 1], cv=10)
    lasso.fit(x_train_split, y_train_split)
    y_predicted = lasso.predict(X=x_test_split)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, y_predicted, s=20)
    rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
    plt.tight_layout()
def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
    # Split the training data into an extra set of test
    x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
    dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
    dtest_split = xgb.DMatrix(x_test_split)

    res = xgb.cv(xgb_params, dtrain_split, num_boost_round=1000, nfold=4, seed=seed, stratified=False,
                 early_stopping_rounds=25, verbose_eval=10, show_stdv=True)
    best_nrounds = res.shape[0] - 1
    print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
    gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
    y_predicted = gbdt.predict(dtest_split)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, y_predicted, s=20)
    rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
    plt.tight_layout()
def ranges_to_list(x, start=0, stop=None):
    s = set()
    for xi in x:
        xi = str(xi)
        if xi.find('-') >= 0:
            t = xi.split('-')
            if len(t) != 2:
                raise ValueError('Invalid range!')
            if len(t[0]) == 0:
                t[0] = start
            if len(t[1]) == 0:
                t[1] = stop
            s |= set(range(int(t[0]), int(t[1]) + 1))
        else:
            s.add(int(xi))
    s = sorted(list(s))
    return s
def scoped_mpl_import():
    import matplotlib
    matplotlib.rcParams['backend'] = MPL_BACKEND

    import matplotlib.pyplot as plt
    plt.rcParams['toolbar'] = 'None'  # mute matplotlib toolbar

    import seaborn as sns
    sns.set(style="whitegrid", color_codes=True, font_scale=1.0,
            rc={'lines.linewidth': 1.0,
                'backend': matplotlib.rcParams['backend']})
    palette = sns.color_palette("Blues_d")
    palette.reverse()
    sns.set_palette(palette)

    return (matplotlib, plt, sns)
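A hypothetical usage sketch for the function above (MPL_BACKEND is defined elsewhere in the host project): the helper is called once and the returned modules are used for all subsequent plotting, so the backend, rc, and palette settings are applied in one place.

# Hypothetical usage of scoped_mpl_import(); assumes MPL_BACKEND is set by the project.
matplotlib, plt, sns = scoped_mpl_import()
plt.plot([1, 2, 3], [1, 4, 9])
plt.show()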
def cross_section_cndl(data, factor_name):
    '''Draw a cross-sectional box plot of a factor on each date.

    Parameters
    ------------------------------
    data: DataFrame(index: [Date, IDs], factor1, factor2, ...)
    factor_name: str
    '''
    data = data.reset_index()
    sns.set(style='ticks')

    ax = sns.boxplot(x='Date', y=factor_name, data=data, palette='PRGn')
    sns.despine(offset=10, trim=True)
    return ax
def factor_plot(dataFrame, factors, prediction, color="Set3"):
    # First, plot the total for each factor. Then, plot the total for each
    # factor for the prediction variable (so in a conversion example, how
    # many people converted, revenue per country, etc.)

    # These refer to the rows and columns of the axis numpy array; not the
    # data itself.
    row = 0
    column = 0

    sns.set(style="whitegrid")
    # TODO: Set the width based on the max number of unique
    # values for the factors.
    plots = plt.subplots(len(factors), 2, figsize=(8, 12))  # It should

    for factor in factors:
        sns.countplot(x=factor, palette="Set3", data=dataFrame, ax=plots[1][row][column])
        # Then print the total for each prediction
        sns.barplot(x=factor, y=prediction, data=dataFrame, ax=plots[1][row][column + 1])
        row += 1

    plt.tight_layout()  # Need this or else plots will crash into each other
def swarm(data, x, y, xscale='linear', yscale='linear'):
    # set default pretty settings from Seaborn
    sns.set(style="white", palette="muted")
    sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 0.2})

    # create the plot
    g = sns.swarmplot(x=x, y=y, data=data, palette='RdYlGn')
    plt.tick_params(axis='both', which='major', pad=10)
    g.set(xscale=xscale)
    g.set(yscale=yscale)

    # Setting plot limits
    start = data[y].min().min()
    plt.ylim(start,)

    sns.despine()
def correlation(data, title=''):
    corr = data.corr(method='spearman')
    mask = np.zeros_like(corr)
    mask[np.triu_indices_from(mask)] = True

    sns.set(style="white")
    sns.set_context("notebook", font_scale=2, rc={"lines.linewidth": 0.3})

    rcParams['figure.figsize'] = 25, 12
    rcParams['font.family'] = 'Verdana'
    rcParams['figure.dpi'] = 300

    g = sns.heatmap(corr, mask=mask, linewidths=1, cmap="RdYlGn", annot=False)
    g.set_xticklabels(data, rotation=25, ha="right")
    plt.tick_params(axis='both', which='major', pad=15)
def plot(self, ax=None, holdon=False):
    sns.set(style="white")

    data = self.X

    if ax is None:
        _, ax = plt.subplots()

    for i, index in enumerate(self.clusters):
        point = np.array(data[index]).T
        ax.scatter(*point, c=sns.color_palette("hls", self.K + 1)[i])

    for point in self.centroids:
        ax.scatter(*point, marker='x', linewidths=10)

    if not holdon:
        plt.show()
def plot_mds(subjects, experiments, axes):
    for subj, exp, ax in zip(subjects, experiments, axes):
        res_fname = "correlation_analysis/{}_{}_ifs.pkz".format(subj, exp)
        res = moss.load_pkl(res_fname)
        sorter = np.argsort(np.abs(res.prefs))

        x_, y_ = res.mds_coords.T.dot(res.prefs)
        t = np.arctan2(y_, x_)
        rot = [[np.cos(t), np.sin(t)], [-np.sin(t), np.cos(t)]]
        x, y = np.dot(rot, res.mds_coords[sorter].T)

        cmap = get_colormap(exp)
        ax.scatter(x, y, c=res.prefs[sorter], cmap=cmap,
                   vmin=-1.75, vmax=1.75, s=8, linewidth=0)
        ax.set(xlim=(-.9, .9), ylim=(-.9, .9), aspect="equal")
        ax.set_axis_off()
def visualize_housing_data(df):
    sns.set(style='whitegrid', context='notebook')
    cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
    sns.pairplot(df[cols], size=2.5)
    plt.show()

    correlation_matrix = np.corrcoef(df[cols].values.T)
    sns.set(font_scale=1.5)
    heatmap = sns.heatmap(
        correlation_matrix,
        cbar=True,
        annot=True,
        square=True,
        fmt='.2f',
        annot_kws={'size': 15},
        yticklabels=cols,
        xticklabels=cols,
    )
    plt.show()
def visualize_results(df):
    # Visualize logistic curve using seaborn
    sns.set(style="darkgrid")
    sns.regplot(x="pageviews_cumsum", y="is_conversion", data=df,
                logistic=True, n_boot=500, y_jitter=.01,
                scatter_kws={"s": 60})
    sns.set(font_scale=1.3)
    # Recent seaborn versions no longer expose pyplot as sns.plt; use matplotlib.pyplot directly.
    plt.title('Logistic Regression Curve')
    plt.ylabel('Conversion probability')
    plt.xlabel('Cumulative sum of pageviews')
    plt.subplots_adjust(right=0.93, top=0.90, left=0.10, bottom=0.10)
    plt.show()


# Run the final program
def tokenize(text):
    """
    Tokenizes sequences of text and stems the tokens.
    :param text: String to tokenize
    :return: List with stemmed tokens
    """
    tokens = nltk.WhitespaceTokenizer().tokenize(text)
    tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
    stems = []
    stemmer = SnowballStemmer("english")
    for token in tokens:
        token = stemmer.stem(token)
        if token != "":
            stems.append(token)
    return stems
def find_n_most_similar_articles(self):
    """
    Find the n most similar articles with the highest similarity score for each TMT article in the DataFrame.
    :return:
    """
    # Iterate over each article in DataFrame
    for index, row in self.df_article_vectors.iterrows():
        # Get the similarity scores of the current article compared to all other articles
        similarity_scores = self.similarity_score_dict[index]
        # Find the highest similarity scores in the similarity_score_dict until we have found the n most similar.
        for i in range(0, self.n_most_similar):
            # Find most similar article, i.e. with highest cosine similarity. Note: if Euclidean distance, then min!
            most_similar_article_index = max(similarity_scores, key=similarity_scores.get)
            most_similar_article_score = similarity_scores[most_similar_article_index]
            del similarity_scores[most_similar_article_index]
            # Find corresponding title and set it as most similar article i in DataFrame
            title = self.df_article_vectors.loc[most_similar_article_index]['title'].encode('utf-8')
            title_plus_score = "{} ({:.2f})".format(title, most_similar_article_score)
            self.df_article_vectors.set_value(index, 'most_similar_' + str(i + 1), title_plus_score)
def image(path, costs):
    ys = ['0', '1', '2', '3', '4', '5', '6', '7+', 'X']
    xs = [costs.get(k, 0) for k in ys]
    sns.set_style('white')
    sns.set(font='Concourse C3', font_scale=3)
    g = sns.barplot(ys, xs, palette=['grey'] * len(ys))
    g.axes.yaxis.set_ticklabels([])
    rects = g.patches
    sns.set(font='Concourse C3', font_scale=2)
    for rect, label in zip(rects, xs):
        if label == 0:
            continue
        height = rect.get_height()
        g.text(rect.get_x() + rect.get_width() / 2, height + 0.5, label,
               ha='center', va='bottom')
    g.margins(y=0, x=0)
    sns.despine(left=True, bottom=True)
    g.get_figure().savefig(path, transparent=True, pad_inches=0, bbox_inches='tight')
    plt.clf()  # Clear all data from matplotlib so it does not persist across requests.
    return path
def plot_decision_function(score_df, partition, output_file):
    """
    Plots the decision function for a given partition (either 'train' or 'test')
    and saves a figure to file.

    Arguments:
    :param score_df: a specific fold's decision scores and status
    :param partition: either 'train' or 'test' will plot performance
    :param output_file: file to output the figure
    """
    ax = sns.kdeplot(score_df.ix[(score_df.status == 1) &
                                 (score_df.partition == partition), :].decision,
                     color='red', label='Deficient', shade=True)
    ax = sns.kdeplot(score_df.ix[(score_df.status == 0) &
                                 (score_df.partition == partition), :].decision,
                     color='blue', label='Wild-Type', shade=True)
    ax.set(xlabel='Decision Function', ylabel='Density')
    ax.set_title('Classifier Decision Function')
    sns.despine()
    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
def sns_triangle(matrix, plt_title, only_class=None):
    sns.set(style="white")

    # Generate a mask for the upper triangle
    mask = np.zeros_like(matrix, dtype=np.bool)
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = subplots(figsize=(11, 9))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(matrix.as_matrix(), mask=mask, cmap=cmap, vmax=.3,
                square=True, xticklabels=5, yticklabels=5,
                linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
    title(plt_title)
    xlabel('Preprocessed Features')
    ylabel('Preprocessed Features')
    if only_class is None:
        only_class = ''
    savefig('images/triangle' + only_class + '.png')
def setup_theme(context='notebook', style="darkgrid", palette='deep', font='Helvetica'):
    try:
        import seaborn as sns
        extra_rc = {"lines.linewidth": 1,
                    "lines.markeredgewidth": 1,
                    "patch.edgecolor": 'k',
                    }
        sns.set(context=context, style=style, palette=palette, rc=extra_rc)
    except (ImportError, SyntaxError):
        pass

    rc('text', usetex=True)

    if font == "Helvetica":
        rc('font', **{'family': 'sans-serif', 'sans-serif': ['Helvetica']})
    elif font == "Palatino":
        rc('font', **{'family': 'serif', 'serif': ['Palatino']})
    elif font == "Schoolbook":
        rc('font', **{'family': 'serif', 'serif': ['Century Schoolbook L']})
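In the function above, the seaborn import is wrapped in try/except so the theme degrades gracefully when seaborn is unavailable, while rc('text', usetex=True) assumes a working LaTeX installation. A hypothetical call:

# Hypothetical usage of setup_theme(); requires LaTeX because of usetex=True.
setup_theme(context='talk', style='whitegrid', font='Palatino')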
def plot_corr_heatmap(corr, labels, heading):
    sns.set(style="white")

    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=np.bool)
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(8, 8))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, square=True,
                xticklabels=labels, yticklabels=labels,
                linewidths=.5, ax=ax, cbar_kws={"shrink": .5}, annot=True)
    ax.set_title(heading)
    plt.show()
def get_level_colors(index):
    pallete = sns.color_palette("colorblind") * int(1e6)

    colors = list()

    if hasattr(index, "levels"):
        for level in index.levels:
            color_dict = dict(zip(level, pallete))
            level_colors = [color_dict[x] for x in index.get_level_values(level.name)]
            colors.append(level_colors)
    else:
        color_dict = dict(zip(set(index), pallete))
        index_colors = [color_dict[x] for x in index]
        colors.append(index_colors)

    return colors
def sanity_check(Ds):
    names, Ds = zip(*Ds)
    D0 = Ds[0]
    Ps = set(D0.policy.unique())
    Es = set(D0.example.unique())

    # Sanity check.
    for name, dd in zip(names, Ds):
        # same policy and examples
        if (set(dd.policy.unique()) != Ps or set(dd.example.unique()) != Es):
            print(colors.bold % colors.red % '======================================')
            print(colors.bold % colors.red % 'WARNING: some policies arent finished.')
            print(colors.bold % colors.red % '======================================')
            print(name, 'want/got sizes %s/%s' % (len(Ps), len(set(dd.policy.unique()))))
        # assert set(dd.policy.unique()) == Ps   # same policies
        # assert set(dd.example.unique()) == Es  # same examples

    bestof = aggregate_multiple_runtime_trials(Ds, Ps)
    return D0, Ds, bestof
def target_plot(self):
    target_type = self.input_data.metadata.loc[self.target].type
    target_data = self.input_data.df[self.target]
    sns.set(style="white", color_codes=True)
    if not self.run_time_config['is_time_series']:
        if target_type == ColType.BINARY:
            plt.figure(figsize=(6, 1))
            sns.barplot(target_data.sum() / target_data.shape[0])
            plt.xlim([0, 1])
            plt.title(target_data.name + ' rate')
        elif target_type == ColType.NUMERIC or target_type == ColType.ORDINAL:
            plt.figure(figsize=(6, 2))
            ax = sns.distplot(target_data, hist_kws=dict(edgecolor='black'))
            ax.set_xlim(target_data.min(), target_data.max())
            plt.title(target_data.name + ' histogram')
    else:
        self.time_series_target_plot()
def predictions_vs_actual_regression(model_results, model_name, size=6, bins=None,
                                     gridsize=30, outlier_ratio=None, **kwargs):
    holdout = model_results.holdout_data
    target = model_results.target
    if outlier_ratio is not None:
        holdout = utils.remove_outlier_rows(holdout, 'prediction', outlier_ratio)
        holdout = utils.remove_outlier_rows(holdout, target, outlier_ratio)
    sns.set(style="white", color_codes=True)
    marginal_kws = dict(hist_kws=dict(edgecolor='black'))
    plt.suptitle('{0}: Predictions vs Actual'.format(model_name), fontsize=14)
    grid = sns.jointplot('prediction', target, holdout, 'hexbin', gridsize=gridsize,
                         size=size, bins=bins, space=0, marginal_kws=marginal_kws, **kwargs)
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)  # shrink fig so cbar is visible
    cax = grid.fig.add_axes([.95, .18, .04, .5])  # x, y, width, height
    # Recent seaborn versions no longer expose pyplot as sns.plt; call pyplot directly.
    color_bar = plt.colorbar(cax=cax)
    if bins is None:
        color_bar.set_label('count')
    elif bins == 'log':
        color_bar.set_label('log_10(count)')
    return grid
def scatter(x, y, xlabel='x', ylabel='y', title=None, line=False, name=None, show=False):
    sns.set()
    title = "%s vs %s" % (xlabel, ylabel) if title is None else title
    plt.scatter(x, y)
    if line:
        plt.plot(x, y)
    plt.title(title)
    plt.ylabel('y: %s' % ylabel)
    plt.xlabel('x: %s' % xlabel)
    if name is not None:
        # fig = plt.Figure()
        plt.savefig(name)
    if show:
        plt.show()
    plt.clf()
def distribution(data, xlabel="data", ylabel="percentage", name=None):
    ax = plt.axes()
    ax.set(xlabel=xlabel, ylabel=ylabel)
    ds = sns.distplot(data, ax=ax)
    plt.show()
    if name is not None:
        ds.get_figure().savefig(name)
def corr_heatmap(df, cols=None, name=None):
    sns.set()
    if cols is None:
        cols = [i for i in df.columns.values if df[i].dtype != 'object']
    df = df[cols].corr()
    print(df.shape)
    ds = sns.heatmap(df, annot=False)
    plt.show()
    if name is not None:
        ds.get_figure().savefig(name)
def get_average_metrics(metrics_list):
    """Get algorithm performance over all folds"""
    eval_metrics = {}

    track_metric = 0
    for fold in metrics_list:
        track_metric += fold["auc"]
    # Divide by number of folds
    track_metric /= len(metrics_list)
    eval_metrics.update({"auc": track_metric})

    for metric in ("fpr", "tpr"):
        # the fpr, tpr output from scikit-learn may not have the same
        # number of elements in the arrays, set to Null for now
        eval_metrics.update({metric: [0, 0]})  # TODO

    for threshold in THRESHOLDS:
        eval_metrics[threshold] = {}
        for metric in ("precision", "recall", "f1"):
            track_metric = 0
            for fold in metrics_list:
                track_metric += fold[threshold][metric]
            # Divide by number of folds
            track_metric /= len(metrics_list)
            eval_metrics[threshold].update({metric: track_metric})

    return eval_metrics
def precision_recall_at_x_proportion(test_labels, test_predictions, x_proportion=0.01,
                                     return_cutoff=False):
    """Compute precision, recall, F1 for a specified fraction of the test set.

    :params list test_labels: true labels on test set
    :params list test_predicted: predicted labels on test set
    :params float x_proportion: proportion of the test set to flag
    :params bool return_cutoff: if True return the cutoff probability
    :returns float precision: fraction correctly flagged
    :returns float recall: fraction of the positive class recovered
    :returns float f1:
    """
    cutoff_index = int(len(test_predictions) * x_proportion)
    cutoff_index = min(cutoff_index, len(test_predictions) - 1)

    sorted_by_probability = np.sort(test_predictions)[::-1]
    cutoff_probability = sorted_by_probability[cutoff_index]

    test_predictions_binary = [1 if x > cutoff_probability else 0 for x in test_predictions]

    precision, recall, f1, _ = metrics.precision_recall_fscore_support(
        test_labels, test_predictions_binary)

    # Only interested in metrics for label 1
    precision, recall, f1 = precision[1], recall[1], f1[1]

    if return_cutoff:
        return precision, recall, f1, cutoff_probability
    else:
        return precision, recall, f1
def plot_allkfolds_ROC(timestamp, cv, fpr_arr, tpr_arr):
    sns.set(style="white", palette="muted", color_codes=True)

    mean_tpr = 0.0
    mean_fpr = 0.0
    all_roc_auc = []
    bins_roc = np.linspace(0, 1, 300)
    with plt.style.context(('seaborn-muted')):
        fig, ax = plt.subplots(figsize=(10, 8))
        for i, (train, test) in enumerate(cv):
            mean_tpr += interp(bins_roc, fpr_arr[i], tpr_arr[i])
            mean_tpr[0] = 0.0
            mean_fpr += interp(bins_roc, fpr_arr[i], tpr_arr[i])
            mean_fpr[0] = 0.0
            roc_auc = metrics.auc(fpr_arr[i], tpr_arr[i])
            all_roc_auc.append(roc_auc)
            ax.plot(fpr_arr[i], tpr_arr[i], lw=1, label='KFold %d (AUC = %0.2f)' % (i, roc_auc))
        ax.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random')

        mean_tpr /= len(cv)
        mean_tpr[-1] = 1.0
        mean_auc = np.mean(all_roc_auc)
        ax.plot(bins_roc, mean_tpr, 'k--', label='Mean ROC (AUC = %0.2f)' % mean_auc, lw=2)
        ax.set_xlim([-0.05, 1.05])
        ax.set_ylim([-0.05, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Receiver Operating Characteristic')
        ax.legend(loc="lower right")
        plt.savefig('{}_roc.png'.format(timestamp))
    plt.close('all')
    return mean_auc
def feature_mapping_to_numerical_values(self, df):
    TwoSigmaFinModTools._is_one_hot_encoder = 0
    mask = ~df.isnull()
    # Assume that training set has all possible feature_var_names
    # Although it may occur in real life that a training set may hold a feature_var_name.
    # But it is probably avoided since such features cannot be part of the trained learning algo.
    # Add missing feature_var_names of training set not occurring in test set.
    # Add these with zeros in columns.
    if not any(tuple(df.columns == 'y')):
        # All one-hot encoded feature var names occurring in test data is assigned the public variable
        # df_test_all_feature_var_names.
        self.df_test_all_feature_var_names = df.columns

    _feature_names_num = np.zeros((TwoSigmaFinModTools._non_numerical_feature_names.shape[0],), dtype=object)
    ith = 0
    for feature_name in TwoSigmaFinModTools._non_numerical_feature_names:
        # Create a feature_nameNum list
        feature_name_num = ''.join([feature_name, 'Num'])
        _feature_names_num[ith] = feature_name_num
        ith += 1
        TwoSigmaFinModTools.encode_labels_in_numeric_format(df, feature_name)

        if TwoSigmaFinModTools._is_one_hot_encoder:
            is_with_label_binarizer = 0
            if is_with_label_binarizer:
                mapper_df = DataFrameMapper([(feature_name, LabelBinarizer())], df_out=True)
                feature_var_values = mapper_df.fit_transform(df.copy())
                print(df[feature_name].isnull().sum().sum())
                print(df[feature_name][mask[feature_name]].isnull().sum().sum())
                for ite in feature_var_values.columns:
                    df[ite] = feature_var_values[ite]
            else:
                TwoSigmaFinModTools.one_hot_encoder(df, feature_name)
    TwoSigmaFinModTools._feature_names_num = pd.Series(data=_feature_names_num, dtype=object)
def dendrogram(df, number_of_clusters=None):
    # Create Dendrogram
    # (the default cluster count depends on the DataFrame, so it is computed inside
    # the function rather than in the signature, where df is not yet defined)
    if number_of_clusters is None:
        number_of_clusters = int(df.shape[1] / 1.2)
    agglomerated_features = FeatureAgglomeration(n_clusters=number_of_clusters)
    used_networks = np.arange(0, number_of_clusters, dtype=int)

    # Create a custom palette to identify the networks
    network_pal = sns.cubehelix_palette(len(used_networks),
                                        light=.9, dark=.1, reverse=True,
                                        start=1, rot=-2)
    network_lut = dict(zip(map(str, df.columns), network_pal))

    # Convert the palette to vectors that will be drawn on the side of the matrix
    networks = df.columns.get_level_values(None)
    network_colors = pd.Series(networks, index=df.columns).map(network_lut)
    sns.set(font="monospace")

    # Create custom colormap
    cmap = sns.diverging_palette(h_neg=210, h_pos=350, s=90, l=30, as_cmap=True)
    cg = sns.clustermap(df.astype(float).corr(), cmap=cmap, linewidths=.5,
                        row_colors=network_colors, col_colors=network_colors)
    plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    plt.show()
def predicted_vs_actual_y_input_model(self, model, x_train_split, x_test_split, y_train_split,
                                      y_test_split, title_name):
    # Split the training data into an extra set of test
    # x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
    print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
    model.fit(x_train_split, y_train_split)
    y_predicted = model.predict(x_test_split)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, y_predicted, s=20)
    rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
    plt.xlabel('Actual y')
    plt.ylabel('Predicted y')
    plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
    plt.tight_layout()
def __init__(self, parent, data, labels, width=6, height=6, dpi=100):
    figure = Figure(figsize=(width, height), dpi=dpi, tight_layout=True)
    axes = figure.add_subplot(111)
    super(CorrelationPlot, self).__init__(figure)
    self.setParent(parent)
    sns.set(style="darkgrid")

    corr = data
    # cmap = sns.diverging_palette(220, 10, as_cmap=True)
    # corrplot(data, names=labels, annot=True, sig_stars=False,
    #          diag_names=True, cmap=cmap, ax=axes, cbar=True)

    df = pd.DataFrame(data=data, columns=labels)
    corr = df.corr()

    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=np.bool)
    mask[np.triu_indices_from(mask)] = True

    # Draw the heatmap with the mask and correct aspect ratio
    vmax = np.abs(corr.values[~mask]).max()
    # vmax = np.abs(corr).max()
    sns.heatmap(corr, mask=mask, cmap=plt.cm.PuOr, vmin=-vmax, vmax=vmax,
                square=True, linecolor="lightgray", linewidths=1, ax=axes)
    for i in range(len(corr)):
        axes.text(i + 0.5, i + 0.5, corr.columns[i], ha="center", va="center", rotation=0)
        for j in range(i + 1, len(corr)):
            s = "{:.3f}".format(corr.values[i, j])
            axes.text(j + 0.5, i + 0.5, s, ha="center", va="center")
    axes.axis("off")

    # If uncommented, fills widget
    self.setSizePolicy(QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Expanding)
    self.updateGeometry()
    self.setMinimumSize(self.size())
def plot_aic_bic_fig(tasks):
    """
    Creates AIC-BIC plot, as a 2-row x 3-col grid of point plots with 95% confidence intervals.

    Parameters
    ----------
    tasks: list(dict)

    Returns
    -------
    Matplotlib Figure object
    """
    sns.set(context='talk', style='whitegrid')
    # Filter list of dicts to reduce the size of Pandas DataFrame
    df = pd.DataFrame(filter_dict_list_by_keys(tasks, ['k', 'covar_type', 'covar_tied', 'bic', 'aic']))
    df['covar_type'] = [x.capitalize() for x in df['covar_type']]
    df['covar_tied'] = [['Untied', 'Tied'][x] for x in df['covar_tied']]
    df['aic'] = df['aic'].astype('float')
    df['bic'] = df['bic'].astype('float')
    df = pd.melt(df, id_vars=['k', 'covar_type', 'covar_tied'],
                 value_vars=['aic', 'bic'], var_name='metric')
    f = sns.factorplot(x='k', y='value', col='covar_type', row='covar_tied', hue='metric', data=df,
                       row_order=['Tied', 'Untied'], col_order=['Full', 'Diag', 'Spher'],
                       legend=True, legend_out=True, ci=95, n_boot=100)
    f.set_titles("{col_name}-{row_name}")
    f.set_xlabels("Num. of Clusters (K)")
    return f.fig
def plot_single_cluster_fig(data, columns, labels, bic, k, show_ticks=True):
    """
    Creates cluster plot for the best label assignment based on BIC score.

    Parameters
    ----------
    data: Pandas DataFrame
        User data file as a Pandas DataFrame
    columns: list(str)
        Columns from data to use as the plot's x and y axes.
    labels: list(int)
        labels of the single task
    bic: int
        task's BIC score
    show_ticks: bool
        Show or hide tick marks on x and y axes.

    Returns
    -------
    Matplotlib Figure object.
    """
    sns.set(context='talk', style='white')
    columns = columns[:2]

    fig = plt.figure()
    lim_left = data[columns[0]].min()
    lim_right = data[columns[0]].max()
    lim_bottom = data[columns[1]].min()
    lim_top = data[columns[1]].max()

    plt.scatter(data[columns[0]], data[columns[1]], c=labels, cmap=plt.cm.rainbow, s=10)
    plt.xlabel(columns[0])
    plt.ylabel(columns[1])
    plt.xlim(left=lim_left, right=lim_right)
    plt.ylim(bottom=lim_bottom, top=lim_top)
    if show_ticks is False:
        plt.xticks([])
        plt.yticks([])
    title = "K={}\nBIC: {:,.1f}".format(k, bic)
    plt.title(title)
    plt.tight_layout()
    return fig