Python seaborn module: set() example source code

We extracted the following 50 code examples from open source Python projects to illustrate how to use seaborn.set().
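Before the project snippets, here is a minimal, self-contained sketch of the call itself (the data is made up for illustration):

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# sns.set() applies seaborn's default theme to all subsequent matplotlib figures;
# keyword arguments select a style, plotting context, palette, font scaling, etc.
sns.set(style="whitegrid", context="notebook", font_scale=1.2)

x = np.linspace(0, 10, 100)
plt.plot(x, np.sin(x))
plt.title("Styled by sns.set()")
plt.show()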

Project: themarketingtechnologist    Author: thomhopmans    | project source | file source
def visualize_results(self):
        # Visualize logistic curve using seaborn
        sns.set(style="darkgrid")
        sns.regplot(x="pageviews_cumsum",
                    y="is_conversion",
                    data=self.df,
                    logistic=True,
                    n_boot=500,
                    y_jitter=.01,
                    scatter_kws={"s": 60})
        sns.set(font_scale=1.3)
        # sns.plt was removed from seaborn; call matplotlib.pyplot (plt) directly
        plt.title('Logistic Regression Curve')
        plt.ylabel('Conversion probability')
        plt.xlabel('Cumulative sum of pageviews')
        plt.subplots_adjust(right=0.93, top=0.90, left=0.10, bottom=0.10)
        plt.show()
Project: PortfolioTimeSeriesAnalysis    Author: MizioAnd    | project source | file source
def outlier_identification(self, model, x_train, y_train):
        # Hold out part of the training data as an extra test split
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
        print('\nOutlier shapes')
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
        model.fit(x_train_split, y_train_split)
        y_predicted = model.predict(x_test_split)
        residuals = np.absolute(y_predicted - y_test_split)
        rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
        outliers_mask = residuals >= rmse_pred_vs_actual
        outliers_mask = np.concatenate([np.zeros((np.shape(y_train_split)[0],), dtype=bool), outliers_mask])
        not_an_outlier = outliers_mask == 0
        # Reassemble the full training set by appending the test split back onto the training split (the split order was random)
        x_out = np.insert(x_train_split, np.shape(x_train_split)[0], x_test_split, axis=0)
        y_out = np.insert(y_train_split, np.shape(y_train_split)[0], y_test_split, axis=0)
        return x_out[not_an_outlier, ], y_out[not_an_outlier, ]
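A hedged usage sketch for the method above, with synthetic data and a scikit-learn model standing in for the project's own objects (every name below is illustrative, not taken from the project):

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
x = rng.randn(200, 3)
y = x @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.randn(200)
# 'analysis' would be an instance of the project's class, which supplies rmse();
# points in the held-out split whose absolute residual exceeds the RMSE are dropped.
# x_clean, y_clean = analysis.outlier_identification(LinearRegression(), x, y)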
Project: PortfolioTimeSeriesAnalysis    Author: MizioAnd    | project source | file source
def predicted_vs_actual_y_xgb(self, xgb, best_nrounds, xgb_params, x_train_split, x_test_split, y_train_split,
                                  y_test_split, title_name):
        # Hold out part of the training data as an extra test split
        # x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
        dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
        dtest_split = xgb.DMatrix(x_test_split)
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
        gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
        y_predicted = gbdt.predict(dtest_split)
        plt.figure(figsize=(10, 5))
        plt.scatter(y_test_split, y_predicted, s=20)
        rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
        plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
        plt.xlabel('Actual y')
        plt.ylabel('Predicted y')
        plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
        plt.tight_layout()
Project: activity-browser    Author: LCA-ActivityBrowser    | project source | file source
def __init__(self, parent):
        fig = Figure(figsize=(4, 4), dpi=100, tight_layout=True)
        super(DefaultGraph, self).__init__(fig)
        self.setParent(parent)
        sns.set(style="dark")

        for index, s in zip(range(9), np.linspace(0, 3, 10)):
            axes = fig.add_subplot(3, 3, index + 1)
            x, y = np.random.randn(2, 50)
            cmap = sns.cubehelix_palette(start=s, light=1, as_cmap=True)
            sns.kdeplot(x, y, cmap=cmap, shade=True, cut=5, ax=axes)
            axes.set_xlim(-3, 3)
            axes.set_ylim(-3, 3)
            axes.set_xticks([])
            axes.set_yticks([])

        fig.suptitle("Activity Browser", y=0.5, fontsize=30, backgroundcolor=(1, 1, 1, 0.5))

        self.setSizePolicy(QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Expanding)
        self.updateGeometry()
Project: kmeans-service    Author: MAYHEM-Lab    | project source | file source
def plot_correlation_fig(data):
    """
    Creates a correlation heat map for all columns in user data.

    Parameters
    ----------
    data: Pandas DataFrame
        User data file as a Pandas DataFrame

    Returns
    -------
    Matplotlib Figure object.
    """
    sns.set(context='talk', style='white')
    fig = plt.figure()
    sns.heatmap(data.corr(), vmin=-1, vmax=1)
    plt.tight_layout()
    return fig
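A usage sketch for the function above, assuming the module's pandas/pyplot/seaborn imports and using random data for illustration:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(100, 4), columns=list("abcd"))
fig = plot_correlation_fig(df)
fig.savefig("correlation.png")  # heatmap of pairwise correlations in [-1, 1]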
Project: kmeans-service    Author: MAYHEM-Lab    | project source | file source
def plot_count_fig(tasks):
    """
    Creates a count plot: a 2-row x 3-col grid of bar plots counting the data points for each k under each covariance structure.

    Parameters
    ----------
    tasks: list(dict)

    Returns
    -------
    Matplotlib Figure object.
    """
    sns.set(context='talk', style='whitegrid')
    # Include 'bic' and 'aic' in the filter, as plot_aic_bic_fig does; otherwise the .loc
    # selection below would raise a KeyError (assuming filter_dict_list_by_keys restricts
    # each dict to the listed keys)
    df = pd.DataFrame(filter_dict_list_by_keys(tasks, ['k', 'covar_type', 'covar_tied', 'bic', 'aic']))
    df = df.loc[:, ['k', 'covar_type', 'covar_tied', 'bic', 'aic']]
    df['covar_type'] = [x.capitalize() for x in df['covar_type']]
    df['covar_tied'] = [['Untied', 'Tied'][x] for x in df['covar_tied']]
    f = sns.factorplot(x='k', kind='count', col='covar_type', row='covar_tied', data=df,
                      row_order=['Tied', 'Untied'], col_order=['Full', 'Diag', 'Spher'], legend=True, legend_out=True,
                      palette='Blues_d')
    f.set_titles("{col_name}-{row_name}")
    f.set_xlabels("Num. of Clusters (K)")
    return f.fig
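Note that sns.factorplot was renamed in seaborn 0.9; on current seaborn the equivalent of the call above is sns.catplot. A sketch, reusing the same df:

f = sns.catplot(x='k', kind='count', col='covar_type', row='covar_tied', data=df,
                row_order=['Tied', 'Untied'], col_order=['Full', 'Diag', 'Spher'],
                legend=True, legend_out=True, palette='Blues_d')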
Project: krafters    Author: GianlucaBortoli    | project source | file source
def generateRawPlot(test):

    # set figure size
    plt.figure(figsize=(15, 6))
    handles = []
    # draw plot
    for raw in test:
        label = raw.pop(0)
        xAxis = range(len(raw))
        yAxis = [float(i) for i in raw]
        handle, = plt.plot(xAxis, yAxis, label=label)
        handles.append(handle)
    # put axis labels
    plt.xlabel("operations")
    plt.ylabel("time (s)")
    plt.legend(handles=handles)
Project: krafters    Author: GianlucaBortoli    | project source | file source
def generateMassPlot(test):
    # set figure size
    plt.figure(figsize=(15, 6))
    handles = []
    # draw plot
    for raw in test:
        label = raw.pop(0)
        yAxis = [i / (len(raw)) for i in range(len(raw) + 1)]
        values = sorted([float(i) for i in raw])
        xAxis = [0] + values
        handle, = plt.plot(xAxis, yAxis, label=label)
        handles.append(handle)
    # put axis labels
    plt.xlabel("time (s)")
    plt.ylabel("probability of completion")
    plt.legend(handles=handles)
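generateMassPlot draws an empirical CDF of completion times, while generateRawPlot plots the raw series. A usage sketch with made-up timings, in the label-then-values row format both functions expect (note the copies: the functions pop() the label off each row):

import matplotlib.pyplot as plt

test = [
    ["node-1", "0.12", "0.30", "0.25", "0.40"],
    ["node-2", "0.20", "0.22", "0.35", "0.50"],
]
generateRawPlot([row[:] for row in test])
plt.show()
generateMassPlot([row[:] for row in test])
plt.show()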
Project: ModelFlow    Author: yuezPrincetechs    | project source | file source
def cor_df(data, cols=None, xticklabels=False, yticklabels=False, close=True):
    '''
    Purpose: compute the correlation matrix and draw it as a heatmap.
    Inputs:
    data: the input data, as a DataFrame
    cols: the columns to use, as a list; defaults to all columns of data
    xticklabels, yticklabels: tick-label settings passed through to sns.heatmap
    close: whether to close the figure after drawing it
    Outputs:
    corrmat: the correlation matrix, as a DataFrame
    fig: the heatmap figure
    '''
    if cols is None:
        cols=list(data.columns)
    corrmat = data[cols].corr()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    sns.set(context='paper', font='monospace')
    sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax, xticklabels=xticklabels, yticklabels=yticklabels)
    ax.set_title('Heatmap of Correlation Matrix')
    if close:
        plt.close('all')
    return corrmat, fig
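A usage sketch for cor_df with illustrative random data, assuming the module's pandas/numpy/seaborn/pyplot imports:

import numpy as np
import pandas as pd

data = pd.DataFrame(np.random.rand(50, 5), columns=list("ABCDE"))
corrmat, fig = cor_df(data, cols=["A", "B", "C"], close=False)
print(corrmat.round(2))
fig.savefig("cor_heatmap.png")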


#Distribution
Project: implicit    Author: benfred    | project source | file source
def benchmark_spark(ratings, factors, iterations=5):
    conf = (SparkConf()
            .setAppName("implicit_benchmark")
            .setMaster('local[*]')
            .set('spark.driver.memory', '16G')
            )
    context = SparkContext(conf=conf)
    spark = SparkSession(context)

    times = {}
    try:
        ratings = convert_sparse_to_dataframe(spark, context, ratings)

        for rank in factors:
            als = ALS(rank=rank, maxIter=iterations,
                      alpha=1, implicitPrefs=True,
                      userCol="row", itemCol="col", ratingCol="data")
            start = time.time()
            als.fit(ratings)
            elapsed = time.time() - start
            times[rank] = elapsed / iterations
            print("spark. factors=%i took %.3f" % (rank, elapsed/iterations))
    finally:
        spark.stop()

    return times
Project: implicit    Author: benfred    | project source | file source
def generate_speed_graph(data, filename="als_speed.png", keys=['gpu', 'cg2', 'cg3', 'cholesky'],
                         labels=None, colours=None):
    labels = labels or {}
    colours = colours or {}

    seaborn.set()
    fig, ax = plt.subplots()

    factors = data['factors']
    for key in keys:
        ax.plot(factors, data[key],
                color=colours.get(key, COLOURS.get(key)),
                marker='o', markersize=6)

        ax.text(factors[-1] + 5, data[key][-1], labels.get(key, LABELS[key]), fontsize=10)

    ax.set_ylabel("Seconds per Iteration")
    ax.set_xlabel("Factors")
    plt.savefig(filename, bbox_inches='tight', dpi=300)
Project: newsrecommender    Author: Newsrecommender    | project source | file source
def find_n_most_similar_articles(self):
        """
        Find the n most similar articles with the highest similarity score for each article in the DataFrame.
        :return:
        """
        # Iterate over each article in DataFrame
        for index, row in self.df_article_vectors.iterrows():
            # Get the similarity scores of the current article compared to all other articles
            similarity_scores = self.similarity_score_dict[index]
            # Find the highest similarity scores in the similarity_score_dict until we have found the n most similar.
            for i in range(0, self.n_most_similar):
                # Find most similar article, i.e. with highest cosine similarity. Note: if Euclidean distance, then min!
                most_similar_article_index = max(similarity_scores, key=similarity_scores.get)
                most_similar_article_score = similarity_scores[most_similar_article_index]
                del similarity_scores[most_similar_article_index]
                # Find corresponding title and set it as most similar article i in DataFrame
                title = self.df_article_vectors.loc[most_similar_article_index]['title'].encode('utf-8')
                title_plus_score = "{} ({:.2f})".format(title, most_similar_article_score)
                # DataFrame.set_value was removed from pandas; .at is the scalar setter
                self.df_article_vectors.at[index, 'most_similar_' + str(i + 1)] = title_plus_score
Project: newsrecommender    Author: Newsrecommender    | project source | file source
def tokenize(text):
        """
        Tokenizes sequences of text and stems the tokens.
        :param text: String to tokenize
        :return: List with stemmed tokens
        """
        tokens = nl.WhitespaceTokenizer().tokenize(text)
        tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
        tokens = [word for word in tokens if word not in stopwords.words('english')]
        tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
        stems = []
        stemmer = SnowballStemmer("english")
        for token in tokens:
            token = stemmer.stem(token)
            if token != "":
                stems.append(token)
        return stems
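A usage sketch; this relies on the NLTK stopwords corpus being available (nltk.download('stopwords') as one-time setup) and on the module's own imports:

print(tokenize("The cats are running through the gardens"))
# yields stems such as 'cat', 'run', 'garden' (order varies, since tokens pass through a set);
# capitalized stopwords like 'The' slip through, because the stopword check happens before
# the stemmer lowercases the token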
Project: HousePrices    Author: MizioAnd    | project source | file source
def outlier_prediction(x_train, y_train):
        # Use built-in isolation forest or use predicted vs. actual
        # Compute squared residuals of every point
        # Make a threshold criteria for inclusion

        # predict() returns 1 if the sample is an inlier and -1 if it is an outlier
        rng = np.random.RandomState(42)
        clf_all_features = IsolationForest(max_samples=100, random_state=rng)
        clf_all_features.fit(x_train)

        # Predict if a particular sample is an outlier using all features for higher dimensional data set.
        y_pred_train = clf_all_features.predict(x_train)

        # Exclude suggested outlier samples for improvement of prediction power/score
        # predict() returns 1 for inliers and -1 for outliers; compare the array directly
        # (the original Python 2 map() call would be a lazy iterator under Python 3)
        outlier_map_out_train = y_pred_train == 1
        x_train_modified = x_train[outlier_map_out_train, ]
        y_train_modified = y_train[outlier_map_out_train, ]

        return x_train_modified, y_train_modified
Project: HousePrices    Author: MizioAnd    | project source | file source
def drop_variable(self, df):
        # if HousePrices._is_one_hot_encoder:
            # Drop all categorical feature helping columns ('Num')
            # Todo: is it defined when importing data set? _feature_names_num
            # for feature_name in HousePrices._feature_names_num:
            #     df = df.drop([feature_name], axis=1)

        # is_with_feature_agglomeration = 0
        # if is_with_feature_agglomeration:
        #     print(df.shape)
        #     df = HousePrices.feature_agglomeration(df)
        #     print(df.shape)

        # df = df.drop(['Fireplaces'], axis=1)
        df = df.drop(['Id'], axis=1)

        if not any(tuple(df.columns == 'SalePrice')):
            # All feature var names occurring in the test data are assigned to the public variable df_test_all_feature_var_names.
            self.df_test_all_feature_var_names = df.columns
        return df
Project: HousePrices    Author: MizioAnd    | project source | file source
def predicted_vs_actual_sale_price(self, x_train, y_train, title_name):
        # Hold out part of the training data as an extra test split
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
        lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
                                0.3, 0.6, 1],
                        max_iter=50000, cv=10)
        # lasso = RidgeCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
        #                         0.3, 0.6, 1], cv=10)

        lasso.fit(x_train_split, y_train_split)
        y_predicted = lasso.predict(X=x_test_split)
        plt.figure(figsize=(10, 5))
        plt.scatter(y_test_split, y_predicted, s=20)
        rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
        plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
        plt.xlabel('Actual Sale Price')
        plt.ylabel('Predicted Sale Price')
        plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
        plt.tight_layout()
Project: HousePrices    Author: MizioAnd    | project source | file source
def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
        # Hold out part of the training data as an extra test split
        x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
        dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
        dtest_split = xgb.DMatrix(x_test_split)

        res = xgb.cv(xgb_params, dtrain_split, num_boost_round=1000, nfold=4, seed=seed, stratified=False,
                     early_stopping_rounds=25, verbose_eval=10, show_stdv=True)

        best_nrounds = res.shape[0] - 1
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
        gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
        y_predicted = gbdt.predict(dtest_split)
        plt.figure(figsize=(10, 5))
        plt.scatter(y_test_split, y_predicted, s=20)
        rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
        plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
        plt.xlabel('Actual Sale Price')
        plt.ylabel('Predicted Sale Price')
        plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
        plt.tight_layout()
Project: deepcpg    Author: cangermueller    | project source | file source
def ranges_to_list(x, start=0, stop=None):
    s = set()
    for xi in x:
        xi = str(xi)
        if xi.find('-') >= 0:
            t = xi.split('-')
            if len(t) != 2:
                raise ValueError('Invalid range!')
            if len(t[0]) == 0:
                t[0] = start
            if len(t[1]) == 0:
                t[1] = stop
            s |= set(range(int(t[0]), int(t[1]) + 1))
        else:
            s.add(int(xi))
    s = sorted(list(s))
    return s
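A usage sketch showing how the range syntax expands (open-ended ranges fall back to the start/stop arguments):

print(ranges_to_list(['1-3', '7']))          # [1, 2, 3, 7]
print(ranges_to_list(['-2', '5-'], stop=6))  # [0, 1, 2, 5, 6]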
Project: openai_lab    Author: kengz    | project source | file source
def scoped_mpl_import():
    import matplotlib
    matplotlib.rcParams['backend'] = MPL_BACKEND

    import matplotlib.pyplot as plt
    plt.rcParams['toolbar'] = 'None'  # mute matplotlib toolbar

    import seaborn as sns
    sns.set(style="whitegrid", color_codes=True, font_scale=1.0,
            rc={'lines.linewidth': 1.0,
                'backend': matplotlib.rcParams['backend']})
    palette = sns.color_palette("Blues_d")
    palette.reverse()
    sns.set_palette(palette)

    return (matplotlib, plt, sns)
Project: PythonPackages    Author: wanhanwan    | project source | file source
def cross_section_cndl(data, factor_name):
    '''Draw the cross-sectional distribution of a factor as box plots,
    one box per date.

    Parameters
    ------------------------------
    data: DataFrame(index:[Date,IDs],factor1,factor2,...)

    factor_name: str
    '''
    data = data.reset_index()
    sns.set(style='ticks')

    ax = sns.boxplot(x='Date', y=factor_name, data=data, palette='PRGn')
    sns.despine(offset=10, trim=True)

    return ax

Project: eezzy    Author: 3Blades    | project source | file source
def factor_plot(dataFrame, factors, prediction, color="Set3"):
    # First, plot the total for each factor. Then, plot the total for each
    # factor for the prediction variable (so in a conversion example, how
    # many people converted, revenue per country, etc.)

    # These refer to the rows and columns of the axis numpy array; not the
    # data itself.

    row = 0
    column = 0
    sns.set(style="whitegrid")
    # TODO: Set the width based on the max number of unique
    # values for the factors.

    plots = plt.subplots(len(factors), 2, figsize=(8,12))
    for factor in factors:
        sns.countplot(x=factor, palette="Set3", data=dataFrame,
                      ax=plots[1][row][column])
        # Then print the total for each prediction
        sns.barplot(x=factor, y=prediction, data=dataFrame,
        ax=plots[1][row][column+1])
        row += 1
    plt.tight_layout() # Need this or else plots will crash into each other
Project: astetik    Author: mikkokotila    | project source | file source
def swarm(data,x,y,xscale='linear',yscale='linear'):

    # set default pretty settings from Seaborn

    sns.set(style="white", palette="muted")
    sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 0.2}) 

    # create the plot

    g = sns.swarmplot(x=x, y=y, data=data, palette='RdYlGn')

    plt.tick_params(axis='both', which='major', pad=10)

    g.set(xscale=xscale)
    g.set(yscale=yscale)

    # Setting plot limits

    start = data[y].min().min()
    plt.ylim(start,);

    sns.despine()
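A usage sketch with seaborn's bundled tips dataset (sns.load_dataset fetches it on first use):

import seaborn as sns

tips = sns.load_dataset("tips")
swarm(tips, x="day", y="total_bill")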
Project: astetik    Author: mikkokotila    | project source | file source
def correlation(data,title=''):

    corr = data.corr(method='spearman')
    mask = np.zeros_like(corr)
    mask[np.triu_indices_from(mask)] = True

    sns.set(style="white")
    sns.set_context("notebook", font_scale=2, rc={"lines.linewidth": 0.3})

    rcParams['figure.figsize'] = 25, 12
    rcParams['font.family'] = 'Verdana'
    rcParams['figure.dpi'] = 300

    g = sns.heatmap(corr, mask=mask, linewidths=1, cmap="RdYlGn", annot=False)
    g.set_xticklabels(data.columns, rotation=25, ha="right")
    plt.tick_params(axis='both', which='major', pad=15)
Project: MLAlgorithms    Author: rushter    | project source | file source
def plot(self, ax=None, holdon=False):
        sns.set(style="white")

        data = self.X

        if ax is None:
            _, ax = plt.subplots()



        for i, index in enumerate(self.clusters):
            point = np.array(data[index]).T
            ax.scatter(*point, c=sns.color_palette("hls", self.K + 1)[i])

        for point in self.centroids:
            ax.scatter(*point, marker='x', linewidths=10)

        if not holdon:
            plt.show()
Project: Waskom_PNAS_2017    Author: WagnerLabPapers    | project source | file source
def plot_mds(subjects, experiments, axes):

    for subj, exp, ax in zip(subjects, experiments, axes):

        res_fname = "correlation_analysis/{}_{}_ifs.pkz".format(subj, exp)
        res = moss.load_pkl(res_fname)
        sorter = np.argsort(np.abs(res.prefs))

        x_, y_ = res.mds_coords.T.dot(res.prefs)
        t = np.arctan2(y_, x_)
        rot = [[np.cos(t), np.sin(t)], [-np.sin(t), np.cos(t)]]
        x, y = np.dot(rot, res.mds_coords[sorter].T)

        cmap = get_colormap(exp)

        ax.scatter(x, y, c=res.prefs[sorter],
                   cmap=cmap, vmin=-1.75, vmax=1.75,
                   s=8, linewidth=0)

        ax.set(xlim=(-.9, .9), ylim=(-.9, .9), aspect="equal")
        ax.set_axis_off()
Project: python-machine-learning-book    Author: jeremyn    | project source | file source
def visualize_housing_data(df):
    sns.set(style='whitegrid', context='notebook')
    cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']

    sns.pairplot(df[cols], size=2.5)

    plt.show()

    correlation_matrix = np.corrcoef(df[cols].values.T)
    sns.set(font_scale=1.5)
    heatmap = sns.heatmap(
        correlation_matrix,
        cbar=True,
        annot=True,
        square=True,
        fmt='.2f',
        annot_kws={'size': 15},
        yticklabels=cols,
        xticklabels=cols,
    )

    plt.show()
Project: themarketingtechnologist    Author: thomhopmans    | project source | file source
def visualize_results(df):
    # Visualize logistic curve using seaborn
    sns.set(style="darkgrid")
    sns.regplot(x="pageviews_cumsum",
                y="is_conversion",
                data=df,
                logistic=True,
                n_boot=500,
                y_jitter=.01,
                scatter_kws={"s": 60})
    sns.set(font_scale=1.3)
    # sns.plt was removed from seaborn; call matplotlib.pyplot (plt) directly
    plt.title('Logistic Regression Curve')
    plt.ylabel('Conversion probability')
    plt.xlabel('Cumulative sum of pageviews')
    plt.subplots_adjust(right=0.93, top=0.90, left=0.10, bottom=0.10)
    plt.show()


# Run the final program
Project: themarketingtechnologist    Author: thomhopmans    | project source | file source
def tokenize(text):
        """
        Tokenizes sequences of text and stems the tokens.
        :param text: String to tokenize
        :return: List with stemmed tokens
        """
        tokens = nltk.WhitespaceTokenizer().tokenize(text)
        tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
        tokens = [word for word in tokens if word not in stopwords.words('english')]
        tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
        stems = []
        stemmer = SnowballStemmer("english")
        for token in tokens:
            token = stemmer.stem(token)
            if token != "":
                stems.append(token)
        return stems
Project: themarketingtechnologist    Author: thomhopmans    | project source | file source
def find_n_most_similar_articles(self):
        """
        Find the n most similar articles with the highest similarity score for each TMT article in the DataFrame.
        :return:
        """
        # Iterate over each article in DataFrame
        for index, row in self.df_article_vectors.iterrows():
            # Get the similarity scores of the current article compared to all other articles
            similarity_scores = self.similarity_score_dict[index]
            # Find the highest similarity scores in the similarity_score_dict until we have found the n most similar.
            for i in range(0, self.n_most_similar):
                # Find most similar article, i.e. with highest cosine similarity. Note: if Euclidean distance, then min!
                most_similar_article_index = max(similarity_scores, key=similarity_scores.get)
                most_similar_article_score = similarity_scores[most_similar_article_index]
                del similarity_scores[most_similar_article_index]
                # Find corresponding title and set it as most similar article i in DataFrame
                title = self.df_article_vectors.loc[most_similar_article_index]['title'].encode('utf-8')
                title_plus_score = "{} ({:.2f})".format(title, most_similar_article_score)
                # DataFrame.set_value was removed from pandas; .at is the scalar setter
                self.df_article_vectors.at[index, 'most_similar_' + str(i + 1)] = title_plus_score
Project: Penny-Dreadful-Tools    Author: PennyDreadfulMTG    | project source | file source
def image(path, costs):
    ys = ['0', '1', '2', '3', '4', '5', '6', '7+', 'X']
    xs = [costs.get(k, 0) for k in ys]
    sns.set_style('white')
    sns.set(font='Concourse C3', font_scale=3)
    g = sns.barplot(ys, xs, palette=['grey'] * len(ys))
    g.axes.yaxis.set_ticklabels([])
    rects = g.patches
    sns.set(font='Concourse C3', font_scale=2)
    for rect, label in zip(rects, xs):
        if label == 0:
            continue
        height = rect.get_height()
        g.text(rect.get_x() + rect.get_width()/2, height + 0.5, label, ha='center', va='bottom')
    g.margins(y=0, x=0)
    sns.despine(left=True, bottom=True)
    g.get_figure().savefig(path, transparent=True, pad_inches=0, bbox_inches='tight')
    plt.clf() # Clear all data from matplotlib so it does not persist across requests.
    return path
Project: nf1_inactivation    Author: greenelab    | project source | file source
def plot_decision_function(score_df, partition, output_file):
    """
    Plots the decision function for a given partition (either 'train' or
    'test') and saves a figure to file.

    Arguments:
    :param score_df: a specific folds decision scores and status
    :param partition: either 'train' or 'test' will plot performance
    :param output_file: file to output the figure
    """
    # pandas removed .ix; use label-based .loc with the boolean masks
    ax = sns.kdeplot(score_df.loc[(score_df.status == 1) &
                                  (score_df.partition == partition), :]
                     .decision, color='red', label='Deficient',
                     shade=True)
    ax = sns.kdeplot(score_df.loc[(score_df.status == 0) &
                                  (score_df.partition == partition), :]
                     .decision, color='blue', label='Wild-Type',
                     shade=True)
    ax.set(xlabel='Decision Function', ylabel='Density')
    ax.set_title('Classifier Decision Function')
    sns.despine()
    plt.tight_layout()
    plt.savefig(output_file)
    plt.close()
Project: LeaguePredictor    Author: dgarwin    | project source | file source
def sns_triangle(matrix, plt_title, only_class=None):

    sns.set(style="white")
    # Generate a mask for the upper triangle
    mask = np.zeros_like(matrix, dtype=bool)  # np.bool was removed from numpy; use the builtin
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = subplots(figsize=(11, 9))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    # DataFrame.as_matrix() was removed from pandas; to_numpy() is the replacement
    sns.heatmap(matrix.to_numpy(), mask=mask, cmap=cmap, vmax=.3,
                square=True, xticklabels=5, yticklabels=5,
                linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
    title(plt_title)
    xlabel('Preprocessed Features')
    ylabel('Preprocessed Features')
    if only_class is None:
        only_class = ''
    savefig('images/triangle'+only_class+'.png')
Project: tredparse    Author: humanlongevity    | project source | file source
def setup_theme(context='notebook', style="darkgrid", palette='deep', font='Helvetica'):
    try:
        import seaborn as sns
        extra_rc = {"lines.linewidth": 1,
                    "lines.markeredgewidth": 1,
                    "patch.edgecolor": 'k',
                    }
        sns.set(context=context, style=style, palette=palette, rc=extra_rc)
    except (ImportError, SyntaxError):
        pass

    rc('text', usetex=True)

    if font == "Helvetica":
        rc('font', **{'family': 'sans-serif', 'sans-serif': ['Helvetica']})
    elif font == "Palatino":
        rc('font', **{'family':'serif','serif': ['Palatino']})
    elif font == "Schoolbook":
        rc('font', **{'family':'serif','serif': ['Century Schoolbook L']})
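A usage note: rc('text', usetex=True) in the function above requires a working LaTeX installation. A typical call (a sketch, with arguments chosen for illustration) is:

setup_theme(context='talk', style='white', font='Palatino')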
Project: meucci-python    Author: returnandrisk    | project source | file source
def plot_corr_heatmap(corr, labels, heading):

    sns.set(style="white")

    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=bool)  # np.bool was removed from numpy; use the builtin
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(8, 8))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
                square=True, xticklabels=labels, yticklabels=labels,
                linewidths=.5, ax=ax, cbar_kws={"shrink": .5}, annot=True)
    ax.set_title(heading)
    plt.show()
Project: crop-seq    Author: epigen    | project source | file source
def get_level_colors(index):
    pallete = sns.color_palette("colorblind") * int(1e6)

    colors = list()

    if hasattr(index, "levels"):
        for level in index.levels:
            color_dict = dict(zip(level, pallete))
            level_colors = [color_dict[x] for x in index.get_level_values(level.name)]
            colors.append(level_colors)
    else:
        color_dict = dict(zip(set(index), pallete))
        index_colors = [color_dict[x] for x in index]
        colors.append(index_colors)

    return colors
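A usage sketch exercising both branches, flat Index and MultiIndex (illustrative data; assumes the module's pandas/seaborn imports):

import pandas as pd

flat = pd.Index(["a", "b", "a", "c"])
print(len(get_level_colors(flat)))   # 1 color list, one entry per element

multi = pd.MultiIndex.from_tuples([("x", 1), ("x", 2), ("y", 1)], names=["grp", "rep"])
print(len(get_level_colors(multi)))  # 2 color lists, one per index level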
Project: learning-to-prune    Author: timvieira    | project source | file source
def sanity_check(Ds):
    names, Ds = zip(*Ds)

    D0 = Ds[0]
    Ps = set(D0.policy.unique())
    Es = set(D0.example.unique())

    # Sanity check.
    for name, dd in zip(names, Ds):
        # same policy and examples
        if (set(dd.policy.unique()) != Ps
            or set(dd.example.unique()) != Es):
            print(colors.bold % colors.red % '======================================')
            print(colors.bold % colors.red % "WARNING: some policies aren't finished.")
            print(colors.bold % colors.red % '======================================')
            print(name, 'want/got sizes %s/%s' % (len(Ps), len(set(dd.policy.unique()))))

        #assert set(dd.policy.unique()) == Ps  # same policies
        #assert set(dd.example.unique()) == Es  # same examples

    bestof = aggregate_multiple_runtime_trials(Ds, Ps)

    return D0, Ds, bestof
Project: guacml    Author: guacml    | project source | file source
def target_plot(self):
        target_type = self.input_data.metadata.loc[self.target].type
        target_data = self.input_data.df[self.target]
        sns.set(style="white", color_codes=True)
        if not self.run_time_config['is_time_series']:
            if target_type == ColType.BINARY:
                plt.figure(figsize=(6, 1))
                sns.barplot(target_data.sum() / target_data.shape[0])
                plt.xlim([0, 1])
                plt.title(target_data.name + ' rate')
            elif target_type == ColType.NUMERIC or target_type == ColType.ORDINAL:
                plt.figure(figsize=(6, 2))
                ax = sns.distplot(target_data, hist_kws=dict(edgecolor='black'))
                ax.set_xlim(target_data.min(), target_data.max())
                plt.title(target_data.name + ' histogram')
        else:
            self.time_series_target_plot()
Project: guacml    Author: guacml    | project source | file source
def predictions_vs_actual_regression(model_results, model_name, size=6, bins=None,
                                     gridsize=30, outlier_ratio=None, **kwargs):
    holdout = model_results.holdout_data
    target = model_results.target

    if outlier_ratio is not None:
        holdout = utils.remove_outlier_rows(holdout, 'prediction', outlier_ratio)
        holdout = utils.remove_outlier_rows(holdout, target, outlier_ratio)

    sns.set(style="white", color_codes=True)

    marginal_kws = dict(hist_kws=dict(edgecolor='black'))
    plt.suptitle('{0}: Predictions vs Actual'.format(model_name), fontsize=14)
    grid = sns.jointplot('prediction', target, holdout, 'hexbin', gridsize=gridsize,
                         size=size, bins=bins, space=0, marginal_kws=marginal_kws, **kwargs)
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)  # shrink fig so cbar is visible
    cax = grid.fig.add_axes([.95, .18, .04, .5])  # x, y, width, height
    color_bar = plt.colorbar(cax=cax)  # sns.plt was removed from seaborn; use plt directly

    if bins is None:
        color_bar.set_label('count')
    elif bins == 'log':
        color_bar.set_label('log_10(count)')
    return grid
Project: kaggle-review    Author: daxiongshu    | project source | file source
def scatter(x,y,xlabel='x',ylabel='y',title=None,line=False,name=None,show=False):
    sns.set()
    title = "%s vs %s"%(xlabel,ylabel) if title is None else title
    plt.scatter(x,y)
    if line:
        plt.plot(x,y)
    plt.title(title)
    plt.ylabel('y: %s'%ylabel)
    plt.xlabel('x: %s'%xlabel)
    if name is not None:
        #fig = plt.Figure()
        plt.savefig(name)
    if show:
        plt.show()
    plt.clf()
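A usage sketch for the helper above (illustrative values):

xs = [1, 2, 3, 4]
ys = [1.1, 1.9, 3.2, 3.8]
scatter(xs, ys, xlabel='iteration', ylabel='score', line=True, name='scatter.png')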
Project: kaggle-review    Author: daxiongshu    | project source | file source
def distribution(data,xlabel="data",ylabel="percentage",name=None):
    ax = plt.axes()
    ax.set(xlabel=xlabel,ylabel=ylabel)
    ds = sns.distplot(data,ax=ax)
    plt.show()
    if name is not None:
        ds.get_figure().savefig(name)
Project: kaggle-review    Author: daxiongshu    | project source | file source
def corr_heatmap(df,cols=None,name=None):
    sns.set()
    if cols is None:
        cols = [i for i in df.columns.values if df[i].dtype!='object']
    df = df[cols].corr()
    print(df.shape)
    ds = sns.heatmap(df, annot=False)
    plt.show()
    if name is not None:
        ds.get_figure().savefig(name)
Project: fingerprint-securedrop    Author: freedomofpress    | project source | file source
def get_average_metrics(metrics_list):
    """Get algorithm performance over all folds"""

    eval_metrics = {}

    track_metric = 0
    for fold in metrics_list:
        track_metric += fold["auc"]

    # Divide by number of folds
    track_metric /= len(metrics_list)
    eval_metrics.update({"auc": track_metric})

    for metric in ("fpr", "tpr"):
        # the fpr, tpr output from scikit-learn may not have the same
        # number of elements in the arrays, set to Null for now
        eval_metrics.update({metric: [0, 0]})  # TODO

    for threshold in THRESHOLDS:
        eval_metrics[threshold] = {}
        for metric in ("precision", "recall", "f1"):
            track_metric = 0
            for fold in metrics_list:
                track_metric += fold[threshold][metric]

            # Divide by number of folds
            track_metric /= len(metrics_list)
            eval_metrics[threshold].update({metric: track_metric})

    return eval_metrics
Project: fingerprint-securedrop    Author: freedomofpress    | project source | file source
def precision_recall_at_x_proportion(test_labels, test_predictions, x_proportion=0.01,
                                     return_cutoff=False):
    """Compute precision, recall, F1 for a specified fraction of the test set.

    :params list test_labels: true labels on test set
    :params list test_predicted: predicted labels on test set
    :params float x_proportion: proportion of the test set to flag
    :params bool return_cutoff: if True return the cutoff probability
    :returns float precision: fraction correctly flagged
    :returns float recall: fraction of the positive class recovered
    :returns float f1: harmonic mean of precision and recall
    """

    cutoff_index = int(len(test_predictions) * x_proportion)
    cutoff_index = min(cutoff_index, len(test_predictions) - 1)

    sorted_by_probability = np.sort(test_predictions)[::-1]
    cutoff_probability = sorted_by_probability[cutoff_index]

    test_predictions_binary = [1 if x > cutoff_probability else 0 for x in test_predictions]

    precision, recall, f1, _ = metrics.precision_recall_fscore_support(
        test_labels, test_predictions_binary)

    # Only interested in metrics for label 1
    precision, recall, f1 = precision[1], recall[1], f1[1]

    if return_cutoff:
        return precision, recall, f1, cutoff_probability
    else:
        return precision, recall, f1
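A usage sketch with toy scores (x_proportion=0.5 flags the top half, so the effect is visible on six points; assumes the module's sklearn.metrics import):

labels = [0, 0, 1, 0, 1, 1]
scores = [0.1, 0.2, 0.9, 0.3, 0.8, 0.4]
precision, recall, f1 = precision_recall_at_x_proportion(labels, scores, x_proportion=0.5)
print(precision, recall, f1)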
Project: fingerprint-securedrop    Author: freedomofpress    | project source | file source
def plot_allkfolds_ROC(timestamp, cv, fpr_arr, tpr_arr):

    sns.set(style="white", palette="muted", color_codes=True)

    mean_tpr = 0.0
    mean_fpr = 0.0
    all_roc_auc = []
    bins_roc = np.linspace(0, 1, 300)
    with plt.style.context(('seaborn-muted')):
        fig, ax = plt.subplots(figsize=(10, 8))
        for i, (train, test) in enumerate(cv):
            mean_tpr += interp(bins_roc, fpr_arr[i], tpr_arr[i])
            mean_tpr[0] = 0.0
            mean_fpr += interp(bins_roc, fpr_arr[i], tpr_arr[i])
            mean_fpr[0] = 0.0
            roc_auc = metrics.auc(fpr_arr[i], tpr_arr[i])
            all_roc_auc.append(roc_auc)
            ax.plot(fpr_arr[i], tpr_arr[i], lw=1, label='KFold %d (AUC = %0.2f)' % (i, roc_auc))
        ax.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random')

        mean_tpr /= len(cv)
        mean_tpr[-1] = 1.0
        mean_auc = np.mean(all_roc_auc)
        ax.plot(bins_roc, mean_tpr, 'k--',
                label='Mean ROC (AUC = %0.2f)' % mean_auc, lw=2)

        ax.set_xlim([-0.05, 1.05])
        ax.set_ylim([-0.05, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Receiver Operating Characteristic')
        ax.legend(loc="lower right")
        plt.savefig('{}_roc.png'.format(timestamp))
    plt.close('all') 
    return mean_auc
Project: PortfolioTimeSeriesAnalysis    Author: MizioAnd    | project source | file source
def feature_mapping_to_numerical_values(self, df):
        TwoSigmaFinModTools._is_one_hot_encoder = 0
        mask = ~df.isnull()
        # Assume the training set has all possible feature_var_names. In practice a training set
        # could hold a feature_var_name that the test set lacks, but such a feature cannot be part
        # of the trained learning algorithm, so that case is avoided here.
        # Add feature_var_names of the training set that are missing from the test set, as zero-filled columns.
        if not any(tuple(df.columns == 'y')):
            # All one-hot encoded feature var names occurring in the test data are assigned to the
            # public variable df_test_all_feature_var_names.
            self.df_test_all_feature_var_names = df.columns

        _feature_names_num = np.zeros((TwoSigmaFinModTools._non_numerical_feature_names.shape[0],), dtype=object)
        ith = 0
        for feature_name in TwoSigmaFinModTools._non_numerical_feature_names:
            # Create a feature_nameNum list
            feature_name_num = ''.join([feature_name, 'Num'])
            _feature_names_num[ith] = feature_name_num
            ith += 1
            TwoSigmaFinModTools.encode_labels_in_numeric_format(df, feature_name)

            if TwoSigmaFinModTools._is_one_hot_encoder:
                is_with_label_binarizer = 0
                if is_with_label_binarizer:
                    mapper_df = DataFrameMapper([(feature_name, LabelBinarizer())], df_out=True)
                    feature_var_values = mapper_df.fit_transform(df.copy())
                    print(df[feature_name].isnull().sum().sum())
                    print(df[feature_name][mask[feature_name]].isnull().sum().sum())
                    for ite in feature_var_values.columns:
                        df[ite] = feature_var_values[ite]
                else:
                    TwoSigmaFinModTools.one_hot_encoder(df, feature_name)
        TwoSigmaFinModTools._feature_names_num = pd.Series(data=_feature_names_num, dtype=object)
Project: PortfolioTimeSeriesAnalysis    Author: MizioAnd    | project source | file source
    def dendrogram(df, number_of_clusters=None):
        # A default argument cannot reference df, which exists only at call time
        if number_of_clusters is None:
            number_of_clusters = int(df.shape[1] / 1.2)
        # Create Dendrogram
        agglomerated_features = FeatureAgglomeration(n_clusters=number_of_clusters)  # note: never fit or used below
        used_networks = np.arange(0, number_of_clusters, dtype=int)

        # Create a custom palette to identify the networks
        network_pal = sns.cubehelix_palette(len(used_networks),
                                            light=.9, dark=.1, reverse=True,
                                            start=1, rot=-2)
        network_lut = dict(zip(map(str, df.columns), network_pal))

        # Convert the palette to vectors that will be drawn on the side of the matrix
        networks = df.columns.get_level_values(None)
        network_colors = pd.Series(networks, index=df.columns).map(network_lut)
        sns.set(font="monospace")
        # Create custom colormap
        cmap = sns.diverging_palette(h_neg=210, h_pos=350, s=90, l=30, as_cmap=True)
        cg = sns.clustermap(df.astype(float).corr(), cmap=cmap, linewidths=.5, row_colors=network_colors,
                            col_colors=network_colors)
        plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
        plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
        plt.show()
Project: PortfolioTimeSeriesAnalysis    Author: MizioAnd    | project source | file source
def predicted_vs_actual_y_input_model(self, model, x_train_split, x_test_split, y_train_split, y_test_split,
                                          title_name):
        # Hold out part of the training data as an extra test split
        # x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train)
        print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
        model.fit(x_train_split, y_train_split)
        y_predicted = model.predict(x_test_split)
        plt.figure(figsize=(10, 5))
        plt.scatter(y_test_split, y_predicted, s=20)
        rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
        plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
        plt.xlabel('Actual y')
        plt.ylabel('Predicted y')
        plt.plot([min(y_test_split), max(y_test_split)], [min(y_test_split), max(y_test_split)])
        plt.tight_layout()
Project: activity-browser    Author: LCA-ActivityBrowser    | project source | file source
def __init__(self, parent, data, labels, width=6, height=6, dpi=100):
        figure = Figure(figsize=(width, height), dpi=dpi, tight_layout=True)
        axes = figure.add_subplot(111)

        super(CorrelationPlot, self).__init__(figure)
        self.setParent(parent)

        sns.set(style="darkgrid")

        corr = data
        # cmap = sns.diverging_palette(220, 10, as_cmap=True)
        # corrplot(data, names=labels, annot=True, sig_stars=False,
        #      diag_names=True, cmap=cmap, ax=axes, cbar=True)

        df = pd.DataFrame(data=data, columns=labels)
        corr = df.corr()
        # Generate a mask for the upper triangle
        mask = np.zeros_like(corr, dtype=bool)  # np.bool was removed from numpy; use the builtin
        mask[np.triu_indices_from(mask)] = True
        # Draw the heatmap with the mask and correct aspect ratio
        vmax = np.abs(corr.values[~mask]).max()
        # vmax = np.abs(corr).max()
        sns.heatmap(corr, mask=mask, cmap=plt.cm.PuOr, vmin=-vmax, vmax=vmax,
                    square=True, linecolor="lightgray", linewidths=1, ax=axes)
        for i in range(len(corr)):
            axes.text(i + 0.5, i + 0.5, corr.columns[i],
                      ha="center", va="center", rotation=0)
            for j in range(i + 1, len(corr)):
                s = "{:.3f}".format(corr.values[i, j])
                axes.text(j + 0.5, i + 0.5, s,
                          ha="center", va="center")
        axes.axis("off")
        # If uncommented, fills widget
        self.setSizePolicy(QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Expanding)
        self.updateGeometry()
        self.setMinimumSize(self.size())
Project: kmeans-service    Author: MAYHEM-Lab    | project source | file source
def plot_aic_bic_fig(tasks):
    """
    Creates AIC-BIC plot, as a 2-row x 3-col grid of point plots with 95% confidence intervals.

    Parameters
    ----------
    tasks: list(dict)

    Returns
    -------
    Matplotlib Figure object
    """
    sns.set(context='talk', style='whitegrid')
    # Filter list of dicts to reduce the size of Pandas DataFrame
    df = pd.DataFrame(filter_dict_list_by_keys(tasks, ['k', 'covar_type', 'covar_tied', 'bic', 'aic']))
    df['covar_type'] = [x.capitalize() for x in df['covar_type']]
    df['covar_tied'] = [['Untied', 'Tied'][x] for x in df['covar_tied']]
    df['aic'] = df['aic'].astype('float')
    df['bic'] = df['bic'].astype('float')
    df = pd.melt(df, id_vars=['k', 'covar_type', 'covar_tied'], value_vars=['aic', 'bic'], var_name='metric')
    f = sns.factorplot(x='k', y='value', col='covar_type', row='covar_tied', hue='metric', data=df,
                       row_order=['Tied', 'Untied'], col_order=['Full', 'Diag', 'Spher'], legend=True, legend_out=True,
                       ci=95, n_boot=100)
    f.set_titles("{col_name}-{row_name}")
    f.set_xlabels("Num. of Clusters (K)")
    return f.fig
Project: kmeans-service    Author: MAYHEM-Lab    | project source | file source
def plot_single_cluster_fig(data, columns, labels, bic, k, show_ticks=True):
    """
    Creates cluster plot for the best label assignment based on BIC score.

    Parameters
    ----------
    data: Pandas DataFrame - User data file as a Pandas DataFrame
    columns: list(str) - Column names to use as the plot's x and y axes.
    labels: list(int) - labels of the single task
    bic: float - task's BIC score
    k: int - number of clusters
    show_ticks: bool - Show or hide tick marks on x and y axes.

    Returns
    -------
    Matplotlib Figure object.

    """
    sns.set(context='talk', style='white')
    columns = columns[:2]

    fig = plt.figure()
    lim_left = data[columns[0]].min()
    lim_right = data[columns[0]].max()
    lim_bottom = data[columns[1]].min()
    lim_top = data[columns[1]].max()

    plt.scatter(data[columns[0]], data[columns[1]], c=labels, cmap=plt.cm.rainbow, s=10)
    plt.xlabel(columns[0])
    plt.ylabel(columns[1])
    plt.xlim(left=lim_left, right=lim_right)
    plt.ylim(bottom=lim_bottom, top=lim_top)
    if show_ticks is False:
        plt.xticks([])
        plt.yticks([])
    title = "K={}\nBIC: {:,.1f}".format(k, bic)
    plt.title(title)
    plt.tight_layout()
    return fig