Python pandas module: read_sql() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use pandas.read_sql().
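Before the project excerpts, here is a minimal, self-contained sketch of the basic call pattern (the in-memory SQLite table and its rows are invented purely for illustration): pass a SQL string plus a DBAPI connection or SQLAlchemy engine, and bind values through the params argument rather than formatting them into the query string.

import sqlite3
import pandas as pd

# Build a tiny in-memory SQLite database to query against (illustrative data only).
conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE prices (symbol TEXT, close REAL)')
conn.executemany('INSERT INTO prices VALUES (?, ?)',
                 [('AAPL', 170.1), ('MSFT', 315.4)])
conn.commit()

# Read an entire query result into a DataFrame.
df = pd.read_sql('SELECT * FROM prices', conn)

# Bind values through `params` instead of string formatting (sqlite3 uses ? placeholders).
aapl = pd.read_sql('SELECT * FROM prices WHERE symbol = ?', conn, params=('AAPL',))

conn.close()
print(df)
print(aapl)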

Project: SharesData    Author: xjkj123    | project source | file source
def __init__(self,code,sql=False):
        if int(code[0]) in (0, 3):
            name = "sz" + code
        elif int(code[0]) == 6:
            name = "sh" + code

        if sql:
            self.sharedf=ShareClass().GetDayData(code)
        else:
            conn = create_engine(
                'mysql://' + config.user + ':' + config.password + '@' + config.ip + '/daydata?charset=utf8')

            x = 'select * from ' + name + '_tencent;'  # SQL statement
            self.sharedf = pandas.read_sql(x, con=conn)


        self.name=name
Project: sanergy-public    Author: dssg    | project source | file source
def loadData(self):
        # Load the toilet collection data to pandas
        collects = pd.read_sql('SELECT * FROM premodeling.toiletcollection', self.conn, coerce_float=True, params=None)
        pprint.pprint(collects.keys())

        collects = collects[['ToiletID','ToiletExID','Collection_Date','Area','Feces_kg_day','year','month']]
        pprint.pprint(collects.keys())

        # Load the density data to pandas
        density = pd.read_sql('SELECT * FROM premodeling.toiletdensity', self.conn, coerce_float=True, params=None)
        pprint.pprint(density.keys())

        # Return the data
        self.collects = collects
        self.density = density
        return(collects, density)
Project: triage    Author: dssg    | project source | file source
def as_dataframe(self, model_group_ids):
        """Return model-group-id subset of table as dataframe

        Args:
            model_group_ids (list) the desired model group ids

        Returns: (pandas.DataFrame) The data from the table corresponding
            to those model group ids
        """
        return pd.read_sql(
            'select * from {} where model_group_id in ({})'.format(
                self.distance_table,
                str_in_sql(model_group_ids)
            ),
            self.db_engine
        )
Project: py-investment    Author: kprestel    | project source | file source
def simple_moving_average(self, period=50, column='adj_close'):
        table_name = 'sma_test'
        # stmt = text('SELECT * FROM sma_test WHERE asset_id = :asset_id')
        # stmt.bindparams(asset_id=self.id)
        # try:
            # TODO: parse dates
            # df = pd.read_sql(sql, con=conn, params={
            #     'asset_id': self.id
            # })
        # except OperationalError:
        #     logger.exception('error in query')
        sma_ts = pd.Series(
                self.ohlcv[column].rolling(center=False, window=period,
                                           min_periods=period - 1).mean()).dropna()
        print('creating')
        print(sma_ts)
        return sma_ts
            # return sma_ts
        # else:
        #     print('found')
        #     print(df)
        #     return df
Project: sanergy-public    Author: dssg    | project source | file source
def standardize_variable_names(table, RULES):
    """
    Script to standardize the variable names in the tables
    PARAM DataFrame table: A table returned from pd.read_sql
    PARAM list[tuples]: A list of tuples with string replacements, i.e., (string, replacement)
    RET table
    """
    variableNames = list(table.columns.values)
    standardizedNames = {} # Pandas renames columns with a dictionary object
    for v in variableNames:
        f = v
        for r in RULES:
            f = re.sub(r[0],r[1],f)
        print '%s to %s' %(v,f)
        standardizedNames[v] = f
    table = table.rename(columns=standardizedNames)
    return table
Project: wikilinks    Author: trovdimi    | project source | file source
def export_data_unresolved():

    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_work_view = db.get_work_view()
    connection = db_work_view._db_connection


    df_clickstream = pn.read_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/2016_08_clickstream_unresolved.tsv', sep='\t', error_bad_lines=False)

    df_clickstream['prev']=df_clickstream['prev'].str.replace('_', ' ')
    df_clickstream['curr']=df_clickstream['curr'].str.replace('_', ' ')
    df_clickstream['curr_unresolved']=df_clickstream['curr_unresolved'].str.replace('_', ' ')


    df_redirects_candidates = pn.read_sql('select * from redirects_candidates_sample', connection)


    sample_unresoleved = pn.merge(df_redirects_candidates, df_clickstream, how='left', left_on= ['source_article_name','target_article_name'], right_on=['prev', 'curr_unresolved'])

    sample_unresoleved['n'].fillna(0, inplace=True)
    sample_unresoleved.to_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/data_unresolved.tsv', sep='\t',encoding="utf-8")
Project: wikilinks    Author: trovdimi    | project source | file source
def pickle_correlations_zeros_january():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id from link_features', conn)
    print 'loaded links'
    df2 = pd.read_sql('select prev_id, curr_id, counts from clickstream_derived_en_201501  where link_type_derived= "internal-link";',  conn)
    print 'loaded counts'
    result = pd.merge(df, df2, how='left', left_on = ['source_article_id', 'target_article_id'], right_on = ['prev_id', 'curr_id'])
    print 'merged counts'
    print result
    article_counts = result.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    article_counts['counts'].fillna(0.0, inplace=True)
    print article_counts
    print 'write to file'
    article_counts[["target_article_id","counts"]].to_csv(TMP+'january_article_counts.tsv', sep='\t', index=False)
Project: pyuniprot    Author: cebel    | project source | file source
def _limit_and_df(self, query, limit, as_df=False):
        """adds a limit (limit==None := no limit) to any query and allow a return as pandas.DataFrame

        :param bool as_df: if is set to True results return as pandas.DataFrame
        :param `sqlalchemy.orm.query.Query` query: SQL Alchemy query 
        :param int or tuple[int] limit: maximum number of results
        :return: query result of pyuniprot.manager.models.XY objects
        """
        if limit:

            if isinstance(limit, int):
                query = query.limit(limit)

            if isinstance(limit, Iterable) and len(limit) == 2 and [int, int] == [type(x) for x in limit]:
                page, page_size = limit
                query = query.limit(page_size)
                query = query.offset(page * page_size)

        if as_df:
            results = read_sql(query.statement, self.engine)

        else:
            results = query.all()

        return results
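The read_sql(query.statement, self.engine) call above illustrates a common pattern: hand an ORM query's compiled SELECT statement to pandas. Below is a minimal, self-contained sketch of that pattern; the Protein model, the in-memory engine, and the rows are invented for illustration and assume SQLAlchemy 1.4 or newer.

import pandas as pd
import sqlalchemy as sa
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Protein(Base):
    __tablename__ = 'protein'
    id = sa.Column(sa.Integer, primary_key=True)
    name = sa.Column(sa.String)

engine = sa.create_engine('sqlite://')   # throwaway in-memory database
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all([Protein(name='P53'), Protein(name='BRCA1')])
    session.commit()
    stmt = session.query(Protein).limit(1).statement   # compiled SELECT of an ORM query

# pandas executes the statement on the engine and returns a DataFrame.
df = pd.read_sql(stmt, engine)
print(df)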
Project: johnson-county-ddj-public    Author: dssg    | project source | file source
def get_metrics(self):
        """Pull requested evaluation metrics for each model meeting the criteria
        specified in __init__.

        :returns: evaluation metrics for all models in the batches
        :rtype: pandas.DataFrame
        """
        metrics_query = """
            SELECT DISTINCT
                metrics.metric || '_' || metrics.parameter as p_metric,
                metrics.value,
                metrics.unique_timestamp
            FROM
                output.metrics as metrics
            WHERE
                metric || '_' || parameter in {0} AND
                unique_timestamp in {1};
        """.format(tuple(self.most_important_metrics),
                   tuple(self.models.index.astype(str).values))
        return(pd.read_sql(metrics_query, self.engine))
Project: johnson-county-ddj-public    Author: dssg    | project source | file source
def get_features(self):
        """ Pull features from the database.

        :returns: features
        :rtype: pandas DataFrame
        """
        query = """
            SELECT
                *
            FROM
                {0}
            WHERE
                {1} is not null;
        """.format(self.model['test_table_name'][0],
                   self.model['labelling'].astype(str)[0])
        return(pd.read_sql(query, self.engine))
Project: johnson-county-ddj-public    Author: dssg    | project source | file source
def get_mh_appt_dates(self, people):
        """ Get the dates of mental health appointments for people labeled 
        correctly by the model.

        :param people: list of ids to search for
        :type people: tuple
        :returns: mental health appointment data
        :rtype: pandas DataFrame
        """
        mh_services_query = """
            select
                dedupe_id,
                svc_date
            from
                clean.jocomentalhealthservices
            where
                svc_date < '{0}' and
                dedupe_id in {1}
        """.format(self.features['labeling_end_date'].unique()[0], people)
        return(pd.read_sql(mh_services_query, self.engine).set_index('dedupe_id'))
Project: johnson-county-ddj-public    Author: dssg    | project source | file source
def get_y_values(self):
        """ Return a dataframe containing the true classes and predicted
        probabilities for each case, sorted by descending probability.

        :returns: dataframe of true classes and predicted probabilities
        :rtype: pandas DataFrame
        """
        y_query = """
                SELECT
                    label as y_true,
                    prediction_prob as scores
                FROM
                    output.predictions
                WHERE
                    unique_timestamp = '{0}';
            """.format(self.model_id)
        y_values = pd.read_sql(y_query, con).sort_values(by = 'scores',
                                                         ascending = False)

        return y_values
Project: johnson-county-ddj-public    Author: dssg    | project source | file source
def tables_exist(fake_today,prediction_window,feature_timestamp):
    table_list = pd.read_sql('''
        SELECT
            table_name
        FROM
            information_schema.tables
        WHERE
            table_schema = 'feature_tables' AND
            table_name LIKE 'features_train_{}_{}_at_{}'
        ORDER BY
            table_name desc
        LIMIT
            1;
    '''.format(fake_today, prediction_window, feature_timestamp), con)

    if len(table_list) > 0:
        return True
    else:
        return False
Project: johnson-county-ddj-public    Author: dssg    | project source | file source
def load_table(self, train_or_test, feature_timestamp):
        # get feature table name
        if feature_timestamp == '%':
            feature_timestamp = pd.read_sql('''
                SELECT
                    split_part(table_name, '_at_', 2)
                FROM
                    information_schema.tables
                WHERE
                    table_schema = 'feature_tables'
                ORDER BY
                    1 desc
                LIMIT
                    1;
            ''', con).iat[0,0]
        feature_table_name = ('{}."features_{}_{}_{}_at_{}"').format(config_db['feature_schema'],
                train_or_test, self.fake_today, self.prediction_window,
                feature_timestamp)

        # load table
        print 'loading {}'.format(feature_table_name)
        query = ('SELECT * FROM {}').format(feature_table_name)
        full_feature_table = pd.read_sql(query, con = con)
        return full_feature_table, feature_table_name
Project: ego.powerflow    Author: openego    | project source | file source
def by_scenario(self, name):
        """
        """

        ormclass = self._mapped[name]
        query = self.session.query(ormclass).filter(
            ormclass.scn_name == self.scn_name)

        if self.version:
            query = query.filter(ormclass.version == self.version)

        # TODO: Better handled in db
        if name == 'Transformer':
            name = 'Trafo'

        df = pd.read_sql(query.statement,
                         self.session.bind,
                         index_col=name.lower() + '_id')

        if 'source' in df:
            df.source = df.source.map(self.id_to_source())

        return df
Project: coquery    Author: gkunter    | project source | file source
def is_part_of_speech(self, pos):
        if hasattr(self.resource, QUERY_ITEM_POS):
            current_token = tokens.COCAToken(pos, replace=False)
            rc_feature = getattr(self.resource, QUERY_ITEM_POS)
            _, table, _ = self.resource.split_resource_feature(rc_feature)
            S = "SELECT {} FROM {} WHERE {} {} '{}' LIMIT 1".format(
                getattr(self.resource, "{}_id".format(table)),
                getattr(self.resource, "{}_table".format(table)),
                getattr(self.resource, rc_feature),
                self.resource.get_operator(current_token),
                pos)
            engine = self.resource.get_engine()
            df = pd.read_sql(S.replace("%", "%%"), engine)
            engine.dispose()
            return len(df.index) > 0
        else:
            return False
Project: coquery    Author: gkunter    | project source | file source
def get_unique(self):
        if not self.db_name:
            return
        sql = sqlhelper.sql_url(options.cfg.current_server, self.db_name)
        if self._uniques:
            S = "SELECT DISTINCT {} FROM {}".format(self.column, self.table)
            self.df = pd.read_sql(S, sqlalchemy.create_engine(sql))
            self.df = self.df.sort_values(self.column, ascending=True)
        else:
            S = "SELECT {} FROM {}".format(self.column, self.table)
            self.df = pd.read_sql(S, sqlalchemy.create_engine(sql))
        items = (self.df[self.column].apply(utf8)
                                     .apply(QtWidgets.QTableWidgetItem))
        self.ui.tableWidget.setRowCount(len(items))
        self.ui.tableWidget.setColumnCount(1)
        for row, item in enumerate(items):
            self.ui.tableWidget.setItem(row, 0, item)
Project: ModelFlow    Author: yuezPrincetechs    | project source | file source
def read_sql(self,sql,chunksize=None,return_generator=True):
        '''
        Read a SQL query into a pandas DataFrame, optionally in chunks.
        :param sql: SQL query string
        :param chunksize: int; if set, the result is read in chunks of this many rows
        :param return_generator: bool; if False and chunksize is an int, the chunks are concatenated and a single DataFrame is returned
        :return: DataFrame, or a generator of DataFrames when chunksize is set and return_generator is True
        '''
        if chunksize is not None and chunksize<=0:
            chunksize=None
        result=pd.read_sql(sql,self.engine,chunksize=chunksize)
        if return_generator:
            return result
        else:
            if chunksize is None:
                return result
            else:
                result=list(result)
                if len(result)==0:
                    return pd.DataFrame()
                else:
                    result=pd.concat(result,axis=0)
                    return result
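The wrapper above leans on read_sql's chunksize argument. As a generic, self-contained illustration (the in-memory SQLite table is invented for the example), passing chunksize turns the return value into an iterator of DataFrames, which can be concatenated back into one frame:

import sqlite3
import pandas as pd

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE t (x INTEGER)')
conn.executemany('INSERT INTO t VALUES (?)', [(i,) for i in range(10)])
conn.commit()

# With chunksize set, read_sql yields DataFrames of at most 4 rows each.
chunks = pd.read_sql('SELECT * FROM t', conn, chunksize=4)
df = pd.concat(list(chunks), axis=0)   # chunks of 4, 4 and 2 rows -> one 10-row frame

conn.close()
print(df)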
Project: smiles-neural-network    Author: PMitura    | project source | file source
def getData(con):

    query = 'SELECT {} FROM {}'.format(
        ','.join(['"{}"'.format(x) for x in DOWNLOAD_COLS]),
        DOWNLOAD_TABLE)

    if WHERE:
        query += ' WHERE {}'.format(WHERE)

    if LIMIT:
        query += ' LIMIT {}'.format(LIMIT)

    print(query)

    df = pd.read_sql(
        sql = query,
        con = con)

    return df
Project: smiles-neural-network    Author: PMitura    | project source | file source
def getData(con, lo):
    query = DOWNLOAD_QUERY.format(
        ','.join(['"{}"'.format(x) for x in DOWNLOAD_COLS]),
        DOWNLOAD_TABLE,
        LELIMIT,
        lo)
    print(query)

    df = pd.read_sql(
        sql = query,
        con = con)

    df.columns = ['canonical_smiles']

    # fingerDf = computeDescriptors(df)
    fingerDf = computeMACCS(df)

    mergedDf = pd.concat([df, fingerDf], axis=1)

    return mergedDf
Project: Trending-Places-in-OpenStreetMap    Author: geometalab    | project source | file source
def retrieve_data(self, date, world_or_region=WORLD, table_name='trending_places'):
        """
        Fetch records saved in the database.

        Parameters
        ----------
        date
        world_or_region
        table_name

        Returns
        -------
        pandas data frame
        """
        with self.con:
            return pd.read_sql(FETCH_QUERY % (table_name, date, world_or_region),
                               self.con, parse_dates=['last_day'])
Project: strategy    Author: kanghua309    | project source | file source
def _update(stock, conn):
    try:
        print "update ----- :", stock
        query = "select * from '%s' order by date" % stock
        df = pd.read_sql(query, conn)
        df = df.set_index('date')

        print "sql saved:", df.tail(1),df.ix[-1],df.ix[-1].name
        if dt.now().weekday() == 5:
            today = str(pd.Timestamp(dt.now()) - pd.Timedelta(days=1))[:10]
        elif dt.now().weekday() == 6:
            today = str(pd.Timestamp(dt.now()) - pd.Timedelta(days=2))[:10]
        else:
            today = str(pd.Timestamp(dt.now()))[:10]
        print "today:",today
        if today != df.ix[-1].name[:10]:
            df = ts.get_h_data(stock, start=df.ix[-1].name[:10], retry_count=5, pause=1)
            print "read from tu:",df.head(1)
            df[['open', 'high', 'close', 'low', 'volume']].to_sql(stock, conn, if_exists='append')
            import time
            time.sleep(10)
    except Exception, arg:
        print "exceptionu:", stock, arg
        errorlist.append(stock)
Project: strategy    Author: kanghua309    | project source | file source
def _clean(stock, conn):
    try:
        print "clean ------ :", stock
        query = "select * from '%s' order by date" % stock
        df = pd.read_sql(query, conn)
        print "before",df.tail(5)
        cur = conn.cursor()
        query = "delete from '%s' where rowid not in(select max(rowid) from '%s' group by date)" % (stock, stock)
        cur.execute(query)
        conn.commit()
        query = "select * from '%s' order by date" % stock
        df = pd.read_sql(query, conn)
        print "after",df.tail(5)
    except Exception, arg:
        print "exceptionc:", stock, arg
        raise SystemExit(-1)
Project: strategy    Author: kanghua309    | project source | file source
def RNNPredict(mask,trigger_date=None,source='History.db'):
    class RNNPredict(CustomFactor):
        inputs = [];
        window_length = 1
        def compute(self, today, assets, out, *inputs):
            if trigger_date != None and today != pd.Timestamp(trigger_date, tz='UTC'):  # only compute this factor on the trigger date; on other days return early and leave the output unchanged
                return
            if os.path.splitext(source)[1] == '.db':
                conn = sqlite3.connect(source, check_same_thread=False)  # allow the connection to be used across threads
                query = "select * from predict where date >= '%s' order by date limit 1 " % str(today)[:19]
                df = pd.read_sql(query, conn)
                df = df.set_index('date')
                conn.close()
            elif os.path.splitext(source)[1] == '.csv':
                 df = pd.read_csv("predict.csv", index_col=0, parse_dates=True)
                 df = df[df.index >= pd.Timestamp(str(today))]
                 print today,df
            else:
                raise ValueError
            new_index = [sid(asset).symbol + "_return" for asset in assets]
            df = df.reindex(columns = new_index)
            out[:] = df.ix[0].values
            print "RNNpredict:", today, out

    return RNNPredict(mask=mask)
Project: f1_2017    Author: aflaisler    | project source | file source
def mysql_into_df(self):
        # first Extract all the table names
        # connect = self.db_connection
        df_tables = pd.read_sql('show tables;', self.db_connection)
        # store the tables in a dictionary
        d = {}
        col_name = df_tables.columns[0]
        # load individual table into the dictionary
        for table in df_tables[col_name]:
            key = table
            value = pd.read_sql('SELECT * FROM ' + table +
                                ';', self.db_connection)
            d[key] = value
        self.dictTables = d

    # Load the qualification table separately to infer the timestamp
Project: stock    Author: Rockyzsu    | project source | file source
def get_lowest(self, code, date,current_date):
        '''
        Get the lowest price of a stock from the start of a given year up to current_date.
        :param code: stock code
        :param date: year string, e.g. '2016'
        :param current_date: end date of the query range
        :return: (datetime, lowest price), or (None, None) if no data
        '''
        date = date + '-01-01'
        cmd = 'select * from `{}` where datetime > \'{}\' and datetime <\'{}\''.format(code, date,current_date)

        try:
            df = pd.read_sql(cmd, history_engine,index_col='index')
        except Exception,e:
            print e
            return None,None
        #print df.dtypes
        # return None if no rows were returned for the period
        if len(df)<1:
            return None,None
        df['low']=df['low'].astype('float64')
        idx= df['low'].idxmin()
        min_date= df.loc[idx]
        return min_date['datetime'],min_date['low']
Project: Odin-Securities    Author: JamesBrofos    | project source | file source
def standard_sessions(start_date, end_date):
    """Return the datetimes corresponding to the trading sessions in a specified
    time period during which the stock market was officially open.

    Parameters
    ----------
    start_date: Datetime object.
        The datetime indicating the beginning of the trading time period.
    end_date (optional): Datetime object.
        The datetime indicating the ending of the trading time period.
    """
    qry = """
    SELECT p.datetime FROM prices AS p JOIN symbols as s ON p.symbol_id = s.id
    WHERE s.symbol='^GSPC' AND p.datetime >= '{}' AND p.datetime <= '{}'
    """.format(start_date, end_date)
    return pd.read_sql(qry, conn)
Project: UK_Imbalance_Price_Forecasting    Author: ADGEfficiency    | project source | file source
def query_sql(db_path, table_name):
    """
    Uses pandas to pull data from our sqlite database

    args
        db_path (str) : location of the database
        table_name (str) : name of the table we want

    returns
        data (pd.DataFrame) :
    """
    print('Pulling data for table {} from {}'.format(table_name, db_path))
    #  connect to our database
    conn = sqlite3.connect(db_path)
    #  pull data by selecting the entire table
    data = pd.read_sql(sql='SELECT * from '+str(table_name), con=conn)
    data.set_index('index', drop=True, inplace=True)
    #  close the connection
    conn.close()
    return data
Project: pongr    Author: wseaton    | project source | file source
def matches():
    singles = pd.read_sql('select * from game where deleted = 0', con=engine)
    doubles = pd.read_sql('select * from doubles_game where deleted = 0', con=engine)

    tz = pytz.timezone('America/New_York')

    for frame in [singles, doubles]:

        frame['timestamp'] = frame['timestamp'].apply(datetime.utcfromtimestamp)
        frame['timestamp'] = frame['timestamp'].apply(datetime.replace, tzinfo=tz)
        frame['timestamp'] = frame['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S %Z')

    singles = singles.to_dict('records')
    doubles = doubles.to_dict('records')

    return render_template('gamelog.html', singles_games=singles, doubles_games=doubles)
Project: pawprint    Author: cbredev    | project source | file source
def read(self, *fields, **conditionals):
        """
        Pull raw data into a dataframe. If no conditions are passed, pull the whole table.
        Otherwise, filter based on the conditions specified (currently only equality).
        """

        # Parse the list of fields to return
        field_query = self._parse_fields(*fields)

        # Parse the conditions
        conditionals_query = self._parse_conditionals(**conditionals)

        query = "SELECT {} FROM {} {}".format(field_query, self.table, conditionals_query)

        if "DISTINCT" not in query:
            query += " ORDER BY {}".format(self.timestamp_field)

        return pd.read_sql(query, self.db)
Project: pawprint    Author: cbredev    | project source | file source
def test_write(self):
        """Test the tracking of an event."""

        tracker = pawprint.Tracker(db=db, table=table, schema={"id": "INT"})
        tracker.create_table()

        # Check the table's empty
        assert pd.io.sql.execute("SELECT COUNT(*) FROM {}".format(table), db).fetchall() == [(0,)]

        # Add some data and check if the row count increases by one
        tracker.write(id=1337)
        assert pd.io.sql.execute("SELECT COUNT(*) FROM {}".format(table), db).fetchall() == [(1,)]

        # Pull the data and ensure it's correct
        data = pd.read_sql("SELECT * FROM {}".format(table), db)
        assert isinstance(data, pd.DataFrame)
        assert len(data.columns) == 1
        assert data.columns[0] == "id"
        assert data.id[0] == 1337
Project: ghdata    Author: OSSHealth    | project source | file source
def issues_with_close(self, repoid):
        """
        How long on average each week it takes to close an issue

        :param repoid: The id of the project in the projects table. Use repoid() to get this.
        :return: DataFrame with each issue's id, creation date, and days to close
        """
        issuesSQL = s.sql.text("""
            SELECT issues.id as "id",
                   issues.created_at as "date",
                   DATEDIFF(closed.created_at, issues.created_at)  AS "days_to_close"
            FROM issues

           JOIN
                (SELECT * FROM issue_events
                 WHERE issue_events.action = "closed") closed
            ON issues.id = closed.issue_id

            WHERE issues.repo_id = :repoid""")
        return pd.read_sql(issuesSQL, self.db, params={"repoid": str(repoid)})
Project: ghdata    Author: OSSHealth    | project source | file source
def committer_locations(self, repoid):
        """
        Return committers and their locations

        @todo: Group by country code instead of users, needs the new schema

        :param repoid: The id of the project in the projects table.
        :return: DataFrame with users and locations sorted by commits
        """
        rawContributionsSQL = s.sql.text("""
            SELECT users.login, users.location, COUNT(*) AS "commits"
            FROM commits
            JOIN project_commits
            ON commits.id = project_commits.commit_id
            JOIN users
            ON users.id = commits.author_id
            WHERE project_commits.project_id = :repoid
            AND LENGTH(users.location) > 1
            GROUP BY users.id
            ORDER BY commits DESC
        """)
        return pd.read_sql(rawContributionsSQL, self.db, params={"repoid": str(repoid)})
Project: ghdata    Author: OSSHealth    | project source | file source
def issue_response_time(self, repoid):
        """
        How long it takes for issues to be responded to by people who have commits associated with the project

        :param repoid: The id of the project in the projects table.
        :return: DataFrame with the issues' id the date it was
                 opened, and the date it was first responded to
        """
        issuesSQL = s.sql.text("""
            SELECT issues.created_at               AS "created_at",
                   MIN(issue_comments.created_at)  AS "responded_at"
            FROM issues
            JOIN issue_comments
            ON issue_comments.issue_id = issues.id
            WHERE issue_comments.user_id IN
                (SELECT users.id
                FROM users
                JOIN commits
                WHERE commits.author_id = users.id
                AND commits.project_id = :repoid)
            AND issues.repo_id = :repoid
            GROUP BY issues.id
        """)
        return pd.read_sql(issuesSQL, self.db, params={"repoid": str(repoid)})
Project: gtfspy    Author: CxAalto    | project source | file source
def get_trip_stop_coordinates(self, trip_I):
        """
        Get coordinates for a given trip_I

        Parameters
        ----------
        trip_I : int
            the integer id of the trip

        Returns
        -------
        stop_coords : pandas.DataFrame
            with columns "lat" and "lon"
        """
        query = """SELECT lat, lon
                    FROM stop_times
                    JOIN stops
                    USING(stop_I)
                        WHERE trip_I={trip_I}
                    ORDER BY stop_times.seq""".format(trip_I=trip_I)
        stop_coords = pd.read_sql(query, self.conn)
        return stop_coords
Project: gtfspy    Author: CxAalto    | project source | file source
def remove_dangling_shapes(db_conn):
    """
    Remove dangling entries from the shapes directory.

    Parameters
    ----------
    db_conn: sqlite3.Connection
        connection to the GTFS object
    """
    db_conn.execute(DELETE_SHAPES_NOT_REFERENCED_IN_TRIPS_SQL)
    SELECT_MIN_MAX_SHAPE_BREAKS_BY_TRIP_I_SQL = \
        "SELECT trips.trip_I, shape_id, min(shape_break) as min_shape_break, max(shape_break) as max_shape_break FROM trips, stop_times WHERE trips.trip_I=stop_times.trip_I GROUP BY trips.trip_I"
    trip_min_max_shape_seqs= pandas.read_sql(SELECT_MIN_MAX_SHAPE_BREAKS_BY_TRIP_I_SQL, db_conn)

    rows = []
    for row in trip_min_max_shape_seqs.itertuples():
        shape_id, min_shape_break, max_shape_break = row.shape_id, row.min_shape_break, row.max_shape_break
        if min_shape_break is None or max_shape_break is None:
            min_shape_break = float('-inf')
            max_shape_break = float('-inf')
        rows.append( (shape_id, min_shape_break, max_shape_break) )
    DELETE_SQL_BASE = "DELETE FROM shapes WHERE shape_id=? AND (seq<? OR seq>?)"
    db_conn.executemany(DELETE_SQL_BASE, rows)
    remove_dangling_shapes_references(db_conn)
Project: qtpandas    Author: draperjames    | project source | file source
def read_sql(sql, con, filePath, index_col=None, coerce_float=True,
             params=None, parse_dates=None, columns=None, chunksize=None):
    """
    Read SQL query or database table into a DataFrameModel.
    Provide a filePath argument in addition to the *args/**kwargs from
    pandas.read_sql and get a DataFrameModel.

    NOTE: The chunksize option is overridden to None always (for now).

    Reference:
    http://pandas.pydata.org/pandas-docs/version/0.18.1/generated/pandas.read_sql.html
    pandas.read_sql(sql, con, index_col=None, coerce_float=True,
                    params=None, parse_dates=None, columns=None, chunksize=None)



    :return: DataFrameModel
    """

    # TODO: Decide if chunksize is worth keeping and how to handle?
    df = pandas.read_sql(sql, con, index_col, coerce_float,
                    params, parse_dates, columns, chunksize=None)
    return DataFrameModel(df, filePath=filePath)
Project: pyctd    Author: cebel    | project source | file source
def _limit_and_df(self, query, limit, as_df=False):
        """adds a limit (limit==None := no limit) to any query and allow a return as pandas.DataFrame

        :param bool as_df: if is set to True results return as pandas.DataFrame
        :param `sqlalchemy.orm.query.Query` query: SQL Alchemy query 
        :param int limit: maximum number of results
        :return: query result of pyctd.manager.models.XY objects
        """
        if limit:
            query = query.limit(limit)

        if as_df:
            results = read_sql(query.statement, self.engine)
        else:
            results = query.all()

        return results
Project: sims_featureScheduler    Author: lsst    | project source | file source
def sqlite2observations(filename='observations.db'):
    """
    Restore a database of observations.
    """
    con = db.connect(filename)
    df = pd.read_sql('select * from observations;', con)
    blank = empty_observation()
    result = df.as_matrix()
    final_result = np.empty(result.shape[0], dtype=blank.dtype)

    # XXX-ugh, there has to be a better way.
    for i, key in enumerate(blank.dtype.names):
        final_result[key] = result[:, i+1]

    to_convert = ['RA', 'dec', 'alt', 'az', 'rotSkyPos', 'moonAlt', 'sunAlt']
    for key in to_convert:
        final_result[key] = np.radians(final_result[key])

    return final_result
Project: QUANTAXIS    Author: yutiansut    | project source | file source
def fill(self):
        self.init_db(self.engine)
        df = pd.read_sql("select * from fundamental", self.engine).sort_values(['report_date', 'quarter'])
        df['trade_date'] = df['report_date'] = pd.to_datetime(df['report_date'])

        with click.progressbar(df.groupby('code'),
                               label='writing data',
                               item_show_func=lambda x: x[0] if x else None) as bar:
            bar.is_hidden = False
            for stock, group in bar:
                group = group.drop_duplicates(subset='trade_date', keep="last").set_index('trade_date')
                sessions = pd.date_range(group.index[0], group.index[-1])
                d = group.reindex(sessions, copy=False).fillna(method='pad')
                d.to_sql('full', self.engine, if_exists='append', index_label='trade_date')
Project: SharesData    Author: xjkj123    | project source | file source
def ReadSqlData(self,name, db):
        conn = create_engine(
            'mysql://' + config.user + ':' + config.password + '@' + config.ip + '/' + db + '?charset=utf8')

        x = 'select * from ' + name + ';'  # SQL statement
        return pandas.read_sql(x, con=conn)
Project: fingerprint-securedrop    Author: freedomofpress    | project source | file source
def load_world(self, world_type):
        """For open world validation, we must keep track of which onion service
        a trace came from. However for closed world validation, we can select
        traces without consideration of which site they belong to.

        :returns: a pandas DataFrame df containing the dataset
        """

        select_hs_urls = ', t3.hs_url' if world_type == 'open' else ''

        labeled_query = ('select t1.*, t3.is_sd {} '
                           'from features.frontpage_features t1 '
                           'inner join raw.frontpage_examples t2 '
                           'on t1.exampleid = t2.exampleid '
                           'inner join raw.hs_history t3 '
                           'on t3.hsid = t2.hsid').format(select_hs_urls)

        df = pd.read_sql(labeled_query, self.engine)
        return df
Project: fingerprint-securedrop    Author: freedomofpress    | project source | file source
def get_exampleids(self):
        """Get list of exampleids"""
        query = "SELECT DISTINCT exampleid FROM raw.frontpage_traces"
        df = pd.read_sql(query, self.engine)
        return df.exampleid.values
Project: fingerprint-securedrop    Author: freedomofpress    | project source | file source
def get_ordered_trace_cells(self, exampleid):
        """Get trace for a given exampleid"""
        df = pd.read_sql("""SELECT ingoing, t_trace FROM raw.frontpage_traces
                           WHERE exampleid={}
                           ORDER BY t_trace""".format(exampleid),
                         self.engine)
        return df
Project: LLSIF-AutoTeamBuilder    Author: Joshua1989    | project source | file source
def update_live_data(download=False):
    def live_summary(live_setting_id):
        group_dict = {1:"μ's", 2:'Aqours'}
        attr_dict = {1:'Smile', 2:'Pure', 3:'Cool'}
        diff_dict = {1:'Easy', 2:'Normal', 3:'Hard', 4:'Expert', 6:'Master'}
        setting = df_live_setting.loc[live_setting_id]
        track_info = df_live_track.loc[setting['live_track_id']]
        live_info = {
            'cover': cover_path(setting['live_icon_asset']),
            'name': track_info['name'],
            'group': group_dict[track_info['member_category']],
            'attr': attr_dict[setting['attribute_icon_id']],
            'note_number': int(setting['s_rank_combo']),
            'diff_level': diff_dict[setting['difficulty']],
            'diff_star': int(setting['stage_level']),
            'file_dir': live_path(setting['notes_setting_asset'])
        }
        return live_info

    print('Downloading latest live.db_')
    opener = urllib.request.URLopener()
    opener.addheader('User-Agent', 'whatever')
    opener.retrieve(live_db_download_url, live_db_dir)

    print('Generating basic live stats')
    conn = sqlite3.connect(live_db_dir)
    df_live_track = pd.read_sql('SELECT * FROM live_track_m', con=conn, index_col='live_track_id')
    df_live_setting = pd.read_sql('SELECT * FROM live_setting_m', con=conn, index_col='live_setting_id')
    # live_data = [live_summary(live_setting_id) for live_setting_id, row in df_live_setting.iterrows() if row['difficulty']!=5]
    live_data = [live_summary(live_setting_id) for live_setting_id, row in df_live_setting.iterrows() if row['difficulty']!=5 and live_setting_id != 10779]

    with open(live_archive_dir, 'w') as fp:
        json.dump(live_data, fp)
    print('Basic live data has been saved in', live_archive_dir)
Project: sanergy-public    Author: dssg    | project source | file source
def demand_daily_data(db, rows=[], feature='', function='lag', unique=['ToiletID','Collection_Date'], conditions=None):
    """
    A function to generate by day variables for a feature

    Args:
       DICT DB      Connection object (see grab_collections_data)
       LIST ROWS        List of rows
       STR FEATURE      A feature name to create daily records for
       STR FUNCTION     Apply either the LAG or LEAD window function (in the future, maybe some other functions)
       LIST UNIQUE      List of unique identifiers
       STR CONDITIONS   Apply the conditions string (see grab_collections_data)
    Returns:
       DF DAILY_DATA    Pandas data frame of daily variables
    """

    # Reprocess the unique list to account for capitalization
    unique = ','.join(['"%s"' %(uu) for uu in unique])

    # Construct the sql statement using window functions (e.g., OVER and LAG/LEAVE)
    statement = 'SELECT %s' %(unique)
    for rr in rows:
        statement += ', %s("%s", %i, NULL) OVER(order by %s) as "%s_%s%i" ' %(function,
                                              feature,
                                              rr,
                                              unique,
                                              feature,
                                              function,
                                              rr)
    # Complete the statement
    statement += "FROM %s.%s %s ORDER BY %s" %(db['database'],
                           db['table'],
                           conditions,
                           unique)
    # Execute the statement
    daily_data = pd.read_sql(statement,
                con=db['connection'],
                coerce_float=True,
                params=None)
    # Return the lagged/lead data
    return(daily_data)
Project: sanergy-public    Author: dssg    | project source | file source
def grab_from_features_and_labels(db, fold, config):

    """
    A function that subsets the features and labels DataFrames stored in Postgres into train and test features and labels, based on the fold info (train start, train end, test start, test end).

    Args
    DICT FOLD start and end dates for both the train and test sets, in the format {"train": (start, end), "test": (start, end)}
    Returns
    df features train
    df labels train
    df features test
    df labels test
    """
    RESPONSE_RENAMER = {'response_f':'response', 'response_u':'response'}
    dataset = pd.read_sql('select * from modeling.dataset where (("Collection_Date" >= '+"'"+fold['train_start'].strftime('%Y-%m-%d')+"'"+') and ("Collection_Date" <= '+"'"+fold['test_end'].strftime('%Y-%m-%d')+"'"+'))', db['connection'], coerce_float=True, params=None)
    toilet_routes = pd.read_sql('select * from modeling.toilet_route', db['connection'], coerce_float=True, params=None)

    #TODO: Fix this...
    dataset = dataset.fillna(0) #A hack to make it run for now...
    #Drop the toilets that do not have contiguous data.
    # Note that missing collections are filled with NaN'd rows, so if a toilet is not contiguous, it must mean that it appeared or disappeared during the fold period -> ignore it.
    toilet_groups = dataset.groupby(config['cols']['toiletname'])
    toilets = dataset[config['cols']['toiletname']].unique()
    number_of_days = max(toilet_groups.size())
    contiguous_toilets = [t for t in toilets if (toilet_groups.size()[t] == number_of_days)]
    dataset = dataset.loc[dataset[config['cols']['toiletname']].isin(contiguous_toilets)]
    #Sort for the purposes of later functions...
    dataset = dataset.sort_values(by=['Collection_Date','ToiletID'])


    features_train = dataset.loc[((dataset['Collection_Date']>=fold["train_start"]) & (dataset['Collection_Date']<=fold["train_end"]))].drop(['response_f','response_u',config['Xy']['response_f']['variable'], config['Xy']['response_u']['variable']],axis=1)
    features_test = dataset.loc[((dataset['Collection_Date']>=fold["test_start"]) & (dataset['Collection_Date']<=fold["test_end"]))].drop(['response_f','response_u',config['Xy']['response_f']['variable'], config['Xy']['response_u']['variable']],axis=1)

    labels_train_u = dataset.loc[((dataset['Collection_Date']>=fold["train_start"]) & (dataset['Collection_Date']<=fold["train_end"])),['response_u','Collection_Date','ToiletID']].rename(columns=RESPONSE_RENAMER)
    labels_train_f = dataset.loc[((dataset['Collection_Date']>=fold["train_start"]) & (dataset['Collection_Date']<=fold["train_end"])),['response_f','Collection_Date','ToiletID']].rename(columns=RESPONSE_RENAMER)
    labels_test_f = dataset.loc[((dataset['Collection_Date']>=fold["test_start"]) & (dataset['Collection_Date']<=fold["test_end"])),['response_f','Collection_Date','ToiletID']].rename(columns=RESPONSE_RENAMER)
    labels_test_u = dataset.loc[((dataset['Collection_Date']>=fold["test_start"]) & (dataset['Collection_Date']<=fold["test_end"])),['response_u','Collection_Date','ToiletID']].rename(columns=RESPONSE_RENAMER)
    return(features_train, labels_train_f, labels_train_u, features_test, labels_test_f, labels_test_u, toilet_routes)
Project: wikilinks    Author: trovdimi    | project source | file source
def pickle_vis_data_pandas():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()


    df = pd.read_sql('select source_article_id, target_article_id, target_y_coord_1920_1080, target_x_coord_1920_1080, visual_region from link_features', conn)
    print len(df)

    no_dup = df.sort(['source_article_id','target_y_coord_1920_1080','target_x_coord_1920_1080']).groupby(["source_article_id", "target_article_id"]).first()
    print len(no_dup)

    feature = no_dup.loc[no_dup['visual_region']=='lead']
    print len(feature)
    feature.reset_index(inplace=True)


    feature = no_dup.loc[no_dup['visual_region']=='infobox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/infobox.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region']=='navbox']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/navbox.tsv', sep='\t', index=False)

    feature = no_dup.loc[no_dup['visual_region']=='left-body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/left-body.tsv', sep='\t',index=False)

    feature = no_dup.loc[no_dup['visual_region']=='body']
    print len(feature)
    feature.reset_index(inplace=True)
    feature[['source_article_id','target_article_id']].to_csv('/home/ddimitrov/tmp/body.tsv', sep='\t',index=False)
Project: wikilinks    Author: trovdimi    | project source | file source
def get_redirecsfromXML(self, dump_date):
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
        conn = db._create_connection()
        df = pd.read_sql(('select * from redirects'),conn)
        return df.set_index('source_article_name')['target_article_name'].to_dict()
Project: wikilinks    Author: trovdimi    | project source | file source
def pickle_correlations_zeros():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id, IFNULL(counts, 0) as counts from link_features group by source_article_id, target_article_id', conn)
    print 'group'
    article_counts = df.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    print 'write to file'
    article_counts[["target_article_id","counts"]].to_csv(TMP+'article_counts.tsv', sep='\t', index=False)