在我的 PostgreSQL 9.6.2 数据库中,我有一个查询,它根据一些股票数据构建一个计算字段表。它为表中的每一行计算 1 到 10 年的移动平均窗口,并将其用于周期性调整。特别是 CAPE、CAPB、CAPC、CAPS 和 CAPD。
即,对于每一行,您计算avg(earnings)过去 1 到 10 年的数据,然后对其他几个变量执行相同的操作。
我目前正在使用横向连接来计算每一行的聚合,但是事情非常慢,而且我不完全确定如何加快速度,无论是索引/重写查询等。
例如,当我将查询分层为仅包含 ~ 25k 行时,运行需要 15 分钟,这似乎太慢了。(AWS 上的 RDS 免费套餐)
-- Initialize Cyclical Adjustments -- This query populates the database with numerous peak/min and CAPE type calculations. -- We do this by selecting each valid row within the table by security then laterally -- selecting the calculations for each of those rows. 'Valid' rows are determined by -- date calculations that make sure every field that has insufficient data behind it -- (several queries want 5+ years of time series data) is filled with NULL to avoid -- inaccuracies. WITH earliest_point AS ( SELECT security_id, min(date) as min_date FROM bloomberg.security_data GROUP BY security_id ) SELECT rec.record_id, rec.security_id, date, -- Each of these cases decides if we have enough data in the database to populate the field. If there are at least -- x years in the database (where x = 1:10) we do the price / aggregate computation. Otherwise, we shortcut to NULL. -- NOTE: The NULLIF prevents us from dividing by zero. CASE WHEN ep.min_date < rec.date - '10 years'::INTERVAL THEN price / NULLIF(ru.max_earnings, 0) ELSE NULL END AS price_to_peak_earnings, CASE WHEN ep.min_date < rec.date - '10 years'::INTERVAL THEN price / NULLIF(ru.min_earnings, 0) ELSE NULL END AS price_to_minimum_earnings, CASE WHEN ep.min_date < rec.date - '10 years'::INTERVAL THEN price / NULLIF(ru.max_book, 0) ELSE NULL END AS price_to_peak_book, CASE WHEN ep.min_date < rec.date - '10 years'::INTERVAL THEN price / NULLIF(ru.min_book, 0) ELSE NULL END AS price_to_minimum_book, CASE WHEN ep.min_date < rec.date - '10 years'::INTERVAL THEN price / NULLIF(ru.max_sales, 0) ELSE NULL END AS price_to_peak_sales, CASE WHEN ep.min_date < rec.date - '10 years'::INTERVAL THEN price / NULLIF(ru.min_sales, 0) ELSE NULL END AS price_to_minimum_sales, CASE WHEN ep.min_date < rec.date - '10 years'::INTERVAL THEN price / NULLIF(ru.max_cashflow, 0) ELSE NULL END AS price_to_peak_cashflow, CASE WHEN ep.min_date < rec.date - '10 years'::INTERVAL THEN price / NULLIF(ru.min_cashflow, 0) ELSE NULL END AS price_to_minimum_cashflow, CASE WHEN ep.min_date < rec.date - '10 years'::INTERVAL THEN price / NULLIF(ru.max_dividends, 0) ELSE NULL END AS price_to_peak_dividends, CASE WHEN ep.min_date < rec.date - '10 years'::INTERVAL THEN price / NULLIF(ru.min_dividends, 0) ELSE NULL END AS price_to_minimum_dividends, CASE WHEN ep.min_date < rec.date - '1 years'::INTERVAL THEN price / NULLIF(ru.cap1_avg_earnings, 0) ELSE NULL END AS cape1, CASE WHEN ep.min_date < rec.date - '2 years'::INTERVAL THEN price / NULLIF(ru.cap2_avg_earnings, 0) ELSE NULL END AS cape2, CASE WHEN ep.min_date < rec.date - '2 years'::INTERVAL THEN price / NULLIF(ru.cap2_avg_earnings, 0) ELSE NULL END AS cape2, CASE WHEN ep.min_date < rec.date - '3 years'::INTERVAL THEN price / NULLIF(ru.cap3_avg_earnings, 0) ELSE NULL END AS cape3, CASE WHEN ep.min_date < rec.date - '4 years'::INTERVAL THEN price / NULLIF(ru.cap4_avg_earnings, 0) ELSE NULL END AS cape4, CASE WHEN ep.min_date < rec.date - '5 years'::INTERVAL THEN price / NULLIF(ru.cap5_avg_earnings, 0) ELSE NULL END AS cape5, CASE WHEN ep.min_date < rec.date - '6 years'::INTERVAL THEN price / NULLIF(ru.cap6_avg_earnings, 0) ELSE NULL END AS cape6, CASE WHEN ep.min_date < rec.date - '7 years'::INTERVAL THEN price / NULLIF(ru.cap7_avg_earnings, 0) ELSE NULL END AS cape7, CASE WHEN ep.min_date < rec.date - '8 years'::INTERVAL THEN price / NULLIF(ru.cap8_avg_earnings, 0) ELSE NULL END AS cape8, CASE WHEN ep.min_date < rec.date - '9 years'::INTERVAL THEN price / NULLIF(ru.cap9_avg_earnings, 0) ELSE NULL END AS cape9, CASE WHEN ep.min_date < rec.date - '10 years'::INTERVAL THEN price / NULLIF(ru.cap10_avg_earnings, 0) ELSE NULL END AS cape10, CASE WHEN ep.min_date < rec.date - '1 years'::INTERVAL THEN price / NULLIF(ru.cap1_avg_book, 0) ELSE NULL END AS capb1, CASE WHEN ep.min_date < rec.date - '2 years'::INTERVAL THEN price / NULLIF(ru.cap2_avg_book, 0) ELSE NULL END AS capb2, CASE WHEN ep.min_date < rec.date - '3 years'::INTERVAL THEN price / NULLIF(ru.cap3_avg_book, 0) ELSE NULL END AS capb3, CASE WHEN ep.min_date < rec.date - '4 years'::INTERVAL THEN price / NULLIF(ru.cap4_avg_book, 0) ELSE NULL END AS capb4, CASE WHEN ep.min_date < rec.date - '5 years'::INTERVAL THEN price / NULLIF(ru.cap5_avg_book, 0) ELSE NULL END AS capb5, CASE WHEN ep.min_date < rec.date - '6 years'::INTERVAL THEN price / NULLIF(ru.cap6_avg_book, 0) ELSE NULL END AS capb6, CASE WHEN ep.min_date < rec.date - '7 years'::INTERVAL THEN price / NULLIF(ru.cap7_avg_book, 0) ELSE NULL END AS capb7, CASE WHEN ep.min_date < rec.date - '8 years'::INTERVAL THEN price / NULLIF(ru.cap8_avg_book, 0) ELSE NULL END AS capb8, CASE WHEN ep.min_date < rec.date - '9 years'::INTERVAL THEN price / NULLIF(ru.cap9_avg_book, 0) ELSE NULL END AS capb9, CASE WHEN ep.min_date < rec.date - '10 years'::INTERVAL THEN price / NULLIF(ru.cap10_avg_book, 0) ELSE NULL END AS capb10, CASE WHEN ep.min_date < rec.date - '1 years'::INTERVAL THEN price / NULLIF(ru.cap1_avg_sales, 0) ELSE NULL END AS caps1, CASE WHEN ep.min_date < rec.date - '2 years'::INTERVAL THEN price / NULLIF(ru.cap2_avg_sales, 0) ELSE NULL END AS caps2, CASE WHEN ep.min_date < rec.date - '3 years'::INTERVAL THEN price / NULLIF(ru.cap3_avg_sales, 0) ELSE NULL END AS caps3, CASE WHEN ep.min_date < rec.date - '4 years'::INTERVAL THEN price / NULLIF(ru.cap4_avg_sales, 0) ELSE NULL END AS caps4, CASE WHEN ep.min_date < rec.date - '5 years'::INTERVAL THEN price / NULLIF(ru.cap5_avg_sales, 0) ELSE NULL END AS caps5, CASE WHEN ep.min_date < rec.date - '6 years'::INTERVAL THEN price / NULLIF(ru.cap6_avg_sales, 0) ELSE NULL END AS caps6, CASE WHEN ep.min_date < rec.date - '7 years'::INTERVAL THEN price / NULLIF(ru.cap7_avg_sales, 0) ELSE NULL END AS caps7, CASE WHEN ep.min_date < rec.date - '8 years'::INTERVAL THEN price / NULLIF(ru.cap8_avg_sales, 0) ELSE NULL END AS caps8, CASE WHEN ep.min_date < rec.date - '9 years'::INTERVAL THEN price / NULLIF(ru.cap9_avg_sales, 0) ELSE NULL END AS caps9, CASE WHEN ep.min_date < rec.date - '10 years'::INTERVAL THEN price / NULLIF(ru.cap10_avg_sales, 0) ELSE NULL END AS caps10, CASE WHEN ep.min_date < rec.date - '1 years'::INTERVAL THEN price / NULLIF(ru.cap1_avg_cashflow, 0) ELSE NULL END AS capc1, CASE WHEN ep.min_date < rec.date - '2 years'::INTERVAL THEN price / NULLIF(ru.cap2_avg_cashflow, 0) ELSE NULL END AS capc2, CASE WHEN ep.min_date < rec.date - '3 years'::INTERVAL THEN price / NULLIF(ru.cap3_avg_cashflow, 0) ELSE NULL END AS capc3, CASE WHEN ep.min_date < rec.date - '4 years'::INTERVAL THEN price / NULLIF(ru.cap4_avg_cashflow, 0) ELSE NULL END AS capc4, CASE WHEN ep.min_date < rec.date - '5 years'::INTERVAL THEN price / NULLIF(ru.cap5_avg_cashflow, 0) ELSE NULL END AS capc5, CASE WHEN ep.min_date < rec.date - '6 years'::INTERVAL THEN price / NULLIF(ru.cap6_avg_cashflow, 0) ELSE NULL END AS capc6, CASE WHEN ep.min_date < rec.date - '7 years'::INTERVAL THEN price / NULLIF(ru.cap7_avg_cashflow, 0) ELSE NULL END AS capc7, CASE WHEN ep.min_date < rec.date - '8 years'::INTERVAL THEN price / NULLIF(ru.cap8_avg_cashflow, 0) ELSE NULL END AS capc8, CASE WHEN ep.min_date < rec.date - '9 years'::INTERVAL THEN price / NULLIF(ru.cap9_avg_cashflow, 0) ELSE NULL END AS capc9, CASE WHEN ep.min_date < rec.date - '10 years'::INTERVAL THEN price / NULLIF(ru.cap10_avg_cashflow, 0) ELSE NULL END AS capc10, CASE WHEN ep.min_date < rec.date - '1 years'::INTERVAL THEN price / NULLIF(ru.cap1_avg_dividends, 0) ELSE NULL END AS capd1, CASE WHEN ep.min_date < rec.date - '2 years'::INTERVAL THEN price / NULLIF(ru.cap2_avg_dividends, 0) ELSE NULL END AS capd2, CASE WHEN ep.min_date < rec.date - '3 years'::INTERVAL THEN price / NULLIF(ru.cap3_avg_dividends, 0) ELSE NULL END AS capd3, CASE WHEN ep.min_date < rec.date - '4 years'::INTERVAL THEN price / NULLIF(ru.cap4_avg_dividends, 0) ELSE NULL END AS capd4, CASE WHEN ep.min_date < rec.date - '5 years'::INTERVAL THEN price / NULLIF(ru.cap5_avg_dividends, 0) ELSE NULL END AS capd5, CASE WHEN ep.min_date < rec.date - '6 years'::INTERVAL THEN price / NULLIF(ru.cap6_avg_dividends, 0) ELSE NULL END AS capd6, CASE WHEN ep.min_date < rec.date - '7 years'::INTERVAL THEN price / NULLIF(ru.cap7_avg_dividends, 0) ELSE NULL END AS capd7, CASE WHEN ep.min_date < rec.date - '8 years'::INTERVAL THEN price / NULLIF(ru.cap8_avg_dividends, 0) ELSE NULL END AS capd8, CASE WHEN ep.min_date < rec.date - '9 years'::INTERVAL THEN price / NULLIF(ru.cap9_avg_dividends, 0) ELSE NULL END AS capd9, CASE WHEN ep.min_date < rec.date - '10 years'::INTERVAL THEN price / NULLIF(ru.cap10_avg_dividends, 0) ELSE NULL END AS capd10 FROM bloomberg.security_data rec -- Include the earliest point we have for this security in the record JOIN earliest_point ep ON ep.security_id = rec.security_id, -- LATERAL SELECT is executed for each row in the above query, with the row (rec) as a parameter LATERAL ( SELECT -- Price to Peak/Minimum <field> calculations max(earnings) AS max_earnings, min(earnings) AS min_earnings, max(book) AS max_book, min(book) AS min_book, max(sales) AS max_sales, min(sales) AS min_sales, max(cashflow) AS max_cashflow, min(cashflow) AS min_cashflow, max(dividends) AS max_dividends, min(dividends) AS min_dividends, -- Each of the following computes the aggregates for the -- CAPE/B/S/C/D cyclical adjustments. avg(earnings) FILTER (WHERE date >= rec.date - '1 years'::interval) AS cap1_avg_earnings, avg(book) FILTER (WHERE date >= rec.date - '1 years'::interval) AS cap1_avg_book, avg(sales) FILTER (WHERE date >= rec.date - '1 years'::interval) AS cap1_avg_sales, avg(cashflow) FILTER (WHERE date >= rec.date - '1 years'::interval) AS cap1_avg_cashflow, avg(dividends) FILTER (WHERE date >= rec.date - '1 years'::interval) AS cap1_avg_dividends, avg(earnings) FILTER (WHERE date >= rec.date - '2 years'::interval) AS cap2_avg_earnings, avg(book) FILTER (WHERE date >= rec.date - '2 years'::interval) AS cap2_avg_book, avg(sales) FILTER (WHERE date >= rec.date - '2 years'::interval) AS cap2_avg_sales, avg(cashflow) FILTER (WHERE date >= rec.date - '2 years'::interval) AS cap2_avg_cashflow, avg(dividends) FILTER (WHERE date >= rec.date - '2 years'::interval) AS cap2_avg_dividends, avg(earnings) FILTER (WHERE date >= rec.date - '3 years'::interval) AS cap3_avg_earnings, avg(book) FILTER (WHERE date >= rec.date - '3 years'::interval) AS cap3_avg_book, avg(sales) FILTER (WHERE date >= rec.date - '3 years'::interval) AS cap3_avg_sales, avg(cashflow) FILTER (WHERE date >= rec.date - '3 years'::interval) AS cap3_avg_cashflow, avg(dividends) FILTER (WHERE date >= rec.date - '3 years'::interval) AS cap3_avg_dividends, avg(earnings) FILTER (WHERE date >= rec.date - '4 years'::interval) AS cap4_avg_earnings, avg(book) FILTER (WHERE date >= rec.date - '4 years'::interval) AS cap4_avg_book, avg(sales) FILTER (WHERE date >= rec.date - '4 years'::interval) AS cap4_avg_sales, avg(cashflow) FILTER (WHERE date >= rec.date - '4 years'::interval) AS cap4_avg_cashflow, avg(dividends) FILTER (WHERE date >= rec.date - '4 years'::interval) AS cap4_avg_dividends, avg(earnings) FILTER (WHERE date >= rec.date - '5 years'::interval) AS cap5_avg_earnings, avg(book) FILTER (WHERE date >= rec.date - '5 years'::interval) AS cap5_avg_book, avg(sales) FILTER (WHERE date >= rec.date - '5 years'::interval) AS cap5_avg_sales, avg(cashflow) FILTER (WHERE date >= rec.date - '5 years'::interval) AS cap5_avg_cashflow, avg(dividends) FILTER (WHERE date >= rec.date - '5 years'::interval) AS cap5_avg_dividends, avg(earnings) FILTER (WHERE date >= rec.date - '6 years'::interval) AS cap6_avg_earnings, avg(book) FILTER (WHERE date >= rec.date - '6 years'::interval) AS cap6_avg_book, avg(sales) FILTER (WHERE date >= rec.date - '6 years'::interval) AS cap6_avg_sales, avg(cashflow) FILTER (WHERE date >= rec.date - '6 years'::interval) AS cap6_avg_cashflow, avg(dividends) FILTER (WHERE date >= rec.date - '6 years'::interval) AS cap6_avg_dividends, avg(earnings) FILTER (WHERE date >= rec.date - '7 years'::interval) AS cap7_avg_earnings, avg(book) FILTER (WHERE date >= rec.date - '7 years'::interval) AS cap7_avg_book, avg(sales) FILTER (WHERE date >= rec.date - '7 years'::interval) AS cap7_avg_sales, avg(cashflow) FILTER (WHERE date >= rec.date - '7 years'::interval) AS cap7_avg_cashflow, avg(dividends) FILTER (WHERE date >= rec.date - '7 years'::interval) AS cap7_avg_dividends, avg(earnings) FILTER (WHERE date >= rec.date - '8 years'::interval) AS cap8_avg_earnings, avg(book) FILTER (WHERE date >= rec.date - '8 years'::interval) AS cap8_avg_book, avg(sales) FILTER (WHERE date >= rec.date - '8 years'::interval) AS cap8_avg_sales, avg(cashflow) FILTER (WHERE date >= rec.date - '8 years'::interval) AS cap8_avg_cashflow, avg(dividends) FILTER (WHERE date >= rec.date - '8 years'::interval) AS cap8_avg_dividends, avg(earnings) FILTER (WHERE date >= rec.date - '9 years'::interval) AS cap9_avg_earnings, avg(book) FILTER (WHERE date >= rec.date - '9 years'::interval) AS cap9_avg_book, avg(sales) FILTER (WHERE date >= rec.date - '9 years'::interval) AS cap9_avg_sales, avg(cashflow) FILTER (WHERE date >= rec.date - '9 years'::interval) AS cap9_avg_cashflow, avg(dividends) FILTER (WHERE date >= rec.date - '9 years'::interval) AS cap9_avg_dividends, avg(earnings) AS cap10_avg_earnings, avg(book) AS cap10_avg_book, avg(sales) AS cap10_avg_sales, avg(cashflow) AS cap10_avg_cashflow, avg(dividends) AS cap10_avg_dividends FROM bloomberg.security_data DATA WHERE security_id = rec.security_id AND date >= rec.date - '10 years'::interval AND date <= rec.date ) ru;
因为我是 PostgreSQL 的新手,所以任何关于如何加快速度的想法都将不胜感激。
以下是数据库设置供参考:
CREATE SCHEMA bloomberg; CREATE TABLE bloomberg.securities ( security_id character varying(45) PRIMARY KEY, name_short character varying(45) NOT NULL, name character varying(45) NOT NULL, name_security character varying(45) NOT NULL ); CREATE TABLE bloomberg.security_data ( record_id bigserial PRIMARY KEY, date date NOT NULL, security_id character varying(45) NOT NULL, price double precision, total_return double precision, earnings double precision, book double precision, sales double precision, cashflow double precision, dividends double precision, CONSTRAINT security_id FOREIGN KEY (security_id) REFERENCES bloomberg.securities (security_id) MATCH SIMPLE ON UPDATE CASCADE ON DELETE CASCADE ); CREATE INDEX security_data_data on bloomberg.security_data (date); CREATE INDEX security_data_security_id on bloomberg.security_data (security_id);
这应该是带有LATERAL子查询的更快变体。未经测试。
SELECT s.record_id, s.security_id, s.date , s.price / l.pmax AS price_to_peak_earnings , s.price / l.pmin AS price_to_minimum_earnings -- , ... , s.price / l.cape1 AS cape1 , s.price / l.cape2 AS cape2 -- , ... , s.price / l.cape10 AS cape10 , s.price / l.capb1 AS capb1 , s.price / l.capb2 AS capb2 -- , ... , s.price / l.capb10 AS capb10 -- , ... FROM ( SELECT * , (date - interval '1 y')::date AS date1 , (date - interval '2 y')::date AS date2 -- ... , (date - interval '10 y')::date AS date10 FROM ( SELECT *, min(date) OVER (PARTITION BY security_id) AS min_date FROM security_data ) s1 ) s LEFT JOIN LATERAL ( SELECT CASE WHEN s.date10 >= s.min_date THEN NULLIF(max(earnings) , 0) END AS pmax , CASE WHEN s.date10 >= s.min_date THEN NULLIF(min(earnings) , 0) END AS pmin -- ... , NULLIF(avg(earnings) FILTER (WHERE date >= s.date1), 0) AS cape1 -- no case , CASE WHEN s.date2 >= s.min_date THEN NULLIF(avg(earnings) FILTER (WHERE date >= s.date2), 0) END AS cape2 -- ... , CASE WHEN s.date10 >= s.min_date THEN NULLIF(avg(earnings) , 0) END AS cape10 -- no filter , NULLIF(avg(book) FILTER (WHERE date >= s.date1), 0) AS capb1 , CASE WHEN s.date2 >= s.min_date THEN NULLIF(avg(book) FILTER (WHERE date >= s.date2), 0) END AS capb2 -- ... , CASE WHEN s.date10 >= s.min_date THEN NULLIF(avg(book) , 0) END AS capb10 -- ... FROM security_data WHERE security_id = s.security_id AND date >= s.date10 AND date < s.date ) l ON s.date1 >= s.min_date -- no computations if < 1 year of trailing data ORDER BY s.security_id, s.date;
仍然不会很快,因为每一行都需要多个单独的聚合。这里的瓶颈将是 CPU。