假设我有下表:
child_id parent_id 1 2 2 3 3 - 4 5 5 6 6 - 7 8
并且我想创建下表:
child_id parent_id branch_id 1 2 1 2 3 1 3 - 1 4 5 2 5 6 2 6 - 2 7 8 3
其中branch_id表示通过parent_id链接在一起的分组。
但是, 不能保证行顺序 , 分支可能包含数百行 。这排除了LAG()函数的简单使用。
鉴于BigQuery的SQL的局限性,我该如何实现?
下面的示例说明了如何在这种情况下使用BigQuery脚本
DECLARE rows_count, run_away_stop INT64 DEFAULT 0; CREATE TEMP TABLE input AS SELECT 1 child_id, 2 parent_id UNION ALL SELECT 2, 3 UNION ALL SELECT 3, NULL UNION ALL SELECT 4, 5 UNION ALL SELECT 5, 6 UNION ALL SELECT 6, NULL UNION ALL SELECT 7, 8 ; CREATE TEMP TABLE ttt AS SELECT ARRAY(SELECT val FROM UNNEST([child_id, IFNULL(parent_id, child_id)]) val ORDER BY val ) arr FROM input; LOOP SET (run_away_stop, rows_count) = (SELECT AS STRUCT run_away_stop + 1, COUNT(1) FROM ttt); CREATE OR REPLACE TEMP TABLE ttt AS SELECT ANY_VALUE(arr) arr FROM ( SELECT ARRAY(SELECT DISTINCT val FROM UNNEST(arr) val ORDER BY val) arr FROM ( SELECT ANY_VALUE(arr1) arr1, ARRAY_CONCAT_AGG(arr) arr FROM ( SELECT t1.arr arr1, t2.arr arr2, ARRAY(SELECT DISTINCT val FROM UNNEST(ARRAY_CONCAT( t1.arr, t2.arr)) val ORDER BY val) arr FROM ttt t1, ttt t2 WHERE (SELECT COUNT(1) FROM UNNEST(t1.arr) val JOIN UNNEST(t2.arr) val USING(val)) > 0 ) GROUP BY FORMAT('%t', arr1) ) ) GROUP BY FORMAT('%t', arr); IF (rows_count = (SELECT COUNT(1) FROM ttt) AND run_away_stop > 1) OR run_away_stop > 10 THEN BREAK; END IF; END LOOP; SELECT input.*, branch_id FROM input JOIN ( SELECT ROW_NUMBER() OVER() AS branch_id, arr AS IDs FROM ttt ) ON child_id IN UNNEST(IDs)
最终输出
Row child_id parent_id branch_id 1 7 8 1 2 4 5 2 3 6 null 2 4 5 6 2 5 3 null 3 6 2 3 3 7 1 2 3