diff --git a/src/backend/cdb/cdbllize.c b/src/backend/cdb/cdbllize.c index f7db8e65a6bf940e154935352acfb25daf9def12..ef3202daa9dadfb53dc3fce743b20590585df936 100644 --- a/src/backend/cdb/cdbllize.c +++ b/src/backend/cdb/cdbllize.c @@ -447,6 +447,12 @@ static Node *ParallelizeCorrelatedSubPlanUpdateFlowMutator(Node *node) * \_SeqScan (no quals) * This transformed plan can be executed in a parallel setting since the correlation * is now part of the result node which executes in the same slice as the outer plan node. + * + * XXX: This relies on the planner to not generate other kinds of scans, like + * IndexScans. We don't have the machinery in place to rescan those with different + * parameters. We could support e.g. IndexScans as long as the index qual doesn't + * refer to the outer parameter, but the planner isn't currently smart enough to + * distinguish that, so we just disable index scans altogether in a subplan. */ static Node* ParallelizeCorrelatedSubPlanMutator(Node *node, ParallelizeCorrelatedPlanWalkerContext *ctx) { @@ -475,7 +481,8 @@ static Node* ParallelizeCorrelatedSubPlanMutator(Node *node, ParallelizeCorrelat if (IsA(node, SeqScan) || IsA(node, AppendOnlyScan) || IsA(node, AOCSScan) - || IsA(node, ShareInputScan)) + || IsA(node, ShareInputScan) + || IsA(node, ExternalScan)) { Plan *scanPlan = (Plan *) node; /** diff --git a/src/test/regress/expected/subselect_gp2.out b/src/test/regress/expected/subselect_gp2.out new file mode 100644 index 0000000000000000000000000000000000000000..db30d47afe2f90f7c98458aa66d41cce283cbe83 --- /dev/null +++ b/src/test/regress/expected/subselect_gp2.out @@ -0,0 +1,32 @@ +-- Test using an external table in a subquery. +-- +-- We used to have a bug where the scan on the external table was not +-- broadcast to all nodes, so each segment scanned only its own portion +-- of the external table, when the scan was in a subquery. In that case, +-- the count(*) calculated for each value below was 1, but it should be +-- equal to the number of segments, because this external table produces +-- the same rows on every segment. +CREATE EXTERNAL WEB TABLE echotable (c1 int, c2 int, c3 int) EXECUTE +'echo "1,2,3"; echo "4,5,6";' FORMAT 'TEXT' (DELIMITER ','); +create table test_ext_foo (c1 int, c2 int4); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Greenplum Database data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into test_ext_foo select g, g from generate_series(1, 20) g; +-- This should return 2 and 5, as the two rows are duplicated in +-- every segment (assuming you have at least two segments in your +-- cluster). +select c2 from echotable group by c2 having count(*) >= 2; + c2 +---- + 2 + 5 +(2 rows) + +select * from test_ext_foo as o +where (select count(*) from echotable as i where i.c2 = o.c2) >= 2; + c1 | c2 +----+---- + 5 | 5 + 2 | 2 +(2 rows) + diff --git a/src/test/regress/greenplum_schedule b/src/test/regress/greenplum_schedule index 0814be6a259aa38c0cb565e4bb863d496294b219..1a230348c2771349b1790320617a7f460403bdcc 100755 --- a/src/test/regress/greenplum_schedule +++ b/src/test/regress/greenplum_schedule @@ -32,7 +32,7 @@ test: sort_finish_pending test: gpdiffcheck gptokencheck information_schema gp_hashagg sequence_gp tidscan -test: rangefuncs_cdb gp_dqa external_table subselect_gp indexjoin distributed_transactions olap_group olap_window_seq with_clause as_alias regex_gp partition1 +test: rangefuncs_cdb gp_dqa external_table subselect_gp subselect_gp2 indexjoin distributed_transactions olap_group olap_window_seq with_clause as_alias regex_gp partition1 # 'partition' runs for a long time, so try to keep it together with other # long-running tests. Unfortunately, 'partition' also assumes that there diff --git a/src/test/regress/sql/subselect_gp2.sql b/src/test/regress/sql/subselect_gp2.sql new file mode 100644 index 0000000000000000000000000000000000000000..790ba89251f4050aaddceb1971610ac342d828b4 --- /dev/null +++ b/src/test/regress/sql/subselect_gp2.sql @@ -0,0 +1,22 @@ +-- Test using an external table in a subquery. +-- +-- We used to have a bug where the scan on the external table was not +-- broadcast to all nodes, so each segment scanned only its own portion +-- of the external table, when the scan was in a subquery. In that case, +-- the count(*) calculated for each value below was 1, but it should be +-- equal to the number of segments, because this external table produces +-- the same rows on every segment. + +CREATE EXTERNAL WEB TABLE echotable (c1 int, c2 int, c3 int) EXECUTE +'echo "1,2,3"; echo "4,5,6";' FORMAT 'TEXT' (DELIMITER ','); + +create table test_ext_foo (c1 int, c2 int4); +insert into test_ext_foo select g, g from generate_series(1, 20) g; + +-- This should return 2 and 5, as the two rows are duplicated in +-- every segment (assuming you have at least two segments in your +-- cluster). +select c2 from echotable group by c2 having count(*) >= 2; + +select * from test_ext_foo as o +where (select count(*) from echotable as i where i.c2 = o.c2) >= 2;