Fix using external table in a subplan.

ParallelizeCorrelatedSubPlanMutator() turns each Scan on a base relation into a "Result - Material - Broadcast - Scan" pattern, but it missed ExternalScans. External tables are supposed to be treated as distributed, i.e. each segment holds a different part of the external table, so they need to be treated like regular tables.

Fix using external table in a subplan.
ParallelizeCorrelatedSubPlanMutator() turns each Scan on a base relation into a "Result - Material - Broadcast - Scan" pattern, but it missed ExternalScans. External tables are supposed to be treated as distributed, i.e. each segment holds a different part of the external table, so they need to be treated like regular tables.
8ae5a93f · Heikki Linnakangas · 1be38f8f · 8ae5a93f · 8ae5a93f · 8ae5a93f
4 changed files
--- a/src/backend/cdb/cdbllize.c
+++ b/src/backend/cdb/cdbllize.c
@@ -447,6 +447,12 @@ static Node *ParallelizeCorrelatedSubPlanUpdateFlowMutator(Node *node)
 * 				 	 	 \_SeqScan (no quals)
 * 	This transformed plan can be executed in a parallel setting since the correlation
 * 	is now part of the result node which executes in the same slice as the outer plan node.
+ *
+ * XXX: This relies on the planner to not generate other kinds of scans, like
+ * IndexScans. We don't have the machinery in place to rescan those with different
+ * parameters. We could support e.g. IndexScans as long as the index qual doesn't
+ * refer to the outer parameter, but the planner isn't currently smart enough to
+ * distinguish that, so we just disable index scans altogether in a subplan.
 */
 static Node* ParallelizeCorrelatedSubPlanMutator(Node *node, ParallelizeCorrelatedPlanWalkerContext *ctx)
 {
@@ -475,7 +481,8 @@ static Node* ParallelizeCorrelatedSubPlanMutator(Node *node, ParallelizeCorrelat
 	if (IsA(node, SeqScan)
 		|| IsA(node, AppendOnlyScan)
 		|| IsA(node, AOCSScan)
-		|| IsA(node, ShareInputScan))
+		|| IsA(node, ShareInputScan)
+		|| IsA(node, ExternalScan))
 	{
 		Plan *scanPlan = (Plan *) node;
 		/**

--- a/src/test/regress/expected/subselect_gp2.out
+++ b/src/test/regress/expected/subselect_gp2.out
+-- Test using an external table in a subquery.
+--
+-- We used to have a bug where the scan on the external table was not
+-- broadcast to all nodes, so each segment scanned only its own portion
+-- of the external table, when the scan was in a subquery. In that case,
+-- the count(*) calculated for each value below was 1, but it should be
+-- equal to the number of segments, because this external table produces
+-- the same rows on every segment.
+CREATE EXTERNAL WEB TABLE echotable (c1 int, c2 int, c3 int) EXECUTE
+'echo "1,2,3"; echo "4,5,6";' FORMAT 'TEXT' (DELIMITER ',');
+create table test_ext_foo (c1 int, c2 int4);
+NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Greenplum Database data distribution key for this table.
+HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into test_ext_foo select g, g from generate_series(1, 20) g;
+-- This should return 2 and 5, as the two rows are duplicated in
+-- every segment (assuming you have at least two segments in your
+-- cluster).
+select c2 from echotable group by c2 having count(*) >= 2;
+ c2 
+----
+  2
+  5
+(2 rows)
+
+select * from test_ext_foo as o
+where (select count(*) from echotable as i where i.c2 = o.c2) >= 2;
+ c1 | c2 
+----+----
+  5 |  5
+  2 |  2
+(2 rows)
+
--- a/src/test/regress/greenplum_schedule
+++ b/src/test/regress/greenplum_schedule
@@ -32,7 +32,7 @@ test: sort_finish_pending

 test: gpdiffcheck gptokencheck information_schema gp_hashagg sequence_gp tidscan

-test: rangefuncs_cdb gp_dqa external_table subselect_gp indexjoin distributed_transactions olap_group olap_window_seq with_clause as_alias regex_gp partition1
+test: rangefuncs_cdb gp_dqa external_table subselect_gp subselect_gp2 indexjoin distributed_transactions olap_group olap_window_seq with_clause as_alias regex_gp partition1

 # 'partition' runs for a long time, so try to keep it together with other
 # long-running tests. Unfortunately, 'partition' also assumes that there

--- a/src/test/regress/sql/subselect_gp2.sql
+++ b/src/test/regress/sql/subselect_gp2.sql
+-- Test using an external table in a subquery.
+--
+-- We used to have a bug where the scan on the external table was not
+-- broadcast to all nodes, so each segment scanned only its own portion
+-- of the external table, when the scan was in a subquery. In that case,
+-- the count(*) calculated for each value below was 1, but it should be
+-- equal to the number of segments, because this external table produces
+-- the same rows on every segment.
+
+CREATE EXTERNAL WEB TABLE echotable (c1 int, c2 int, c3 int) EXECUTE
+'echo "1,2,3"; echo "4,5,6";' FORMAT 'TEXT' (DELIMITER ',');
+
+create table test_ext_foo (c1 int, c2 int4);
+insert into test_ext_foo select g, g from generate_series(1, 20) g;
+
+-- This should return 2 and 5, as the two rows are duplicated in
+-- every segment (assuming you have at least two segments in your
+-- cluster).
+select c2 from echotable group by c2 having count(*) >= 2;
+
+select * from test_ext_foo as o
+where (select count(*) from echotable as i where i.c2 = o.c2) >= 2;