提交 e17c6f9a 编写于 作者: D Dhanashree Kashid 提交者: Sambitesh Dash

Fix volatile functions handling by ORCA

Following commits have been cherry-picked again:

b1f543f3.

b0359e69.

a341621d.

The contrib/dblink tests were failing with ORCA after the above commits.
The issue has now been fixed in ORCA v3.1.0, so we have re-enabled these
commits and bumped the ORCA version.
上级 1d254cf1
......@@ -40,10 +40,10 @@ AC_RUN_IFELSE([AC_LANG_PROGRAM([[
#include <string.h>
]],
[
return strncmp("2.75.", GPORCA_VERSION_STRING, 5);
return strncmp("3.1.", GPORCA_VERSION_STRING, 4);
])],
[AC_MSG_RESULT([[ok]])],
[AC_MSG_ERROR([Your ORCA version is expected to be 2.75.XXX])]
[AC_MSG_ERROR([Your ORCA version is expected to be 3.1.XXX])]
)
AC_LANG_POP([C++])
])# PGAC_CHECK_ORCA_VERSION
......
......@@ -13625,7 +13625,7 @@ int
main ()
{
return strncmp("2.75.", GPORCA_VERSION_STRING, 5);
return strncmp("3.1.", GPORCA_VERSION_STRING, 4);
;
return 0;
......@@ -13635,7 +13635,7 @@ if ac_fn_cxx_try_run "$LINENO"; then :
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ok" >&5
$as_echo "ok" >&6; }
else
as_fn_error $? "Your ORCA version is expected to be 2.75.XXX" "$LINENO" 5
as_fn_error $? "Your ORCA version is expected to be 3.1.XXX" "$LINENO" 5
fi
rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
......
[requires]
orca/v2.75.0@gpdb/stable
orca/v3.1.0@gpdb/stable
[imports]
include, * -> build/include
......
......@@ -121,7 +121,7 @@ sync_tools: opt_write_test /opt/releng/apache-ant
-Divyrepo.user=$(IVYREPO_USER) -Divyrepo.passwd="$(IVYREPO_PASSWD)" -quiet resolve);
ifeq "$(findstring aix,$(BLD_ARCH))" ""
LD_LIBRARY_PATH='' wget --no-check-certificate -q -O - https://github.com/greenplum-db/gporca/releases/download/v2.75.0/bin_orca_centos5_release.tar.gz | tar zxf - -C $(BLD_TOP)/ext/$(BLD_ARCH)
LD_LIBRARY_PATH='' wget --no-check-certificate -q -O - https://github.com/greenplum-db/gporca/releases/download/v3.1.0/bin_orca_centos5_release.tar.gz | tar zxf - -C $(BLD_TOP)/ext/$(BLD_ARCH)
endif
clean_tools: opt_write_test
......
......@@ -316,6 +316,13 @@ CConfigParamMapping::SConfigMappingElem CConfigParamMapping::m_elements[] =
GPOS_WSZ_LIT("Enable motion hazard handling during NLJ optimization and generate streaming material when appropriate")
},
{
EopttraceDisableNonMasterGatherForDML,
&optimizer_enable_gather_on_segment_for_dml,
true, // m_fNegate
GPOS_WSZ_LIT("Enable DML optimization by enforcing a non-master gather when appropriate")
},
{
EopttraceEnforceCorrelatedExecution,
&optimizer_enforce_subplans,
......
......@@ -2797,6 +2797,27 @@ gpdb::CdbHashConst
return 0;
}
// Pick a segment from a pool of num_segments segments using GPDB's
// cdbhash machinery. cdbhashnokey() hashes without any key value --
// presumably seeding from a random/session-local source so that
// successive calls spread the chosen segment across the pool
// (TODO confirm against cdbhashnokey's definition).
int32
gpdb::CdbHashRandom
(
int num_segments
)
{
// NOTE(review): GP_WRAP_START/GP_WRAP_END appear to convert GPDB
// errors into ORCA exceptions; on error the wrapped body is
// abandoned and we fall through to the default return below --
// verify against the macro definitions.
GP_WRAP_START;
{
// Build a hash state sized for num_segments, initialize it, hash
// with no key, and reduce to a segment index. cdbhashreduce is
// expected to yield a value in [0, num_segments) -- the caller in
// TranslateDXLMotion asserts the result is >= 0.
CdbHash *pcdbhash = makeCdbHash(num_segments);
cdbhashinit(pcdbhash);
cdbhashnokey(pcdbhash);
return cdbhashreduce(pcdbhash);
}
GP_WRAP_END;
return 0;
}
// hash a list of const values with GPDB's hash function
int32
gpdb::CdbHashConstList
......
......@@ -1960,6 +1960,21 @@ CTranslatorDXLToPlStmt::TranslateDXLMotion
motion->motionType = MOTIONTYPE_FIXED;
// get segment id
INT segid = CDXLPhysicalGatherMotion::Cast(motion_dxlop)->IOutputSegIdx();
// if it's a gather on a segment, pick a segment from
// available segments using GPDB's hash function.
// This function outputs a segment index in a round
// robin fashion using a random segment index as the
// starting point.
// This ensures that concurrent DML queries issued via
// a same session, use a different output segment each
// time a gather on segment is needed.
if (segid >= 0)
{
segid = gpdb::CdbHashRandom(m_num_of_segments);
GPOS_ASSERT(segid >= 0);
}
motion->numOutputSegs = 1;
motion->outputSegIdx = (INT *) gpdb::GPDBAlloc(sizeof(INT));
*(motion->outputSegIdx) = segid;
......
......@@ -388,6 +388,7 @@ bool optimizer_enable_direct_dispatch;
bool optimizer_enable_hashjoin_redistribute_broadcast_children;
bool optimizer_enable_broadcast_nestloop_outer_child;
bool optimizer_enable_streaming_material;
bool optimizer_enable_gather_on_segment_for_dml;
bool optimizer_enable_assert_maxonerow;
bool optimizer_enable_constant_expression_evaluation;
bool optimizer_enable_bitmapscan;
......@@ -2678,6 +2679,16 @@ struct config_bool ConfigureNamesBool_gp[] =
true,
NULL, NULL, NULL
},
{
{"optimizer_enable_gather_on_segment_for_dml", PGC_USERSET, DEVELOPER_OPTIONS,
gettext_noop("Enable DML optimization by enforcing a non-master gather in the optimizer."),
NULL,
GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE
},
&optimizer_enable_gather_on_segment_for_dml,
true,
NULL, NULL, NULL
},
{
{"optimizer_enforce_subplans", PGC_USERSET, DEVELOPER_OPTIONS,
gettext_noop("Enforce correlated execution in the optimizer"),
......
......@@ -594,7 +594,10 @@ namespace gpdb {
// hash a const value with GPDB's hash function
int32 CdbHashConst(Const *constant, int num_segments);
// pick a random segment from a pool of segments using GPDB's hash function
int32 CdbHashRandom(int num_segments);
// hash a list of const values with GPDB's hash function
int32 CdbHashConstList(List *constants, int num_segments);
......
......@@ -452,6 +452,7 @@ extern bool optimizer_enable_multiple_distinct_aggs;
extern bool optimizer_enable_hashjoin_redistribute_broadcast_children;
extern bool optimizer_enable_broadcast_nestloop_outer_child;
extern bool optimizer_enable_streaming_material;
extern bool optimizer_enable_gather_on_segment_for_dml;
extern bool optimizer_enable_assert_maxonerow;
extern bool optimizer_enable_constant_expression_evaluation;
extern bool optimizer_enable_bitmapscan;
......
......@@ -10665,6 +10665,106 @@ select c1 from t_outer where not c1 =all (select c2 from t_inner);
(10 rows)
reset optimizer_enable_streaming_material;
--
-- Test to ensure sane behavior when DML queries are optimized by ORCA by
-- enforcing a non-master gather motion, controlled by
-- optimizer_enable_gather_on_segment_for_DML GUC
--
--
-- CTAS with global-local aggregation
--
-- start_ignore
create table test1 (a int, b int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into test1 select generate_series(1,100),generate_series(1,100);
-- end_ignore
create table t_new as select avg(a) from test1 join (select i from unnest(array[1,2,3]) i) t on (test1.a = t.i);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'avg' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
select * from t_new;
avg
--------------------
2.0000000000000000
(1 row)
-- start_ignore
drop table t_new;
set optimizer_enable_gather_on_segment_for_DML=off;
-- end_ignore
create table t_new as select avg(a) from test1 join (select i from unnest(array[1,2,3]) i) t on (test1.a = t.i);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'avg' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
select * from t_new;
avg
--------------------
2.0000000000000000
(1 row)
-- start_ignore
reset optimizer_enable_gather_on_segment_for_DML;
-- end_ignore
--
-- Insert with outer references in the subquery
--
-- start_ignore
create table x_tab(a int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table y_tab(a int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table z_tab(a int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into x_tab values(1);
insert into y_tab values(0);
insert into z_tab values(1);
-- end_ignore
insert into x_tab select * from x_tab where exists (select * from x_tab where x_tab.a = (select x_tab.a + y_tab.a from y_tab));
select * from x_tab;
a
---
1
1
(2 rows)
--
-- Insert with Union All with an universal child
--
insert into y_tab select 1 union all select a from x_tab limit 10;
select * from y_tab;
a
---
1
1
1
0
(4 rows)
--
-- Insert with a function containing a SQL
--
create or replace function test_func_pg_stats()
returns integer
as $$ declare cnt int; begin execute 'select count(*) from pg_statistic' into cnt; return cnt; end $$
language plpgsql volatile READS SQL DATA;
insert into y_tab select test_func_pg_stats() from x_tab limit 2;
select count(*) from y_tab;
count
-------
6
(1 row)
--
-- Delete with Hash Join with a universal child
--
delete from x_tab where exists (select z_tab.a from z_tab join (select 1 as g) as tab on z_tab.a = tab.g);
select * from x_tab;
a
---
(0 rows)
-- start_ignore
drop table bar;
-- end_ignore
......
......@@ -10730,6 +10730,104 @@ select c1 from t_outer where not c1 =all (select c2 from t_inner);
(10 rows)
reset optimizer_enable_streaming_material;
--
-- Test to ensure sane behavior when DML queries are optimized by ORCA by
-- enforcing a non-master gather motion, controlled by
-- optimizer_enable_gather_on_segment_for_DML GUC
--
--
-- CTAS with global-local aggregation
--
-- start_ignore
create table test1 (a int, b int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into test1 select generate_series(1,100),generate_series(1,100);
-- end_ignore
create table t_new as select avg(a) from test1 join (select i from unnest(array[1,2,3]) i) t on (test1.a = t.i);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy entry.
select * from t_new;
avg
--------------------
2.0000000000000000
(1 row)
-- start_ignore
drop table t_new;
set optimizer_enable_gather_on_segment_for_DML=off;
-- end_ignore
create table t_new as select avg(a) from test1 join (select i from unnest(array[1,2,3]) i) t on (test1.a = t.i);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy entry.
select * from t_new;
avg
--------------------
2.0000000000000000
(1 row)
-- start_ignore
reset optimizer_enable_gather_on_segment_for_DML;
-- end_ignore
--
-- Insert with outer references in the subquery
--
-- start_ignore
create table x_tab(a int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table y_tab(a int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table z_tab(a int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into x_tab values(1);
insert into y_tab values(0);
insert into z_tab values(1);
-- end_ignore
insert into x_tab select * from x_tab where exists (select * from x_tab where x_tab.a = (select x_tab.a + y_tab.a from y_tab));
select * from x_tab;
a
---
1
1
(2 rows)
--
-- Insert with Union All with an universal child
--
insert into y_tab select 1 union all select a from x_tab limit 10;
select * from y_tab;
a
---
0
1
1
1
(4 rows)
--
-- Insert with a function containing a SQL
--
create or replace function test_func_pg_stats()
returns integer
as $$ declare cnt int; begin execute 'select count(*) from pg_statistic' into cnt; return cnt; end $$
language plpgsql volatile READS SQL DATA;
insert into y_tab select test_func_pg_stats() from x_tab limit 2;
select count(*) from y_tab;
count
-------
6
(1 row)
--
-- Delete with Hash Join with a universal child
--
delete from x_tab where exists (select z_tab.a from z_tab join (select 1 as g) as tab on z_tab.a = tab.g);
select * from x_tab;
a
---
(0 rows)
-- start_ignore
drop table bar;
ERROR: table "bar" does not exist
......
......@@ -107,8 +107,8 @@ WHERE t1.user_vie_project_code_pk = keo1.user_vie_project_code_pk;
-> Hash
-> Broadcast Motion 3:3 (slice7; segments: 3)
-> Hash Join
Hash Cond: ((keo1_1.user_vie_project_code_pk)::text = (keo2.projects_pk)::text)
-> Redistribute Motion 1:3 (slice5)
Hash Cond: keo1_1.user_vie_project_code_pk::text = keo2.projects_pk::text
-> Redistribute Motion 1:3 (slice5; segments: 1)
-> Hash Join
Hash Cond: ((keo1_1.user_vie_fiscal_year_period_sk)::text = (max((keo3.sky_per)::text)))
-> Gather Motion 3:1 (slice1; segments: 3)
......@@ -169,7 +169,7 @@ EXPLAIN (COSTS OFF) DELETE FROM keo5 WHERE x IN (SELECT x FROM keo5 WHERE EXISTS
Join Filter: true
-> Table Scan on keo5 keo5_2
-> Materialize
-> Broadcast Motion 1:3 (slice2)
-> Broadcast Motion 1:3 (slice2; segments: 1)
-> Limit
-> Gather Motion 3:1 (slice1; segments: 3)
-> Table Scan on keo5 keo5_1
......
......@@ -1910,6 +1910,72 @@ set optimizer_enable_streaming_material = off;
select c1 from t_outer where not c1 =all (select c2 from t_inner);
reset optimizer_enable_streaming_material;
--
-- Test to ensure sane behavior when DML queries are optimized by ORCA by
-- enforcing a non-master gather motion, controlled by
-- optimizer_enable_gather_on_segment_for_DML GUC
--
--
-- CTAS with global-local aggregation
--
-- start_ignore
create table test1 (a int, b int);
insert into test1 select generate_series(1,100),generate_series(1,100);
-- end_ignore
create table t_new as select avg(a) from test1 join (select i from unnest(array[1,2,3]) i) t on (test1.a = t.i);
select * from t_new;
-- start_ignore
drop table t_new;
set optimizer_enable_gather_on_segment_for_DML=off;
-- end_ignore
create table t_new as select avg(a) from test1 join (select i from unnest(array[1,2,3]) i) t on (test1.a = t.i);
select * from t_new;
-- start_ignore
reset optimizer_enable_gather_on_segment_for_DML;
-- end_ignore
--
-- Insert with outer references in the subquery
--
-- start_ignore
create table x_tab(a int);
create table y_tab(a int);
create table z_tab(a int);
insert into x_tab values(1);
insert into y_tab values(0);
insert into z_tab values(1);
-- end_ignore
insert into x_tab select * from x_tab where exists (select * from x_tab where x_tab.a = (select x_tab.a + y_tab.a from y_tab));
select * from x_tab;
--
-- Insert with Union All with an universal child
--
insert into y_tab select 1 union all select a from x_tab limit 10;
select * from y_tab;
--
-- Insert with a function containing a SQL
--
create or replace function test_func_pg_stats()
returns integer
as $$ declare cnt int; begin execute 'select count(*) from pg_statistic' into cnt; return cnt; end $$
language plpgsql volatile READS SQL DATA;
insert into y_tab select test_func_pg_stats() from x_tab limit 2;
select count(*) from y_tab;
--
-- Delete with Hash Join with a universal child
--
delete from x_tab where exists (select z_tab.a from z_tab join (select 1 as g) as tab on z_tab.a = tab.g);
select * from x_tab;
-- start_ignore
drop table bar;
-- end_ignore
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册