Commit 4a5c58a5 authored by Bhuvnesh Chaudhary, committed by Bhuvnesh

Fix NDVRemain and FreqRemain calculation

For text, varchar, char and bpchar columns, ORCA does not collect MCV and
histogram information, so the calculation of NDVRemain and FreqRemain must
be updated to account for this.

For such columns, NDVRemain is the stadistinct value available in
pg_statistic, and FreqRemain is everything except the NULL frequency.

Earlier, NDVRemain and FreqRemain for such columns would be 0, resulting in
poor cardinality estimates and suboptimal plans.
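
As a worked illustration with hypothetical numbers (not values taken from
this commit): suppose a varchar column's pg_statistic row has
stadistinct = 100 and stanullfrac = 0.1. With this change ORCA derives
NDVRemain = 100 and FreqRemain = 1 - 0.1 = 0.9, where previously both came
out as 0. The inputs can be inspected directly:

    -- 'some_table' is a placeholder relation name for illustration
    SELECT staattnum, stadistinct, stanullfrac
    FROM pg_statistic
    WHERE starelid = 'some_table'::regclass;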
Signed-off-by: Ekta Khanna <ekhanna@pivotal.io>
Parent 0a5756cc
@@ -2477,10 +2477,11 @@ CTranslatorRelcacheToDXL::PimdobjColStats
     CDouble dNDVBuckets(0.0);
     CDouble dFreqBuckets(0.0);
+    CDouble dDistinctRemain(0.0);
+    CDouble dFreqRemain(0.0);
+    // We only want to create statistics buckets if the column is NOT a text, varchar, char or bpchar type
+    // For the above column types we will use NDVRemain and NullFreq to do cardinality estimation.
+    if (CTranslatorUtils::FCreateStatsBucket(oidAttType))
+    {
     // transform all the bits and pieces from pg_statistic
@@ -2511,18 +2512,23 @@ CTranslatorRelcacheToDXL::PimdobjColStats
         CUtils::AddRefAppend(pdrgpdxlbucket, pdrgpdxlbucketTransformed);
         pdrgpdxlbucketTransformed->Release();
     }
-    // there will be remaining tuples if the merged histogram and the NULLS do not cover
-    // the total number of distinct values
-    CDouble dDistinctRemain(0.0);
-    CDouble dFreqRemain(0.0);
-    if ((1 - CStatistics::DEpsilon > dFreqBuckets + dNullFrequency) &&
-        (0 < dDistinct - dNDVBuckets - iNullNDV))
+        // there will be remaining tuples if the merged histogram and the NULLS do not cover
+        // the total number of distinct values
+        if ((1 - CStatistics::DEpsilon > dFreqBuckets + dNullFrequency) &&
+            (0 < dDistinct - dNDVBuckets - iNullNDV))
+        {
+            dDistinctRemain = std::max(CDouble(0.0), (dDistinct - dNDVBuckets - iNullNDV));
+            dFreqRemain = std::max(CDouble(0.0), (1 - dFreqBuckets - dNullFrequency));
+        }
+    }
+    else
     {
-        dDistinctRemain = std::max(CDouble(0.0), (dDistinct - dNDVBuckets - iNullNDV));
-        dFreqRemain = std::max(CDouble(0.0), (1 - dFreqBuckets - dNullFrequency));
+        // in case of text, varchar, char or bpchar, there are no stats buckets, so the
+        // remaining frequency is everything excluding NULLs, and distinct remaining is the
+        // stadistinct as available in pg_statistic
+        dDistinctRemain = dDistinct;
+        dFreqRemain = 1 - dNullFrequency;
     }
     // free up allocated datum and float4 arrays
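
For the regression tests below, the intended effect can be sketched from the
statistics ANALYZE collects (expected values reasoned from the test data, not
output captured by this commit): column b of test_join_card2 holds 5820 copies
of 'a' plus 179 NULLs, so pg_stats should report n_distinct = 1 and a
null_frac of about 0.03. The old code gave this varchar column NDVRemain = 0
and FreqRemain = 0; with the fix it gets NDVRemain = 1 and a FreqRemain of
about 0.97.

    -- inspect the optimizer-visible statistics for the join column
    SELECT n_distinct, null_frac
    FROM pg_stats
    WHERE tablename = 'test_join_card2' AND attname = 'b';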
@@ -447,3 +447,49 @@ SELECT * FROM test_broken_stats t1, good_tab t2 WHERE t1.b = t2.b;
 (0 rows)
 RESET allow_system_table_mods;
+-- cardinality estimation for join on varchar, text, char and bpchar columns must account for FreqRemaining and NDVRemaining
+-- resulting in better cardinality numbers for the joins in orca
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+NOTICE: table "test_join_card1" does not exist, skipping
+DROP TABLE IF EXISTS test_join_card2;
+NOTICE: table "test_join_card2" does not exist, skipping
+-- end_ignore
+CREATE TABLE test_join_card1 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE TABLE test_join_card2 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE TABLE test_join_card3 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+INSERT INTO test_join_card1 SELECT i::text, i::text FROM generate_series(1, 20000)i;
+INSERT INTO test_join_card2 SELECT i::text, NULL FROM generate_series(1, 179)i;
+INSERT INTO test_join_card2 SELECT 1::text, 'a' FROM generate_series(1, 5820)i;
+INSERT INTO test_join_card3 SELECT i::text, i::text FROM generate_series(1,10000)i;
+ANALYZE test_join_card1;
+ANALYZE test_join_card2;
+ANALYZE test_join_card3;
+EXPLAIN SELECT * FROM test_join_card1 t1, test_join_card2 t2, test_join_card3 t3 WHERE t1.b = t2.b and t3.b = t2.b;
+                                                  QUERY PLAN
+------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1 (slice3; segments: 3) (cost=717.00..1479.01 rows=2910 width=22)
+   ->  Hash Join (cost=717.00..1479.01 rows=970 width=22)
+         Hash Cond: t2.b::text = t1.b::text
+         ->  Broadcast Motion 3:3 (slice2; segments: 3) (cost=240.00..907.44 rows=5820 width=12)
+               ->  Hash Join (cost=240.00..674.64 rows=1940 width=12)
+                     Hash Cond: t2.b::text = t3.b::text
+                     ->  Broadcast Motion 3:3 (slice1; segments: 3) (cost=0.00..308.95 rows=5999 width=4)
+                           ->  Seq Scan on test_join_card2 t2 (cost=0.00..68.99 rows=2000 width=4)
+                     ->  Hash (cost=115.00..115.00 rows=3334 width=8)
+                           ->  Seq Scan on test_join_card3 t3 (cost=0.00..115.00 rows=3334 width=8)
+         ->  Hash (cost=227.00..227.00 rows=6667 width=10)
+               ->  Seq Scan on test_join_card1 t1 (cost=0.00..227.00 rows=6667 width=10)
+ Optimizer: legacy query optimizer
+(13 rows)
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+DROP TABLE IF EXISTS test_join_card2;
+-- end_ignore
@@ -447,3 +447,53 @@ SELECT * FROM test_broken_stats t1, good_tab t2 WHERE t1.b = t2.b;
 (0 rows)
 RESET allow_system_table_mods;
+-- cardinality estimation for join on varchar, text, char and bpchar columns must account for FreqRemaining and NDVRemaining
+-- resulting in better cardinality numbers for the joins in orca
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+NOTICE: table "test_join_card1" does not exist, skipping
+DROP TABLE IF EXISTS test_join_card2;
+NOTICE: table "test_join_card2" does not exist, skipping
+-- end_ignore
+CREATE TABLE test_join_card1 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE TABLE test_join_card2 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE TABLE test_join_card3 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+INSERT INTO test_join_card1 SELECT i::text, i::text FROM generate_series(1, 20000)i;
+INSERT INTO test_join_card2 SELECT i::text, NULL FROM generate_series(1, 179)i;
+INSERT INTO test_join_card2 SELECT 1::text, 'a' FROM generate_series(1, 5820)i;
+INSERT INTO test_join_card3 SELECT i::text, i::text FROM generate_series(1,10000)i;
+ANALYZE test_join_card1;
+ANALYZE test_join_card2;
+ANALYZE test_join_card3;
+EXPLAIN SELECT * FROM test_join_card1 t1, test_join_card2 t2, test_join_card3 t3 WHERE t1.b = t2.b and t3.b = t2.b;
+                                                   QUERY PLAN
+---------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1 (slice4; segments: 3) (cost=0.00..1297.58 rows=5999 width=22)
+   ->  Hash Join (cost=0.00..1297.09 rows=2000 width=22)
+         Hash Cond: test_join_card2.b::text = test_join_card3.b::text
+         ->  Hash Join (cost=0.00..864.43 rows=2000 width=14)
+               Hash Cond: test_join_card1.b::text = test_join_card2.b::text
+               ->  Redistribute Motion 3:3 (slice1; segments: 3) (cost=0.00..431.48 rows=6667 width=10)
+                     Hash Key: test_join_card1.b::text
+                     ->  Table Scan on test_join_card1 (cost=0.00..431.15 rows=6667 width=10)
+               ->  Hash (cost=431.08..431.08 rows=2000 width=4)
+                     ->  Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.08 rows=2000 width=4)
+                           Hash Key: test_join_card2.b
+                           ->  Table Scan on test_join_card2 (cost=0.00..431.04 rows=2000 width=4)
+         ->  Hash (cost=431.20..431.20 rows=3334 width=8)
+               ->  Redistribute Motion 3:3 (slice3; segments: 3) (cost=0.00..431.20 rows=3334 width=8)
+                     Hash Key: test_join_card3.b::text
+                     ->  Table Scan on test_join_card3 (cost=0.00..431.07 rows=3334 width=8)
+ Optimizer: PQO version 2.56.0
+(17 rows)
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+DROP TABLE IF EXISTS test_join_card2;
+-- end_ignore
@@ -277,3 +277,25 @@ UPDATE pg_statistic SET stavalues1='{1,2,3}'::int[] WHERE starelid ='bfv_statist
 SELECT * FROM test_broken_stats t1, good_tab t2 WHERE t1.b = t2.b;
 RESET allow_system_table_mods;
+-- cardinality estimation for join on varchar, text, char and bpchar columns must account for FreqRemaining and NDVRemaining
+-- resulting in better cardinality numbers for the joins in orca
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+DROP TABLE IF EXISTS test_join_card2;
+-- end_ignore
+CREATE TABLE test_join_card1 (a varchar, b varchar);
+CREATE TABLE test_join_card2 (a varchar, b varchar);
+CREATE TABLE test_join_card3 (a varchar, b varchar);
+INSERT INTO test_join_card1 SELECT i::text, i::text FROM generate_series(1, 20000)i;
+INSERT INTO test_join_card2 SELECT i::text, NULL FROM generate_series(1, 179)i;
+INSERT INTO test_join_card2 SELECT 1::text, 'a' FROM generate_series(1, 5820)i;
+INSERT INTO test_join_card3 SELECT i::text, i::text FROM generate_series(1,10000)i;
+ANALYZE test_join_card1;
+ANALYZE test_join_card2;
+ANALYZE test_join_card3;
+EXPLAIN SELECT * FROM test_join_card1 t1, test_join_card2 t2, test_join_card3 t3 WHERE t1.b = t2.b and t3.b = t2.b;
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+DROP TABLE IF EXISTS test_join_card2;
+-- end_ignore
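
Since the commit adds matching expected output for both optimizers, one way
to reproduce the comparison by hand is to toggle the planner per session (a
sketch; optimizer is the standard GPDB GUC that enables ORCA/PQO):

    -- compare the ORCA plan and the legacy plan for the test query
    SET optimizer = on;   -- ORCA (PQO), which now uses NDVRemain/FreqRemain
    EXPLAIN SELECT * FROM test_join_card1 t1, test_join_card2 t2, test_join_card3 t3
        WHERE t1.b = t2.b AND t3.b = t2.b;
    SET optimizer = off;  -- legacy query optimizer
    EXPLAIN SELECT * FROM test_join_card1 t1, test_join_card2 t2, test_join_card3 t3
        WHERE t1.b = t2.b AND t3.b = t2.b;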