diff --git a/src/backend/gpopt/translate/CTranslatorRelcacheToDXL.cpp b/src/backend/gpopt/translate/CTranslatorRelcacheToDXL.cpp
index 0bd720ea054d4b5225d59f6975f5c4fbfe3db033..8cecf3576bf7fbdfddc972ce41f3dd1db3861826 100644
--- a/src/backend/gpopt/translate/CTranslatorRelcacheToDXL.cpp
+++ b/src/backend/gpopt/translate/CTranslatorRelcacheToDXL.cpp
@@ -2477,10 +2477,11 @@ CTranslatorRelcacheToDXL::PimdobjColStats
 	CDouble dNDVBuckets(0.0);
 	CDouble dFreqBuckets(0.0);
+	CDouble dDistinctRemain(0.0);
+	CDouble dFreqRemain(0.0);
 
 	// We only want to create statistics buckets if the column is NOT a text, varchar, char or bpchar type
 	// For the above column types we will use NDVRemain and NullFreq to do cardinality estimation.
-
 	if (CTranslatorUtils::FCreateStatsBucket(oidAttType))
 	{
 		// transform all the bits and pieces from pg_statistic
@@ -2511,18 +2512,23 @@ CTranslatorRelcacheToDXL::PimdobjColStats
 		CUtils::AddRefAppend(pdrgpdxlbucket, pdrgpdxlbucketTransformed);
 		pdrgpdxlbucketTransformed->Release();
-	}
 
-	// there will be remaining tuples if the merged histogram and the NULLS do not cover
-	// the total number of distinct values
-	CDouble dDistinctRemain(0.0);
-	CDouble dFreqRemain(0.0);
-
-	if ((1 - CStatistics::DEpsilon > dFreqBuckets + dNullFrequency) &&
-		(0 < dDistinct - dNDVBuckets - iNullNDV))
+		// there will be remaining tuples if the merged histogram and the NULLs do not cover
+		// the total number of distinct values
+		if ((1 - CStatistics::DEpsilon > dFreqBuckets + dNullFrequency) &&
+			(0 < dDistinct - dNDVBuckets - iNullNDV))
+		{
+			dDistinctRemain = std::max(CDouble(0.0), (dDistinct - dNDVBuckets - iNullNDV));
+			dFreqRemain = std::max(CDouble(0.0), (1 - dFreqBuckets - dNullFrequency));
+		}
+	}
+	else
 	{
-		dDistinctRemain = std::max(CDouble(0.0), (dDistinct - dNDVBuckets - iNullNDV));
-		dFreqRemain = std::max(CDouble(0.0), (1 - dFreqBuckets - dNullFrequency));
+		// in case of text, varchar, char or bpchar columns there are no stats buckets, so the
+		// remaining frequency is everything excluding NULLs, and the remaining distinct values
+		// are the stadistinct as available in pg_statistic
+		dDistinctRemain = dDistinct;
+		dFreqRemain = 1 - dNullFrequency;
 	}
 
 	// free up allocated datum and float4 arrays
diff --git a/src/test/regress/expected/bfv_statistic.out b/src/test/regress/expected/bfv_statistic.out
index 9d58cf26f26fa3743406e40c981184ff28f2cf11..cee4632939d678b529f70eba7aba711486959b24 100644
--- a/src/test/regress/expected/bfv_statistic.out
+++ b/src/test/regress/expected/bfv_statistic.out
@@ -447,3 +447,50 @@ SELECT * FROM test_broken_stats t1, good_tab t2 WHERE t1.b = t2.b;
 (0 rows)
 
 RESET allow_system_table_mods;
+-- cardinality estimation for joins on varchar, text, char and bpchar columns must account for FreqRemain and NDVRemain,
+-- resulting in better cardinality numbers for the joins in orca
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+NOTICE: table "test_join_card1" does not exist, skipping
+DROP TABLE IF EXISTS test_join_card2;
+NOTICE: table "test_join_card2" does not exist, skipping
+-- end_ignore
+CREATE TABLE test_join_card1 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE TABLE test_join_card2 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE TABLE test_join_card3 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+INSERT INTO test_join_card1 SELECT i::text, i::text FROM generate_series(1, 20000)i;
+INSERT INTO test_join_card2 SELECT i::text, NULL FROM generate_series(1, 179)i;
+INSERT INTO test_join_card2 SELECT 1::text, 'a' FROM generate_series(1, 5820)i;
+INSERT INTO test_join_card3 SELECT i::text, i::text FROM generate_series(1, 10000)i;
+ANALYZE test_join_card1;
+ANALYZE test_join_card2;
+ANALYZE test_join_card3;
+EXPLAIN SELECT * FROM test_join_card1 t1, test_join_card2 t2, test_join_card3 t3 WHERE t1.b = t2.b and t3.b = t2.b;
+                                                 QUERY PLAN
+------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice3; segments: 3)  (cost=717.00..1479.01 rows=2910 width=22)
+   ->  Hash Join  (cost=717.00..1479.01 rows=970 width=22)
+         Hash Cond: t2.b::text = t1.b::text
+         ->  Broadcast Motion 3:3  (slice2; segments: 3)  (cost=240.00..907.44 rows=5820 width=12)
+               ->  Hash Join  (cost=240.00..674.64 rows=1940 width=12)
+                     Hash Cond: t2.b::text = t3.b::text
+                     ->  Broadcast Motion 3:3  (slice1; segments: 3)  (cost=0.00..308.95 rows=5999 width=4)
+                           ->  Seq Scan on test_join_card2 t2  (cost=0.00..68.99 rows=2000 width=4)
+                     ->  Hash  (cost=115.00..115.00 rows=3334 width=8)
+                           ->  Seq Scan on test_join_card3 t3  (cost=0.00..115.00 rows=3334 width=8)
+         ->  Hash  (cost=227.00..227.00 rows=6667 width=10)
+               ->  Seq Scan on test_join_card1 t1  (cost=0.00..227.00 rows=6667 width=10)
+ Optimizer: legacy query optimizer
+(13 rows)
+
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+DROP TABLE IF EXISTS test_join_card2;
+DROP TABLE IF EXISTS test_join_card3;
+-- end_ignore
diff --git a/src/test/regress/expected/bfv_statistic_optimizer.out b/src/test/regress/expected/bfv_statistic_optimizer.out
index 9f1b53fdd02f64bae628cf3087a6315d0ab944fe..6b3817a0c685dc9e1ae60207208ecfb34a3a118c 100644
--- a/src/test/regress/expected/bfv_statistic_optimizer.out
+++ b/src/test/regress/expected/bfv_statistic_optimizer.out
@@ -447,3 +447,54 @@ SELECT * FROM test_broken_stats t1, good_tab t2 WHERE t1.b = t2.b;
 (0 rows)
 
 RESET allow_system_table_mods;
+-- cardinality estimation for joins on varchar, text, char and bpchar columns must account for FreqRemain and NDVRemain,
+-- resulting in better cardinality numbers for the joins in orca
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+NOTICE: table "test_join_card1" does not exist, skipping
+DROP TABLE IF EXISTS test_join_card2;
+NOTICE: table "test_join_card2" does not exist, skipping
+-- end_ignore
+CREATE TABLE test_join_card1 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE TABLE test_join_card2 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+CREATE TABLE test_join_card3 (a varchar, b varchar);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
+INSERT INTO test_join_card1 SELECT i::text, i::text FROM generate_series(1, 20000)i;
+INSERT INTO test_join_card2 SELECT i::text, NULL FROM generate_series(1, 179)i;
+INSERT INTO test_join_card2 SELECT 1::text, 'a' FROM generate_series(1, 5820)i;
+INSERT INTO test_join_card3 SELECT i::text, i::text FROM generate_series(1, 10000)i;
+ANALYZE test_join_card1;
+ANALYZE test_join_card2;
+ANALYZE test_join_card3;
+EXPLAIN SELECT * FROM test_join_card1 t1, test_join_card2 t2, test_join_card3 t3 WHERE t1.b = t2.b and t3.b = t2.b;
+                                                   QUERY PLAN
+---------------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice4; segments: 3)  (cost=0.00..1297.58 rows=5999 width=22)
+   ->  Hash Join  (cost=0.00..1297.09 rows=2000 width=22)
+         Hash Cond: test_join_card2.b::text = test_join_card3.b::text
+         ->  Hash Join  (cost=0.00..864.43 rows=2000 width=14)
+               Hash Cond: test_join_card1.b::text = test_join_card2.b::text
+               ->  Redistribute Motion 3:3  (slice1; segments: 3)  (cost=0.00..431.48 rows=6667 width=10)
+                     Hash Key: test_join_card1.b::text
+                     ->  Table Scan on test_join_card1  (cost=0.00..431.15 rows=6667 width=10)
+               ->  Hash  (cost=431.08..431.08 rows=2000 width=4)
+                     ->  Redistribute Motion 3:3  (slice2; segments: 3)  (cost=0.00..431.08 rows=2000 width=4)
+                           Hash Key: test_join_card2.b
+                           ->  Table Scan on test_join_card2  (cost=0.00..431.04 rows=2000 width=4)
+         ->  Hash  (cost=431.20..431.20 rows=3334 width=8)
+               ->  Redistribute Motion 3:3  (slice3; segments: 3)  (cost=0.00..431.20 rows=3334 width=8)
+                     Hash Key: test_join_card3.b::text
+                     ->  Table Scan on test_join_card3  (cost=0.00..431.07 rows=3334 width=8)
+ Optimizer: PQO version 2.56.0
+(17 rows)
+
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+DROP TABLE IF EXISTS test_join_card2;
+DROP TABLE IF EXISTS test_join_card3;
+-- end_ignore
diff --git a/src/test/regress/sql/bfv_statistic.sql b/src/test/regress/sql/bfv_statistic.sql
index 0f18318f8de91383f894903e50c6448b9f8632f4..693fd963e391e3d8496f5062c3bdfa59ddb4e884 100644
--- a/src/test/regress/sql/bfv_statistic.sql
+++ b/src/test/regress/sql/bfv_statistic.sql
@@ -277,3 +277,26 @@ UPDATE pg_statistic SET stavalues1='{1,2,3}'::int[] WHERE starelid ='bfv_statist
 SELECT * FROM test_broken_stats t1, good_tab t2 WHERE t1.b = t2.b;
 
 RESET allow_system_table_mods;
+
+-- cardinality estimation for joins on varchar, text, char and bpchar columns must account for FreqRemain and NDVRemain,
+-- resulting in better cardinality numbers for the joins in orca
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+DROP TABLE IF EXISTS test_join_card2;
+-- end_ignore
+CREATE TABLE test_join_card1 (a varchar, b varchar);
+CREATE TABLE test_join_card2 (a varchar, b varchar);
+CREATE TABLE test_join_card3 (a varchar, b varchar);
+INSERT INTO test_join_card1 SELECT i::text, i::text FROM generate_series(1, 20000)i;
+INSERT INTO test_join_card2 SELECT i::text, NULL FROM generate_series(1, 179)i;
+INSERT INTO test_join_card2 SELECT 1::text, 'a' FROM generate_series(1, 5820)i;
+INSERT INTO test_join_card3 SELECT i::text, i::text FROM generate_series(1, 10000)i;
+ANALYZE test_join_card1;
+ANALYZE test_join_card2;
+ANALYZE test_join_card3;
+EXPLAIN SELECT * FROM test_join_card1 t1, test_join_card2 t2, test_join_card3 t3 WHERE t1.b = t2.b and t3.b = t2.b;
+-- start_ignore
+DROP TABLE IF EXISTS test_join_card1;
+DROP TABLE IF EXISTS test_join_card2;
+DROP TABLE IF EXISTS test_join_card3;
+-- end_ignore
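
Note on the translator change above: the following is a minimal standalone sketch of the remaining-stats computation that CTranslatorRelcacheToDXL::PimdobjColStats performs after this patch. It is for illustration only, not the actual ORCA code: the function, struct, and parameter names below are hypothetical, EPSILON stands in for CStatistics::DEpsilon, and plain doubles stand in for CDouble.

// sketch_remaining_stats.cpp -- illustrative only, not ORCA source
#include <algorithm>
#include <cstdio>

struct ColStatsRemain
{
	double distinct_remain;	// NDVRemain: distinct values not covered by any bucket
	double freq_remain;	// FreqRemain: fraction of tuples not covered by buckets or NULLs
};

static const double EPSILON = 1e-10;	// stands in for CStatistics::DEpsilon

static ColStatsRemain
ComputeRemain(bool create_buckets,	// FCreateStatsBucket(): false for text, varchar, char, bpchar
	      double distinct,		// stadistinct from pg_statistic
	      double null_freq,		// stanullfrac from pg_statistic
	      double ndv_buckets,	// NDVs already covered by histogram buckets
	      double freq_buckets,	// frequency already covered by histogram buckets
	      int null_ndv)		// 1 if the column has NULLs, else 0
{
	ColStatsRemain r = {0.0, 0.0};

	if (create_buckets)
	{
		// buckets exist: there is a remainder only if the buckets plus the
		// NULLs do not already cover all tuples and all distinct values
		if (1 - EPSILON > freq_buckets + null_freq &&
		    0 < distinct - ndv_buckets - null_ndv)
		{
			r.distinct_remain = std::max(0.0, distinct - ndv_buckets - null_ndv);
			r.freq_remain = std::max(0.0, 1 - freq_buckets - null_freq);
		}
	}
	else
	{
		// text-like types get no buckets, so everything except NULLs is remaining
		r.distinct_remain = distinct;
		r.freq_remain = 1 - null_freq;
	}

	return r;
}

int
main()
{
	// roughly the b column of test_join_card2 above: one distinct non-NULL
	// value ('a') and 179 NULLs out of 5999 rows -- assumed values for
	// illustration, not taken from an actual pg_statistic entry
	ColStatsRemain r = ComputeRemain(false, 1.0, 179.0 / 5999.0, 0.0, 0.0, 1);
	printf("NDVRemain = %.2f, FreqRemain = %.4f\n", r.distinct_remain, r.freq_remain);
	return 0;
}

Under those assumed inputs the sketch yields NDVRemain = 1 and FreqRemain ≈ 0.97 for test_join_card2.b, which is what lets ORCA cost the joins in the regression test sensibly instead of treating the varchar column as having no usable statistics.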