Only request stats of columns needed for cardinality estimation [#150424379]

GPORCA should not spend time extracting column statistics that are not needed for cardinality estimation. This commit eliminates the overhead of requesting and generating statistics for columns that are not used in cardinality estimation. E.g.: `CREATE TABLE foo (a int, b int, c int);` For table foo, the query below only needs stats for column `a`, which is the distribution column, and column `c`, which is the column used in the WHERE clause. `select * from foo where c=2;` However, prior to this commit, the column statistics for column `b` were also calculated and passed for cardinality estimation. The only information the optimizer needs about column `b` is its `width`. For this tiny piece of information, we transferred all of the stats information for that column. This commit and its counterpart commit in GPDB ensure that the column width information is passed and extracted in the `dxl:Relation` metadata information. Preliminary results for short-running queries show up to a 65x performance improvement. Signed-off-by: N Jemish Patel <jpatel@pivotal.io>

Only request stats of columns needed for cardinality estimation [#150424379]
GPORCA should not spend time extracting column statistics that are not needed for cardinality estimation. This commit eliminates the overhead of requesting and generating statistics for columns that are not used in cardinality estimation. E.g.: `CREATE TABLE foo (a int, b int, c int);` For table foo, the query below only needs stats for column `a`, which is the distribution column, and column `c`, which is the column used in the WHERE clause. `select * from foo where c=2;` However, prior to this commit, the column statistics for column `b` were also calculated and passed for cardinality estimation. The only information the optimizer needs about column `b` is its `width`. For this tiny piece of information, we transferred all of the stats information for that column. This commit and its counterpart commit in GPDB ensure that the column width information is passed and extracted in the `dxl:Relation` metadata information. Preliminary results for short-running queries show up to a 65x performance improvement. Signed-off-by: N Jemish Patel <jpatel@pivotal.io>
05a26924 · Omer Arap · 7ccedc81 · 05a26924 · 05a26924 · 05a26924
9 changed files
--- a/libgpopt/src/mdcache/CMDAccessor.cpp
+++ b/libgpopt/src/mdcache/CMDAccessor.cpp
@@ -1140,14 +1140,8 @@ CMDAccessor::RecordColumnStats
 	}
 }

-//---------------------------------------------------------------------------
-//	@function:
-//		CMDAccessor::Pmdcolstats
-//
-//	@doc:
-//		Return the column statistics meta data object for a given column of a table
-//
-//---------------------------------------------------------------------------
+
+// Return the column statistics meta data object for a given column of a table
 const IMDColStats *
 CMDAccessor::Pmdcolstats
 	(
@@ -1227,6 +1221,7 @@ CMDAccessor::Pstats

 	// extract column widths
 	CColRefSetIter crsiWidth(*pcrsWidth);
+
 	while (crsiWidth.FAdvance())
 	{
 		CColRef *pcrWidth = crsiWidth.Pcr();
@@ -1239,11 +1234,7 @@ CMDAccessor::Pstats
 		INT iAttno = pcrtable->IAttno();
 		ULONG ulPos = pmdrel->UlPosFromAttno(iAttno);

-		// extract the width information
-		const IMDColStats *pmdcolstats = Pmdcolstats(pmp, pmdidRel, ulPos);
-		GPOS_ASSERT(NULL != pmdcolstats);
-
-		CDouble *pdWidth = GPOS_NEW(pmp) CDouble(pmdcolstats->DWidth());
+		CDouble *pdWidth = GPOS_NEW(pmp) CDouble(pmdrel->DColWidth(ulPos));
 		phmuldoubleWidth->FInsert(GPOS_NEW(pmp) ULONG(ulColId), pdWidth);
 	}


--- a/libgpopt/src/operators/CPhysicalScan.cpp
+++ b/libgpopt/src/operators/CPhysicalScan.cpp
@@ -255,8 +255,8 @@ CPhysicalScan::ComputeTableStats
 {
 	GPOS_ASSERT(NULL == m_pstatsBaseTable);

-	CColRefSet *pcrsHist = GPOS_NEW(pmp) CColRefSet(pmp, m_pdrgpcrOutput);
-	CColRefSet *pcrsWidth = GPOS_NEW(pmp) CColRefSet(pmp);
+	CColRefSet *pcrsHist = GPOS_NEW(pmp) CColRefSet(pmp);
+	CColRefSet *pcrsWidth = GPOS_NEW(pmp) CColRefSet(pmp, m_pdrgpcrOutput);

 	CMDAccessor *pmda = COptCtxt::PoctxtFromTLS()->Pmda();
 	m_pstatsBaseTable = pmda->Pstats(pmp, m_ptabdesc->Pmdid(), pcrsHist, pcrsWidth);

--- a/libnaucrates/include/naucrates/md/CMDRelationCtasGPDB.h
+++ b/libnaucrates/include/naucrates/md/CMDRelationCtasGPDB.h
@@ -96,6 +96,9 @@ namespace gpmd
 			// vartypemod list
 			DrgPi *m_pdrgpiVarTypeMod;

+			// array of column widths
+			DrgPdouble *m_pdrgpdoubleColWidths;
+
 			// private copy ctor
 			CMDRelationCtasGPDB(const CMDRelationCtasGPDB &);

@@ -177,7 +180,11 @@ namespace gpmd
 			// number of columns
 			virtual
 			ULONG UlColumns() const;
-			
+
+			// width of a column with regards to the position
+			virtual
+			DOUBLE DColWidth(ULONG ulPos) const;
+
 			// does relation have dropped columns
 			virtual
 			BOOL FHasDroppedColumns() const

--- a/libnaucrates/include/naucrates/md/CMDRelationExternalGPDB.h
+++ b/libnaucrates/include/naucrates/md/CMDRelationExternalGPDB.h
@@ -103,6 +103,9 @@ namespace gpmd
 			// the original positions of all the non-dropped columns
 			DrgPul *m_pdrgpulNonDroppedCols;

+			// array of column widths including dropped columns
+			DrgPdouble *m_pdrgpdoubleColWidths;
+
 			// format type for the relation
 			const CWStringConst *PstrFormatType() const;

@@ -156,7 +159,11 @@ namespace gpmd
 			// number of columns
 			virtual
 			ULONG UlColumns() const;
-			
+
+			// width of a column with regards to the position
+			virtual
+			DOUBLE DColWidth(ULONG ulPos) const;
+
 			// does relation have dropped columns
 			virtual
 			BOOL FHasDroppedColumns() const; 

--- a/libnaucrates/include/naucrates/md/CMDRelationGPDB.h
+++ b/libnaucrates/include/naucrates/md/CMDRelationGPDB.h
@@ -117,7 +117,10 @@ namespace gpmd

 			// the original positions of all the non-dropped columns
 			DrgPul *m_pdrgpulNonDroppedCols;
-			
+
+			// array of column widths including dropped columns
+			DrgPdouble *m_pdrgpdoubleColWidths;
+
 			// private copy ctor
 			CMDRelationGPDB(const CMDRelationGPDB &);
 		
@@ -180,7 +183,11 @@ namespace gpmd
 			// number of columns
 			virtual 
 			ULONG UlColumns() const;
-			
+
+			// width of a column with regards to the position
+			virtual
+			DOUBLE DColWidth(ULONG ulPos) const;
+
 			// does relation have dropped columns
 			virtual
 			BOOL FHasDroppedColumns() const; 

--- a/libnaucrates/include/naucrates/md/IMDRelation.h
+++ b/libnaucrates/include/naucrates/md/IMDRelation.h
@@ -106,7 +106,11 @@ namespace gpmd
 			// number of columns
 			virtual 
 			ULONG UlColumns() const = 0;
-			
+
+			// width of a column with regards to the position
+			virtual
+			DOUBLE DColWidth(ULONG ulPos) const = 0;
+
 			// does relation have dropped columns
 			virtual
 			BOOL FHasDroppedColumns() const = 0; 

--- a/libnaucrates/src/md/CMDRelationCtasGPDB.cpp
+++ b/libnaucrates/src/md/CMDRelationCtasGPDB.cpp
@@ -68,7 +68,8 @@ CMDRelationCtasGPDB::CMDRelationCtasGPDB
 	
 	m_phmiulAttno2Pos = GPOS_NEW(m_pmp) HMIUl(m_pmp);
 	m_pdrgpulNonDroppedCols = GPOS_NEW(m_pmp) DrgPul(m_pmp);
-	
+	m_pdrgpdoubleColWidths = GPOS_NEW(pmp) DrgPdouble(pmp);
+
 	const ULONG ulArity = pdrgpmdcol->UlLength();
 	for (ULONG ul = 0; ul < ulArity; ul++)
 	{
@@ -90,6 +91,8 @@ CMDRelationCtasGPDB::CMDRelationCtasGPDB
 									GPOS_NEW(m_pmp) INT(pmdcol->IAttno()),
 									GPOS_NEW(m_pmp) ULONG(ul)
 									);
+
+		m_pdrgpdoubleColWidths->Append(GPOS_NEW(pmp) CDouble(pmdcol->UlLength()));
 	}
 	m_pstr = CDXLUtils::PstrSerializeMDObj(m_pmp, this, false /*fSerializeHeader*/, false /*fIndent*/);
 }
@@ -110,6 +113,7 @@ CMDRelationCtasGPDB::~CMDRelationCtasGPDB()
 	m_pmdid->Release();
 	m_pdrgpmdcol->Release();
 	m_pdrgpdrgpulKeys->Release();
+	m_pdrgpdoubleColWidths->Release();
 	CRefCount::SafeRelease(m_pdrgpulDistrColumns);
 	CRefCount::SafeRelease(m_phmiulAttno2Pos);
 	CRefCount::SafeRelease(m_pdrgpulNonDroppedCols);
@@ -189,6 +193,16 @@ CMDRelationCtasGPDB::UlColumns() const
 	return m_pdrgpmdcol->UlLength();
 }

+// Return the width of a column with regards to the position
+DOUBLE
+CMDRelationCtasGPDB::DColWidth
+	(
+	ULONG ulPos
+	)
+const
+{
+	return (*m_pdrgpdoubleColWidths)[ulPos]->DVal();
+}

 //---------------------------------------------------------------------------
 //	@function:

--- a/libnaucrates/src/md/CMDRelationExternalGPDB.cpp
+++ b/libnaucrates/src/md/CMDRelationExternalGPDB.cpp
@@ -74,7 +74,8 @@ CMDRelationExternalGPDB::CMDRelationExternalGPDB
 	m_phmululNonDroppedCols = GPOS_NEW(m_pmp) HMUlUl(m_pmp);
 	m_phmiulAttno2Pos = GPOS_NEW(m_pmp) HMIUl(m_pmp);
 	m_pdrgpulNonDroppedCols = GPOS_NEW(m_pmp) DrgPul(m_pmp);
-	
+	m_pdrgpdoubleColWidths = GPOS_NEW(pmp) DrgPdouble(pmp);
+
 	ULONG ulPosNonDropped = 0;
 	const ULONG ulArity = pdrgpmdcol->UlLength();
 	for (ULONG ul = 0; ul < ulArity; ul++)
@@ -107,6 +108,7 @@ CMDRelationExternalGPDB::CMDRelationExternalGPDB
 									GPOS_NEW(m_pmp) INT(pmdcol->IAttno()),
 									GPOS_NEW(m_pmp) ULONG(ul)
 									);
+		m_pdrgpdoubleColWidths->Append(GPOS_NEW(pmp) CDouble(pmdcol->UlLength()));
 	}
 	m_pstr = CDXLUtils::PstrSerializeMDObj(m_pmp, this, false /*fSerializeHeader*/, false /*fIndent*/);
 }
@@ -129,6 +131,7 @@ CMDRelationExternalGPDB::~CMDRelationExternalGPDB()
 	CRefCount::SafeRelease(m_pdrgpdrgpulKeys);
 	m_pdrgpmdIndexInfo->Release();
 	m_pdrgpmdidTriggers->Release();
+	m_pdrgpdoubleColWidths->Release();
 	m_pdrgpmdidCheckConstraint->Release();
 	CRefCount::SafeRelease(m_pmdidFmtErrRel);

@@ -195,6 +198,17 @@ CMDRelationExternalGPDB::UlColumns() const
 	return m_pdrgpmdcol->UlLength();
 }

+// Return the width of a column with regards to the position
+DOUBLE
+CMDRelationExternalGPDB::DColWidth
+(
+	ULONG ulPos
+	)
+const
+{
+	return (*m_pdrgpdoubleColWidths)[ulPos]->DVal();
+}
+
 //---------------------------------------------------------------------------
 //	@function:
 //		CMDRelationExternalGPDB::FHasDroppedColumns

--- a/libnaucrates/src/md/CMDRelationGPDB.cpp
+++ b/libnaucrates/src/md/CMDRelationGPDB.cpp
@@ -86,7 +86,8 @@ CMDRelationGPDB::CMDRelationGPDB
 	m_phmululNonDroppedCols = GPOS_NEW(m_pmp) HMUlUl(m_pmp);
 	m_phmiulAttno2Pos = GPOS_NEW(m_pmp) HMIUl(m_pmp);
 	m_pdrgpulNonDroppedCols = GPOS_NEW(m_pmp) DrgPul(m_pmp);
-	
+	m_pdrgpdoubleColWidths = GPOS_NEW(pmp) DrgPdouble(pmp);
+
 	const ULONG ulArity = pdrgpmdcol->UlLength();
 	ULONG ulPosNonDropped = 0;
 	for (ULONG ul = 0; ul < ulArity; ul++)
@@ -117,6 +118,8 @@ CMDRelationGPDB::CMDRelationGPDB
 			(void) m_phmululNonDroppedCols->FInsert(GPOS_NEW(m_pmp) ULONG(ul), GPOS_NEW(m_pmp) ULONG(ulPosNonDropped));
 			ulPosNonDropped++;
 		}
+
+		m_pdrgpdoubleColWidths->Append(GPOS_NEW(pmp) CDouble(pmdcol->UlLength()));
 	}
 	m_pstr = CDXLUtils::PstrSerializeMDObj(m_pmp, this, false /*fSerializeHeader*/, false /*fIndent*/);
 }
@@ -142,6 +145,7 @@ CMDRelationGPDB::~CMDRelationGPDB()
 	m_pdrgpmdIndexInfo->Release();
 	m_pdrgpmdidTriggers->Release();
 	m_pdrgpmdidCheckConstraint->Release();
+	m_pdrgpdoubleColWidths->Release();
 	CRefCount::SafeRelease(m_pmdpartcnstr);
 	CRefCount::SafeRelease(m_phmululNonDroppedCols);
 	CRefCount::SafeRelease(m_phmiulAttno2Pos);
@@ -234,6 +238,17 @@ CMDRelationGPDB::UlColumns() const
 	return m_pdrgpmdcol->UlLength();
 }

+// Return the width of a column with regards to the position
+DOUBLE
+CMDRelationGPDB::DColWidth
+	(
+	ULONG ulPos
+	)
+	const
+{
+	return (*m_pdrgpdoubleColWidths)[ulPos]->DVal();
+}
+
 //---------------------------------------------------------------------------
 //	@function:
 //		CMDRelationGPDB::FHasDroppedColumns