提交 e04ae39d 编写于 作者: V Venkatesh Raghavan

Convert Non-correlated EXISTS subquery to a LIMIT 1 AND a JOIN

Enable GPORCA to generate better plans for non-correlated EXISTS subqueries in the WHERE clause.

Consider the following EXISTS subquery, `(select * from bar)`. GPORCA generates an elaborate count-based implementation of this subquery. If bar is a fact table, the count is going to be expensive.

```
vraghavan=# explain select * from foo where foo.a = foo.b and exists (select * from bar);
                                                    QUERY PLAN
------------------------------------------------------------------------------------------------------------------
 Gather Motion 3:1  (slice3; segments: 3)  (cost=0.00..1368262.79 rows=400324 width=8)
   ->  Nested Loop  (cost=0.00..1368250.86 rows=133442 width=8)
         Join Filter: true
         ->  Table Scan on foo  (cost=0.00..461.91 rows=133442 width=8)
               Filter: a = b
         ->  Materialize  (cost=0.00..438.57 rows=1 width=1)
               ->  Broadcast Motion 1:3  (slice2)  (cost=0.00..438.57 rows=3 width=1)
                     ->  Result  (cost=0.00..438.57 rows=1 width=1)
                           Filter: (count((count()))) > 0::bigint
                           ->  Aggregate  (cost=0.00..438.57 rows=1 width=8)
                                 ->  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..438.57 rows=1 width=8)
                                       ->  Aggregate  (cost=0.00..438.57 rows=1 width=8)
                                             ->  Table Scan on bar  (cost=0.00..437.95 rows=332395 width=1)
 Optimizer status: PQO version 2.35.1
(14 rows)
```
The planner, on the other hand, uses LIMIT, as shown in the InitPlan below.

```
vraghavan=# explain select * from foo where foo.a = foo.b and exists (select * from bar);
                                           QUERY PLAN
------------------------------------------------------------------------------------------------
 Gather Motion 3:1  (slice2; segments: 3)  (cost=0.03..13611.14 rows=1001 width=8)
   ->  Result  (cost=0.03..13611.14 rows=334 width=8)
         One-Time Filter: $0
         InitPlan  (slice3)
           ->  Limit  (cost=0.00..0.03 rows=1 width=0)
                 ->  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..0.03 rows=1 width=0)
                       ->  Limit  (cost=0.00..0.01 rows=1 width=0)
                             ->  Seq Scan on bar  (cost=0.00..11072.84 rows=332395 width=0)
         ->  Seq Scan on foo  (cost=0.00..13611.11 rows=334 width=8)
               Filter: a = b
 Settings:  optimizer=off
 Optimizer status: legacy query optimizer
(12 rows)
```

While GPORCA does not support InitPlans, we can nevertheless generate a better plan by using LIMIT instead of count. After this PR, GPORCA will generate the following plan with a LIMIT clause.

```
vraghavan=# explain select * from foo where foo.a = foo.b and exists (select * from bar);
                                                 QUERY PLAN
------------------------------------------------------------------------------------------------------------
 Gather Motion 3:1  (slice3; segments: 3)  (cost=0.00..1368262.73 rows=400324 width=8)
   ->  Nested Loop EXISTS Join  (cost=0.00..1368250.80 rows=133442 width=8)
         Join Filter: true
         ->  Table Scan on foo  (cost=0.00..461.91 rows=133442 width=8)
               Filter: a = b
         ->  Materialize  (cost=0.00..438.57 rows=1 width=1)
               ->  Broadcast Motion 1:3  (slice2)  (cost=0.00..438.57 rows=3 width=1)
                     ->  Limit  (cost=0.00..438.57 rows=1 width=1)
                           ->  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..438.57 rows=1 width=1)
                                 ->  Limit  (cost=0.00..438.57 rows=1 width=1)
                                       ->  Table Scan on bar  (cost=0.00..437.95 rows=332395 width=1)
 Optimizer status: PQO version 2.35.1
(12 rows)
```
上级 4ad9ce70
......@@ -5,7 +5,7 @@ project(gpopt LANGUAGES CXX C)
set(GPORCA_VERSION_MAJOR 2)
set(GPORCA_VERSION_MINOR 35)
set(GPORCA_VERSION_PATCH 2)
set(GPORCA_VERSION_PATCH 3)
set(GPORCA_VERSION_STRING "${GPORCA_VERSION_MAJOR}.${GPORCA_VERSION_MINOR}.${GPORCA_VERSION_PATCH}")
# Whenever an ABI-breaking change is made to GPORCA, this should be incremented.
......
此差异已折叠。
......@@ -427,10 +427,10 @@
</dxl:LogicalGet>
</dxl:LogicalSelect>
</dxl:Query>
<dxl:Plan Id="0" SpaceSize="279471">
<dxl:Plan Id="0" SpaceSize="244760">
<dxl:GatherMotion InputSegments="0,1" OutputSegments="-1">
<dxl:Properties>
<dxl:Cost StartupCost="0" TotalCost="16.184570" Rows="1.000000" Width="8"/>
<dxl:Cost StartupCost="0" TotalCost="18.187988" Rows="1.000000" Width="8"/>
</dxl:Properties>
<dxl:ProjList>
<dxl:ProjElem ColId="0" Alias="i">
......@@ -444,7 +444,7 @@
<dxl:SortingColumnList/>
<dxl:NestedLoopJoin JoinType="In" IndexNestedLoopJoin="false">
<dxl:Properties>
<dxl:Cost StartupCost="0" TotalCost="15.180664" Rows="1.000000" Width="8"/>
<dxl:Cost StartupCost="0" TotalCost="17.184082" Rows="1.000000" Width="8"/>
</dxl:Properties>
<dxl:ProjList>
<dxl:ProjElem ColId="0" Alias="i">
......@@ -487,13 +487,25 @@
</dxl:TableScan>
<dxl:Materialize Eager="true">
<dxl:Properties>
<dxl:Cost StartupCost="0" TotalCost="6.176758" Rows="2.000000" Width="1"/>
<dxl:Cost StartupCost="0" TotalCost="8.180176" Rows="2.000000" Width="1"/>
</dxl:Properties>
<dxl:ProjList/>
<dxl:Filter/>
<dxl:BroadcastMotion InputSegments="0,1" OutputSegments="0,1">
<dxl:BroadcastMotion InputSegments="-1" OutputSegments="0,1">
<dxl:Properties>
<dxl:Cost StartupCost="0" TotalCost="5.174805" Rows="2.000000" Width="1"/>
<dxl:Cost StartupCost="0" TotalCost="7.178223" Rows="2.000000" Width="1"/>
</dxl:Properties>
<dxl:ProjList/>
<dxl:Filter/>
<dxl:SortingColumnList/>
<dxl:Limit>
<dxl:Properties>
<dxl:Cost StartupCost="0" TotalCost="6.176270" Rows="1.000000" Width="1"/>
</dxl:Properties>
<dxl:ProjList/>
<dxl:GatherMotion InputSegments="0,1" OutputSegments="-1">
<dxl:Properties>
<dxl:Cost StartupCost="0" TotalCost="5.174316" Rows="1.000000" Width="1"/>
</dxl:Properties>
<dxl:ProjList/>
<dxl:Filter/>
......@@ -631,6 +643,14 @@
</dxl:HashJoin>
</dxl:BroadcastMotion>
</dxl:HashJoin>
</dxl:GatherMotion>
<dxl:LimitCount>
<dxl:ConstValue TypeMdid="0.20.1.0" IsNull="false" IsByValue="true" Value="1"/>
</dxl:LimitCount>
<dxl:LimitOffset>
<dxl:ConstValue TypeMdid="0.20.1.0" IsNull="false" IsByValue="true" Value="0"/>
</dxl:LimitOffset>
</dxl:Limit>
</dxl:BroadcastMotion>
</dxl:Materialize>
</dxl:NestedLoopJoin>
......
......@@ -1075,6 +1075,9 @@ namespace gpopt
// get execution locality
static
EExecLocalityType ExecLocalityType(CDistributionSpec *pds);
// generate a limit expression on top of the given relational child with the given offset and limit count
static CExpression *PexprLimit(IMemoryPool *pmp, CExpression *pexpr, ULONG ulOffSet, ULONG ulCount);
}; // class CUtils
// hash set from expressions
......
......@@ -224,19 +224,6 @@ namespace gpopt
static
CExpression *PexprScalarIf(IMemoryPool *pmp, CColRef *pcrBool, CColRef *pcrSum, CColRef *pcrCount, CExpression *pexprSubquery);
// helper for creating a correlated apply expression for existential subquery
static
BOOL FConvertExistOrQuantToScalarSubquery
(
IMemoryPool *pmp,
CExpression *pexprOuter,
CExpression *pexprSubquery,
BOOL fDisjunctionOrNegation,
ESubqueryCtxt esqctxt,
CExpression **ppexprNewOuter,
CExpression **ppexprResidualScalar
);
// helper for creating a correlated apply expression for existential subquery
static
BOOL FCreateCorrelatedApplyForExistentialSubquery
......
此差异已折叠。
......@@ -942,68 +942,6 @@ CSubqueryHandler::FCreateOuterApply
return FCreateOuterApplyForScalarSubquery(pmp, pexprOuter, pexprInner, pexprSubquery, fOuterRefsUnderInner, ppexprNewOuter, ppexprResidualScalar);
}
//---------------------------------------------------------------------------
//	@function:
//		CSubqueryHandler::FConvertExistOrQuantToScalarSubquery
//
//	@doc:
//		Rewrite an existential or quantified subquery through
//		CXformUtils::ExistentialToAgg / CXformUtils::QuantifiedToAgg and
//		attach the first child of the rewritten subquery to the outer
//		expression with a correlated apply:
//		  - inner correlated apply when the subquery is used as a filter
//		    and is not under a disjunction/negation,
//		  - left outer correlated apply in every other case.
//		The apply is returned through ppexprNewOuter; ppexprResidualScalar
//		is filled in by the CXformUtils conversion routine.
//		Always returns true.
//
//---------------------------------------------------------------------------
BOOL
CSubqueryHandler::FConvertExistOrQuantToScalarSubquery
	(
	IMemoryPool *pmp,
	CExpression *pexprOuter,
	CExpression *pexprSubquery,
	BOOL fDisjunctionOrNegation,
	ESubqueryCtxt esqctxt,
	CExpression **ppexprNewOuter,
	CExpression **ppexprResidualScalar
	)
{
	const BOOL fExistential = CUtils::FExistentialSubquery(pexprSubquery->Pop());
	GPOS_ASSERT(fExistential || CUtils::FQuantifiedSubquery(pexprSubquery->Pop()));

	// pick the conversion routine that matches the subquery kind
	CExpression *pexprConverted = NULL;
	if (fExistential)
	{
		CXformUtils::ExistentialToAgg(pmp, pexprSubquery, &pexprConverted, ppexprResidualScalar);
	}
	else
	{
		CXformUtils::QuantifiedToAgg(pmp, pexprSubquery, &pexprConverted, ppexprResidualScalar);
	}

	// keep only the first child of the converted subquery: take a reference
	// on it before dropping the reference on its parent
	CExpression *pexprInnerNew = (*pexprConverted)[0];
	pexprInnerNew->AddRef();
	pexprConverted->Release();

	// column defined by the first project element under child 1 of the new inner expression
	const CColRef *pcr = CScalarProjectElement::PopConvert((*(*pexprInnerNew)[1])[0]->Pop())->Pcr();

	const BOOL fNeedsOuterApply = (EsqctxtFilter != esqctxt) || fDisjunctionOrNegation;
	if (fNeedsOuterApply)
	{
		// subquery occurs in a value context or disjunction, we need to create an outer apply expression
		*ppexprNewOuter =
			CUtils::PexprLogicalApply<CLogicalLeftOuterCorrelatedApply>(pmp, pexprOuter, pexprInnerNew, pcr, COperator::EopScalarSubquery);

		return true;
	}

	// plain filter context: an inner correlated apply is sufficient
	*ppexprNewOuter =
		CUtils::PexprLogicalApply<CLogicalInnerCorrelatedApply>(pmp, pexprOuter, pexprInnerNew, pcr, COperator::EopScalarSubquery);

	return true;
}
//---------------------------------------------------------------------------
// @function:
// CSubqueryHandler::FCreateCorrelatedApplyForQuantifiedSubquery
......@@ -1649,12 +1587,23 @@ CSubqueryHandler::FRemoveExistentialSubquery
{
GPOS_ASSERT(EsqctxtFilter == esqctxt);
CDrvdPropRelational *pdpInner = CDrvdPropRelational::Pdprel(pexprInner->PdpDerive());
// for existential subqueries, any column produced by inner expression
// can be used to check for empty answers; we use first column for that
CColRef *pcr = CDrvdPropRelational::Pdprel(pexprInner->PdpDerive())->PcrsOutput()->PcrFirst();
CColRef *pcr = pdpInner->PcrsOutput()->PcrFirst();
if (COperator::EopScalarSubqueryExists == eopid)
{
CColRefSet *pcrsOuterRefs = pdpInner->PcrsOuter();
if (0 == pcrsOuterRefs->CElements())
{
// add a limit operator on top of the inner child if the subquery does not have
// any outer references. Adding Limit for the correlated case hinders pulling up
// predicates into an EXISTS join
pexprInner = CUtils::PexprLimit(pmp, pexprInner, 0, 1);
}
*ppexprNewOuter = CUtils::PexprLogicalApply<CLogicalLeftSemiApply>(pmp, pexprOuter, pexprInner, pcr, eopid);
}
else
......
......@@ -71,6 +71,8 @@ add_executable(gporca_test
src/unittest/gpopt/search/CSearchStrategyTest.cpp
include/unittest/gpopt/minidump/CAggTest.h
src/unittest/gpopt/minidump/CAggTest.cpp
include/unittest/gpopt/minidump/CExistsSubqueryTest.h
src/unittest/gpopt/minidump/CExistsSubqueryTest.cpp
include/unittest/gpopt/minidump/CCollapseProjectTest.h
src/unittest/gpopt/minidump/CCollapseProjectTest.cpp
include/unittest/gpopt/minidump/CArrayExpansionTest.h
......@@ -230,6 +232,7 @@ add_orca_test(CDirectDispatchTest)
add_orca_test(CTVFTest)
add_orca_test(CPullUpProjectElementTest)
add_orca_test(CAggTest)
add_orca_test(CExistsSubqueryTest)
add_orca_test(CCollapseProjectTest)
add_orca_test(CPruneColumnsTest)
add_orca_test(CMissingStatsTest)
......
//---------------------------------------------------------------------------
// Greenplum Database
// Copyright (C) 2017 Pivotal, Inc.
//
// @filename:
// CExistsSubqueryTest.h
//
// @doc:
// Test for exists and not exists subquery optimization
//---------------------------------------------------------------------------
#ifndef GPOPT_CExistsSubqueryTest_H
#define GPOPT_CExistsSubqueryTest_H
#include "gpos/base.h"
namespace gpopt
{
class CExistsSubqueryTest
{
	public:

		// run the unittests of this suite
		static gpos::GPOS_RESULT EresUnittest();

		// run the minidump-based tests of this suite
		static gpos::GPOS_RESULT EresUnittest_RunTests();

	private:

		// counter used to mark last successful test
		static gpos::ULONG m_ulExistsSubQueryTestCounter;

}; // class CExistsSubqueryTest
}
#endif // !GPOPT_CExistsSubqueryTest_H
// EOF
......@@ -83,6 +83,7 @@
#include "unittest/gpopt/minidump/CTVFTest.h"
#include "unittest/gpopt/minidump/CDMLTest.h"
#include "unittest/gpopt/minidump/CAggTest.h"
#include "unittest/gpopt/minidump/CExistsSubqueryTest.h"
#include "unittest/gpopt/minidump/CCollapseProjectTest.h"
#include "unittest/gpopt/minidump/CPhysicalParallelUnionAllTest.h"
#include "unittest/gpopt/minidump/CPruneColumnsTest.h"
......@@ -149,6 +150,7 @@ static gpos::CUnittest rgut[] =
GPOS_UNITTEST_STD(CDirectDispatchTest),
GPOS_UNITTEST_STD(CTVFTest),
GPOS_UNITTEST_STD(CAggTest),
GPOS_UNITTEST_STD(CExistsSubqueryTest),
GPOS_UNITTEST_STD(CCollapseProjectTest),
GPOS_UNITTEST_STD(CPruneColumnsTest),
GPOS_UNITTEST_STD(CPhysicalParallelUnionAllTest),
......
//---------------------------------------------------------------------------
// Greenplum Database
// Copyright (C) 2017 Pivotal, Inc.
//
// @filename:
// CExistsSubqueryTest.cpp
//
// @doc:
// Test for exists and not exists subquery optimization
//---------------------------------------------------------------------------
#include "unittest/gpopt/minidump/CExistsSubqueryTest.h"
#include "gpos/base.h"
#include "gpos/memory/CAutoMemoryPool.h"
#include "gpos/task/CAutoTraceFlag.h"
#include "gpos/test/CUnittest.h"
#include "gpopt/exception.h"
#include "gpopt/minidump/CMinidumperUtils.h"
#include "unittest/gpopt/CTestUtils.h"
using namespace gpopt;
// Definition of the suite's test counter; its address is handed to
// CTestUtils::EresUnittest_RunTests together with the file list below.
ULONG CExistsSubqueryTest::m_ulExistsSubQueryTestCounter = 0; // start from first test
// Minidump files exercised by CExistsSubqueryTest::EresUnittest_RunTests.
// The paths are relative (presumably to the directory the test binary is
// run from -- confirm against the test harness).
const CHAR *rgszExistsFileNames[] =
{
"../data/dxl/minidump/SubqExists-With-External-Corrs.mdp",
"../data/dxl/minidump/SubqExists-Without-External-Corrs.mdp",
"../data/dxl/minidump/Exists-SuperfluousEquality.mdp",
"../data/dxl/minidump/NotExists-SuperfluousEquality.mdp",
"../data/dxl/minidump/SimplifyExistsSubquery2Limit.mdp",
};
// driver for the exists / not exists subquery unittests
GPOS_RESULT
CExistsSubqueryTest::EresUnittest()
{
#ifdef GPOS_DEBUG
	// extended asserts stay off while the tests execute
	fEnableExtendedAsserts = false;
#endif // GPOS_DEBUG

	CUnittest rgutExists[] =
		{
		GPOS_UNITTEST_FUNC(EresUnittest_RunTests),
		};

	const GPOS_RESULT eresSuite = CUnittest::EresExecute(rgutExists, GPOS_ARRAY_SIZE(rgutExists));

#ifdef GPOS_DEBUG
	// extended asserts are switched on again (unconditionally) once the tests are done
	fEnableExtendedAsserts = true;
#endif // GPOS_DEBUG

	// reset metadata cache
	CMDCache::Reset();

	return eresSuite;
}
// execute every minidump listed in rgszExistsFileNames through
// CTestUtils::EresUnittest_RunTests, tracking progress in the suite counter
GPOS_RESULT
CExistsSubqueryTest::EresUnittest_RunTests()
{
	GPOS_RESULT eresMinidumps =
		CTestUtils::EresUnittest_RunTests
			(
			rgszExistsFileNames,
			&m_ulExistsSubQueryTestCounter,
			GPOS_ARRAY_SIZE(rgszExistsFileNames)
			);

	return eresMinidumps;
}
// EOF
......@@ -107,8 +107,6 @@ const CHAR *rgszFileNames[] =
"../data/dxl/minidump/EquivClassesLimit.mdp",
"../data/dxl/minidump/Date-TimeStamp-HashJoin.mdp",
"../data/dxl/minidump/TimeStamp-Date-HashJoin.mdp",
"../data/dxl/minidump/Exists-SuperfluousEquality.mdp",
"../data/dxl/minidump/NotExists-SuperfluousEquality.mdp",
"../data/dxl/minidump/MultiLevel-CorrelatedExec.mdp",
"../data/dxl/minidump/OneLevel-CorrelatedExec.mdp",
"../data/dxl/minidump/MultiLevel-IN-Subquery.mdp",
......@@ -146,8 +144,6 @@ const CHAR *rgszFileNames[] =
"../data/dxl/minidump/SubqAll-To-ScalarSubq.mdp",
"../data/dxl/minidump/SubqAll-Limit1.mdp",
"../data/dxl/minidump/ProjectUnderSubq.mdp",
"../data/dxl/minidump/SubqExists-With-External-Corrs.mdp",
"../data/dxl/minidump/SubqExists-Without-External-Corrs.mdp",
#ifndef GPOS_DEBUG
"../data/dxl/minidump/TPCDS-39-InnerJoin-JoinEstimate.mdp",
"../data/dxl/minidump/TPCH-Partitioned-256GB.mdp",
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册