Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Gpdb
提交
73d1040b
G
Gpdb
项目概览
Greenplum
/
Gpdb
通知
7
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
G
Gpdb
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
73d1040b
编写于
5月 27, 2001
作者:
T
Tom Lane
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix eqjoinsel() to make use of new statistics.
上级
a001f135
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
201 addition
and
41 deletion
+201
-41
src/backend/utils/adt/selfuncs.c
src/backend/utils/adt/selfuncs.c
+201
-41
未找到文件。
src/backend/utils/adt/selfuncs.c
浏览文件 @
73d1040b
...
...
@@ -15,7 +15,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.9
0 2001/05/20 20:28:19
tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.9
1 2001/05/27 17:37:48
tgl Exp $
*
*-------------------------------------------------------------------------
*/
...
...
@@ -940,9 +940,7 @@ Datum
eqjoinsel
(
PG_FUNCTION_ARGS
)
{
Query
*
root
=
(
Query
*
)
PG_GETARG_POINTER
(
0
);
#ifdef NOT_USED
/* see neqjoinsel() before removing me! */
Oid
operator
=
PG_GETARG_OID
(
1
);
#endif
List
*
args
=
(
List
*
)
PG_GETARG_POINTER
(
2
);
Var
*
var1
;
Var
*
var2
;
...
...
@@ -958,73 +956,219 @@ eqjoinsel(PG_FUNCTION_ARGS)
HeapTuple
statsTuple2
=
NULL
;
Form_pg_statistic
stats1
=
NULL
;
Form_pg_statistic
stats2
=
NULL
;
double
nd1
,
nd2
;
if
(
var1
==
NULL
)
{
nd1
=
DEFAULT_NUM_DISTINCT
;
}
else
double
nd1
=
DEFAULT_NUM_DISTINCT
;
double
nd2
=
DEFAULT_NUM_DISTINCT
;
bool
have_mcvs1
=
false
;
Datum
*
values1
=
NULL
;
int
nvalues1
=
0
;
float4
*
numbers1
=
NULL
;
int
nnumbers1
=
0
;
bool
have_mcvs2
=
false
;
Datum
*
values2
=
NULL
;
int
nvalues2
=
0
;
float4
*
numbers2
=
NULL
;
int
nnumbers2
=
0
;
if
(
var1
!=
NULL
)
{
/* get stats for the attribute, if available */
Oid
relid1
=
getrelid
(
var1
->
varno
,
root
->
rtable
);
if
(
relid1
==
InvalidOid
)
nd1
=
DEFAULT_NUM_DISTINCT
;
else
if
(
relid1
!=
InvalidOid
)
{
statsTuple1
=
SearchSysCache
(
STATRELATT
,
ObjectIdGetDatum
(
relid1
),
Int16GetDatum
(
var1
->
varattno
),
0
,
0
);
if
(
HeapTupleIsValid
(
statsTuple1
))
{
stats1
=
(
Form_pg_statistic
)
GETSTRUCT
(
statsTuple1
);
have_mcvs1
=
get_attstatsslot
(
statsTuple1
,
var1
->
vartype
,
var1
->
vartypmod
,
STATISTIC_KIND_MCV
,
InvalidOid
,
&
values1
,
&
nvalues1
,
&
numbers1
,
&
nnumbers1
);
}
nd1
=
get_att_numdistinct
(
root
,
var1
,
stats1
);
}
}
if
(
var2
==
NULL
)
{
nd2
=
DEFAULT_NUM_DISTINCT
;
}
else
if
(
var2
!=
NULL
)
{
/* get stats for the attribute, if available */
Oid
relid2
=
getrelid
(
var2
->
varno
,
root
->
rtable
);
if
(
relid2
==
InvalidOid
)
nd2
=
DEFAULT_NUM_DISTINCT
;
else
if
(
relid2
!=
InvalidOid
)
{
statsTuple2
=
SearchSysCache
(
STATRELATT
,
ObjectIdGetDatum
(
relid2
),
Int16GetDatum
(
var2
->
varattno
),
0
,
0
);
if
(
HeapTupleIsValid
(
statsTuple2
))
{
stats2
=
(
Form_pg_statistic
)
GETSTRUCT
(
statsTuple2
);
have_mcvs2
=
get_attstatsslot
(
statsTuple2
,
var2
->
vartype
,
var2
->
vartypmod
,
STATISTIC_KIND_MCV
,
InvalidOid
,
&
values2
,
&
nvalues2
,
&
numbers2
,
&
nnumbers2
);
}
nd2
=
get_att_numdistinct
(
root
,
var2
,
stats2
);
}
}
/*
* Estimate the join selectivity as 1 / sqrt(nd1*nd2)
* (can we produce any theory for this)?
*
* XXX possibility to do better: if both attributes have histograms
* then we could determine the exact join selectivity between the
* MCV sets, and only have to assume the join behavior of the non-MCV
* values. This could be a big win when the MCVs cover a large part
* of the population.
*
* XXX what about nulls?
*/
selec
=
1
.
0
/
sqrt
(
nd1
*
nd2
);
if
(
selec
>
1
.
0
)
selec
=
1
.
0
;
if
(
have_mcvs1
&&
have_mcvs2
)
{
/*
* We have most-common-value lists for both relations. Run
* through the lists to see which MCVs actually join to each
* other with the given operator. This allows us to determine
* the exact join selectivity for the portion of the relations
* represented by the MCV lists. We still have to estimate for
* the remaining population, but in a skewed distribution this
* gives us a big leg up in accuracy. For motivation see the
* analysis in Y. Ioannidis and S. Christodoulakis, "On the
* propagation of errors in the size of join results", Technical
* Report 1018, Computer Science Dept., University of Wisconsin,
* Madison, March 1991 (available from ftp.cs.wisc.edu).
*/
FmgrInfo
eqproc
;
bool
*
hasmatch1
;
bool
*
hasmatch2
;
double
matchprodfreq
,
matchfreq1
,
matchfreq2
,
unmatchfreq1
,
unmatchfreq2
,
otherfreq1
,
otherfreq2
,
totalsel1
,
totalsel2
;
int
i
,
nmatches
;
fmgr_info
(
get_opcode
(
operator
),
&
eqproc
);
hasmatch1
=
(
bool
*
)
palloc
(
nvalues1
*
sizeof
(
bool
));
memset
(
hasmatch1
,
0
,
nvalues1
*
sizeof
(
bool
));
hasmatch2
=
(
bool
*
)
palloc
(
nvalues2
*
sizeof
(
bool
));
memset
(
hasmatch2
,
0
,
nvalues2
*
sizeof
(
bool
));
/*
* Note we assume that each MCV will match at most one member of
* the other MCV list. If the operator isn't really equality,
* there could be multiple matches --- but we don't look for them,
* both for speed and because the math wouldn't add up...
*/
matchprodfreq
=
0
.
0
;
nmatches
=
0
;
for
(
i
=
0
;
i
<
nvalues1
;
i
++
)
{
int
j
;
for
(
j
=
0
;
j
<
nvalues2
;
j
++
)
{
if
(
hasmatch2
[
j
])
continue
;
if
(
DatumGetBool
(
FunctionCall2
(
&
eqproc
,
values1
[
i
],
values2
[
j
])))
{
hasmatch1
[
i
]
=
hasmatch2
[
j
]
=
true
;
matchprodfreq
+=
numbers1
[
i
]
*
numbers2
[
j
];
nmatches
++
;
break
;
}
}
}
/* Sum up frequencies of matched and unmatched MCVs */
matchfreq1
=
unmatchfreq1
=
0
.
0
;
for
(
i
=
0
;
i
<
nvalues1
;
i
++
)
{
if
(
hasmatch1
[
i
])
matchfreq1
+=
numbers1
[
i
];
else
unmatchfreq1
+=
numbers1
[
i
];
}
matchfreq2
=
unmatchfreq2
=
0
.
0
;
for
(
i
=
0
;
i
<
nvalues2
;
i
++
)
{
if
(
hasmatch2
[
i
])
matchfreq2
+=
numbers2
[
i
];
else
unmatchfreq2
+=
numbers2
[
i
];
}
pfree
(
hasmatch1
);
pfree
(
hasmatch2
);
/*
* Compute total frequency of non-null values that are not in
* the MCV lists.
*/
otherfreq1
=
1
.
0
-
stats1
->
stanullfrac
-
matchfreq1
-
unmatchfreq1
;
otherfreq2
=
1
.
0
-
stats2
->
stanullfrac
-
matchfreq2
-
unmatchfreq2
;
/*
* We can estimate the total selectivity from the point of view
* of relation 1 as: the known selectivity for matched MCVs, plus
* unmatched MCVs that are assumed to match against random members
* of relation 2's non-MCV population, plus non-MCV values that
* are assumed to match against random members of relation 2's
* unmatched MCVs plus non-MCV values.
*/
totalsel1
=
matchprodfreq
;
if
(
nd2
>
nvalues2
)
totalsel1
+=
unmatchfreq1
*
otherfreq2
/
(
nd2
-
nvalues2
);
if
(
nd2
>
nmatches
)
totalsel1
+=
otherfreq1
*
(
otherfreq2
+
unmatchfreq2
)
/
(
nd2
-
nmatches
);
/* Same estimate from the point of view of relation 2. */
totalsel2
=
matchprodfreq
;
if
(
nd1
>
nvalues1
)
totalsel2
+=
unmatchfreq2
*
otherfreq1
/
(
nd1
-
nvalues1
);
if
(
nd1
>
nmatches
)
totalsel2
+=
otherfreq2
*
(
otherfreq1
+
unmatchfreq1
)
/
(
nd1
-
nmatches
);
/*
* For robustness, we average the two estimates. (Can a case
* be made for taking the min or max instead?)
*/
selec
=
(
totalsel1
+
totalsel2
)
*
0
.
5
;
}
else
{
/*
* We do not have MCV lists for both sides. Estimate the
* join selectivity as MIN(1/nd1, 1/nd2). This is plausible
* if we assume that the values are about equally distributed:
* a given tuple of rel1 will join to either 0 or N2/nd2 rows
* of rel2, so total join rows are at most N1*N2/nd2 giving
* a join selectivity of not more than 1/nd2. By the same logic
* it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper
* bound. Using the MIN() means we estimate from the point of
* view of the relation with smaller nd (since the larger nd is
* determining the MIN). It is reasonable to assume that most
* tuples in this rel will have join partners, so the bound is
* probably reasonably tight and should be taken as-is.
*
* XXX Can we be smarter if we have an MCV list for just one side?
* It seems that if we assume equal distribution for the other
* side, we end up with the same answer anyway.
*/
if
(
nd1
>
nd2
)
selec
=
1
.
0
/
nd1
;
else
selec
=
1
.
0
/
nd2
;
}
if
(
have_mcvs1
)
free_attstatsslot
(
var1
->
vartype
,
values1
,
nvalues1
,
numbers1
,
nnumbers1
);
if
(
have_mcvs2
)
free_attstatsslot
(
var2
->
vartype
,
values2
,
nvalues2
,
numbers2
,
nnumbers2
);
if
(
HeapTupleIsValid
(
statsTuple1
))
ReleaseSysCache
(
statsTuple1
);
if
(
HeapTupleIsValid
(
statsTuple2
))
...
...
@@ -1039,14 +1183,30 @@ eqjoinsel(PG_FUNCTION_ARGS)
Datum
neqjoinsel
(
PG_FUNCTION_ARGS
)
{
Query
*
root
=
(
Query
*
)
PG_GETARG_POINTER
(
0
);
Oid
operator
=
PG_GETARG_OID
(
1
);
List
*
args
=
(
List
*
)
PG_GETARG_POINTER
(
2
);
Oid
eqop
;
float8
result
;
/*
* XXX we skip looking up the negator operator here because we know
* eqjoinsel() won't look at it anyway. If eqjoinsel() ever does
* look, this routine will need to look more like neqsel() does.
* We want 1 - eqjoinsel() where the equality operator is the one
* associated with this != operator, that is, its negator.
*/
result
=
DatumGetFloat8
(
eqjoinsel
(
fcinfo
));
eqop
=
get_negator
(
operator
);
if
(
eqop
)
{
result
=
DatumGetFloat8
(
DirectFunctionCall3
(
eqjoinsel
,
PointerGetDatum
(
root
),
ObjectIdGetDatum
(
eqop
),
PointerGetDatum
(
args
)));
}
else
{
/* Use default selectivity (should we raise an error instead?) */
result
=
DEFAULT_EQ_SEL
;
}
result
=
1
.
0
-
result
;
PG_RETURN_FLOAT8
(
result
);
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录