Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Gpdb
提交
b525bf77
G
Gpdb
项目概览
Greenplum
/
Gpdb
通知
7
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
G
Gpdb
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
b525bf77
编写于
12月 04, 2010
作者:
T
Tom Lane
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add KNNGIST support to contrib/pg_trgm.
Teodor Sigaev, with some revision by Tom
上级
b576757d
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
213 addition
and
42 deletion
+213
-42
contrib/pg_trgm/expected/pg_trgm.out
contrib/pg_trgm/expected/pg_trgm.out
+23
-0
contrib/pg_trgm/pg_trgm.sql.in
contrib/pg_trgm/pg_trgm.sql.in
+20
-1
contrib/pg_trgm/sql/pg_trgm.sql
contrib/pg_trgm/sql/pg_trgm.sql
+4
-0
contrib/pg_trgm/trgm.h
contrib/pg_trgm/trgm.h
+6
-4
contrib/pg_trgm/trgm_gin.c
contrib/pg_trgm/trgm_gin.c
+3
-0
contrib/pg_trgm/trgm_gist.c
contrib/pg_trgm/trgm_gist.c
+110
-30
contrib/pg_trgm/trgm_op.c
contrib/pg_trgm/trgm_op.c
+20
-6
contrib/pg_trgm/uninstall_pg_trgm.sql
contrib/pg_trgm/uninstall_pg_trgm.sql
+6
-0
doc/src/sgml/pgtrgm.sgml
doc/src/sgml/pgtrgm.sgml
+21
-1
未找到文件。
contrib/pg_trgm/expected/pg_trgm.out
浏览文件 @
b525bf77
...
...
@@ -1187,6 +1187,13 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
qwertyu0988 | 0.333333
(1 row)
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
?column? | t
----------+-------------
0.411765 | qwertyu0988
0.5 | qwertyu0987
(2 rows)
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
set enable_seqscan=off;
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
...
...
@@ -2315,6 +2322,22 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
qwertyu0988 | 0.333333
(1 row)
explain (costs off)
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
QUERY PLAN
---------------------------------------------------
Limit
-> Index Scan using trgm_idx on test_trgm
Order By: (t <-> 'q0987wertyu0988'::text)
(3 rows)
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
?column? | t
----------+-------------
0.411765 | qwertyu0988
0.5 | qwertyu0987
(2 rows)
drop index trgm_idx;
create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
set enable_seqscan=off;
...
...
contrib/pg_trgm/pg_trgm.sql.in
浏览文件 @
b525bf77
...
...
@@ -26,7 +26,7 @@ LANGUAGE C STRICT IMMUTABLE;
CREATE OR REPLACE FUNCTION similarity_op(text,text)
RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;
LANGUAGE C STRICT STABLE;
-- stable because depends on trgm_limit
CREATE OPERATOR % (
LEFTARG = text,
...
...
@@ -37,6 +37,18 @@ CREATE OPERATOR % (
JOIN = contjoinsel
);
CREATE OR REPLACE FUNCTION similarity_dist(text,text)
RETURNS float4
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE;
CREATE OPERATOR <-> (
LEFTARG = text,
RIGHTARG = text,
PROCEDURE = similarity_dist,
COMMUTATOR = '<->'
);
-- gist key
CREATE OR REPLACE FUNCTION gtrgm_in(cstring)
RETURNS gtrgm
...
...
@@ -60,6 +72,11 @@ RETURNS bool
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION gtrgm_distance(internal,text,int,oid)
RETURNS float8
AS 'MODULE_PATHNAME'
LANGUAGE C IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION gtrgm_compress(internal)
RETURNS internal
AS 'MODULE_PATHNAME'
...
...
@@ -95,6 +112,7 @@ CREATE OPERATOR CLASS gist_trgm_ops
FOR TYPE text USING gist
AS
OPERATOR 1 % (text, text),
OPERATOR 2 <-> (text, text) FOR ORDER BY pg_catalog.float_ops,
FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal),
FUNCTION 2 gtrgm_union (bytea, internal),
FUNCTION 3 gtrgm_compress (internal),
...
...
@@ -102,6 +120,7 @@ AS
FUNCTION 5 gtrgm_penalty (internal, internal, internal),
FUNCTION 6 gtrgm_picksplit (internal, internal),
FUNCTION 7 gtrgm_same (gtrgm, gtrgm, internal),
FUNCTION 8 gtrgm_distance (internal, text, int, oid),
STORAGE gtrgm;
-- support functions for gin
...
...
contrib/pg_trgm/sql/pg_trgm.sql
浏览文件 @
b525bf77
...
...
@@ -26,6 +26,7 @@ CREATE TABLE test_trgm(t text);
select t,similarity(t,'
qwertyu0988
') as sml from test_trgm where t % '
qwertyu0988
' order by sml desc, t;
select t,similarity(t,'
gwertyu0988
') as sml from test_trgm where t % '
gwertyu0988
' order by sml desc, t;
select t,similarity(t,'
gwertyu1988
') as sml from test_trgm where t % '
gwertyu1988
' order by sml desc, t;
select t <-> '
q0987wertyu0988
', t from test_trgm order by t <-> '
q0987wertyu0988
' limit 2;
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
set enable_seqscan=off;
...
...
@@ -33,6 +34,9 @@ set enable_seqscan=off;
select t,similarity(t,'
qwertyu0988
') as sml from test_trgm where t % '
qwertyu0988
' order by sml desc, t;
select t,similarity(t,'
gwertyu0988
') as sml from test_trgm where t % '
gwertyu0988
' order by sml desc, t;
select t,similarity(t,'
gwertyu1988
') as sml from test_trgm where t % '
gwertyu1988
' order by sml desc, t;
explain (costs off)
select t <-> '
q0987wertyu0988
', t from test_trgm order by t <-> '
q0987wertyu0988
' limit 2;
select t <-> '
q0987wertyu0988
', t from test_trgm order by t <-> '
q0987wertyu0988
' limit 2;
drop index trgm_idx;
create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
...
...
contrib/pg_trgm/trgm.h
浏览文件 @
b525bf77
...
...
@@ -4,12 +4,10 @@
#ifndef __TRGM_H__
#define __TRGM_H__
#include "postgres.h"
#include "access/gist.h"
#include "access/itup.h"
#include "utils/builtins.h"
#include "storage/bufpage.h"
#include "utils/builtins.h"
/* options */
#define LPADDING 2
...
...
@@ -18,6 +16,10 @@
#define IGNORECASE
#define DIVUNION
/* operator strategy numbers */
#define SimilarityStrategyNumber 1
#define DistanceStrategyNumber 2
typedef
char
trgm
[
3
];
...
...
@@ -89,4 +91,4 @@ extern float4 trgm_limit;
TRGM
*
generate_trgm
(
char
*
str
,
int
slen
);
float4
cnt_sml
(
TRGM
*
trg1
,
TRGM
*
trg2
);
#endif
#endif
/* __TRGM_H__ */
contrib/pg_trgm/trgm_gin.c
浏览文件 @
b525bf77
/*
* contrib/pg_trgm/trgm_gin.c
*/
#include "postgres.h"
#include "trgm.h"
#include "access/gin.h"
...
...
@@ -10,6 +12,7 @@
#include "utils/array.h"
#include "utils/builtins.h"
PG_FUNCTION_INFO_V1
(
gin_extract_trgm
);
Datum
gin_extract_trgm
(
PG_FUNCTION_ARGS
);
...
...
contrib/pg_trgm/trgm_gist.c
浏览文件 @
b525bf77
/*
* contrib/pg_trgm/trgm_gist.c
*/
#include "postgres.h"
#include "trgm.h"
#include "access/gist.h"
#include "access/itup.h"
#include "access/skey.h"
#include "access/tuptoaster.h"
#include "storage/bufpage.h"
#include "utils/array.h"
#include "utils/builtins.h"
PG_FUNCTION_INFO_V1
(
gtrgm_in
);
Datum
gtrgm_in
(
PG_FUNCTION_ARGS
);
...
...
@@ -25,6 +29,9 @@ Datum gtrgm_decompress(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1
(
gtrgm_consistent
);
Datum
gtrgm_consistent
(
PG_FUNCTION_ARGS
);
PG_FUNCTION_INFO_V1
(
gtrgm_distance
);
Datum
gtrgm_distance
(
PG_FUNCTION_ARGS
);
PG_FUNCTION_INFO_V1
(
gtrgm_union
);
Datum
gtrgm_union
(
PG_FUNCTION_ARGS
);
...
...
@@ -159,18 +166,35 @@ gtrgm_decompress(PG_FUNCTION_ARGS)
}
}
static
int4
cnt_sml_sign_common
(
TRGM
*
qtrg
,
BITVECP
sign
)
{
int4
count
=
0
;
int4
k
,
len
=
ARRNELEM
(
qtrg
);
trgm
*
ptr
=
GETARR
(
qtrg
);
int4
tmp
=
0
;
for
(
k
=
0
;
k
<
len
;
k
++
)
{
CPTRGM
(((
char
*
)
&
tmp
),
ptr
+
k
);
count
+=
GETBIT
(
sign
,
HASHVAL
(
tmp
));
}
return
count
;
}
Datum
gtrgm_consistent
(
PG_FUNCTION_ARGS
)
{
GISTENTRY
*
entry
=
(
GISTENTRY
*
)
PG_GETARG_POINTER
(
0
);
text
*
query
=
PG_GETARG_TEXT_P
(
1
);
/* StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2); */
StrategyNumber
strategy
=
(
StrategyNumber
)
PG_GETARG_UINT16
(
2
);
/* Oid subtype = PG_GETARG_OID(3); */
bool
*
recheck
=
(
bool
*
)
PG_GETARG_POINTER
(
4
);
TRGM
*
key
=
(
TRGM
*
)
DatumGetPointer
(
entry
->
key
);
TRGM
*
qtrg
;
bool
res
=
false
;
bool
res
;
char
*
cache
=
(
char
*
)
fcinfo
->
flinfo
->
fn_extra
;
/* All cases served by this function are exact */
...
...
@@ -193,39 +217,95 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
qtrg
=
(
TRGM
*
)
(
cache
+
MAXALIGN
(
VARSIZE
(
query
)));
if
(
GIST_LEAF
(
entry
))
{
/* all leafs contains orig trgm */
float4
tmpsml
=
cnt_sml
(
key
,
qtrg
);
switch
(
strategy
)
{
case
SimilarityStrategyNumber
:
if
(
GIST_LEAF
(
entry
))
{
/* all leafs contains orig trgm */
float4
tmpsml
=
cnt_sml
(
key
,
qtrg
);
/* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
res
=
(
*
(
int
*
)
&
tmpsml
==
*
(
int
*
)
&
trgm_limit
||
tmpsml
>
trgm_limit
)
?
true
:
false
;
/* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
res
=
(
*
(
int
*
)
&
tmpsml
==
*
(
int
*
)
&
trgm_limit
||
tmpsml
>
trgm_limit
)
?
true
:
false
;
}
else
if
(
ISALLTRUE
(
key
))
{
/* non-leaf contains signature */
res
=
true
;
}
else
{
/* non-leaf contains signature */
int4
count
=
cnt_sml_sign_common
(
qtrg
,
GETSIGN
(
key
));
int4
len
=
ARRNELEM
(
qtrg
);
if
(
len
==
0
)
res
=
false
;
else
res
=
(((((
float8
)
count
)
/
((
float8
)
len
)))
>=
trgm_limit
)
?
true
:
false
;
}
break
;
default:
elog
(
ERROR
,
"unrecognized strategy number: %d"
,
strategy
);
res
=
false
;
/* keep compiler quiet */
break
;
}
else
if
(
ISALLTRUE
(
key
))
{
/* non-leaf contains signature */
res
=
true
;
PG_RETURN_BOOL
(
res
);
}
Datum
gtrgm_distance
(
PG_FUNCTION_ARGS
)
{
GISTENTRY
*
entry
=
(
GISTENTRY
*
)
PG_GETARG_POINTER
(
0
);
text
*
query
=
PG_GETARG_TEXT_P
(
1
);
StrategyNumber
strategy
=
(
StrategyNumber
)
PG_GETARG_UINT16
(
2
);
/* Oid subtype = PG_GETARG_OID(3); */
TRGM
*
key
=
(
TRGM
*
)
DatumGetPointer
(
entry
->
key
);
TRGM
*
qtrg
;
float8
res
;
char
*
cache
=
(
char
*
)
fcinfo
->
flinfo
->
fn_extra
;
if
(
cache
==
NULL
||
VARSIZE
(
cache
)
!=
VARSIZE
(
query
)
||
memcmp
(
cache
,
query
,
VARSIZE
(
query
))
!=
0
)
{
qtrg
=
generate_trgm
(
VARDATA
(
query
),
VARSIZE
(
query
)
-
VARHDRSZ
);
if
(
cache
)
pfree
(
cache
);
fcinfo
->
flinfo
->
fn_extra
=
MemoryContextAlloc
(
fcinfo
->
flinfo
->
fn_mcxt
,
MAXALIGN
(
VARSIZE
(
query
))
+
VARSIZE
(
qtrg
));
cache
=
(
char
*
)
fcinfo
->
flinfo
->
fn_extra
;
memcpy
(
cache
,
query
,
VARSIZE
(
query
));
memcpy
(
cache
+
MAXALIGN
(
VARSIZE
(
query
)),
qtrg
,
VARSIZE
(
qtrg
));
}
else
{
/* non-leaf contains signature */
int4
count
=
0
;
int4
k
,
len
=
ARRNELEM
(
qtrg
);
trgm
*
ptr
=
GETARR
(
qtrg
);
BITVECP
sign
=
GETSIGN
(
key
);
int4
tmp
=
0
;
for
(
k
=
0
;
k
<
len
;
k
++
)
{
CPTRGM
(((
char
*
)
&
tmp
),
ptr
+
k
);
count
+=
GETBIT
(
sign
,
HASHVAL
(
tmp
));
}
#ifdef DIVUNION
res
=
(
len
==
count
)
?
true
:
((((((
float4
)
count
)
/
((
float4
)
(
len
-
count
))))
>=
trgm_limit
)
?
true
:
false
);
#else
res
=
(
len
==
0
)
?
false
:
((((((
float4
)
count
)
/
((
float4
)
len
)))
>=
trgm_limit
)
?
true
:
false
);
#endif
qtrg
=
(
TRGM
*
)
(
cache
+
MAXALIGN
(
VARSIZE
(
query
)));
switch
(
strategy
)
{
case
DistanceStrategyNumber
:
if
(
GIST_LEAF
(
entry
))
{
/* all leafs contains orig trgm */
res
=
1
.
0
-
cnt_sml
(
key
,
qtrg
);
}
else
if
(
ISALLTRUE
(
key
))
{
/* all leafs contains orig trgm */
res
=
0
.
0
;
}
else
{
/* non-leaf contains signature */
int4
count
=
cnt_sml_sign_common
(
qtrg
,
GETSIGN
(
key
));
int4
len
=
ARRNELEM
(
qtrg
);
res
=
(
len
==
0
)
?
-
1
.
0
:
1
.
0
-
((
float8
)
count
)
/
((
float8
)
len
);
}
break
;
default:
elog
(
ERROR
,
"unrecognized strategy number: %d"
,
strategy
);
res
=
0
;
/* keep compiler quiet */
break
;
}
PG_RETURN_
BOOL
(
res
);
PG_RETURN_
FLOAT8
(
res
);
}
static
int4
...
...
contrib/pg_trgm/trgm_op.c
浏览文件 @
b525bf77
/*
* contrib/pg_trgm/trgm_op.c
*/
#include "trgm.h"
#include "postgres.h"
#include <ctype.h>
#include "utils/array.h"
#include "trgm.h"
#include "catalog/pg_type.h"
#include "tsearch/ts_locale.h"
#include "utils/array.h"
PG_MODULE_MAGIC
;
...
...
@@ -359,16 +364,25 @@ similarity(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT4
(
res
);
}
PG_FUNCTION_INFO_V1
(
similarity_dist
);
Datum
similarity_dist
(
PG_FUNCTION_ARGS
);
Datum
similarity_dist
(
PG_FUNCTION_ARGS
)
{
float4
res
=
DatumGetFloat4
(
DirectFunctionCall2
(
similarity
,
PG_GETARG_DATUM
(
0
),
PG_GETARG_DATUM
(
1
)));
PG_RETURN_FLOAT4
(
1
.
0
-
res
);
}
PG_FUNCTION_INFO_V1
(
similarity_op
);
Datum
similarity_op
(
PG_FUNCTION_ARGS
);
Datum
similarity_op
(
PG_FUNCTION_ARGS
)
{
float4
res
=
DatumGetFloat4
(
DirectFunctionCall2
(
similarity
,
float4
res
=
DatumGetFloat4
(
DirectFunctionCall2
(
similarity
,
PG_GETARG_DATUM
(
0
),
PG_GETARG_DATUM
(
1
)
));
PG_GETARG_DATUM
(
1
)));
PG_RETURN_BOOL
(
res
>=
trgm_limit
);
}
contrib/pg_trgm/uninstall_pg_trgm.sql
浏览文件 @
b525bf77
...
...
@@ -19,6 +19,8 @@ DROP FUNCTION gtrgm_compress(internal);
DROP
FUNCTION
gtrgm_consistent
(
internal
,
text
,
int
,
oid
,
internal
);
DROP
FUNCTION
gtrgm_distance
(
internal
,
text
,
int
,
oid
);
DROP
TYPE
gtrgm
CASCADE
;
DROP
OPERATOR
CLASS
gin_trgm_ops
USING
gin
;
...
...
@@ -33,6 +35,10 @@ DROP OPERATOR % (text, text);
DROP
FUNCTION
similarity_op
(
text
,
text
);
DROP
OPERATOR
<->
(
text
,
text
);
DROP
FUNCTION
similarity_dist
(
text
,
text
);
DROP
FUNCTION
similarity
(
text
,
text
);
DROP
FUNCTION
show_trgm
(
text
);
...
...
doc/src/sgml/pgtrgm.sgml
浏览文件 @
b525bf77
...
...
@@ -117,6 +117,14 @@
<function>set_limit</>.
</entry>
</row>
<row>
<entry><type>text</> <literal><-></literal> <type>text</></entry>
<entry><type>real</type></entry>
<entry>
Returns the <quote>distance</> between the arguments, that is
one minus the <function>similarity()</> value.
</entry>
</row>
</tbody>
</tgroup>
</table>
...
...
@@ -129,7 +137,7 @@
The <filename>pg_trgm</filename> module provides GiST and GIN index
operator classes that allow you to create an index over a text column for
the purpose of very fast similarity searches. These index types support
the
<literal>%</> similarity operator
(and no other operators, so you may
the
above-described similarity operators
(and no other operators, so you may
want a regular B-tree index too).
</para>
...
...
@@ -161,6 +169,18 @@ SELECT t, similarity(t, '<replaceable>word</>') AS sml
sets.
</para>
<para>
A variant of the above query is
<programlisting>
SELECT t, t <-> '<replaceable>word</>' AS dist
FROM test_trgm
ORDER BY dist LIMIT 10;
</programlisting>
This can be implemented quite efficiently by GiST indexes, but not
by GIN indexes. It will usually beat the first formulation when only
a small number of the closest matches is wanted.
</para>
<para>
The choice between GiST and GIN indexing depends on the relative
performance characteristics of GiST and GIN, which are discussed elsewhere.
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录