提交 5e2707c4 编写于 作者: T Teodor Sigaev

Snowball multibyte. It's a pity, but the snowball sources are very different for multibyte and

single-byte encodings, so we should have a snowball stemmer for every encoding.

I hope this finalizes the multibyte support work in tsearch2, but testing is needed...
上级 75c47471
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.12 2005/11/21 12:27:57 teodor Exp $ # $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.13 2006/01/27 16:32:31 teodor Exp $
MODULE_big = tsearch2 MODULE_big = tsearch2
OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \ OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
...@@ -16,7 +16,7 @@ OBJS += $(SUBDIROBJS) ...@@ -16,7 +16,7 @@ OBJS += $(SUBDIROBJS)
PG_CPPFLAGS = -I$(srcdir)/snowball -I$(srcdir)/ispell -I$(srcdir)/wordparser PG_CPPFLAGS = -I$(srcdir)/snowball -I$(srcdir)/ispell -I$(srcdir)/wordparser
DATA = stopword/english.stop stopword/russian.stop DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8
DATA_built = tsearch2.sql untsearch2.sql DATA_built = tsearch2.sql untsearch2.sql
DOCS = README.tsearch2 DOCS = README.tsearch2
REGRESS = tsearch2 REGRESS = tsearch2
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include "snowball/header.h" #include "snowball/header.h"
#include "snowball/english_stem.h" #include "snowball/english_stem.h"
#include "snowball/russian_stem.h" #include "snowball/russian_stem.h"
#include "snowball/russian_stem_UTF8.h"
#include "ts_locale.h" #include "ts_locale.h"
typedef struct typedef struct
...@@ -23,8 +24,11 @@ typedef struct ...@@ -23,8 +24,11 @@ typedef struct
PG_FUNCTION_INFO_V1(snb_en_init); PG_FUNCTION_INFO_V1(snb_en_init);
Datum snb_en_init(PG_FUNCTION_ARGS); Datum snb_en_init(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(snb_ru_init); PG_FUNCTION_INFO_V1(snb_ru_init_koi8);
Datum snb_ru_init(PG_FUNCTION_ARGS); Datum snb_ru_init_koi8(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(snb_ru_init_utf8);
Datum snb_ru_init_utf8(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(snb_lexize); PG_FUNCTION_INFO_V1(snb_lexize);
Datum snb_lexize(PG_FUNCTION_ARGS); Datum snb_lexize(PG_FUNCTION_ARGS);
...@@ -64,7 +68,7 @@ snb_en_init(PG_FUNCTION_ARGS) ...@@ -64,7 +68,7 @@ snb_en_init(PG_FUNCTION_ARGS)
} }
Datum Datum
snb_ru_init(PG_FUNCTION_ARGS) snb_ru_init_koi8(PG_FUNCTION_ARGS)
{ {
DictSnowball *d = (DictSnowball *) malloc(sizeof(DictSnowball)); DictSnowball *d = (DictSnowball *) malloc(sizeof(DictSnowball));
...@@ -97,6 +101,40 @@ snb_ru_init(PG_FUNCTION_ARGS) ...@@ -97,6 +101,40 @@ snb_ru_init(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(d); PG_RETURN_POINTER(d);
} }
/*
 * snb_ru_init_utf8
 *
 * Dictionary init function for the Russian Snowball stemmer working on
 * UTF-8 text.
 *
 * Argument 0 (optional, may be NULL): text value handed to readstoplist()
 * to load the stop-word list.
 *
 * Returns a pointer to a malloc'ed DictSnowball whose stemmer environment
 * (d->z) and stem callback (d->stem) are set to the russian_UTF_8 variants.
 *
 * Raises ERRCODE_OUT_OF_MEMORY if either the DictSnowball itself or the
 * stemmer environment cannot be allocated.
 */
Datum
snb_ru_init_utf8(PG_FUNCTION_ARGS)
{
	DictSnowball *d = (DictSnowball *) malloc(sizeof(DictSnowball));

	if (!d)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));

	memset(d, 0, sizeof(DictSnowball));
	d->stoplist.wordop = lowerstr;

	/* Load and sort the stop-word list only when an argument was supplied. */
	if (!PG_ARGISNULL(0) && PG_GETARG_POINTER(0) != NULL)
	{
		text	   *in = PG_GETARG_TEXT_P(0);

		readstoplist(in, &(d->stoplist));
		sortstoplist(&(d->stoplist));
		PG_FREE_IF_COPY(in, 0);
	}

	d->z = russian_UTF_8_create_env();
	if (!d->z)
	{
		/*
		 * ereport(ERROR) does not return, so everything obtained with
		 * malloc() must be released here: the stop list and the dictionary
		 * struct itself (previously leaked on this path).
		 */
		freestoplist(&(d->stoplist));
		free(d);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	}
	d->stem = russian_UTF_8_stem;

	PG_RETURN_POINTER(d);
}
Datum Datum
snb_lexize(PG_FUNCTION_ARGS) snb_lexize(PG_FUNCTION_ARGS)
{ {
......
...@@ -4,21 +4,21 @@ ...@@ -4,21 +4,21 @@
-- --
\set ECHO none \set ECHO none
psql:tsearch2.sql:13: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_dict_pkey" for table "pg_ts_dict" psql:tsearch2.sql:13: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_dict_pkey" for table "pg_ts_dict"
psql:tsearch2.sql:145: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_parser_pkey" for table "pg_ts_parser" psql:tsearch2.sql:158: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_parser_pkey" for table "pg_ts_parser"
psql:tsearch2.sql:244: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfg_pkey" for table "pg_ts_cfg" psql:tsearch2.sql:257: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfg_pkey" for table "pg_ts_cfg"
psql:tsearch2.sql:251: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfgmap_pkey" for table "pg_ts_cfgmap" psql:tsearch2.sql:264: NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "pg_ts_cfgmap_pkey" for table "pg_ts_cfgmap"
psql:tsearch2.sql:337: NOTICE: type "tsvector" is not yet defined psql:tsearch2.sql:370: NOTICE: type "tsvector" is not yet defined
DETAIL: Creating a shell type definition. DETAIL: Creating a shell type definition.
psql:tsearch2.sql:342: NOTICE: argument type tsvector is only a shell psql:tsearch2.sql:375: NOTICE: argument type tsvector is only a shell
psql:tsearch2.sql:396: NOTICE: type "tsquery" is not yet defined psql:tsearch2.sql:429: NOTICE: type "tsquery" is not yet defined
DETAIL: Creating a shell type definition. DETAIL: Creating a shell type definition.
psql:tsearch2.sql:401: NOTICE: argument type tsquery is only a shell psql:tsearch2.sql:434: NOTICE: argument type tsquery is only a shell
psql:tsearch2.sql:559: NOTICE: type "gtsvector" is not yet defined psql:tsearch2.sql:592: NOTICE: type "gtsvector" is not yet defined
DETAIL: Creating a shell type definition. DETAIL: Creating a shell type definition.
psql:tsearch2.sql:564: NOTICE: argument type gtsvector is only a shell psql:tsearch2.sql:597: NOTICE: argument type gtsvector is only a shell
psql:tsearch2.sql:1054: NOTICE: type "gtsq" is not yet defined psql:tsearch2.sql:1087: NOTICE: type "gtsq" is not yet defined
DETAIL: Creating a shell type definition. DETAIL: Creating a shell type definition.
psql:tsearch2.sql:1059: NOTICE: argument type gtsq is only a shell psql:tsearch2.sql:1092: NOTICE: argument type gtsq is only a shell
--tsvector --tsvector
SELECT '1'::tsvector; SELECT '1'::tsvector;
tsvector tsvector
......
# $PostgreSQL: pgsql/contrib/tsearch2/snowball/Makefile,v 1.8 2005/10/18 01:30:48 tgl Exp $ # $PostgreSQL: pgsql/contrib/tsearch2/snowball/Makefile,v 1.9 2006/01/27 16:32:31 teodor Exp $
SUBOBJS = english_stem.o api.o russian_stem.o utilities.o SUBOBJS = english_stem.o api.o russian_stem.o russian_stem_UTF8.o utilities.o
EXTRA_CLEAN = SUBSYS.o $(SUBOBJS) EXTRA_CLEAN = SUBSYS.o $(SUBOBJS)
......
此差异已折叠。
/* This file was generated automatically by the Snowball to ANSI C compiler */
/* Public entry points of the Russian (UTF-8) Snowball stemmer. */
/* The guard below gives the declarations C linkage when compiled as C++. */
#ifdef __cplusplus
extern "C" {
#endif
/* Creates a stemmer environment; callers must check for a NULL result. */
extern struct SN_env * russian_UTF_8_create_env(void);
/* NOTE(review): presumably releases an environment obtained from
 * russian_UTF_8_create_env() -- confirm against the generated .c file. */
extern void russian_UTF_8_close_env(struct SN_env * z);
/* Runs the stemmer on environment z. NOTE(review): the meaning of the int
 * result is not visible here -- confirm against the Snowball runtime. */
extern int russian_UTF_8_stem(struct SN_env * z);
#ifdef __cplusplus
}
#endif
и
в
во
не
что
он
на
я
с
со
как
а
то
все
она
так
его
но
да
ты
к
у
же
вы
за
бы
по
только
ее
мне
было
вот
от
меня
еще
нет
о
из
ему
теперь
когда
даже
ну
вдруг
ли
если
уже
или
ни
быть
был
него
до
вас
нибудь
опять
уж
вам
ведь
там
потом
себя
ничего
ей
может
они
тут
где
есть
надо
ней
для
мы
тебя
их
чем
была
сам
чтоб
без
будто
чего
раз
тоже
себе
под
будет
ж
тогда
кто
этот
того
потому
этого
какой
совсем
ним
здесь
этом
один
почти
мой
тем
чтобы
нее
сейчас
были
куда
зачем
всех
никогда
можно
при
наконец
два
об
другой
хоть
после
над
больше
тот
через
эти
нас
про
всего
них
какая
много
разве
три
эту
моя
впрочем
хорошо
свою
этой
перед
иногда
лучше
чуть
том
нельзя
такой
им
более
всегда
конечно
всю
между
...@@ -82,17 +82,30 @@ insert into pg_ts_dict select ...@@ -82,17 +82,30 @@ insert into pg_ts_dict select
'English Stemmer. Snowball.' 'English Stemmer. Snowball.'
; ;
CREATE FUNCTION snb_ru_init(internal) CREATE FUNCTION snb_ru_init_koi8(internal)
returns internal returns internal
as 'MODULE_PATHNAME' as 'MODULE_PATHNAME'
language 'C'; language 'C';
insert into pg_ts_dict select insert into pg_ts_dict select
'ru_stem', 'ru_stem_koi8',
'snb_ru_init(internal)', 'snb_ru_init_koi8(internal)',
'contrib/russian.stop', 'contrib/russian.stop',
'snb_lexize(internal,internal,int4)', 'snb_lexize(internal,internal,int4)',
'Russian Stemmer. Snowball.' 'Russian Stemmer. Snowball. KOI8 Encoding'
;
-- Init function of the Russian Snowball stemmer for UTF-8 text
-- (implemented in C; MODULE_PATHNAME is substituted at build time).
CREATE FUNCTION snb_ru_init_utf8(internal)
	RETURNS internal
	AS 'MODULE_PATHNAME'
	LANGUAGE 'C';

-- Register the UTF-8 Russian stemmer as dictionary 'ru_stem_utf8',
-- using the UTF-8 encoded stop-word file.
INSERT INTO pg_ts_dict
SELECT
	'ru_stem_utf8',
	'snb_ru_init_utf8(internal)',
	'contrib/russian.stop.utf8',
	'snb_lexize(internal,internal,int4)',
	'Russian Stemmer. Snowball. UTF8 Encoding'
;
CREATE FUNCTION spell_init(internal) CREATE FUNCTION spell_init(internal)
...@@ -270,6 +283,7 @@ CREATE FUNCTION show_curcfg() ...@@ -270,6 +283,7 @@ CREATE FUNCTION show_curcfg()
insert into pg_ts_cfg values ('default', 'default','C'); insert into pg_ts_cfg values ('default', 'default','C');
insert into pg_ts_cfg values ('default_russian', 'default','ru_RU.KOI8-R'); insert into pg_ts_cfg values ('default_russian', 'default','ru_RU.KOI8-R');
-- Adds the 'utf8_russian' configuration; its pg_ts_cfgmap rows send Russian
-- word tokens to the ru_stem_utf8 dictionary.
-- NOTE(review): the third value looks like the locale this configuration is
-- selected for -- confirm against the pg_ts_cfg table definition.
insert into pg_ts_cfg values ('utf8_russian', 'default','ru_RU.UTF-8');
insert into pg_ts_cfg values ('simple', 'default'); insert into pg_ts_cfg values ('simple', 'default');
insert into pg_ts_cfgmap values ('default', 'lword', '{en_stem}'); insert into pg_ts_cfgmap values ('default', 'lword', '{en_stem}');
...@@ -292,24 +306,43 @@ insert into pg_ts_cfgmap values ('default', 'float', '{simple}'); ...@@ -292,24 +306,43 @@ insert into pg_ts_cfgmap values ('default', 'float', '{simple}');
insert into pg_ts_cfgmap values ('default', 'int', '{simple}'); insert into pg_ts_cfgmap values ('default', 'int', '{simple}');
insert into pg_ts_cfgmap values ('default', 'uint', '{simple}'); insert into pg_ts_cfgmap values ('default', 'uint', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'lword', '{en_stem}'); insert into pg_ts_cfgmap values ('default_russian', 'lword', '{en_stem}');
insert into pg_ts_cfgmap values ('default_russian', 'nlword', '{ru_stem}'); insert into pg_ts_cfgmap values ('default_russian', 'nlword', '{ru_stem_koi8}');
insert into pg_ts_cfgmap values ('default_russian', 'word', '{ru_stem}'); insert into pg_ts_cfgmap values ('default_russian', 'word', '{ru_stem_koi8}');
insert into pg_ts_cfgmap values ('default_russian', 'email', '{simple}'); insert into pg_ts_cfgmap values ('default_russian', 'email', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'url', '{simple}'); insert into pg_ts_cfgmap values ('default_russian', 'url', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'host', '{simple}'); insert into pg_ts_cfgmap values ('default_russian', 'host', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'sfloat', '{simple}'); insert into pg_ts_cfgmap values ('default_russian', 'sfloat', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'version', '{simple}'); insert into pg_ts_cfgmap values ('default_russian', 'version', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'part_hword', '{simple}'); insert into pg_ts_cfgmap values ('default_russian', 'part_hword', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'nlpart_hword', '{ru_stem}'); insert into pg_ts_cfgmap values ('default_russian', 'nlpart_hword', '{ru_stem_koi8}');
insert into pg_ts_cfgmap values ('default_russian', 'lpart_hword', '{en_stem}'); insert into pg_ts_cfgmap values ('default_russian', 'lpart_hword', '{en_stem}');
insert into pg_ts_cfgmap values ('default_russian', 'hword', '{ru_stem}'); insert into pg_ts_cfgmap values ('default_russian', 'hword', '{ru_stem_koi8}');
insert into pg_ts_cfgmap values ('default_russian', 'lhword', '{en_stem}'); insert into pg_ts_cfgmap values ('default_russian', 'lhword', '{en_stem}');
insert into pg_ts_cfgmap values ('default_russian', 'nlhword', '{ru_stem}'); insert into pg_ts_cfgmap values ('default_russian', 'nlhword', '{ru_stem_koi8}');
insert into pg_ts_cfgmap values ('default_russian', 'uri', '{simple}'); insert into pg_ts_cfgmap values ('default_russian', 'uri', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'file', '{simple}'); insert into pg_ts_cfgmap values ('default_russian', 'file', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'float', '{simple}'); insert into pg_ts_cfgmap values ('default_russian', 'float', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'int', '{simple}'); insert into pg_ts_cfgmap values ('default_russian', 'int', '{simple}');
insert into pg_ts_cfgmap values ('default_russian', 'uint', '{simple}'); insert into pg_ts_cfgmap values ('default_russian', 'uint', '{simple}');
-- Token-type -> dictionary mapping for the 'utf8_russian' configuration.
-- Latin word tokens go to en_stem, non-Latin/mixed word tokens go to
-- ru_stem_utf8, and every other token type goes to the 'simple' dictionary.
-- One row per statement (no multi-row VALUES) to match the rest of this script.
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'lword',        '{en_stem}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'nlword',       '{ru_stem_utf8}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'word',         '{ru_stem_utf8}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'email',        '{simple}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'url',          '{simple}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'host',         '{simple}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'sfloat',       '{simple}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'version',      '{simple}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'part_hword',   '{simple}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'nlpart_hword', '{ru_stem_utf8}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'lpart_hword',  '{en_stem}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'hword',        '{ru_stem_utf8}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'lhword',       '{en_stem}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'nlhword',      '{ru_stem_utf8}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'uri',          '{simple}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'file',         '{simple}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'float',        '{simple}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'int',          '{simple}');
INSERT INTO pg_ts_cfgmap VALUES ('utf8_russian', 'uint',         '{simple}');
insert into pg_ts_cfgmap values ('simple', 'lword', '{simple}'); insert into pg_ts_cfgmap values ('simple', 'lword', '{simple}');
insert into pg_ts_cfgmap values ('simple', 'nlword', '{simple}'); insert into pg_ts_cfgmap values ('simple', 'nlword', '{simple}');
insert into pg_ts_cfgmap values ('simple', 'word', '{simple}'); insert into pg_ts_cfgmap values ('simple', 'word', '{simple}');
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册