From 661ecf3c48e16a9add216287eb969d7615e47968 Mon Sep 17 00:00:00 2001 From: "Marc G. Fournier" Date: Sun, 15 Mar 1998 07:39:04 +0000 Subject: [PATCH] From: t-ishii@sra.co.jp Included are patches intended for allowing PostgreSQL to handle multi-byte character sets such as EUC (Extended Unix Code), Unicode and Mule internal code. With the MB patch you can use multi-byte character sets in regexp and LIKE. The encoding system is chosen at compile time. To enable the MB extension, you need to define a variable "MB" in Makefile.global or in Makefile.custom. For further information please take a look at README.mb under the doc directory. (Note that unlike the "jp patch" I do not use a modified GNU regexp any more. I changed the Henry Spencer regexp that comes with PostgreSQL.) --- doc/Makefile | 4 +- doc/README.mb | 67 +++++++ doc/README.mb.jp | 106 ++++++++++ src/backend/regex/Makefile | 6 +- src/backend/regex/engine.c | 170 +++++++++------- src/backend/regex/regcomp.c | 178 +++++++++++++---- src/backend/regex/regerror.c | 5 + src/backend/regex/regexec.c | 25 ++- src/backend/regex/regfree.c | 6 +- src/backend/regex/utftest.c | 33 ++++ src/backend/regex/utils.c | 348 +++++++++++++++++++++++++++++++++ src/backend/regex/wstrcmp.c | 48 +++++ src/backend/regex/wstrncmp.c | 83 ++++++++ src/backend/utils/adt/Makefile | 5 +- src/backend/utils/adt/like.c | 29 ++- src/configure | 8 +- src/configure.in | 8 +- src/include/regex/pg_wchar.h | 44 +++++ src/include/regex/regex.h | 7 +- src/include/regex/regex2.h | 42 +++- src/include/regex/utils.h | 5 + src/test/regress/GNUmakefile | 3 +- src/test/regress/regress.sh | 9 +- 23 files changed, 1104 insertions(+), 135 deletions(-) create mode 100644 doc/README.mb create mode 100644 doc/README.mb.jp create mode 100644 src/backend/regex/utftest.c create mode 100644 src/backend/regex/utils.c create mode 100644 src/backend/regex/wstrcmp.c create mode 100644 src/backend/regex/wstrncmp.c create mode 100644 src/include/regex/pg_wchar.h diff --git 
a/doc/Makefile b/doc/Makefile index c487b11f73..8642dc4b5f 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -8,11 +8,11 @@ # # # IDENTIFICATION -# $Header: /cvsroot/pgsql/doc/Makefile,v 1.4 1998/03/01 20:37:44 thomas Exp $ +# $Header: /cvsroot/pgsql/doc/Makefile,v 1.5 1998/03/15 07:37:51 scrappy Exp $ # #---------------------------------------------------------------------------- -PGDOCS= . +PGDOCS= /usr/local/cdrom/docs SRCDIR= ../src TAR= tar diff --git a/doc/README.mb b/doc/README.mb new file mode 100644 index 0000000000..d6ff7e569b --- /dev/null +++ b/doc/README.mb @@ -0,0 +1,67 @@ +postgresql 6.3 multi-byte(MB) patch PL2 README Mar 10 1998 + + Tatsuo Ishii + t-ishii@sra.co.jp + http://www.sra.co.jp/people/t-ishii/PostgreSQL/ + +Introduction + +MB patch is intended for allowing PostgreSQL to handle multi-byte +character sets such as EUC (Extended Unix Code), Unicode and Mule +internal code. With the MB patch you can use multi-byte character sets +in regexp and LIKE. The encoding system is chosen at +compile time. + +The patch also fixes some problems concerning 8-bit single byte +character sets including ISO8859. (I would not say all of the problems +have been fixed. I just confirmed that the regression test ran fine +and a few French characters could be used with the patch. Please let +me know if you find any problem while using 8-bit characters) + +How to use + +After applying the MB patch, create src/Makefile.custom with a line +including: + +MB=encoding_system + +where encoding_system is one of: + +EUC_JP Japanese EUC +EUC_CN Chinese EUC +EUC_KR Korean EUC +EUC_TW Taiwan EUC +UNICODE Unicode(UTF-8) +MULE_INTERNAL Mule internal + +Example: + +% cat Makefile.custom +MB=EUC_JP + +If MB is not defined, nothing is changed except better support for +8-bit single byte character sets. + +References + +These are good sources to start learning various kinds of encoding +systems. 
+ +ftp://ftp.ora.com/pub/examples/nutshell/ujip/doc/cjk.inf + Detailed explanations of EUC_JP, EUC_CN, EUC_KR, EUC_TW + appear in section 3.2. + +Unicode: http://www.unicode.org/ + The homepage of UNICODE. + + RFC 2044 + UTF-8 is defined here. + +History + +Mar 10, 1998 PL2 released + * add regression test for EUC_JP, EUC_CN and MULE_INTERNAL + * add an English document (this file) + * fix problems concerning 8-bit single byte characters + +Mar 1, 1998 PL1 released diff --git a/doc/README.mb.jp b/doc/README.mb.jp new file mode 100644 index 0000000000..00375000b5 --- /dev/null +++ b/doc/README.mb.jp @@ -0,0 +1,106 @@ +postgresql 6.3 multi-byte (MB) patch PL2 README 1998/3/10 作成 + + 石井達夫 + t-ishii@sra.co.jp + http://www.sra.co.jp/people/t-ishii/PostgreSQL/ + +はじめに: + このパッチは、フリーな RDBMS(Relational Database Management System)の + PostgreSQL (http://www.postgresql.org/)の最新版 6.3 で日本語 EUC + など、マルチバイト文字を扱うことを可能にするためのものです。このパッ + チをあてることにより、以下のことが可能になります。 + + 1.マルチバイト文字として、日本語、中国語などの各国の EUC、Unicode、 + mule internal code がコンパイル時に選択可能。データベースには + このコードのまま格納されます。 + 2.テーブル名にマルチバイト文字が使用可能(ただし、OS がマルチバイト + のファイル名を許していることが必要) + 3.カラム名にマルチバイト文字が使用可能 + 4.データそのものにもマルチバイト文字が使用可能 + 5.マルチバイト文字の正規表現検索が使用可能 + 6.マルチバイト文字の LIKE 検索が使用可能 + + (ただし、2,3,4 についてはパッチをあてなくても可能です。) + +postgresql-6.3 の入手方法: + postgresql-6.3.tar.gz は postgresql の日本での公式ミラーサイトで + ある ftp://ftp.jaist.ac.jp/pub/dbms/PostgreSQL/ から入手できます。 + 何らかの理由でここから入手できない場合は、 + ftp://ftp.sra.co.jp/pub/cmd/postgres/6.3/ も利用できます。 + なお、postgresql のオリジナル ftp サイトは ftp://ftp.postgresql.org + です。 + +このパッチの入手方法: + + ftp://ftp.sra.co.jp/pub/cmd/postgres/6.3/patches/6.3mbPL2.patch.gz + を入手して下さい。 + +パッチのあてかた: + 入手したパッチファイルを展開します。 + + % gunzip 6.3mbPL2.patch.gz + + postgresql-6.3 のソースを展開します。 + + % gtar xfz postgresql-6.3.tar.gz + + すると、postgresql-6.3 というディレクトリができるので、そこに + cd します。 + + % cd postgresql-6.3 + + パッチを当てます。 + + % patch -p1 < 6.3mbPL2.patch + + としてあててください。次に、src/Makefile.custom というファイルを作り、 + + MB=EUC_JP + + の 1 行を追加します。EUC_JP を含め、以下のコードが指定できます。 
+ + EUC_JP 日本語 EUC + EUC_CN GB をベースにした中文EUC。code set 2 は + SS2+2バイトコード = 3バイト表現です。 + EUC_KR 韓国語 EUC。 + EUC_TW 台湾の EUC。code set 2 は + SS2+面番号+2バイトコード = 4バイト表現です。 + UNICODE UTF-8。ただしサポートするのは UCS-2 の範囲、 + すなわち 0xffff までです。 + MULE_INTERNAL mule の内部コード。ただし、Type N の不定長文字は + サポートしていません。 + + 選択の目安としては、英語と日本語しか使わない場合は EUC_JP(同様に、中 + 国語しか使わない場合は EUC_CN... などとなります)、その他の言語も使いた + い場合は UNICODE もしくは MULE_INTERNAL となるでしょう。 + + 注意:MULE_INTERNAL を選ぶと、たくさんの文字集合に対応できて便利です + が、正規表現で複数の文字集合にまたがるような範囲指定(たとえば、[a-範] + とか、[abc範囲]のような)は使えません。複数の範囲指定で異なる文字集合 + を使うのは構いません(たとえば [abc][範-囲])。また、[^a] のような表現 + は、"a" の属する文字集合(この場合、US-ASCII)において "a" 以外である + ことを表します。決して漢字や平仮名など "a" 以外をすべて表すわけでは + ないことに注意して下さい。 + + インストールは普通に行ないます。インストールの詳細は INSTALL という + テキストファイルを御覧下さい。また、 + http://www.sra.co.jp/people/t-ishii/PostgreSQL/ でも簡単なインストー + ル方法を紹介しています。 + +謝辞: + o 各種文字セット、コード系について、日本語 PostgreSQL メーリングリスト + のメンバの方からアドバイスを頂きました。ここに感謝します。 + +改定履歴: + + 1998/3/10 PL2 をリリース + * EUC_JP, EUC_CN, MULE_INTERNAL の regression test を追加 + (EUC_CN のデータは he@sra.co.jp さん提供) + * regexp において、isalpha などに unsigend char 以外の値が + 渡らないようにガードをかける + * 英語のドキュメントを追加 + * MB を定義しない場合に発生するバグを修正 + + 1998/3/1 PL1 をリリース + +以上。 diff --git a/src/backend/regex/Makefile b/src/backend/regex/Makefile index f7ef534bd3..6f080cfcb5 100644 --- a/src/backend/regex/Makefile +++ b/src/backend/regex/Makefile @@ -4,7 +4,7 @@ # Makefile for regex # # IDENTIFICATION -# $Header: /cvsroot/pgsql/src/backend/regex/Makefile,v 1.4 1997/12/20 00:26:58 scrappy Exp $ +# $Header: /cvsroot/pgsql/src/backend/regex/Makefile,v 1.5 1998/03/15 07:38:14 scrappy Exp $ # #------------------------------------------------------------------------- @@ -17,6 +17,10 @@ CFLAGS+=$(INCLUDE_OPT) CFLAGS+=-DPOSIX_MISTAKE OBJS = regcomp.o regerror.o regexec.o regfree.o +ifdef MB +OBJS += utils.o wstrcmp.o wstrncmp.o +CFLAGS += -DMB=$(MB) +endif all: SUBSYS.o diff --git a/src/backend/regex/engine.c b/src/backend/regex/engine.c index 4801361f90..1964f2a024 100644 --- 
a/src/backend/regex/engine.c +++ b/src/backend/regex/engine.c @@ -73,11 +73,11 @@ struct match struct re_guts *g; int eflags; regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ - char *offp; /* offsets work from here */ - char *beginp; /* start of string -- virtual NUL precedes */ - char *endp; /* end of string -- virtual NUL here */ - char *coldp; /* can be no match starting before here */ - char **lastpos; /* [nplus+1] */ + pg_wchar *offp; /* offsets work from here */ + pg_wchar *beginp; /* start of string -- virtual NUL precedes */ + pg_wchar *endp; /* end of string -- virtual NUL here */ + pg_wchar *coldp; /* can be no match starting before here */ + pg_wchar **lastpos; /* [nplus+1] */ STATEVARS; states st; /* current states */ states fresh; /* states for a fresh start */ @@ -93,19 +93,19 @@ extern "C" /* === engine.c === */ static int - matcher(struct re_guts * g, char *string, size_t nmatch, + matcher(struct re_guts * g, pg_wchar *string, size_t nmatch, regmatch_t pmatch[], int eflags); - static char * - dissect(struct match * m, char *start, char *stop, + static pg_wchar * + dissect(struct match * m, pg_wchar *start, pg_wchar *stop, sopno startst, sopno stopst); - static char * - backref(struct match * m, char *start, char *stop, + static pg_wchar * + backref(struct match * m, pg_wchar *start, pg_wchar *stop, sopno startst, sopno stopst, sopno lev); - static char * - fast(struct match * m, char *start, char *stop, + static pg_wchar * + fast(struct match * m, pg_wchar *start, pg_wchar *stop, sopno startst, sopno stopst); - static char * - slow(struct match * m, char *start, char *stop, sopno startst, sopno stopst); + static pg_wchar * + slow(struct match * m, pg_wchar *start, pg_wchar *stop, sopno startst, sopno stopst); static states step(struct re_guts * g, sopno start, sopno stop, states bef, int ch, states aft); @@ -116,20 +116,35 @@ extern "C" #define BOW (BOL+4) #define EOW (BOL+5) #define CODEMAX (BOL+5) /* highest code used */ -#define NONCHAR(c) 
((c) > CHAR_MAX) -#define NNONCHAR (CODEMAX-CHAR_MAX) + +#ifdef MB +# if MB == MULE_INTERNAL +# define NONCHAR(c) ((c) > 16777216) /* 16777216 == 2^24 == 3 bytes */ +# define NNONCHAR (CODEMAX-16777216) +# elif MB == EUC_JP || MB == EUC_CN || MB == EUC_KR || MB == EUC_TW +# define NONCHAR(c) ((c) > USHRT_MAX) +# define NNONCHAR (CODEMAX-USHRT_MAX) +# elif MB == UNICODE +# define NONCHAR(c) ((c) > USHRT_MAX) +# define NNONCHAR (CODEMAX-USHRT_MAX) +# endif +#else +# define NONCHAR(c) ((c) > CHAR_MAX) +# define NNONCHAR (CODEMAX-CHAR_MAX) +#endif + #ifdef REDEBUG static void - print(struct match * m, char *caption, states st, int ch, FILE *d); + print(struct match * m, pg_wchar *caption, states st, int ch, FILE *d); #endif #ifdef REDEBUG static void - at(struct match * m, char *title, char *start, char *stop, + at(struct match * m, pg_wchar *title, pg_wchar *start, pg_wchar *stop, sopno startst, sopno stopst); #endif #ifdef REDEBUG - static char * - pchar(int ch); + static pg_wchar * + p_char(int ch); #endif #ifdef __cplusplus @@ -150,26 +165,26 @@ extern "C" /* - matcher - the actual matching engine - == static int matcher(struct re_guts *g, char *string, \ + == static int matcher(struct re_guts *g, pg_wchar *string, \ == size_t nmatch, regmatch_t pmatch[], int eflags); */ static int /* 0 success, REG_NOMATCH failure */ matcher(g, string, nmatch, pmatch, eflags) struct re_guts *g; -char *string; +pg_wchar *string; size_t nmatch; regmatch_t pmatch[]; int eflags; { - char *endp; + pg_wchar *endp; int i; struct match mv; struct match *m = &mv; - char *dp; + pg_wchar *dp; const sopno gf = g->firststate + 1; /* +1 for OEND */ const sopno gl = g->laststate; - char *start; - char *stop; + pg_wchar *start; + pg_wchar *stop; /* simplify the situation where possible */ if (g->cflags & REG_NOSUB) @@ -182,7 +197,11 @@ int eflags; else { start = string; +#ifdef MB + stop = start + pg_wchar_strlen(start); +#else stop = start + strlen(start); +#endif } if (stop < start) return 
(REG_INVARG); @@ -192,7 +211,11 @@ int eflags; { for (dp = start; dp < stop; dp++) if (*dp == g->must[0] && stop - dp >= g->mlen && +#ifdef MB + memcmp(dp, g->must, (size_t) (g->mlen * sizeof(pg_wchar))) == 0) +#else memcmp(dp, g->must, (size_t) g->mlen) == 0) +#endif break; if (dp == stop) /* we didn't find g->must */ return (REG_NOMATCH); @@ -258,8 +281,8 @@ int eflags; else { if (g->nplus > 0 && m->lastpos == NULL) - m->lastpos = (char **) malloc((g->nplus + 1) * - sizeof(char *)); + m->lastpos = (pg_wchar **) malloc((g->nplus + 1) * + sizeof(pg_wchar *)); if (g->nplus > 0 && m->lastpos == NULL) { free(m->pmatch); @@ -324,9 +347,9 @@ int eflags; } if (m->pmatch != NULL) - free((char *) m->pmatch); + free((pg_wchar *) m->pmatch); if (m->lastpos != NULL) - free((char *) m->lastpos); + free((pg_wchar *) m->lastpos); STATETEARDOWN(m); return (0); } @@ -336,27 +359,27 @@ int eflags; == static char *dissect(struct match *m, char *start, \ == char *stop, sopno startst, sopno stopst); */ -static char * /* == stop (success) always */ +static pg_wchar * /* == stop (success) always */ dissect(m, start, stop, startst, stopst) struct match *m; -char *start; -char *stop; +pg_wchar *start; +pg_wchar *stop; sopno startst; sopno stopst; { int i; sopno ss; /* start sop of current subRE */ sopno es; /* end sop of current subRE */ - char *sp; /* start of string matched by it */ - char *stp; /* string matched by it cannot pass here */ - char *rest; /* start of rest of string */ - char *tail; /* string unmatched by rest of RE */ + pg_wchar *sp; /* start of string matched by it */ + pg_wchar *stp; /* string matched by it cannot pass here */ + pg_wchar *rest; /* start of rest of string */ + pg_wchar *tail; /* string unmatched by rest of RE */ sopno ssub; /* start sop of subsubRE */ sopno esub; /* end sop of subsubRE */ - char *ssp; /* start of string matched by subsubRE */ - char *sep; /* end of string matched by subsubRE */ - char *oldssp; /* previous ssp */ - char *dp; + pg_wchar 
*ssp; /* start of string matched by subsubRE */ + pg_wchar *sep; /* end of string matched by subsubRE */ + pg_wchar *oldssp; /* previous ssp */ + pg_wchar *dp; AT("diss", start, stop, startst, stopst); sp = start; @@ -536,22 +559,22 @@ sopno stopst; == static char *backref(struct match *m, char *start, \ == char *stop, sopno startst, sopno stopst, sopno lev); */ -static char * /* == stop (success) or NULL (failure) */ +static pg_wchar * /* == stop (success) or NULL (failure) */ backref(m, start, stop, startst, stopst, lev) struct match *m; -char *start; -char *stop; +pg_wchar *start; +pg_wchar *stop; sopno startst; sopno stopst; sopno lev; /* PLUS nesting level */ { int i; sopno ss; /* start sop of current subRE */ - char *sp; /* start of string matched by it */ + pg_wchar *sp; /* start of string matched by it */ sopno ssub; /* start sop of subsubRE */ sopno esub; /* end sop of subsubRE */ - char *ssp; /* start of string matched by subsubRE */ - char *dp; + pg_wchar *ssp; /* start of string matched by subsubRE */ + pg_wchar *dp; size_t len; int hard; sop s; @@ -567,7 +590,7 @@ sopno lev; /* PLUS nesting level */ switch (OP(s = m->g->strip[ss])) { case OCHAR: - if (sp == stop || *sp++ != (char) OPND(s)) + if (sp == stop || *sp++ != (pg_wchar) OPND(s)) return (NULL); break; case OANY: @@ -750,23 +773,23 @@ sopno lev; /* PLUS nesting level */ == static char *fast(struct match *m, char *start, \ == char *stop, sopno startst, sopno stopst); */ -static char * /* where tentative match ended, or NULL */ +static pg_wchar * /* where tentative match ended, or NULL */ fast(m, start, stop, startst, stopst) struct match *m; -char *start; -char *stop; +pg_wchar *start; +pg_wchar *stop; sopno startst; sopno stopst; { states st = m->st; states fresh = m->fresh; states tmp = m->tmp; - char *p = start; + pg_wchar *p = start; int c = (start == m->beginp) ? 
OUT : *(start - 1); int lastc; /* previous c */ int flagch; int i; - char *coldp; /* last p after which no match was + pg_wchar *coldp; /* last p after which no match was * underway */ CLEAR(st); @@ -849,23 +872,23 @@ sopno stopst; == static char *slow(struct match *m, char *start, \ == char *stop, sopno startst, sopno stopst); */ -static char * /* where it ended */ +static pg_wchar * /* where it ended */ slow(m, start, stop, startst, stopst) struct match *m; -char *start; -char *stop; +pg_wchar *start; +pg_wchar *stop; sopno startst; sopno stopst; { states st = m->st; states empty = m->empty; states tmp = m->tmp; - char *p = start; + pg_wchar *p = start; int c = (start == m->beginp) ? OUT : *(start - 1); int lastc; /* previous c */ int flagch; int i; - char *matchp; /* last p at which a match ended */ + pg_wchar *matchp; /* last p at which a match ended */ AT("slow", start, stop, startst, stopst); CLEAR(st); @@ -978,8 +1001,8 @@ states aft; /* states already known reachable after */ break; case OCHAR: /* only characters can match */ - assert(!NONCHAR(ch) || ch != (char) OPND(s)); - if (ch == (char) OPND(s)) + assert(!NONCHAR(ch) || ch != (pg_wchar) OPND(s)); + if (ch == (pg_wchar) OPND(s)) FWD(aft, bef, 1); break; case OBOL: @@ -1082,7 +1105,7 @@ states aft; /* states already known reachable after */ static void print(m, caption, st, ch, d) struct match *m; -char *caption; +pg_wchar *caption; states st; int ch; FILE *d; @@ -1109,16 +1132,16 @@ FILE *d; /* - at - print current situation == #ifdef REDEBUG - == static void at(struct match *m, char *title, char *start, char *stop, \ + == static void at(struct match *m, pg_wchar *title, pg_wchar *start, pg_wchar *stop, \ == sopno startst, sopno stopst); == #endif */ static void at(m, title, start, stop, startst, stopst) struct match *m; -char *title; -char *start; -char *stop; +pg_wchar *title; +pg_wchar *start; +pg_wchar *stop; sopno startst; sopno stopst; { @@ -1143,13 +1166,24 @@ sopno stopst; * a matching debug.o, 
and this is convenient. It all disappears in * the non-debug compilation anyway, so it doesn't matter much. */ -static char * /* -> representation */ + + +static int pg_isprint(int c) +{ +#ifdef MB + return(c >= 0 && c <= UCHAR_MAX && isprint(c)); +#else + return(isprint(c)); +#endif +} + +static pg_wchar * /* -> representation */ pchar(ch) int ch; { - static char pbuf[10]; + static pg_wchar pbuf[10]; - if (isprint(ch) || ch == ' ') + if (pg_isprint(ch) || ch == ' ') sprintf(pbuf, "%c", ch); else sprintf(pbuf, "\\%o", ch); diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index e31f865404..6b7c472f1b 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -62,8 +62,8 @@ static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; */ struct parse { - char *next; /* next character in RE */ - char *end; /* end of string (-> NUL normally) */ + pg_wchar *next; /* next character in RE */ + pg_wchar *end; /* end of string (-> NUL normally) */ int error; /* has an error been seen? 
*/ sop *strip; /* malloced strip */ sopno ssize; /* malloced strip size (allocated) */ @@ -93,7 +93,7 @@ extern "C" static void p_b_term(struct parse * p, cset *cs); static void p_b_cclass(struct parse * p, cset *cs); static void p_b_eclass(struct parse * p, cset *cs); - static char p_b_symbol(struct parse * p); + static pg_wchar p_b_symbol(struct parse * p); static char p_b_coll_elem(struct parse * p, int endc); static char othercase(int ch); static void bothcases(struct parse * p, int ch); @@ -120,6 +120,10 @@ extern "C" static void stripsnug(struct parse * p, struct re_guts * g); static void findmust(struct parse * p, struct re_guts * g); static sopno pluscount(struct parse * p, struct re_guts * g); + static int pg_isdigit(int c); + static int pg_isalpha(int c); + static int pg_isupper(int c); + static int pg_islower(int c); #ifdef __cplusplus } @@ -127,7 +131,7 @@ extern "C" #endif /* ========= end header generated by ./mkh ========= */ -static char nuls[10]; /* place to point scanner in event of +static pg_wchar nuls[10]; /* place to point scanner in event of * error */ /* @@ -190,6 +194,9 @@ int cflags; struct parse *p = &pa; int i; size_t len; +#ifdef MB + pg_wchar *wcp; +#endif #ifdef REDEBUG #define GOODFLAGS(f) (f) @@ -203,12 +210,31 @@ int cflags; if (cflags & REG_PEND) { +#ifdef MB + wcp = preg->patsave; + if (preg->re_endp < wcp) + return (REG_INVARG); + len = preg->re_endp - wcp; +#else if (preg->re_endp < pattern) return (REG_INVARG); len = preg->re_endp - pattern; +#endif + } + else { +#ifdef MB + wcp = (pg_wchar *)malloc((strlen(pattern)+1) * sizeof(pg_wchar)); + if (wcp == NULL) { + return (REG_ESPACE); + } + preg->patsave = wcp; + (void)pg_mb2wchar((unsigned char *)pattern,wcp); + len = pg_wchar_strlen(wcp); +#else + + len = strlen((char *) pattern); +#endif } - else - len = strlen((char *) pattern); /* do the mallocs early so failure handling is easy */ g = (struct re_guts *) malloc(sizeof(struct re_guts) + @@ -227,7 +253,11 @@ int cflags; /* 
set things up */ p->g = g; - p->next = (char *) pattern; /* convenience; we do not modify it */ +#ifdef MB + p->next = wcp; +#else + p->next = pattern; /* convenience; we do not modify it */ +#endif p->end = p->next + len; p->error = 0; p->ncsalloc = 0; @@ -342,7 +372,7 @@ static void p_ere_exp(p) struct parse *p; { - char c; + pg_wchar c; sopno pos; int count; int count2; @@ -420,7 +450,7 @@ struct parse *p; break; case '{': /* okay as ordinary except if digit * follows */ - REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT); + REQUIRE(!MORE() || !pg_isdigit(PEEK()), REG_BADRPT); /* FALLTHROUGH */ default: ordinary(p, c); @@ -432,7 +462,7 @@ struct parse *p; c = PEEK(); /* we call { a repetition if followed by a digit */ if (!(c == '*' || c == '+' || c == '?' || - (c == '{' && MORE2() && isdigit(PEEK2())))) + (c == '{' && MORE2() && pg_isdigit(PEEK2())))) return; /* no repetition, we're done */ NEXT(); @@ -463,7 +493,7 @@ struct parse *p; count = p_count(p); if (EAT(',')) { - if (isdigit(PEEK())) + if (pg_isdigit(PEEK())) { count2 = p_count(p); REQUIRE(count <= count2, REG_BADBR); @@ -490,7 +520,7 @@ struct parse *p; return; c = PEEK(); if (!(c == '*' || c == '+' || c == '?' || - (c == '{' && MORE2() && isdigit(PEEK2())))) + (c == '{' && MORE2() && pg_isdigit(PEEK2())))) return; SETERROR(REG_BADRPT); } @@ -568,7 +598,7 @@ int starordinary; /* is a leading * an ordinary character? 
*/ int i; sopno subno; -#define BACKSL (1<', ':', ']', ']'}; +#endif /* Dept of Truly Sickening Special-Case Kludges */ +#ifdef MB + if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp1, 6) == 0) +#else if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) +#endif { EMIT(OBOW, 0); NEXTn(6); return; } +#ifdef MB + if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp2, 6) == 0) +#else if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) +#endif { EMIT(OEOW, 0); NEXTn(6); @@ -757,7 +803,7 @@ struct parse *p; int ci; for (i = p->g->csetsize - 1; i >= 0; i--) - if (CHIN(cs, i) && isalpha(i)) + if (CHIN(cs, i) && pg_isalpha(i)) { ci = othercase(i); if (ci != i) @@ -801,8 +847,8 @@ p_b_term(p, cs) struct parse *p; cset *cs; { - char c; - char start, + pg_wchar c; + pg_wchar start, finish; int i; @@ -857,6 +903,11 @@ cset *cs; finish = start; /* xxx what about signed chars here... */ REQUIRE(start <= finish, REG_ERANGE); +#ifdef MB + if (CHlc(start) != CHlc(finish)) { + SETERROR(REG_ERANGE); + } +#endif for (i = start; i <= finish; i++) CHadd(cs, i); break; @@ -872,17 +923,21 @@ p_b_cclass(p, cs) struct parse *p; cset *cs; { - char *sp = p->next; + pg_wchar *sp = p->next; struct cclass *cp; size_t len; char *u; char c; - while (MORE() && isalpha(PEEK())) + while (MORE() && pg_isalpha(PEEK())) NEXT(); len = p->next - sp; for (cp = cclasses; cp->name != NULL; cp++) +#ifdef MB + if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') +#else if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') +#endif break; if (cp->name == NULL) { @@ -919,11 +974,11 @@ cset *cs; - p_b_symbol - parse a character or [..]ed multicharacter collating symbol == static char p_b_symbol(struct parse *p); */ -static char /* value of symbol */ +static pg_wchar /* value of symbol */ p_b_symbol(p) struct parse *p; { - char value; + pg_wchar value; REQUIRE(MORE(), REG_EBRACK); if (!EATTWO('[', '.')) @@ -944,7 +999,7 @@ p_b_coll_elem(p, endc) 
struct parse *p; int endc; /* name ended by endc,']' */ { - char *sp = p->next; + pg_wchar *sp = p->next; struct cname *cp; int len; @@ -957,7 +1012,11 @@ int endc; /* name ended by endc,']' */ } len = p->next - sp; for (cp = cnames; cp->name != NULL; cp++) +#ifdef MB + if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') +#else if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') +#endif return (cp->code); /* known name */ if (len == 1) return (*sp); /* single character */ @@ -973,10 +1032,10 @@ static char /* if no counterpart, return ch */ othercase(ch) int ch; { - assert(isalpha(ch)); - if (isupper(ch)) + assert(pg_isalpha(ch)); + if (pg_isupper(ch)) return (tolower(ch)); - else if (islower(ch)) + else if (pg_islower(ch)) return (toupper(ch)); else /* peculiar, but could happen */ @@ -994,9 +1053,9 @@ bothcases(p, ch) struct parse *p; int ch; { - char *oldnext = p->next; - char *oldend = p->end; - char bracket[3]; + pg_wchar *oldnext = p->next; + pg_wchar *oldend = p->end; + pg_wchar bracket[3]; assert(othercase(ch) != ch);/* p_bracket() would recurse */ p->next = bracket; @@ -1021,12 +1080,16 @@ int ch; { cat_t *cap = p->g->categories; - if ((p->g->cflags & REG_ICASE) && isalpha(ch) && othercase(ch) != ch) + if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch) bothcases(p, ch); else { +#ifdef MB + EMIT(OCHAR, (pg_wchar) ch); +#else EMIT(OCHAR, (unsigned char) ch); - if (cap[ch] == 0) +#endif + if (ch >= CHAR_MIN && ch <= CHAR_MAX && cap[ch] == 0) cap[ch] = p->g->ncategories++; } } @@ -1041,9 +1104,9 @@ static void nonnewline(p) struct parse *p; { - char *oldnext = p->next; - char *oldend = p->end; - char bracket[4]; + pg_wchar *oldnext = p->next; + pg_wchar *oldend = p->end; + pg_wchar bracket[4]; p->next = bracket; p->end = bracket + 3; @@ -1674,7 +1737,7 @@ struct re_guts *g; sop *newstart = 0; sopno newlen; sop s; - char *cp; + pg_wchar *cp; sopno i; /* avoid making error situations worse */ @@ -1729,7 
+1792,11 @@ struct re_guts *g; return; /* turn it into a character string */ +#ifdef MB + g->must = (pg_wchar *)malloc((size_t) (g->mlen + 1)*sizeof(pg_wchar)); +#else g->must = malloc((size_t) g->mlen + 1); +#endif if (g->must == NULL) { /* argh; just forget it */ g->mlen = 0; @@ -1742,7 +1809,7 @@ struct re_guts *g; while (OP(s = *scan++) != OCHAR) continue; assert(cp < g->must + g->mlen); - *cp++ = (char) OPND(s); + *cp++ = (pg_wchar) OPND(s); } assert(cp == g->must + g->mlen); *cp++ = '\0'; /* just on general principles */ @@ -1785,3 +1852,42 @@ struct re_guts *g; g->iflags |= BAD; return (maxnest); } + +/* + * some ctype functions with none-ascii-char guard + */ +static int pg_isdigit(int c) +{ +#ifdef MB + return(c >= 0 && c <= UCHAR_MAX && isdigit(c)); +#else + return(isdigit(c)); +#endif +} + +static int pg_isalpha(int c) +{ +#ifdef MB + return(c >= 0 && c <= UCHAR_MAX && isalpha(c)); +#else + return(isalpha(c)); +#endif +} + +static int pg_isupper(int c) +{ +#ifdef MB + return(c >= 0 && c <= UCHAR_MAX && isupper(c)); +#else + return(isupper(c)); +#endif +} + +static int pg_islower(int c) +{ +#ifdef MB + return(c >= 0 && c <= UCHAR_MAX && islower(c)); +#else + return(islower(c)); +#endif +} diff --git a/src/backend/regex/regerror.c b/src/backend/regex/regerror.c index a8ba2443c5..abdc314a94 100644 --- a/src/backend/regex/regerror.c +++ b/src/backend/regex/regerror.c @@ -52,6 +52,7 @@ static char sccsid[] = "@(#)regerror.c 8.4 (Berkeley) 3/20/94"; #include #include +#include /* ========= begin header generated by ./mkh ========= */ #ifdef __cplusplus @@ -214,7 +215,11 @@ char *localbuf; struct rerr *r; for (r = rerrs; r->code != 0; r++) +#ifdef MB + if (pg_char_and_wchar_strcmp(r->name, preg->re_endp) == 0) +#else if (strcmp(r->name, preg->re_endp) == 0) +#endif break; if (r->code == 0) return ("0"); diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index e3ac5cd064..8a51810a20 100644 --- a/src/backend/regex/regexec.c +++ 
b/src/backend/regex/regexec.c @@ -164,6 +164,11 @@ int eflags; { struct re_guts *g = preg->re_g; +#ifdef MB + pg_wchar *str; + int sts; +#endif + #ifdef REDEBUG #define GOODFLAGS(f) (f) #else @@ -177,8 +182,24 @@ int eflags; return (REG_BADPAT); eflags = GOODFLAGS(eflags); +#ifdef MB + str = (pg_wchar *)malloc((strlen(string)+1) * sizeof(pg_wchar)); + if (!str) { + return(REG_ESPACE); + } + (void)pg_mb2wchar((unsigned char *)string,str); if (g->nstates <= CHAR_BIT * sizeof(states1) && !(eflags & REG_LARGE)) - return (smatcher(g, (char *) string, nmatch, pmatch, eflags)); + sts = smatcher(g, str, nmatch, pmatch, eflags); else - return (lmatcher(g, (char *) string, nmatch, pmatch, eflags)); + sts = lmatcher(g, str, nmatch, pmatch, eflags); + free((char *)str); + return(sts); + +# else + + if (g->nstates <= CHAR_BIT * sizeof(states1) && !(eflags & REG_LARGE)) + return (smatcher(g, (pg_wchar *) string, nmatch, pmatch, eflags)); + else + return (lmatcher(g, (pg_wchar *) string, nmatch, pmatch, eflags)); +#endif } diff --git a/src/backend/regex/regfree.c b/src/backend/regex/regfree.c index e53fe54e86..b169c84041 100644 --- a/src/backend/regex/regfree.c +++ b/src/backend/regex/regfree.c @@ -68,7 +68,11 @@ regex_t *preg; return; preg->re_magic = 0; /* mark it invalid */ g->magic = 0; /* mark it invalid */ - +#ifdef MB + if (preg->patsave != NULL) { + free((char *)preg->patsave); + } +#endif if (g->strip != NULL) free((char *) g->strip); if (g->sets != NULL) diff --git a/src/backend/regex/utftest.c b/src/backend/regex/utftest.c new file mode 100644 index 0000000000..28baf7255e --- /dev/null +++ b/src/backend/regex/utftest.c @@ -0,0 +1,33 @@ +/* + * testing of utf2wchar() + * $Id: utftest.c,v 1.1 1998/03/15 07:38:37 scrappy Exp $ + */ +#include +#include +#include + +#include + +main() +{ + /* Example 1 from RFC2044 */ + char utf1[] = {0x41,0xe2,0x89,0xa2,0xce,0x91,0x2e,0}; + /* Example 2 from RFC2044 */ + char utf2[] = 
{0x48,0x69,0x20,0x4d,0x6f,0x6d,0x20,0xe2,0x98,0xba,0x21,0}; + /* Example 3 from RFC2044 */ + char utf3[] = {0xe6,0x97,0xa5,0xe6,0x9c,0xac,0xe8,0xaa,0x9e,0}; + char *utf[] = {utf1,utf2,utf3}; + pg_wchar ucs[128]; + pg_wchar *p; + int i; + + for (i=0;i +/* + * convert EUC to pg_wchar (EUC process code) + * caller should allocate enough space for "to" + */ +static void pg_euc2wchar(const unsigned char *from, pg_wchar *to) +{ + while (*from) { + if (*from == SS2) { + from++; + *to = *from++; + } else if (*from == SS3) { + from++; + *to = *from++ << 8; + *to |= 0x3f & *from++; + } else if (*from & 0x80) { + *to = *from++ << 8; + *to |= *from++; + } else { + *to = *from++; + } + to++; + } + *to = 0; +} + +static void pg_eucjp2wchar(const unsigned char *from, pg_wchar *to) +{ + pg_euc2wchar(from,to); +} + +static void pg_euckr2wchar(const unsigned char *from, pg_wchar *to) +{ + pg_euc2wchar(from,to); +} + +static void pg_eucch2wchar(const unsigned char *from, pg_wchar *to) +{ + while (*from) { + if (*from == SS2) { + from++; + *to = 0x3f00 & (*from++ << 8); + *to |= *from++; /* OR in the low byte; a plain '=' discarded the high byte stored just above */ + } else if (*from == SS3) { + from++; + *to = *from++ << 8; + *to |= 0x3f & *from++; + } else if (*from & 0x80) { + *to = *from++ << 8; + *to |= *from++; + } else { + *to = *from++; + } + to++; + } + *to = 0; +} + +static void pg_euccn2wchar(const unsigned char *from, pg_wchar *to) +{ + while (*from) { + if (*from == SS2) { + from++; + *to = *from++ << 16; + *to |= *from++ << 8; + *to |= *from++; + } else if (*from == SS3) { + from++; + *to = *from++ << 8; + *to |= 0x3f & *from++; + } else if (*from & 0x80) { + *to = *from++ << 8; + *to |= *from++; + } else { + *to = *from++; + } + to++; + } + *to = 0; +} + +/* + * convert UTF-8 to pg_wchar (UCS-2) + * caller should allocate enough space for "to" + */ +static void pg_utf2wchar(const unsigned char *from, pg_wchar *to) +{ + unsigned char c1,c2,c3; + while (*from) { + if ((*from & 0xc0) != 0xc0) { /* ASCII, or a stray continuation byte (0x80-0xbf) copied as-is so the loop always advances */ + *to = *from++; + } else if ((*from & 0xe0) == 0xc0) { + c1 
= *from++ & 0x1f; + c2 = *from++ & 0x3f; + *to = c1 << 6; + *to |= c2; + } else if ((*from & 0xe0) == 0xe0) { + c1 = *from++ & 0x0f; + c2 = *from++ & 0x3f; + c3 = *from++ & 0x3f; + *to = c1 << 12; + *to |= c2 << 6; + *to |= c3; + } + to++; + } + *to = 0; +} + +/* + * convert mule internal code to pg_wchar. + * in this case pg_wchar consists of following 4 bytes: + * + * 0x00(unused) + * 0x00(ASCII)|leading character (one of LC1, LC12, LC2 or LC22) + * 0x00(ASCII,1 byte code)|other than 0x00(2 byte code) + * the lowest byte of the code + * + * note that Type N (variable length byte encoding) cannot be represented by + * this schema. sorry. + * caller should allocate enough space for "to" + */ +static void pg_mule2wchar(const unsigned char *from, pg_wchar *to) +{ + while (*from) { + if (IS_LC1(*from)) { + *to = *from++ << 16; + *to |= *from++; + } else if (IS_LCPRV1(*from)) { + from++; + *to = *from++ << 16; + *to |= *from++; + } else if (IS_LC2(*from)) { + *to = *from++ << 16; + *to |= *from++ << 8; + *to |= *from++; + } else if (IS_LCPRV2(*from)) { + from++; + *to = *from++ << 16; + *to |= *from++ << 8; + *to |= *from++; + } else { /* assume ASCII */ + *to = *from++; + } + to++; + } + *to = 0; +} + +/* + * convert EUC to pg_wchar (EUC process code) + * caller should allocate enough space for "to" + * len: length of from. + * "from" not necessarily null terminated. 
+ */ +static void pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) +{ + while (*from && len > 0) { + if (*from == SS2) { + from++; + len--; + *to = 0xff & *from++; + len--; + } else if (*from == SS3) { + from++; + *to = *from++ << 8; + *to |= 0x3f & *from++; + len -= 3; + } else if (*from & 0x80) { + *to = *from++ << 8; + *to |= *from++; + len -= 2; + } else { + *to = *from++; + len--; + } + to++; + } + *to = 0; +} + +static void pg_eucjp2wchar_with_len +(const unsigned char *from, pg_wchar *to, int len) +{ + pg_euc2wchar_with_len(from,to,len); +} + +static void pg_euckr2wchar_with_len +(const unsigned char *from, pg_wchar *to, int len) +{ + pg_euc2wchar_with_len(from,to,len); +} + +static void pg_eucch2wchar_with_len +(const unsigned char *from, pg_wchar *to, int len) +{ + while (*from && len > 0) { + if (*from == SS2) { + from++; + len--; + *to = 0x3f00 & (*from++ << 8); + *to = *from++; + len -= 2; + } else if (*from == SS3) { + from++; + *to = *from++ << 8; + *to |= 0x3f & *from++; + len -= 3; + } else if (*from & 0x80) { + *to = *from++ << 8; + *to |= *from++; + len -= 2; + } else { + *to = *from++; + len--; + } + to++; + } + *to = 0; +} + +static void pg_euccn2wchar_with_len +(const unsigned char *from, pg_wchar *to, int len) +{ + while (*from && len > 0) { + if (*from == SS2) { + from++; + len--; + *to = *from++ << 16; + *to |= *from++ << 8; + *to |= *from++; + len -= 3; + } else if (*from == SS3) { + from++; + *to = *from++ << 8; + *to |= 0x3f & *from++; + len -= 3; + } else if (*from & 0x80) { + *to = *from++ << 8; + *to |= *from++; + len -= 2; + } else { + *to = *from++; + len--; + } + to++; + } + *to = 0; +} + +/* + * convert UTF-8 to pg_wchar (UCS-2) + * caller should allocate enough space for "to" + * len: length of from. + * "from" not necessarily null terminated. 
+ */ +static void pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) +{ + unsigned char c1,c2,c3; + while (*from && len > 0) { + if ((*from & 0x80) == 0) { + *to = *from++; + len--; + } else if ((*from & 0xe0) == 0xc0) { + c1 = *from++ & 0x1f; + c2 = *from++ & 0x3f; + len -= 2; + *to = c1 << 6; + *to |= c2; + } else if ((*from & 0xe0) == 0xe0) { + c1 = *from++ & 0x0f; + c2 = *from++ & 0x3f; + c3 = *from++ & 0x3f; + len -= 3; + *to = c1 << 12; + *to |= c2 << 6; + *to |= c3; + } + to++; + } + *to = 0; +} + +/* + * convert mule internal code to pg_wchar + * caller should allocate enough space for "to" + * len: length of from. + * "from" not necessarily null terminated. + */ +static void pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) +{ + while (*from && len > 0) { + if (IS_LC1(*from)) { + *to = *from++ << 16; + *to |= *from++; + len -= 2; + } else if (IS_LCPRV1(*from)) { + from++; + *to = *from++ << 16; + *to |= *from++; + len -= 3; + } else if (IS_LC2(*from)) { + *to = *from++ << 16; + *to |= *from++ << 8; + *to |= *from++; + len -= 3; + } else if (IS_LCPRV2(*from)) { + from++; + *to = *from++ << 16; + *to |= *from++ << 8; + *to |= *from++; + len -= 4; + } else { /* assume ASCII */ + *to = (unsigned char)*from++; + len--; + } + to++; + } + *to = 0; +} + +typedef struct { + void (*mb2wchar)(); + void (*mb2wchar_with_len)(); +} pg_wchar_tbl; + +static pg_wchar_tbl pg_wchar_table[] = { + {pg_eucjp2wchar, pg_eucjp2wchar_with_len}, + {pg_eucch2wchar, pg_eucch2wchar_with_len}, + {pg_euckr2wchar, pg_euckr2wchar_with_len}, + {pg_euccn2wchar, pg_euccn2wchar_with_len}, + {pg_utf2wchar, pg_utf2wchar_with_len}, + {pg_mule2wchar, pg_mule2wchar_with_len}}; + +void pg_mb2wchar(const unsigned char *from, pg_wchar *to) +{ + (*pg_wchar_table[MB].mb2wchar)(from,to); +} + +void pg_mb2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) +{ + (*pg_wchar_table[MB].mb2wchar_with_len)(from,to,len); +} diff --git 
a/src/backend/regex/wstrcmp.c b/src/backend/regex/wstrcmp.c new file mode 100644 index 0000000000..b562f10315 --- /dev/null +++ b/src/backend/regex/wstrcmp.c @@ -0,0 +1,48 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Chris Torek. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +int +pg_char_and_wchar_strcmp(s1, s2) + register const char *s1; + register const pg_wchar *s2; +{ + while ((pg_wchar)*s1 == *s2++) + if (*s1++ == 0) + return (0); + return (*(const unsigned char *)s1 - *(const pg_wchar *)(s2 - 1)); +} diff --git a/src/backend/regex/wstrncmp.c b/src/backend/regex/wstrncmp.c new file mode 100644 index 0000000000..e7ce52ed7b --- /dev/null +++ b/src/backend/regex/wstrncmp.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from FreeBSD 2.2.1-RELEASE software. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include + +int +pg_wchar_strncmp(s1, s2, n) + register const pg_wchar *s1, *s2; + register size_t n; +{ + + if (n == 0) + return (0); + do { + if (*s1 != *s2++) + return (*(const pg_wchar *)s1 - + *(const pg_wchar *)(s2 - 1)); + if (*s1++ == 0) + break; + } while (--n != 0); + return (0); +} + +int +pg_char_and_wchar_strncmp(s1, s2, n) + register const char *s1; + register const pg_wchar *s2; + register size_t n; +{ + + if (n == 0) + return (0); + do { + if ((pg_wchar )*s1 != *s2++) + return (*(const pg_wchar *)s1 - + *(const pg_wchar *)(s2 - 1)); + if (*s1++ == 0) + break; + } while (--n != 0); + return (0); +} + +size_t +pg_wchar_strlen(str) + const pg_wchar *str; +{ + register const pg_wchar *s; + + for (s = str; *s; ++s); + return(s - str); +} diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index fe7b5dd1d3..75cc755f7c 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -4,7 +4,7 @@ # Makefile for utils/adt # # IDENTIFICATION -# 
$Header: /cvsroot/pgsql/src/backend/utils/adt/Makefile,v 1.10 1997/12/20 00:28:21 scrappy Exp $ +# $Header: /cvsroot/pgsql/src/backend/utils/adt/Makefile,v 1.11 1998/03/15 07:38:42 scrappy Exp $ # #------------------------------------------------------------------------- @@ -14,6 +14,9 @@ include ../../../Makefile.global INCLUDE_OPT = -I../.. CFLAGS+=$(INCLUDE_OPT) +ifdef MB +CFLAGS+=-DMB=$(MB) +endif OBJS = acl.o arrayfuncs.o arrayutils.o bool.o cash.o char.o chunk.o date.o \ datum.o dt.o filename.o float.o geo_ops.o geo_selfuncs.o int.o \ diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index 7d4681262d..27d6ffc014 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -21,8 +21,9 @@ #include "postgres.h" /* postgres system include file */ #include "utils/palloc.h" #include "utils/builtins.h" /* where the function declarations go */ +#include "regex/pg_wchar.h" -static int like(char *text, char *p); +static int like(pg_wchar *text, pg_wchar *p); /* * interface routines called by the function manager @@ -39,16 +40,22 @@ static int like(char *text, char *p); static bool fixedlen_like(char *s, struct varlena * p, int charlen) { - char *sterm, + pg_wchar *sterm, *pterm; int result; + int len; if (!s || !p) return FALSE; /* be sure sterm is null-terminated */ +#ifdef MB + sterm = (pg_wchar *) palloc((charlen + 1)*sizeof(pg_wchar)); + (void)pg_mb2wchar_with_len((unsigned char *)s,sterm,charlen); +#else sterm = (char *) palloc(charlen + 1); StrNCpy(sterm, s, charlen + 1); +#endif /* * p is a text = varlena, not a string so we have to make a string @@ -56,9 +63,15 @@ fixedlen_like(char *s, struct varlena * p, int charlen) */ /* palloc the length of the text + the null character */ - pterm = (char *) palloc(VARSIZE(p) - VARHDRSZ + 1); - memmove(pterm, VARDATA(p), VARSIZE(p) - VARHDRSZ); - *(pterm + VARSIZE(p) - VARHDRSZ) = (char) NULL; + len = VARSIZE(p) - VARHDRSZ; +#ifdef MB + pterm = (pg_wchar *) palloc((len + 
1)*sizeof(pg_wchar)); + (void)pg_mb2wchar_with_len((unsigned char *)VARDATA(p),pterm,len); +#else + pterm = (char *) palloc(len + 1); + memmove(pterm, VARDATA(p), len); + *(pterm + len) = (char) NULL; +#endif /* do the regexp matching */ result = like(sterm, pterm); @@ -150,7 +163,7 @@ textnlike(struct varlena * s, struct varlena * p) } -/* $Revision: 1.12 $ +/* $Revision: 1.13 $ ** "like.c" A first attempt at a LIKE operator for Postgres95. ** ** Originally written by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986. @@ -185,7 +198,7 @@ textnlike(struct varlena * s, struct varlena * p) ** Match text and p, return LIKE_TRUE, LIKE_FALSE, or LIKE_ABORT. */ static int -DoMatch(char *text, char *p) +DoMatch(pg_wchar *text, pg_wchar *p) { int matched; @@ -228,7 +241,7 @@ DoMatch(char *text, char *p) ** User-level routine. Returns TRUE or FALSE. */ static int -like(char *text, char *p) +like(pg_wchar *text, pg_wchar *p) { if (p[0] == '%' && p[1] == '\0') return TRUE; diff --git a/src/configure b/src/configure index 3ef138e382..c9ed79c8f5 100755 --- a/src/configure +++ b/src/configure @@ -825,9 +825,9 @@ echo "configure:825: checking setting USE_TCL" >&5 # Check whether --with-tcl or --without-tcl was given. if test "${with_tcl+set}" = set; then withval="$with_tcl" - USE_TCL=true echo "$ac_t""enabled" 1>&6 + USE_TCL=true; echo "$ac_t""enabled" 1>&6 else - USE_TCL=false echo "$ac_t""disabled" 1>&6 + USE_TCL=false; echo "$ac_t""disabled" 1>&6 fi @@ -839,9 +839,9 @@ echo "configure:839: checking setting USE_PERL" >&5 # Check whether --with-perl or --without-perl was given. 
if test "${with_perl+set}" = set; then withval="$with_perl" - USE_PERL=true echo "$ac_t""enabled" 1>&6 + USE_PERL=true; echo "$ac_t""enabled" 1>&6 else - USE_PERL=false echo "$ac_t""disabled" 1>&6 + USE_PERL=false; echo "$ac_t""disabled" 1>&6 fi diff --git a/src/configure.in b/src/configure.in index 57b6cfdfee..cdca6b643e 100644 --- a/src/configure.in +++ b/src/configure.in @@ -239,8 +239,8 @@ AC_MSG_CHECKING(setting USE_TCL) AC_ARG_WITH( tcl, [ --with-tcl use tcl ], - USE_TCL=true AC_MSG_RESULT(enabled), - USE_TCL=false AC_MSG_RESULT(disabled) + USE_TCL=true; AC_MSG_RESULT(enabled), + USE_TCL=false; AC_MSG_RESULT(disabled) ) export USE_TCL USE_X=$USE_TCL @@ -250,8 +250,8 @@ AC_MSG_CHECKING(setting USE_PERL) AC_ARG_WITH( perl, [ --with-perl use perl ], - USE_PERL=true AC_MSG_RESULT(enabled), - USE_PERL=false AC_MSG_RESULT(disabled) + USE_PERL=true; AC_MSG_RESULT(enabled), + USE_PERL=false; AC_MSG_RESULT(disabled) ) export USE_PERL diff --git a/src/include/regex/pg_wchar.h b/src/include/regex/pg_wchar.h new file mode 100644 index 0000000000..616f76cfec --- /dev/null +++ b/src/include/regex/pg_wchar.h @@ -0,0 +1,44 @@ +/* $Id: pg_wchar.h,v 1.1 1998/03/15 07:38:47 scrappy Exp $ */ + +#ifndef PG_WCHAR_H +#define PG_WCHAR_H + +#include + +#define EUC_JP 0 /* EUC for Japanese */ +#define EUC_CN 1 /* EUC for Chinese */ +#define EUC_KR 2 /* EUC for Korean */ +#define EUC_TW 3 /* EUC for Taiwan */ +#define UNICODE 4 /* Unicode UTF-8 */ +#define MULE_INTERNAL 5 /* Mule internal code */ + +#ifdef MB +typedef unsigned int pg_wchar; +#else +#define pg_wchar char +#endif + +/* + * various definitions for EUC + */ +#define SS2 0x8e /* single shift 2 */ +#define SS3 0x8f /* single shift 3 */ + +/* + * various definitions for mule internal code + */ +#define IS_LC1(c) ((unsigned char)(c) >= 0x81 && (unsigned char)(c) <= 0x8f) +#define IS_LCPRV1(c) ((unsigned char)(c) == 0x9a || (unsigned char)(c) == 0x9b) +#define IS_LC2(c) ((unsigned char)(c) >= 0x90 && (unsigned char)(c) <= 0x99) 
+#define IS_LCPRV2(c) ((unsigned char)(c) == 0x9c || (unsigned char)(c) == 0x9d) + +#ifdef MB +extern void pg_mb2wchar(const unsigned char *, pg_wchar *); +extern void pg_mb2wchar_with_len(const unsigned char *, pg_wchar *, int); +extern int pg_char_and_wchar_strcmp(const char *, const pg_wchar *); +extern int pg_wchar_strncmp(const pg_wchar *, const pg_wchar *, size_t); +extern int pg_char_and_wchar_strncmp(const char *, const pg_wchar *, size_t); +extern size_t pg_wchar_strlen(const pg_wchar *); +#endif + +#endif diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h index cd9efbceb7..f0c9876fe0 100644 --- a/src/include/regex/regex.h +++ b/src/include/regex/regex.h @@ -41,6 +41,7 @@ #define _REGEX_H_ #include +#include /* types */ typedef off_t regoff_t; @@ -49,8 +50,12 @@ typedef struct { int re_magic; size_t re_nsub; /* number of parenthesized subexpressions */ - const char *re_endp; /* end pointer for REG_PEND */ + const pg_wchar *re_endp; /* end pointer for REG_PEND */ struct re_guts *re_g; /* none of your business :-) */ +#ifdef MB + pg_wchar *patsave; /* mee too :-) */ +#endif + } regex_t; typedef struct diff --git a/src/include/regex/regex2.h b/src/include/regex/regex2.h index 564c626c5b..01cdadff45 100644 --- a/src/include/regex/regex2.h +++ b/src/include/regex/regex2.h @@ -127,12 +127,29 @@ typedef struct { uch *ptr; /* -> uch [csetsize] */ uch mask; /* bit within array */ - uch hash; /* hash code */ +#ifdef MB + pg_wchar hash; /* hash code */ + unsigned int lc; /* leading character (character-set) */ +#else + uch hash; /* hash code */ +#endif size_t smultis; char *multis; /* -> char[smulti] ab\0cd\0ef\0\0 */ } cset; /* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */ +#ifdef MB +#define CHlc(c) (((unsigned)(c)&0xff0000)>>16) +#define CHadd(cs, c) ((cs)->ptr[(unsigned)(c)&0xffff] |= (cs)->mask, (cs)->hash += (unsigned)(c)&0xffff,\ + (cs)->lc = CHlc(c)) +#define CHsub(cs, c) ((cs)->ptr[(unsigned)(c)&0xffff] &= ~(cs)->mask, 
(cs)->hash -= (unsigned)(c)&0xffff) +#define CHIN(cs, c) ((cs)->ptr[(unsigned)(c)&0xffff] & (cs)->mask && \ + ((cs)->lc == CHlc(c))) +#define MCadd(p, cs, cp) mcadd(p, cs, cp) /* regcomp() internal + * fns */ +#define MCsub(p, cs, cp) mcsub(p, cs, cp) +#define MCin(p, cs, cp) mcin(p, cs, cp) +#else #define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c)) #define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c)) #define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask) @@ -140,6 +157,7 @@ typedef struct * fns */ #define MCsub(p, cs, cp) mcsub(p, cs, cp) #define MCin(p, cs, cp) mcin(p, cs, cp) +#endif /* stuff for character categories */ typedef unsigned char cat_t; @@ -168,7 +186,7 @@ struct re_guts int neol; /* number of $ used */ int ncategories; /* how many character categories */ cat_t *categories; /* ->catspace[-CHAR_MIN] */ - char *must; /* match must contain this string */ + pg_wchar *must; /* match must contain this string */ int mlen; /* length of must */ size_t nsub; /* copy of re_nsub */ int backrefs; /* does it use back references? */ @@ -178,5 +196,21 @@ struct re_guts }; /* misc utilities */ -#define OUT (CHAR_MAX+1) /* a non-character value */ -#define ISWORD(c) (isalnum(c) || (c) == '_') +#ifdef MB +# if MB == MULE_INTERNAL +# define OUT (16777216+1) /* 16777216 == 2^24 == 3 bytes */ +# elif MB == EUC_JP || MB == EUC_CN || MB == EUC_KR || MB == EUC_TW +# define OUT (USHRT_MAX+1) /* 2 bytes */ +# elif MB == UNICODE +# define OUT (USHRT_MAX+1) /* 2 bytes. 
assuming UCS-2 */ +# endif +#else +# define OUT (CHAR_MAX+1) /* a non-character value */ +#endif + +#ifdef MB +#define ISWORD(c) ((c >= 0 && c <= UCHAR_MAX) && \ + (isalnum(c) || (c) == '_')) +#else +#define ISWORD(c) (isalnum(c) || (c) == '_') +#endif diff --git a/src/include/regex/utils.h b/src/include/regex/utils.h index a7cae06919..6f02759aa1 100644 --- a/src/include/regex/utils.h +++ b/src/include/regex/utils.h @@ -42,7 +42,12 @@ /* utility definitions */ #define DUPMAX 100000000 /* xxx is this right? */ #define INFINITY (DUPMAX + 1) + +#ifdef MB +#define NC (SHRT_MAX - SHRT_MIN + 1) +#else #define NC (CHAR_MAX - CHAR_MIN + 1) +#endif typedef unsigned char uch; /* switch off assertions (if not already off) if no REDEBUG */ diff --git a/src/test/regress/GNUmakefile b/src/test/regress/GNUmakefile index 90080e298b..0ac0021541 100644 --- a/src/test/regress/GNUmakefile +++ b/src/test/regress/GNUmakefile @@ -7,7 +7,7 @@ # # # IDENTIFICATION -# $Header: /cvsroot/pgsql/src/test/regress/GNUmakefile,v 1.14 1998/01/17 23:39:22 scrappy Exp $ +# $Header: /cvsroot/pgsql/src/test/regress/GNUmakefile,v 1.15 1998/03/15 07:39:01 scrappy Exp $ # #------------------------------------------------------------------------- @@ -50,6 +50,7 @@ all: $(INFILES) # run the test # runtest: $(INFILES) + MB=$(MB);export MB; \ $(SHELL) ./regress.sh 2>&1 | tee regress.out @echo "ACTUAL RESULTS OF REGRESSION TEST ARE NOW IN FILE regress.out" diff --git a/src/test/regress/regress.sh b/src/test/regress/regress.sh index d27d2a1e90..f12b431e31 100755 --- a/src/test/regress/regress.sh +++ b/src/test/regress/regress.sh @@ -1,5 +1,5 @@ #!/bin/sh -# $Header: /cvsroot/pgsql/src/test/regress/Attic/regress.sh,v 1.17 1998/02/25 15:02:18 scrappy Exp $ +# $Header: /cvsroot/pgsql/src/test/regress/Attic/regress.sh,v 1.18 1998/03/15 07:39:04 scrappy Exp $ # if echo '\c' | grep -s c >/dev/null 2>&1 then @@ -42,7 +42,12 @@ fi echo "=============== running regression queries... 
=================" echo "" > regression.diffs -for i in `cat sql/tests` +if [ a$MB != a ];then + mbtests=`echo $MB|tr A-Z a-z` +else + mbtests="" +fi +for i in `cat sql/tests` $mbtests do $ECHO_N "${i} .. " $ECHO_C $FRONTEND regression < sql/${i}.sql > results/${i}.out 2>&1 -- GitLab