From 7351b5fa1781c3942d2d5ff2116d2d0ba882bd42 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 25 Aug 2007 00:03:59 +0000 Subject: [PATCH] Cleanup for some problems in tsearch patch: - ispell initialization crashed on empty dictionary file - ispell initialization crashed on affix file with prefixes but no suffixes - stop words file was run through pg_verify_mbstr, with database encoding, but it's supposed to be UTF-8; similar bug for synonym files - bunch of comments added, typos fixed, and other cleanup Introduced consistent encoding checking/conversion of data read from tsearch configuration files, by doing this in a single t_readline() subroutine (replacing direct usages of fgets). Cleaned up API for readstopwords too. Heikki Linnakangas --- src/backend/snowball/dict_snowball.c | 6 +- src/backend/tsearch/dict_ispell.c | 6 +- src/backend/tsearch/dict_simple.c | 19 +- src/backend/tsearch/dict_synonym.c | 84 +++++---- src/backend/tsearch/dict_thesaurus.c | 30 +-- src/backend/tsearch/spell.c | 263 +++++++++++++-------------- src/backend/tsearch/ts_locale.c | 50 +++-- src/backend/tsearch/ts_parse.c | 26 +-- src/backend/tsearch/ts_utils.c | 71 ++++---- src/backend/tsearch/wparser.c | 10 +- src/include/tsearch/dicts/spell.h | 40 ++-- src/include/tsearch/ts_locale.h | 12 +- src/include/tsearch/ts_public.h | 14 +- src/include/tsearch/ts_utils.h | 38 +--- 14 files changed, 341 insertions(+), 328 deletions(-) diff --git a/src/backend/snowball/dict_snowball.c b/src/backend/snowball/dict_snowball.c index 03f2dd928c..57aac234ed 100644 --- a/src/backend/snowball/dict_snowball.c +++ b/src/backend/snowball/dict_snowball.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/snowball/dict_snowball.c,v 1.2 2007/08/22 01:39:44 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/snowball/dict_snowball.c,v 1.3 2007/08/25 00:03:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -192,7 +192,6 @@ dsnowball_init(PG_FUNCTION_ARGS) ListCell *l; d = (DictSnowball *) palloc0(sizeof(DictSnowball)); - d->stoplist.wordop = recode_and_lowerstr; foreach(l, dictoptions) { @@ -204,8 +203,7 @@ dsnowball_init(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple StopWords parameters"))); - readstoplist(defGetString(defel), &d->stoplist); - sortstoplist(&d->stoplist); + readstoplist(defGetString(defel), &d->stoplist, lowerstr); stoploaded = true; } else if (pg_strcasecmp("Language", defel->defname) == 0) diff --git a/src/backend/tsearch/dict_ispell.c b/src/backend/tsearch/dict_ispell.c index 802a645087..d7fe3cc465 100644 --- a/src/backend/tsearch/dict_ispell.c +++ b/src/backend/tsearch/dict_ispell.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/dict_ispell.c,v 1.2 2007/08/22 01:39:44 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/dict_ispell.c,v 1.3 2007/08/25 00:03:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -39,7 +39,6 @@ dispell_init(PG_FUNCTION_ARGS) ListCell *l; d = (DictISpell *) palloc0(sizeof(DictISpell)); - d->stoplist.wordop = recode_and_lowerstr; foreach(l, dictoptions) { @@ -73,8 +72,7 @@ dispell_init(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple StopWords parameters"))); - readstoplist(defGetString(defel), &(d->stoplist)); - sortstoplist(&(d->stoplist)); + readstoplist(defGetString(defel), &(d->stoplist), lowerstr); stoploaded = true; } else diff --git a/src/backend/tsearch/dict_simple.c b/src/backend/tsearch/dict_simple.c index fcc08ea180..aea2c0963b 100644 --- a/src/backend/tsearch/dict_simple.c +++ b/src/backend/tsearch/dict_simple.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.2 2007/08/22 01:39:44 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.3 2007/08/25 00:03:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -23,19 +23,17 @@ typedef struct { StopList stoplist; -} DictExample; +} DictSimple; Datum dsimple_init(PG_FUNCTION_ARGS) { List *dictoptions = (List *) PG_GETARG_POINTER(0); - DictExample *d = (DictExample *) palloc0(sizeof(DictExample)); + DictSimple *d = (DictSimple *) palloc0(sizeof(DictSimple)); bool stoploaded = false; ListCell *l; - d->stoplist.wordop = recode_and_lowerstr; - foreach(l, dictoptions) { DefElem *defel = (DefElem *) lfirst(l); @@ -46,8 +44,7 @@ dsimple_init(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("multiple StopWords parameters"))); - readstoplist(defGetString(defel), &d->stoplist); - sortstoplist(&d->stoplist); + readstoplist(defGetString(defel), &d->stoplist, lowerstr); stoploaded = true; } else @@ -65,16 +62,16 @@ dsimple_init(PG_FUNCTION_ARGS) Datum dsimple_lexize(PG_FUNCTION_ARGS) { - DictExample *d = (DictExample *) PG_GETARG_POINTER(0); + DictSimple *d = (DictSimple *) PG_GETARG_POINTER(0); char *in = (char *) PG_GETARG_POINTER(1); int32 len = PG_GETARG_INT32(2); - char *txt = lowerstr_with_len(in, len); + char *txt; TSLexeme *res = palloc0(sizeof(TSLexeme) * 2); + txt = lowerstr_with_len(in, len); + if (*txt == '\0' || searchstoplist(&(d->stoplist), txt)) - { pfree(txt); - } else res[0].lexeme = txt; diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c index c5bd197e92..1c0fd95413 100644 --- a/src/backend/tsearch/dict_synonym.c +++ b/src/backend/tsearch/dict_synonym.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.2 2007/08/22 04:13:15 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.3 2007/08/25 00:03:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -20,9 +20,6 @@ #include "tsearch/ts_utils.h" #include "utils/builtins.h" - -#define SYNBUFLEN 4096 - typedef struct { char *in; @@ -31,23 +28,34 @@ typedef struct typedef struct { - int len; + int len; /* length of syn array */ Syn *syn; } DictSyn; +/* + * Finds the next whitespace-delimited word within the 'in' string. + * Returns a pointer to the first character of the word, and a pointer + * to the next byte after the last character in the word (in *end). + */ static char * findwrd(char *in, char **end) { char *start; - *end = NULL; + /* Skip leading spaces */ while (*in && t_isspace(in)) in += pg_mblen(in); + /* Return NULL on empty lines */ if (*in == '\0') + { + *end = NULL; return NULL; + } + start = in; + /* Find end of word */ while (*in && !t_isspace(in)) in += pg_mblen(in); @@ -70,12 +78,11 @@ dsynonym_init(PG_FUNCTION_ARGS) ListCell *l; char *filename = NULL; FILE *fin; - char buf[SYNBUFLEN]; char *starti, *starto, *end = NULL; int cur = 0; - int slen; + char *line = NULL; foreach(l, dictoptions) { @@ -105,10 +112,33 @@ dsynonym_init(PG_FUNCTION_ARGS) d = (DictSyn *) palloc0(sizeof(DictSyn)); - while (fgets(buf, SYNBUFLEN, fin)) + while ((line = t_readline(fin)) != NULL) { - slen = strlen(buf); - pg_verifymbstr(buf, slen, false); + starti = findwrd(line, &end); + if (!starti) + { + /* Empty line */ + goto skipline; + } + *end = '\0'; + if (end >= line + strlen(line)) + { + /* A line with only one word. Ignore silently. */ + goto skipline; + } + + starto = findwrd(end + 1, &end); + if (!starto) + { + /* A line with only one word. Ignore silently. */ + goto skipline; + } + *end = '\0'; + + /* starti now points to the first word, and starto to the second + * word on the line, with a \0 terminator at the end of both words. + */ + if (cur == d->len) { if (d->len == 0) @@ -123,36 +153,19 @@ dsynonym_init(PG_FUNCTION_ARGS) } } - starti = findwrd(buf, &end); - if (!starti) - continue; - *end = '\0'; - if (end >= buf + slen) - continue; - - starto = findwrd(end + 1, &end); - if (!starto) - continue; - *end = '\0'; - - d->syn[cur].in = recode_and_lowerstr(starti); - d->syn[cur].out = recode_and_lowerstr(starto); - if (!(d->syn[cur].in && d->syn[cur].out)) - { - FreeFile(fin); - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - } + d->syn[cur].in = lowerstr(starti); + d->syn[cur].out = lowerstr(starto); cur++; + + skipline: + pfree(line); } FreeFile(fin); d->len = cur; - if (cur > 1) - qsort(d->syn, d->len, sizeof(Syn), compareSyn); + qsort(d->syn, d->len, sizeof(Syn), compareSyn); PG_RETURN_POINTER(d); } @@ -179,8 +192,7 @@ dsynonym_lexize(PG_FUNCTION_ARGS) if (!found) PG_RETURN_POINTER(NULL); - res = palloc(sizeof(TSLexeme) * 2); - memset(res, 0, sizeof(TSLexeme) * 2); + res = palloc0(sizeof(TSLexeme) * 2); res[0].lexeme = pstrdup(found->out); PG_RETURN_POINTER(res); diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c index 70700db41f..2891dc42c7 100644 --- a/src/backend/tsearch/dict_thesaurus.c +++ b/src/backend/tsearch/dict_thesaurus.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.2 2007/08/22 01:39:44 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.3 2007/08/25 00:03:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -170,10 +170,10 @@ static void thesaurusRead(char *filename, DictThesaurus * d) { FILE *fh; - char str[BUFSIZ]; int lineno = 0; uint16 idsubst = 0; bool useasis = false; + char *line; filename = get_tsearch_config_filename(filename, "ths"); fh = AllocateFile(filename, "r"); @@ -183,27 +183,28 @@ thesaurusRead(char *filename, DictThesaurus * d) errmsg("could not open thesaurus file \"%s\": %m", filename))); - while (fgets(str, sizeof(str), fh)) + while ((line = t_readline(fh)) != NULL) { - char *ptr, - *recoded; + char *ptr; int state = TR_WAITLEX; char *beginwrd = NULL; uint16 posinsubst = 0; uint16 nwrd = 0; - ptr = recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str), - GetDatabaseEncoding(), PG_UTF8); - if (recoded == NULL) - elog(ERROR, "encoding conversion failed"); - lineno++; - /* is it comment ? */ - while (t_isspace(ptr)) + ptr = line; + + /* is it a comment? */ + while (*ptr && t_isspace(ptr)) ptr += pg_mblen(ptr); - if (t_iseq(recoded, '#') || *recoded == '\0' || t_iseq(recoded, '\n') || t_iseq(recoded, '\r')) + + if (t_iseq(ptr, '#') || *ptr == '\0' || + t_iseq(ptr, '\n') || t_iseq(ptr, '\r')) + { + pfree(line); continue; + } while (*ptr) { @@ -301,8 +302,7 @@ thesaurusRead(char *filename, DictThesaurus * d) lineno, filename))); } - if (recoded != str) - pfree(recoded); + pfree(line); } d->nsubst = idsubst; diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c index d09208649f..e9bb999562 100644 --- a/src/backend/tsearch/spell.c +++ b/src/backend/tsearch/spell.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/spell.c,v 1.1 2007/08/21 01:11:18 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/spell.c,v 1.2 2007/08/25 00:03:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -21,8 +21,11 @@ /* - * during initialization dictionary requires a lot - * of memory, so it will use temporary context + * Initialization requires a lot of memory that's not needed + * after the initialization is done. In init function, + * CurrentMemoryContext is a long lived memory context associated + * with the dictionary cache entry, so we use a temporary context + * for the short-lived stuff. */ static MemoryContext tmpCtx = NULL; @@ -32,6 +35,9 @@ static MemoryContext tmpCtx = NULL; static void checkTmpCtx(void) { + /* XXX: This assumes that CurrentMemoryContext doesn't have + * any children other than the one we create here. + */ if (CurrentMemoryContext->firstchild == NULL) { tmpCtx = AllocSetContextCreate(CurrentMemoryContext, @@ -74,17 +80,7 @@ cmpspell(const void *s1, const void *s2) static int cmpspellaffix(const void *s1, const void *s2) { - return (strcmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag)); -} - -static char * -strnduplicate(char *s, int len) -{ - char *d = (char *) palloc(len + 1); - - memcpy(d, s, len); - d[len] = '\0'; - return d; + return (strncmp((*(const SPELL **) s1)->p.flag, (*(const SPELL **) s2)->p.flag, MAXFLAGLEN)); } static char * @@ -185,7 +181,7 @@ NIAddSpell(IspellDict * Conf, const char *word, const char *flag) } Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1); strcpy(Conf->Spell[Conf->nspell]->word, word); - strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, 16); + strncpy(Conf->Spell[Conf->nspell]->p.flag, flag, MAXFLAGLEN); Conf->nspell++; } @@ -197,9 +193,8 @@ NIAddSpell(IspellDict * Conf, const char *word, const char *flag) void NIImportDictionary(IspellDict * Conf, const char *filename) { - char str[BUFSIZ], - *pstr; FILE *dict; + char *line; checkTmpCtx(); @@ -209,19 +204,14 @@ NIImportDictionary(IspellDict * Conf, const char *filename) errmsg("could not open dictionary file \"%s\": %m", filename))); - while (fgets(str, sizeof(str), dict)) + while ((line = t_readline(dict)) != NULL) { - char *s, - *recoded; + char *s, *pstr; const char *flag; - recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str), - PG_UTF8, GetDatabaseEncoding()); - if (recoded == NULL) - elog(ERROR, "encoding conversion failed"); - + /* Extract flag from the line */ flag = NULL; - if ((s = findchar(recoded, '/'))) + if ((s = findchar(line, '/'))) { *s++ = '\0'; flag = s; @@ -240,8 +230,8 @@ NIImportDictionary(IspellDict * Conf, const char *filename) else flag = ""; - - s = recoded; + /* Remove trailing spaces */ + s = line; while (*s) { if (t_isspace(s)) @@ -251,13 +241,12 @@ NIImportDictionary(IspellDict * Conf, const char *filename) } s += pg_mblen(s); } - pstr = lowerstr_ctx(recoded); + pstr = lowerstr_ctx(line); NIAddSpell(Conf, pstr, flag); pfree(pstr); - if (recoded != str) - pfree(recoded); + pfree(line); } FreeFile(dict); } @@ -402,7 +391,7 @@ NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const static bool parse_affentry(char *str, char *mask, char *find, char *repl, - const char *filename, int line) + const char *filename, int lineno) { int state = PAE_WAIT_MASK; char *pmask = mask, @@ -453,7 +442,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl, ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error at line %d of affix file \"%s\"", - line, filename))); + lineno, filename))); } else if (state == PAE_INFIND) { @@ -471,7 +460,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl, ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error at line %d of affix file \"%s\"", - line, filename))); + lineno, filename))); } else if (state == PAE_WAIT_REPL) { @@ -489,7 +478,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl, ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error at line %d of affix file \"%s\"", - line, filename))); + lineno, filename))); } else if (state == PAE_INREPL) { @@ -507,7 +496,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl, ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error at line %d of affix file \"%s\"", - line, filename))); + lineno, filename))); } else elog(ERROR, "unknown state in parse_affentry: %d", state); @@ -522,7 +511,7 @@ parse_affentry(char *str, char *mask, char *find, char *repl, static void addFlagValue(IspellDict * Conf, char *s, uint32 val, - const char *filename, int line) + const char *filename, int lineno) { while (*s && t_isspace(s)) s++; @@ -531,13 +520,13 @@ addFlagValue(IspellDict * Conf, char *s, uint32 val, ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("syntax error at line %d of affix file \"%s\"", - line, filename))); + lineno, filename))); if (pg_mblen(s) != 1) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"", - line, filename))); + lineno, filename))); Conf->flagval[(unsigned int) *s] = (unsigned char) val; Conf->usecompound = true; @@ -546,7 +535,6 @@ addFlagValue(IspellDict * Conf, char *s, uint32 val, static void NIImportOOAffixes(IspellDict * Conf, const char *filename) { - char str[BUFSIZ]; char type[BUFSIZ], *ptype = NULL; char sflag[BUFSIZ]; @@ -560,9 +548,10 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename) int flag = 0; char flagflags = 0; FILE *affix; - int line = 0; + int lineno = 0; int scanread = 0; char scanbuf[BUFSIZ]; + char *recoded; checkTmpCtx(); @@ -576,45 +565,41 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename) errmsg("could not open affix file \"%s\": %m", filename))); - while (fgets(str, sizeof(str), affix)) + while ((recoded = t_readline(affix)) != NULL) { - char *recoded; - - recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str), - PG_UTF8, GetDatabaseEncoding()); - if (recoded == NULL) - elog(ERROR, "encoding conversion failed"); - - line++; + lineno++; if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) + { + pfree(recoded); continue; + } if (STRNCMP(recoded, "COMPOUNDFLAG") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"), - FF_COMPOUNDFLAG, filename, line); + FF_COMPOUNDFLAG, filename, lineno); else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"), - FF_COMPOUNDBEGIN, filename, line); + FF_COMPOUNDBEGIN, filename, lineno); else if (STRNCMP(recoded, "COMPOUNDLAST") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"), - FF_COMPOUNDLAST, filename, line); + FF_COMPOUNDLAST, filename, lineno); /* COMPOUNDLAST and COMPOUNDEND are synonyms */ else if (STRNCMP(recoded, "COMPOUNDEND") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDEND"), - FF_COMPOUNDLAST, filename, line); + FF_COMPOUNDLAST, filename, lineno); else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"), - FF_COMPOUNDMIDDLE, filename, line); + FF_COMPOUNDMIDDLE, filename, lineno); else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0) addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"), - FF_COMPOUNDONLY, filename, line); + FF_COMPOUNDONLY, filename, lineno); else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"), - FF_COMPOUNDPERMITFLAG, filename, line); + FF_COMPOUNDPERMITFLAG, filename, lineno); else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0) addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"), - FF_COMPOUNDFORBIDFLAG, filename, line); + FF_COMPOUNDFORBIDFLAG, filename, lineno); else if (STRNCMP(recoded, "FLAG") == 0) { char *s = recoded + strlen("FLAG"); @@ -626,14 +611,13 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("Ispell dictionary supports only default flag value at line %d of affix file \"%s\"", - line, filename))); + lineno, filename))); } - if (recoded != str) - pfree(recoded); + pfree(recoded); } FreeFile(affix); - line = 0; + lineno = 0; sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5); @@ -643,18 +627,11 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename) errmsg("could not open affix file \"%s\": %m", filename))); - while (fgets(str, sizeof(str), affix)) + while ((recoded = t_readline(affix)) != NULL) { - char *recoded; - - recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str), - PG_UTF8, GetDatabaseEncoding()); - if (recoded == NULL) - elog(ERROR, "encoding conversion failed"); - - line++; + lineno++; if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#')) - continue; + goto nextline; scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask); @@ -662,12 +639,12 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename) pfree(ptype); ptype = lowerstr_ctx(type); if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx"))) - continue; + goto nextline; if (scanread == 4) { if (strlen(sflag) != 1) - continue; + goto nextline; flag = *sflag; isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false; pfind = lowerstr_ctx(find); @@ -683,7 +660,7 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename) int aflg = 0; if (strlen(sflag) != 1 || flag != *sflag || flag == 0) - continue; + goto nextline; prepl = lowerstr_ctx(repl); /* affix flag */ if ((ptr = strchr(prepl, '/')) != NULL) @@ -710,8 +687,8 @@ NIImportOOAffixes(IspellDict * Conf, const char *filename) pfree(pmask); } - if (recoded != str) - pfree(recoded); + nextline: + pfree(recoded); } if (ptype) @@ -733,13 +710,14 @@ NIImportAffixes(IspellDict * Conf, const char *filename) char find[BUFSIZ]; char repl[BUFSIZ]; char *s; - int suffixes = 0; - int prefixes = 0; + bool suffixes = false; + bool prefixes = false; int flag = 0; char flagflags = 0; FILE *affix; - int line = 0; - int oldformat = 0; + int lineno = 0; + bool oldformat = false; + char *recoded = NULL; checkTmpCtx(); @@ -752,16 +730,16 @@ NIImportAffixes(IspellDict * Conf, const char *filename) memset(Conf->flagval, 0, sizeof(Conf->flagval)); Conf->usecompound = false; - while (fgets(str, sizeof(str), affix)) + while ((recoded = t_readline(affix)) != NULL) { - if (pstr) - pfree(pstr); + pstr = lowerstr(recoded); + pfree(recoded); - pstr = recode_and_lowerstr(str); + lineno++; - line++; + /* Skip comments and empty lines */ if (*pstr == '#' || *pstr == '\n') - continue; + goto nextline; if (STRNCMP(pstr, "compoundwords") == 0) { @@ -777,23 +755,23 @@ NIImportAffixes(IspellDict * Conf, const char *filename) Conf->flagval[(unsigned int) *s] = FF_COMPOUNDFLAG; Conf->usecompound = true; } - oldformat++; - continue; + oldformat = true; + goto nextline; } } if (STRNCMP(pstr, "suffixes") == 0) { - suffixes = 1; - prefixes = 0; - oldformat++; - continue; + suffixes = true; + prefixes = false; + oldformat = true; + goto nextline; } if (STRNCMP(pstr, "prefixes") == 0) { - suffixes = 0; - prefixes = 1; - oldformat++; - continue; + suffixes = false; + prefixes = true; + oldformat = true; + goto nextline; } if (STRNCMP(pstr, "flag") == 0) { @@ -802,14 +780,14 @@ NIImportAffixes(IspellDict * Conf, const char *filename) while (*s && t_isspace(s)) s++; - oldformat++; + oldformat = true; /* allow only single-encoded flags */ if (pg_mblen(s) != 1) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"", - line, filename))); + lineno, filename))); if (*s == '*') { @@ -830,10 +808,10 @@ NIImportAffixes(IspellDict * Conf, const char *filename) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("multibyte flag character is not allowed at line %d of affix file \"%s\"", - line, filename))); + lineno, filename))); flag = (unsigned char) *s; - continue; + goto nextline; } if (STRNCMP(str, "COMPOUNDFLAG") == 0 || STRNCMP(str, "COMPOUNDMIN") == 0 || STRNCMP(str, "PFX") == 0 || STRNCMP(str, "SFX") == 0) @@ -842,23 +820,23 @@ NIImportAffixes(IspellDict * Conf, const char *filename) ereport(ERROR, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("wrong affix file format for flag at line %d of affix file \"%s\"", - line, filename))); + lineno, filename))); FreeFile(affix); NIImportOOAffixes(Conf, filename); return; } if ((!suffixes) && (!prefixes)) - continue; + goto nextline; - if (!parse_affentry(pstr, mask, find, repl, filename, line)) - continue; + if (!parse_affentry(pstr, mask, find, repl, filename, lineno)) + goto nextline; NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX); - } - FreeFile(affix); - if (pstr) + nextline: pfree(pstr); + } + FreeFile(affix); } static int @@ -975,38 +953,55 @@ mkSPNode(IspellDict * Conf, int low, int high, int level) return rs; } +/* + * Builds the Conf->Dictionary tree and AffixData from the imported dictionary + * and affixes. + */ void NISortDictionary(IspellDict * Conf) { - size_t i; - int naffix = 3; + int i; + int naffix = 0; + int curaffix; checkTmpCtx(); /* compress affixes */ + + /* Count the number of different flags used in the dictionary */ + qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix); - for (i = 1; i < Conf->nspell; i++) - if (strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag)) + + naffix = 0; + for (i = 0; i < Conf->nspell; i++) + { + if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag, MAXFLAGLEN)) naffix++; + } + /* + * Fill in Conf->AffixData with the affixes that were used + * in the dictionary. Replace textual flag-field of Conf->Spell + * entries with indexes into Conf->AffixData array. + */ Conf->AffixData = (char **) palloc0(naffix * sizeof(char *)); - naffix = 1; - Conf->AffixData[0] = pstrdup(""); - Conf->AffixData[1] = pstrdup(Conf->Spell[0]->p.flag); - Conf->Spell[0]->p.d.affix = 1; - Conf->Spell[0]->p.d.len = strlen(Conf->Spell[0]->word); - for (i = 1; i < Conf->nspell; i++) + + curaffix = -1; + for (i = 0; i < Conf->nspell; i++) { - if (strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[naffix])) + if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix], MAXFLAGLEN)) { - naffix++; - Conf->AffixData[naffix] = pstrdup(Conf->Spell[i]->p.flag); + curaffix++; + Assert(curaffix < naffix); + Conf->AffixData[curaffix] = pstrdup(Conf->Spell[i]->p.flag); } - Conf->Spell[i]->p.d.affix = naffix; + + Conf->Spell[i]->p.d.affix = curaffix; Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word); } Conf->lenAffixData = Conf->nAffixData = naffix; + qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell); Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0); @@ -1085,7 +1080,7 @@ mkANode(IspellDict * Conf, int low, int high, int level, int type) } static void -mkVoidAffix(IspellDict * Conf, int issuffix, int startsuffix) +mkVoidAffix(IspellDict * Conf, bool issuffix, int startsuffix) { int i, cnt = 0; @@ -1145,7 +1140,7 @@ NISortAffixes(IspellDict * Conf) AFFIX *Affix; size_t i; CMPDAffix *ptr; - int firstsuffix = -1; + int firstsuffix = Conf->naffixes; checkTmpCtx(); @@ -1160,7 +1155,7 @@ NISortAffixes(IspellDict * Conf) for (i = 0; i < Conf->naffixes; i++) { Affix = &(((AFFIX *) Conf->Affix)[i]); - if (Affix->type == FF_SUFFIX && firstsuffix < 0) + if (Affix->type == FF_SUFFIX && i < firstsuffix) firstsuffix = i; if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 && @@ -1185,12 +1180,12 @@ NISortAffixes(IspellDict * Conf) Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX); Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX); - mkVoidAffix(Conf, 1, firstsuffix); - mkVoidAffix(Conf, 0, firstsuffix); + mkVoidAffix(Conf, true, firstsuffix); + mkVoidAffix(Conf, false, firstsuffix); } static AffixNodeData * -FinfAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type) +FindAffixes(AffixNode * node, const char *word, int wrdlen, int *level, int type) { AffixNodeData *StopLow, *StopHigh, @@ -1374,7 +1369,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) plevel = 0; while (pnode) { - prefix = FinfAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX); + prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX); if (!prefix) break; for (j = 0; j < prefix->naff; j++) @@ -1398,7 +1393,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) int baselen = 0; /* find possible suffix */ - suffix = FinfAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX); + suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX); if (!suffix) break; /* foreach suffix check affix */ @@ -1416,7 +1411,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag) swrdlen = strlen(newword); while (pnode) { - prefix = FinfAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX); + prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX); if (!prefix) break; for (j = 0; j < prefix->naff; j++) @@ -1626,7 +1621,7 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word, if (wordlen == level + 1) { /* well, it was last word */ - var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos); + var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos); var->nstem++; pfree(notprobed); return var; @@ -1641,7 +1636,7 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word, ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level); /* we can find next word */ level++; - var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos); + var->stem[var->nstem] = pnstrdup(word + startpos, level - startpos); var->nstem++; node = Conf->Dictionary; startpos = level; @@ -1656,7 +1651,7 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word, level++; } - var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos); + var->stem[var->nstem] = pnstrdup(word + startpos, wordlen - startpos); var->nstem++; pfree(notprobed); return var; diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c index c822f086e0..361152e6be 100644 --- a/src/backend/tsearch/ts_locale.c +++ b/src/backend/tsearch/ts_locale.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.1 2007/08/21 01:11:18 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.2 2007/08/25 00:03:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -125,28 +125,47 @@ _t_isprint(const char *ptr) } #endif /* TS_USE_WIDE */ + /* - * Convert C-string from UTF8 to server encoding and - * lower it + * Read the next line from a tsearch data file (expected to be in UTF-8), and + * convert it to database encoding if needed. The returned string is palloc'd. + * NULL return means EOF. */ char * -recode_and_lowerstr(char *str) +t_readline(FILE *fp) { - char *recoded; - char *ret; + int len; + char *recoded; + char buf[4096]; /* lines must not be longer than this */ + + if (fgets(buf, sizeof(buf), fp) == NULL) + return NULL; - recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str), - PG_UTF8, GetDatabaseEncoding()); + len = strlen(buf); - if (recoded == NULL) - elog(ERROR, "encoding conversion failed"); + /* Make sure the input is valid UTF-8 */ + (void) pg_verify_mbstr(PG_UTF8, buf, len, false); - ret = lowerstr(recoded); + /* And convert */ + recoded = (char *) pg_do_encoding_conversion((unsigned char *) buf, + len, + PG_UTF8, + GetDatabaseEncoding()); - if (recoded != str) - pfree(recoded); + if (recoded == NULL) /* should not happen */ + elog(ERROR, "encoding conversion failed"); + + if (recoded == buf) + { + /* + * conversion didn't pstrdup, so we must. + * We can use the length of the original string, because + * no conversion was done. + */ + recoded = pnstrdup(recoded, len); + } - return ret; + return recoded; } char * @@ -155,6 +174,9 @@ lowerstr(char *str) return lowerstr_with_len(str, strlen(str)); } +/* + * Returned string is palloc'd + */ char * lowerstr_with_len(char *str, int len) { diff --git a/src/backend/tsearch/ts_parse.c b/src/backend/tsearch/ts_parse.c index f286a61fb0..47e18fc1ac 100644 --- a/src/backend/tsearch/ts_parse.c +++ b/src/backend/tsearch/ts_parse.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.1 2007/08/21 01:11:18 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.2 2007/08/25 00:03:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -308,7 +308,7 @@ LexizeExec(LexizeData * ld, ParsedLex ** correspondLexem) { /* * Dictionary normalizes lexemes, so we remove from stack all - * used lexemes , return to basic mode and redo end of stack + * used lexemes, return to basic mode and redo end of stack * (if it exists) */ if (res) @@ -427,14 +427,14 @@ parsetext(Oid cfgId, ParsedText * prs, char *buf, int4 buflen) * Headline framework */ static void -hladdword(HeadlineText * prs, char *buf, int4 buflen, int type) +hladdword(HeadlineParsedText * prs, char *buf, int4 buflen, int type) { while (prs->curwords >= prs->lenwords) { prs->lenwords *= 2; - prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord)); + prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry)); } - memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWord)); + memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry)); prs->words[prs->curwords].type = (uint8) type; prs->words[prs->curwords].len = buflen; prs->words[prs->curwords].word = palloc(buflen); @@ -443,16 +443,16 @@ hladdword(HeadlineText * prs, char *buf, int4 buflen, int type) } static void -hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen) +hlfinditem(HeadlineParsedText * prs, TSQuery query, char *buf, int buflen) { int i; QueryItem *item = GETQUERY(query); - HeadlineWord *word; + HeadlineWordEntry *word; while (prs->curwords + query->size >= prs->lenwords) { prs->lenwords *= 2; - prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord)); + prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry)); } word = &(prs->words[prs->curwords - 1]); @@ -462,7 +462,7 @@ hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen) { if (word->item) { - memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWord)); + memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry)); prs->words[prs->curwords].item = item; prs->words[prs->curwords].repeated = 1; prs->curwords++; @@ -475,7 +475,7 @@ hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen) } static void -addHLParsedLex(HeadlineText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * norms) +addHLParsedLex(HeadlineParsedText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * norms) { ParsedLex *tmplexs; TSLexeme *ptr; @@ -511,7 +511,7 @@ addHLParsedLex(HeadlineText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * n } void -hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query, char *buf, int4 buflen) +hlparsetext(Oid cfgId, HeadlineParsedText * prs, TSQuery query, char *buf, int4 buflen) { int type, lenlemm; @@ -571,12 +571,12 @@ hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query, char *buf, int4 buflen } text * -generatHeadline(HeadlineText * prs) +generateHeadline(HeadlineParsedText * prs) { text *out; int len = 128; char *ptr; - HeadlineWord *wrd = prs->words; + HeadlineWordEntry *wrd = prs->words; out = (text *) palloc(len); ptr = ((char *) out) + VARHDRSZ; diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c index 9270c40369..e9ad59282a 100644 --- a/src/backend/tsearch/ts_utils.c +++ b/src/backend/tsearch/ts_utils.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.2 2007/08/22 01:39:44 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.3 2007/08/25 00:03:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -63,21 +63,29 @@ get_tsearch_config_filename(const char *basename, return result; } -#define STOPBUFLEN 4096 +static int +comparestr(const void *a, const void *b) +{ + return strcmp(*(char **) a, *(char **) b); +} +/* + * Reads a stopword file. Each word is run through 'wordop' + * function, if given. wordop may either modify the input in-place, + * or palloc a new version. + */ void -readstoplist(char *in, StopList * s) +readstoplist(const char *fname, StopList *s, char *(*wordop) (char *)) { char **stop = NULL; s->len = 0; - if (in && *in) + if (fname && *fname) { - char *filename = get_tsearch_config_filename(in, "stop"); + char *filename = get_tsearch_config_filename(fname, "stop"); FILE *hin; - char buf[STOPBUFLEN]; + char *line; int reallen = 0; - int line = 0; if ((hin = AllocateFile(filename, "r")) == NULL) ereport(ERROR, @@ -85,65 +93,56 @@ readstoplist(char *in, StopList * s) errmsg("could not open stopword file \"%s\": %m", filename))); - while (fgets(buf, STOPBUFLEN, hin)) + while ((line = t_readline(hin)) != NULL) { - char *pbuf = buf; + char *pbuf = line; - line++; - while (*pbuf && !isspace(*pbuf)) + /* Trim trailing space */ + while (*pbuf && !t_isspace(pbuf)) pbuf++; *pbuf = '\0'; - if (*buf == '\0') - continue; - - if (!pg_verifymbstr(buf, strlen(buf), true)) + /* Skip empty lines */ + if (*line == '\0') { - FreeFile(hin); - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("invalid multibyte encoding at line %d in file \"%s\"", - line, filename))); + pfree(line); + continue; } if (s->len >= reallen) { if (reallen == 0) { - reallen = 16; + reallen = 64; stop = (char **) palloc(sizeof(char *) * reallen); } else { reallen *= 2; - stop = (char **) repalloc((void *) stop, sizeof(char *) * reallen); + stop = (char **) repalloc((void *) stop, + sizeof(char *) * reallen); } } - - if (s->wordop) - stop[s->len] = s->wordop(buf); + if (wordop) + { + stop[s->len] = wordop(line); + if (stop[s->len] != line) + pfree(line); + } else - stop[s->len] = pstrdup(buf); + stop[s->len] = line; (s->len)++; } + FreeFile(hin); pfree(filename); } s->stop = stop; -} -static int -comparestr(const void *a, const void *b) -{ - return strcmp(*(char **) a, *(char **) b); -} - -void -sortstoplist(StopList * s) -{ + /* Sort to allow binary searching */ if (s->stop && s->len > 0) qsort(s->stop, s->len, sizeof(char *), comparestr); } diff --git a/src/backend/tsearch/wparser.c b/src/backend/tsearch/wparser.c index e927e98aab..0582fec2b5 100644 --- a/src/backend/tsearch/wparser.c +++ b/src/backend/tsearch/wparser.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/wparser.c,v 1.2 2007/08/22 01:39:45 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/wparser.c,v 1.3 2007/08/25 00:03:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -300,7 +300,7 @@ ts_headline_byid_opt(PG_FUNCTION_ARGS) text *in = PG_GETARG_TEXT_P(1); TSQuery query = PG_GETARG_TSQUERY(2); text *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ? PG_GETARG_TEXT_P(3) : NULL; - HeadlineText prs; + HeadlineParsedText prs; List *prsoptions; text *out; TSConfigCacheEntry *cfg; @@ -309,9 +309,9 @@ ts_headline_byid_opt(PG_FUNCTION_ARGS) cfg = lookup_ts_config_cache(PG_GETARG_OID(0)); prsobj = lookup_ts_parser_cache(cfg->prsId); - memset(&prs, 0, sizeof(HeadlineText)); + memset(&prs, 0, sizeof(HeadlineParsedText)); prs.lenwords = 32; - prs.words = (HeadlineWord *) palloc(sizeof(HeadlineWord) * prs.lenwords); + prs.words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs.lenwords); hlparsetext(cfg->cfgId, &prs, query, VARDATA(in), VARSIZE(in) - VARHDRSZ); @@ -325,7 +325,7 @@ ts_headline_byid_opt(PG_FUNCTION_ARGS) PointerGetDatum(prsoptions), PointerGetDatum(query)); - out = generatHeadline(&prs); + out = generateHeadline(&prs); PG_FREE_IF_COPY(in, 1); PG_FREE_IF_COPY(query, 2); diff --git a/src/include/tsearch/dicts/spell.h b/src/include/tsearch/dicts/spell.h index 6c15a672f3..3dc013fea1 100644 --- a/src/include/tsearch/dicts/spell.h +++ b/src/include/tsearch/dicts/spell.h @@ -6,7 +6,7 @@ * * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/tsearch/dicts/spell.h,v 1.1 2007/08/21 01:11:29 tgl Exp $ + * $PostgreSQL: pgsql/src/include/tsearch/dicts/spell.h,v 1.2 2007/08/25 00:03:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -18,12 +18,17 @@ #include "tsearch/dicts/regis.h" #include "tsearch/ts_public.h" +/* + * Max length of a flag name. Names longer than this will be truncated + * to the maximum. + */ +#define MAXFLAGLEN 16 + struct SPNode; typedef struct { - uint32 - val:8, + uint32 val:8, isword:1, compoundflag:4, affix:19; @@ -54,22 +59,25 @@ typedef struct spell_struct { union { - char flag[16]; + /* + * flag is filled in by NIImportDictionary. After NISortDictionary, + * d is valid and flag is invalid. + */ + char flag[MAXFLAGLEN]; struct { int affix; int len; } d; } p; - char word[1]; + char word[1]; /* variable length, null-terminated */ } SPELL; #define SPELLHDRSZ (offsetof(SPELL, word)) typedef struct aff_struct { - uint32 - flag:8, + uint32 flag:8, type:1, flagflags:7, issimple:1, @@ -85,11 +93,16 @@ typedef struct aff_struct } AFFIX; /* - * affixes use deictinary flags too + * affixes use dictionary flags too */ #define FF_COMPOUNDPERMITFLAG 0x10 #define FF_COMPOUNDFORBIDFLAG 0x20 #define FF_CROSSPRODUCT 0x40 + +/* + * Don't change the order of these. Initialization sorts by these, + * and expects prefixes to come first after sorting. + */ #define FF_SUFFIX 1 #define FF_PREFIX 0 @@ -97,8 +110,7 @@ struct AffixNode; typedef struct { - uint32 - val:8, + uint32 val:8, naff:24; AFFIX **aff; struct AffixNode *node; @@ -126,9 +138,13 @@ typedef struct int naffixes; AFFIX *Affix; - int nspell; - int mspell; + /* + * Temporary array of all words in the dict file. Only used during + * initialization + */ SPELL **Spell; + int nspell; /* number of valid entries in Spell array */ + int mspell; /* allocated length of Spell array */ AffixNode *Suffix; AffixNode *Prefix; diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h index 8a19766647..dcae2af93a 100644 --- a/src/include/tsearch/ts_locale.h +++ b/src/include/tsearch/ts_locale.h @@ -5,7 +5,7 @@ * * Copyright (c) 1998-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/tsearch/ts_locale.h,v 1.1 2007/08/21 01:11:29 tgl Exp $ + * $PostgreSQL: pgsql/src/include/tsearch/ts_locale.h,v 1.2 2007/08/25 00:03:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -38,11 +38,11 @@ #ifdef TS_USE_WIDE -size_t char2wchar(wchar_t *to, const char *from, size_t len); +extern size_t char2wchar(wchar_t *to, const char *from, size_t len); #ifdef WIN32 -size_t wchar2char(char *to, const wchar_t *from, size_t len); +extern size_t wchar2char(char *to, const wchar_t *from, size_t len); #else /* WIN32 */ /* correct wcstombs */ @@ -81,8 +81,8 @@ extern int _t_isprint(const char *ptr); #define COPYCHAR(d,s) TOUCHAR(d) = TOUCHAR(s) #endif -char *lowerstr(char *str); -char *lowerstr_with_len(char *str, int len); -char *recode_and_lowerstr(char *str); +extern char *lowerstr(char *str); +extern char *lowerstr_with_len(char *str, int len); +extern char *t_readline(FILE *fp); #endif /* __TSLOCALE_H__ */ diff --git a/src/include/tsearch/ts_public.h b/src/include/tsearch/ts_public.h index 718abdb61d..148129aa8b 100644 --- a/src/include/tsearch/ts_public.h +++ b/src/include/tsearch/ts_public.h @@ -6,7 +6,7 @@ * * Copyright (c) 1998-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.2 2007/08/22 01:39:46 tgl Exp $ + * $PostgreSQL: pgsql/src/include/tsearch/ts_public.h,v 1.3 2007/08/25 00:03:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -71,12 +71,11 @@ typedef struct { int len; char **stop; - char *(*wordop) (char *); } StopList; -extern void sortstoplist(StopList * s); -extern void readstoplist(char *in, StopList * s); -extern bool searchstoplist(StopList * s, char *key); +extern void readstoplist(const char *fname, StopList *s, + char *(*wordop) (char *)); +extern bool searchstoplist(StopList *s, char *key); /* * Interface with dictionaries @@ -102,9 +101,8 @@ typedef struct #define TSL_ADDPOS 0x01 /* - * Struct for supporting complex dictionaries like - * thesaurus, pointer to is an 4-th argument for - * dictlexize method + * Struct for supporting complex dictionaries like thesaurus. + * 4th argument for dictlexize method is a pointer to this */ typedef struct { diff --git a/src/include/tsearch/ts_utils.h b/src/include/tsearch/ts_utils.h index f84db4c6e4..d2e5c8d8e4 100644 --- a/src/include/tsearch/ts_utils.h +++ b/src/include/tsearch/ts_utils.h @@ -5,7 +5,7 @@ * * Copyright (c) 1998-2007, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/tsearch/ts_utils.h,v 1.1 2007/08/21 01:11:29 tgl Exp $ + * $PostgreSQL: pgsql/src/include/tsearch/ts_utils.h,v 1.2 2007/08/25 00:03:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -13,6 +13,7 @@ #define _PG_TS_UTILS_H_ #include "tsearch/ts_type.h" +#include "tsearch/ts_public.h" /* * Common parse definitions for tsvector and tsquery @@ -38,7 +39,8 @@ typedef struct extern bool gettoken_tsvector(TSVectorParseState *state); -struct ParseQueryNode; +struct ParseQueryNode; /* private in backend/utils/adt/tsquery.c */ + typedef struct { char *buffer; /* entire string we are scanning */ @@ -46,7 +48,7 @@ typedef struct int4 state; int4 count; - /* reverse polish notation in list (for temprorary usage) */ + /* reverse polish notation in list (for temporary usage) */ struct ParseQueryNode *str; /* number in str */ @@ -102,36 +104,12 @@ extern void parsetext(Oid cfgId, ParsedText * prs, char *buf, int4 buflen); * headline framework, flow in common to generate: * 1 parse text with hlparsetext * 2 parser-specific function to find part - * 3 generatHeadline to generate result text + * 3 generateHeadline to generate result text */ -typedef struct -{ - uint32 selected:1, - in:1, - replace:1, - repeated:1, - unused:4, - type:8, - len:16; - char *word; - QueryItem *item; -} HeadlineWord; - -typedef struct -{ - HeadlineWord *words; - int4 lenwords; - int4 curwords; - char *startsel; - char *stopsel; - int2 startsellen; - int2 stopsellen; -} HeadlineText; - -extern void hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query, +extern void hlparsetext(Oid cfgId, HeadlineParsedText * prs, TSQuery query, char *buf, int4 buflen); -extern text *generatHeadline(HeadlineText * prs); +extern text *generateHeadline(HeadlineParsedText * prs); /* * token/node types for parsing -- GitLab