From c6aae3042be5249e672b731ebeb21875b5343010 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 7 Jul 2012 17:39:50 -0400 Subject: [PATCH] Simplify and document regex library's compact-NFA representation. The previous coding abused the first element of a cNFA state's arcs list to hold a per-state flag bit, which was confusing, undocumented, and not even particularly efficient. Get rid of that in favor of a separate "stflags" vector. Since there's only one bit in use, I chose to allocate a char per state; we could possibly replace this with a bitmap at some point, but that would make accesses a little slower. It's already about 8X smaller than before, so let's not get overly tense. Also document the representation better than it was before, which is to say not at all. This patch is a byproduct of investigations towards extracting a "fixed prefix" string from the compact-NFA representation of regex patterns. Might need to back-patch it if we decide to back-patch that fix, but for now it's just code cleanup so I'll just put it in HEAD. --- src/backend/regex/regc_nfa.c | 34 +++++++++++++++++----------------- src/backend/regex/regcomp.c | 2 +- src/backend/regex/rege_dfa.c | 11 +++++------ src/include/regex/regguts.h | 29 +++++++++++++++++++++-------- 4 files changed, 44 insertions(+), 32 deletions(-) diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c index 66a361ee2f..085842c92b 100644 --- a/src/backend/regex/regc_nfa.c +++ b/src/backend/regex/regc_nfa.c @@ -1330,14 +1330,16 @@ compact(struct nfa * nfa, for (s = nfa->states; s != NULL; s = s->next) { nstates++; - narcs += 1 + s->nouts + 1; - /* 1 as a fake for flags, nouts for arcs, 1 as endmarker */ + narcs += s->nouts + 1; /* need one extra for endmarker */ } + cnfa->stflags = (char *) MALLOC(nstates * sizeof(char)); cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *)); cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc)); - if (cnfa->states == NULL || cnfa->arcs == NULL) + if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL) { + if (cnfa->stflags != NULL) + FREE(cnfa->stflags); if (cnfa->states != NULL) FREE(cnfa->states); if (cnfa->arcs != NULL) @@ -1359,9 +1361,8 @@ compact(struct nfa * nfa, for (s = nfa->states; s != NULL; s = s->next) { assert((size_t) s->no < nstates); + cnfa->stflags[s->no] = 0; cnfa->states[s->no] = ca; - ca->co = 0; /* clear and skip flags "arc" */ - ca++; first = ca; for (a = s->outs; a != NULL; a = a->outchain) switch (a->type) @@ -1392,8 +1393,8 @@ compact(struct nfa * nfa, /* mark no-progress states */ for (a = nfa->pre->outs; a != NULL; a = a->outchain) - cnfa->states[a->to->no]->co = 1; - cnfa->states[nfa->pre->no]->co = 1; + cnfa->stflags[a->to->no] = CNFA_NOPROGRESS; + cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS; } /* @@ -1433,6 +1434,7 @@ freecnfa(struct cnfa * cnfa) { assert(cnfa->nstates != 0); /* not empty already */ cnfa->nstates = 0; + FREE(cnfa->stflags); FREE(cnfa->states); FREE(cnfa->arcs); } @@ -1617,7 +1619,7 @@ dumpcnfa(struct cnfa * cnfa, fprintf(f, ", haslacons"); fprintf(f, "\n"); for (st = 0; st < cnfa->nstates; st++) - dumpcstate(st, cnfa->states[st], cnfa, f); + dumpcstate(st, cnfa, f); fflush(f); } #endif @@ -1629,22 +1631,20 @@ dumpcnfa(struct cnfa * cnfa, */ static void dumpcstate(int st, - struct carc * ca, struct cnfa * cnfa, FILE *f) { - int i; + struct carc * ca; int pos; - fprintf(f, "%d%s", st, (ca[0].co) ? ":" : "."); + fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : "."); pos = 1; - for (i = 1; ca[i].co != COLORLESS; i++) + for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) { - if (ca[i].co < cnfa->ncolors) - fprintf(f, "\t[%ld]->%d", (long) ca[i].co, ca[i].to); + if (ca->co < cnfa->ncolors) + fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to); else - fprintf(f, "\t:%ld:->%d", (long) ca[i].co - cnfa->ncolors, - ca[i].to); + fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to); if (pos == 5) { fprintf(f, "\n"); @@ -1653,7 +1653,7 @@ dumpcstate(int st, else pos++; } - if (i == 1 || pos != 1) + if (ca == cnfa->states[st] || pos != 1) fprintf(f, "\n"); fflush(f); } diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 57055f04ab..ceb6f0f873 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -162,7 +162,7 @@ static void dumparcs(struct state *, FILE *); static int dumprarcs(struct arc *, struct state *, FILE *, int); static void dumparc(struct arc *, struct state *, FILE *); static void dumpcnfa(struct cnfa *, FILE *); -static void dumpcstate(int, struct carc *, struct cnfa *, FILE *); +static void dumpcstate(int, struct cnfa *, FILE *); #endif /* === regc_cvec.c === */ static struct cvec *newcvec(int, int); diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c index da7a0bf402..7a7ba5b89c 100644 --- a/src/backend/regex/rege_dfa.c +++ b/src/backend/regex/rege_dfa.c @@ -457,14 +457,14 @@ miss(struct vars * v, /* used only for debug flags */ gotstate = 0; for (i = 0; i < d->nstates; i++) if (ISBSET(css->states, i)) - for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; ca++) + for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) if (ca->co == co) { BSET(d->work, ca->to); gotstate = 1; if (ca->to == cnfa->post) ispost = 1; - if (!cnfa->states[ca->to]->co) + if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS)) noprogress = 0; FDEBUG(("%d -> %d\n", i, ca->to)); } @@ -475,10 +475,9 @@ miss(struct vars * v, /* used only for debug flags */ dolacons = 0; for (i = 0; i < d->nstates; i++) if (ISBSET(d->work, i)) - for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; - ca++) + for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++) { - if (ca->co <= cnfa->ncolors) + if (ca->co < cnfa->ncolors) continue; /* NOTE CONTINUE */ sawlacons = 1; if (ISBSET(d->work, ca->to)) @@ -489,7 +488,7 @@ miss(struct vars * v, /* used only for debug flags */ dolacons = 1; if (ca->to == cnfa->post) ispost = 1; - if (!cnfa->states[ca->to]->co) + if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS)) noprogress = 0; FDEBUG(("%d :> %d\n", i, ca->to)); } diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index e8415799ec..b8788506d4 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -279,15 +279,14 @@ struct state; struct arc { - int type; -#define ARCFREE '\0' + int type; /* 0 if free, else an NFA arc type code */ color co; struct state *from; /* where it's from (and contained within) */ struct state *to; /* where it's to */ - struct arc *outchain; /* *from's outs chain or free chain */ + struct arc *outchain; /* link in *from's outs chain or free chain */ #define freechain outchain - struct arc *inchain; /* *to's ins chain */ - struct arc *colorchain; /* color's arc chain */ + struct arc *inchain; /* link in *to's ins chain */ + struct arc *colorchain; /* link in color's arc chain */ struct arc *colorchainRev; /* back-link in color's arc chain */ }; @@ -339,24 +338,38 @@ struct nfa /* * definitions for compacted NFA + * + * The main space savings in a compacted NFA is from making the arcs as small + * as possible. We store only the transition color and next-state number for + * each arc. The list of out arcs for each state is an array beginning at + * cnfa.states[statenumber], and terminated by a dummy carc struct with + * co == COLORLESS. + * + * The non-dummy carc structs are of two types: plain arcs and LACON arcs. + * Plain arcs just store the transition color number as "co". LACON arcs + * store the lookahead constraint number plus cnfa.ncolors as "co". LACON + * arcs can be distinguished from plain by testing for co >= cnfa.ncolors. */ struct carc { color co; /* COLORLESS is list terminator */ - int to; /* state number */ + int to; /* next-state number */ }; struct cnfa { int nstates; /* number of states */ - int ncolors; /* number of colors */ + int ncolors; /* number of colors (max color in use + 1) */ int flags; -#define HASLACONS 01 /* uses lookahead constraints */ +#define HASLACONS 01 /* uses lookahead constraints */ int pre; /* setup state number */ int post; /* teardown state number */ color bos[2]; /* colors, if any, assigned to BOS and BOL */ color eos[2]; /* colors, if any, assigned to EOS and EOL */ + char *stflags; /* vector of per-state flags bytes */ +#define CNFA_NOPROGRESS 01 /* flag bit for a no-progress state */ struct carc **states; /* vector of pointers to outarc lists */ + /* states[n] are pointers into a single malloc'd array of arcs */ struct carc *arcs; /* the area for the lists */ }; -- GitLab