diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2003-02-05 17:41:33 +0000 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2003-02-05 17:41:33 +0000 |
commit | 7bcc6d98fb5c3bda2787ae085ef3ff3dbb65ae42 (patch) | |
tree | 7a269b416abdaec2b9b78c32ce485390aae1cda3 /src/backend/utils/adt/regexp.c | |
parent | 32c3db0f86cdf23646094b06331f71e42fd4e413 (diff) | |
download | postgresql-7bcc6d98fb5c3bda2787ae085ef3ff3dbb65ae42.tar.gz postgresql-7bcc6d98fb5c3bda2787ae085ef3ff3dbb65ae42.zip |
Replace regular expression package with Henry Spencer's latest version
(extracted from Tcl 8.4.1 release, as Henry still hasn't got round to
making it a separate library). This solves a performance problem for
multibyte, as well as upgrading our regexp support to match recent Tcl
and nearly match recent Perl.
Diffstat (limited to 'src/backend/utils/adt/regexp.c')
-rw-r--r-- | src/backend/utils/adt/regexp.c | 381 |
1 files changed, 201 insertions, 180 deletions
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index ebbca8f0401..604e55d4145 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -1,14 +1,14 @@ /*------------------------------------------------------------------------- * * regexp.c - * regular expression handling code. + * Postgres' interface to the regular expression package. * * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.43 2002/09/22 17:27:23 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.44 2003/02/05 17:41:32 tgl Exp $ * * Alistair Crooks added the code for the regex caching * agc - cached the regular expressions used - there's a good chance @@ -30,171 +30,189 @@ #include "postgres.h" #include "regex/regex.h" +#include "mb/pg_wchar.h" #include "utils/builtins.h" -#if defined(DISABLE_XOPEN_NLS) -#undef _XOPEN_SOURCE -#endif /* DISABLE_XOPEN_NLS */ -/* this is the number of cached regular expressions held. */ +/* + * We cache precompiled regular expressions using a "self organizing list" + * structure, in which recently-used items tend to be near the front. + * Whenever we use an entry, it's moved up to the front of the list. + * Over time, an item's average position corresponds to its frequency of use. + * + * When we first create an entry, it's inserted at the front of + * the array, dropping the entry at the end of the array if necessary to + * make room. (This might seem to be weighting the new entry too heavily, + * but if we insert new entries further back, we'll be unable to adjust to + * a sudden shift in the query mix where we are presented with MAX_CACHED_RES + * never-before-seen items used circularly. We ought to be able to handle + * that case, so we have to insert at the front.) + * + * Knuth mentions a variant strategy in which a used item is moved up just + * one place in the list. Although he says this uses fewer comparisons on + * average, it seems not to adapt very well to the situation where you have + * both some reusable patterns and a steady stream of non-reusable patterns. + * A reusable pattern that isn't used at least as often as non-reusable + * patterns are seen will "fail to keep up" and will drop off the end of the + * cache. With move-to-front, a reusable pattern is guaranteed to stay in + * the cache as long as it's used at least once in every MAX_CACHED_RES uses. + */ + +/* this is the maximum number of cached regular expressions */ #ifndef MAX_CACHED_RES #define MAX_CACHED_RES 32 #endif -/* this structure describes a cached regular expression */ -struct cached_re_str +/* this structure describes one cached regular expression */ +typedef struct cached_re_str { - char *cre_s; /* pattern as null-terminated string */ - int cre_type; /* compiled-type: extended,icase etc */ + text *cre_pat; /* original RE (untoasted TEXT form) */ + int cre_flags; /* compile flags: extended,icase etc */ regex_t cre_re; /* the compiled regular expression */ - unsigned long cre_lru; /* lru tag */ -}; +} cached_re_str; -static int rec = 0; /* # of cached re's */ -static struct cached_re_str rev[MAX_CACHED_RES]; /* cached re's */ -static unsigned long lru; /* system lru tag */ -static int pg_lastrec = 0; +static int num_res = 0; /* # of cached re's */ +static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */ -/* attempt to compile `re' as an re, then match it against text */ -/* cflags - flag to regcomp indicates case sensitivity */ + +/* + * RE_compile_and_execute - compile and execute a RE, caching if possible + * + * Returns TRUE on match, FALSE on no match + * + * text_re --- the pattern, expressed as an *untoasted* TEXT object + * dat --- the data to match against (need not be null-terminated) + * dat_len --- the length of the data string + * cflags --- compile options for the pattern + * nmatch, pmatch --- optional return area for match details + * + * Both pattern and data are given in the database encoding. We internally + * convert to array of pg_wchar which is what Spencer's regex package wants. + */ static bool -RE_compile_and_execute(text *text_re, char *text, int cflags, - int nmatch, regmatch_t *pmatch) +RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len, + int cflags, int nmatch, regmatch_t *pmatch) { - char *re; - int oldest; + int text_re_len = VARSIZE(text_re); + pg_wchar *data; + size_t data_len; + pg_wchar *pattern; + size_t pattern_len; int i; int regcomp_result; + int regexec_result; + cached_re_str re_temp; - /* Convert 'text' pattern to null-terminated string */ - re = DatumGetCString(DirectFunctionCall1(textout, - PointerGetDatum(text_re))); + /* Convert data string to wide characters */ + data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); + data_len = pg_mb2wchar_with_len(dat, data, dat_len); /* - * Find a previously compiled regular expression. Run the cache as a - * ring buffer, starting the search from the previous match if any. + * Look for a match among previously compiled REs. Since the data + * structure is self-organizing with most-used entries at the front, + * our search strategy can just be to scan from the front. */ - i = pg_lastrec; - while (i < rec) + for (i = 0; i < num_res; i++) { - if (rev[i].cre_s != NULL) + if (memcmp(re_array[i].cre_pat, text_re, text_re_len) == 0 && + re_array[i].cre_flags == cflags) { - if (strcmp(rev[i].cre_s, re) == 0 && - rev[i].cre_type == cflags) + /* + * Found a match; move it to front if not there already. + */ + if (i > 0) { - pg_lastrec = i; - rev[i].cre_lru = ++lru; - pfree(re); - return (pg_regexec(&rev[i].cre_re, - text, nmatch, - pmatch, 0) == 0); + re_temp = re_array[i]; + memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str)); + re_array[0] = re_temp; } - } - i++; - /* - * If we were not at the first slot to start, then think about - * wrapping if necessary. - */ - if (pg_lastrec != 0) - { - if (i >= rec) - i = 0; - else if (i == pg_lastrec) - break; - } - } + /* Perform RE match and return result */ + regexec_result = pg_regexec(&re_array[0].cre_re, + data, + data_len, + NULL, /* no details */ + nmatch, + pmatch, + 0); - /* we didn't find it - make room in the cache for it */ - if (rec >= MAX_CACHED_RES) - { - /* cache is full - find the oldest entry */ - for (oldest = 0, i = 1; i < rec; i++) - { - if (rev[i].cre_lru < rev[oldest].cre_lru) - oldest = i; - } - } - else - oldest = rec++; + pfree(data); - /* if there was an old re, then de-allocate the space it used */ - if (rev[oldest].cre_s != (char *) NULL) - { - for (lru = i = 0; i < rec; i++) - { - /* downweight all of the other cached entries */ - rev[i].cre_lru = (rev[i].cre_lru - rev[oldest].cre_lru) / 2; - if (rev[i].cre_lru > lru) - lru = rev[i].cre_lru; + return (regexec_result == 0); } - pg_regfree(&rev[oldest].cre_re); - - /* - * use malloc/free for the cre_s field because the storage has to - * persist across transactions - */ - free(rev[oldest].cre_s); - rev[oldest].cre_s = (char *) NULL; } - /* compile the re */ - regcomp_result = pg_regcomp(&rev[oldest].cre_re, re, cflags); - if (regcomp_result == 0) - { - pg_lastrec = oldest; - - /* - * use malloc/free for the cre_s field because the storage has to - * persist across transactions - */ - rev[oldest].cre_s = strdup(re); - rev[oldest].cre_lru = ++lru; - rev[oldest].cre_type = cflags; - pfree(re); - /* agc - fixed an old typo here */ - return (pg_regexec(&rev[oldest].cre_re, text, - nmatch, pmatch, 0) == 0); - } - else - { - char errMsg[1000]; + /* + * Couldn't find it, so try to compile the new RE. To avoid leaking + * resources on failure, we build into the re_temp local. + */ + + /* Convert pattern string to wide characters */ + pattern = (pg_wchar *) palloc((text_re_len - VARHDRSZ + 1) * sizeof(pg_wchar)); + pattern_len = pg_mb2wchar_with_len((unsigned char *) VARDATA(text_re), + pattern, + text_re_len - VARHDRSZ); + regcomp_result = pg_regcomp(&re_temp.cre_re, + pattern, + pattern_len, + cflags); + + pfree(pattern); + + if (regcomp_result != 0) + { /* re didn't compile */ - pg_regerror(regcomp_result, &rev[oldest].cre_re, errMsg, - sizeof(errMsg)); + char errMsg[100]; + + pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg)); + /* XXX should we pg_regfree here? */ elog(ERROR, "Invalid regular expression: %s", errMsg); } - /* not reached */ - return false; -} - + /* + * use malloc/free for the cre_pat field because the storage has to + * persist across transactions + */ + re_temp.cre_pat = malloc(text_re_len); + if (re_temp.cre_pat == NULL) + { + pg_regfree(&re_temp.cre_re); + elog(ERROR, "Out of memory"); + } + memcpy(re_temp.cre_pat, text_re, text_re_len); + re_temp.cre_flags = cflags; -/* - fixedlen_regexeq: + /* + * Okay, we have a valid new item in re_temp; insert it into the + * storage array. Discard last entry if needed. + */ + if (num_res >= MAX_CACHED_RES) + { + --num_res; + Assert(num_res < MAX_CACHED_RES); + pg_regfree(&re_array[num_res].cre_re); + free(re_array[num_res].cre_pat); + } - a generic fixed length regexp routine - s - the string to match against (not necessarily null-terminated) - p - the pattern (as a text*) - charlen - the length of the string -*/ -static bool -fixedlen_regexeq(char *s, text *p, int charlen, int cflags) -{ - char *sterm; - bool result; + if (num_res > 0) + memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str)); - /* be sure sterm is null-terminated */ - sterm = (char *) palloc(charlen + 1); - memcpy(sterm, s, charlen); - sterm[charlen] = '\0'; + re_array[0] = re_temp; + num_res++; - result = RE_compile_and_execute(p, sterm, cflags, 0, NULL); + /* Perform RE match and return result */ + regexec_result = pg_regexec(&re_array[0].cre_re, + data, + data_len, + NULL, /* no details */ + nmatch, + pmatch, + 0); - pfree(sterm); + pfree(data); - return result; + return (regexec_result == 0); } @@ -208,10 +226,11 @@ nameregexeq(PG_FUNCTION_ARGS) Name n = PG_GETARG_NAME(0); text *p = PG_GETARG_TEXT_P(1); - PG_RETURN_BOOL(fixedlen_regexeq(NameStr(*n), - p, - strlen(NameStr(*n)), - REG_EXTENDED)); + PG_RETURN_BOOL(RE_compile_and_execute(p, + (unsigned char *) NameStr(*n), + strlen(NameStr(*n)), + REG_ADVANCED, + 0, NULL)); } Datum @@ -220,10 +239,11 @@ nameregexne(PG_FUNCTION_ARGS) Name n = PG_GETARG_NAME(0); text *p = PG_GETARG_TEXT_P(1); - PG_RETURN_BOOL(!fixedlen_regexeq(NameStr(*n), - p, - strlen(NameStr(*n)), - REG_EXTENDED)); + PG_RETURN_BOOL(!RE_compile_and_execute(p, + (unsigned char *) NameStr(*n), + strlen(NameStr(*n)), + REG_ADVANCED, + 0, NULL)); } Datum @@ -232,10 +252,11 @@ textregexeq(PG_FUNCTION_ARGS) text *s = PG_GETARG_TEXT_P(0); text *p = PG_GETARG_TEXT_P(1); - PG_RETURN_BOOL(fixedlen_regexeq(VARDATA(s), - p, - VARSIZE(s) - VARHDRSZ, - REG_EXTENDED)); + PG_RETURN_BOOL(RE_compile_and_execute(p, + (unsigned char *) VARDATA(s), + VARSIZE(s) - VARHDRSZ, + REG_ADVANCED, + 0, NULL)); } Datum @@ -244,10 +265,11 @@ textregexne(PG_FUNCTION_ARGS) text *s = PG_GETARG_TEXT_P(0); text *p = PG_GETARG_TEXT_P(1); - PG_RETURN_BOOL(!fixedlen_regexeq(VARDATA(s), - p, - VARSIZE(s) - VARHDRSZ, - REG_EXTENDED)); + PG_RETURN_BOOL(!RE_compile_and_execute(p, + (unsigned char *) VARDATA(s), + VARSIZE(s) - VARHDRSZ, + REG_ADVANCED, + 0, NULL)); } @@ -258,82 +280,81 @@ textregexne(PG_FUNCTION_ARGS) Datum -texticregexeq(PG_FUNCTION_ARGS) +nameicregexeq(PG_FUNCTION_ARGS) { - text *s = PG_GETARG_TEXT_P(0); + Name n = PG_GETARG_NAME(0); text *p = PG_GETARG_TEXT_P(1); - PG_RETURN_BOOL(fixedlen_regexeq(VARDATA(s), - p, - VARSIZE(s) - VARHDRSZ, - REG_ICASE | REG_EXTENDED)); + PG_RETURN_BOOL(RE_compile_and_execute(p, + (unsigned char *) NameStr(*n), + strlen(NameStr(*n)), + REG_ICASE | REG_ADVANCED, + 0, NULL)); } Datum -texticregexne(PG_FUNCTION_ARGS) +nameicregexne(PG_FUNCTION_ARGS) { - text *s = PG_GETARG_TEXT_P(0); + Name n = PG_GETARG_NAME(0); text *p = PG_GETARG_TEXT_P(1); - PG_RETURN_BOOL(!fixedlen_regexeq(VARDATA(s), - p, - VARSIZE(s) - VARHDRSZ, - REG_ICASE | REG_EXTENDED)); + PG_RETURN_BOOL(!RE_compile_and_execute(p, + (unsigned char *) NameStr(*n), + strlen(NameStr(*n)), + REG_ICASE | REG_ADVANCED, + 0, NULL)); } Datum -nameicregexeq(PG_FUNCTION_ARGS) +texticregexeq(PG_FUNCTION_ARGS) { - Name n = PG_GETARG_NAME(0); + text *s = PG_GETARG_TEXT_P(0); text *p = PG_GETARG_TEXT_P(1); - PG_RETURN_BOOL(fixedlen_regexeq(NameStr(*n), - p, - strlen(NameStr(*n)), - REG_ICASE | REG_EXTENDED)); + PG_RETURN_BOOL(RE_compile_and_execute(p, + (unsigned char *) VARDATA(s), + VARSIZE(s) - VARHDRSZ, + REG_ICASE | REG_ADVANCED, + 0, NULL)); } Datum -nameicregexne(PG_FUNCTION_ARGS) +texticregexne(PG_FUNCTION_ARGS) { - Name n = PG_GETARG_NAME(0); + text *s = PG_GETARG_TEXT_P(0); text *p = PG_GETARG_TEXT_P(1); - PG_RETURN_BOOL(!fixedlen_regexeq(NameStr(*n), - p, - strlen(NameStr(*n)), - REG_ICASE | REG_EXTENDED)); + PG_RETURN_BOOL(!RE_compile_and_execute(p, + (unsigned char *) VARDATA(s), + VARSIZE(s) - VARHDRSZ, + REG_ICASE | REG_ADVANCED, + 0, NULL)); } -/* textregexsubstr() - * Return a substring matched by a regular expression. +/* + * textregexsubstr() + * Return a substring matched by a regular expression. */ Datum textregexsubstr(PG_FUNCTION_ARGS) { text *s = PG_GETARG_TEXT_P(0); text *p = PG_GETARG_TEXT_P(1); - char *sterm; - int len; bool match; regmatch_t pmatch[2]; - /* be sure sterm is null-terminated */ - len = VARSIZE(s) - VARHDRSZ; - sterm = (char *) palloc(len + 1); - memcpy(sterm, VARDATA(s), len); - sterm[len] = '\0'; - /* * We pass two regmatch_t structs to get info about the overall match * and the match for the first parenthesized subexpression (if any). * If there is a parenthesized subexpression, we return what it matched; * else return what the whole regexp matched. */ - match = RE_compile_and_execute(p, sterm, REG_EXTENDED, 2, pmatch); - - pfree(sterm); + match = RE_compile_and_execute(p, + (unsigned char *) VARDATA(s), + VARSIZE(s) - VARHDRSZ, + REG_ADVANCED, + 2, pmatch); /* match? then return the substring matching the pattern */ if (match) |