diff options
Diffstat (limited to 'src/backend/utils/adt/regexp.c')
-rw-r--r-- | src/backend/utils/adt/regexp.c | 621 |
1 files changed, 331 insertions, 290 deletions
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 7f140ddfdcb..05c00deaf98 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.71 2007/03/28 22:59:37 neilc Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.72 2007/08/11 03:56:24 tgl Exp $ * * Alistair Crooks added the code for the regex caching * agc - cached the regular expressions used - there's a good chance @@ -29,19 +29,42 @@ */ #include "postgres.h" -#include "access/heapam.h" #include "catalog/pg_type.h" #include "funcapi.h" #include "regex/regex.h" #include "utils/builtins.h" #include "utils/guc.h" -#include "utils/lsyscache.h" + +#define PG_GETARG_TEXT_P_IF_EXISTS(_n) \ + (PG_NARGS() > (_n) ? PG_GETARG_TEXT_P(_n) : NULL) /* GUC-settable flavor parameter */ static int regex_flavor = REG_ADVANCED; +/* all the options of interest for regex functions */ +typedef struct pg_re_flags +{ + int cflags; /* compile flags for Spencer's regex code */ + bool glob; /* do it globally (for each occurrence) */ +} pg_re_flags; + +/* cross-call state for regexp_matches(), also regexp_split() */ +typedef struct regexp_matches_ctx +{ + text *orig_str; /* data string in original TEXT form */ + int nmatches; /* number of places where pattern matched */ + int npatterns; /* number of capturing subpatterns */ + /* We store start char index and end+1 char index for each match */ + /* so the number of entries in match_locs is nmatches * npatterns * 2 */ + int *match_locs; /* 0-based character indexes */ + int next_match; /* 0-based index of next match to process */ + /* workspace for build_regexp_matches_result() */ + Datum *elems; /* has npatterns elements */ + bool *nulls; /* has npatterns elements */ +} regexp_matches_ctx; + /* * We cache precompiled regular expressions using a "self organizing list" * structure, in which recently-used items tend to be near the front. @@ -79,48 +102,18 @@ typedef struct cached_re_str regex_t cre_re; /* the compiled regular expression */ } cached_re_str; -typedef struct re_comp_flags -{ - int cflags; - bool glob; -} re_comp_flags; - -typedef struct regexp_matches_ctx -{ - text *orig_str; - size_t orig_len; - pg_wchar *wide_str; - size_t wide_len; - regex_t *cpattern; - regmatch_t *pmatch; - size_t offset; - - re_comp_flags flags; -} regexp_matches_ctx; - -typedef struct regexp_split_ctx -{ - text *orig_str; - size_t orig_len; - pg_wchar *wide_str; - size_t wide_len; - regex_t *cpattern; - regmatch_t match; - size_t offset; - re_comp_flags flags; -} regexp_split_ctx; - - static int num_res = 0; /* # of cached re's */ static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */ -static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern, - text *flags); -static ArrayType *perform_regexp_matches(regexp_matches_ctx *matchctx); -static regexp_split_ctx *setup_regexp_split(text *str, text *pattern, - text *flags); -static Datum get_next_split(regexp_split_ctx *splitctx); +/* Local functions */ +static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern, + text *flags, + bool force_glob, + bool use_subpatterns, + bool ignore_degenerate); +static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx); +static Datum build_regexp_split_result(regexp_matches_ctx *splitctx); /* @@ -139,7 +132,7 @@ RE_compile_and_cache(text *text_re, int cflags) { int text_re_len = VARSIZE(text_re); pg_wchar *pattern; - size_t pattern_len; + int pattern_len; int i; int regcomp_result; cached_re_str re_temp; @@ -235,7 +228,7 @@ RE_compile_and_cache(text *text_re, int cflags) } /* - * RE_wchar_execute - execute a RE + * RE_wchar_execute - execute a RE on pg_wchar data * * Returns TRUE on match, FALSE on no match * @@ -250,7 +243,7 @@ RE_compile_and_cache(text *text_re, int cflags) */ static bool RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len, - size_t start_search, int nmatch, regmatch_t *pmatch) + int start_search, int nmatch, regmatch_t *pmatch) { int regexec_result; char errMsg[100]; @@ -295,7 +288,7 @@ RE_execute(regex_t *re, char *dat, int dat_len, int nmatch, regmatch_t *pmatch) { pg_wchar *data; - size_t data_len; + int data_len; bool match; /* Convert data string to wide characters */ @@ -304,6 +297,7 @@ RE_execute(regex_t *re, char *dat, int dat_len, /* Perform RE match and return result */ match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch); + pfree(data); return match; } @@ -334,17 +328,28 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len, return RE_execute(re, dat, dat_len, nmatch, pmatch); } + +/* + * parse_re_flags - parse the options argument of regexp_matches and friends + * + * flags --- output argument, filled with desired options + * opts --- *untoasted* TEXT object, or NULL for defaults + * + * This accepts all the options allowed by any of the callers; callers that + * don't want some have to reject them after the fact. + */ static void -parse_re_comp_flags(re_comp_flags *flags, text *opts) +parse_re_flags(pg_re_flags *flags, text *opts) { - MemSet(flags, 0, sizeof(re_comp_flags)); + /* regex_flavor is always folded into the compile flags */ flags->cflags = regex_flavor; + flags->glob = false; if (opts) { - char *opt_p = VARDATA(opts); - size_t opt_len = VARSIZE(opts) - VARHDRSZ; - int i; + char *opt_p = VARDATA(opts); + int opt_len = VARSIZE(opts) - VARHDRSZ; + int i; for (i = 0; i < opt_len; i++) { @@ -353,28 +358,49 @@ parse_re_comp_flags(re_comp_flags *flags, text *opts) case 'g': flags->glob = true; break; - case 'i': + case 'b': /* BREs (but why???) */ + flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE); + break; + case 'c': /* case sensitive */ + flags->cflags &= ~REG_ICASE; + break; + case 'e': /* plain EREs */ + flags->cflags |= REG_EXTENDED; + flags->cflags &= ~(REG_ADVANCED | REG_QUOTE); + break; + case 'i': /* case insensitive */ flags->cflags |= REG_ICASE; break; - case 'm': - case 'n': + case 'm': /* Perloid synonym for n */ + case 'n': /* \n affects ^ $ . [^ */ flags->cflags |= REG_NEWLINE; break; - case 'p': + case 'p': /* ~Perl, \n affects . [^ */ flags->cflags |= REG_NLSTOP; flags->cflags &= ~REG_NLANCH; break; - case 'w': + case 'q': /* literal string */ + flags->cflags |= REG_QUOTE; + flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED); + break; + case 's': /* single line, \n ordinary */ + flags->cflags &= ~REG_NEWLINE; + break; + case 't': /* tight syntax */ + flags->cflags &= ~REG_EXPANDED; + break; + case 'w': /* weird, \n affects ^ $ only */ flags->cflags &= ~REG_NLSTOP; flags->cflags |= REG_NLANCH; break; - case 'x': + case 'x': /* expanded syntax */ flags->cflags |= REG_EXPANDED; break; default: ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid regexp option: %c", opt_p[i]))); + errmsg("invalid regexp option: \"%c\"", + opt_p[i]))); break; } } @@ -410,6 +436,16 @@ assign_regex_flavor(const char *value, bool doit, GucSource source) /* + * report whether regex_flavor is currently BASIC + */ +bool +regex_flavor_is_basic(void) +{ + return (regex_flavor == REG_BASIC); +} + + +/* * interface routines called by the function manager */ @@ -605,16 +641,17 @@ textregexreplace(PG_FUNCTION_ARGS) text *r = PG_GETARG_TEXT_P(2); text *opt = PG_GETARG_TEXT_P(3); regex_t *re; - re_comp_flags flags; + pg_re_flags flags; - parse_re_comp_flags(&flags, opt); + parse_re_flags(&flags, opt); re = RE_compile_and_cache(p, flags.cflags); PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob)); } -/* similar_escape() +/* + * similar_escape() * Convert a SQL99 regexp pattern to POSIX style, so it can be used by * our regexp engine. */ @@ -735,185 +772,255 @@ similar_escape(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(result); } -#define PG_GETARG_TEXT_P_IF_EXISTS(_n) \ - (PG_NARGS() > _n ? PG_GETARG_TEXT_P(_n) : NULL) - +/* + * regexp_matches() + * Return a table of matches of a pattern within a string. + */ Datum regexp_matches(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; - MemoryContext oldcontext; regexp_matches_ctx *matchctx; if (SRF_IS_FIRSTCALL()) { text *pattern = PG_GETARG_TEXT_P(1); text *flags = PG_GETARG_TEXT_P_IF_EXISTS(2); + MemoryContext oldcontext; funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* be sure to copy the input string into the multi-call ctx */ matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern, - flags); + flags, false, true, false); + + /* Pre-create workspace that build_regexp_matches_result needs */ + matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns); + matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns); MemoryContextSwitchTo(oldcontext); funcctx->user_fctx = (void *) matchctx; - - /* - * Avoid run-away function by making sure we never iterate - * more than the length of the text + 1 (the number of matches - * an empty pattern will make is length + 1) - */ - if (matchctx->flags.glob) - funcctx->max_calls = matchctx->wide_len + 1; - else - funcctx->max_calls = 0; } funcctx = SRF_PERCALL_SETUP(); matchctx = (regexp_matches_ctx *) funcctx->user_fctx; - if (funcctx->call_cntr > funcctx->max_calls) - { - /* - * If max_calls == 0, then we are doing a non-global match, we - * should stop now, no problem. Otherwise, if we exceed - * max_calls something really wonky is going on, since it is - * returning more matches than there are characters in the - * string, which should not happen - */ - if (funcctx->max_calls != 0) - elog(ERROR, "set returning match function terminated after iterating %d times", - funcctx->call_cntr); - - SRF_RETURN_DONE(funcctx); - } - - if (matchctx->offset < matchctx->wide_len) + if (matchctx->next_match < matchctx->nmatches) { ArrayType *result_ary; - if (matchctx->pmatch[0].rm_so == matchctx->pmatch[0].rm_eo) - matchctx->offset++; - - result_ary = perform_regexp_matches(matchctx); - if (result_ary != NULL) - { - matchctx->offset = matchctx->pmatch[0].rm_eo; - SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary)); - } - /* else fall through and return done */ + result_ary = build_regexp_matches_result(matchctx); + matchctx->next_match++; + SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary)); } SRF_RETURN_DONE(funcctx); } +/* This is separate to keep the opr_sanity regression test from complaining */ Datum regexp_matches_no_flags(PG_FUNCTION_ARGS) { return regexp_matches(fcinfo); } +/* + * setup_regexp_matches --- do the initial matching for regexp_matches() + * or regexp_split() + * + * To avoid having to re-find the compiled pattern on each call, we do + * all the matching in one swoop. The returned regexp_matches_ctx contains + * the locations of all the substrings matching the pattern. + * + * The three bool parameters have only two patterns (one for each caller) + * but it seems clearer to distinguish the functionality this way than to + * key it all off one "is_split" flag. + */ static regexp_matches_ctx * -setup_regexp_matches(text *orig_str, text *pattern, text *flags) +setup_regexp_matches(text *orig_str, text *pattern, text *flags, + bool force_glob, bool use_subpatterns, + bool ignore_degenerate) { - regexp_matches_ctx *matchctx = palloc(sizeof(regexp_matches_ctx)); - + regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx)); + int orig_len; + pg_wchar *wide_str; + int wide_len; + pg_re_flags re_flags; + regex_t *cpattern; + regmatch_t *pmatch; + int pmatch_len; + int array_len; + int array_idx; + int prev_match_end; + int start_search; + + /* save original string --- we'll extract result substrings from it */ matchctx->orig_str = orig_str; - matchctx->orig_len = VARSIZE(matchctx->orig_str) - VARHDRSZ; - - parse_re_comp_flags(&matchctx->flags, flags); - matchctx->cpattern = RE_compile_and_cache(pattern, matchctx->flags.cflags); - matchctx->pmatch = palloc(sizeof(regmatch_t) * (matchctx->cpattern->re_nsub + 1)); - matchctx->offset = 0; + /* convert string to pg_wchar form for matching */ + orig_len = VARSIZE(orig_str) - VARHDRSZ; + wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1)); + wide_len = pg_mb2wchar_with_len(VARDATA(orig_str), wide_str, orig_len); - matchctx->wide_str = palloc(sizeof(pg_wchar) * (matchctx->orig_len + 1)); - matchctx->wide_len = pg_mb2wchar_with_len(VARDATA(matchctx->orig_str), - matchctx->wide_str, matchctx->orig_len); - - matchctx->pmatch[0].rm_so = -1; - /* both < 0 but not equal */ - matchctx->pmatch[0].rm_eo = -2; + /* determine options */ + parse_re_flags(&re_flags, flags); + if (force_glob) + { + /* user mustn't specify 'g' for regexp_split */ + if (re_flags.glob) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("regexp_split does not support the global option"))); + /* but we find all the matches anyway */ + re_flags.glob = true; + } - return matchctx; -} + /* set up the compiled pattern */ + cpattern = RE_compile_and_cache(pattern, re_flags.cflags); -static ArrayType * -perform_regexp_matches(regexp_matches_ctx *matchctx) -{ - Datum *elems; - bool *nulls; - Datum fullmatch; /* used to avoid a palloc if no matches */ - int ndims = 1; - int dims[1]; - int lbs[1] = {1}; - - if (RE_wchar_execute(matchctx->cpattern, - matchctx->wide_str, - matchctx->wide_len, - matchctx->offset, - matchctx->cpattern->re_nsub + 1, - matchctx->pmatch) == false) - return NULL; - - if (matchctx->cpattern->re_nsub > 0) + /* do we want to remember subpatterns? */ + if (use_subpatterns && cpattern->re_nsub > 0) { - int i; + matchctx->npatterns = cpattern->re_nsub; + pmatch_len = cpattern->re_nsub + 1; + } + else + { + use_subpatterns = false; + matchctx->npatterns = 1; + pmatch_len = 1; + } - elems = palloc(sizeof(Datum) * matchctx->cpattern->re_nsub); - nulls = palloc(sizeof(bool) * matchctx->cpattern->re_nsub); - dims[0] = matchctx->cpattern->re_nsub; + /* temporary output space for RE package */ + pmatch = palloc(sizeof(regmatch_t) * pmatch_len); - for (i = 0; i < matchctx->cpattern->re_nsub; i++) + /* the real output space (grown dynamically if needed) */ + array_len = re_flags.glob ? 256 : 32; + matchctx->match_locs = (int *) palloc(sizeof(int) * array_len); + array_idx = 0; + + /* search for the pattern, perhaps repeatedly */ + prev_match_end = 0; + start_search = 0; + while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search, + pmatch_len, pmatch)) + { + /* + * If requested, ignore degenerate matches, which are zero-length + * matches occurring at the start or end of a string or just after + * a previous match. + */ + if (!ignore_degenerate || + (pmatch[0].rm_so < wide_len && + pmatch[0].rm_eo > prev_match_end)) { - int so = matchctx->pmatch[i + 1].rm_so; - int eo = matchctx->pmatch[i + 1].rm_eo; + /* enlarge output space if needed */ + while (array_idx + matchctx->npatterns * 2 > array_len) + { + array_len *= 2; + matchctx->match_locs = (int *) repalloc(matchctx->match_locs, + sizeof(int) * array_len); + } - if (so < 0 || eo < 0) + /* save this match's locations */ + if (use_subpatterns) { - elems[i] = 0; - nulls[i] = true; + int i; + + for (i = 1; i <= matchctx->npatterns; i++) + { + matchctx->match_locs[array_idx++] = pmatch[i].rm_so; + matchctx->match_locs[array_idx++] = pmatch[i].rm_eo; + } } else { - elems[i] = DirectFunctionCall3(text_substr, - PointerGetDatum(matchctx->orig_str), - Int32GetDatum(so + 1), - Int32GetDatum(eo - so)); - nulls[i] = false; + matchctx->match_locs[array_idx++] = pmatch[0].rm_so; + matchctx->match_locs[array_idx++] = pmatch[0].rm_eo; } + matchctx->nmatches++; } + prev_match_end = pmatch[0].rm_eo; + + /* if not glob, stop after one match */ + if (!re_flags.glob) + break; + + /* + * Advance search position. Normally we start just after the end + * of the previous match, but always advance at least one character + * (the special case can occur if the pattern matches zero characters + * just after the prior match or at the end of the string). + */ + if (start_search < pmatch[0].rm_eo) + start_search = pmatch[0].rm_eo; + else + start_search++; + if (start_search > wide_len) + break; } - else - { - int so = matchctx->pmatch[0].rm_so; - int eo = matchctx->pmatch[0].rm_eo; - if (so < 0 || eo < 0) - elog(ERROR, "regexp code said it had a match, but did not return it"); + /* Clean up temp storage */ + pfree(wide_str); + pfree(pmatch); - fullmatch = DirectFunctionCall3(text_substr, - PointerGetDatum(matchctx->orig_str), - Int32GetDatum(so + 1), - Int32GetDatum(eo - so)); + return matchctx; +} + +/* + * build_regexp_matches_result - build output array for current match + */ +static ArrayType * +build_regexp_matches_result(regexp_matches_ctx *matchctx) +{ + Datum *elems = matchctx->elems; + bool *nulls = matchctx->nulls; + int dims[1]; + int lbs[1]; + int loc; + int i; - elems = &fullmatch; - nulls = NULL; - dims[0] = 1; + /* Extract matching substrings from the original string */ + loc = matchctx->next_match * matchctx->npatterns * 2; + for (i = 0; i < matchctx->npatterns; i++) + { + int so = matchctx->match_locs[loc++]; + int eo = matchctx->match_locs[loc++]; + + if (so < 0 || eo < 0) + { + elems[i] = (Datum) 0; + nulls[i] = true; + } + else + { + elems[i] = DirectFunctionCall3(text_substr, + PointerGetDatum(matchctx->orig_str), + Int32GetDatum(so + 1), + Int32GetDatum(eo - so)); + nulls[i] = false; + } } + /* And form an array */ + dims[0] = matchctx->npatterns; + lbs[0] = 1; /* XXX: this hardcodes assumptions about the text type */ - return construct_md_array(elems, nulls, ndims, dims, lbs, + return construct_md_array(elems, nulls, 1, dims, lbs, TEXTOID, -1, false, 'i'); } +/* + * regexp_split_to_table() + * Split the string at matches of the pattern, returning the + * split-out substrings as a table. + */ Datum regexp_split_to_table(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; - regexp_split_ctx *splitctx; + regexp_matches_ctx *splitctx; if (SRF_IS_FIRSTCALL()) { @@ -924,168 +1031,102 @@ regexp_split_to_table(PG_FUNCTION_ARGS) funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - splitctx = setup_regexp_split(PG_GETARG_TEXT_P_COPY(0), pattern, flags); + /* be sure to copy the input string into the multi-call ctx */ + splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern, + flags, true, false, true); MemoryContextSwitchTo(oldcontext); funcctx->user_fctx = (void *) splitctx; - - /* - * Avoid run-away function by making sure we never iterate - * more than the length of the text - */ - funcctx->max_calls = splitctx->wide_len; } funcctx = SRF_PERCALL_SETUP(); - splitctx = (regexp_split_ctx *) funcctx->user_fctx; + splitctx = (regexp_matches_ctx *) funcctx->user_fctx; - if (funcctx->call_cntr > funcctx->max_calls) + if (splitctx->next_match <= splitctx->nmatches) { - /* - * If we exceed wide_len something really wonky is going on, - * since it is returning more matches than there are - * characters in the string, which should not happen - */ - elog(ERROR, "set returning split function terminated after iterating %d times", - funcctx->call_cntr); + Datum result = build_regexp_split_result(splitctx); + + splitctx->next_match++; + SRF_RETURN_NEXT(funcctx, result); } - if (splitctx->offset < splitctx->wide_len) - SRF_RETURN_NEXT(funcctx, get_next_split(splitctx)); - else - SRF_RETURN_DONE(funcctx); + SRF_RETURN_DONE(funcctx); } +/* This is separate to keep the opr_sanity regression test from complaining */ Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS) { return regexp_split_to_table(fcinfo); } +/* + * regexp_split_to_array() + * Split the string at matches of the pattern, returning the + * split-out substrings as an array. + */ Datum regexp_split_to_array(PG_FUNCTION_ARGS) { ArrayBuildState *astate = NULL; - regexp_split_ctx *splitctx; - int nitems; + regexp_matches_ctx *splitctx; - splitctx = setup_regexp_split(PG_GETARG_TEXT_P(0), - PG_GETARG_TEXT_P(1), - PG_GETARG_TEXT_P_IF_EXISTS(2)); + splitctx = setup_regexp_matches(PG_GETARG_TEXT_P(0), + PG_GETARG_TEXT_P(1), + PG_GETARG_TEXT_P_IF_EXISTS(2), + true, false, true); - for (nitems = 0; splitctx->offset < splitctx->wide_len; nitems++) + while (splitctx->next_match <= splitctx->nmatches) { - if (nitems > splitctx->wide_len) - elog(ERROR, "split function terminated after iterating %d times", - nitems); - astate = accumArrayResult(astate, - get_next_split(splitctx), + build_regexp_split_result(splitctx), false, TEXTOID, CurrentMemoryContext); + splitctx->next_match++; } PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext)); } +/* This is separate to keep the opr_sanity regression test from complaining */ Datum regexp_split_to_array_no_flags(PG_FUNCTION_ARGS) { return regexp_split_to_array(fcinfo); } -static regexp_split_ctx * -setup_regexp_split(text *str, text *pattern, text *flags) -{ - regexp_split_ctx *splitctx = palloc(sizeof(regexp_split_ctx)); - - splitctx->orig_str = str; - splitctx->orig_len = VARSIZE(splitctx->orig_str) - VARHDRSZ; - - parse_re_comp_flags(&splitctx->flags, flags); - if (splitctx->flags.glob) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("regexp_split does not support the global option"))); - - splitctx->cpattern = RE_compile_and_cache(pattern, splitctx->flags.cflags); - - splitctx->wide_str = palloc(sizeof(pg_wchar) * (splitctx->orig_len + 1)); - splitctx->wide_len = pg_mb2wchar_with_len(VARDATA(splitctx->orig_str), - splitctx->wide_str, - splitctx->orig_len); - - splitctx->offset = 0; - - splitctx->match.rm_so = -1; - /* both < 0 but not equal */ - splitctx->match.rm_eo = -2; - - return splitctx; -} - +/* + * build_regexp_split_result - build output string for current match + * + * We return the string between the current match and the previous one, + * or the string after the last match when next_match == nmatches. + */ static Datum -get_next_split(regexp_split_ctx *splitctx) +build_regexp_split_result(regexp_matches_ctx *splitctx) { - regmatch_t *pmatch = &(splitctx->match); - - for (;;) - { - Datum result; - int startpos = splitctx->offset + 1; - - /* - * If the last match was zero-length, we need to push the - * offset forward to avoid matching the same place forever - */ - if (pmatch->rm_so == pmatch->rm_eo) - splitctx->offset++; - - if (RE_wchar_execute(splitctx->cpattern, - splitctx->wide_str, - splitctx->wide_len, - splitctx->offset, - 1, - pmatch)) - { - int length = splitctx->match.rm_so - startpos + 1; - - /* - * If we are trying to match at the beginning of the string and - * we got a zero-length match, or if we just matched where we - * left off last time, go around the loop again and increment - * the offset. If we have incremented the offset already and - * it matched at the new offset, that's ok - */ - if (length == 0) - continue; + int startpos; + int endpos; - result = DirectFunctionCall3(text_substr, - PointerGetDatum(splitctx->orig_str), - Int32GetDatum(startpos), - Int32GetDatum(length)); - - /* set the offset to the end of this match for next time */ - splitctx->offset = pmatch->rm_eo; - - return result; - } + if (splitctx->next_match > 0) + startpos = splitctx->match_locs[splitctx->next_match * 2 - 1]; + else + startpos = 0; + if (startpos < 0) + elog(ERROR, "invalid match ending position"); + if (splitctx->next_match < splitctx->nmatches) + { + endpos = splitctx->match_locs[splitctx->next_match * 2]; + if (endpos < startpos) + elog(ERROR, "invalid match starting position"); + return DirectFunctionCall3(text_substr, + PointerGetDatum(splitctx->orig_str), + Int32GetDatum(startpos + 1), + Int32GetDatum(endpos - startpos)); + } + else + { /* no more matches, return rest of string */ - result = DirectFunctionCall2(text_substr_no_len, - PointerGetDatum(splitctx->orig_str), - Int32GetDatum(startpos)); - - /* so we know we're done next time through */ - splitctx->offset = splitctx->wide_len; - - return result; + return DirectFunctionCall2(text_substr_no_len, + PointerGetDatum(splitctx->orig_str), + Int32GetDatum(startpos + 1)); } } - -/* - * report whether regex_flavor is currently BASIC - */ -bool -regex_flavor_is_basic(void) -{ - return (regex_flavor == REG_BASIC); -} |