diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2016-08-17 18:32:56 -0400 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2016-08-17 18:33:01 -0400 |
commit | cf9b0fea5f6d1bcc9b2c66f5c30ecb04684a0919 (patch) | |
tree | 37757fd2bb978cdd4837168e61340ddd7f33de04 /src/backend/utils/adt/regexp.c | |
parent | 2d7e591007a6f44e5e27e2b6c1098483105c0d10 (diff) | |
download | postgresql-cf9b0fea5f6d1bcc9b2c66f5c30ecb04684a0919.tar.gz postgresql-cf9b0fea5f6d1bcc9b2c66f5c30ecb04684a0919.zip |
Implement regexp_match(), a simplified alternative to regexp_matches().
regexp_match() is like regexp_matches(), but it disallows the 'g' flag
and in consequence does not need to return a set. Instead, it returns
a simple text array value, or NULL if there's no match. Previously people
usually got that behavior with a sub-select, but this way is considerably
more efficient.
Documentation adjusted so that regexp_match() is presented first and then
regexp_matches() is introduced as a more complicated version. This is
a bit historically revisionist but seems pedagogically better.
Still TODO: extend contrib/citext to support this function.
Emre Hasegeli, reviewed by David Johnston
Discussion: <CAE2gYzy42sna2ME_e3y1KLQ-4UBrB-eVF0SWn8QG39sQSeVhEw@mail.gmail.com>
Diffstat (limited to 'src/backend/utils/adt/regexp.c')
-rw-r--r-- | src/backend/utils/adt/regexp.c | 137 |
1 files changed, 96 insertions, 41 deletions
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 5b216e0b721..bc5e34e222b 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -47,7 +47,7 @@ typedef struct pg_re_flags bool glob; /* do it globally (for each occurrence) */ } pg_re_flags; -/* cross-call state for regexp_matches(), also regexp_split() */ +/* cross-call state for regexp_match and regexp_split functions */ typedef struct regexp_matches_ctx { text *orig_str; /* data string in original TEXT form */ @@ -57,7 +57,7 @@ typedef struct regexp_matches_ctx /* so the number of entries in match_locs is nmatches * npatterns * 2 */ int *match_locs; /* 0-based character indexes */ int next_match; /* 0-based index of next match to process */ - /* workspace for build_regexp_matches_result() */ + /* workspace for build_regexp_match_result() */ Datum *elems; /* has npatterns elements */ bool *nulls; /* has npatterns elements */ } regexp_matches_ctx; @@ -107,13 +107,12 @@ static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */ /* Local functions */ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern, - text *flags, + pg_re_flags *flags, Oid collation, - bool force_glob, bool use_subpatterns, bool ignore_degenerate); static void cleanup_regexp_matches(regexp_matches_ctx *matchctx); -static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx); +static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx); static Datum build_regexp_split_result(regexp_matches_ctx *splitctx); @@ -350,7 +349,7 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len, /* - * parse_re_flags - parse the options argument of regexp_matches and friends + * parse_re_flags - parse the options argument of regexp_match and friends * * flags --- output argument, filled with desired options * opts --- TEXT object, or NULL for defaults @@ -841,8 +840,52 @@ similar_escape(PG_FUNCTION_ARGS) } /* + * regexp_match() + * Return the first substring(s) matching a pattern within a string. + */ +Datum +regexp_match(PG_FUNCTION_ARGS) +{ + text *orig_str = PG_GETARG_TEXT_PP(0); + text *pattern = PG_GETARG_TEXT_PP(1); + text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2); + pg_re_flags re_flags; + regexp_matches_ctx *matchctx; + + /* Determine options */ + parse_re_flags(&re_flags, flags); + /* User mustn't specify 'g' */ + if (re_flags.glob) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("regexp_match does not support the global option"), + errhint("Use the regexp_matches function instead."))); + + matchctx = setup_regexp_matches(orig_str, pattern, &re_flags, + PG_GET_COLLATION(), true, false); + + if (matchctx->nmatches == 0) + PG_RETURN_NULL(); + + Assert(matchctx->nmatches == 1); + + /* Create workspace that build_regexp_match_result needs */ + matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns); + matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns); + + PG_RETURN_DATUM(PointerGetDatum(build_regexp_match_result(matchctx))); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +regexp_match_no_flags(PG_FUNCTION_ARGS) +{ + return regexp_match(fcinfo); +} + +/* * regexp_matches() - * Return a table of matches of a pattern within a string. + * Return a table of all matches of a pattern within a string. */ Datum regexp_matches(PG_FUNCTION_ARGS) @@ -854,18 +897,22 @@ regexp_matches(PG_FUNCTION_ARGS) { text *pattern = PG_GETARG_TEXT_PP(1); text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2); + pg_re_flags re_flags; MemoryContext oldcontext; funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + /* Determine options */ + parse_re_flags(&re_flags, flags); + /* be sure to copy the input string into the multi-call ctx */ matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern, - flags, + &re_flags, PG_GET_COLLATION(), - false, true, false); + true, false); - /* Pre-create workspace that build_regexp_matches_result needs */ + /* Pre-create workspace that build_regexp_match_result needs */ matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns); matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns); @@ -880,7 +927,7 @@ regexp_matches(PG_FUNCTION_ARGS) { ArrayType *result_ary; - result_ary = build_regexp_matches_result(matchctx); + result_ary = build_regexp_match_result(matchctx); matchctx->next_match++; SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary)); } @@ -899,28 +946,27 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS) } /* - * setup_regexp_matches --- do the initial matching for regexp_matches() - * or regexp_split() + * setup_regexp_matches --- do the initial matching for regexp_match + * and regexp_split functions * * To avoid having to re-find the compiled pattern on each call, we do * all the matching in one swoop. The returned regexp_matches_ctx contains * the locations of all the substrings matching the pattern. * - * The three bool parameters have only two patterns (one for each caller) - * but it seems clearer to distinguish the functionality this way than to - * key it all off one "is_split" flag. + * The two bool parameters have only two patterns (one for matching, one for + * splitting) but it seems clearer to distinguish the functionality this way + * than to key it all off one "is_split" flag. */ static regexp_matches_ctx * -setup_regexp_matches(text *orig_str, text *pattern, text *flags, +setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, Oid collation, - bool force_glob, bool use_subpatterns, + bool use_subpatterns, bool ignore_degenerate) { regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx)); int orig_len; pg_wchar *wide_str; int wide_len; - pg_re_flags re_flags; regex_t *cpattern; regmatch_t *pmatch; int pmatch_len; @@ -937,21 +983,8 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags, wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1)); wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len); - /* determine options */ - parse_re_flags(&re_flags, flags); - if (force_glob) - { - /* user mustn't specify 'g' for regexp_split */ - if (re_flags.glob) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("regexp_split does not support the global option"))); - /* but we find all the matches anyway */ - re_flags.glob = true; - } - /* set up the compiled pattern */ - cpattern = RE_compile_and_cache(pattern, re_flags.cflags, collation); + cpattern = RE_compile_and_cache(pattern, re_flags->cflags, collation); /* do we want to remember subpatterns? */ if (use_subpatterns && cpattern->re_nsub > 0) @@ -970,7 +1003,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags, pmatch = palloc(sizeof(regmatch_t) * pmatch_len); /* the real output space (grown dynamically if needed) */ - array_len = re_flags.glob ? 256 : 32; + array_len = re_flags->glob ? 256 : 32; matchctx->match_locs = (int *) palloc(sizeof(int) * array_len); array_idx = 0; @@ -1018,7 +1051,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags, prev_match_end = pmatch[0].rm_eo; /* if not glob, stop after one match */ - if (!re_flags.glob) + if (!re_flags->glob) break; /* @@ -1057,10 +1090,10 @@ cleanup_regexp_matches(regexp_matches_ctx *matchctx) } /* - * build_regexp_matches_result - build output array for current match + * build_regexp_match_result - build output array for current match */ static ArrayType * -build_regexp_matches_result(regexp_matches_ctx *matchctx) +build_regexp_match_result(regexp_matches_ctx *matchctx) { Datum *elems = matchctx->elems; bool *nulls = matchctx->nulls; @@ -1114,16 +1147,27 @@ regexp_split_to_table(PG_FUNCTION_ARGS) { text *pattern = PG_GETARG_TEXT_PP(1); text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2); + pg_re_flags re_flags; MemoryContext oldcontext; funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + /* Determine options */ + parse_re_flags(&re_flags, flags); + /* User mustn't specify 'g' */ + if (re_flags.glob) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("regexp_split_to_table does not support the global option"))); + /* But we find all the matches anyway */ + re_flags.glob = true; + /* be sure to copy the input string into the multi-call ctx */ splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern, - flags, + &re_flags, PG_GET_COLLATION(), - true, false, true); + false, true); MemoryContextSwitchTo(oldcontext); funcctx->user_fctx = (void *) splitctx; @@ -1162,13 +1206,24 @@ Datum regexp_split_to_array(PG_FUNCTION_ARGS) { ArrayBuildState *astate = NULL; + pg_re_flags re_flags; regexp_matches_ctx *splitctx; + /* Determine options */ + parse_re_flags(&re_flags, PG_GETARG_TEXT_PP_IF_EXISTS(2)); + /* User mustn't specify 'g' */ + if (re_flags.glob) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("regexp_split_to_array does not support the global option"))); + /* But we find all the matches anyway */ + re_flags.glob = true; + splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0), PG_GETARG_TEXT_PP(1), - PG_GETARG_TEXT_PP_IF_EXISTS(2), + &re_flags, PG_GET_COLLATION(), - true, false, true); + false, true); while (splitctx->next_match <= splitctx->nmatches) { |