diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2006-10-07 00:11:53 +0000 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2006-10-07 00:11:53 +0000 |
commit | 452fa214e5f035a3b58eff206205075d8562a3a5 (patch) | |
tree | 635ea9e9511284e485878292e5faa47194cc430a /src | |
parent | 11d4d424d4be8cccde845dd770e99ac03ce3306a (diff) | |
download | postgresql-452fa214e5f035a3b58eff206205075d8562a3a5.tar.gz postgresql-452fa214e5f035a3b58eff206205075d8562a3a5.zip |
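Fix string_to_array() to correctly handle the case where there are
overlapping possible matches for the separator string, such as
string_to_array('123xx456xxx789', 'xx'), which should yield {123,456,x789}:
once a separator has been matched, the search for the next separator must
resume just past the end of that match, not at the following character.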
Also, revise the logic of replace(), split_part(), and string_to_array()
to avoid O(N^2) work from redundant searches and conversions to pg_wchar
format when there are N matches to the separator string.
Backpatched the full patch as far back as 8.0. The 7.4 branch also has the
bug, but its code has diverged a lot, so I just went for a quick-and-dirty
fix of the bug itself in that branch.
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/utils/adt/varlena.c | 335 |
1 files changed, 197 insertions, 138 deletions
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 33f40b685c7..6d8216fcde8 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.151 2006/10/04 00:30:00 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.152 2006/10/07 00:11:53 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -30,6 +30,17 @@ typedef struct varlena unknown; +typedef struct +{ + bool use_wchar; /* T if multibyte encoding */ + char *str1; /* use these if not use_wchar */ + char *str2; /* note: these point to original texts */ + pg_wchar *wstr1; /* use these if use_wchar */ + pg_wchar *wstr2; /* note: these are palloc'd */ + int len1; /* string lengths in logical characters */ + int len2; +} TextPositionState; + #define DatumGetUnknownP(X) ((unknown *) PG_DETOAST_DATUM(X)) #define DatumGetUnknownPCopy(X) ((unknown *) PG_DETOAST_DATUM_COPY(X)) #define PG_GETARG_UNKNOWN_P(n) DatumGetUnknownP(PG_GETARG_DATUM(n)) @@ -44,16 +55,13 @@ typedef struct varlena unknown; DatumGetTextP(DirectFunctionCall1(textin, CStringGetDatum(str_))) #define TEXTLEN(textp) \ text_length(PointerGetDatum(textp)) -#define TEXTPOS(buf_text, from_sub_text) \ - text_position(buf_text, from_sub_text, 1) -#define LEFT(buf_text, from_sub_text) \ - text_substring(PointerGetDatum(buf_text), \ - 1, \ - TEXTPOS(buf_text, from_sub_text) - 1, false) static int text_cmp(text *arg1, text *arg2); static int32 text_length(Datum str); -static int32 text_position(text *t1, text *t2, int matchnum); +static int text_position(text *t1, text *t2); +static void text_position_setup(text *t1, text *t2, TextPositionState *state); +static int text_position_next(int start_pos, TextPositionState *state); +static void text_position_cleanup(TextPositionState *state); static text *text_substring(Datum str, int32 start, int32 length, @@ 
-709,7 +717,7 @@ textpos(PG_FUNCTION_ARGS) text *str = PG_GETARG_TEXT_P(0); text *search_str = PG_GETARG_TEXT_P(1); - PG_RETURN_INT32(text_position(str, search_str, 1)); + PG_RETURN_INT32((int32) text_position(str, search_str)); } /* @@ -719,7 +727,6 @@ textpos(PG_FUNCTION_ARGS) * Inputs: * t1 - string to be searched * t2 - pattern to match within t1 - * matchnum - number of the match to be found (1 is the first match) * Result: * Character index of the first matched char, starting from 1, * or 0 if no match. @@ -727,46 +734,92 @@ textpos(PG_FUNCTION_ARGS) * This is broken out so it can be called directly by other string processing * functions. */ -static int32 -text_position(text *t1, text *t2, int matchnum) +static int +text_position(text *t1, text *t2) { - int match = 0, - pos = 0, - p, - px, - len1, - len2; + TextPositionState state; + int result; - if (matchnum <= 0) - return 0; /* result for 0th match */ + text_position_setup(t1, t2, &state); + result = text_position_next(1, &state); + text_position_cleanup(&state); + return result; +} - if (VARSIZE(t2) <= VARHDRSZ) - return 1; /* result for empty pattern */ +/* + * text_position_setup, text_position_next, text_position_cleanup - + * Component steps of text_position() + * + * These are broken out so that a string can be efficiently searched for + * multiple occurrences of the same pattern. text_position_next may be + * called multiple times with increasing values of start_pos, which is + * the 1-based character position to start the search from. The "state" + * variable is normally just a local variable in the caller. 
+ */ - len1 = VARSIZE(t1) - VARHDRSZ; - len2 = VARSIZE(t2) - VARHDRSZ; +static void +text_position_setup(text *t1, text *t2, TextPositionState *state) +{ + int len1 = VARSIZE(t1) - VARHDRSZ; + int len2 = VARSIZE(t2) - VARHDRSZ; if (pg_database_encoding_max_length() == 1) { /* simple case - single byte encoding */ - char *p1, + state->use_wchar = false; + state->str1 = VARDATA(t1); + state->str2 = VARDATA(t2); + state->len1 = len1; + state->len2 = len2; + } + else + { + /* not as simple - multibyte encoding */ + pg_wchar *p1, *p2; - p1 = VARDATA(t1); - p2 = VARDATA(t2); + p1 = (pg_wchar *) palloc((len1 + 1) * sizeof(pg_wchar)); + len1 = pg_mb2wchar_with_len(VARDATA(t1), p1, len1); + p2 = (pg_wchar *) palloc((len2 + 1) * sizeof(pg_wchar)); + len2 = pg_mb2wchar_with_len(VARDATA(t2), p2, len2); + + state->use_wchar = true; + state->wstr1 = p1; + state->wstr2 = p2; + state->len1 = len1; + state->len2 = len2; + } +} + +static int +text_position_next(int start_pos, TextPositionState *state) +{ + int pos = 0, + p, + px; + + Assert(start_pos > 0); /* else caller error */ + + if (state->len2 <= 0) + return start_pos; /* result for empty pattern */ + + if (!state->use_wchar) + { + /* simple case - single byte encoding */ + char *p1 = state->str1; + char *p2 = state->str2; /* no use in searching str past point where search_str will fit */ - px = (len1 - len2); + px = (state->len1 - state->len2); + + p1 += start_pos - 1; - for (p = 0; p <= px; p++) + for (p = start_pos - 1; p <= px; p++) { - if ((*p1 == *p2) && (strncmp(p1, p2, len2) == 0)) + if ((*p1 == *p2) && (strncmp(p1, p2, state->len2) == 0)) { - if (++match == matchnum) - { - pos = p + 1; - break; - } + pos = p + 1; + break; } p1++; } @@ -774,41 +827,38 @@ text_position(text *t1, text *t2, int matchnum) else { /* not as simple - multibyte encoding */ - pg_wchar *p1, - *p2, - *ps1, - *ps2; - - ps1 = p1 = (pg_wchar *) palloc((len1 + 1) * sizeof(pg_wchar)); - (void) pg_mb2wchar_with_len(VARDATA(t1), p1, len1); - len1 = 
pg_wchar_strlen(p1); - ps2 = p2 = (pg_wchar *) palloc((len2 + 1) * sizeof(pg_wchar)); - (void) pg_mb2wchar_with_len(VARDATA(t2), p2, len2); - len2 = pg_wchar_strlen(p2); + pg_wchar *p1 = state->wstr1; + pg_wchar *p2 = state->wstr2; /* no use in searching str past point where search_str will fit */ - px = (len1 - len2); + px = (state->len1 - state->len2); - for (p = 0; p <= px; p++) + p1 += start_pos - 1; + + for (p = start_pos - 1; p <= px; p++) { - if ((*p1 == *p2) && (pg_wchar_strncmp(p1, p2, len2) == 0)) + if ((*p1 == *p2) && (pg_wchar_strncmp(p1, p2, state->len2) == 0)) { - if (++match == matchnum) - { - pos = p + 1; - break; - } + pos = p + 1; + break; } p1++; } - - pfree(ps1); - pfree(ps2); } return pos; } +static void +text_position_cleanup(TextPositionState *state) +{ + if (state->use_wchar) + { + pfree(state->wstr1); + pfree(state->wstr2); + } +} + /* varstr_cmp() * Comparison function for text strings with given lengths. * Includes locale support, but must copy strings to temporary memory @@ -1325,6 +1375,7 @@ byteacat(PG_FUNCTION_ARGS) #define PG_STR_GET_BYTEA(str_) \ DatumGetByteaP(DirectFunctionCall1(byteain, CStringGetDatum(str_))) + /* * bytea_substr() * Return a substring starting at the specified position. @@ -2024,45 +2075,55 @@ replace_text(PG_FUNCTION_ARGS) text *to_sub_text = PG_GETARG_TEXT_P(2); int src_text_len = TEXTLEN(src_text); int from_sub_text_len = TEXTLEN(from_sub_text); - text *left_text; - text *right_text; - text *buf_text; + TextPositionState state; + text *chunk_text; text *ret_text; + int start_posn; int curr_posn; StringInfoData str; if (src_text_len == 0 || from_sub_text_len == 0) PG_RETURN_TEXT_P(src_text); - curr_posn = TEXTPOS(src_text, from_sub_text); + text_position_setup(src_text, from_sub_text, &state); + + start_posn = 1; + curr_posn = text_position_next(1, &state); /* When the from_sub_text is not found, there is nothing to do. 
*/ if (curr_posn == 0) + { + text_position_cleanup(&state); PG_RETURN_TEXT_P(src_text); + } initStringInfo(&str); - buf_text = src_text; - while (curr_posn > 0) + do { - left_text = text_substring(PointerGetDatum(buf_text), - 1, curr_posn - 1, false); - right_text = text_substring(PointerGetDatum(buf_text), - curr_posn + from_sub_text_len, -1, true); + chunk_text = text_substring(PointerGetDatum(src_text), + start_posn, + curr_posn - start_posn, + false); + appendStringInfoText(&str, chunk_text); + pfree(chunk_text); - appendStringInfoText(&str, left_text); appendStringInfoText(&str, to_sub_text); - if (buf_text != src_text) - pfree(buf_text); - pfree(left_text); - buf_text = right_text; - curr_posn = TEXTPOS(buf_text, from_sub_text); + start_posn = curr_posn + from_sub_text_len; + curr_posn = text_position_next(start_posn, &state); } + while (curr_posn > 0); + + /* copy trailing chunk */ + chunk_text = text_substring(PointerGetDatum(src_text), + start_posn, + -1, + true); + appendStringInfoText(&str, chunk_text); + pfree(chunk_text); - appendStringInfoText(&str, buf_text); - if (buf_text != src_text) - pfree(buf_text); + text_position_cleanup(&state); ret_text = PG_STR_GET_TEXT(str.data); pfree(str.data); @@ -2335,6 +2396,7 @@ split_text(PG_FUNCTION_ARGS) int fldnum = PG_GETARG_INT32(2); int inputstring_len = TEXTLEN(inputstring); int fldsep_len = TEXTLEN(fldsep); + TextPositionState state; int start_posn; int end_posn; text *result_text; @@ -2359,40 +2421,54 @@ split_text(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(PG_STR_GET_TEXT("")); } - start_posn = text_position(inputstring, fldsep, fldnum - 1); - end_posn = text_position(inputstring, fldsep, fldnum); + text_position_setup(inputstring, fldsep, &state); - if ((start_posn == 0) && (end_posn == 0)) /* fldsep not found */ + /* identify bounds of first field */ + start_posn = 1; + end_posn = text_position_next(1, &state); + + /* special case if fldsep not found at all */ + if (end_posn == 0) { - /* if first field, return 
input string, else empty string */ + text_position_cleanup(&state); + /* if field 1 requested, return input string, else empty string */ if (fldnum == 1) PG_RETURN_TEXT_P(inputstring); else PG_RETURN_TEXT_P(PG_STR_GET_TEXT("")); } - else if (start_posn == 0) + + while (end_posn > 0 && --fldnum > 0) { - /* first field requested */ - result_text = LEFT(inputstring, fldsep); - PG_RETURN_TEXT_P(result_text); + /* identify bounds of next field */ + start_posn = end_posn + fldsep_len; + end_posn = text_position_next(start_posn, &state); } - else if (end_posn == 0) + + text_position_cleanup(&state); + + if (fldnum > 0) { - /* last field requested */ - result_text = text_substring(PointerGetDatum(inputstring), - start_posn + fldsep_len, - -1, true); - PG_RETURN_TEXT_P(result_text); + /* N'th field separator not found */ + /* if last field requested, return it, else empty string */ + if (fldnum == 1) + result_text = text_substring(PointerGetDatum(inputstring), + start_posn, + -1, + true); + else + result_text = PG_STR_GET_TEXT(""); } else { - /* interior field requested */ + /* non-last field requested */ result_text = text_substring(PointerGetDatum(inputstring), - start_posn + fldsep_len, - end_posn - start_posn - fldsep_len, + start_posn, + end_posn - start_posn, false); - PG_RETURN_TEXT_P(result_text); } + + PG_RETURN_TEXT_P(result_text); } /* @@ -2408,6 +2484,7 @@ text_to_array(PG_FUNCTION_ARGS) text *fldsep = PG_GETARG_TEXT_P(1); int inputstring_len = TEXTLEN(inputstring); int fldsep_len = TEXTLEN(fldsep); + TextPositionState state; int fldnum; int start_posn; int end_posn; @@ -2424,66 +2501,48 @@ text_to_array(PG_FUNCTION_ARGS) */ if (fldsep_len < 1) PG_RETURN_ARRAYTYPE_P(create_singleton_array(fcinfo, TEXTOID, - CStringGetDatum(inputstring), 1)); + PointerGetDatum(inputstring), 1)); - /* start with end position holding the initial start position */ - end_posn = 0; + text_position_setup(inputstring, fldsep, &state); + + start_posn = 1; for (fldnum = 1;; fldnum++) /* 
field number is 1 based */ { - Datum dvalue; - bool disnull = false; - - start_posn = end_posn; - end_posn = text_position(inputstring, fldsep, fldnum); + end_posn = text_position_next(start_posn, &state); - if ((start_posn == 0) && (end_posn == 0)) /* fldsep not found */ + if (end_posn == 0) { - if (fldnum == 1) - { - /* - * first element return one element, 1D, array using the input - * string - */ - PG_RETURN_ARRAYTYPE_P(create_singleton_array(fcinfo, TEXTOID, - CStringGetDatum(inputstring), 1)); - } - else - { - /* otherwise create array and exit */ - PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, - CurrentMemoryContext)); - } - } - else if (start_posn == 0) - { - /* first field requested */ - result_text = LEFT(inputstring, fldsep); - } - else if (end_posn == 0) - { - /* last field requested */ + /* fetch last field */ result_text = text_substring(PointerGetDatum(inputstring), - start_posn + fldsep_len, - -1, true); + start_posn, + -1, + true); } else { - /* interior field requested */ + /* fetch non-last field */ result_text = text_substring(PointerGetDatum(inputstring), - start_posn + fldsep_len, - end_posn - start_posn - fldsep_len, + start_posn, + end_posn - start_posn, false); } - /* stash away current value */ - dvalue = PointerGetDatum(result_text); - astate = accumArrayResult(astate, dvalue, - disnull, TEXTOID, + /* stash away this field */ + astate = accumArrayResult(astate, + PointerGetDatum(result_text), + false, + TEXTOID, CurrentMemoryContext); + + if (end_posn == 0) + break; + start_posn = end_posn + fldsep_len; } - /* never reached -- keep compiler quiet */ - PG_RETURN_NULL(); + text_position_cleanup(&state); + + PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, + CurrentMemoryContext)); } /* |