diff options
author | Peter Eisentraut <peter@eisentraut.org> | 2021-03-28 08:16:15 +0200 |
---|---|---|
committer | Peter Eisentraut <peter@eisentraut.org> | 2021-03-29 11:56:53 +0200 |
commit | f37fec837ce8bf7af408ba66d32099e5a0182402 (patch) | |
tree | cfbdeae1f431dab8803c9d2cac91e4d71a5e1cad /src/backend/utils/adt/varlena.c | |
parent | ebedd0c78fc51c293abe56e99a18c67af14da0c9 (diff) | |
download | postgresql-f37fec837ce8bf7af408ba66d32099e5a0182402.tar.gz postgresql-f37fec837ce8bf7af408ba66d32099e5a0182402.zip |
Add unistr function
This allows decoding a string with Unicode escape sequences. It is
similar to Unicode escape strings, but offers some more flexibility.
Author: Pavel Stehule <pavel.stehule@gmail.com>
Reviewed-by: Asif Rehman <asifr.rehman@gmail.com>
Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRA5GnKT+gDVwbVRH2ep451H_myBt+NTz8RkYUARE9+qOQ@mail.gmail.com
Diffstat (limited to 'src/backend/utils/adt/varlena.c')
-rw-r--r-- | src/backend/utils/adt/varlena.c | 210 |
1 files changed, 210 insertions, 0 deletions
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 640e3fd4c04..efc74e8f2d7 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -6380,3 +6380,213 @@ unicode_is_normalized(PG_FUNCTION_ARGS) PG_RETURN_BOOL(result); } + +/* + * Check if first n chars are hexadecimal digits + */ +static bool +isxdigits_n(const char *instr, size_t n) +{ + for (size_t i = 0; i < n; i++) + if (!isxdigit((unsigned char) instr[i])) + return false; + + return true; +} + +static unsigned int +hexval(unsigned char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 0xA; + if (c >= 'A' && c <= 'F') + return c - 'A' + 0xA; + elog(ERROR, "invalid hexadecimal digit"); + return 0; /* not reached */ +} + +/* + * Translate string with hexadecimal digits to number + */ +static unsigned int +hexval_n(const char *instr, size_t n) +{ + unsigned int result = 0; + + for (size_t i = 0; i < n; i++) + result += hexval(instr[i]) << (4 * (n - i - 1)); + + return result; +} + +/* + * Replaces Unicode escape sequences by Unicode characters + */ +Datum +unistr(PG_FUNCTION_ARGS) +{ + text *input_text = PG_GETARG_TEXT_PP(0); + char *instr; + int len; + StringInfoData str; + text *result; + pg_wchar pair_first = 0; + char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1]; + + instr = VARDATA_ANY(input_text); + len = VARSIZE_ANY_EXHDR(input_text); + + initStringInfo(&str); + + while (len > 0) + { + if (instr[0] == '\\') + { + if (len >= 2 && + instr[1] == '\\') + { + if (pair_first) + goto invalid_pair; + appendStringInfoChar(&str, '\\'); + instr += 2; + len -= 2; + } + else if ((len >= 5 && isxdigits_n(instr + 1, 4)) || + (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4))) + { + pg_wchar unicode; + int offset = instr[1] == 'u' ? 2 : 1; + + unicode = hexval_n(instr + offset, 4); + + if (!is_valid_unicode_codepoint(unicode)) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid Unicode code point: %04X", unicode)); + + if (pair_first) + { + if (is_utf16_surrogate_second(unicode)) + { + unicode = surrogate_pair_to_codepoint(pair_first, unicode); + pair_first = 0; + } + else + goto invalid_pair; + } + else if (is_utf16_surrogate_second(unicode)) + goto invalid_pair; + + if (is_utf16_surrogate_first(unicode)) + pair_first = unicode; + else + { + pg_unicode_to_server(unicode, (unsigned char *) cbuf); + appendStringInfoString(&str, cbuf); + } + + instr += 4 + offset; + len -= 4 + offset; + } + else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6)) + { + pg_wchar unicode; + + unicode = hexval_n(instr + 2, 6); + + if (!is_valid_unicode_codepoint(unicode)) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid Unicode code point: %04X", unicode)); + + if (pair_first) + { + if (is_utf16_surrogate_second(unicode)) + { + unicode = surrogate_pair_to_codepoint(pair_first, unicode); + pair_first = 0; + } + else + goto invalid_pair; + } + else if (is_utf16_surrogate_second(unicode)) + goto invalid_pair; + + if (is_utf16_surrogate_first(unicode)) + pair_first = unicode; + else + { + pg_unicode_to_server(unicode, (unsigned char *) cbuf); + appendStringInfoString(&str, cbuf); + } + + instr += 8; + len -= 8; + } + else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8)) + { + pg_wchar unicode; + + unicode = hexval_n(instr + 2, 8); + + if (!is_valid_unicode_codepoint(unicode)) + ereport(ERROR, + errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid Unicode code point: %04X", unicode)); + + if (pair_first) + { + if (is_utf16_surrogate_second(unicode)) + { + unicode = surrogate_pair_to_codepoint(pair_first, unicode); + pair_first = 0; + } + else + goto invalid_pair; + } + else if (is_utf16_surrogate_second(unicode)) + goto invalid_pair; + + if (is_utf16_surrogate_first(unicode)) + pair_first = unicode; + else + { + pg_unicode_to_server(unicode, (unsigned char *) cbuf); + appendStringInfoString(&str, cbuf); + } + + instr += 10; + len -= 10; + } + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid Unicode escape"), + errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX."))); + } + else + { + if (pair_first) + goto invalid_pair; + + appendStringInfoChar(&str, *instr++); + len--; + } + } + + /* unfinished surrogate pair? */ + if (pair_first) + goto invalid_pair; + + result = cstring_to_text_with_len(str.data, str.len); + pfree(str.data); + + PG_RETURN_TEXT_P(result); + +invalid_pair: + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid Unicode surrogate pair"))); +} |