diff options
-rw-r--r-- | src/backend/utils/mb/mbutils.c | 18 | ||||
-rw-r--r-- | src/common/jsonapi.c | 7 | ||||
-rw-r--r-- | src/common/wchar.c | 51 | ||||
-rw-r--r-- | src/include/mb/pg_wchar.h | 2 | ||||
-rw-r--r-- | src/interfaces/libpq/fe-exec.c | 6 | ||||
-rw-r--r-- | src/interfaces/libpq/fe-misc.c | 15 | ||||
-rw-r--r-- | src/test/modules/test_escape/test_escape.c | 96 | ||||
-rw-r--r-- | src/test/regress/expected/conversion.out | 13 | ||||
-rw-r--r-- | src/test/regress/sql/conversion.sql | 7 |
9 files changed, 185 insertions, 30 deletions
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index 5ddba5bccb4..308016d7763 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -1087,7 +1087,7 @@ pg_mbcliplen(const char *mbstr, int len, int limit) } /* - * pg_mbcliplen with specified encoding + * pg_mbcliplen with specified encoding; string must be valid in encoding */ int pg_encoding_mbcliplen(int encoding, const char *mbstr, @@ -1692,12 +1692,12 @@ check_encoding_conversion_args(int src_encoding, * report_invalid_encoding: complain about invalid multibyte character * * note: len is remaining length of string, not length of character; - * len must be greater than zero, as we always examine the first byte. + * len must be greater than zero (or we'd neglect initializing "buf"). */ void report_invalid_encoding(int encoding, const char *mbstr, int len) { - int l = pg_encoding_mblen(encoding, mbstr); + int l = pg_encoding_mblen_or_incomplete(encoding, mbstr, len); char buf[8 * 5 + 1]; char *p = buf; int j, @@ -1724,18 +1724,26 @@ report_invalid_encoding(int encoding, const char *mbstr, int len) * report_untranslatable_char: complain about untranslatable character * * note: len is remaining length of string, not length of character; - * len must be greater than zero, as we always examine the first byte. + * len must be greater than zero (or we'd neglect initializing "buf"). */ void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len) { - int l = pg_encoding_mblen(src_encoding, mbstr); + int l; char buf[8 * 5 + 1]; char *p = buf; int j, jlimit; + /* + * We probably could use plain pg_encoding_mblen(), because + * gb18030_to_utf8() verifies before it converts. All conversions should. + * For src_encoding!=GB18030, len>0 meets pg_encoding_mblen() needs. Even + * so, be defensive, since a buggy conversion might pass invalid data. + * This is not a performance-critical path. + */ + l = pg_encoding_mblen_or_incomplete(src_encoding, mbstr, len); jlimit = Min(l, len); jlimit = Min(jlimit, 8); /* prevent buffer overrun */ diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c index f1c80933c92..7dad4da65f6 100644 --- a/src/common/jsonapi.c +++ b/src/common/jsonapi.c @@ -1982,8 +1982,11 @@ json_lex_string(JsonLexContext *lex) } while (0) #define FAIL_AT_CHAR_END(code) \ do { \ - const char *term = s + pg_encoding_mblen(lex->input_encoding, s); \ - lex->token_terminator = (term <= end) ? term : end; \ + ptrdiff_t remaining = end - s; \ + int charlen; \ + charlen = pg_encoding_mblen_or_incomplete(lex->input_encoding, \ + s, remaining); \ + lex->token_terminator = (charlen <= remaining) ? s + charlen : end; \ return code; \ } while (0) diff --git a/src/common/wchar.c b/src/common/wchar.c index 1e5de502276..a4bc29921de 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -12,6 +12,8 @@ */ #include "c.h" +#include <limits.h> + #include "mb/pg_wchar.h" #include "utils/ascii.h" @@ -2107,10 +2109,27 @@ const pg_wchar_tbl pg_wchar_table[] = { /* * Returns the byte length of a multibyte character. * - * Caution: when dealing with text that is not certainly valid in the - * specified encoding, the result may exceed the actual remaining - * string length. Callers that are not prepared to deal with that - * should use pg_encoding_mblen_bounded() instead. + * Choose "mblen" functions based on the input string characteristics. + * pg_encoding_mblen() can be used when ANY of these conditions are met: + * + * - The input string is zero-terminated + * + * - The input string is known to be valid in the encoding (e.g., string + * converted from database encoding) + * + * - The encoding is not GB18030 (e.g., when only database encodings are + * passed to 'encoding' parameter) + * + * encoding==GB18030 requires examining up to two bytes to determine character + * length. Therefore, callers satisfying none of those conditions must use + * pg_encoding_mblen_or_incomplete() instead, as access to mbstr[1] cannot be + * guaranteed to be within allocation bounds. + * + * When dealing with text that is not certainly valid in the specified + * encoding, the result may exceed the actual remaining string length. + * Callers that are not prepared to deal with that should use Min(remaining, + * pg_encoding_mblen_or_incomplete()). For zero-terminated strings, that and + * pg_encoding_mblen_bounded() are interchangeable. */ int pg_encoding_mblen(int encoding, const char *mbstr) @@ -2121,8 +2140,28 @@ pg_encoding_mblen(int encoding, const char *mbstr) } /* - * Returns the byte length of a multibyte character; but not more than - * the distance to end of string. + * Returns the byte length of a multibyte character (possibly not + * zero-terminated), or INT_MAX if too few bytes remain to determine a length. + */ +int +pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr, + size_t remaining) +{ + /* + * Define zero remaining as too few, even for single-byte encodings. + * pg_gb18030_mblen() reads one or two bytes; single-byte encodings read + * zero; others read one. + */ + if (remaining < 1 || + (encoding == PG_GB18030 && IS_HIGHBIT_SET(*mbstr) && remaining < 2)) + return INT_MAX; + return pg_encoding_mblen(encoding, mbstr); +} + +/* + * Returns the byte length of a multibyte character; but not more than the + * distance to the terminating zero byte. For input that might lack a + * terminating zero, use Min(remaining, pg_encoding_mblen_or_incomplete()). */ int pg_encoding_mblen_bounded(int encoding, const char *mbstr) diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index bfef95baea2..4b4a9974b75 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -664,6 +664,8 @@ extern int pg_valid_server_encoding_id(int encoding); */ extern void pg_encoding_set_invalid(int encoding, char *dst); extern int pg_encoding_mblen(int encoding, const char *mbstr); +extern int pg_encoding_mblen_or_incomplete(int encoding, const char *mbstr, + size_t remaining); extern int pg_encoding_mblen_bounded(int encoding, const char *mbstr); extern int pg_encoding_dsplen(int encoding, const char *mbstr); extern int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len); diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c index 294843ed8b0..4256ae5c0cc 100644 --- a/src/interfaces/libpq/fe-exec.c +++ b/src/interfaces/libpq/fe-exec.c @@ -4101,7 +4101,8 @@ PQescapeStringInternal(PGconn *conn, } /* Slow path for possible multibyte characters */ - charlen = pg_encoding_mblen(encoding, source); + charlen = pg_encoding_mblen_or_incomplete(encoding, + source, remaining); if (remaining < charlen || pg_encoding_verifymbchar(encoding, source, charlen) == -1) @@ -4245,7 +4246,8 @@ PQescapeInternal(PGconn *conn, const char *str, size_t len, bool as_ident) int charlen; /* Slow path for possible multibyte characters */ - charlen = pg_encoding_mblen(conn->client_encoding, s); + charlen = pg_encoding_mblen_or_incomplete(conn->client_encoding, + s, remaining); if (charlen > remaining) { diff --git a/src/interfaces/libpq/fe-misc.c b/src/interfaces/libpq/fe-misc.c index d78445c70af..2648df93813 100644 --- a/src/interfaces/libpq/fe-misc.c +++ b/src/interfaces/libpq/fe-misc.c @@ -1221,13 +1221,9 @@ PQgetCurrentTimeUSec(void) */ /* - * Returns the byte length of the character beginning at s, using the - * specified encoding. - * - * Caution: when dealing with text that is not certainly valid in the - * specified encoding, the result may exceed the actual remaining - * string length. Callers that are not prepared to deal with that - * should use PQmblenBounded() instead. + * Like pg_encoding_mblen(). Use this in callers that want the + * dynamically-linked libpq's stance on encodings, even if that means + * different behavior in different startups of the executable. */ int PQmblen(const char *s, int encoding) @@ -1236,8 +1232,9 @@ PQmblen(const char *s, int encoding) } /* - * Returns the byte length of the character beginning at s, using the - * specified encoding; but not more than the distance to end of string. + * Like pg_encoding_mblen_bounded(). Use this in callers that want the + * dynamically-linked libpq's stance on encodings, even if that means + * different behavior in different startups of the executable. */ int PQmblenBounded(const char *s, int encoding) diff --git a/src/test/modules/test_escape/test_escape.c b/src/test/modules/test_escape/test_escape.c index ffd9d7166fa..59430ed46c4 100644 --- a/src/test/modules/test_escape/test_escape.c +++ b/src/test/modules/test_escape/test_escape.c @@ -12,6 +12,7 @@ #include <string.h> #include <stdio.h> +#include "common/jsonapi.h" #include "fe_utils/psqlscan.h" #include "fe_utils/string_utils.h" #include "getopt_long.h" @@ -164,6 +165,88 @@ encoding_conflicts_ascii(int encoding) } +/* + * Confirm escaping doesn't read past the end of an allocation. Consider the + * result of malloc(4096), in the absence of freelist entries satisfying the + * allocation. On OpenBSD, reading one byte past the end of that object + * yields SIGSEGV. + * + * Run this test before the program's other tests, so freelists are minimal. + * len=4096 didn't SIGSEGV, likely due to free() calls in libpq. len=8192 + * did. Use 128 KiB, to somewhat insulate the outcome from distant new free() + * calls and libc changes. + */ +static void +test_gb18030_page_multiple(pe_test_config *tc) +{ + PQExpBuffer testname; + size_t input_len = 0x20000; + char *input; + + /* prepare input */ + input = pg_malloc(input_len); + memset(input, '-', input_len - 1); + input[input_len - 1] = 0xfe; + + /* name to describe the test */ + testname = createPQExpBuffer(); + appendPQExpBuffer(testname, ">repeat(%c, %zu)", input[0], input_len - 1); + escapify(testname, input + input_len - 1, 1); + appendPQExpBuffer(testname, "< - GB18030 - PQescapeLiteral"); + + /* test itself */ + PQsetClientEncoding(tc->conn, "GB18030"); + report_result(tc, PQescapeLiteral(tc->conn, input, input_len) == NULL, + testname->data, "", + "input validity vs escape success", "ok"); + + destroyPQExpBuffer(testname); + pg_free(input); +} + +/* + * Confirm json parsing doesn't read past the end of an allocation. This + * exercises wchar.c infrastructure like the true "escape" tests do, but this + * isn't an "escape" test. + */ +static void +test_gb18030_json(pe_test_config *tc) +{ + PQExpBuffer raw_buf; + PQExpBuffer testname; + const char input[] = "{\"\\u\xFE"; + size_t input_len = sizeof(input) - 1; + JsonLexContext *lex; + JsonSemAction sem = {0}; /* no callbacks */ + JsonParseErrorType json_error; + + /* prepare input like test_one_vector_escape() does */ + raw_buf = createPQExpBuffer(); + appendBinaryPQExpBuffer(raw_buf, input, input_len); + appendPQExpBufferStr(raw_buf, NEVER_ACCESS_STR); + VALGRIND_MAKE_MEM_NOACCESS(&raw_buf->data[input_len], + raw_buf->len - input_len); + + /* name to describe the test */ + testname = createPQExpBuffer(); + appendPQExpBuffer(testname, ">"); + escapify(testname, input, input_len); + appendPQExpBuffer(testname, "< - GB18030 - pg_parse_json"); + + /* test itself */ + lex = makeJsonLexContextCstringLen(NULL, raw_buf->data, input_len, + PG_GB18030, false); + json_error = pg_parse_json(lex, &sem); + report_result(tc, json_error == JSON_UNICODE_ESCAPE_FORMAT, + testname->data, "", + "diagnosed", json_errdetail(json_error, lex)); + + freeJsonLexContext(lex); + destroyPQExpBuffer(testname); + destroyPQExpBuffer(raw_buf); +} + + static bool escape_literal(PGconn *conn, PQExpBuffer target, const char *unescaped, size_t unescaped_len, @@ -451,8 +534,18 @@ static pe_test_vector pe_test_vectors[] = * Testcases that are not null terminated for the specified input length. * That's interesting to verify that escape functions don't read beyond * the intended input length. + * + * One interesting special case is GB18030, which has the odd behaviour + * needing to read beyond the first byte to determine the length of a + * multi-byte character. */ TV_LEN("gbk", "\x80", 1), + TV_LEN("GB18030", "\x80", 1), + TV_LEN("GB18030", "\x80\0", 2), + TV_LEN("GB18030", "\x80\x30", 2), + TV_LEN("GB18030", "\x80\x30\0", 3), + TV_LEN("GB18030", "\x80\x30\x30", 3), + TV_LEN("GB18030", "\x80\x30\x30\0", 4), TV_LEN("UTF-8", "\xC3\xb6 ", 1), TV_LEN("UTF-8", "\xC3\xb6 ", 2), }; @@ -861,6 +954,9 @@ main(int argc, char *argv[]) exit(1); } + test_gb18030_page_multiple(&tc); + test_gb18030_json(&tc); + for (int i = 0; i < lengthof(pe_test_vectors); i++) { test_one_vector(&tc, &pe_test_vectors[i]); diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out index d785f92561e..7dd1ef6161f 100644 --- a/src/test/regress/expected/conversion.out +++ b/src/test/regress/expected/conversion.out @@ -508,10 +508,13 @@ insert into gb18030_inputs values ('\x666f6f84309c38', 'valid, translates to UTF-8 by mapping function'), ('\x666f6f84309c', 'incomplete char '), ('\x666f6f84309c0a', 'incomplete char, followed by newline '), + ('\x666f6f84', 'incomplete char at end'), ('\x666f6f84309c3800', 'invalid, NUL byte'), ('\x666f6f84309c0038', 'invalid, NUL byte'); --- Test GB18030 verification -select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from gb18030_inputs; +-- Test GB18030 verification. Round-trip through text so the backing of the +-- bytea values is palloc, not shared_buffers. This lets Valgrind detect +-- reads past the end. +select description, inbytes, (test_conv(inbytes::text::bytea, 'gb18030', 'gb18030')).* from gb18030_inputs; description | inbytes | result | errorat | error ------------------------------------------------+--------------------+------------------+--------------+------------------------------------------------------------------- valid, pure ASCII | \x666f6f | \x666f6f | | @@ -520,9 +523,10 @@ select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from g valid, translates to UTF-8 by mapping function | \x666f6f84309c38 | \x666f6f84309c38 | | incomplete char | \x666f6f84309c | \x666f6f | \x84309c | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c incomplete char, followed by newline | \x666f6f84309c0a | \x666f6f | \x84309c0a | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x0a + incomplete char at end | \x666f6f84 | \x666f6f | \x84 | invalid byte sequence for encoding "GB18030": 0x84 invalid, NUL byte | \x666f6f84309c3800 | \x666f6f84309c38 | \x00 | invalid byte sequence for encoding "GB18030": 0x00 invalid, NUL byte | \x666f6f84309c0038 | \x666f6f | \x84309c0038 | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x00 -(8 rows) +(9 rows) -- Test conversions from GB18030 select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18030_inputs; @@ -534,9 +538,10 @@ select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18 valid, translates to UTF-8 by mapping function | \x666f6f84309c38 | \x666f6fefa8aa | | incomplete char | \x666f6f84309c | \x666f6f | \x84309c | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c incomplete char, followed by newline | \x666f6f84309c0a | \x666f6f | \x84309c0a | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x0a + incomplete char at end | \x666f6f84 | \x666f6f | \x84 | invalid byte sequence for encoding "GB18030": 0x84 invalid, NUL byte | \x666f6f84309c3800 | \x666f6fefa8aa | \x00 | invalid byte sequence for encoding "GB18030": 0x00 invalid, NUL byte | \x666f6f84309c0038 | \x666f6f | \x84309c0038 | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x00 -(8 rows) +(9 rows) -- -- ISO-8859-5 diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql index b567a1a5721..a80d62367a2 100644 --- a/src/test/regress/sql/conversion.sql +++ b/src/test/regress/sql/conversion.sql @@ -300,11 +300,14 @@ insert into gb18030_inputs values ('\x666f6f84309c38', 'valid, translates to UTF-8 by mapping function'), ('\x666f6f84309c', 'incomplete char '), ('\x666f6f84309c0a', 'incomplete char, followed by newline '), + ('\x666f6f84', 'incomplete char at end'), ('\x666f6f84309c3800', 'invalid, NUL byte'), ('\x666f6f84309c0038', 'invalid, NUL byte'); --- Test GB18030 verification -select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from gb18030_inputs; +-- Test GB18030 verification. Round-trip through text so the backing of the +-- bytea values is palloc, not shared_buffers. This lets Valgrind detect +-- reads past the end. +select description, inbytes, (test_conv(inbytes::text::bytea, 'gb18030', 'gb18030')).* from gb18030_inputs; -- Test conversions from GB18030 select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18030_inputs; |