author     Tom Lane <tgl@sss.pgh.pa.us>    2015-01-30 14:44:46 -0500
committer  Tom Lane <tgl@sss.pgh.pa.us>    2015-01-30 14:44:56 -0500
commit     451d2808151e56c2c70893b8c3ee72af393a9f1d (patch)
tree       3380b5cfd00c81235c609a805a02b773ecf77d79 /src/backend/utils/adt/json.c
parent     e40d43f88eb8617e7c1b3a03dec595efe6066f72 (diff)

Fix jsonb Unicode escape processing, and in consequence disallow \u0000.

We've been trying to support \u0000 in JSON values since commit
78ed8e03c67d7333, and have introduced increasingly worse hacks to try to
make it work, such as commit 0ad1a816320a2b53. However, it fundamentally
can't work in the way envisioned, because the stored representation looks
the same as for \\u0000, which is not the same thing at all. It's also
entirely bogus to output \u0000 when de-escaped output is called for.
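
To see the ambiguity, here is a sketch of pre-fix behavior on an unpatched
9.4 (a hypothetical psql session, assuming standard_conforming_strings on):

    -- JSON "\u0000" (a NUL character) and "\\u0000" (the six literal
    -- characters \u0000) are different values, but the hack made jsonb
    -- store both as the same six characters, so they compared equal:
    SELECT '"\u0000"'::jsonb = '"\\u0000"'::jsonb;
    -- returned true before this fix, though the values are distinct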

The right way to do this would be to store an actual 0x00 byte, and then
throw error only if asked to produce de-escaped textual output. However,
getting to that point seems likely to take considerable work and may well
never be practical in the 9.4.x series.

To preserve our options for better behavior while getting rid of the nasty
side-effects of 0ad1a816320a2b53, revert that commit in toto and instead
throw error if \u0000 is used in a context where it needs to be de-escaped.
(These are the same contexts where non-ASCII Unicode escapes throw error
if the database encoding isn't UTF8, so this behavior is by no means
without precedent.)

In passing, make both the \u0000 case and the non-ASCII Unicode case report
ERRCODE_UNTRANSLATABLE_CHARACTER / "unsupported Unicode escape sequence"
rather than claiming there's something wrong with the input syntax.
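
A sketch of the resulting behavior and error reporting (hypothetical session
on a patched server, assuming standard_conforming_strings on):

    -- accepted: plain json input is only validated, never de-escaped
    SELECT '"\u0000"'::json;

    -- rejected: jsonb must de-escape string values at input time
    SELECT '"\u0000"'::jsonb;
    -- ERROR:  unsupported Unicode escape sequence
    -- DETAIL:  \u0000 cannot be converted to text.

    -- rejected: ->> has to produce de-escaped text
    SELECT '{"a":"\u0000"}'::json ->> 'a';
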
Back-patch to 9.4, where we have to do something because 0ad1a816320a2b53
broke things for many cases having nothing to do with \u0000. 9.3 also has
bogus behavior, but only for that specific escape value, so given the lack
of field complaints it seems better to leave 9.3 alone.

Diffstat (limited to 'src/backend/utils/adt/json.c')

 src/backend/utils/adt/json.c | 49
 1 file changed, 15 insertions(+), 34 deletions(-)

diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c
index 3c137ead1d0..951b6554007 100644
--- a/src/backend/utils/adt/json.c
+++ b/src/backend/utils/adt/json.c
@@ -806,14 +806,17 @@ json_lex_string(JsonLexContext *lex)
                  * For UTF8, replace the escape sequence by the actual
                  * utf8 character in lex->strval. Do this also for other
                  * encodings if the escape designates an ASCII character,
-                 * otherwise raise an error. We don't ever unescape a
-                 * \u0000, since that would result in an impermissible nul
-                 * byte.
+                 * otherwise raise an error.
                  */

                 if (ch == 0)
                 {
-                    appendStringInfoString(lex->strval, "\\u0000");
+                    /* We can't allow this, since our TEXT type doesn't */
+                    ereport(ERROR,
+                            (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
+                             errmsg("unsupported Unicode escape sequence"),
+                             errdetail("\\u0000 cannot be converted to text."),
+                             report_json_context(lex)));
                 }
                 else if (GetDatabaseEncoding() == PG_UTF8)
                 {
@@ -833,8 +836,8 @@ json_lex_string(JsonLexContext *lex)
                 else
                 {
                     ereport(ERROR,
-                            (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
-                             errmsg("invalid input syntax for type json"),
+                            (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
+                             errmsg("unsupported Unicode escape sequence"),
                              errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8."),
                              report_json_context(lex)));
                 }
@@ -1284,8 +1287,8 @@ json_categorize_type(Oid typoid,

     /*
      * We need to get the output function for everything except date and
-     * timestamp types, array and composite types, booleans,
-     * and non-builtin types where there's a cast to json.
+     * timestamp types, array and composite types, booleans, and non-builtin
+     * types where there's a cast to json.
      */

     switch (typoid)
@@ -1335,11 +1338,12 @@ json_categorize_type(Oid typoid,
             /* but let's look for a cast to json, if it's not built-in */
             if (typoid >= FirstNormalObjectId)
             {
-                Oid castfunc;
+                Oid         castfunc;
                 CoercionPathType ctype;

                 ctype = find_coercion_pathway(JSONOID, typoid,
-                                              COERCION_EXPLICIT, &castfunc);
+                                              COERCION_EXPLICIT,
+                                              &castfunc);
                 if (ctype == COERCION_PATH_FUNC && OidIsValid(castfunc))
                 {
                     *tcategory = JSONTYPE_CAST;
@@ -2382,30 +2386,7 @@ escape_json(StringInfo buf, const char *str)
                 appendStringInfoString(buf, "\\\"");
                 break;
             case '\\':
-
-                /*
-                 * Unicode escapes are passed through as is. There is no
-                 * requirement that they denote a valid character in the
-                 * server encoding - indeed that is a big part of their
-                 * usefulness.
-                 *
-                 * All we require is that they consist of \uXXXX where the Xs
-                 * are hexadecimal digits. It is the responsibility of the
-                 * caller of, say, to_json() to make sure that the unicode
-                 * escape is valid.
-                 *
-                 * In the case of a jsonb string value being escaped, the only
-                 * unicode escape that should be present is \u0000, all the
-                 * other unicode escapes will have been resolved.
-                 */
-                if (p[1] == 'u' &&
-                    isxdigit((unsigned char) p[2]) &&
-                    isxdigit((unsigned char) p[3]) &&
-                    isxdigit((unsigned char) p[4]) &&
-                    isxdigit((unsigned char) p[5]))
-                    appendStringInfoCharMacro(buf, *p);
-                else
-                    appendStringInfoString(buf, "\\\\");
+                appendStringInfoString(buf, "\\\\");
                 break;
             default:
                 if ((unsigned char) *p < ' ')
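
The escape_json() hunk above also changes the output of to_json() and
friends: apparent \uXXXX escapes in a text value are no longer passed
through verbatim, but have their backslash doubled like any other. A rough
sketch of the visible difference (hypothetical session; note the text
literal below is the six characters \u00e4, not an escaped character):

    SELECT to_json('\u00e4'::text);
    -- before this commit: "\u00e4"   (apparent escape passed through)
    -- after this commit:  "\\u00e4"  (backslash escaped like any other)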