diff options
author | Andrew Dunstan <andrew@dunslane.net> | 2013-06-08 09:12:48 -0400 |
---|---|---|
committer | Andrew Dunstan <andrew@dunslane.net> | 2013-06-08 09:12:48 -0400 |
commit | 94e3311b97448324d67ba9a527854271373329d9 (patch) | |
tree | f54ab210d201b70735affadcd018c00c8db737c4 /src/backend/utils/adt/json.c | |
parent | c99d5d1bcc137c15058458bbdcdd2789b56e4c66 (diff) | |
download | postgresql-94e3311b97448324d67ba9a527854271373329d9.tar.gz postgresql-94e3311b97448324d67ba9a527854271373329d9.zip |
Handle Unicode surrogate pairs correctly when processing JSON.
In 9.2, Unicode escape sequences are not analysed at all other than
to make sure that they are in the form \uXXXX. But in 9.3 many of the
new operators and functions try to turn JSON text values into text in
the server encoding, and this includes de-escaping Unicode escape
sequences. This processing had not taken into account the possibility
that the escaped text might contain a surrogate pair designating a character
outside the Basic Multilingual Plane (BMP). That is now handled correctly.
This change also enforces correct use of surrogate pairs, something that is
not done by the type's input routines. This fact is noted in the docs.
Diffstat (limited to 'src/backend/utils/adt/json.c')
-rw-r--r-- | src/backend/utils/adt/json.c | 52 |
1 files changed, 52 insertions, 0 deletions
diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c index aaf99bddf27..d8046c5b54d 100644 --- a/src/backend/utils/adt/json.c +++ b/src/backend/utils/adt/json.c @@ -646,6 +646,7 @@ json_lex_string(JsonLexContext *lex) { char *s; int len; + int hi_surrogate = -1; if (lex->strval != NULL) resetStringInfo(lex->strval); @@ -718,6 +719,36 @@ json_lex_string(JsonLexContext *lex) int utf8len; char *converted; + if (ch >= 0xd800 && ch <= 0xdbff) + { + if (hi_surrogate != -1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type json"), + errdetail("high order surrogate must not follow a high order surrogate."), + report_json_context(lex))); + hi_surrogate = (ch & 0x3ff) << 10; + continue; + } + else if (ch >= 0xdc00 && ch <= 0xdfff) + { + if (hi_surrogate == -1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type json"), + errdetail("low order surrogate must follow a high order surrogate."), + report_json_context(lex))); + ch = 0x10000 + hi_surrogate + (ch & 0x3ff); + hi_surrogate = -1; + } + + if (hi_surrogate != -1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type json"), + errdetail("low order surrogate must follow a high order surrogate."), + report_json_context(lex))); + unicode_to_utf8(ch, (unsigned char *) utf8str); utf8len = pg_utf_mblen((unsigned char *) utf8str); utf8str[utf8len] = '\0'; @@ -730,6 +761,13 @@ json_lex_string(JsonLexContext *lex) } else if (lex->strval != NULL) { + if (hi_surrogate != -1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type json"), + errdetail("low order surrogate must follow a high order surrogate."), + report_json_context(lex))); + switch (*s) { case '"': @@ -784,11 +822,25 @@ json_lex_string(JsonLexContext *lex) } else if (lex->strval != NULL) { + if (hi_surrogate != -1) + ereport(ERROR, + 
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type json"), + errdetail("low order surrogate must follow a high order surrogate."), + report_json_context(lex))); + appendStringInfoChar(lex->strval, *s); } } + if (hi_surrogate != -1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type json"), + errdetail("low order surrogate must follow a high order surrogate."), + report_json_context(lex))); + /* Hooray, we found the end of the string! */ lex->prev_token_terminator = lex->token_terminator; lex->token_terminator = s + 1; |