diff options
author | Andrew Dunstan <andrew@dunslane.net> | 2013-06-12 13:35:24 -0400 |
---|---|---|
committer | Andrew Dunstan <andrew@dunslane.net> | 2013-06-12 13:35:24 -0400 |
commit | 78ed8e03c67d7333708f5c1873ec1d239ae2d7e0 (patch) | |
tree | 3e55afe4518926147b449171566cba0ebccdb358 /src/backend/utils/adt/json.c | |
parent | c1d729b419ee876c32ddf4ac3a85fa89a6b4a89b (diff) | |
download | postgresql-78ed8e03c67d7333708f5c1873ec1d239ae2d7e0.tar.gz postgresql-78ed8e03c67d7333708f5c1873ec1d239ae2d7e0.zip |
Fix unescaping of JSON Unicode escapes, especially for non-UTF8.
Per discussion on -hackers. When unescaping Unicode escapes, we treat
them similarly to the way we treat them in PostgreSQL string literals.
Escapes in the ASCII range are always accepted, no matter what the
database encoding. Escapes for higher code points are only processed in
UTF8 databases, and attempts to process them in other databases will
result in an error. \u0000 is never unescaped, since it would result in
an impermissible null byte.
Diffstat (limited to 'src/backend/utils/adt/json.c')
-rw-r--r-- | src/backend/utils/adt/json.c | 42 |
1 file changed, 34 insertions, 8 deletions
diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c index d8046c5b54d..a1c7f51efaa 100644 --- a/src/backend/utils/adt/json.c +++ b/src/backend/utils/adt/json.c @@ -717,7 +717,6 @@ json_lex_string(JsonLexContext *lex) { char utf8str[5]; int utf8len; - char *converted; if (ch >= 0xd800 && ch <= 0xdbff) { @@ -749,13 +748,40 @@ json_lex_string(JsonLexContext *lex) errdetail("low order surrogate must follow a high order surrogate."), report_json_context(lex))); - unicode_to_utf8(ch, (unsigned char *) utf8str); - utf8len = pg_utf_mblen((unsigned char *) utf8str); - utf8str[utf8len] = '\0'; - converted = pg_any_to_server(utf8str, utf8len, PG_UTF8); - appendStringInfoString(lex->strval, converted); - if (converted != utf8str) - pfree(converted); + /* + * For UTF8, replace the escape sequence by the actual utf8 + * character in lex->strval. Do this also for other encodings + * if the escape designates an ASCII character, otherwise + * raise an error. We don't ever unescape a \u0000, since that + * would result in an impermissible nul byte. + */ + + if (ch == 0) + { + appendStringInfoString(lex->strval, "\\u0000"); + } + else if (GetDatabaseEncoding() == PG_UTF8) + { + unicode_to_utf8(ch, (unsigned char *) utf8str); + utf8len = pg_utf_mblen((unsigned char *) utf8str); + appendBinaryStringInfo(lex->strval, utf8str, utf8len); + } + else if (ch <= 0x007f) + { + /* + * This is the only way to designate things like a form feed + * character in JSON, so it's useful in all encodings. + */ + appendStringInfoChar(lex->strval, (char) ch); + } + else + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type json"), + errdetail("Unicode escape for code points higher than U+007F not permitted in non-UTF8 encoding"), + report_json_context(lex))); + } } } |