aboutsummaryrefslogtreecommitdiff
path: root/src/fe_utils/string_utils.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/fe_utils/string_utils.c')
-rw-r--r-- src/fe_utils/string_utils.c 170
1 files changed, 136 insertions, 34 deletions
diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c
index 2945e28a277..0d914c7cb9b 100644
--- a/src/fe_utils/string_utils.c
+++ b/src/fe_utils/string_utils.c
@@ -104,6 +104,7 @@ fmtIdEnc(const char *rawid, int encoding)
const char *cp;
bool need_quotes = false;
+ size_t remaining = strlen(rawid);
/*
* These checks need to match the identifier production in scan.l. Don't
@@ -117,7 +118,8 @@ fmtIdEnc(const char *rawid, int encoding)
else
{
/* otherwise check the entire string */
- for (cp = rawid; *cp; cp++)
+ cp = rawid;
+ for (size_t i = 0; i < remaining; i++, cp++)
{
if (!((*cp >= 'a' && *cp <= 'z')
|| (*cp >= '0' && *cp <= '9')
@@ -153,17 +155,90 @@ fmtIdEnc(const char *rawid, int encoding)
else
{
appendPQExpBufferChar(id_return, '"');
- for (cp = rawid; *cp; cp++)
+
+ cp = &rawid[0];
+ while (remaining > 0)
{
- /*
- * Did we find a double-quote in the string? Then make this a
- * double double-quote per SQL99. Before, we put in a
- * backslash/double-quote pair. - thomas 2000-08-05
- */
- if (*cp == '"')
- appendPQExpBufferChar(id_return, '"');
- appendPQExpBufferChar(id_return, *cp);
+ int charlen;
+
+ /* Fast path for plain ASCII */
+ if (!IS_HIGHBIT_SET(*cp))
+ {
+ /*
+ * Did we find a double-quote in the string? Then make this a
+ * double double-quote per SQL99. Before, we put in a
+ * backslash/double-quote pair. - thomas 2000-08-05
+ */
+ if (*cp == '"')
+ appendPQExpBufferChar(id_return, '"');
+ appendPQExpBufferChar(id_return, *cp);
+ remaining--;
+ cp++;
+ continue;
+ }
+
+ /* Slow path for possible multibyte characters */
+ charlen = pg_encoding_mblen(encoding, cp);
+
+ if (remaining < charlen)
+ {
+ /*
+ * If the character is longer than the available input,
+ * replace the truncated character with an invalid sequence. The invalid
+ * sequence ensures that the escaped string will trigger an
+ * error on the server-side, even if we can't directly report
+ * an error here.
+ */
+ enlargePQExpBuffer(id_return, 2);
+ pg_encoding_set_invalid(encoding,
+ id_return->data + id_return->len);
+ id_return->len += 2;
+ id_return->data[id_return->len] = '\0';
+
+ /* there's no more input data, so we can stop */
+ break;
+ }
+ else if (pg_encoding_verifymbchar(encoding, cp, charlen) == -1)
+ {
+ /*
+ * Multibyte character is invalid. It's important to verify
+ * that as invalid multi-byte characters could e.g. be used to
+ * "skip" over quote characters, e.g. when parsing
+ * character-by-character.
+ *
+ * Replace the bytes corresponding to the invalid character
+ * with an invalid sequence, for the same reason as above.
+ *
+ * It would be a bit faster to verify the whole string the
+ * first time we encounter a set highbit, but this way we can
+ * replace just the invalid characters, which probably makes
+ * it easier for users to find the invalidly encoded portion
+ * of a larger string.
+ */
+ enlargePQExpBuffer(id_return, 2);
+ pg_encoding_set_invalid(encoding,
+ id_return->data + id_return->len);
+ id_return->len += 2;
+ id_return->data[id_return->len] = '\0';
+
+ /*
+ * Skip over the invalid multi-byte character; the loop then
+ * continues with the rest of the string.
+ */
+ remaining -= charlen;
+ cp += charlen;
+ }
+ else
+ {
+ for (int i = 0; i < charlen; i++)
+ {
+ appendPQExpBufferChar(id_return, *cp);
+ remaining--;
+ cp++;
+ }
+ }
}
+
appendPQExpBufferChar(id_return, '"');
}
@@ -290,6 +365,7 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
size_t length = strlen(str);
const char *source = str;
char *target;
+ size_t remaining = length;
if (!enlargePQExpBuffer(buf, 2 * length + 2))
return;
@@ -297,10 +373,10 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
target = buf->data + buf->len;
*target++ = '\'';
- while (*source != '\0')
+ while (remaining > 0)
{
char c = *source;
- int len;
+ int charlen;
int i;
/* Fast path for plain ASCII */
@@ -312,39 +388,65 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
/* Copy the character */
*target++ = c;
source++;
+ remaining--;
continue;
}
/* Slow path for possible multibyte characters */
- len = PQmblen(source, encoding);
+ charlen = PQmblen(source, encoding);
- /* Copy the character */
- for (i = 0; i < len; i++)
+ if (remaining < charlen)
{
- if (*source == '\0')
- break;
- *target++ = *source++;
- }
+ /*
+ * If the character is longer than the available input, replace
+ * the truncated character with an invalid sequence. The invalid sequence
+ * ensures that the escaped string will trigger an error on the
+ * server-side, even if we can't directly report an error here.
+ *
+ * We know there's enough space for the invalid sequence because
+ * the "target" buffer is 2 * length + 2 long, and at worst we're
+ * replacing a single input byte with two invalid bytes.
+ */
+ pg_encoding_set_invalid(encoding, target);
+ target += 2;
- /*
- * If we hit premature end of string (ie, incomplete multibyte
- * character), try to pad out to the correct length with spaces. We
- * may not be able to pad completely, but we will always be able to
- * insert at least one pad space (since we'd not have quoted a
- * multibyte character). This should be enough to make a string that
- * the server will error out on.
- */
- if (i < len)
+ /* there's no more valid input data, so we can stop */
+ break;
+ }
+ else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1)
{
- char *stop = buf->data + buf->maxlen - 2;
+ /*
+ * Multibyte character is invalid. It's important to verify that
+ * as invalid multi-byte characters could e.g. be used to "skip"
+ * over quote characters, e.g. when parsing
+ * character-by-character.
+ *
+ * Replace the bytes corresponding to the invalid character with
+ * an invalid sequence, for the same reason as above.
+ *
+ * It would be a bit faster to verify the whole string the first
+ * time we encounter a set highbit, but this way we can replace
+ * just the invalid characters, which probably makes it easier for
+ * users to find the invalidly encoded portion of a larger string.
+ */
+ pg_encoding_set_invalid(encoding, target);
+ target += 2;
+ remaining -= charlen;
- for (; i < len; i++)
+ /*
+ * Skip over the invalid multi-byte character; the loop then
+ * continues with the rest of the string.
+ */
+ source += charlen;
+ }
+ else
+ {
+ /* Copy the character */
+ for (i = 0; i < charlen; i++)
{
- if (target >= stop)
- break;
- *target++ = ' ';
+ *target++ = *source++;
+ remaining--;
}
- break;
}
}