aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/mb/mbutils.c
diff options
context:
space:
mode:
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>2021-04-01 11:45:22 +0300
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>2021-04-01 11:45:22 +0300
commitea1b99a6619cd9dcfd46b82ac0d926b0b80e0ae9 (patch)
tree1a2c73601043edd7d6512fba51859cc860215e15 /src/backend/utils/mb/mbutils.c
parente2639a767bfa1afebaf1877515a1187feb393443 (diff)
downloadpostgresql-ea1b99a6619cd9dcfd46b82ac0d926b0b80e0ae9.tar.gz
postgresql-ea1b99a6619cd9dcfd46b82ac0d926b0b80e0ae9.zip
Add 'noError' argument to encoding conversion functions.
With the 'noError' argument, you can try to convert a buffer without knowing the character boundaries beforehand. The functions now need to return the number of input bytes successfully converted. This is is a backwards-incompatible change, if you have created a custom encoding conversion with CREATE CONVERSION. This adds a check to pg_upgrade for that, refusing the upgrade if there are any user-defined encoding conversions. Custom conversions are very rare, there are no commonly used extensions that I know of that uses that feature. No other objects can depend on conversions, so if you do have one, you can fairly easily drop it before upgrading, and recreate it after the upgrade with an updated version. Add regression tests for built-in encoding conversions. This doesn't cover every conversion, but it covers all the internal functions in conv.c that are used to implement the conversions. Reviewed-by: John Naylor Discussion: https://www.postgresql.org/message-id/e7861509-3960-538a-9025-b75a61188e01%40iki.fi
Diffstat (limited to 'src/backend/utils/mb/mbutils.c')
-rw-r--r--src/backend/utils/mb/mbutils.c79
1 files changed, 69 insertions, 10 deletions
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index 2578573b0ab..a13c398f4ac 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -406,12 +406,13 @@ pg_do_encoding_conversion(unsigned char *src, int len,
MemoryContextAllocHuge(CurrentMemoryContext,
(Size) len * MAX_CONVERSION_GROWTH + 1);
- OidFunctionCall5(proc,
- Int32GetDatum(src_encoding),
- Int32GetDatum(dest_encoding),
- CStringGetDatum(src),
- CStringGetDatum(result),
- Int32GetDatum(len));
+ (void) OidFunctionCall6(proc,
+ Int32GetDatum(src_encoding),
+ Int32GetDatum(dest_encoding),
+ CStringGetDatum(src),
+ CStringGetDatum(result),
+ Int32GetDatum(len),
+ BoolGetDatum(false));
/*
* If the result is large, it's worth repalloc'ing to release any extra
@@ -436,6 +437,62 @@ pg_do_encoding_conversion(unsigned char *src, int len,
}
/*
+ * Convert src string to another encoding.
+ *
+ * This function has a different API than the other conversion functions.
+ * The caller should've looked up the conversion function using
+ * FindDefaultConversionProc(). Unlike the other functions, the converted
+ * result is not palloc'd. It is written to the caller-supplied buffer
+ * instead.
+ *
+ * src_encoding - encoding to convert from
+ * dest_encoding - encoding to convert to
+ * src, srclen - input buffer and its length in bytes
+ * dest, destlen - destination buffer and its size in bytes
+ *
+ * The output is null-terminated.
+ *
+ * If destlen < srclen * MAX_CONVERSION_LENGTH + 1, the converted output
+ * wouldn't necessarily fit in the output buffer, and the function will not
+ * convert the whole input.
+ *
+ * TODO: The conversion function interface is not great. Firstly, it
+ * would be nice to pass through the destination buffer size to the
+ * conversion function, so that if you pass a shorter destination buffer, it
+ * could still continue to fill up the whole buffer. Currently, we have to
+ * assume worst case expansion and stop the conversion short, even if there
+ * is in fact space left in the destination buffer. Secondly, it would be
+ * nice to return the number of bytes written to the caller, to avoid a call
+ * to strlen().
+ */
+int
+pg_do_encoding_conversion_buf(Oid proc,
+ int src_encoding,
+ int dest_encoding,
+ unsigned char *src, int srclen,
+ unsigned char *dest, int destlen,
+ bool noError)
+{
+ Datum result;
+
+ /*
+ * If the destination buffer is not large enough to hold the result in the
+ * worst case, limit the input size passed to the conversion function.
+ */
+ if ((Size) srclen >= ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH))
+ srclen = ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH);
+
+ result = OidFunctionCall6(proc,
+ Int32GetDatum(src_encoding),
+ Int32GetDatum(dest_encoding),
+ CStringGetDatum(src),
+ CStringGetDatum(dest),
+ Int32GetDatum(srclen),
+ BoolGetDatum(noError));
+ return DatumGetInt32(result);
+}
+
+/*
* Convert string to encoding encoding_name. The source
* encoding is the DB encoding.
*
@@ -762,12 +819,13 @@ perform_default_encoding_conversion(const char *src, int len,
MemoryContextAllocHuge(CurrentMemoryContext,
(Size) len * MAX_CONVERSION_GROWTH + 1);
- FunctionCall5(flinfo,
+ FunctionCall6(flinfo,
Int32GetDatum(src_encoding),
Int32GetDatum(dest_encoding),
CStringGetDatum(src),
CStringGetDatum(result),
- Int32GetDatum(len));
+ Int32GetDatum(len),
+ BoolGetDatum(false));
/*
* Release extra space if there might be a lot --- see comments in
@@ -849,12 +907,13 @@ pg_unicode_to_server(pg_wchar c, unsigned char *s)
c_as_utf8[c_as_utf8_len] = '\0';
/* Convert, or throw error if we can't */
- FunctionCall5(Utf8ToServerConvProc,
+ FunctionCall6(Utf8ToServerConvProc,
Int32GetDatum(PG_UTF8),
Int32GetDatum(server_encoding),
CStringGetDatum(c_as_utf8),
CStringGetDatum(s),
- Int32GetDatum(c_as_utf8_len));
+ Int32GetDatum(c_as_utf8_len),
+ BoolGetDatum(false));
}