aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/varlena.c
diff options
context:
space:
mode:
authorJeff Davis <jdavis@postgresql.org>2023-11-01 22:47:06 -0700
committerJeff Davis <jdavis@postgresql.org>2023-11-01 22:47:06 -0700
commita02b37fc083239a07f1ac02951d208235efb218b (patch)
tree368caec52740ec496992c2793b6a0c33db5aa020 /src/backend/utils/adt/varlena.c
parent7021d3b1766420ac4968fa0ff81873e81b7fd641 (diff)
downloadpostgresql-a02b37fc083239a07f1ac02951d208235efb218b.tar.gz
postgresql-a02b37fc083239a07f1ac02951d208235efb218b.zip
Additional unicode primitive functions.
Introduce unicode_version(), icu_unicode_version(), and unicode_assigned(). The latter requires introducing a new lookup table for the Unicode General Category, which is generated along with the other Unicode lookup tables. Discussion: https://postgr.es/m/CA+TgmoYzYR-yhU6k1XFCADeyj=Oyz2PkVsa3iKv+keM8wp-F_A@mail.gmail.com Reviewed-by: Peter Eisentraut
Diffstat (limited to 'src/backend/utils/adt/varlena.c')
-rw-r--r--src/backend/utils/adt/varlena.c61
1 files changed, 61 insertions, 0 deletions
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index ec4e580d7fe..f6b1156dd13 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -23,7 +23,9 @@
#include "catalog/pg_type.h"
#include "common/hashfn.h"
#include "common/int.h"
+#include "common/unicode_category.h"
#include "common/unicode_norm.h"
+#include "common/unicode_version.h"
#include "funcapi.h"
#include "lib/hyperloglog.h"
#include "libpq/pqformat.h"
@@ -6237,6 +6239,65 @@ unicode_norm_form_from_string(const char *formstr)
return form;
}
+/*
+ * Returns version of Unicode used by Postgres in "major.minor" format (the
+ * same format as the Unicode version reported by ICU). The third component
+ * ("update version") never involves additions to the character repertoire and
+ * is unimportant for most purposes. Result is PG_UNICODE_VERSION as text.
+ *
+ * See: https://unicode.org/versions/
+ */
+Datum
+unicode_version(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
+}
+
+/*
+ * Returns ICU's U_UNICODE_VERSION as text if built with USE_ICU; else SQL NULL.
+ */
+Datum
+icu_unicode_version(PG_FUNCTION_ARGS)
+{
+#ifdef USE_ICU
+ PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION));
+#else
+ PG_RETURN_NULL();
+#endif
+}
+
+/*
+ * Returns false if any code point in the input text has Unicode category
+ * PG_U_UNASSIGNED, otherwise true. Errors unless database encoding is UTF-8.
+ */
+Datum
+unicode_assigned(PG_FUNCTION_ARGS)
+{
+ text *input = PG_GETARG_TEXT_PP(0);
+ unsigned char *p;
+ int size;
+
+ if (GetDatabaseEncoding() != PG_UTF8)
+ ereport(ERROR,
+ (errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
+
+ /* size bounds the loop; each pass decodes one code point and steps past it */
+ size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
+ p = (unsigned char *) VARDATA_ANY(input);
+ for (int i = 0; i < size; i++)
+ {
+ pg_wchar uchar = utf8_to_unicode(p);
+ int category = unicode_category(uchar);
+
+ if (category == PG_U_UNASSIGNED)
+ PG_RETURN_BOOL(false);
+
+ p += pg_utf_mblen(p);
+ }
+
+ PG_RETURN_BOOL(true);
+}
+
Datum
unicode_normalize_func(PG_FUNCTION_ARGS)
{