Additional unicode primitive functions.

Introduce unicode_version(), icu_unicode_version(), and unicode_assigned(). The last of these requires introducing a new lookup table for the Unicode General Category, which is generated along with the other Unicode lookup tables. Discussion: https://postgr.es/m/CA+TgmoYzYR-yhU6k1XFCADeyj=Oyz2PkVsa3iKv+keM8wp-F_A@mail.gmail.com Reviewed-by: Peter Eisentraut
author: Jeff Davis <jdavis@postgresql.org> 2023-11-01 22:47:06 -0700
committer: Jeff Davis <jdavis@postgresql.org> 2023-11-01 22:47:06 -0700
commit: a02b37fc083239a07f1ac02951d208235efb218b (patch)
tree: 368caec52740ec496992c2793b6a0c33db5aa020 /src/backend/utils/adt/varlena.c
parent: 7021d3b1766420ac4968fa0ff81873e81b7fd641 (diff)
download: postgresql-a02b37fc083239a07f1ac02951d208235efb218b.tar.gz
postgresql-a02b37fc083239a07f1ac02951d208235efb218b.zip
1 files changed, 61 insertions, 0 deletions
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index ec4e580d7fe..f6b1156dd13 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -23,7 +23,9 @@
 #include "catalog/pg_type.h"
 #include "common/hashfn.h"
 #include "common/int.h"
+#include "common/unicode_category.h"
 #include "common/unicode_norm.h"
+#include "common/unicode_version.h"
 #include "funcapi.h"
 #include "lib/hyperloglog.h"
 #include "libpq/pqformat.h"
@@ -6237,6 +6239,65 @@ unicode_norm_form_from_string(const char *formstr)
 	return form;
 }
 
+/*
+ * Returns the version of Unicode used by Postgres in "major.minor" format
+ * (the same format as the Unicode version reported by ICU). The third
+ * component ("update version") never involves additions to the character
+ * repertoire and is unimportant for most purposes.
+ *
+ * See: https://unicode.org/versions/
+ */
+Datum
+unicode_version(PG_FUNCTION_ARGS)
+{
+	PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
+}
+
+/*
+ * Returns the Unicode version used by ICU if built with USE_ICU; else SQL NULL.
+ */
+Datum
+icu_unicode_version(PG_FUNCTION_ARGS)
+{
+#ifdef USE_ICU
+	PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION));
+#else
+	PG_RETURN_NULL();
+#endif
+}
+
+/*
+ * Returns true if the string contains only assigned Unicode code points,
+ * false otherwise. Raises an error unless the database encoding is UTF-8.
+ */
+Datum
+unicode_assigned(PG_FUNCTION_ARGS)
+{
+	text	   *input = PG_GETARG_TEXT_PP(0);
+	unsigned char *p;
+	int			size;
+
+	if (GetDatabaseEncoding() != PG_UTF8)
+		ereport(ERROR,
+				(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
+
+	/* walk the input one multibyte character at a time, checking each one */
+	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
+	p = (unsigned char *) VARDATA_ANY(input);
+	for (int i = 0; i < size; i++)
+	{
+		pg_wchar	uchar = utf8_to_unicode(p);
+		int			category = unicode_category(uchar);
+
+		if (category == PG_U_UNASSIGNED)
+			PG_RETURN_BOOL(false);
+
+		p += pg_utf_mblen(p);
+	}
+
+	PG_RETURN_BOOL(true);
+}
+
 Datum
 unicode_normalize_func(PG_FUNCTION_ARGS)
 {
author	Jeff Davis <jdavis@postgresql.org>	2023-11-01 22:47:06 -0700
committer	Jeff Davis <jdavis@postgresql.org>	2023-11-01 22:47:06 -0700
commit	a02b37fc083239a07f1ac02951d208235efb218b (patch)
tree	368caec52740ec496992c2793b6a0c33db5aa020 /src/backend/utils/adt/varlena.c
parent	7021d3b1766420ac4968fa0ff81873e81b7fd641 (diff)
download	postgresql-a02b37fc083239a07f1ac02951d208235efb218b.tar.gz postgresql-a02b37fc083239a07f1ac02951d208235efb218b.zip