diff options
author | Jeff Davis <jdavis@postgresql.org> | 2023-11-01 22:47:06 -0700 |
---|---|---|
committer | Jeff Davis <jdavis@postgresql.org> | 2023-11-01 22:47:06 -0700 |
commit | a02b37fc083239a07f1ac02951d208235efb218b (patch) | |
tree | 368caec52740ec496992c2793b6a0c33db5aa020 /src/backend/utils/adt/varlena.c | |
parent | 7021d3b1766420ac4968fa0ff81873e81b7fd641 (diff) | |
download | postgresql-a02b37fc083239a07f1ac02951d208235efb218b.tar.gz postgresql-a02b37fc083239a07f1ac02951d208235efb218b.zip |
Additional unicode primitive functions.
Introduce unicode_version(), icu_unicode_version(), and
unicode_assigned().
The latter requires introducing a new lookup table for the Unicode
General Category, which is generated along with the other Unicode
lookup tables.
Discussion: https://postgr.es/m/CA+TgmoYzYR-yhU6k1XFCADeyj=Oyz2PkVsa3iKv+keM8wp-F_A@mail.gmail.com
Reviewed-by: Peter Eisentraut
Diffstat (limited to 'src/backend/utils/adt/varlena.c')
-rw-r--r-- | src/backend/utils/adt/varlena.c | 61 |
1 files changed, 61 insertions, 0 deletions
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index ec4e580d7fe..f6b1156dd13 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -23,7 +23,9 @@ #include "catalog/pg_type.h" #include "common/hashfn.h" #include "common/int.h" +#include "common/unicode_category.h" #include "common/unicode_norm.h" +#include "common/unicode_version.h" #include "funcapi.h" #include "lib/hyperloglog.h" #include "libpq/pqformat.h" @@ -6237,6 +6239,65 @@ unicode_norm_form_from_string(const char *formstr) return form; } +/* + * Returns version of Unicode used by Postgres in "major.minor" format (the + * same format as the Unicode version reported by ICU). The third component + * ("update version") never involves additions to the character repertoire and + * is unimportant for most purposes. + * + * See: https://unicode.org/versions/ + */ +Datum +unicode_version(PG_FUNCTION_ARGS) +{ + PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION)); +} + +/* + * Returns version of Unicode used by ICU, if enabled; otherwise NULL. + */ +Datum +icu_unicode_version(PG_FUNCTION_ARGS) +{ +#ifdef USE_ICU + PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION)); +#else + PG_RETURN_NULL(); +#endif +} + +/* + * Check whether the string contains only assigned Unicode code + * points. Requires that the database encoding is UTF-8 (reports an ERROR otherwise). 
+ */ +Datum +unicode_assigned(PG_FUNCTION_ARGS) +{ + text *input = PG_GETARG_TEXT_PP(0); + unsigned char *p; + int size; + + if (GetDatabaseEncoding() != PG_UTF8) + ereport(ERROR, + (errmsg("Unicode categorization can only be performed if server encoding is UTF8"))); + + /* step through the input one character at a time, decoding each to a code point and checking its category; return false at the first unassigned one */ + size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); + p = (unsigned char *) VARDATA_ANY(input); + for (int i = 0; i < size; i++) + { + pg_wchar uchar = utf8_to_unicode(p); + int category = unicode_category(uchar); + + if (category == PG_U_UNASSIGNED) + PG_RETURN_BOOL(false); + + p += pg_utf_mblen(p); + } + + PG_RETURN_BOOL(true); +} + Datum unicode_normalize_func(PG_FUNCTION_ARGS) { |