diff options
author | Jeff Davis <jdavis@postgresql.org> | 2024-03-19 15:24:41 -0700 |
---|---|---|
committer | Jeff Davis <jdavis@postgresql.org> | 2024-03-19 15:24:41 -0700 |
commit | f69319f2f1fb16eda4b535bcccec90dff3a6795e (patch) | |
tree | 48077a7e6eb0309218b09a3be483aec37a6f204f /src/backend/regex | |
parent | fd0398fcb099980fbedbb7750356ef234408c1c9 (diff) | |
download | postgresql-f69319f2f1fb16eda4b535bcccec90dff3a6795e.tar.gz postgresql-f69319f2f1fb16eda4b535bcccec90dff3a6795e.zip |
Support C.UTF-8 locale in the new builtin collation provider.
The builtin C.UTF-8 locale has similar semantics to the libc locale of
the same name. That is, code point sort order (fast, memcmp-based)
combined with Unicode semantics for character operations such as
pattern matching, regular expressions, and
LOWER()/INITCAP()/UPPER(). The character semantics are based on
Unicode simple case mappings.
The builtin provider's C.UTF-8 offers several important advantages
over libc:
* faster sorting -- benefits from additional optimizations such as
abbreviated keys and varstrfastcmp_c
* faster case conversion, e.g. LOWER(), at least compared with some
libc implementations
* available on all platforms with identical semantics, and the
semantics are stable, testable, and documentable within a given
Postgres major version
Being based on memcmp, the builtin C.UTF-8 locale does not offer
natural language sort order. But it is an improvement for most use
cases that might otherwise use libc's "C.UTF-8" locale, as well as
many use cases that use libc's "C" locale.
Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com
Reviewed-by: Daniel Vérité, Peter Eisentraut, Jeremy Schneider
Diffstat (limited to 'src/backend/regex')
-rw-r--r-- | src/backend/regex/regc_pg_locale.c | 36 |
1 files changed, 35 insertions, 1 deletions
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index 6a26388bfa3..85f3238eb07 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -16,6 +16,8 @@ */ #include "catalog/pg_collation.h" +#include "common/unicode_case.h" +#include "common/unicode_category.h" #include "utils/pg_locale.h" /* @@ -64,6 +66,7 @@ typedef enum { PG_REGEX_LOCALE_C, /* C locale (encoding independent) */ + PG_REGEX_BUILTIN, /* built-in Unicode semantics */ PG_REGEX_LOCALE_WIDE, /* Use <wctype.h> functions */ PG_REGEX_LOCALE_1BYTE, /* Use <ctype.h> functions */ PG_REGEX_LOCALE_WIDE_L, /* Use locale_t <wctype.h> functions */ @@ -266,7 +269,12 @@ pg_set_regex_collation(Oid collation) if (GetDatabaseEncoding() == PG_UTF8) { if (pg_regex_locale) - pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + { + if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN) + pg_regex_strategy = PG_REGEX_BUILTIN; + else + pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; + } else pg_regex_strategy = PG_REGEX_LOCALE_WIDE; } @@ -290,6 +298,8 @@ pg_wc_isdigit(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISDIGIT)); + case PG_REGEX_BUILTIN: + return pg_u_isdigit(c, true); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswdigit((wint_t) c); @@ -322,6 +332,8 @@ pg_wc_isalpha(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALPHA)); + case PG_REGEX_BUILTIN: + return pg_u_isalpha(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalpha((wint_t) c); @@ -354,6 +366,8 @@ pg_wc_isalnum(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALNUM)); + case PG_REGEX_BUILTIN: + return pg_u_isalnum(c, true); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswalnum((wint_t) c); @@ -395,6 +409,8 @@ pg_wc_isupper(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISUPPER)); + case PG_REGEX_BUILTIN: + return pg_u_isupper(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswupper((wint_t) c); @@ -427,6 +443,8 @@ pg_wc_islower(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISLOWER)); + case PG_REGEX_BUILTIN: + return pg_u_islower(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswlower((wint_t) c); @@ -459,6 +477,8 @@ pg_wc_isgraph(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISGRAPH)); + case PG_REGEX_BUILTIN: + return pg_u_isgraph(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswgraph((wint_t) c); @@ -491,6 +511,8 @@ pg_wc_isprint(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPRINT)); + case PG_REGEX_BUILTIN: + return pg_u_isprint(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswprint((wint_t) c); @@ -523,6 +545,8 @@ pg_wc_ispunct(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPUNCT)); + case PG_REGEX_BUILTIN: + return pg_u_ispunct(c, true); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswpunct((wint_t) c); @@ -555,6 +579,8 @@ pg_wc_isspace(pg_wchar c) case PG_REGEX_LOCALE_C: return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISSPACE)); + case PG_REGEX_BUILTIN: + return pg_u_isspace(c); case PG_REGEX_LOCALE_WIDE: if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) return iswspace((wint_t) c); @@ -588,6 +614,8 @@ pg_wc_toupper(pg_wchar c) if (c <= (pg_wchar) 127) return pg_ascii_toupper((unsigned char) c); return c; + case PG_REGEX_BUILTIN: + return unicode_uppercase_simple(c); case PG_REGEX_LOCALE_WIDE: /* force C behavior for ASCII characters, per comments above */ if (c <= (pg_wchar) 127) @@ -628,6 +656,8 @@ pg_wc_tolower(pg_wchar c) if (c <= (pg_wchar) 127) return pg_ascii_tolower((unsigned char) c); return c; + case PG_REGEX_BUILTIN: + return unicode_lowercase_simple(c); case PG_REGEX_LOCALE_WIDE: /* force C behavior for ASCII characters, per comments above */ if (c <= (pg_wchar) 127) @@ -792,6 +822,9 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) max_chr = (pg_wchar) MAX_SIMPLE_CHR; #endif break; + case PG_REGEX_BUILTIN: + max_chr = (pg_wchar) MAX_SIMPLE_CHR; + break; case PG_REGEX_LOCALE_WIDE: case PG_REGEX_LOCALE_WIDE_L: max_chr = (pg_wchar) MAX_SIMPLE_CHR; @@ -809,6 +842,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) max_chr = (pg_wchar) MAX_SIMPLE_CHR; break; default: + Assert(false); max_chr = 0; /* can't get here, but keep compiler quiet */ break; } |