diff options
author | Jeff Davis <jdavis@postgresql.org> | 2025-04-21 12:34:58 -0700 |
---|---|---|
committer | Jeff Davis <jdavis@postgresql.org> | 2025-04-21 12:34:58 -0700 |
commit | 90260e2ec6bbfc3dfa9d9501ab75c535de52f677 (patch) | |
tree | eedee7e0630fc5f52235270186f8a061777e9500 /src | |
parent | 80b727eb9deab589a8648750bc20f1623d5acd3e (diff) | |
download | postgresql-90260e2ec6bbfc3dfa9d9501ab75c535de52f677.tar.gz postgresql-90260e2ec6bbfc3dfa9d9501ab75c535de52f677.zip |
Fix INITCAP() word boundaries for PG_UNICODE_FAST.
Word boundaries are based on whether a character is alphanumeric or
not. For the PG_UNICODE_FAST collation, alphanumeric includes
non-ASCII digits; whereas for the PG_C_UTF8 collation, it only
includes digits 0-9. Pass down the right information from the
pg_locale_t into initcap_wbnext to differentiate the behavior.
Reported-by: Noah Misch <noah@leadboat.com>
Reviewed-by: Noah Misch <noah@leadboat.com>
Discussion: https://postgr.es/m/20250417135841.33.nmisch@google.com
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/utils/adt/pg_locale_builtin.c | 4 | ||||
-rw-r--r-- | src/common/unicode/case_test.c | 13 | ||||
-rw-r--r-- | src/test/regress/expected/collate.utf8.out | 8 | ||||
-rw-r--r-- | src/test/regress/sql/collate.utf8.sql | 2 |
4 files changed, 23 insertions, 4 deletions
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 125b10ff7ab..f51768830cd 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -40,6 +40,7 @@ struct WordBoundaryState const char *str; size_t len; size_t offset; + bool posix; bool init; bool prev_alnum; }; @@ -58,7 +59,7 @@ initcap_wbnext(void *state) { pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str + wbstate->offset); - bool curr_alnum = pg_u_isalnum(u, true); + bool curr_alnum = pg_u_isalnum(u, wbstate->posix); if (!wbstate->init || curr_alnum != wbstate->prev_alnum) { @@ -92,6 +93,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, .str = src, .len = srclen, .offset = 0, + .posix = !locale->info.builtin.casemap_full, .init = false, .prev_alnum = false, }; diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c index f0b38b3bdd7..fdfb62e8552 100644 --- a/src/common/unicode/case_test.c +++ b/src/common/unicode/case_test.c @@ -41,6 +41,7 @@ struct WordBoundaryState const char *str; size_t len; size_t offset; + bool posix; bool init; bool prev_alnum; }; @@ -55,7 +56,7 @@ initcap_wbnext(void *state) { pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str + wbstate->offset); - bool curr_alnum = pg_u_isalnum(u, true); + bool curr_alnum = pg_u_isalnum(u, wbstate->posix); if (!wbstate->init || curr_alnum != wbstate->prev_alnum) { @@ -112,10 +113,13 @@ icu_test_full(char *str) char icu_upper[BUFSZ]; char icu_fold[BUFSZ]; UErrorCode status; + + /* full case mapping doesn't use posix semantics */ struct WordBoundaryState wbstate = { .str = str, .len = strlen(str), .offset = 0, + .posix = false, .init = false, .prev_alnum = false, }; @@ -344,6 +348,12 @@ test_convert_case() test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'"); test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς"); test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ"); + /* test that 
alphanumerics are word characters */ + test_convert(tfunc_title, "λλ", "Λλ"); + test_convert(tfunc_title, "1a", "1a"); + /* U+FF11 FULLWIDTH ONE is alphanumeric for full case mapping */ + test_convert(tfunc_title, "\uFF11a", "\uFF11a"); + #ifdef USE_ICU icu_test_full(""); @@ -354,6 +364,7 @@ test_convert_case() icu_test_full("abc 123xyz"); icu_test_full("σςΣ ΣΣΣ"); icu_test_full("ıiIİ"); + icu_test_full("\uFF11a"); /* test <alpha><iota_subscript><acute> */ icu_test_full("\u0391\u0345\u0301"); #endif diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out index 5508622b16d..0c3ab5c89b2 100644 --- a/src/test/regress/expected/collate.utf8.out +++ b/src/test/regress/expected/collate.utf8.out @@ -52,6 +52,7 @@ INSERT INTO test_pg_c_utf8 VALUES ('abc DEF 123abc'), ('ábc sßs ßss DÉF'), ('DŽxxDŽ džxxDž Džxxdž'), + (U&'Λλ 1a \FF11a'), ('ȺȺȺ'), ('ⱥⱥⱥ'), ('ⱥȺ'); @@ -67,10 +68,11 @@ SELECT abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14 ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs ßss Déf | ÁBC SßS ßSS DÉF | 19 | 19 | 19 | 19 DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | DŽxxdž DŽxxdž DŽxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20 + Λλ 1a １a | λλ 1a １a | Λλ 1a １A | ΛΛ 1A １A | 12 | 12 | 12 | 12 ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6 ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6 ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4 -(6 rows) +(7 rows) DROP TABLE test_pg_c_utf8; -- negative test: Final_Sigma not used for builtin locale C.UTF-8 @@ -182,6 +184,7 @@ INSERT INTO test_pg_unicode_fast VALUES ('abc DEF 123abc'), ('ábc sßs ßss DÉF'), ('DŽxxDŽ džxxDž Džxxdž'), + (U&'Λλ 1a \FF11a'), ('ȺȺȺ'), ('ⱥⱥⱥ'), ('ⱥȺ'); @@ -197,10 +200,11 @@ SELECT abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14 ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF | 19 | 19 | 19 | 19 DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | Džxxdž Džxxdž Džxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 
20 + Λλ 1a １a | λλ 1a １a | Λλ 1a １a | ΛΛ 1A １A | 12 | 12 | 12 | 12 ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6 ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6 ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4 -(6 rows) +(7 rows) DROP TABLE test_pg_unicode_fast; -- test Final_Sigma diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql index 6c7c7aec9ec..d6d14220ab3 100644 --- a/src/test/regress/sql/collate.utf8.sql +++ b/src/test/regress/sql/collate.utf8.sql @@ -45,6 +45,7 @@ INSERT INTO test_pg_c_utf8 VALUES ('abc DEF 123abc'), ('ábc sßs ßss DÉF'), ('DŽxxDŽ džxxDž Džxxdž'), + (U&'Λλ 1a \FF11a'), ('ȺȺȺ'), ('ⱥⱥⱥ'), ('ⱥȺ'); @@ -100,6 +101,7 @@ INSERT INTO test_pg_unicode_fast VALUES ('abc DEF 123abc'), ('ábc sßs ßss DÉF'), ('DŽxxDŽ džxxDž Džxxdž'), + (U&'Λλ 1a \FF11a'), ('ȺȺȺ'), ('ⱥⱥⱥ'), ('ⱥȺ'); |