diff options
Diffstat (limited to 'src/common/unicode_norm.c')
-rw-r--r-- | src/common/unicode_norm.c | 31 |
1 files changed, 19 insertions, 12 deletions
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c index ab56490a075..ec5abea6bdd 100644 --- a/src/common/unicode_norm.c +++ b/src/common/unicode_norm.c @@ -1,6 +1,6 @@ /*------------------------------------------------------------------------- * unicode_norm.c - * Normalize a Unicode string to NFKC form + * Normalize a Unicode string * * This implements Unicode normalization, per the documentation at * https://www.unicode.org/reports/tr15/. @@ -98,7 +98,7 @@ get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size) * are, in turn, decomposable. */ static int -get_decomposed_size(pg_wchar code) +get_decomposed_size(pg_wchar code, bool compat) { pg_unicode_decomposition *entry; int size = 0; @@ -131,7 +131,8 @@ get_decomposed_size(pg_wchar code) * Just count current code if no other decompositions. A NULL entry is * equivalent to a character with class 0 and no decompositions. */ - if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0) + if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 || + (!compat && DECOMPOSITION_IS_COMPAT(entry))) return 1; /* @@ -143,7 +144,7 @@ get_decomposed_size(pg_wchar code) { uint32 lcode = decomp[i]; - size += get_decomposed_size(lcode); + size += get_decomposed_size(lcode, compat); } return size; @@ -224,7 +225,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result) * in the array result. */ static void -decompose_code(pg_wchar code, pg_wchar **result, int *current) +decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current) { pg_unicode_decomposition *entry; int i; @@ -272,7 +273,8 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current) * character with class 0 and no decompositions, so just leave also in * this case. 
*/ - if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0) + if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 || + (!compat && DECOMPOSITION_IS_COMPAT(entry))) { pg_wchar *res = *result; @@ -290,12 +292,12 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current) pg_wchar lcode = (pg_wchar) decomp[i]; /* Leave if no more decompositions */ - decompose_code(lcode, result, current); + decompose_code(lcode, compat, result, current); } } /* - * unicode_normalize_kc - Normalize a Unicode string to NFKC form. + * unicode_normalize - Normalize a Unicode string to the specified form. * * The input is a 0-terminated array of codepoints. * @@ -304,8 +306,10 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current) * string is palloc'd instead, and OOM is reported with ereport(). */ pg_wchar * -unicode_normalize_kc(const pg_wchar *input) +unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) { + bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD); + bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC); pg_wchar *decomp_chars; pg_wchar *recomp_chars; int decomp_size, @@ -326,7 +330,7 @@ unicode_normalize_kc(const pg_wchar *input) */ decomp_size = 0; for (p = input; *p; p++) - decomp_size += get_decomposed_size(*p); + decomp_size += get_decomposed_size(*p, compat); decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar)); if (decomp_chars == NULL) @@ -338,7 +342,7 @@ unicode_normalize_kc(const pg_wchar *input) */ current_size = 0; for (p = input; *p; p++) - decompose_code(*p, &decomp_chars, &current_size); + decompose_code(*p, compat, &decomp_chars, &current_size); decomp_chars[decomp_size] = '\0'; Assert(decomp_size == current_size); @@ -385,8 +389,11 @@ unicode_normalize_kc(const pg_wchar *input) count -= 2; } + if (!recompose) + return decomp_chars; + /* - * The last phase of NFKC is the recomposition of the reordered Unicode + * The last phase of NFC and NFKC is the recomposition of the reordered Unicode * string 
using combining classes. The recomposed string cannot be longer * than the decomposed one, so make the allocation of the output string * based on that assumption. |