diff options
Diffstat (limited to 'src/common/unicode_case.c')
-rw-r--r-- | src/common/unicode_case.c | 174 |
1 files changed, 174 insertions, 0 deletions
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c new file mode 100644 index 00000000000..842db173ba8 --- /dev/null +++ b/src/common/unicode_case.c @@ -0,0 +1,174 @@ +/*------------------------------------------------------------------------- + * unicode_case.c + * Unicode case mapping and case conversion. + * + * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/common/unicode_case.c + * + *------------------------------------------------------------------------- + */ +#ifndef FRONTEND +#include "postgres.h" +#else +#include "postgres_fe.h" +#endif + +#include "common/unicode_case.h" +#include "common/unicode_case_table.h" +#include "common/unicode_category.h" +#include "mb/pg_wchar.h" + +static const pg_case_map *find_case_map(pg_wchar ucs); +static size_t convert_case(char *dst, size_t dstsize, const char *src, size_t srclen, + CaseKind casekind); + +pg_wchar +unicode_lowercase_simple(pg_wchar code) +{ + const pg_case_map *map = find_case_map(code); + + return map ? map->simplemap[CaseLower] : code; +} + +pg_wchar +unicode_titlecase_simple(pg_wchar code) +{ + const pg_case_map *map = find_case_map(code); + + return map ? map->simplemap[CaseTitle] : code; +} + +pg_wchar +unicode_uppercase_simple(pg_wchar code) +{ + const pg_case_map *map = find_case_map(code); + + return map ? map->simplemap[CaseUpper] : code; +} + +/* + * unicode_strlower() + * + * Convert src to lowercase, and return the result length (not including + * terminating NUL). + * + * String src must be encoded in UTF-8. If srclen < 0, src must be + * NUL-terminated. + * + * Result string is stored in dst, truncating if larger than dstsize. If + * dstsize is greater than the result length, dst will be NUL-terminated; + * otherwise not. + * + * If dstsize is zero, dst may be NULL. This is useful for calculating the + * required buffer size before allocating. + */ +size_t +unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen) +{ + return convert_case(dst, dstsize, src, srclen, CaseLower); +} + +/* + * unicode_strupper() + * + * Convert src to uppercase, and return the result length (not including + * terminating NUL). + * + * String src must be encoded in UTF-8. If srclen < 0, src must be + * NUL-terminated. + * + * Result string is stored in dst, truncating if larger than dstsize. If + * dstsize is greater than the result length, dst will be NUL-terminated; + * otherwise not. + * + * If dstsize is zero, dst may be NULL. This is useful for calculating the + * required buffer size before allocating. + */ +size_t +unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen) +{ + return convert_case(dst, dstsize, src, srclen, CaseUpper); +} + +/* + * Implement Unicode Default Case Conversion algorithm. + * + * Map each character in the string for which a mapping is available. + */ +static size_t +convert_case(char *dst, size_t dstsize, const char *src, size_t srclen, + CaseKind casekind) +{ + size_t srcoff = 0; + size_t result_len = 0; + + while (src[srcoff] != '\0' && (srclen < 0 || srcoff < srclen)) + { + pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff); + int u1len = unicode_utf8len(u1); + const pg_case_map *casemap = find_case_map(u1); + + if (casemap) + { + pg_wchar u2 = casemap->simplemap[casekind]; + pg_wchar u2len = unicode_utf8len(u2); + + if (result_len + u2len < dstsize) + unicode_to_utf8(u2, (unsigned char *) dst + result_len); + + result_len += u2len; + } + else + { + /* no mapping; copy bytes from src */ + if (result_len + u1len < dstsize) + memcpy(dst + result_len, src + srcoff, u1len); + + result_len += u1len; + } + + srcoff += u1len; + } + + if (result_len < dstsize) + dst[result_len] = '\0'; + + return result_len; +} + +/* find entry in simple case map, if any */ +static const pg_case_map * +find_case_map(pg_wchar ucs) +{ + int min; + int mid; + int max; + + /* all chars <= 0x80 are stored in array for fast lookup */ + Assert(lengthof(case_map) >= 0x80); + if (ucs < 0x80) + { + const pg_case_map *map = &case_map[ucs]; + + Assert(map->codepoint == ucs); + return map; + } + + /* otherwise, binary search */ + min = 0x80; + max = lengthof(case_map) - 1; + while (max >= min) + { + mid = (min + max) / 2; + if (ucs > case_map[mid].codepoint) + min = mid + 1; + else if (ucs < case_map[mid].codepoint) + max = mid - 1; + else + return &case_map[mid]; + } + + return NULL; +} |