aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/pg_locale_libc.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/utils/adt/pg_locale_libc.c')
-rw-r--r--src/backend/utils/adt/pg_locale_libc.c337
1 files changed, 294 insertions, 43 deletions
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 199857e22db..e9f9fc1e369 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -34,6 +34,46 @@
#endif
/*
+ * For the libc provider, to provide as much functionality as possible on a
+ * variety of platforms without going so far as to implement everything from
+ * scratch, we use several implementation strategies depending on the
+ * situation:
+ *
+ * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
+ * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
+ * collations don't give a fig about multibyte characters.
+ *
+ * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
+ * This assumes that every platform uses Unicode codepoints directly
+ * as the wchar_t representation of Unicode. (XXX: ICU makes this assumption
+ * even for non-UTF8 encodings, which may be a problem.) On some platforms
+ * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
+ *
+ * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
+ * values up to 255, and punt for values above that. This is 100% correct
+ * only in single-byte encodings such as LATINn. However, non-Unicode
+ * multibyte encodings are mostly Far Eastern character sets for which the
+ * properties being tested here aren't very relevant for higher code values
+ * anyway. The difficulty with using the <wctype.h> functions with
+ * non-Unicode multibyte encodings is that we can have no certainty that
+ * the platform's wchar_t representation matches what we do in pg_wchar
+ * conversions.
+ *
+ * As a special case, in the "default" collation, (2) and (3) force ASCII
+ * letters to follow ASCII upcase/downcase rules, while in a non-default
+ * collation we just let the library functions do what they will. The case
+ * where this matters is treatment of I/i in Turkish, and the behavior is
+ * meant to match the upper()/lower() SQL functions.
+ *
+ * We store the active collation setting in static variables. In principle
+ * it could be passed down to here via the regex library's "struct vars" data
+ * structure; but that would require somewhat invasive changes in the regex
+ * library, and right now there's no real benefit to be gained from that.
+ *
+ * NB: the coding here assumes pg_wchar is an unsigned type.
+ */
+
+/*
* Size of stack buffer to use for string transformations, used to avoid heap
* allocations in typical cases. This should be large enough that most strings
* will fit, but small enough that we feel comfortable putting it on the
@@ -43,13 +83,6 @@
extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context);
-extern size_t strlower_libc(char *dst, size_t dstsize, const char *src,
- ssize_t srclen, pg_locale_t locale);
-extern size_t strtitle_libc(char *dst, size_t dstsize, const char *src,
- ssize_t srclen, pg_locale_t locale);
-extern size_t strupper_libc(char *dst, size_t dstsize, const char *src,
- ssize_t srclen, pg_locale_t locale);
-
static int strncoll_libc(const char *arg1, ssize_t len1,
const char *arg2, ssize_t len2,
pg_locale_t locale);
@@ -85,6 +118,251 @@ static size_t strupper_libc_mb(char *dest, size_t destsize,
const char *src, ssize_t srclen,
pg_locale_t locale);
+/*
+ * wc_isdigit_libc_sb: is wc a decimal digit according to isdigit_l()?
+ *
+ * The <ctype.h> functions can only classify values that fit in an unsigned
+ * char.  Larger values are reported as non-matching rather than being
+ * truncated to their low byte by the cast, consistent with the range check
+ * in toupper_libc_sb() and with max_chr = UCHAR_MAX in the method tables.
+ */
+static bool
+wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
+{
+	return wc <= (pg_wchar) UCHAR_MAX &&
+		isdigit_l((unsigned char) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_isalpha_libc_sb: is wc alphabetic according to isalpha_l()?
+ *
+ * Values above UCHAR_MAX cannot be classified by <ctype.h>; report them as
+ * non-matching instead of letting the cast truncate them to their low byte.
+ */
+static bool
+wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale)
+{
+	return wc <= (pg_wchar) UCHAR_MAX &&
+		isalpha_l((unsigned char) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_isalnum_libc_sb: is wc alphanumeric according to isalnum_l()?
+ *
+ * Values above UCHAR_MAX cannot be classified by <ctype.h>; report them as
+ * non-matching instead of letting the cast truncate them to their low byte.
+ */
+static bool
+wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale)
+{
+	return wc <= (pg_wchar) UCHAR_MAX &&
+		isalnum_l((unsigned char) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_isupper_libc_sb: is wc upper case according to isupper_l()?
+ *
+ * Values above UCHAR_MAX cannot be classified by <ctype.h>; report them as
+ * non-matching instead of letting the cast truncate them to their low byte.
+ */
+static bool
+wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale)
+{
+	return wc <= (pg_wchar) UCHAR_MAX &&
+		isupper_l((unsigned char) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_islower_libc_sb: is wc lower case according to islower_l()?
+ *
+ * Values above UCHAR_MAX cannot be classified by <ctype.h>; report them as
+ * non-matching instead of letting the cast truncate them to their low byte.
+ */
+static bool
+wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale)
+{
+	return wc <= (pg_wchar) UCHAR_MAX &&
+		islower_l((unsigned char) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_isgraph_libc_sb: is wc a graphic character according to isgraph_l()?
+ *
+ * Values above UCHAR_MAX cannot be classified by <ctype.h>; report them as
+ * non-matching instead of letting the cast truncate them to their low byte.
+ */
+static bool
+wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale)
+{
+	return wc <= (pg_wchar) UCHAR_MAX &&
+		isgraph_l((unsigned char) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_isprint_libc_sb: is wc printable according to isprint_l()?
+ *
+ * Values above UCHAR_MAX cannot be classified by <ctype.h>; report them as
+ * non-matching instead of letting the cast truncate them to their low byte.
+ */
+static bool
+wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale)
+{
+	return wc <= (pg_wchar) UCHAR_MAX &&
+		isprint_l((unsigned char) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_ispunct_libc_sb: is wc punctuation according to ispunct_l()?
+ *
+ * Values above UCHAR_MAX cannot be classified by <ctype.h>; report them as
+ * non-matching instead of letting the cast truncate them to their low byte.
+ */
+static bool
+wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale)
+{
+	return wc <= (pg_wchar) UCHAR_MAX &&
+		ispunct_l((unsigned char) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_isspace_libc_sb: is wc white space according to isspace_l()?
+ *
+ * Values above UCHAR_MAX cannot be classified by <ctype.h>; report them as
+ * non-matching instead of letting the cast truncate them to their low byte.
+ */
+static bool
+wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
+{
+	return wc <= (pg_wchar) UCHAR_MAX &&
+		isspace_l((unsigned char) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_isdigit_libc_mb: is wc a decimal digit according to iswdigit_l()?
+ *
+ * As in toupper_libc_mb()/tolower_libc_mb(), punt (report no match) when
+ * wchar_t is narrower than 32 bits and wc is above 0xFFFF, since such a
+ * code point may not survive the conversion to wint_t.
+ */
+static bool
+wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
+{
+	if (sizeof(wchar_t) < 4 && wc > (pg_wchar) 0xFFFF)
+		return false;
+	return iswdigit_l((wint_t) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_isalpha_libc_mb: is wc alphabetic according to iswalpha_l()?
+ *
+ * Punt (report no match) when wchar_t is narrower than 32 bits and wc is
+ * above 0xFFFF, matching the guard in toupper_libc_mb()/tolower_libc_mb().
+ */
+static bool
+wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
+{
+	if (sizeof(wchar_t) < 4 && wc > (pg_wchar) 0xFFFF)
+		return false;
+	return iswalpha_l((wint_t) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_isalnum_libc_mb: is wc alphanumeric according to iswalnum_l()?
+ *
+ * Punt (report no match) when wchar_t is narrower than 32 bits and wc is
+ * above 0xFFFF, matching the guard in toupper_libc_mb()/tolower_libc_mb().
+ */
+static bool
+wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
+{
+	if (sizeof(wchar_t) < 4 && wc > (pg_wchar) 0xFFFF)
+		return false;
+	return iswalnum_l((wint_t) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_isupper_libc_mb: is wc upper case according to iswupper_l()?
+ *
+ * Punt (report no match) when wchar_t is narrower than 32 bits and wc is
+ * above 0xFFFF, matching the guard in toupper_libc_mb()/tolower_libc_mb().
+ */
+static bool
+wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
+{
+	if (sizeof(wchar_t) < 4 && wc > (pg_wchar) 0xFFFF)
+		return false;
+	return iswupper_l((wint_t) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_islower_libc_mb: is wc lower case according to iswlower_l()?
+ *
+ * Punt (report no match) when wchar_t is narrower than 32 bits and wc is
+ * above 0xFFFF, matching the guard in toupper_libc_mb()/tolower_libc_mb().
+ */
+static bool
+wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
+{
+	if (sizeof(wchar_t) < 4 && wc > (pg_wchar) 0xFFFF)
+		return false;
+	return iswlower_l((wint_t) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_isgraph_libc_mb: is wc a graphic character according to iswgraph_l()?
+ *
+ * Punt (report no match) when wchar_t is narrower than 32 bits and wc is
+ * above 0xFFFF, matching the guard in toupper_libc_mb()/tolower_libc_mb().
+ */
+static bool
+wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
+{
+	if (sizeof(wchar_t) < 4 && wc > (pg_wchar) 0xFFFF)
+		return false;
+	return iswgraph_l((wint_t) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_isprint_libc_mb: is wc printable according to iswprint_l()?
+ *
+ * Punt (report no match) when wchar_t is narrower than 32 bits and wc is
+ * above 0xFFFF, matching the guard in toupper_libc_mb()/tolower_libc_mb().
+ */
+static bool
+wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
+{
+	if (sizeof(wchar_t) < 4 && wc > (pg_wchar) 0xFFFF)
+		return false;
+	return iswprint_l((wint_t) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_ispunct_libc_mb: is wc punctuation according to iswpunct_l()?
+ *
+ * Punt (report no match) when wchar_t is narrower than 32 bits and wc is
+ * above 0xFFFF, matching the guard in toupper_libc_mb()/tolower_libc_mb().
+ */
+static bool
+wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
+{
+	if (sizeof(wchar_t) < 4 && wc > (pg_wchar) 0xFFFF)
+		return false;
+	return iswpunct_l((wint_t) wc, locale->info.lt) != 0;
+}
+
+/*
+ * wc_isspace_libc_mb: is wc white space according to iswspace_l()?
+ *
+ * Punt (report no match) when wchar_t is narrower than 32 bits and wc is
+ * above 0xFFFF, matching the guard in toupper_libc_mb()/tolower_libc_mb().
+ */
+static bool
+wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
+{
+	if (sizeof(wchar_t) < 4 && wc > (pg_wchar) 0xFFFF)
+		return false;
+	return iswspace_l((wint_t) wc, locale->info.lt) != 0;
+}
+
+/*
+ * char_tolower_libc: lower-case one byte with tolower_l() in the locale's
+ * locale_t.  Asserts that the database encoding is single-byte.
+ */
+static char
+char_tolower_libc(unsigned char ch, pg_locale_t locale)
+{
+	int			lowered;
+
+	Assert(pg_database_encoding_max_length() == 1);
+
+	lowered = tolower_l(ch, locale->info.lt);
+	return lowered;
+}
+
+/*
+ * char_is_cased_libc: could the byte ch be (part of) a cased character?
+ *
+ * In a multibyte database encoding, any byte with the high bit set is
+ * reported as cased without further inspection.  Otherwise the answer is
+ * whatever isalpha_l() says for the byte.
+ */
+static bool
+char_is_cased_libc(char ch, pg_locale_t locale)
+{
+	if (pg_database_encoding_max_length() > 1 && IS_HIGHBIT_SET(ch))
+		return true;
+
+	return isalpha_l((unsigned char) ch, locale->info.lt);
+}
+
+/*
+ * toupper_libc_sb: upper-case wc using <ctype.h> (non-UTF8 encodings).
+ *
+ * Values above UCHAR_MAX are returned unchanged.  In the default collation,
+ * ASCII characters are mapped with pg_ascii_toupper() rather than the
+ * locale-aware toupper_l().
+ */
+static pg_wchar
+toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
+{
+	Assert(GetDatabaseEncoding() != PG_UTF8);
+
+	/* out of <ctype.h> range: nothing we can do */
+	if (wc > (pg_wchar) UCHAR_MAX)
+		return wc;
+
+	/* default collation forces C behavior for ASCII */
+	if (locale->is_default && wc <= (pg_wchar) 127)
+		return pg_ascii_toupper((unsigned char) wc);
+
+	return toupper_l((unsigned char) wc, locale->info.lt);
+}
+
+/*
+ * toupper_libc_mb: upper-case wc using <wctype.h> (UTF8 encoding only).
+ *
+ * In the default collation, ASCII characters are mapped with
+ * pg_ascii_toupper().  When wchar_t is narrower than 32 bits, code points
+ * above 0xFFFF are returned unchanged.
+ */
+static pg_wchar
+toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
+{
+	Assert(GetDatabaseEncoding() == PG_UTF8);
+
+	/* default collation forces C behavior for ASCII */
+	if (locale->is_default && wc <= (pg_wchar) 127)
+		return pg_ascii_toupper((unsigned char) wc);
+
+	/* punt if the code point does not fit in a narrow wchar_t */
+	if (sizeof(wchar_t) < 4 && wc > (pg_wchar) 0xFFFF)
+		return wc;
+
+	return towupper_l((wint_t) wc, locale->info.lt);
+}
+
+/*
+ * tolower_libc_sb: lower-case wc using <ctype.h> (non-UTF8 encodings).
+ *
+ * Values above UCHAR_MAX are returned unchanged.  In the default collation,
+ * ASCII characters are mapped with pg_ascii_tolower() rather than the
+ * locale-aware tolower_l().
+ */
+static pg_wchar
+tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
+{
+	Assert(GetDatabaseEncoding() != PG_UTF8);
+
+	/* out of <ctype.h> range: nothing we can do */
+	if (wc > (pg_wchar) UCHAR_MAX)
+		return wc;
+
+	/* default collation forces C behavior for ASCII */
+	if (locale->is_default && wc <= (pg_wchar) 127)
+		return pg_ascii_tolower((unsigned char) wc);
+
+	return tolower_l((unsigned char) wc, locale->info.lt);
+}
+
+/*
+ * tolower_libc_mb: lower-case wc using <wctype.h> (UTF8 encoding only).
+ *
+ * In the default collation, ASCII characters are mapped with
+ * pg_ascii_tolower().  When wchar_t is narrower than 32 bits, code points
+ * above 0xFFFF are returned unchanged.
+ */
+static pg_wchar
+tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
+{
+	Assert(GetDatabaseEncoding() == PG_UTF8);
+
+	/* default collation forces C behavior for ASCII */
+	if (locale->is_default && wc <= (pg_wchar) 127)
+		return pg_ascii_tolower((unsigned char) wc);
+
+	/* punt if the code point does not fit in a narrow wchar_t */
+	if (sizeof(wchar_t) < 4 && wc > (pg_wchar) 0xFFFF)
+		return wc;
+
+	return towlower_l((wint_t) wc, locale->info.lt);
+}
+
+/*
+ * Method table for single-byte encodings: every entry is a single-byte
+ * (_sb) or byte-level routine, and max_chr is UCHAR_MAX.
+ */
+static const struct ctype_methods ctype_methods_libc_sb = {
+	/* largest value the wc_* routines below are meant to handle */
+	.max_chr = UCHAR_MAX,
+
+	/* whole-string case mapping */
+	.strupper = strupper_libc_sb,
+	.strlower = strlower_libc_sb,
+	.strtitle = strtitle_libc_sb,
+
+	/* character classification */
+	.wc_isalpha = wc_isalpha_libc_sb,
+	.wc_isdigit = wc_isdigit_libc_sb,
+	.wc_isalnum = wc_isalnum_libc_sb,
+	.wc_isupper = wc_isupper_libc_sb,
+	.wc_islower = wc_islower_libc_sb,
+	.wc_isgraph = wc_isgraph_libc_sb,
+	.wc_isprint = wc_isprint_libc_sb,
+	.wc_ispunct = wc_ispunct_libc_sb,
+	.wc_isspace = wc_isspace_libc_sb,
+
+	/* single-character case mapping */
+	.wc_toupper = toupper_libc_sb,
+	.wc_tolower = tolower_libc_sb,
+	.char_tolower = char_tolower_libc,
+	.char_is_cased = char_is_cased_libc,
+};
+
+/*
+ * Method table for multibyte encodings other than UTF8.  Whole-string case
+ * mapping goes through the multibyte (_mb) routines, while character
+ * classification and per-character case mapping (as used for pattern
+ * matching) use the single-byte routines, limited to max_chr = UCHAR_MAX.
+ */
+static const struct ctype_methods ctype_methods_libc_other_mb = {
+	/* largest value the wc_* routines below are meant to handle */
+	.max_chr = UCHAR_MAX,
+
+	/* whole-string case mapping: multibyte semantics */
+	.strupper = strupper_libc_mb,
+	.strlower = strlower_libc_mb,
+	.strtitle = strtitle_libc_mb,
+
+	/* character classification: single-byte semantics */
+	.wc_isalpha = wc_isalpha_libc_sb,
+	.wc_isdigit = wc_isdigit_libc_sb,
+	.wc_isalnum = wc_isalnum_libc_sb,
+	.wc_isupper = wc_isupper_libc_sb,
+	.wc_islower = wc_islower_libc_sb,
+	.wc_isgraph = wc_isgraph_libc_sb,
+	.wc_isprint = wc_isprint_libc_sb,
+	.wc_ispunct = wc_ispunct_libc_sb,
+	.wc_isspace = wc_isspace_libc_sb,
+
+	/* single-character case mapping: single-byte semantics */
+	.wc_toupper = toupper_libc_sb,
+	.wc_tolower = tolower_libc_sb,
+	.char_tolower = char_tolower_libc,
+	.char_is_cased = char_is_cased_libc,
+};
+
+/*
+ * Method table for UTF8: string case mapping, classification and
+ * wide-character case mapping all use the multibyte (_mb) routines.
+ * max_chr is not set here, so it keeps its zero-initialized value.
+ */
+static const struct ctype_methods ctype_methods_libc_utf8 = {
+	/* whole-string case mapping */
+	.strupper = strupper_libc_mb,
+	.strlower = strlower_libc_mb,
+	.strtitle = strtitle_libc_mb,
+
+	/* character classification */
+	.wc_isalpha = wc_isalpha_libc_mb,
+	.wc_isdigit = wc_isdigit_libc_mb,
+	.wc_isalnum = wc_isalnum_libc_mb,
+	.wc_isupper = wc_isupper_libc_mb,
+	.wc_islower = wc_islower_libc_mb,
+	.wc_isgraph = wc_isgraph_libc_mb,
+	.wc_isprint = wc_isprint_libc_mb,
+	.wc_ispunct = wc_ispunct_libc_mb,
+	.wc_isspace = wc_isspace_libc_mb,
+
+	/* single-character case mapping */
+	.wc_toupper = toupper_libc_mb,
+	.wc_tolower = tolower_libc_mb,
+	.char_tolower = char_tolower_libc,
+	.char_is_cased = char_is_cased_libc,
+};
+
static const struct collate_methods collate_methods_libc = {
.strncoll = strncoll_libc,
.strnxfrm = strnxfrm_libc,
@@ -119,36 +397,6 @@ static const struct collate_methods collate_methods_libc_win32_utf8 = {
};
#endif
-size_t
-strlower_libc(char *dst, size_t dstsize, const char *src,
- ssize_t srclen, pg_locale_t locale)
-{
- if (pg_database_encoding_max_length() > 1)
- return strlower_libc_mb(dst, dstsize, src, srclen, locale);
- else
- return strlower_libc_sb(dst, dstsize, src, srclen, locale);
-}
-
-size_t
-strtitle_libc(char *dst, size_t dstsize, const char *src,
- ssize_t srclen, pg_locale_t locale)
-{
- if (pg_database_encoding_max_length() > 1)
- return strtitle_libc_mb(dst, dstsize, src, srclen, locale);
- else
- return strtitle_libc_sb(dst, dstsize, src, srclen, locale);
-}
-
-size_t
-strupper_libc(char *dst, size_t dstsize, const char *src,
- ssize_t srclen, pg_locale_t locale)
-{
- if (pg_database_encoding_max_length() > 1)
- return strupper_libc_mb(dst, dstsize, src, srclen, locale);
- else
- return strupper_libc_sb(dst, dstsize, src, srclen, locale);
-}
-
static size_t
strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
@@ -465,7 +713,6 @@ create_pg_locale_libc(Oid collid, MemoryContext context)
loc = make_libc_collator(collate, ctype);
result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
- result->provider = COLLPROVIDER_LIBC;
result->deterministic = true;
result->collate_is_c = (strcmp(collate, "C") == 0) ||
(strcmp(collate, "POSIX") == 0);
@@ -481,6 +728,15 @@ create_pg_locale_libc(Oid collid, MemoryContext context)
#endif
result->collate = &collate_methods_libc;
}
+ if (!result->ctype_is_c)
+ {
+ if (GetDatabaseEncoding() == PG_UTF8)
+ result->ctype = &ctype_methods_libc_utf8;
+ else if (pg_database_encoding_max_length() > 1)
+ result->ctype = &ctype_methods_libc_other_mb;
+ else
+ result->ctype = &ctype_methods_libc_sb;
+ }
return result;
}
@@ -576,8 +832,6 @@ strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
const char *arg2n;
int result;
- Assert(locale->provider == COLLPROVIDER_LIBC);
-
if (bufsize1 + bufsize2 > TEXTBUFLEN)
buf = palloc(bufsize1 + bufsize2);
@@ -632,8 +886,6 @@ strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
size_t bufsize = srclen + 1;
size_t result;
- Assert(locale->provider == COLLPROVIDER_LIBC);
-
if (srclen == -1)
return strxfrm_l(dest, src, destsize, locale->info.lt);
@@ -742,7 +994,6 @@ strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
int r;
int result;
- Assert(locale->provider == COLLPROVIDER_LIBC);
Assert(GetDatabaseEncoding() == PG_UTF8);
if (len1 == -1)