src/common/unicode/category_test.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108

/*-------------------------------------------------------------------------
 * category_test.c
 *		Program to test Unicode general category functions.
 *
 * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/unicode/category_test.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres_fe.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef USE_ICU
#include <unicode/uchar.h>
#endif
#include "common/unicode_category.h"
#include "common/unicode_version.h"

/*
 * Parse version into integer for easy comparison.
 */
#ifdef USE_ICU
/*
 * Convert a Unicode version string of the form "major.minor" into the single
 * integer major * 100 + minor, so two versions can be compared with ordinary
 * integer comparison. Any text following the minor number is ignored.
 *
 * The parse result is validated unconditionally (not just in assert-enabled
 * builds): if the string does not start with two dot-separated integers, or
 * a component is negative, or minor does not fit in two decimal digits, a
 * message is printed and the program exits with status 1. Returning a value
 * built from unparsed (uninitialized) components would make the version
 * comparison in the caller meaningless.
 */
static int
parse_unicode_version(const char *version)
{
	int			major = 0;
	int			minor = 0;

	/* sscanf returns the number of fields converted; both are required */
	if (sscanf(version, "%d.%d", &major, &minor) != 2 ||
		major < 0 || minor < 0 || minor >= 100)
	{
		printf("category_test: could not parse Unicode version \"%s\"\n",
			   version);
		exit(1);
	}

	return major * 100 + minor;
}
#endif

/*
 * Exhaustively test that the Unicode category for each codepoint matches that
 * returned by ICU.
 *
 * Every codepoint from 0 through 0x10ffff is passed to both
 * unicode_category() and ICU's u_charType(), and the results are compared.
 * The first disagreement that cannot be attributed to a Unicode version
 * difference is printed and the program exits with status 1. Otherwise the
 * program prints how many codepoints were skipped (if any), reports success,
 * and exits with status 0.
 *
 * When built without ICU there is nothing to compare against, so the test
 * prints a notice and exits with status 0.
 */
int
main(int argc, char **argv)
{
#ifdef USE_ICU
	int			pg_unicode_version = parse_unicode_version(PG_UNICODE_VERSION);
	int			icu_unicode_version = parse_unicode_version(U_UNICODE_VERSION);
	int			pg_skipped_codepoints = 0;
	int			icu_skipped_codepoints = 0;

	printf("category_test: Postgres Unicode version:\t%s\n", PG_UNICODE_VERSION);
	printf("category_test: ICU Unicode version:\t\t%s\n", U_UNICODE_VERSION);

	for (UChar32 code = 0; code <= 0x10ffff; code++)
	{
		uint8_t		pg_category = unicode_category(code);
		uint8_t		icu_category = u_charType(code);

		if (pg_category == icu_category)
			continue;

		/*
		 * A version mismatch means that some assigned codepoints in the
		 * newer version may be unassigned in the older version. That's OK,
		 * though the test will not cover those codepoints marked unassigned
		 * in the older version (that is, it will no longer be an exhaustive
		 * test).
		 */
		if (pg_category == PG_U_UNASSIGNED &&
			pg_unicode_version < icu_unicode_version)
		{
			pg_skipped_codepoints++;
			continue;
		}

		if (icu_category == PG_U_UNASSIGNED &&
			icu_unicode_version < pg_unicode_version)
		{
			icu_skipped_codepoints++;
			continue;
		}

		/*
		 * Genuine disagreement: report both sides and fail. UChar32 is a
		 * signed type, while %x expects an unsigned int, hence the cast.
		 */
		printf("category_test: FAILURE for codepoint 0x%06x\n",
			   (unsigned int) code);
		printf("category_test: Postgres category:\t%02d %s %s\n", pg_category,
			   unicode_category_abbrev(pg_category),
			   unicode_category_string(pg_category));
		printf("category_test: ICU category:\t\t%02d %s %s\n", icu_category,
			   unicode_category_abbrev(icu_category),
			   unicode_category_string(icu_category));
		printf("\n");
		exit(1);
	}

	if (pg_skipped_codepoints > 0)
		printf("category_test: skipped %d codepoints unassigned in Postgres due to Unicode version mismatch\n",
			   pg_skipped_codepoints);
	if (icu_skipped_codepoints > 0)
		printf("category_test: skipped %d codepoints unassigned in ICU due to Unicode version mismatch\n",
			   icu_skipped_codepoints);

	printf("category_test: success\n");
	exit(0);
#else
	printf("category_test: ICU support required for test; skipping\n");
	exit(0);
#endif
}