Fix conversion table generator scripts.

convutils.pm relied on the implicit conversion of undefined values to integer zero. Some of the conversion scripts are susceptible to regexp greediness. Fix both, avoiding whitespace changes in the output. Also update the ICU URLs that moved. No need to back-patch, because the output of these scripts is also in the source tree, so we shouldn't need to rerun them on back-branches. Author: Kyotaro Horiguchi <horikyoga.ntt@gmail.com> Discussion: https://postgr.es/m/CA%2BhUKGJ7SEGLbj%3D%3DTQCcyKRA9aqj8%2B6L%3DexSq1y25TA%3DWxLziQ%40mail.gmail.com
author: Thomas Munro <tmunro@postgresql.org> 2020-07-22 16:38:20 +1200
committer: Thomas Munro <tmunro@postgresql.org> 2020-07-22 16:50:03 +1200
commit: a5073871ea655e37759f22f30c4c70359ad9759b (patch)
tree: b5f25334430c2999f01a6c76eee7cdf9f5ecd365 /src
parent: e47c2602aa4d35a4e3eb6ada40454c6c0f1279bf (diff)
download: postgresql-a5073871ea655e37759f22f30c4c70359ad9759b.tar.gz
postgresql-a5073871ea655e37759f22f30c4c70359ad9759b.zip
5 files changed, 44 insertions, 37 deletions
diff --git a/src/backend/utils/mb/Unicode/Makefile b/src/backend/utils/mb/Unicode/Makefile
index 9084f030091..da307d8eb95 100644
--- a/src/backend/utils/mb/Unicode/Makefile
+++ b/src/backend/utils/mb/Unicode/Makefile
@@ -122,7 +122,7 @@ euc-jis-2004-std.txt sjis-0213-2004-std.txt:
 	$(DOWNLOAD) http://x0213.org/codetable/$(@F)
 
 gb-18030-2000.xml windows-949-2000.xml:
-	$(DOWNLOAD) https://ssl.icu-project.org/repos/icu/data/trunk/charset/data/xml/$(@F)
+	$(DOWNLOAD) https://raw.githubusercontent.com/unicode-org/icu-data/master/charset/data/xml/$(@F)
 
 GB2312.TXT:
 	$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'
diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl
index 092a5b44f55..6d1681a18a3 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl
@@ -24,12 +24,13 @@ my @all;
 
 while (my $line = <$in>)
 {
-	if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/)
+	if ($line =~ /^0x(\w+)\s*U\+(\w+)\+(\w+)\s*#\s*(\S.*)?\s*$/)
 	{
 
 		# combined characters
 		my ($c, $u1, $u2) = ($1, $2, $3);
-		my $rest = "U+" . $u1 . "+" . $u2 . $4;
+		# The "\t \t" below is just to avoid insubstantial diffs.
+		my $rest = "U+" . $u1 . "+" . $u2 . "\t \t" . $4;
 		my $code = hex($c);
 		my $ucs1 = hex($u1);
 		my $ucs2 = hex($u2);
@@ -45,7 +46,7 @@ while (my $line = <$in>)
 			l          => $.
 		  };
 	}
-	elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
+	elsif ($line =~ /^0x(\w+)\s*U\+(\w+)\s*#\s*(\S.*)?\s*$/)
 	{
 
 		# non-combined characters
diff --git a/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl b/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl
index 1d88c0296ee..d8bed27e1b1 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl
@@ -80,7 +80,8 @@ foreach my $i (@$ct932)
 	}
 }
 
-foreach my $i (@mapping)
+# extract only SJIS characters
+foreach my $i (grep defined $_->{sjis}, @mapping)
 {
 	my $sjis = $i->{sjis};
 
diff --git a/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl b/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl
index b516e91306f..b86714dd46d 100755
--- a/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl
+++ b/src/backend/utils/mb/Unicode/UCS_to_SHIFT_JIS_2004.pl
@@ -24,12 +24,13 @@ my @mapping;
 
 while (my $line = <$in>)
 {
-	if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/)
+	if ($line =~ /^0x(\w+)\s*U\+(\w+)\+(\w+)\s*#\s*(\S.*)?\s*$/)
 	{
 
 		# combined characters
 		my ($c, $u1, $u2) = ($1, $2, $3);
-		my $rest = "U+" . $u1 . "+" . $u2 . $4;
+		# The "\t \t" below is just to avoid insubstantial diffs.
+		my $rest = "U+" . $u1 . "+" . $u2 . "\t \t" . $4;
 		my $code = hex($c);
 		my $ucs1 = hex($u1);
 		my $ucs2 = hex($u2);
@@ -45,7 +46,7 @@ while (my $line = <$in>)
 			l          => $.
 		  };
 	}
-	elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/)
+	elsif ($line =~ /^0x(\w+)\s*U\+(\w+)\s*#\s*(\S.*)?\s*$/)
 	{
 
 		# non-combined characters
diff --git a/src/backend/utils/mb/Unicode/convutils.pm b/src/backend/utils/mb/Unicode/convutils.pm
index 2f64a12ea14..9d97061c6fe 100644
--- a/src/backend/utils/mb/Unicode/convutils.pm
+++ b/src/backend/utils/mb/Unicode/convutils.pm
@@ -380,7 +380,8 @@ sub print_radix_table
 	  {
 		header  => "Dummy map, for invalid values",
 		min_idx => 0,
-		max_idx => $widest_range
+		max_idx => $widest_range,
+		label => "dummy map"
 	  };
 
 	###
@@ -471,35 +472,37 @@ sub print_radix_table
 	}
 
 	# Also look up the positions of the roots in the table.
-	my $b1root = $segmap{"1-byte"};
-	my $b2root = $segmap{"2-byte"};
-	my $b3root = $segmap{"3-byte"};
-	my $b4root = $segmap{"4-byte"};
+	# A missing map is treated as the dummy mapping.
+	my $b1root = $segmap{"1-byte"} || 0;
+	my $b2root = $segmap{"2-byte"} || 0;
+	my $b3root = $segmap{"3-byte"} || 0;
+	my $b4root = $segmap{"4-byte"} || 0;
 
 	# And the lower-upper values of each level in each radix tree.
-	my $b1_lower = $min_idx{1}{1};
-	my $b1_upper = $max_idx{1}{1};
-
-	my $b2_1_lower = $min_idx{2}{1};
-	my $b2_1_upper = $max_idx{2}{1};
-	my $b2_2_lower = $min_idx{2}{2};
-	my $b2_2_upper = $max_idx{2}{2};
-
-	my $b3_1_lower = $min_idx{3}{1};
-	my $b3_1_upper = $max_idx{3}{1};
-	my $b3_2_lower = $min_idx{3}{2};
-	my $b3_2_upper = $max_idx{3}{2};
-	my $b3_3_lower = $min_idx{3}{3};
-	my $b3_3_upper = $max_idx{3}{3};
-
-	my $b4_1_lower = $min_idx{4}{1};
-	my $b4_1_upper = $max_idx{4}{1};
-	my $b4_2_lower = $min_idx{4}{2};
-	my $b4_2_upper = $max_idx{4}{2};
-	my $b4_3_lower = $min_idx{4}{3};
-	my $b4_3_upper = $max_idx{4}{3};
-	my $b4_4_lower = $min_idx{4}{4};
-	my $b4_4_upper = $max_idx{4}{4};
+	# Missing values represent zero.
+	my $b1_lower = $min_idx{1}{1} || 0;
+	my $b1_upper = $max_idx{1}{1} || 0;
+
+	my $b2_1_lower = $min_idx{2}{1} || 0;
+	my $b2_1_upper = $max_idx{2}{1} || 0;
+	my $b2_2_lower = $min_idx{2}{2} || 0;
+	my $b2_2_upper = $max_idx{2}{2} || 0;
+
+	my $b3_1_lower = $min_idx{3}{1} || 0;
+	my $b3_1_upper = $max_idx{3}{1} || 0;
+	my $b3_2_lower = $min_idx{3}{2} || 0;
+	my $b3_2_upper = $max_idx{3}{2} || 0;
+	my $b3_3_lower = $min_idx{3}{3} || 0;
+	my $b3_3_upper = $max_idx{3}{3} || 0;
+
+	my $b4_1_lower = $min_idx{4}{1} || 0;
+	my $b4_1_upper = $max_idx{4}{1} || 0;
+	my $b4_2_lower = $min_idx{4}{2} || 0;
+	my $b4_2_upper = $max_idx{4}{2} || 0;
+	my $b4_3_lower = $min_idx{4}{3} || 0;
+	my $b4_3_upper = $max_idx{4}{3} || 0;
+	my $b4_4_lower = $min_idx{4}{4} || 0;
+	my $b4_4_upper = $max_idx{4}{4} || 0;
 
 	###
 	### Find the maximum value in the whole table, to determine if we can
@@ -607,7 +610,8 @@ sub print_radix_table
 			for (my $j = 0;
 				$j < $vals_per_line && $i <= $seg->{max_idx}; $j++)
 			{
-				my $val = $seg->{values}->{$i};
+				# missing values represent zero.
+				my $val = $seg->{values}->{$i} || 0;
 
 				printf $out " 0x%0*x", $colwidth, $val;
 				$off++;
author	Thomas Munro <tmunro@postgresql.org>	2020-07-22 16:38:20 +1200
committer	Thomas Munro <tmunro@postgresql.org>	2020-07-22 16:50:03 +1200
commit	a5073871ea655e37759f22f30c4c70359ad9759b (patch)
tree	b5f25334430c2999f01a6c76eee7cdf9f5ecd365 /src
parent	e47c2602aa4d35a4e3eb6ada40454c6c0f1279bf (diff)
download	postgresql-a5073871ea655e37759f22f30c4c70359ad9759b.tar.gz postgresql-a5073871ea655e37759f22f30c4c70359ad9759b.zip