#! /usr/bin/perl
#
# Copyright (c) 2007, PostgreSQL Global Development Group
#
# $PostgreSQL: pgsql/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl,v 1.1 2007/03/25 11:56:02 ishii Exp $
#
# Generate UTF-8 <--> EUC_JIS_2004 code conversion tables from
# "euc-jis-2004-std.txt" (http://x0213.org)
#
# Files written (paths are relative, so they land in the current directory):
#   utf8_to_euc_jis_2004.map
#   utf8_to_euc_jis_2004_combined.map
#   euc_jis_2004_to_utf8.map
#   euc_jis_2004_to_utf8_combined.map
# and, when $TEST is 1, the sample data files utf8.data and
# euc_jis_2004.data.

use strict;
use warnings;

require "ucs2utf.pl";

# Set to 1 to also emit utf8.data / euc_jis_2004.data.
my $TEST = 1;

my $in_file = "euc-jis-2004-std.txt";

# The input is parsed once; both conversion directions are built from the
# same in-memory entries (in file order).
my ($plain, $combined) = read_mapping($in_file);

# first generate UTF-8 --> EUC_JIS_2004 table
{
    my %euc_of;         # UTF-8 value      => EUC code
    my %note_of_euc;    # EUC code         => comment text
    for my $entry (@$plain) {
        if (exists $euc_of{ $entry->{utf} }) {
            printf STDERR "Warning: duplicate UTF8: %04x\n", $entry->{ucs};
            next;
        }
        $euc_of{ $entry->{utf} } = $entry->{code};

        # NOTE: the comment is keyed by the mapped value (the EUC code), not
        # by the UTF-8 key, so two keys sharing a code share the last
        # comment.  Kept as-is so the generated file does not change.
        $note_of_euc{ $entry->{code} } = $entry->{comment};
    }

    my %euc_of_pair;     # 16 hex digits (two UTF-8 values) => EUC code
    my %note_of_pair;    # same key                         => comment text
    for my $entry (@$combined) {
        $euc_of_pair{ $entry->{utf} }  = $entry->{code};
        $note_of_pair{ $entry->{utf} } = $entry->{comment};
    }

    write_map(
        "utf8_to_euc_jis_2004.map",
        "static pg_utf_to_local ULmapEUC_JIS_2004[]",
        [
            map {
                [
                    sprintf("{0x%08x, 0x%06x}", $_, $euc_of{$_}),
                    $note_of_euc{ $euc_of{$_} }
                ]
            } sort { $a <=> $b } keys %euc_of
        ]
    );

    write_map(
        "utf8_to_euc_jis_2004_combined.map",
        "static pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[]",
        [
            map {
                [
                    sprintf("{0x%s, 0x%s, 0x%06x}",
                        substr($_, 0, 8), substr($_, 8, 8),
                        $euc_of_pair{$_}),
                    $note_of_pair{$_}
                ]
            } sort { $a cmp $b } keys %euc_of_pair
        ]
    );

    if ($TEST == 1) {
        write_test_data(\%euc_of, \%euc_of_pair);
    }
}

# then generate EUC_JIS_2004 --> UTF-8 table
{
    my %utf_of;         # EUC code    => UTF-8 value
    my %note_of_utf;    # UTF-8 value => comment text
    for my $entry (@$plain) {
        if (exists $utf_of{ $entry->{code} }) {

            # This pass detects a repeated EUC code, so report the code
            # (the old message claimed a duplicate UTF8 value).
            printf STDERR "Warning: duplicate EUC_JIS_2004 code: %04x\n",
              $entry->{code};
            next;
        }
        $utf_of{ $entry->{code} } = $entry->{utf};

        # NOTE: keyed by the mapped value (the UTF-8 value); see the
        # matching note in the first pass.
        $note_of_utf{ $entry->{utf} } = $entry->{comment};
    }

    my %pair_of;         # EUC code => 16 hex digits (two UTF-8 values)
    my %note_of_code;    # EUC code => comment text
    for my $entry (@$combined) {
        $pair_of{ $entry->{code} }      = $entry->{utf};
        $note_of_code{ $entry->{code} } = $entry->{comment};
    }

    write_map(
        "euc_jis_2004_to_utf8.map",
        "static pg_local_to_utf LUmapEUC_JIS_2004[]",
        [
            map {
                [
                    sprintf("{0x%06x, 0x%08x}", $_, $utf_of{$_}),
                    $note_of_utf{ $utf_of{$_} }
                ]
            } sort { $a <=> $b } keys %utf_of
        ]
    );

    write_map(
        "euc_jis_2004_to_utf8_combined.map",
        "static pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[]",
        [
            map {
                [
                    sprintf("{0x%06x, 0x%s, 0x%s}",
                        $_,
                        substr($pair_of{$_}, 0, 8),
                        substr($pair_of{$_}, 8, 8)),
                    $note_of_code{$_}
                ]
            } sort { $a <=> $b } keys %pair_of
        ]
    );
}

# hex() on a raw regex capture.  The greedy captures below keep any blanks
# that follow the digits; hex() stops at the first non-hex character, which
# is the value wanted, so only the "illegal digit" warning is silenced.
sub hex_prefix {
    my ($text) = @_;
    no warnings 'digit';
    return hex($text);
}

# Parse the mapping file $path.
# Returns two array references, both in file order:
#   plain entries    { code, ucs, utf, comment }  (one code point per line)
#   combined entries { code, utf, comment }       (lines of form U+XXXX+YYYY;
#                     utf is two %08x-formatted UTF-8 values concatenated)
# Lines matching neither pattern are ignored.  Dies if $path cannot be opened.
sub read_mapping {
    my ($path) = @_;
    my (@plain_entries, @combined_entries);

    open(my $in, '<', $path) || die("cannot open $path: $!");
    while (my $line = <$in>) {
        if ($line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/) {
            my ($c, $u1, $u2, $tail) = ($1, $2, $3, $4);
            my $utf1 = ucs2utf(hex_prefix($u1));
            my $utf2 = ucs2utf(hex_prefix($u2));
            push @combined_entries,
              {
                code    => hex_prefix($c),
                utf     => sprintf("%08x%08x", $utf1, $utf2),
                comment => "U+" . $u1 . "+" . $u2 . $tail,
              };
        }
        elsif ($line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/) {
            my ($c, $u, $tail) = ($1, $2, $3);
            my $ucs = hex_prefix($u);
            push @plain_entries,
              {
                code    => hex_prefix($c),
                ucs     => $ucs,
                utf     => ucs2utf($ucs),
                comment => "U+" . $u . $tail,
              };
        }
    }
    close($in);

    return (\@plain_entries, \@combined_entries);
}

# Write one C initializer table to $path.
#   $declaration - text placed before " = {" on the opening line
#   $rows        - array ref of [ initializer_text, comment_text ] pairs,
#                  already in output order
# Every row except the last is followed by a comma.  Dies on open or close
# failure.
sub write_map {
    my ($path, $declaration, $rows) = @_;

    open(my $out, '>', $path) || die("cannot open $path: $!");
    print {$out} "/*\n";
    print {$out} " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
    print {$out} " */\n";
    print {$out} $declaration . " = {\n";
    for my $i (0 .. $#$rows) {
        my ($initializer, $comment) = @{ $rows->[$i] };
        my $separator = ($i == $#$rows) ? "" : ",";
        printf {$out} " %s%s /* %s */\n", $initializer, $separator, $comment;
    }
    print {$out} "};\n";
    close($out) || die("cannot close $path: $!");
    return;
}

# True when the EUC code $code belongs in the sample data files: not NUL,
# tab, LF, CR or 0x5c, and inside one of the one-, two- or three-byte
# ranges below.
sub is_test_code {
    my ($code) = @_;
    return 0 if $code <= 0x00;
    return 0
      if $code == 0x09 || $code == 0x0a || $code == 0x0d || $code == 0x5c;
    return 1 if $code < 0x80;
    return 1 if $code >= 0x8ea1   && $code <= 0x8efe;
    return 1 if $code >= 0x8fa1a1 && $code <= 0x8ffefe;

    # Upper bound was written 0x8fefe, which looks like a typo for the end
    # of the two-byte range.
    return 1 if $code >= 0xa1a1 && $code <= 0xfefe;
    return 0;
}

# The low four bytes of $value, most significant first, with zero bytes
# dropped, as a byte string.
sub nonzero_bytes {
    my ($value) = @_;
    my $bytes = '';
    for my $shift (24, 16, 8, 0) {
        my $byte = ($value >> $shift) & 0xff;
        $bytes .= pack("C", $byte) if $byte;
    }
    return $bytes;
}

# Write utf8.data and euc_jis_2004.data: one line per accepted mapping,
# plain entries first (numeric key order), then combined entries (string
# key order).  Line N of one file corresponds to line N of the other.
#   $euc_of      - hash ref, UTF-8 value => EUC code
#   $euc_of_pair - hash ref, 16-hex-digit UTF-8 pair => EUC code
sub write_test_data {
    my ($euc_of, $euc_of_pair) = @_;
    my $utf_path = "utf8.data";
    my $euc_path = "euc_jis_2004.data";

    open(my $utf_out, '>', $utf_path) || die("cannot open $utf_path: $!");
    open(my $euc_out, '>', $euc_path) || die("cannot open $euc_path: $!");

    for my $utf (sort { $a <=> $b } keys %$euc_of) {
        my $code = $euc_of->{$utf};
        next unless is_test_code($code);
        print {$utf_out} nonzero_bytes($utf),  "\n";
        print {$euc_out} nonzero_bytes($code), "\n";
    }

    for my $pair (sort { $a cmp $b } keys %$euc_of_pair) {
        my $code = $euc_of_pair->{$pair};
        next unless is_test_code($code);
        my $first  = hex(substr($pair, 0, 8));
        my $second = hex(substr($pair, 8, 8));
        print {$utf_out} nonzero_bytes($first), nonzero_bytes($second), "\n";
        print {$euc_out} nonzero_bytes($code), "\n";
    }

    close($utf_out) || die("cannot close $utf_path: $!");
    close($euc_out) || die("cannot close $euc_path: $!");
    return;
}