10000 Add new encoding EUC_JIS_2004 and SHIFT_JIS_2004, · postgrespro/postgres_cluster@75c6519 · GitHub
[go: up one dir, main page]

Skip to content

Commit 75c6519

Browse files
committed
Add new encoding EUC_JIS_2004 and SHIFT_JIS_2004,
along with new conversions among EUC_JIS_2004, SHIFT_JIS_2004 and UTF-8. The catalog version has been bumped.
1 parent 7b4726e commit 75c6519

File tree

41 files changed

+70346
-120
lines changed
  • utf8_and_johab
  • utf8_and_shift_jis_2004
  • utf8_and_sjis
  • utf8_and_uhc
  • utf8_and_win
  • include
  • test/regress
  • Some content is hidden

    Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

    41 files changed

    +70346
    -120
    lines changed

    doc/src/sgml/charset.sgml

    Lines changed: 17 additions & 1 deletion
    Original file line numberDiff line numberDiff line change
    @@ -1,4 +1,4 @@
    1-
    <!-- $PostgreSQL: pgsql/doc/src/sgml/charset.sgml,v 2.81 2007/01/31 20:56:16 momjian Exp $ -->
    1+
    <!-- $PostgreSQL: pgsql/doc/src/sgml/charset.sgml,v 2.82 2007/03/25 11:56:01 ishii Exp $ -->
    22

    33
    <chapter id="charset">
    44
    <title>Localization</>
    @@ -364,6 +364,14 @@ initdb --locale=sv_SE
    364364
    <entry>1-3</entry>
    365365
    <entry></entry>
    366366
    </row>
    367+
    <row>
    368+
    <entry><literal>EUC_JIS_2004</literal></entry>
    369+
    <entry>Extended UNIX Code-JP, JIS X 0213</entry>
    370+
    <entry>Japanese</entry>
    371+
    <entry>Yes</entry>
    372+
    <entry>1-3</entry>
    373+
    <entry></entry>
    374+
    </row>
    367375
    <row>
    368376
    <entry><literal>EUC_KR</literal></entry>
    369377
    <entry>Extended UNIX Code-KR</entry>
    @@ -540,6 +548,14 @@ initdb --locale=sv_SE
    540548
    <entry>1-2</entry>
    541549
    <entry><literal>Mskanji</>, <literal>ShiftJIS</>, <literal>WIN932</>, <literal>Windows932</></entry>
    542550
    </row>
    551+
    <row>
    552+
    <entry><literal>SHIFT_JIS_2004</literal></entry>
    553+
    <entry>Shift JIS, JIS X 0213</entry>
    554+
    <entry>Japanese</entry>
    555+
    <entry>No</entry>
    556+
    <entry>1-2</entry>
    557+
    <entry></entry>
    558+
    </row>
    543559
    <row>
    544560
    <entry><literal>SQL_ASCII</literal></entry>
    545561
    <entry>unspecified (see text)</entry>

    doc/src/sgml/func.sgml

    Lines changed: 37 additions & 1 deletion
    Original file line numberDiff line numberDiff line change
    @@ -1,4 +1,4 @@
    1-
    <!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.370 2007/03/20 05:44:59 neilc Exp $ -->
    1+
    <!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.371 2007/03/25 11:56:01 ishii Exp $ -->
    22

    33
    <chapter id="functions">
    44
    <title>Functions and Operators</title>
    @@ -2394,6 +2394,42 @@
    23942394
    <entry><literal>UTF8</literal></entry>
    23952395
    </row>
    23962396

    2397+
    <row>
    2398+
    <entry><literal>euc_jis_2004_to_utf8</literal></entry>
    2399+
    <entry><literal>EUC_JIS_2004</literal></entry>
    2400+
    <entry><literal>UTF8</literal></entry>
    2401+
    </row>
    2402+
    2403+
    <row>
    2404+
    <entry><literal>utf8_to_euc_jis_2004</literal></entry>
    2405+
    <entry><literal>UTF8</literal></entry>
    2406+
    <entry><literal>EUC_JIS_2004</literal></entry>
    2407+
    </row>
    2408+
    2409+
    <row>
    2410+
    <entry><literal>shift_jis_2004_to_utf8</literal></entry>
    2411+
    <entry><literal>SHIFT_JIS_2004</literal></entry>
    2412+
    <entry><literal>UTF8</literal></entry>
    2413+
    </row>
    2414+
    2415+
    <row>
    2416+
    <entry><literal>utf8_to_shift_jis_2004</literal></entry>
    2417+
    <entry><literal>UTF8</literal></entry>
    2418+
    <entry><literal>SHIFT_JIS_2004</literal></entry>
    2419+
    </row>
    2420+
    2421+
    <row>
    2422+
    <entry><literal>euc_jis_2004_to_shift_jis_2004</literal></entry>
    2423+
    <entry><literal>EUC_JIS_2004</literal></entry>
    2424+
    <entry><literal>SHIFT_JIS_2004</literal></entry>
    2425+
    </row>
    2426+
    2427+
    <row>
    2428+
    <entry><literal>shift_jis_2004_to_euc_jis_2004</literal></entry>
    2429+
    <entry><literal>SHIFT_JIS_2004</literal></entry>
    2430+
    <entry><literal>EUC_JIS_2004</literal></entry>
    2431+
    </row>
    2432+
    23972433
    </tbody>
    23982434
    </tgroup>
    23992435
    </table>
    Lines changed: 248 additions & 0 deletions
    +
    Original file line numberDiff line numberDiff line change
    @@ -0,0 +1,248 @@
    1+
    #! /usr/bin/perl
    2+
    #
    3+
    # Copyright (c) 2007, PostgreSQL Global Development Group
    4+
    #
    5+
    # $PostgreSQL: pgsql/src/backend/utils/mb/Unicode/UCS_to_EUC_JIS_2004.pl,v 1.1 2007/03/25 11:56:02 ishii Exp $
    6+
    #
    7+
    # Generate UTF-8 <--> EUC_JIS_2004 code conversion tables from
    8+
    # "euc-jis-2004-std.txt" (http://x0213.org)
    9+
    10+
    require "ucs2utf.pl";
    11+
    # Set to 1 to also write the utf8.data / euc_jis_2004.data test files in
    # the "$TEST == 1" sections later in this script.
    $TEST = 1;

    # first generate UTF-8 --> EUC_JIS_2004 table

    $in_file = "euc-jis-2004-std.txt";

    # Three-argument open on a lexical handle: the file name can never be
    # taken for a mode, and the OS error text is included on failure.
    open( my $pass1_in, '<', $in_file ) or die("cannot open $in_file: $!");

    # Start from empty tables and zeroed counters.  They are cleared by name:
    # reset() takes a list of *first letters*, so "reset 'array'" wipes every
    # package variable whose name begins with a, r or y, not just %array.
    %array    = ();    # ucs2utf() value of the code point => EUC_JIS_2004 code
    %array1   = ();    # 16-hex-digit key for a code point pair => code
    %comment  = ();    # EUC_JIS_2004 code => comment text
    %comment1 = ();    # 16-hex-digit key  => comment text
    $count    = 0;     # number of entries stored in %array
    $count1   = 0;     # number of combined (two code point) lines seen

    while ( my $line = <$pass1_in> ) {
        my ( $c, $u, $rest );

        # NOTE: both patterns are kept exactly as they were.  Their greedy
        # (.*) groups decide which lines match and what lands in the comment
        # text, so tightening them could change the generated maps.
        if ( $line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/ ) {

            # Combined entry: one code maps to a pair "U+xxxx+yyyy".
            my ( $u1, $u2 ) = ( $2, $3 );
            $c    = $1;
            $rest = "U+" . $u1 . "+" . $u2 . $4;

            my $code = hex($c);
            my $utf1 = ucs2utf( hex($u1) );
            my $utf2 = ucs2utf( hex($u2) );

            # Key is both converted values as fixed-width hex, so the
            # writers can split it again with substr().
            my $str = sprintf "%08x%08x", $utf1, $utf2;
            $array1{$str}   = $code;
            $comment1{$str} = $rest;
            $count1++;
            next;
        }
        elsif ( $line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/ ) {

            # Plain entry: one code, one code point.
            $c    = $1;
            $u    = $2;
            $rest = "U+" . $u . $3;
        }
        else {

            # Anything else (headers, blank lines) is ignored.
            next;
        }

        my $ucs  = hex($u);
        my $code = hex($c);
        my $utf  = ucs2utf($ucs);

        # Keep the first mapping for a given UTF-8 value and report the rest.
        # Stored values are always numbers, so exists() is equivalent to the
        # old 'ne ""' test without comparing an undefined value.
        if ( exists $array{$utf} ) {
            printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
            next;
        }
        $count++;

        $array{$utf}    = $code;
        $comment{$code} = $rest;
    }
    close($pass1_in);
    62+
    # Write the UTF-8 --> EUC_JIS_2004 map as a C array initializer, one
    # entry per key of %array in ascending numeric key order.
    $file = "utf8_to_euc_jis_2004.map";

    # Three-argument open keeps the mode separate from the file name and the
    # error message now carries the OS reason.
    open( my $ul_map, '>', $file ) or die("cannot open $file: $!");

    print {$ul_map} "/*\n";
    print {$ul_map} " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
    print {$ul_map} " */\n";
    print {$ul_map} "static pg_utf_to_local ULmapEUC_JIS_2004[] = {\n";

    for my $index ( sort { $a <=> $b } keys(%array) ) {
        my $code = $array{$index};

        # $count was incremented once per stored entry while reading, so it
        # reaches zero on the final entry, which gets no trailing comma.
        $count--;
        my $sep = ( $count == 0 ) ? "" : ",";

        printf {$ul_map} " {0x%08x, 0x%06x}%s /* %s */\n",
            $index, $code, $sep, $comment{$code};
    }

    print {$ul_map} "};\n";

    # Buffered write errors only surface at close, so check it.
    close($ul_map) or die("cannot close $file: $!");
    83+
    if ($TEST == 1) {
    84+
    $file1 = "utf8.data";
    85+
    $file2 = "euc_jis_2004.data";
    86+
    open( FILE1, "> $file1" ) || die( "cannot open $file1" );
    87+
    open( FILE2, "> $file2" ) || die( "cannot open $file2" );
    88+
    89+
    for $index ( sort {$a <=> $b} keys( %array ) ){
    90+
    $code = $array{ $index };
    91+
    if ($code > 0x00 && $code != 0x09 && $code != 0x0a && $code != 0x0d &&
    92+
    $code != 0x5c &&
    93+
    ($code < 0x80 ||
    94+
    ($code >= 0x8ea1 && $code <= 0x8efe) ||
    95+
    ($code >= 0x8fa1a1 && $code <= 0x8ffefe) ||
    96+
    ($code >= 0xa1a1 && $code <= 0x8fefe))) {
    97+
    for ($i = 3; $i >= 0; $i--) {
    98+
    $s = $i * 8;
    99+
    $mask = 0xff << $s;
    100+
    print FILE1 pack("C", ($index & $mask) >> $s) if $index & $mask;
    101+
    print FILE2 pack("C", ($code & $mask) >> $s) if $code & $mask;
    102+
    }
    103+
    print FILE1 "\n";
    104+
    print FILE2 "\n";
    105+
    }
    106+
    }
    107+
    }
    108+
    # Write the combined-character UTF-8 --> EUC_JIS_2004 map.  Keys of
    # %array1 are 16 hex digits (two 8-digit halves), so a string sort
    # orders them and substr() splits them back into the two values.
    $file = "utf8_to_euc_jis_2004_combined.map";

    # Three-argument open keeps the mode separate from the file name and the
    # error message now carries the OS reason.
    open( my $ul_combined_map, '>', $file ) or die("cannot open $file: $!");

    print {$ul_combined_map} "/*\n";
    print {$ul_combined_map} " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
    print {$ul_combined_map} " */\n";
    print {$ul_combined_map} "static pg_utf_to_local_combined ULmapEUC_JIS_2004_combined[] = {\n";

    for my $index ( sort { $a cmp $b } keys(%array1) ) {
        my $code = $array1{$index};

        # The entry on which $count1 reaches zero gets no trailing comma.
        $count1--;
        my $sep = ( $count1 == 0 ) ? "" : ",";

        printf {$ul_combined_map} " {0x%s, 0x%s, 0x%06x}%s /* %s */\n",
            substr( $index, 0, 8 ), substr( $index, 8, 8 ), $code, $sep,
            $comment1{$index};
    }

    print {$ul_combined_map} "};\n";

    # Buffered write errors only surface at close, so check it.
    close($ul_combined_map) or die("cannot close $file: $!");
    128+
    # Optional test data for combined characters: append both halves of each
    # %array1 key to FILE1 and the matching code to FILE2, then close both
    # files.  FILE1, FILE2, $file1 and $file2 are set up by the earlier
    # "$TEST == 1" section.
    if ( $TEST == 1 ) {
        for my $index ( sort { $a cmp $b } keys(%array1) ) {
            my $code = $array1{$index};

            # Skip NUL, TAB, LF, CR and backslash, then accept only codes in
            # one of the four listed ranges.  The last upper bound was
            # written 0x8fefe, an evident typo for 0xfefe (the two-byte range
            # is 0xa1a1..0xfefe).
            if (   $code > 0x00
                && $code != 0x09
                && $code != 0x0a
                && $code != 0x0d
                && $code != 0x5c
                && (   $code < 0x80
                    || ( $code >= 0x8ea1   && $code <= 0x8efe )
                    || ( $code >= 0x8fa1a1 && $code <= 0x8ffefe )
                    || ( $code >= 0xa1a1   && $code <= 0xfefe ) )
              )
            {
                # The key is two 8-digit hex values glued together.
                my $v1 = hex( substr( $index, 0, 8 ) );
                my $v2 = hex( substr( $index, 8, 8 ) );

                # First value and the code, most significant byte first;
                # zero bytes are skipped.
                for my $shift ( 24, 16, 8, 0 ) {
                    my $mask = 0xff << $shift;
                    print FILE1 pack( "C", ( $v1 & $mask ) >> $shift )
                      if $v1 & $mask;
                    print FILE2 pack( "C", ( $code & $mask ) >> $shift )
                      if $code & $mask;
                }

                # Second value follows directly on the same line.
                for my $shift ( 24, 16, 8, 0 ) {
                    my $mask = 0xff << $shift;
                    print FILE1 pack( "C", ( $v2 & $mask ) >> $shift )
                      if $v2 & $mask;
                }
                print FILE1 "\n";
                print FILE2 "\n";
            }
        }

        # Buffered write errors only surface at close, so check both.
        close(FILE1) or die("cannot close $file1: $!");
        close(FILE2) or die("cannot close $file2: $!");
    }
    160+
    # then generate EUC_JIS_2004 --> UTF-8 table

    $in_file = "euc-jis-2004-std.txt";

    # Three-argument open on a lexical handle: the file name can never be
    # taken for a mode, and the OS error text is included on failure.
    open( my $pass2_in, '<', $in_file ) or die("cannot open $in_file: $!");

    # Empty the tables and zero the counters explicitly.  reset() takes a
    # list of *first letters*: "reset 'comment'" wiped every package variable
    # starting with c, o, m, e, n or t, which is also the only reason $count
    # and $count1 used to start this pass at zero.
    %array    = ();    # EUC_JIS_2004 code => ucs2utf() value of the code point
    %array1   = ();    # EUC_JIS_2004 code => 16-hex-digit pair string
    %comment  = ();    # ucs2utf() value   => comment text
    %comment1 = ();    # EUC_JIS_2004 code => comment text
    $count    = 0;     # number of entries stored in %array
    $count1   = 0;     # number of combined (two code point) lines seen

    while ( my $line = <$pass2_in> ) {
        my ( $c, $u, $rest );

        # NOTE: both patterns are kept exactly as they were.  Their greedy
        # (.*) groups decide which lines match and what lands in the comment
        # text, so tightening them could change the generated maps.
        if ( $line =~ /^0x(.*)[ \t]*U\+(.*)\+(.*)[ \t]*#(.*)$/ ) {

            # Combined entry: one code maps to a pair "U+xxxx+yyyy".
            my ( $u1, $u2 ) = ( $2, $3 );
            $c    = $1;
            $rest = "U+" . $u1 . "+" . $u2 . $4;

            my $code = hex($c);
            my $utf1 = ucs2utf( hex($u1) );
            my $utf2 = ucs2utf( hex($u2) );

            # Value is both converted values as fixed-width hex, so the
            # writer can split it again with substr().
            my $str = sprintf "%08x%08x", $utf1, $utf2;
            $array1{$code}   = $str;
            $comment1{$code} = $rest;
            $count1++;
            next;
        }
        elsif ( $line =~ /^0x(.*)[ \t]*U\+(.*)[ \t]*#(.*)$/ ) {

            # Plain entry: one code, one code point.
            $c    = $1;
            $u    = $2;
            $rest = "U+" . $u . $3;
        }
        else {

            # Anything else (headers, blank lines) is ignored.
            next;
        }

        my $ucs  = hex($u);
        my $code = hex($c);
        my $utf  = ucs2utf($ucs);

        # This pass is keyed by the EUC_JIS_2004 code, so a repeat here is a
        # duplicate *code*; the message used to say "duplicate UTF8".
        if ( exists $array{$code} ) {
            printf STDERR "Warning: duplicate code: %06x\n", $code;
            next;
        }
        $count++;

        $array{$code}  = $utf;
        $comment{$utf} = $rest;
    }
    close($pass2_in);
    209+
    # Write the EUC_JIS_2004 --> UTF-8 map as a C array initializer, one
    # entry per key of %array in ascending numeric key order.
    $file = "euc_jis_2004_to_utf8.map";

    # Three-argument open keeps the mode separate from the file name and the
    # error message now carries the OS reason.
    open( my $lu_map, '>', $file ) or die("cannot open $file: $!");

    print {$lu_map} "/*\n";
    print {$lu_map} " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
    print {$lu_map} " */\n";
    print {$lu_map} "static pg_local_to_utf LUmapEUC_JIS_2004[] = {\n";

    for my $index ( sort { $a <=> $b } keys(%array) ) {

        # Here the hash value is the UTF-8 side; %comment is keyed by it.
        my $code = $array{$index};

        # $count was incremented once per stored entry while reading, so it
        # reaches zero on the final entry, which gets no trailing comma.
        $count--;
        my $sep = ( $count == 0 ) ? "" : ",";

        printf {$lu_map} " {0x%06x, 0x%08x}%s /* %s */\n",
            $index, $code, $sep, $comment{$code};
    }

    print {$lu_map} "};\n";

    # Buffered write errors only surface at close, so check it.
    close($lu_map) or die("cannot close $file: $!");
    229+
    # Write the combined-character EUC_JIS_2004 --> UTF-8 map.  Each value of
    # %array1 is 16 hex digits; substr() splits it into the two 8-digit
    # halves that are printed as separate fields.
    $file = "euc_jis_2004_to_utf8_combined.map";

    # Three-argument open keeps the mode separate from the file name and the
    # error message now carries the OS reason.
    open( my $lu_combined_map, '>', $file ) or die("cannot open $file: $!");

    print {$lu_combined_map} "/*\n";
    print {$lu_combined_map} " * This file was generated by UCS_to_EUC_JIS_2004.pl\n";
    print {$lu_combined_map} " */\n";
    print {$lu_combined_map} "static pg_local_to_utf_combined LUmapEUC_JIS_2004_combined[] = {\n";

    for my $index ( sort { $a <=> $b } keys(%array1) ) {
        my $code = $array1{$index};

        # The entry on which $count1 reaches zero gets no trailing comma.
        $count1--;
        my $sep = ( $count1 == 0 ) ? "" : ",";

        printf {$lu_combined_map} " {0x%06x, 0x%s, 0x%s}%s /* %s */\n",
            $index, substr( $code, 0, 8 ), substr( $code, 8, 8 ), $sep,
            $comment1{$index};
    }

    print {$lu_combined_map} "};\n";

    # Buffered write errors only surface at close, so check it.
    close($lu_combined_map) or die("cannot close $file: $!");

    0 commit comments

    Comments
     (0)
    0