Add direct conversion routines between EUC_TW and Big5. · postgrespro/postgres@6c55760 · GitHub
[go: up one dir, main page]

Skip to content
  • Commit 6c55760

    Browse files
    committed
    Add direct conversion routines between EUC_TW and Big5.
    Conversions between EUC_TW and Big5 were previously implemented by converting the whole input to MIC first, and then from MIC to the target encoding. Implement functions to convert directly between the two.

    The reason to do this now is that I'm working on a patch that will change the conversion function signature so that if the input is invalid, we convert as much as we can and return the number of bytes successfully converted. That's not possible if we use an intermediary format, because if an error happens in the intermediary -> final conversion, we lose track of the location of the invalid character in the original input.

    Avoiding the intermediate step makes the conversions faster, too.

    Reviewed-by: John Naylor
    Discussion: https://www.postgresql.org/message-id/b9e3167f-f84b-7aa4-5738-be578a4db924%40iki.fi
    1 parent b80e106 commit 6c55760

    File tree

    1 file changed

    +134
    -10
    lines changed

    1 file changed

    +134
    -10
    lines changed

    src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c

    Lines changed: 134 additions & 10 deletions
    Original file line numberDiff line numberDiff line change
    @@ -37,6 +37,8 @@ PG_FUNCTION_INFO_V1(mic_to_big5);
    3737
    * ----------
    3838
    */
    3939

    40+
    static void euc_tw2big5(const unsigned char *euc, unsigned char *p, int len);
    41+
    /* Big5 -> EUC_TW; the input parameter is named big5 to match the definition. */
    static void big52euc_tw(const unsigned char *big5, unsigned char *p, int len);
    4042
    static void big52mic(const unsigned char *big5, unsigned char *p, int len);
    4143
    static void mic2big5(const unsigned char *mic, unsigned char *p, int len);
    4244
    static void euc_tw2mic(const unsigned char *euc, unsigned char *p, int len);
    @@ -48,14 +50,10 @@ euc_tw_to_big5(PG_FUNCTION_ARGS)
    4850
    unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
    4951
    unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
    5052
    int len = PG_GETARG_INT32(4);
    51-
    unsigned char *buf;
    5253

    5354
    CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_TW, PG_BIG5);
    5455

    55-
    buf = palloc(len * ENCODING_GROWTH_RATE + 1);
    56-
    euc_tw2mic(src, buf, len);
    57-
    mic2big5(buf, dest, strlen((char *) buf));
    58-
    pfree(buf);
    56+
    euc_tw2big5(src, dest, len);
    5957

    6058
    PG_RETURN_VOID();
    6159
    }
    @@ -66,14 +64,10 @@ big5_to_euc_tw(PG_FUNCTION_ARGS)
    6664
    unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
    6765
    unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
    6866
    int len = PG_GETARG_INT32(4);
    69-
    unsigned char *buf;
    7067

    7168
    CHECK_ENCODING_CONVERSION_ARGS(PG_BIG5, PG_EUC_TW);
    7269

    73-
    buf = palloc(len * ENCODING_GROWTH_RATE + 1);
    74-
    big52mic(src, buf, len);
    75-
    mic2euc_tw(buf, dest, strlen((char *) buf));
    76-
    pfree(buf);
    70+
    big52euc_tw(src, dest, len);
    7771

    7872
    PG_RETURN_VOID();
    7973
    }
    @@ -134,6 +128,136 @@ mic_to_big5(PG_FUNCTION_ARGS)
    134128
    PG_RETURN_VOID();
    135129
    }
    136130

    131+
    132+
    /*
    133+
    * EUC_TW ---> Big5
    134+
    */
    135+
    static void
    136+
    euc_tw2big5(const unsigned char *euc, unsigned char *p, int len)
    137+
    {
    138+
    unsigned char c1;
    139+
    unsigned short big5buf,
    140+
    cnsBuf;
    141+
    unsigned char lc;
    142+
    int l;
    143+
    144+
    while (len > 0)
    145+
    {
    146+
    c1 = *euc;
    147+
    if (IS_HIGHBIT_SET(c1))
    148+
    {
    149+
    /* Verify and decode the next EUC_TW input character */
    150+
    l = pg_encoding_verifymbchar(PG_EUC_TW, (const char *) euc, len);
    151+
    if (l < 0)
    152+
    report_invalid_encoding(PG_EUC_TW,
    153+
    (const char *) euc, len);
    154+
    if (c1 == SS2)
    155+
    {
    156+
    c1 = euc[1]; /* plane No. */
    157+
    if (c1 == 0xa1)
    158+
    lc = LC_CNS11643_1;
    159+
    else if (c1 == 0xa2)
    160+
    lc = LC_CNS11643_2;
    161+
    else
    162+
    lc = c1 - 0xa3 + LC_CNS11643_3;
    163+
    cnsBuf = (euc[2] << 8) | euc[3];
    164+
    }
    165+
    else
    166+
    { /* CNS11643-1 */
    167+
    lc = LC_CNS11643_1;
    168+
    cnsBuf = (c1 << 8) | euc[1];
    169+
    }
    170+
    171+
    /* Write it out in Big5 */
    172+
    big5buf = CNStoBIG5(cnsBuf, lc);
    173+
    if (big5buf == 0)
    174+
    report_untranslatable_char(PG_EUC_TW, PG_BIG5,
    175+
    (const char *) euc, len);
    176+
    *p++ = (big5buf >> 8) & 0x00ff;
    177+
    *p++ = big5buf & 0x00ff;
    178+
    179+
    euc += l;
    180+
    len -= l;
    181+
    }
    182+
    else
    183+
    { /* should be ASCII */
    184+
    if (c1 == 0)
    185+
    report_invalid_encoding(PG_EUC_TW,
    186+
    (const char *) euc, len);
    187+
    *p++ = c1;
    188+
    euc++;
    189+
    len--;
    190+
    }
    191+
    }
    192+
    *p = '\0';
    193+
    }
    194+
    195+
    /*
    196+
    * Big5 ---> EUC_TW
    197+
    */
    198+
    static void
    199+
    big52euc_tw(const unsigned char *big5, unsigned char *p, int len)
    200+
    {
    201+
    unsigned short c1;
    202+
    unsigned short big5buf,
    203+
    cnsBuf;
    204+
    unsigned char lc;
    205+
    int l;
    206+
    207+
    while (len > 0)
    208+
    {
    209+
    /* Verify and decode the next Big5 input character */
    210+
    c1 = *big5;
    211+
    if (IS_HIGHBIT_SET(c1))
    212+
    {
    213+
    l = pg_encoding_verifymbchar(PG_BIG5, (const char *) big5, len);
    214+
    if (l < 0)
    215+
    report_invalid_encoding(PG_BIG5,
    216+
    (const char *) big5, len);
    217+
    big5buf = (c1 << 8) | big5[1];
    218+
    cnsBuf = BIG5toCNS(big5buf, &lc);
    219+
    220+
    if (lc == LC_CNS11643_1)
    221+
    {
    222+
    *p++ = (cnsBuf >> 8) & 0x00ff;
    223+
    *p++ = cnsBuf & 0x00ff;
    224+
    }
    225+
    else if (lc == LC_CNS11643_2)
    226+
    {
    227+
    *p++ = SS2;
    228+
    *p++ = 0xa2;
    229+
    *p++ = (cnsBuf >> 8) & 0x00ff;
    230+
    *p++ = cnsBuf & 0x00ff;
    231+
    }
    232+
    else if (lc >= LC_CNS11643_3 && lc <= LC_CNS11643_7)
    233+
    {
    234+
    *p++ = SS2;
    235+
    *p++ = lc - LC_CNS11643_3 + 0xa3;
    236+
    *p++ = (cnsBuf >> 8) & 0x00ff;
    237+
    *p++ = cnsBuf & 0x00ff;
    238+
    }
    239+
    else
    240+
    report_untranslatable_char(PG_BIG5, PG_EUC_TW,
    241+
    (const char *) big5, len);
    242+
    243+
    big5 += l;
    244+
    len -= l;
    245+
    }
    246+
    else
    247+
    {
    248+
    /* ASCII */
    249+
    if (c1 == 0)
    250+
    report_invalid_encoding(PG_BIG5,
    251+
    (const char *) big5, len);
    252+
    *p++ = c1;
    253+
    big5++;
    254+
    len--;
    255+
    continue;
    256+
    }
    257+
    }
    258+
    *p = '\0';
    259+
    }
    260+
    137261
    /*
    138262
    * EUC_TW ---> MIC
    139263
    */

    0 commit comments

    Comments
     (0)
    0