Back-patch addition of pg_wchar-to-multibyte conversion functionality. · sureandrew/postgres@6b30408 · GitHub
[go: up one dir, main page]

Skip to content

Commit 6b30408

Browse files
committed
Back-patch addition of pg_wchar-to-multibyte conversion functionality.
Back-patch of commits 72dd629, f6a05fd, and 60e9c22. This is needed to support fixing the regex prefix extraction bug in back branches.
1 parent 1590731 commit 6b30408

File tree

3 files changed

+293
-48
lines changed

3 files changed

+293
-48
lines changed

src/backend/utils/mb/mbutils.c

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -577,6 +577,28 @@ pg_encoding_mb2wchar_with_len(int encoding,
577577
return (*pg_wchar_table[encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
578578
}
579579

580+
/*
 * Convert a pg_wchar string to a multibyte string.
 *
 * Dispatches through the wchar2mb_with_len entry of pg_wchar_table selected
 * by DatabaseEncoding->encoding, passing pg_wchar_strlen(from) as the number
 * of wide characters to convert.  The return value is whatever that
 * converter returns.
 *
 * NOTE(review): "to" presumably has to be large enough for the converted
 * bytes plus a terminator -- confirm against the callers.
 */
581+
int
582+
pg_wchar2mb(const pg_wchar *from, char *to)
583+
{
584+
return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *)to, pg_wchar_strlen(from));
585+
}
586+
587+
/*
 * Convert at most "len" wide characters of a pg_wchar string to a multibyte
 * string.
 *
 * Dispatches through the wchar2mb_with_len entry of pg_wchar_table selected
 * by DatabaseEncoding->encoding and returns whatever that converter returns.
 *
 * NOTE(review): "to" presumably has to be large enough for the converted
 * bytes plus a terminator -- confirm against the callers.
 */
588+
int
589+
pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
590+
{
591+
return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *)to, len);
592+
}
593+
594+
/*
 * Convert at most "len" wide characters of a pg_wchar string to a multibyte
 * string, using the converter registered for the caller-supplied "encoding"
 * instead of the database encoding.
 *
 * "encoding" is used directly as an index into pg_wchar_table; no range
 * check and no check for an empty converter slot is made here.
 * NOTE(review): some rows of pg_wchar_table (PG_SJIS .. PG_SHIFT_JIS_2004)
 * start with 0 entries -- confirm callers never pass such an encoding.
 */
595+
int
596+
pg_encoding_wchar2mb_with_len(int encoding,
597+
const pg_wchar *from, char *to, int len)
598+
{
599+
return (*pg_wchar_table[encoding].wchar2mb_with_len) (from, (unsigned char *)to, len);
600+
}
601+
580602
/* returns the byte length of a multibyte word */
581603
int
582604
pg_mblen(const char *mbstr)

src/backend/utils/mb/wchar.c

Lines changed: 244 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -99,8 +99,7 @@ pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
9999
*to |= *from++;
100100
len -= 2;
101101
}
102-
else
103-
/* must be ASCII */
102+
else /* must be ASCII */
104103
{
105104
*to = *from++;
106105
len--;
@@ -339,6 +338,55 @@ pg_euctw_dsplen(const unsigned char *s)
339338
return len;
340339
}
341340

341+
/*
342+
* Convert a pg_wchar string to an EUC_* multibyte string.
343+
* The caller must allocate enough space for "to", including the trailing
* zero byte that is always written after the converted characters.
344+
* len: maximum number of pg_wchar elements of "from" to convert.
345+
* "from" need not be null terminated, but conversion also stops at the
* first zero element.
346+
* Returns the number of bytes written, not counting the trailing zero.
*/
347+
static int
348+
pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
349+
{
350+
int cnt = 0;
351+
352+
while (len > 0 && *from)
353+
{
354+
unsigned char c;
355+
356+
/*
 * The highest non-zero byte of the wide character decides how many
 * bytes are emitted: 4, 3, 2 or 1, most significant byte first.
 */
if ((c = (*from >> 24)))
357+
{
358+
*to++ = c;
359+
*to++ = (*from >> 16) & 0xff;
360+
*to++ = (*from >> 8) & 0xff;
361+
*to++ = *from & 0xff;
362+
cnt += 4;
363+
}
364+
else if ((c = (*from >> 16)))
365+
{
366+
*to++ = c;
367+
*to++ = (*from >> 8) & 0xff;
368+
*to++ = *from & 0xff;
369+
cnt += 3;
370+
}
371+
else if ((c = (*from >> 8)))
372+
{
373+
*to++ = c;
374+
*to++ = *from & 0xff;
375+
cnt += 2;
376+
}
377+
else
378+
{
379+
/* only the low byte is set: emit it as a single byte */
*to++ = *from;
380+
cnt++;
381+
}
382+
from++;
383+
len--;
384+
}
385+
/* terminate the output; the terminator is not included in cnt */
*to = 0;
386+
return cnt;
387+
}
388+
389+
342390
/*
343391
* JOHAB
344392
*/
@@ -419,6 +467,66 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
419467
return cnt;
420468
}
421469

470+
471+
/*
472+
* Map a Unicode code point to UTF-8. utf8string must have 4 bytes of
473+
* space allocated.
*
* Writes 1 byte for c <= 0x7F, 2 bytes for c <= 0x7FF, 3 bytes for
* c <= 0xFFFF and 4 bytes otherwise.  No terminating zero is written.
* In the 4-byte case only the low 21 bits of c are encoded; higher bits
* are masked away.
* Returns utf8string.
474+
*/
475+
static unsigned char *
476+
unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
477+
{
478+
if (c <= 0x7F)
479+
{
480+
utf8string[0] = c;
481+
}
482+
else if (c <= 0x7FF)
483+
{
484+
utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
485+
utf8string[1] = 0x80 | (c & 0x3F);
486+
}
487+
else if (c <= 0xFFFF)
488+
{
489+
utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
490+
utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
491+
utf8string[2] = 0x80 | (c & 0x3F);
492+
}
493+
else
494+
{
495+
utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
496+
utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
497+
utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
498+
utf8string[3] = 0x80 | (c & 0x3F);
499+
}
500+
501+
return utf8string;
502+
}
503+
504+
/*
505+
* Trivial conversion from pg_wchar to UTF-8: each wide character is
* passed to unicode_to_utf8() as a code point.
506+
* The caller should allocate enough space for "to", including the
* trailing zero byte that is always written.
507+
* len: maximum number of pg_wchar elements of "from" to convert.
508+
* "from" need not be null terminated, but conversion also stops at the
* first zero element.
509+
* Returns the number of bytes written, not counting the trailing zero.
*/
510+
static int
511+
pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
512+
{
513+
int cnt = 0;
514+
515+
while (len > 0 && *from)
516+
{
517+
int char_len;
518+
519+
unicode_to_utf8(*from, to);
520+
/* length of the sequence just written, taken from the bytes at "to" */
char_len = pg_utf_mblen(to);
521+
cnt += char_len;
522+
to += char_len;
523+
from++;
524+
len--;
525+
}
526+
/* terminate the output; the terminator is not included in cnt */
*to = 0;
527+
return cnt;
528+
}
529+
422530
/*
423531
* Return the byte length of a UTF8 character pointed to by s
424532
*
@@ -682,6 +790,77 @@ pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
682790
return cnt;
683791
}
684792

793+
/*
794+
* Convert a pg_wchar string to mule internal code.
795+
* The caller should allocate enough space for "to", including the
* trailing zero byte that is always written.
796+
* len: maximum number of pg_wchar elements of "from" to convert.
797+
* "from" need not be null terminated, but conversion also stops at the
* first zero element.
798+
* Returns the number of bytes written, not counting the trailing zero.
*/
799+
static int
800+
pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
801+
{
802+
int cnt = 0;
803+
804+
while (len > 0 && *from)
805+
{
806+
unsigned char lb;
807+
808+
/*
 * Bits 16..23 of the wide character select the output form.
 * NOTE(review): "lb" presumably means the mule leading byte
 * (charset id) -- confirm against the IS_LC* macro definitions.
 */
lb = (*from >> 16) & 0xff;
809+
if (IS_LC1(lb))
810+
{
811+
/* 2 bytes: lb, low byte */
*to++ = lb;
812+
*to++ = *from & 0xff;
813+
cnt += 2;
814+
}
815+
else if (IS_LC2(lb))
816+
{
817+
/* 3 bytes: lb, then the two low bytes */
*to++ = lb;
818+
*to++ = (*from >> 8) & 0xff;
819+
*to++ = *from & 0xff;
820+
cnt += 3;
821+
}
822+
else if (IS_LCPRV1_A_RANGE(lb))
823+
{
824+
/* 3 bytes: LCPRV1_A prefix, lb, low byte */
*to++ = LCPRV1_A;
825+
*to++ = lb;
826+
*to++ = *from & 0xff;
827+
cnt += 3;
828+
}
829+
else if (IS_LCPRV1_B_RANGE(lb))
830+
{
831+
/* 3 bytes: LCPRV1_B prefix, lb, low byte */
*to++ = LCPRV1_B;
832+
*to++ = lb;
833+
*to++ = *from & 0xff;
834+
cnt += 3;
835+
}
836+
else if (IS_LCPRV2_A_RANGE(lb))
837+
{
838+
/* 4 bytes: LCPRV2_A prefix, lb, then the two low bytes */
*to++ = LCPRV2_A;
839+
*to++ = lb;
840+
*to++ = (*from >> 8) & 0xff;
841+
*to++ = *from & 0xff;
842+
cnt += 4;
843+
}
844+
else if (IS_LCPRV2_B_RANGE(lb))
845+
{
846+
/* 4 bytes: LCPRV2_B prefix, lb, then the two low bytes */
*to++ = LCPRV2_B;
847+
*to++ = lb;
848+
*to++ = (*from >> 8) & 0xff;
849+
*to++ = *from & 0xff;
850+
cnt += 4;
851+
}
852+
else
853+
{
854+
/* no range matched: emit only the low byte */
*to++ = *from & 0xff;
855+
cnt += 1;
856+
}
857+
from++;
858+
len--;
859+
}
860+
/* terminate the output; the terminator is not included in cnt */
*to = 0;
861+
return cnt;
862+
}
863+
685864
int
686865
pg_mule_mblen(const unsigned char *s)
687866
{
@@ -737,6 +916,28 @@ pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
737916
return cnt;
738917
}
739918

919+
/*
920+
* Trivial conversion from pg_wchar to single byte encoding. Just ignores
921+
* high bits: each wide character is stored into one unsigned char.
922+
* The caller should allocate enough space for "to", including the
* trailing zero byte that is always written.
923+
* len: maximum number of pg_wchar elements of "from" to convert.
924+
* "from" need not be null terminated, but conversion also stops at the
* first zero element.
925+
* Returns the number of bytes written, not counting the trailing zero.
*/
926+
static int
927+
pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
928+
{
929+
int cnt = 0;
930+
931+
while (len > 0 && *from)
932+
{
933+
/* the assignment to unsigned char drops everything above the low byte */
*to++ = *from++;
934+
len--;
935+
cnt++;
936+
}
937+
/* terminate the output; the terminator is not included in cnt */
*to = 0;
938+
return cnt;
939+
}
940+
740941
static int
741942
pg_latin1_mblen(const unsigned char *s)
742943
{
@@ -1304,47 +1505,47 @@ pg_utf8_islegal(const unsigned char *source, int length)
13041505
*-------------------------------------------------------------------
13051506
*/
13061507
pg_wchar_tbl pg_wchar_table[] = {
1307-
{pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* 0; PG_SQL_ASCII */
1308-
{pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* 1; PG_EUC_JP */
1309-
{pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* 2; PG_EUC_CN */
1310-
{pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* 3; PG_EUC_KR */
1311-
{pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4}, /* 4; PG_EUC_TW */
1312-
{pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* 5; PG_EUC_JIS_2004 */
1313-
{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* 6; PG_UTF8 */
1314-
{pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4}, /* 7; PG_MULE_INTERNAL */
1315-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 8; PG_LATIN1 */
1316-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 9; PG_LATIN2 */
1317-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 10; PG_LATIN3 */
1318-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 11; PG_LATIN4 */
1319-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 12; PG_LATIN5 */
1320-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 13; PG_LATIN6 */
1321-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 14; PG_LATIN7 */
1322-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 15; PG_LATIN8 */
1323-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 16; PG_LATIN9 */
1324-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 17; PG_LATIN10 */
1325-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 18; PG_WIN1256 */
1326-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 19; PG_WIN1258 */
1327-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 20; PG_WIN866 */
1328-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 21; PG_WIN874 */
1329-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 22; PG_KOI8R */
1330-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 23; PG_WIN1251 */
1331-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 24; PG_WIN1252 */
1332-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 25; ISO-8859-5 */
1333-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 26; ISO-8859-6 */
1334-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 27; ISO-8859-7 */
1335-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 28; ISO-8859-8 */
1336-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 29; PG_WIN1250 */
1337-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 30; PG_WIN1253 */
1338-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 31; PG_WIN1254 */
1339-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 32; PG_WIN1255 */
1340-
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* 33; PG_WIN1257 */
1341-
{0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* 34; PG_SJIS */
1342-
{0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* 35; PG_BIG5 */
1343-
{0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* 36; PG_GBK */
1344-
{0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* 37; PG_UHC */
1345-
{0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* 38; PG_GB18030 */
1346-
{0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* 39; PG_JOHAB */
1347-
{0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* 40; PG_SHIFT_JIS_2004 */
1508+
{pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */
1509+
{pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JP */
1510+
{pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2}, /* PG_EUC_CN */
1511+
{pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3}, /* PG_EUC_KR */
1512+
{pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4}, /* PG_EUC_TW */
1513+
{pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3}, /* PG_EUC_JIS_2004 */
1514+
{pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* PG_UTF8 */
1515+
{pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4}, /* PG_MULE_INTERNAL */
1516+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */
1517+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */
1518+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */
1519+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */
1520+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */
1521+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */
1522+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */
1523+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */
1524+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */
1525+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */
1526+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */
1527+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */
1528+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */
1529+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */
1530+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */
1531+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */
1532+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */
1533+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */
1534+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */
1535+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */
1536+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */
1537+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */
1538+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */
1539+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */
1540+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */
1541+
{pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */
1542+
{0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */
1543+
{0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */
1544+
{0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */
1545+
{0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* PG_UHC */
1546+
{0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */
1547+
{0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */
1548+
{0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */
13481549
};
13491550

13501551
/* returns the byte length of a word for mule internal code */

0 commit comments

Comments
 (0)
0