@@ -99,8 +99,7 @@ pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
99
99
* to |= * from ++ ;
100
100
len -= 2 ;
101
101
}
102
- else
103
- /* must be ASCII */
102
+ else /* must be ASCII */
104
103
{
105
104
* to = * from ++ ;
106
105
len -- ;
@@ -339,6 +338,55 @@ pg_euctw_dsplen(const unsigned char *s)
339
338
return len ;
340
339
}
341
340
341
+ /*
342
+ * Convert pg_wchar to EUC_* encoding.
343
+ * caller must allocate enough space for "to", including a trailing zero!
344
+ * len: length of from.
345
+ * "from" not necessarily null terminated.
346
+ */
347
+ static int
348
+ pg_wchar2euc_with_len (const pg_wchar * from , unsigned char * to , int len )
349
+ {
350
+ int cnt = 0 ;
351
+
352
+ while (len > 0 && * from )
353
+ {
354
+ unsigned char c ;
355
+
356
+ if ((c = (* from >> 24 )))
357
+ {
358
+ * to ++ = c ;
359
+ * to ++ = (* from >> 16 ) & 0xff ;
360
+ * to ++ = (* from >> 8 ) & 0xff ;
361
+ * to ++ = * from & 0xff ;
362
+ cnt += 4 ;
363
+ }
364
+ else if ((c = (* from >> 16 )))
365
+ {
366
+ * to ++ = c ;
367
+ * to ++ = (* from >> 8 ) & 0xff ;
368
+ * to ++ = * from & 0xff ;
369
+ cnt += 3 ;
370
+ }
371
+ else if ((c = (* from >> 8 )))
372
+ {
373
+ * to ++ = c ;
374
+ * to ++ = * from & 0xff ;
375
+ cnt += 2 ;
376
+ }
377
+ else
378
+ {
379
+ * to ++ = * from ;
380
+ cnt ++ ;
381
+ }
382
+ from ++ ;
383
+ len -- ;
384
+ }
385
+ * to = 0 ;
386
+ return cnt ;
387
+ }
388
+
389
+
342
390
/*
343
391
* JOHAB
344
392
*/
@@ -419,6 +467,66 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
419
467
return cnt ;
420
468
}
421
469
470
+
471
+ /*
472
+ * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of
473
+ * space allocated.
474
+ */
475
+ static unsigned char *
476
+ unicode_to_utf8 (pg_wchar c , unsigned char * utf8string )
477
+ {
478
+ if (c <= 0x7F )
479
+ {
480
+ utf8string [0 ] = c ;
481
+ }
482
+ else if (c <= 0x7FF )
483
+ {
484
+ utf8string [0 ] = 0xC0 | ((c >> 6 ) & 0x1F );
485
+ utf8string [1 ] = 0x80 | (c & 0x3F );
486
+ }
487
+ else if (c <= 0xFFFF )
488
+ {
489
+ utf8string [0 ] = 0xE0 | ((c >> 12 ) & 0x0F );
490
+ utf8string [1 ] = 0x80 | ((c >> 6 ) & 0x3F );
491
+ utf8string [2 ] = 0x80 | (c & 0x3F );
492
+ }
493
+ else
494
+ {
495
+ utf8string [0 ] = 0xF0 | ((c >> 18 ) & 0x07 );
496
+ utf8string [1 ] = 0x80 | ((c >> 12 ) & 0x3F );
497
+ utf8string [2 ] = 0x80 | ((c >> 6 ) & 0x3F );
498
+ utf8string [3 ] = 0x80 | (c & 0x3F );
499
+ }
500
+
501
+ return utf8string ;
502
+ }
503
+
504
+ /*
505
+ * Trivial conversion from pg_wchar to UTF-8.
506
+ * caller should allocate enough space for "to"
507
+ * len: length of from.
508
+ * "from" not necessarily null terminated.
509
+ */
510
+ static int
511
+ pg_wchar2utf_with_len (const pg_wchar * from , unsigned char * to , int len )
512
+ {
513
+ int cnt = 0 ;
514
+
515
+ while (len > 0 && * from )
516
+ {
517
+ int char_len ;
518
+
519
+ unicode_to_utf8 (* from , to );
520
+ char_len = pg_utf_mblen (to );
521
+ cnt += char_len ;
522
+ to += char_len ;
523
+ from ++ ;
524
+ len -- ;
525
+ }
526
+ * to = 0 ;
527
+ return cnt ;
528
+ }
529
+
422
530
/*
423
531
* Return the byte length of a UTF8 character pointed to by s
424
532
*
@@ -682,6 +790,77 @@ pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
682
790
return cnt ;
683
791
}
684
792
793
+ /*
794
+ * convert pg_wchar to mule internal code
795
+ * caller should allocate enough space for "to"
796
+ * len: length of from.
797
+ * "from" not necessarily null terminated.
798
+ */
799
+ static int
800
+ pg_wchar2mule_with_len (const pg_wchar * from , unsigned char * to , int len )
801
+ {
802
+ int cnt = 0 ;
803
+
804
+ while (len > 0 && * from )
805
+ {
806
+ unsigned char lb ;
807
+
808
+ lb = (* from >> 16 ) & 0xff ;
809
+ if (IS_LC1 (lb ))
810
+ {
811
+ * to ++ = lb ;
812
+ * to ++ = * from & 0xff ;
813
+ cnt += 2 ;
814
+ }
815
+ else if (IS_LC2 (lb ))
816
+ {
817
+ * to ++ = lb ;
818
+ * to ++ = (* from >> 8 ) & 0xff ;
819
+ * to ++ = * from & 0xff ;
820
+ cnt += 3 ;
821
+ }
822
+ else if (IS_LCPRV1_A_RANGE (lb ))
823
+ {
824
+ * to ++ = LCPRV1_A ;
825
+ * to ++ = lb ;
826
+ * to ++ = * from & 0xff ;
827
+ cnt += 3 ;
828
+ }
829
+ else if (IS_LCPRV1_B_RANGE (lb ))
830
+ {
831
+ * to ++ = LCPRV1_B ;
832
+ * to ++ = lb ;
833
+ * to ++ = * from & 0xff ;
834
+ cnt += 3 ;
835
+ }
836
+ else if (IS_LCPRV2_A_RANGE (lb ))
837
+ {
838
+ * to ++ = LCPRV2_A ;
839
+ * to ++ = lb ;
840
+ * to ++ = (* from >> 8 ) & 0xff ;
841
+ * to ++ = * from & 0xff ;
842
+ cnt += 4 ;
843
+ }
844
+ else if (IS_LCPRV2_B_RANGE (lb ))
845
+ {
846
+ * to ++ = LCPRV2_B ;
847
+ * to ++ = lb ;
848
+ * to ++ = (* from >> 8 ) & 0xff ;
849
+ * to ++ = * from & 0xff ;
850
+ cnt += 4 ;
851
+ }
852
+ else
853
+ {
854
+ * to ++ = * from & 0xff ;
855
+ cnt += 1 ;
856
+ }
857
+ from ++ ;
858
+ len -- ;
859
+ }
860
+ * to = 0 ;
861
+ return cnt ;
862
+ }
863
+
685
864
int
686
865
pg_mule_mblen (const unsigned char * s )
687
866
{
@@ -737,6 +916,28 @@ pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
737
916
return cnt ;
738
917
}
739
918
919
+ /*
920
+ * Trivial conversion from pg_wchar to single byte encoding. Just ignores
921
+ * high bits.
922
+ * caller should allocate enough space for "to"
923
+ * len: length of from.
924
+ * "from" not necessarily null terminated.
925
+ */
926
+ static int
927
+ pg_wchar2single_with_len (const pg_wchar * from , unsigned char * to , int len )
928
+ {
929
+ int cnt = 0 ;
930
+
931
+ while (len > 0 && * from )
932
+ {
933
+ * to ++ = * from ++ ;
934
+ len -- ;
935
+ cnt ++ ;
936
+ }
937
+ * to = 0 ;
938
+ return cnt ;
939
+ }
940
+
740
941
static int
741
942
pg_latin1_mblen (const unsigned char * s )
742
943
{
@@ -1304,47 +1505,47 @@ pg_utf8_islegal(const unsigned char *source, int length)
1304
1505
*-------------------------------------------------------------------
1305
1506
*/
1306
1507
pg_wchar_tbl pg_wchar_table [] = {
1307
- {pg_ascii2wchar_with_len , pg_ascii_mblen , pg_ascii_dsplen , pg_ascii_verifier , 1 }, /* 0; PG_SQL_ASCII */
1308
- {pg_eucjp2wchar_with_len , pg_eucjp_mblen , pg_eucjp_dsplen , pg_eucjp_verifier , 3 }, /* 1; PG_EUC_JP */
1309
- {pg_euccn2wchar_with_len , pg_euccn_mblen , pg_euccn_dsplen , pg_euccn_verifier , 2 }, /* 2; PG_EUC_CN */
1310
- {pg_euckr2wchar_with_len , pg_euckr_mblen , pg_euckr_dsplen , pg_euckr_verifier , 3 }, /* 3; PG_EUC_KR */
1311
- {pg_euctw2wchar_with_len , pg_euctw_mblen , pg_euctw_dsplen , pg_euctw_verifier , 4 }, /* 4; PG_EUC_TW */
1312
- {pg_eucjp2wchar_with_len , pg_eucjp_mblen , pg_eucjp_dsplen , pg_eucjp_verifier , 3 }, /* 5; PG_EUC_JIS_2004 */
1313
- {pg_utf2wchar_with_len , pg_utf_mblen , pg_utf_dsplen , pg_utf8_verifier , 4 }, /* 6; PG_UTF8 */
1314
- {pg_mule2wchar_with_len , pg_mule_mblen , pg_mule_dsplen , pg_mule_verifier , 4 }, /* 7; PG_MULE_INTERNAL */
1315
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 8; PG_LATIN1 */
1316
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 9; PG_LATIN2 */
1317
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 10; PG_LATIN3 */
1318
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 11; PG_LATIN4 */
1319
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 12; PG_LATIN5 */
1320
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 13; PG_LATIN6 */
1321
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 14; PG_LATIN7 */
1322
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 15; PG_LATIN8 */
1323
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 16; PG_LATIN9 */
1324
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 17; PG_LATIN10 */
1325
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 18; PG_WIN1256 */
1326
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 19; PG_WIN1258 */
1327
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 20; PG_WIN866 */
1328
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 21; PG_WIN874 */
1329
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 22; PG_KOI8R */
1330
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 23; PG_WIN1251 */
1331
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 24; PG_WIN1252 */
1332
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 25; ISO-8859-5 */
1333
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 26; ISO-8859-6 */
1334
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 27; ISO-8859-7 */
1335
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 28; ISO-8859-8 */
1336
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 29; PG_WIN1250 */
1337
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 30; PG_WIN1253 */
1338
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 31; PG_WIN1254 */
1339
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 32; PG_WIN1255 */
1340
- {pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* 33; PG_WIN1257 */
1341
- {0 , pg_sjis_mblen , pg_sjis_dsplen , pg_sjis_verifier , 2 }, /* 34; PG_SJIS */
1342
- {0 , pg_big5_mblen , pg_big5_dsplen , pg_big5_verifier , 2 }, /* 35; PG_BIG5 */
1343
- {0 , pg_gbk_mblen , pg_gbk_dsplen , pg_gbk_verifier , 2 }, /* 36; PG_GBK */
1344
- {0 , pg_uhc_mblen , pg_uhc_dsplen , pg_uhc_verifier , 2 }, /* 37; PG_UHC */
1345
- {0 , pg_gb18030_mblen , pg_gb18030_dsplen , pg_gb18030_verifier , 4 }, /* 38; PG_GB18030 */
1346
- {0 , pg_johab_mblen , pg_johab_dsplen , pg_johab_verifier , 3 }, /* 39; PG_JOHAB */
1347
- {0 , pg_sjis_mblen , pg_sjis_dsplen , pg_sjis_verifier , 2 } /* 40; PG_SHIFT_JIS_2004 */
1508
+ {pg_ascii2wchar_with_len , pg_wchar2single_with_len , pg_ascii_mblen , pg_ascii_dsplen , pg_ascii_verifier , 1 }, /* PG_SQL_ASCII */
1509
+ {pg_eucjp2wchar_with_len , pg_wchar2euc_with_len , pg_eucjp_mblen , pg_eucjp_dsplen , pg_eucjp_verifier , 3 }, /* PG_EUC_JP */
1510
+ {pg_euccn2wchar_with_len , pg_wchar2euc_with_len , pg_euccn_mblen , pg_euccn_dsplen , pg_euccn_verifier , 2 }, /* PG_EUC_CN */
1511
+ {pg_euckr2wchar_with_len , pg_wchar2euc_with_len , pg_euckr_mblen , pg_euckr_dsplen , pg_euckr_verifier , 3 }, /* PG_EUC_KR */
1512
+ {pg_euctw2wchar_with_len , pg_wchar2euc_with_len , pg_euctw_mblen , pg_euctw_dsplen , pg_euctw_verifier , 4 }, /* PG_EUC_TW */
1513
+ {pg_eucjp2wchar_with_len , pg_wchar2euc_with_len , pg_eucjp_mblen , pg_eucjp_dsplen , pg_eucjp_verifier , 3 }, /* PG_EUC_JIS_2004 */
1514
+ {pg_utf2wchar_with_len , pg_wchar2utf_with_len , pg_utf_mblen , pg_utf_dsplen , pg_utf8_verifier , 4 }, /* PG_UTF8 */
1515
+ {pg_mule2wchar_with_len , pg_wchar2mule_with_len , pg_mule_mblen , pg_mule_dsplen , pg_mule_verifier , 4 }, /* PG_MULE_INTERNAL */
1516
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_LATIN1 */
1517
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_LATIN2 */
1518
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_LATIN3 */
1519
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_LATIN4 */
1520
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_LATIN5 */
1521
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_LATIN6 */
1522
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_LATIN7 */
1523
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_LATIN8 */
1524
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_LATIN9 */
1525
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_LATIN10 */
1526
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_WIN1256 */
1527
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_WIN1258 */
1528
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_WIN866 */
1529
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_WIN874 */
1530
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_KOI8R */
1531
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_WIN1251 */
1532
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_WIN1252 */
1533
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* ISO-8859-5 */
1534
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* ISO-8859-6 */
1535
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* ISO-8859-7 */
1536
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* ISO-8859-8 */
1537
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_WIN1250 */
1538
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_WIN1253 */
1539
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_WIN1254 */
1540
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_WIN1255 */
1541
+ {pg_latin12wchar_with_len , pg_wchar2single_with_len , pg_latin1_mblen , pg_latin1_dsplen , pg_latin1_verifier , 1 }, /* PG_WIN1257 */
1542
+ {0 , 0 , pg_sjis_mblen , pg_sjis_dsplen , pg_sjis_verifier , 2 }, /* PG_SJIS */
1543
+ {0 , 0 , pg_big5_mblen , pg_big5_dsplen , pg_big5_verifier , 2 }, /* PG_BIG5 */
1544
+ {0 , 0 , pg_gbk_mblen , pg_gbk_dsplen , pg_gbk_verifier , 2 }, /* PG_GBK */
1545
+ {0 , 0 , pg_uhc_mblen , pg_uhc_dsplen , pg_uhc_verifier , 2 }, /* PG_UHC */
1546
+ {0 , 0 , pg_gb18030_mblen , pg_gb18030_dsplen , pg_gb18030_verifier , 4 }, /* PG_GB18030 */
1547
+ {0 , 0 , pg_johab_mblen , pg_johab_dsplen , pg_johab_verifier , 3 }, /* PG_JOHAB */
1548
+ {0 , 0 , pg_sjis_mblen , pg_sjis_dsplen , pg_sjis_verifier , 2 } /* PG_SHIFT_JIS_2004 */
1348
1549
};
1349
1550
1350
1551
/* returns the byte length of a word for mule internal code */
0 commit comments