[3.9] bpo-43667: Fix broken Unicode encoding in non-UTF locales on So… · python/cpython@d3cc689 · GitHub
[go: up one dir, main page]

Skip to content

Commit d3cc689

Browse files
authored
[3.9] bpo-43667: Fix broken Unicode encoding in non-UTF locales on Solaris (GH-25096) (GH-25847)
(cherry picked from commit 9032cf5) Co-authored-by: Jakub Kulík <Kulikjak@gmail.com>
1 parent 0593ae8 commit d3cc689

File tree

6 files changed

+194
-0
lines changed

6 files changed

+194
-0
lines changed

Include/internal/pycore_fileutils.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,18 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(
4848
PyObject **decimal_point,
4949
PyObject **thousands_sep);
5050

51+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
/* Helpers for platforms where the internal form of wchar_t is not made of
   Unicode code points in non-Unicode locales. */

/* Return non-zero if the current locale uses a non-Unicode wchar_t form. */
extern int _Py_LocaleUsesNonUnicodeWchar(void);

/* Convert `size` native wchar_t units to UCS-4.  Return a newly allocated
   buffer (release it with PyMem_Free()), or NULL with an exception set. */
extern wchar_t* _Py_DecodeNonUnicodeWchar(
    const wchar_t* native,
    Py_ssize_t size);

/* Convert `size` UCS-4 units to the native wchar_t form, in place.
   Return 0 on success, or -1 with an exception set. */
extern int _Py_EncodeNonUnicodeWchar_InPlace(
    wchar_t* unicode,
    Py_ssize_t size);
#endif
62+
5163
#ifdef __cplusplus
5264
}
5365
#endif

Objects/unicodeobject.c

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
5656
#include <windows.h>
5757
#endif
5858

59+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
60+
#include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
61+
#endif
62+
5963
/* Uncomment to display statistics on interned strings at exit when
6064
using Valgrind or Insecure++. */
6165
/* #define INTERNED_STATS 1 */
@@ -2211,6 +2215,20 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
22112215
if (size == 0)
22122216
_Py_RETURN_UNICODE_EMPTY();
22132217

2218+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2219+
/* Oracle Solaris uses non-Unicode internal wchar_t form for
2220+
non-Unicode locales and hence needs conversion to UCS-4 first. */
2221+
if (_Py_LocaleUsesNonUnicodeWchar()) {
2222+
wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2223+
if (!converted) {
2224+
return NULL;
2225+
}
2226+
PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2227+
PyMem_Free(converted);
2228+
return unicode;
2229+
}
2230+
#endif
2231+
22142232
/* Single character Unicode objects in the Latin-1 range are
22152233
shared when using this constructor */
22162234
if (size == 1 && (Py_UCS4)*u < 256)
@@ -3223,6 +3241,17 @@ PyUnicode_AsWideChar(PyObject *unicode,
32233241
res = size;
32243242
}
32253243
unicode_copy_as_widechar(unicode, w, size);
3244+
3245+
#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3246+
/* Oracle Solaris uses non-Unicode internal wchar_t form for
3247+
non-Unicode locales and hence needs conversion first. */
3248+
if (_Py_LocaleUsesNonUnicodeWchar()) {
3249+
if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3250+
return -1;
3251+
}
3252+
}
3253+
#endif
3254+
32263255
return res;
32273256
}
32283257

@@ -3249,6 +3278,17 @@ PyUnicode_AsWideCharString(PyObject *unicode,
32493278
return NULL;
32503279
}
32513280
unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3281+
3282+
#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3283+
/* Oracle Solaris uses non-Unicode internal wchar_t form for
3284+
non-Unicode locales and hence needs conversion first. */
3285+
if (_Py_LocaleUsesNonUnicodeWchar()) {
3286+
if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3287+
return NULL;
3288+
}
3289+
}
3290+
#endif
3291+
32523292
if (size != NULL) {
32533293
*size = buflen;
32543294
}

Python/fileutils.c

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ extern int winerror_to_errno(int);
1717
#include <sys/ioctl.h>
1818
#endif
1919

20+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
21+
#include <iconv.h>
22+
#endif
23+
2024
#ifdef HAVE_FCNTL_H
2125
#include <fcntl.h>
2226
#endif /* HAVE_FCNTL_H */
@@ -96,6 +100,12 @@ _Py_device_encoding(int fd)
96100
static size_t
97101
is_valid_wide_char(wchar_t ch)
98102
{
103+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
104+
/* Oracle Solaris doesn't use Unicode code points as wchar_t encoding
105+
for non-Unicode locales, which makes values higher than MAX_UNICODE
106+
possibly valid. */
107+
return 1;
108+
#endif
99109
if (Py_UNICODE_IS_SURROGATE(ch)) {
100110
// Reject lone surrogate characters
101111
return 0;
@@ -859,6 +869,102 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str,
859869
current_locale, errors);
860870
}
861871

872+
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
873+
874+
/* Tell whether the current locale uses a non-Unicode internal wchar_t form.

   Return 1 if the codeset reported by nl_langinfo(CODESET) is neither
   "UTF-8" nor "646"; return 0 otherwise, including when no codeset is
   reported. */
int
_Py_LocaleUsesNonUnicodeWchar(void)
{
    /* Oracle Solaris uses a non-Unicode internal wchar_t form for
       non-Unicode locales, so such strings need converting first. */
    const char *codeset = nl_langinfo(CODESET);
    if (codeset == NULL) {
        return 0;
    }
    if (strcmp(codeset, "UTF-8") == 0) {
        return 0;
    }
    /* "646" names the ISO/IEC 646 standard, i.e. the ASCII encoding. */
    if (strcmp(codeset, "646") == 0) {
        return 0;
    }
    return 1;
}
887+
888+
static wchar_t *
889+
_Py_ConvertWCharForm(const wchar_t *source, Py_ssize_t size,
890+
const char *tocode, const char *fromcode)
891+
{
892+
Py_BUILD_ASSERT(sizeof(wchar_t) == 4);
893+
894+
/* Ensure we won't overflow the size. */
895+
if (size > (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t))) {
896+
PyErr_NoMemory();
897+
return NULL;
898+
}
899+
900+
/* the string doesn't have to be NULL terminated */
901+
wchar_t* target = PyMem_Malloc(size * sizeof(wchar_t));
902+
if (target == NULL) {
903+
PyErr_NoMemory();
904+
return NULL;
905+
}
906+
907+
iconv_t cd = iconv_open(tocode, fromcode);
908+
if (cd == (iconv_t)-1) {
909+
PyErr_Format(PyExc_ValueError, "iconv_open() failed");
910+
PyMem_Free(target);
911+
return NULL;
912+
}
913+
914+
char *inbuf = (char *) source;
915+
char *outbuf = (char *) target;
916+
size_t inbytesleft = sizeof(wchar_t) * size;
917+
size_t outbytesleft = inbytesleft;
918+
919+
size_t ret = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
920+
if (ret == DECODE_ERROR) {
921+
PyErr_Format(PyExc_ValueError, "iconv() failed");
922+
PyMem_Free(target);
923+
iconv_close(cd);
924+
return NULL;
925+
}
926+
927+
iconv_close(cd);
928+
return target;
929+
}
930+
931+
/* Convert a wide character string to the UCS-4 encoded string. This
932+
is necessary on systems where internal form of wchar_t are not Unicode
933+
code points (e.g. Oracle Solaris).
934+
935+
Return a pointer to a newly allocated string, use PyMem_Free() to free
936+
the memory. Return NULL and raise exception on conversion or memory
937+
allocation error. */
938+
wchar_t *
939+
_Py_DecodeNonUnicodeWchar(const wchar_t *native, Py_ssize_t size)
940+
{
941+
return _Py_ConvertWCharForm(native, size, "UCS-4-INTERNAL", "wchar_t");
942+
}
943+
944+
/* Convert a UCS-4 encoded string to native wide character string. This
945+
is necessary on systems where internal form of wchar_t are not Unicode
946+
code points (e.g. Oracle Solaris).
947+
948+
The conversion is done in place. This can be done because both wchar_t
949+
and UCS-4 use 4-byte encoding, and one wchar_t symbol always correspond
950+
to a single UCS-4 symbol and vice versa. (This is true for Oracle Solaris,
951+
which is currently the only system using these functions; it doesn't have
952+
to be for other systems).
953+
954+
Return 0 on success. Return -1 and raise exception on conversion
955+
or memory allocation error. */
956+
int
957+
_Py_EncodeNonUnicodeWchar_InPlace(wchar_t *unicode, Py_ssize_t size)
958+
{
959+
wchar_t* result = _Py_ConvertWCharForm(unicode, size, "wchar_t", "UCS-4-INTERNAL");
960+
if (!result) {
961+
return -1;
962+
}
963+
memcpy(unicode, result, size * sizeof(wchar_t));
964+
PyMem_Free(result);
965+
return 0;
966+
}
967+
#endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */
862968

863969
#ifdef MS_WINDOWS
864970
static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */

configure

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15123,6 +15123,22 @@ else
1512315123
$as_echo "no" >&6; }
1512415124
fi
1512515125

15126+
case $ac_sys_system/$ac_sys_release in
15127+
SunOS/*)
15128+
if test -f /etc/os-release; then
15129+
OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
15130+
if test "x$OS_NAME" = "xOracle Solaris"; then
15131+
# bpo-43667: In Oracle Solaris, the internal form of wchar_t in
15132+
# non-Unicode locales is not Unicode and hence cannot be used directly.
15133+
# https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
15134+
15135+
$as_echo "#define HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION 1" >>confdefs.h
15136+
15137+
fi
15138+
fi
15139+
;;
15140+
esac
15141+
1512615142
# check for endianness
1512715143
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
1512815144
$as_echo_n "checking whether byte ordering is bigendian... " >&6; }

configure.ac

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4759,6 +4759,22 @@ else
47594759
AC_MSG_RESULT(no)
47604760
fi
47614761

4762+
case $ac_sys_system/$ac_sys_release in
4763+
SunOS/*)
4764+
if test -f /etc/os-release; then
4765+
OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
4766+
if test "x$OS_NAME" = "xOracle Solaris"; then
4767+
# bpo-43667: In Oracle Solaris, the internal form of wchar_t in
4768+
# non-Unicode locales is not Unicode and hence cannot be used directly.
4769+
# https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
4770+
AC_DEFINE(HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION, 1,
4771+
[Define if the internal form of wchar_t in non-Unicode locales
4772+
is not Unicode.])
4773+
fi
4774+
fi
4775+
;;
4776+
esac
4777+
47624778
# check for endianness
47634779
AC_C_BIGENDIAN
47644780

pyconfig.h.in

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -733,6 +733,10 @@
733733
/* Define to 1 if you have the `nice' function. */
734734
#undef HAVE_NICE
735735

736+
/* Define if the internal form of wchar_t in non-Unicode locales is not
737+
Unicode. */
738+
#undef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
739+
736740
/* Define to 1 if you have the `openat' function. */
737741
#undef HAVE_OPENAT
738742

0 commit comments

Comments
 (0)
0