From e967f121442f23a43bae9955fcd6172796746e8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Kul=C3=ADk?=
Date: Fri, 30 Apr 2021 15:21:42 +0200
Subject: [PATCH] [3.9] bpo-43667: Fix broken Unicode encoding in non-UTF
 locales on Solaris (GH-25096). (cherry picked from commit
 9032cf5cb1e33c0349089cfb0f6bf11ed3c30e86)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Jakub Kulík
---
Review notes (everything between the "---" line and the first "diff --git"
is ignored by git-am / git-apply):
* The line structure was rebuilt from a whitespace-collapsed copy of this
  patch.  Hunk sizes were re-counted and match the "@@" headers, but the
  indentation of context lines is presumed to follow the usual 4-space
  CPython style -- TODO confirm against the 3.9 tree before applying.
* Header names on the bare "#include" lines were missing in that copy.
  <iconv.h> is required by the iconv_open()/iconv()/iconv_close() calls
  added to Python/fileutils.c.  <fcntl.h> follows from its HAVE_FCNTL_H
  guard.  <windows.h> and <sys/ioctl.h> sit on context lines and are
  presumed from the 3.9 sources -- TODO confirm.
* Author e-mail addresses were missing as well and have been left out
  rather than guessed.
* The two "#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION" guards in
  Objects/unicodeobject.c now use "#ifdef", like every other guard in
  this patch (no behaviour change; avoids -Wundef warnings).

 Include/internal/pycore_fileutils.h |  12 ++++
 Objects/unicodeobject.c             |  40 +++++++++++
 Python/fileutils.c                  | 106 ++++++++++++++++++++++++++++
 configure                           |  16 +++++
 configure.ac                        |  16 +++++
 pyconfig.h.in                       |   4 ++
 6 files changed, 194 insertions(+)

diff --git a/Include/internal/pycore_fileutils.h b/Include/internal/pycore_fileutils.h
index bbee58617fd05e..8cf137bb4bdf9d 100644
--- a/Include/internal/pycore_fileutils.h
+++ b/Include/internal/pycore_fileutils.h
@@ -48,6 +48,18 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(
     PyObject **decimal_point,
     PyObject **thousands_sep);
 
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+extern int _Py_LocaleUsesNonUnicodeWchar(void);
+
+extern wchar_t* _Py_DecodeNonUnicodeWchar(
+    const wchar_t* native,
+    Py_ssize_t size);
+
+extern int _Py_EncodeNonUnicodeWchar_InPlace(
+    wchar_t* unicode,
+    Py_ssize_t size);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 19326fa60e58c3..46a0956c8bb70e 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -56,6 +56,10 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #include <windows.h>
 #endif
 
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+#include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
+#endif
+
 /* Uncomment to display statistics on interned strings at exit when
    using Valgrind or Insecure++. */
 /* #define INTERNED_STATS 1 */
@@ -2211,6 +2215,20 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
     if (size == 0)
         _Py_RETURN_UNICODE_EMPTY();
 
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris uses non-Unicode internal wchar_t form for
+       non-Unicode locales and hence needs conversion to UCS-4 first. */
+    if (_Py_LocaleUsesNonUnicodeWchar()) {
+        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
+        if (!converted) {
+            return NULL;
+        }
+        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
+        PyMem_Free(converted);
+        return unicode;
+    }
+#endif
+
     /* Single character Unicode objects in the Latin-1 range are
        shared when using this constructor */
     if (size == 1 && (Py_UCS4)*u < 256)
@@ -3223,6 +3241,17 @@ PyUnicode_AsWideChar(PyObject *unicode,
         res = size;
     }
     unicode_copy_as_widechar(unicode, w, size);
+
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris uses non-Unicode internal wchar_t form for
+       non-Unicode locales and hence needs conversion first. */
+    if (_Py_LocaleUsesNonUnicodeWchar()) {
+        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
+            return -1;
+        }
+    }
+#endif
+
     return res;
 }
 
@@ -3249,6 +3278,17 @@ PyUnicode_AsWideCharString(PyObject *unicode,
         return NULL;
     }
     unicode_copy_as_widechar(unicode, buffer, buflen + 1);
+
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris uses non-Unicode internal wchar_t form for
+       non-Unicode locales and hence needs conversion first. */
+    if (_Py_LocaleUsesNonUnicodeWchar()) {
+        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
+            return NULL;
+        }
+    }
+#endif
+
     if (size != NULL) {
         *size = buflen;
     }
diff --git a/Python/fileutils.c b/Python/fileutils.c
index 769ab591ab43fb..45ea2043912597 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -17,6 +17,10 @@ extern int winerror_to_errno(int);
 #include <sys/ioctl.h>
 #endif
 
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+#include <iconv.h>
+#endif
+
 #ifdef HAVE_FCNTL_H
 #include <fcntl.h>
 #endif /* HAVE_FCNTL_H */
@@ -96,6 +100,12 @@ _Py_device_encoding(int fd)
 static size_t
 is_valid_wide_char(wchar_t ch)
 {
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris doesn't use Unicode code points as wchar_t encoding
+       for non-Unicode locales, which makes values higher than MAX_UNICODE
+       possibly valid. */
+    return 1;
+#endif
     if (Py_UNICODE_IS_SURROGATE(ch)) {
         // Reject lone surrogate characters
         return 0;
@@ -859,6 +869,102 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str,
                             current_locale, errors);
 }
 
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+
+/* Check whether current locale uses Unicode as internal wchar_t form. */
+int
+_Py_LocaleUsesNonUnicodeWchar(void)
+{
+    /* Oracle Solaris uses non-Unicode internal wchar_t form for
+       non-Unicode locales and hence needs conversion to UTF first. */
+    char* codeset = nl_langinfo(CODESET);
+    if (!codeset) {
+        return 0;
+    }
+    /* 646 refers to ISO/IEC 646 standard that corresponds to ASCII encoding */
+    return (strcmp(codeset, "UTF-8") != 0 && strcmp(codeset, "646") != 0);
+}
+
+static wchar_t *
+_Py_ConvertWCharForm(const wchar_t *source, Py_ssize_t size,
+                     const char *tocode, const char *fromcode)
+{
+    Py_BUILD_ASSERT(sizeof(wchar_t) == 4);
+
+    /* Ensure we won't overflow the size. */
+    if (size > (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t))) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    /* the string doesn't have to be NULL terminated */
+    wchar_t* target = PyMem_Malloc(size * sizeof(wchar_t));
+    if (target == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    iconv_t cd = iconv_open(tocode, fromcode);
+    if (cd == (iconv_t)-1) {
+        PyErr_Format(PyExc_ValueError, "iconv_open() failed");
+        PyMem_Free(target);
+        return NULL;
+    }
+
+    char *inbuf = (char *) source;
+    char *outbuf = (char *) target;
+    size_t inbytesleft = sizeof(wchar_t) * size;
+    size_t outbytesleft = inbytesleft;
+
+    size_t ret = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+    if (ret == DECODE_ERROR) {
+        PyErr_Format(PyExc_ValueError, "iconv() failed");
+        PyMem_Free(target);
+        iconv_close(cd);
+        return NULL;
+    }
+
+    iconv_close(cd);
+    return target;
+}
+
+/* Convert a wide character string to the UCS-4 encoded string. This
+   is necessary on systems where internal form of wchar_t are not Unicode
+   code points (e.g. Oracle Solaris).
+
+   Return a pointer to a newly allocated string, use PyMem_Free() to free
+   the memory. Return NULL and raise exception on conversion or memory
+   allocation error. */
+wchar_t *
+_Py_DecodeNonUnicodeWchar(const wchar_t *native, Py_ssize_t size)
+{
+    return _Py_ConvertWCharForm(native, size, "UCS-4-INTERNAL", "wchar_t");
+}
+
+/* Convert a UCS-4 encoded string to native wide character string. This
+   is necessary on systems where internal form of wchar_t are not Unicode
+   code points (e.g. Oracle Solaris).
+
+   The conversion is done in place. This can be done because both wchar_t
+   and UCS-4 use 4-byte encoding, and one wchar_t symbol always correspond
+   to a single UCS-4 symbol and vice versa. (This is true for Oracle Solaris,
+   which is currently the only system using these functions; it doesn't have
+   to be for other systems).
+
+   Return 0 on success. Return -1 and raise exception on conversion
+   or memory allocation error. */
+int
+_Py_EncodeNonUnicodeWchar_InPlace(wchar_t *unicode, Py_ssize_t size)
+{
+    wchar_t* result = _Py_ConvertWCharForm(unicode, size, "wchar_t", "UCS-4-INTERNAL");
+    if (!result) {
+        return -1;
+    }
+    memcpy(unicode, result, size * sizeof(wchar_t));
+    PyMem_Free(result);
+    return 0;
+}
+#endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */
 
 #ifdef MS_WINDOWS
 static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */
diff --git a/configure b/configure
index 8dcdbf19890053..c584866581df38 100755
--- a/configure
+++ b/configure
@@ -15123,6 +15123,22 @@ else
 $as_echo "no" >&6; }
 fi
 
+case $ac_sys_system/$ac_sys_release in
+SunOS/*)
+    if test -f /etc/os-release; then
+        OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
+        if test "x$OS_NAME" = "xOracle Solaris"; then
+            # bpo-43667: In Oracle Solaris, the internal form of wchar_t in
+            # non-Unicode locales is not Unicode and hence cannot be used directly.
+            # https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
+
+$as_echo "#define HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION 1" >>confdefs.h
+
+        fi
+    fi
+    ;;
+esac
+
 # check for endianness
  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
 $as_echo_n "checking whether byte ordering is bigendian... " >&6; }
diff --git a/configure.ac b/configure.ac
index b1e4c6ce19de8f..a0750777c15135 100644
--- a/configure.ac
+++ b/configure.ac
@@ -4759,6 +4759,22 @@ else
   AC_MSG_RESULT(no)
 fi
 
+case $ac_sys_system/$ac_sys_release in
+SunOS/*)
+    if test -f /etc/os-release; then
+        OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
+        if test "x$OS_NAME" = "xOracle Solaris"; then
+            # bpo-43667: In Oracle Solaris, the internal form of wchar_t in
+            # non-Unicode locales is not Unicode and hence cannot be used directly.
+            # https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
+            AC_DEFINE(HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION, 1,
+            [Define if the internal form of wchar_t in non-Unicode locales
+             is not Unicode.])
+        fi
+    fi
+    ;;
+esac
+
 # check for endianness
 AC_C_BIGENDIAN
 
diff --git a/pyconfig.h.in b/pyconfig.h.in
index 8510c8778b5690..6358e568f4a6f8 100644
--- a/pyconfig.h.in
+++ b/pyconfig.h.in
@@ -733,6 +733,10 @@
 /* Define to 1 if you have the `nice' function. */
 #undef HAVE_NICE
 
+/* Define if the internal form of wchar_t in non-Unicode locales is not
+   Unicode. */
+#undef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+
 /* Define to 1 if you have the `openat' function. */
 #undef HAVE_OPENAT
 