python · vstinner · Apr 30, 2021 · Mar 30, 2021 · Apr 1, 2021 · Apr 1, 2021
diff --git a/Include/cpython/fileutils.h b/Include/cpython/fileutils.h
@@ -32,6 +32,12 @@ PyAPI_FUNC(int) _Py_EncodeLocaleEx(
     int current_locale,
     _Py_error_handler errors);
 
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION  /* defined by configure */
+PyAPI_FUNC(char32_t*) _Py_convert_wchar_t_to_UTF32(  /* PyMem_Free() result */
+    const wchar_t* u,    /* wide string; need not be NUL-terminated */
+    Py_ssize_t size);    /* number of wchar_t elements of 'u' to convert */
+#endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */
+
 
 PyAPI_FUNC(PyObject *) _Py_device_encoding(int);
 

diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
@@ -97,6 +97,11 @@ Copyright (c) Corporation for National Research Initiatives.
 #  include <wchar.h>
 #endif
 
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+#  include <uchar.h>     /* char32_t, mbrtoc32() */
+#  include <langinfo.h>  /* nl_langinfo(), CODESET */
+#endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */
+
 /* Py_UCS4 and Py_UCS2 are typedefs for the respective
    unicode representations. */
 typedef uint32_t Py_UCS4;

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -2217,6 +2217,23 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
     if (size == 0)
         _Py_RETURN_UNICODE_EMPTY();
 
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris uses a non-Unicode internal wchar_t form for
+       non-Unicode locales, so the input must be converted to UTF-32 first.
+       "646" is the Solaris codeset name for ISO/IEC 646 (ASCII). */
+    const char *codeset = nl_langinfo(CODESET);
+    if (strcmp(codeset, "UTF-8") != 0 && strcmp(codeset, "646") != 0) {
+        char32_t *c32 = _Py_convert_wchar_t_to_UTF32(u, size);
+        if (c32 == NULL) {
+            /* Conversion failed; the exception is already set. */
+            return NULL;
+        }
+        PyObject *unicode = _PyUnicode_FromUCS4(c32, size);
+        PyMem_Free(c32);
+        return unicode;
+    }
+#endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */
+
     /* Single character Unicode objects in the Latin-1 range are
        shared when using this constructor */
     if (size == 1 && (Py_UCS4)*u < 256)

diff --git a/Python/fileutils.c b/Python/fileutils.c
@@ -922,6 +922,106 @@ _Py_GetLocaleEncodingObject(void)
     return str;
 }
 
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+
+/* Convert a wide character string to a UTF-32 encoded char32_t string. This
+   is necessary on systems where the internal form of wchar_t is not already
+   Unicode (e.g. Oracle Solaris).
+
+   'u' does not have to be NUL-terminated: exactly 'size' wide characters are
+   copied from it. The text is converted to the multibyte encoding of the
+   current locale with wcstombs() and then decoded with mbrtoc32().
+
+   Return a pointer to a newly allocated char32_t string, use PyMem_Free() to
+   free the memory. Return NULL and raise an exception on conversion or
+   memory allocation error.
+
+   NOTE(review): both wcstombs() and the decoding loop stop at the first
+   L'\0', so elements of 'u' after an embedded NUL are returned unconverted. */
+char32_t*
+_Py_convert_wchar_t_to_UTF32(const wchar_t* u, Py_ssize_t size)
+{
+    /* Ensure we won't overflow the size. */
+    if (size > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t)) - 1)) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    /* Given 'u' might not be NUL terminated (size smaller than its
+       length); copy and terminate the part we are interested in. */
+    wchar_t* substr = PyMem_Malloc((size + 1) * sizeof(wchar_t));
+    if (substr == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    memcpy(substr, u, size * sizeof(wchar_t));
+    substr[size] = 0;
+
+    /* Measure the multibyte form of the string. wcstombs() reports failure
+       as (size_t)-1; test for it before adding room for the terminating
+       NUL, because (size_t)-1 + 1 wraps around to 0. */
+    size_t mbslen = wcstombs(NULL, substr, 0);
+    if (mbslen == (size_t)-1) {
+        PyMem_Free(substr);
+        PyErr_Format(PyExc_ValueError, "wcstombs() conversion failed");
+        return NULL;
+    }
+    size_t buffsize = mbslen + 1;
+
+    /* Ensure we won't overflow the size. */
+    if (buffsize > (size_t)(PY_SSIZE_T_MAX - 1)) {
+        PyMem_Free(substr);
+        PyErr_NoMemory();
+        return NULL;
+    }
+    char* buffer = PyMem_Malloc(buffsize * sizeof(char));
+    if (buffer == NULL) {
+        PyMem_Free(substr);
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    /* Convert given wide-character string to a character string. The result
+       is checked at run time rather than with assert() only, because the
+       end of the decoding range below is derived from it. */
+    size_t res = wcstombs(buffer, substr, buffsize);
+    if (res != mbslen) {
+        PyMem_Free(substr);
+        PyMem_Free(buffer);
+        PyErr_Format(PyExc_ValueError, "wcstombs() conversion failed");
+        return NULL;
+    }
+
+    /* Convert character string to UTF32 encoded char32_t string.
+       Since wchar_t and char32_t have the same size on Solaris and one
+       wchar_t symbol corresponds to one UTF32 value, we can safely
+       reuse this buffer and skip additional allocation. */
+    assert(sizeof(wchar_t) == sizeof(char32_t));
+    char32_t* c32 = (char32_t*) substr;
+    mbstate_t state = {0};
+
+    Py_ssize_t i = 0;
+    const char* ptr = buffer;
+    const char* end = buffer + buffsize;
+
+    /* mbrtoc32() returns 0 once it has decoded the terminating NUL. */
+    while ((res = mbrtoc32(&(c32[i]), ptr, end - ptr, &state)) != 0) {
+        if (res == (size_t)-1 || res == (size_t)-2 || res == (size_t)-3) {
+            PyMem_Free(c32);
+            PyMem_Free(buffer);
+            PyErr_Format(PyExc_ValueError,
+                         "mbrtoc32() conversion failed with error code: %zd",
+                         (Py_ssize_t)res);
+            return NULL;
+        }
+        ptr += res;
+        i ++;
+    }
+    PyMem_Free(buffer);
+    return c32;
+}
+#endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */
 
 #ifdef MS_WINDOWS
 static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */

diff --git a/configure b/configure
@@ -15194,6 +15194,22 @@ else
 $as_echo "no" >&6; }
 fi
 
+case $ac_sys_system/$ac_sys_release in
+SunOS/*)
+  if test -f /etc/os-release; then  # file is optional; skip the check without it
+    OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
+    if test "x$OS_NAME" = "xOracle Solaris"; then  # NAME value, first/last char stripped
+      # In Oracle Solaris, the internal form of wchar_t in non-Unicode locales
+      # is not Unicode and hence cannot be used directly.
+      # https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
+
+$as_echo "#define HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION 1" >>confdefs.h
+
+    fi  # Oracle Solaris
+  fi  # /etc/os-release
+  ;;
+esac  # other systems: macro stays undefined
+
 # check for endianness
  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
 $as_echo_n "checking whether byte ordering is bigendian... " >&6; }

diff --git a/configure.ac b/configure.ac
@@ -4763,6 +4763,22 @@ else
   AC_MSG_RESULT(no)
 fi
 
+case $ac_sys_system/$ac_sys_release in
+SunOS/*)
+  if test -f /etc/os-release; then  # file is optional; skip the check without it
+    OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
+    if test "x$OS_NAME" = "xOracle Solaris"; then  # NAME value, first/last char stripped
+      # In Oracle Solaris, the internal form of wchar_t in non-Unicode locales
+      # is not Unicode and hence cannot be used directly.
+      # https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
+      AC_DEFINE(HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION, 1,
+      [Define if the internal form of wchar_t in non-Unicode locales
+       is not Unicode.])
+    fi  # Oracle Solaris
+  fi  # /etc/os-release
+  ;;
+esac  # other systems: macro stays undefined
+
 # check for endianness
 AC_C_BIGENDIAN
 

diff --git a/pyconfig.h.in b/pyconfig.h.in
@@ -742,6 +742,10 @@
 /* Define to 1 if you have the `nice' function. */
 #undef HAVE_NICE
 
+/* Define if the internal form of wchar_t in non-Unicode locales is not
+   Unicode. */
+#undef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+
 /* Define to 1 if you have the `openat' function. */
 #undef HAVE_OPENAT