bpo-43667: Fix broken Unicode encoding in non-UTF locales on Solaris by kulikjak · Pull Request #25096 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

bpo-43667: Fix broken Unicode encoding in non-UTF locales on Solaris #25096

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Apr 30, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Include/cpython/fileutils.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ PyAPI_FUNC(int) _Py_EncodeLocaleEx(
int current_locale,
_Py_error_handler errors);

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
/* Convert the first 'size' wide characters of 'u' to a UTF-32 encoded
   char32_t string. Only declared on platforms where the internal form of
   wchar_t is not Unicode in non-Unicode locales.

   Returns a newly allocated string that the caller must release with
   PyMem_Free(). Returns NULL with an exception set on conversion or memory
   allocation error. */
PyAPI_FUNC(char32_t*) _Py_convert_wchar_t_to_UTF32(
const wchar_t* u,
Py_ssize_t size);
#endif


PyAPI_FUNC(PyObject *) _Py_device_encoding(int);

Expand Down
5 changes: 5 additions & 0 deletions Include/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,11 @@ Copyright (c) Corporation for National Research Initiatives.
# include <wchar.h>
#endif

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
# include <uchar.h>
# include <langinfo.h>
#endif

/* Py_UCS4 and Py_UCS2 are typedefs for the respective
unicode representations. */
typedef uint32_t Py_UCS4;
Expand Down
12 changes: 12 additions & 0 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -2217,6 +2217,18 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
if (size == 0)
_Py_RETURN_UNICODE_EMPTY();

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
    /* Oracle Solaris uses a non-Unicode internal wchar_t form in
       non-Unicode locales, so the input must be converted to UTF-32 before
       a str object can be built from it. The "UTF-8" and "646" codesets are
       excluded: for those the wchar_t values are used directly by the code
       that follows. */
    const char *codeset = nl_langinfo(CODESET);
    if (strcmp(codeset, "UTF-8") != 0 && strcmp(codeset, "646") != 0) {
        char32_t *c32 = _Py_convert_wchar_t_to_UTF32(u, size);
        if (c32 == NULL) {
            /* The converter returns NULL with an exception already set;
               propagate it instead of dereferencing a NULL buffer. */
            return NULL;
        }
        PyObject *unicode = _PyUnicode_FromUCS4(c32, size);
        PyMem_Free(c32);
        return unicode;
    }
#endif

/* Single character Unicode objects in the Latin-1 range are
shared when using this constructor */
if (size == 1 && (Py_UCS4)*u < 256)
Expand Down
80 changes: 80 additions & 0 deletions Python/fileutils.c
Original file line number Diff line number Diff line change
Expand Up @@ -922,6 +922,86 @@ _Py_GetLocaleEncodingObject(void)
return str;
}

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION

/* Convert a wide character string to the UTF32 encoded char32_t string. This
is necessary on systems where internal form of wchar_t is not already
Unicode (e.g. Oracle Solaris).

Return a pointer to a newly allocated char32_t string, use PyMem_Free() to
free the memory. Return NULL and raise exception on conversion or memory
allocation error. */
char32_t*
_Py_convert_wchar_t_to_UTF32(const wchar_t* u, Py_ssize_t size)
{
/* Ensure we won't overflow the size. */
if (size > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t)) - 1)) {
PyErr_NoMemory();
return NULL;
}

/* Given 'u' might not be NULL terminated (size smaller than its
length); copy and terminate part we are interested in. */
wchar_t* substr = PyMem_Malloc((size + 1) * sizeof(wchar_t));
if (substr == NULL) {
PyErr_NoMemory();
return NULL;
}

memcpy(substr, u, size * sizeof(wchar_t));
substr[size] = 0;

/* Convert given wide-character string to a character string */
size_t buffsize = wcstombs(NULL, substr, 0) + 1;
if (buffsize == (size_t)-1) {
PyMem_Free(substr);
PyErr_Format(PyExc_ValueError, "wcstombs() conversion failed");
return NULL;
}

/* Ensure we won't overflow the size. */
if (buffsize > (PY_SSIZE_T_MAX - 1)) {
PyMem_Free(substr);
PyErr_NoMemory();
return NULL;
}
char* buffer = PyMem_Malloc(buffsize * sizeof(char));
if (buffer == NULL) {
PyMem_Free(substr);
PyErr_NoMemory();
return NULL;
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe add an assertion: assert((wcslen(result) + 1) == size);. I understand that result cannot be shorter or longer.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this always the case? You told me that _Py_ConvertWCharForm cannot truncate at first zero and then wcslen+1 doesn't have to correspond to size. Although I am not sure if that can happen when converting from UCS-4 to wchar_t.


size_t res = wcstombs(buffer, substr, buffsize);
assert(res == buffsize - 1);

/* Convert character string to UTF32 encoded char32_t string.
Since wchar_t and char32_t have the same size on Solaris and one
wchar_t symbol corresponds to one UTF32 value, we can safely
reuse this buffer and skip additional allocation. */
char32_t* c32 = (char32_t*) substr;
mbstate_t state = {0};

Py_ssize_t i = 0;
char* ptr = buffer;
char* end = ptr + res + 1;

while (res = mbrtoc32(&(c32[i]), ptr, end - ptr, &state)) {
if (res == (size_t)-1 || res == (size_t)-2 || res == (size_t)-3) {
PyMem_Free(c32);
PyMem_Free(buffer);
PyErr_Format(PyExc_ValueError,
"mbrtoc32() conversion failed with error code: %zd",
(Py_ssize_t)res);
return NULL;
}
ptr += res;
i ++;
}
PyMem_Free(buffer);
return c32;
}
#endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */

#ifdef MS_WINDOWS
static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */
Expand Down
16 changes: 16 additions & 0 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -15194,6 +15194,22 @@ else
$as_echo "no" >&6; }
fi

# Define HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION when building on Oracle
# Solaris. Other SunOS systems, and systems without /etc/os-release, leave
# the macro undefined.
case $ac_sys_system/$ac_sys_release in
SunOS/*)
if test -f /etc/os-release; then
# Take the value of the NAME= line and drop its first and last character
# (the surrounding quotes).
OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
if test "x$OS_NAME" = "xOracle Solaris"; then
# In Oracle Solaris, the internal form of wchar_t in non-Unicode locales
# is not Unicode and hence cannot be used directly.
# https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html

$as_echo "#define HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION 1" >>confdefs.h

fi
fi
;;
esac

# check for endianness
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
$as_echo_n "checking whether byte ordering is bigendian... " >&6; }
Expand Down
16 changes: 16 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -4763,6 +4763,22 @@ else
AC_MSG_RESULT(no)
fi

# Define HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION when building on Oracle
# Solaris. Other SunOS systems, and systems without /etc/os-release, leave
# the macro undefined.
case $ac_sys_system/$ac_sys_release in
SunOS/*)
if test -f /etc/os-release; then
# Take the value of the NAME= line and drop its first and last character
# (the surrounding quotes).
OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
if test "x$OS_NAME" = "xOracle Solaris"; then
# In Oracle Solaris, the internal form of wchar_t in non-Unicode locales
# is not Unicode and hence cannot be used directly.
# https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
AC_DEFINE(HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION, 1,
[Define if the internal form of wchar_t in non-Unicode locales
is not Unicode.])
fi
fi
;;
esac

# check for endianness
AC_C_BIGENDIAN

Expand Down
4 changes: 4 additions & 0 deletions pyconfig.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -742,6 +742,10 @@
/* Define to 1 if you have the `nice' function. */
#undef HAVE_NICE

/* Define if the internal form of wchar_t in non-Unicode locales is not
Unicode. */
#undef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION

/* Define to 1 if you have the `openat' function. */
#undef HAVE_OPENAT

Expand Down
0