bpo-43667: Fix broken Unicode encoding in non-UTF locales on Solaris by kulikjak · Pull Request #25096 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

bpo-43667: Fix broken Unicode encoding in non-UTF locales on Solaris #25096

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Apr 30, 2021
Merged
Prev Previous commit
Next Next commit
Rework wchar_t conversion based on suggestions
  • Loading branch information
kulikjak committed Apr 1, 2021
commit c01b7923eba61831cf67905fb7901fb7d3b47fc9
6 changes: 6 additions & 0 deletions Include/cpython/fileutils.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ PyAPI_FUNC(int) _Py_EncodeLocaleEx(
int current_locale,
_Py_error_handler errors);

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
/* Convert the wide character string 'u' ('size' wide characters, not
   necessarily null-terminated) to a newly allocated UTF-32 encoded
   char32_t string.

   Only declared when HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION is defined,
   i.e. on systems where the internal form of wchar_t is not already
   Unicode.

   The caller owns the returned buffer and must release it with
   PyMem_Free(). Return NULL with an exception set on conversion or memory
   allocation error. */
PyAPI_FUNC(char32_t*) _Py_convert_wchar_t_to_UTF32(
const wchar_t* u,
Py_ssize_t size);
#endif


PyAPI_FUNC(PyObject *) _Py_device_encoding(int);

Expand Down
2 changes: 1 addition & 1 deletion Include/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ Copyright (c) Corporation for National Research Initiatives.
# include <wchar.h>
#endif

#if defined(__sun) && defined(__SVR4)
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
# include <uchar.h>
# include <langinfo.h>
#endif
Expand Down
63 changes: 7 additions & 56 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -2194,15 +2194,6 @@ PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
return PyUnicode_FromWideChar(u, size);
}

#if defined(__sun) && defined(__SVR4)
/* Detect whether the currently used locale uses a UTF compatible encoding.

   Query the codeset name of the current locale with nl_langinfo(CODESET)
   and return 1 when it is exactly "UTF-8" or "646", 0 for any other name. */
int codeset_is_utf8_compatible(void)
{
    const char *codeset = nl_langinfo(CODESET);

    /* strcmp() returns 0 on equality, so compare explicitly instead of
       relying on the truthiness of the "differs" result. */
    return strcmp(codeset, "UTF-8") == 0 || strcmp(codeset, "646") == 0;
}
#endif

PyObject *
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
{
Expand All @@ -2226,54 +2217,14 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
if (size == 0)
_Py_RETURN_UNICODE_EMPTY();

#if defined(__sun) && defined(__SVR4)
/* Check whether current locale uses UTF to encode symbols */
if (!codeset_is_utf8_compatible()) {

/* Given 'u' might not be NULL terminated (size smaller than its
length); copy and terminate part we are interested in. */
wchar_t* substr = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
memcpy(substr, u, size * sizeof(wchar_t));
substr[size] = 0;

/* Convert given wide-character string to a character string */
size_t buffsize = wcstombs(NULL, substr, 0) + 1;
if (buffsize == (size_t)-1) {
PyMem_RawFree(substr);
PyErr_Format(PyExc_ValueError, "wcstombs() conversion failed");
return NULL;
}

char* buffer = PyMem_RawMalloc(buffsize * sizeof(char));
size_t res = wcstombs(buffer, substr, buffsize);
assert(res == buffsize - 1);

/* Convert character string to UTF32 encoded char32_t string.
Since wchar_t and char32_t have the same size on Solaris and one
wchar_t symbol corresponds to one UTF32 value, we can safely
reuse this buffer and skip additional allocation. */
char32_t* c32 = (char32_t*) substr;
mbstate_t state = {0};

int i = 0;
char* ptr = buffer;
char* end = ptr + res + 1;
while (res = mbrtoc32(&(c32[i]), ptr, end - ptr, &state)) {
if (res == (size_t)-1 || res == (size_t)-2 || res == (size_t)-3) {
PyMem_RawFree(c32);
PyMem_RawFree(buffer);
PyErr_Format(PyExc_ValueError,
"mbrtoc32() conversion failed with error code: %d",
res);
return NULL;
}
ptr += res;
i ++;
}
PyMem_RawFree(buffer);

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
/* Oracle Solaris uses non-Unicode internal wchar_t form for
non-Unicode locales and hence needs conversion to UTF first. */
char* codeset = nl_langinfo(CODESET);
if (strcmp(codeset, "UTF-8") && strcmp(codeset, "646")) {
char32_t* c32 = _Py_convert_wchar_t_to_UTF32(u, size);
PyObject *unicode = _PyUnicode_FromUCS4(c32, size);
PyMem_RawFree(c32);
PyMem_Free(c32);
return unicode;
}
#endif
Expand Down
80 changes: 80 additions & 0 deletions Python/fileutils.c
Original file line number Diff line number Diff line change
Expand Up @@ -922,6 +922,86 @@ _Py_GetLocaleEncodingObject(void)
return str;
}

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION

/* Convert a wide character string to the UTF32 encoded char32_t string. This
   is necessary on systems where internal form of wchar_t is not already
   Unicode (e.g. Oracle Solaris).

   'u' must point to at least 'size' wide characters; it does not have to be
   null-terminated. The string is first encoded to a multibyte string with
   wcstombs() and then decoded to UTF-32 with mbrtoc32(). Both steps stop at
   the first null wide character, so any elements following an embedded null
   are returned as they were copied from 'u', without conversion.

   Return a pointer to a newly allocated char32_t string holding size + 1
   elements, use PyMem_Free() to free the memory. Return NULL and raise
   exception on conversion or memory allocation error. */
char32_t*
_Py_convert_wchar_t_to_UTF32(const wchar_t* u, Py_ssize_t size)
{
    /* Ensure (size + 1) * sizeof(wchar_t) below won't overflow. */
    if (size > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t)) - 1)) {
        PyErr_NoMemory();
        return NULL;
    }

    /* Given 'u' might not be NULL terminated (size smaller than its
       length); copy and terminate part we are interested in. */
    wchar_t *substr = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (substr == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    memcpy(substr, u, size * sizeof(wchar_t));
    substr[size] = 0;

    /* Measure the multibyte form. wcstombs() reports failure as (size_t)-1,
       so the raw return value has to be tested before room for the
       terminator is added: adding 1 first wraps the error value to 0 and
       the failure would go unnoticed. */
    size_t mbs_len = wcstombs(NULL, substr, 0);
    if (mbs_len == (size_t)-1) {
        PyMem_Free(substr);
        PyErr_Format(PyExc_ValueError, "wcstombs() conversion failed");
        return NULL;
    }
    size_t buffsize = mbs_len + 1;

    /* Ensure we won't overflow the size. */
    if (buffsize > (size_t)(PY_SSIZE_T_MAX - 1)) {
        PyMem_Free(substr);
        PyErr_NoMemory();
        return NULL;
    }
    char *buffer = PyMem_Malloc(buffsize * sizeof(char));
    if (buffer == NULL) {
        PyMem_Free(substr);
        PyErr_NoMemory();
        return NULL;
    }

    /* Review thread recorded at this point of the patch:
       - Reviewer: "Maybe add an assertion:
         assert((wcslen(result) + 1) == size);. I understand that result
         cannot be shorter or longer."
       - Author: "Is this always the case? You told me that
         _Py_ConvertWCharForm cannot truncate at first zero and then
         wcslen+1 doesn't have to correspond to size. Although I am not
         sure if that can happen when converting from UCS-4 to wchar_t."
       NOTE(review): 'result' and _Py_ConvertWCharForm do not appear in this
       function; the thread presumably refers to another revision. */

    /* Convert given wide-character string to a character string. The
       result is checked at run time (not only by assert) so that a failed
       or short conversion cannot reach the decoding loop in release
       builds. */
    size_t res = wcstombs(buffer, substr, buffsize);
    if (res != mbs_len) {
        PyMem_Free(substr);
        PyMem_Free(buffer);
        PyErr_Format(PyExc_ValueError, "wcstombs() conversion failed");
        return NULL;
    }

    /* Convert character string to UTF32 encoded char32_t string.
       Since wchar_t and char32_t have the same size on Solaris and one
       wchar_t symbol corresponds to one UTF32 value, we can safely
       reuse this buffer and skip additional allocation. */
    char32_t *c32 = (char32_t *)substr;
    mbstate_t state = {0};

    Py_ssize_t i = 0;
    char *ptr = buffer;
    char *end = ptr + res + 1;   /* one past the terminating null byte */

    /* mbrtoc32() returns 0 once it has decoded the terminating null, which
       also stores the terminator into c32[i]. */
    while ((res = mbrtoc32(&(c32[i]), ptr, (size_t)(end - ptr), &state)) != 0) {
        if (res == (size_t)-1 || res == (size_t)-2 || res == (size_t)-3) {
            PyMem_Free(c32);
            PyMem_Free(buffer);
            PyErr_Format(PyExc_ValueError,
                         "mbrtoc32() conversion failed with error code: %zd",
                         (Py_ssize_t)res);
            return NULL;
        }
        ptr += res;
        i++;

        /* The reused buffer holds size + 1 elements (indices 0..size).
           Refuse to continue rather than write past it if the decoder ever
           yields more code points than there were wide characters. */
        if (i > size) {
            PyMem_Free(c32);
            PyMem_Free(buffer);
            PyErr_Format(PyExc_ValueError,
                         "mbrtoc32() conversion produced too many characters");
            return NULL;
        }
    }
    PyMem_Free(buffer);
    return c32;
}
#endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */

#ifdef MS_WINDOWS
static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */
Expand Down
0