8000 [3.9] bpo-43667: Fix broken Unicode encoding in non-UTF locales on So… by kulikjak · Pull Request #25847 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

[3.9] bpo-43667: Fix broken Unicode encoding in non-UTF locales on So… #25847

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 21, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions Include/internal/pycore_fileutils.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,18 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(
PyObject **decimal_point,
PyObject **thousands_sep);

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
/* Helpers for platforms (e.g. Oracle Solaris) where the internal form of
   wchar_t is not Unicode code points in non-Unicode locales. */

/* Return non-zero if the codeset of the current locale is neither "UTF-8"
   nor "646" (ASCII), i.e. if wchar_t values need to be converted. */
extern int _Py_LocaleUsesNonUnicodeWchar(void);

/* Convert 'size' native wchar_t characters to UCS-4.
   Return a newly allocated buffer that must be released with PyMem_Free(),
   or NULL with an exception set on conversion or allocation failure. */
extern wchar_t* _Py_DecodeNonUnicodeWchar(
const wchar_t* native,
Py_ssize_t size);

/* Convert 'size' UCS-4 characters stored in 'unicode' to the native wchar_t
   form, overwriting the buffer in place.
   Return 0 on success, -1 with an exception set on failure. */
extern int _Py_EncodeNonUnicodeWchar_InPlace(
wchar_t* unicode,
Py_ssize_t size);
#endif

#ifdef __cplusplus
}
#endif
Expand Down
40 changes: 40 additions & 0 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#include <windows.h>
#endif

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
#include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
#endif

/* Uncomment to display statistics on interned strings at exit when
using Valgrind or Insecure++. */
/* #define INTERNED_STATS 1 */
Expand Down Expand Up @@ -2211,6 +2215,20 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
if (size == 0)
_Py_RETURN_UNICODE_EMPTY();

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
/* Oracle Solaris uses non-Unicode internal wchar_t form for
non-Unicode locales and hence needs conversion to UCS-4 first. */
if (_Py_LocaleUsesNonUnicodeWchar()) {
wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
if (!converted) {
return NULL;
}
PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
PyMem_Free(converted);
return unicode;
}
#endif

/* Single character Unicode objects in the Latin-1 range are
shared when using this constructor */
if (size == 1 && (Py_UCS4)*u < 256)
Expand Down Expand Up @@ -3223,6 +3241,17 @@ PyUnicode_AsWideChar(PyObject *unicode,
res = size;
}
unicode_copy_as_widechar(unicode, w, size);

#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
/* Oracle Solaris uses non-Unicode internal wchar_t form for
non-Unicode locales and hence needs conversion first. */
if (_Py_LocaleUsesNonUnicodeWchar()) {
if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
return -1;
}
}
#endif

return res;
}

Expand All @@ -3249,6 +3278,17 @@ PyUnicode_AsWideCharString(PyObject *unicode,
return NULL;
}
unicode_copy_as_widechar(unicode, buffer, buflen + 1);

#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
/* Oracle Solaris uses non-Unicode internal wchar_t form for
non-Unicode locales and hence needs conversion first. */
if (_Py_LocaleUsesNonUnicodeWchar()) {
if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
return NULL;
}
}
#endif

if (size != NULL) {
*size = buflen;
}
Expand Down
106 changes: 106 additions & 0 deletions Python/fileutils.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ extern int winerror_to_errno(int);
#include <sys/ioctl.h>
#endif

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
#include <iconv.h>
#endif

#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif /* HAVE_FCNTL_H */
Expand Down Expand Up @@ -96,6 +100,12 @@ _Py_device_encoding(int fd)
static size_t
is_valid_wide_char(wchar_t ch)
{
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
/* Oracle Solaris doesn't use Unicode code points as wchar_t encoding
for non-Unicode locales, which makes values higher than MAX_UNICODE
possibly valid. */
return 1;
#endif
if (Py_UNICODE_IS_SURROGATE(ch)) {
// Reject lone surrogate characters
return 0;
Expand Down Expand Up @@ -859,6 +869,102 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str,
current_locale, errors);
}

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION

/* Tell whether wchar_t values of the current locale need conversion.

   Oracle Solaris stores wchar_t in a non-Unicode internal form for
   non-Unicode locales. The decision is based on the codeset name reported
   by nl_langinfo(CODESET).

   Return 1 if the codeset is neither "UTF-8" nor "646", 0 otherwise
   (including when no codeset name is available). */
int
_Py_LocaleUsesNonUnicodeWchar(void)
{
    const char *charset = nl_langinfo(CODESET);

    if (charset == NULL) {
        return 0;
    }
    if (strcmp(charset, "UTF-8") == 0) {
        return 0;
    }
    /* "646" is the ISO/IEC 646 standard, which corresponds to ASCII. */
    if (strcmp(charset, "646") == 0) {
        return 0;
    }
    return 1;
}

/* Convert 'size' 4-byte characters from the iconv encoding 'fromcode' to
   the iconv encoding 'tocode'.

   The input does not have to be NUL terminated, and the output buffer has
   exactly the same byte length as the input.

   Return a buffer allocated with PyMem_Malloc() (release it with
   PyMem_Free()). On failure return NULL with an exception set:
   PyErr_NoMemory() for size overflow or allocation failure, ValueError when
   iconv_open() or iconv() fails. */
static wchar_t *
_Py_ConvertWCharForm(const wchar_t *source, Py_ssize_t size,
                     const char *tocode, const char *fromcode)
{
    Py_BUILD_ASSERT(sizeof(wchar_t) == 4);

    /* Refuse sizes whose byte count would not fit in Py_ssize_t. */
    if (size > (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t))) {
        PyErr_NoMemory();
        return NULL;
    }
    size_t nbytes = sizeof(wchar_t) * size;

    wchar_t *converted = PyMem_Malloc(nbytes);
    if (converted == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    iconv_t conv = iconv_open(tocode, fromcode);
    if (conv == (iconv_t)-1) {
        PyErr_Format(PyExc_ValueError, "iconv_open() failed");
        PyMem_Free(converted);
        return NULL;
    }

    /* iconv() advances the pointers and decrements the counters it is
       given, so hand it private copies. */
    char *src = (char *) source;
    char *dst = (char *) converted;
    size_t src_left = nbytes;
    size_t dst_left = nbytes;

    if (iconv(conv, &src, &src_left, &dst, &dst_left) == DECODE_ERROR) {
        PyErr_Format(PyExc_ValueError, "iconv() failed");
        PyMem_Free(converted);
        converted = NULL;
    }

    /* The descriptor is closed on both the success and the failure path. */
    iconv_close(conv);
    return converted;
}

/* Decode a native wide character string into UCS-4.

   Needed on systems whose internal wchar_t form is not Unicode code points
   (e.g. Oracle Solaris). 'size' is the number of characters in 'native'.

   Return a newly allocated string which must be released with PyMem_Free().
   On conversion or memory allocation error, return NULL with an exception
   set. */
wchar_t *
_Py_DecodeNonUnicodeWchar(const wchar_t *native, Py_ssize_t size)
{
    wchar_t *ucs4 = _Py_ConvertWCharForm(native, size,
                                         "UCS-4-INTERNAL", "wchar_t");
    return ucs4;
}

/* Encode a UCS-4 string into the native wide character form, in place.

   Needed on systems whose internal wchar_t form is not Unicode code points
   (e.g. Oracle Solaris).

   Overwriting the input buffer is possible because wchar_t and UCS-4 are
   both 4 bytes wide and every wchar_t symbol maps to exactly one UCS-4
   symbol and vice versa. That holds for Oracle Solaris, currently the only
   system using these functions; other systems may differ.

   Return 0 on success. On conversion or memory allocation error, return -1
   with an exception set; 'unicode' is left untouched in that case. */
int
_Py_EncodeNonUnicodeWchar_InPlace(wchar_t *unicode, Py_ssize_t size)
{
    wchar_t *native = _Py_ConvertWCharForm(unicode, size,
                                           "wchar_t", "UCS-4-INTERNAL");
    if (native != NULL) {
        /* Copy the converted characters back over the caller's buffer. */
        memcpy(unicode, native, size * sizeof(wchar_t));
        PyMem_Free(native);
        return 0;
    }
    return -1;
}
#endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */

#ifdef MS_WINDOWS
static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */
Expand Down
16 changes: 16 additions & 0 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -15123,6 +15123,22 @@ else
$as_echo "no" >&6; }
fi

case $ac_sys_system/$ac_sys_release in
SunOS/*)
if test -f /etc/os-release; then
OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
if test "x$OS_NAME" = "xOracle Solaris"; then
# bpo-43667: In Oracle Solaris, the internal form of wchar_t in
# non-Unicode locales is not Unicode and hence cannot be used directly.
# https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html

$as_echo "#define HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION 1" >>confdefs.h

fi
fi
;;
esac

# check for endianness
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
$as_echo_n "checking whether byte ordering is bigendian... " >&6; }
Expand Down
16 changes: 16 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -4759,6 +4759,22 @@ else
AC_MSG_RESULT(no)
fi

# Define HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION only when building on a
# SunOS system whose /etc/os-release identifies it as Oracle Solaris.
case $ac_sys_system/$ac_sys_release in
SunOS/*)
if test -f /etc/os-release; then
# Read the NAME= entry and drop the first and last character of its
# value (presumably the surrounding quotes).
OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
if test "x$OS_NAME" = "xOracle Solaris"; then
# bpo-43667: In Oracle Solaris, the internal form of wchar_t in
# non-Unicode locales is not Unicode and hence cannot be used directly.
# https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
AC_DEFINE(HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION, 1,
[Define if the internal form of wchar_t in non-Unicode locales
is not Unicode.])
fi
fi
;;
esac

# check for endianness
AC_C_BIGENDIAN

Expand Down
4 changes: 4 additions & 0 deletions pyconfig.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -733,6 +733,10 @@
/* Define to 1 if you have the `nice' function. */
#undef HAVE_NICE

/* Define if the internal form of wchar_t in non-Unicode locales is not
Unicode. */
#undef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION

/* Define to 1 if you have the `openat' function. */
#undef HAVE_OPENAT

Expand Down
0