Add PyInterpreterState.fs_codec.utf8 (GH-18367) · python/cpython@bf305cc · GitHub
[go: up one dir, main page]

Skip to content

Commit bf305cc

Browse files
authored
Add PyInterpreterState.fs_codec.utf8 (GH-18367)
Add a fast-path for UTF-8 encoding in PyUnicode_EncodeFSDefault() and PyUnicode_DecodeFSDefaultAndSize(). Add _PyUnicode_FiniEncodings() helper function for _PyUnicode_Fini().
1 parent 0e4e735 commit bf305cc

File tree

2 files changed

+48
-46
lines changed

2 files changed

+48
-46
lines changed

Include/internal/pycore_pystate.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ struct _is {
102102
Later, it is set to a non-NULL string by _PyUnicode_InitEncodings(). */
103103
struct {
104104
char *encoding; /* Filesystem encoding (encoded to UTF-8) */
105+
int utf8; /* encoding=="utf-8"? */
105106
char *errors; /* Filesystem errors (encoded to UTF-8) */
106107
_Py_error_handler error_handler;
107108
} fs_codec;

Objects/unicodeobject.c

Lines changed: 47 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -3615,39 +3615,32 @@ PyObject *
36153615
PyUnicode_EncodeFSDefault(PyObject *unicode)
36163616
{
36173617
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
3618-
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3619-
if (interp->fs_codec.encoding) {
3618+
if (interp->fs_codec.utf8) {
36203619
return unicode_encode_utf8(unicode,
36213620
interp->fs_codec.error_handler,
36223621
interp->fs_codec.errors);
36233622
}
3624-
else {
3625-
const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3626-
_Py_error_handler errors;
3627-
errors = get_error_handler_wide(filesystem_errors);
3628-
assert(errors != _Py_ERROR_UNKNOWN);
3629-
return unicode_encode_utf8(unicode, errors, NULL);
3630-
}
3631-
#else
3632-
/* Bootstrap check: if the filesystem codec is implemented in Python, we
3633-
cannot use it to encode and decode filenames before it is loaded. Load
3634-
the Python codec requires to encode at least its own filename. Use the C
3635-
implementation of the locale codec until the codec registry is
3636-
initialized and the Python codec is loaded.
3637-
See _PyUnicode_InitEncodings(). */
3638-
if (interp->fs_codec.encoding) {
3623+
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3624+
else if (interp->fs_codec.encoding) {
36393625
return PyUnicode_AsEncodedString(unicode,
36403626
interp->fs_codec.encoding,
36413627
interp->fs_codec.errors);
36423628
}
3629+
#endif
36433630
else {
3631+
/* Before _PyUnicode_InitEncodings() is called, the Python codec
3632+
machinery is not ready and so cannot be used:
3633+
use wcstombs() in this case. */
36443634
const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3645-
_Py_error_handler errors;
3646-
errors = get_error_handler_wide(filesystem_errors);
3635+
assert(filesystem_errors != NULL);
3636+
_Py_error_handler errors = get_error_handler_wide(filesystem_errors);
36473637
assert(errors != _Py_ERROR_UNKNOWN);
3638+
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3639+
return unicode_encode_utf8(unicode, errors, NULL);
3640+
#else
36483641
return unicode_encode_locale(unicode, errors, 0);
3649-
}
36503642
#endif
3643+
}
36513644
}
36523645

36533646
PyObject *
@@ -3857,39 +3850,33 @@ PyObject*
38573850
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
38583851
{
38593852
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
3860-
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3861-
if (interp->fs_codec.encoding) {
3853+
if (interp->fs_codec.utf8) {
38623854
return unicode_decode_utf8(s, size,
38633855
interp->fs_codec.error_handler,
38643856
interp->fs_codec.errors,
38653857
NULL);
38663858
}
3867-
else {
3868-
const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3869-
_Py_error_handler errors;
3870-
errors = get_error_handler_wide(filesystem_errors);
3871-
assert(errors != _Py_ERROR_UNKNOWN);
3872-
return unicode_decode_utf8(s, size, errors, NULL, NULL);
3873-
}
3874-
#else
3875-
/* Bootstrap check: if the filesystem codec is implemented in Python, we
3876-
cannot use it to encode and decode filenames before it is loaded. Load
3877-
the Python codec requires to encode at least its own filename. Use the C
3878-
implementation of the locale codec until the codec registry is
3879-
initialized and the Python codec is loaded.
3880-
See _PyUnicode_InitEncodings(). */
3881-
if (interp->fs_codec.encoding) {
3859+
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3860+
else if (interp->fs_codec.encoding) {
38823861
return PyUnicode_Decode(s, size,
38833862
interp->fs_codec.encoding,
38843863
interp->fs_codec.errors);
38853864
}
3865+
#endif
38863866
else {
3867+
/* Before _PyUnicode_InitEncodings() is called, the Python codec
3868+
machinery is not ready and so cannot be used:
3869+
use mbstowcs() in this case. */
38873870
const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3888-
_Py_error_handler errors;
3889-
errors = get_error_handler_wide(filesystem_errors);
3871+
assert(filesystem_errors != NULL);
3872+
_Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3873+
assert(errors != _Py_ERROR_UNKNOWN);
3874+
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3875+
return unicode_decode_utf8(s, size, errors, NULL, NULL);
3876+
#else
38903877
return unicode_decode_locale(s, size, errors, 0);
3891-
}
38923878
#endif
3879+
}
38933880
}
38943881

38953882

@@ -15849,10 +15836,16 @@ init_fs_codec(PyInterpreterState *interp)
1584915836

1585015837
PyMem_RawFree(interp->fs_codec.encoding);
1585115838
interp->fs_codec.encoding = encoding;
15839+
/* encoding has been normalized by init_fs_encoding() */
15840+
interp->fs_codec.utf8 = (strcmp(encoding, "utf-8") == 0);
1585215841
PyMem_RawFree(interp->fs_codec.errors);
1585315842
interp->fs_codec.errors = errors;
1585415843
interp->fs_codec.error_handler = error_handler;
1585515844

15845+
#ifdef _Py_FORCE_UTF8_FS_ENCODING
15846+
assert(interp->fs_codec.utf8 == 1);
15847+
#endif
15848+
1585615849
/* At this point, PyUnicode_EncodeFSDefault() and
1585715850
PyUnicode_DecodeFSDefault() can now use the Python codec rather than
1585815851
the C implementation of the filesystem encoding. */
@@ -15902,6 +15895,19 @@ _PyUnicode_InitEncodings(PyThreadState *tstate)
1590215895
}
1590315896

1590415897

15898+
static void
15899+
_PyUnicode_FiniEncodings(PyThreadState *tstate)
15900+
{
15901+
PyInterpreterState *interp = tstate->interp;
15902+
PyMem_RawFree(interp->fs_codec.encoding);
15903+
interp->fs_codec.encoding = NULL;
15904+
interp->fs_codec.utf8 = 0;
15905+
PyMem_RawFree(interp->fs_codec.errors);
15906+
interp->fs_codec.errors = NULL;
15907+
interp->fs_codec.error_handler = _Py_ERROR_UNKNOWN;
15908+
}
15909+
15910+
1590515911
#ifdef MS_WINDOWS
1590615912
int
1590715913
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
@@ -15954,12 +15960,7 @@ _PyUnicode_Fini(PyThreadState *tstate)
1595415960
_PyUnicode_ClearStaticStrings();
1595515961
}
1595615962

15957-
PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15958-
PyMem_RawFree(interp->fs_codec.encoding);
15959-
interp->fs_codec.encoding = NULL;
15960-
PyMem_RawFree(interp->fs_codec.errors);
15961-
interp->fs_codec.errors = NULL;
15962-
interp->config.filesystem_errors = (wchar_t *)_Py_ERROR_UNKNOWN;
15963+
_PyUnicode_FiniEncodings(tstate);
1596315964
}
1596415965

1596515966

0 commit comments

Comments
 (0)
0