diff --git a/Include/ucnhash.h b/Include/ucnhash.h index 45362e997dfa11..9191881aeeda52 100644 --- a/Include/ucnhash.h +++ b/Include/ucnhash.h @@ -27,6 +27,8 @@ typedef struct { int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code, int with_named_seq); + PyObject *module; + } _PyUnicode_Name_CAPI; #ifdef __cplusplus diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-09-07-20-44-12.bpo-1635741.iYVVTS.rst b/Misc/NEWS.d/next/Core and Builtins/2020-09-07-20-44-12.bpo-1635741.iYVVTS.rst new file mode 100644 index 00000000000000..ae26093ae16564 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-09-07-20-44-12.bpo-1635741.iYVVTS.rst @@ -0,0 +1,2 @@ +Port the :mod:`unicodedata` extension module to multi-phase initialization +(:pep:`489`). diff --git a/Modules/clinic/unicodedata.c.h b/Modules/clinic/unicodedata.c.h index 4251db2edc7b96..fc5e674b240fa2 100644 --- a/Modules/clinic/unicodedata.c.h +++ b/Modules/clinic/unicodedata.c.h @@ -13,14 +13,49 @@ PyDoc_STRVAR(unicodedata_UCD_decimal__doc__, "ValueError is raised."); #define UNICODEDATA_UCD_DECIMAL_METHODDEF \ - {"decimal", (PyCFunction)(void(*)(void))unicodedata_UCD_decimal, METH_FASTCALL, unicodedata_UCD_decimal__doc__}, + {"decimal", (PyCFunction)(void(*)(void))unicodedata_UCD_decimal, METH_METHOD|METH_FASTCALL|METH_KEYWORDS, unicodedata_UCD_decimal__doc__}, static PyObject * -unicodedata_UCD_decimal_impl(PyObject *self, int chr, +unicodedata_UCD_decimal_impl(PyObject *self, PyTypeObject *cls, int chr, PyObject *default_value); static PyObject * -unicodedata_UCD_decimal(PyObject *self, PyObject *const *args, Py_ssize_t nargs) +unicodedata_UCD_decimal(PyObject *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + static const char * const _keywords[] = {"", "", NULL}; + static _PyArg_Parser _parser = {"C|O:decimal", _keywords, 0}; + int chr; + PyObject *default_value = NULL; + + if 
(!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, + &chr, &default_value)) { + goto exit; + } + return_value = unicodedata_UCD_decimal_impl(self, cls, chr, default_value); + +exit: + return return_value; +} + +PyDoc_STRVAR(unicodedata_decimal__doc__, +"decimal($module, chr, default=<unrepresentable>, /)\n" +"--\n" +"\n" +"Converts a Unicode character into its equivalent decimal value.\n" +"\n" +"Returns the decimal value assigned to the character chr as integer.\n" +"If no such value is defined, default is returned, or, if not given,\n" +"ValueError is raised."); + +#define UNICODEDATA_DECIMAL_METHODDEF \ + {"decimal", (PyCFunction)(void(*)(void))unicodedata_decimal, METH_FASTCALL, unicodedata_decimal__doc__}, + +static PyObject * +unicodedata_decimal_impl(PyObject *module, int chr, PyObject *default_value); + +static PyObject * +unicodedata_decimal(PyObject *module, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; int chr; @@ -46,7 +81,7 @@ unicodedata_UCD_decimal(PyObject *self, PyObject *const *args, Py_ssize_t nargs) } default_value = args[1]; skip_optional: - return_value = unicodedata_UCD_decimal_impl(self, chr, default_value); + return_value = unicodedata_decimal_impl(module, chr, default_value); exit: return return_value; @@ -112,14 +147,49 @@ PyDoc_STRVAR(unicodedata_UCD_numeric__doc__, "ValueError is raised."); #define UNICODEDATA_UCD_NUMERIC_METHODDEF \ - {"numeric", (PyCFunction)(void(*)(void))unicodedata_UCD_numeric, METH_FASTCALL, unicodedata_UCD_numeric__doc__}, + {"numeric", (PyCFunction)(void(*)(void))unicodedata_UCD_numeric, METH_METHOD|METH_FASTCALL|METH_KEYWORDS, unicodedata_UCD_numeric__doc__}, static PyObject * -unicodedata_UCD_numeric_impl(PyObject *self, int chr, +unicodedata_UCD_numeric_impl(PyObject *self, PyTypeObject *cls, int chr, PyObject *default_value); static PyObject * -unicodedata_UCD_numeric(PyObject *self, PyObject *const *args, Py_ssize_t nargs) +unicodedata_UCD_numeric(PyObject *self, PyTypeObject *cls, 
PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + static const char * const _keywords[] = {"", "", NULL}; + static _PyArg_Parser _parser = {"C|O:numeric", _keywords, 0}; + int chr; + PyObject *default_value = NULL; + + if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, + &chr, &default_value)) { + goto exit; + } + return_value = unicodedata_UCD_numeric_impl(self, cls, chr, default_value); + +exit: + return return_value; +} + +PyDoc_STRVAR(unicodedata_numeric__doc__, +"numeric($module, chr, default=<unrepresentable>, /)\n" +"--\n" +"\n" +"Converts a Unicode character into its equivalent numeric value.\n" +"\n" +"Returns the numeric value assigned to the character chr as float.\n" +"If no such value is defined, default is returned, or, if not given,\n" +"ValueError is raised."); + +#define UNICODEDATA_NUMERIC_METHODDEF \ + {"numeric", (PyCFunction)(void(*)(void))unicodedata_numeric, METH_FASTCALL, unicodedata_numeric__doc__}, + +static PyObject * +unicodedata_numeric_impl(PyObject *module, int chr, PyObject *default_value); + +static PyObject * +unicodedata_numeric(PyObject *module, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; int chr; @@ -145,7 +215,7 @@ unicodedata_UCD_numeric(PyObject *self, PyObject *const *args, Py_ssize_t nargs) } default_value = args[1]; skip_optional: - return_value = unicodedata_UCD_numeric_impl(self, chr, default_value); + return_value = unicodedata_numeric_impl(module, chr, default_value); exit: return return_value; @@ -158,13 +228,43 @@ PyDoc_STRVAR(unicodedata_UCD_category__doc__, "Returns the general category assigned to the character chr as string."); #define UNICODEDATA_UCD_CATEGORY_METHODDEF \ - {"category", (PyCFunction)unicodedata_UCD_category, METH_O, unicodedata_UCD_category__doc__}, + {"category", (PyCFunction)(void(*)(void))unicodedata_UCD_category, METH_METHOD|METH_FASTCALL|METH_KEYWORDS, unicodedata_UCD_category__doc__}, + +static PyObject * 
+unicodedata_UCD_category_impl(PyObject *self, PyTypeObject *cls, int chr); + +static PyObject * +unicodedata_UCD_category(PyObject *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + static const char * const _keywords[] = {"", NULL}; + static _PyArg_Parser _parser = {"C:category", _keywords, 0}; + int chr; + + if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, + &chr)) { + goto exit; + } + return_value = unicodedata_UCD_category_impl(self, cls, chr); + +exit: + return return_value; +} + +PyDoc_STRVAR(unicodedata_category__doc__, +"category($module, chr, /)\n" +"--\n" +"\n" +"Returns the general category assigned to the character chr as string."); + +#define UNICODEDATA_CATEGORY_METHODDEF \ + {"category", (PyCFunction)unicodedata_category, METH_O, unicodedata_category__doc__}, static PyObject * -unicodedata_UCD_category_impl(PyObject *self, int chr); +unicodedata_category_impl(PyObject *module, int chr); static PyObject * -unicodedata_UCD_category(PyObject *self, PyObject *arg) +unicodedata_category(PyObject *module, PyObject *arg) { PyObject *return_value = NULL; int chr; @@ -181,7 +281,7 @@ unicodedata_UCD_category(PyObject *self, PyObject *arg) goto exit; } chr = PyUnicode_READ_CHAR(arg, 0); - return_value = unicodedata_UCD_category_impl(self, chr); + return_value = unicodedata_category_impl(module, chr); exit: return return_value; @@ -196,13 +296,46 @@ PyDoc_STRVAR(unicodedata_UCD_bidirectional__doc__, "If no such value is defined, an empty string is returned."); #define UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF \ - {"bidirectional", (PyCFunction)unicodedata_UCD_bidirectional, METH_O, unicodedata_UCD_bidirectional__doc__}, + {"bidirectional", (PyCFunction)(void(*)(void))unicodedata_UCD_bidirectional, METH_METHOD|METH_FASTCALL|METH_KEYWORDS, unicodedata_UCD_bidirectional__doc__}, static PyObject * -unicodedata_UCD_bidirectional_impl(PyObject *self, int chr); 
+unicodedata_UCD_bidirectional_impl(PyObject *self, PyTypeObject *cls, + int chr); static PyObject * -unicodedata_UCD_bidirectional(PyObject *self, PyObject *arg) +unicodedata_UCD_bidirectional(PyObject *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + static const char * const _keywords[] = {"", NULL}; + static _PyArg_Parser _parser = {"C:bidirectional", _keywords, 0}; + int chr; + + if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, + &chr)) { + goto exit; + } + return_value = unicodedata_UCD_bidirectional_impl(self, cls, chr); + +exit: + return return_value; +} + +PyDoc_STRVAR(unicodedata_bidirectional__doc__, +"bidirectional($module, chr, /)\n" +"--\n" +"\n" +"Returns the bidirectional class assigned to the character chr as string.\n" +"\n" +"If no such value is defined, an empty string is returned."); + +#define UNICODEDATA_BIDIRECTIONAL_METHODDEF \ + {"bidirectional", (PyCFunction)unicodedata_bidirectional, METH_O, unicodedata_bidirectional__doc__}, + +static PyObject * +unicodedata_bidirectional_impl(PyObject *module, int chr); + +static PyObject * +unicodedata_bidirectional(PyObject *module, PyObject *arg) { PyObject *return_value = NULL; int chr; @@ -219,7 +352,7 @@ unicodedata_UCD_bidirectional(PyObject *self, PyObject *arg) goto exit; } chr = PyUnicode_READ_CHAR(arg, 0); - return_value = unicodedata_UCD_bidirectional_impl(self, chr); + return_value = unicodedata_bidirectional_impl(module, chr); exit: return return_value; @@ -234,13 +367,50 @@ PyDoc_STRVAR(unicodedata_UCD_combining__doc__, "Returns 0 if no combining class is defined."); #define UNICODEDATA_UCD_COMBINING_METHODDEF \ - {"combining", (PyCFunction)unicodedata_UCD_combining, METH_O, unicodedata_UCD_combining__doc__}, + {"combining", (PyCFunction)(void(*)(void))unicodedata_UCD_combining, METH_METHOD|METH_FASTCALL|METH_KEYWORDS, unicodedata_UCD_combining__doc__}, + +static int 
+unicodedata_UCD_combining_impl(PyObject *self, PyTypeObject *cls, int chr); + +static PyObject * +unicodedata_UCD_combining(PyObject *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + static const char * const _keywords[] = {"", NULL}; + static _PyArg_Parser _parser = {"C:combining", _keywords, 0}; + int chr; + int _return_value; + + if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, + &chr)) { + goto exit; + } + _return_value = unicodedata_UCD_combining_impl(self, cls, chr); + if ((_return_value == -1) && PyErr_Occurred()) { + goto exit; + } + return_value = PyLong_FromLong((long)_return_value); + +exit: + return return_value; +} + +PyDoc_STRVAR(unicodedata_combining__doc__, +"combining($module, chr, /)\n" +"--\n" +"\n" +"Returns the canonical combining class assigned to the character chr as integer.\n" +"\n" +"Returns 0 if no combining class is defined."); + +#define UNICODEDATA_COMBINING_METHODDEF \ + {"combining", (PyCFunction)unicodedata_combining, METH_O, unicodedata_combining__doc__}, static int -unicodedata_UCD_combining_impl(PyObject *self, int chr); +unicodedata_combining_impl(PyObject *module, int chr); static PyObject * -unicodedata_UCD_combining(PyObject *self, PyObject *arg) +unicodedata_combining(PyObject *module, PyObject *arg) { PyObject *return_value = NULL; int chr; @@ -258,7 +428,7 @@ unicodedata_UCD_combining(PyObject *self, PyObject *arg) goto exit; } chr = PyUnicode_READ_CHAR(arg, 0); - _return_value = unicodedata_UCD_combining_impl(self, chr); + _return_value = unicodedata_combining_impl(module, chr); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -278,13 +448,51 @@ PyDoc_STRVAR(unicodedata_UCD_mirrored__doc__, "character in bidirectional text, 0 otherwise."); #define UNICODEDATA_UCD_MIRRORED_METHODDEF \ - {"mirrored", (PyCFunction)unicodedata_UCD_mirrored, METH_O, unicodedata_UCD_mirrored__doc__}, + {"mirrored", 
(PyCFunction)(void(*)(void))unicodedata_UCD_mirrored, METH_METHOD|METH_FASTCALL|METH_KEYWORDS, unicodedata_UCD_mirrored__doc__}, static int -unicodedata_UCD_mirrored_impl(PyObject *self, int chr); +unicodedata_UCD_mirrored_impl(PyObject *self, PyTypeObject *cls, int chr); static PyObject * -unicodedata_UCD_mirrored(PyObject *self, PyObject *arg) +unicodedata_UCD_mirrored(PyObject *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + static const char * const _keywords[] = {"", NULL}; + static _PyArg_Parser _parser = {"C:mirrored", _keywords, 0}; + int chr; + int _return_value; + + if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, + &chr)) { + goto exit; + } + _return_value = unicodedata_UCD_mirrored_impl(self, cls, chr); + if ((_return_value == -1) && PyErr_Occurred()) { + goto exit; + } + return_value = PyLong_FromLong((long)_return_value); + +exit: + return return_value; +} + +PyDoc_STRVAR(unicodedata_mirrored__doc__, +"mirrored($module, chr, /)\n" +"--\n" +"\n" +"Returns the mirrored property assigned to the character chr as integer.\n" +"\n" +"Returns 1 if the character has been identified as a \"mirrored\"\n" +"character in bidirectional text, 0 otherwise."); + +#define UNICODEDATA_MIRRORED_METHODDEF \ + {"mirrored", (PyCFunction)unicodedata_mirrored, METH_O, unicodedata_mirrored__doc__}, + +static int +unicodedata_mirrored_impl(PyObject *module, int chr); + +static PyObject * +unicodedata_mirrored(PyObject *module, PyObject *arg) { PyObject *return_value = NULL; int chr; @@ -302,7 +510,7 @@ unicodedata_UCD_mirrored(PyObject *self, PyObject *arg) goto exit; } chr = PyUnicode_READ_CHAR(arg, 0); - _return_value = unicodedata_UCD_mirrored_impl(self, chr); + _return_value = unicodedata_mirrored_impl(module, chr); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -319,13 +527,44 @@ PyDoc_STRVAR(unicodedata_UCD_east_asian_width__doc__, "Returns the east asian width 
assigned to the character chr as string."); #define UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF \ - {"east_asian_width", (PyCFunction)unicodedata_UCD_east_asian_width, METH_O, unicodedata_UCD_east_asian_width__doc__}, + {"east_asian_width", (PyCFunction)(void(*)(void))unicodedata_UCD_east_asian_width, METH_METHOD|METH_FASTCALL|METH_KEYWORDS, unicodedata_UCD_east_asian_width__doc__}, + +static PyObject * +unicodedata_UCD_east_asian_width_impl(PyObject *self, PyTypeObject *cls, + int chr); + +static PyObject * +unicodedata_UCD_east_asian_width(PyObject *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + static const char * const _keywords[] = {"", NULL}; + static _PyArg_Parser _parser = {"C:east_asian_width", _keywords, 0}; + int chr; + + if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, + &chr)) { + goto exit; + } + return_value = unicodedata_UCD_east_asian_width_impl(self, cls, chr); + +exit: + return return_value; +} + +PyDoc_STRVAR(unicodedata_east_asian_width__doc__, +"east_asian_width($module, chr, /)\n" +"--\n" +"\n" +"Returns the east asian width assigned to the character chr as string."); + +#define UNICODEDATA_EAST_ASIAN_WIDTH_METHODDEF \ + {"east_asian_width", (PyCFunction)unicodedata_east_asian_width, METH_O, unicodedata_east_asian_width__doc__}, static PyObject * -unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr); +unicodedata_east_asian_width_impl(PyObject *module, int chr); static PyObject * -unicodedata_UCD_east_asian_width(PyObject *self, PyObject *arg) +unicodedata_east_asian_width(PyObject *module, PyObject *arg) { PyObject *return_value = NULL; int chr; @@ -342,7 +581,7 @@ unicodedata_UCD_east_asian_width(PyObject *self, PyObject *arg) goto exit; } chr = PyUnicode_READ_CHAR(arg, 0); - return_value = unicodedata_UCD_east_asian_width_impl(self, chr); + return_value = unicodedata_east_asian_width_impl(module, chr); exit: return return_value; @@ -357,13 
+596,46 @@ PyDoc_STRVAR(unicodedata_UCD_decomposition__doc__, "An empty string is returned in case no such mapping is defined."); #define UNICODEDATA_UCD_DECOMPOSITION_METHODDEF \ - {"decomposition", (PyCFunction)unicodedata_UCD_decomposition, METH_O, unicodedata_UCD_decomposition__doc__}, + {"decomposition", (PyCFunction)(void(*)(void))unicodedata_UCD_decomposition, METH_METHOD|METH_FASTCALL|METH_KEYWORDS, unicodedata_UCD_decomposition__doc__}, + +static PyObject * +unicodedata_UCD_decomposition_impl(PyObject *self, PyTypeObject *cls, + int chr); + +static PyObject * +unicodedata_UCD_decomposition(PyObject *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + static const char * const _keywords[] = {"", NULL}; + static _PyArg_Parser _parser = {"C:decomposition", _keywords, 0}; + int chr; + + if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, + &chr)) { + goto exit; + } + return_value = unicodedata_UCD_decomposition_impl(self, cls, chr); + +exit: + return return_value; +} + +PyDoc_STRVAR(unicodedata_decomposition__doc__, +"decomposition($module, chr, /)\n" +"--\n" +"\n" +"Returns the character decomposition mapping assigned to the character chr as string.\n" +"\n" +"An empty string is returned in case no such mapping is defined."); + +#define UNICODEDATA_DECOMPOSITION_METHODDEF \ + {"decomposition", (PyCFunction)unicodedata_decomposition, METH_O, unicodedata_decomposition__doc__}, static PyObject * -unicodedata_UCD_decomposition_impl(PyObject *self, int chr); +unicodedata_decomposition_impl(PyObject *module, int chr); static PyObject * -unicodedata_UCD_decomposition(PyObject *self, PyObject *arg) +unicodedata_decomposition(PyObject *module, PyObject *arg) { PyObject *return_value = NULL; int chr; @@ -380,7 +652,7 @@ unicodedata_UCD_decomposition(PyObject *self, PyObject *arg) goto exit; } chr = PyUnicode_READ_CHAR(arg, 0); - return_value = unicodedata_UCD_decomposition_impl(self, 
chr); + return_value = unicodedata_decomposition_impl(module, chr); exit: return return_value; @@ -395,14 +667,48 @@ PyDoc_STRVAR(unicodedata_UCD_is_normalized__doc__, "Valid values for form are \'NFC\', \'NFKC\', \'NFD\', and \'NFKD\'."); #define UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF \ - {"is_normalized", (PyCFunction)(void(*)(void))unicodedata_UCD_is_normalized, METH_FASTCALL, unicodedata_UCD_is_normalized__doc__}, + {"is_normalized", (PyCFunction)(void(*)(void))unicodedata_UCD_is_normalized, METH_METHOD|METH_FASTCALL|METH_KEYWORDS, unicodedata_UCD_is_normalized__doc__}, + +static PyObject * +unicodedata_UCD_is_normalized_impl(PyObject *self, PyTypeObject *cls, + PyObject *form, PyObject *input); + +static PyObject * +unicodedata_UCD_is_normalized(PyObject *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + static const char * const _keywords[] = {"", "", NULL}; + static _PyArg_Parser _parser = {"UU:is_normalized", _keywords, 0}; + PyObject *form; + PyObject *input; + + if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, + &form, &input)) { + goto exit; + } + return_value = unicodedata_UCD_is_normalized_impl(self, cls, form, input); + +exit: + return return_value; +} + +PyDoc_STRVAR(unicodedata_is_normalized__doc__, +"is_normalized($module, form, unistr, /)\n" +"--\n" +"\n" +"Return whether the Unicode string unistr is in the normal form \'form\'.\n" +"\n" +"Valid values for form are \'NFC\', \'NFKC\', \'NFD\', and \'NFKD\'."); + +#define UNICODEDATA_IS_NORMALIZED_METHODDEF \ + {"is_normalized", (PyCFunction)(void(*)(void))unicodedata_is_normalized, METH_FASTCALL, unicodedata_is_normalized__doc__}, static PyObject * -unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, - PyObject *input); +unicodedata_is_normalized_impl(PyObject *module, PyObject *form, + PyObject *input); static PyObject * -unicodedata_UCD_is_normalized(PyObject *self, PyObject *const *args, 
Py_ssize_t nargs) +unicodedata_is_normalized(PyObject *module, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; PyObject *form; @@ -427,7 +733,7 @@ unicodedata_UCD_is_normalized(PyObject *self, PyObject *const *args, Py_ssize_t goto exit; } input = args[1]; - return_value = unicodedata_UCD_is_normalized_impl(self, form, input); + return_value = unicodedata_is_normalized_impl(module, form, input); exit: return return_value; @@ -442,14 +748,47 @@ PyDoc_STRVAR(unicodedata_UCD_normalize__doc__, "Valid values for form are \'NFC\', \'NFKC\', \'NFD\', and \'NFKD\'."); #define UNICODEDATA_UCD_NORMALIZE_METHODDEF \ - {"normalize", (PyCFunction)(void(*)(void))unicodedata_UCD_normalize, METH_FASTCALL, unicodedata_UCD_normalize__doc__}, + {"normalize", (PyCFunction)(void(*)(void))unicodedata_UCD_normalize, METH_METHOD|METH_FASTCALL|METH_KEYWORDS, unicodedata_UCD_normalize__doc__}, static PyObject * -unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, - PyObject *input); +unicodedata_UCD_normalize_impl(PyObject *self, PyTypeObject *cls, + PyObject *form, PyObject *input); static PyObject * -unicodedata_UCD_normalize(PyObject *self, PyObject *const *args, Py_ssize_t nargs) +unicodedata_UCD_normalize(PyObject *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + static const char * const _keywords[] = {"", "", NULL}; + static _PyArg_Parser _parser = {"UU:normalize", _keywords, 0}; + PyObject *form; + PyObject *input; + + if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, + &form, &input)) { + goto exit; + } + return_value = unicodedata_UCD_normalize_impl(self, cls, form, input); + +exit: + return return_value; +} + +PyDoc_STRVAR(unicodedata_normalize__doc__, +"normalize($module, form, unistr, /)\n" +"--\n" +"\n" +"Return the normal form \'form\' for the Unicode string unistr.\n" +"\n" +"Valid values for form are \'NFC\', \'NFKC\', \'NFD\', and \'NFKD\'."); + 
+#define UNICODEDATA_NORMALIZE_METHODDEF \ + {"normalize", (PyCFunction)(void(*)(void))unicodedata_normalize, METH_FASTCALL, unicodedata_normalize__doc__}, + +static PyObject * +unicodedata_normalize_impl(PyObject *module, PyObject *form, PyObject *input); + +static PyObject * +unicodedata_normalize(PyObject *module, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; PyObject *form; @@ -474,7 +813,7 @@ unicodedata_UCD_normalize(PyObject *self, PyObject *const *args, Py_ssize_t narg goto exit; } input = args[1]; - return_value = unicodedata_UCD_normalize_impl(self, form, input); + return_value = unicodedata_normalize_impl(module, form, input); exit: return return_value; @@ -490,13 +829,48 @@ PyDoc_STRVAR(unicodedata_UCD_name__doc__, "ValueError is raised."); #define UNICODEDATA_UCD_NAME_METHODDEF \ - {"name", (PyCFunction)(void(*)(void))unicodedata_UCD_name, METH_FASTCALL, unicodedata_UCD_name__doc__}, + {"name", (PyCFunction)(void(*)(void))unicodedata_UCD_name, METH_METHOD|METH_FASTCALL|METH_KEYWORDS, unicodedata_UCD_name__doc__}, + +static PyObject * +unicodedata_UCD_name_impl(PyObject *self, PyTypeObject *cls, int chr, + PyObject *default_value); + +static PyObject * +unicodedata_UCD_name(PyObject *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + static const char * const _keywords[] = {"", "", NULL}; + static _PyArg_Parser _parser = {"C|O:name", _keywords, 0}; + int chr; + PyObject *default_value = NULL; + + if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, + &chr, &default_value)) { + goto exit; + } + return_value = unicodedata_UCD_name_impl(self, cls, chr, default_value); + +exit: + return return_value; +} + +PyDoc_STRVAR(unicodedata_name__doc__, +"name($module, chr, default=<unrepresentable>, /)\n" +"--\n" +"\n" +"Returns the name assigned to the character chr as a string.\n" +"\n" +"If no name is defined, default is returned, or, if not given,\n" +"ValueError is 
raised."); + +#define UNICODEDATA_NAME_METHODDEF \ + {"name", (PyCFunction)(void(*)(void))unicodedata_name, METH_FASTCALL, unicodedata_name__doc__}, static PyObject * -unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value); +unicodedata_name_impl(PyObject *module, int chr, PyObject *default_value); static PyObject * -unicodedata_UCD_name(PyObject *self, PyObject *const *args, Py_ssize_t nargs) +unicodedata_name(PyObject *module, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; int chr; @@ -522,7 +896,7 @@ unicodedata_UCD_name(PyObject *self, PyObject *const *args, Py_ssize_t nargs) } default_value = args[1]; skip_optional: - return_value = unicodedata_UCD_name_impl(self, chr, default_value); + return_value = unicodedata_name_impl(module, chr, default_value); exit: return return_value; @@ -538,14 +912,49 @@ PyDoc_STRVAR(unicodedata_UCD_lookup__doc__, "corresponding character. If not found, KeyError is raised."); #define UNICODEDATA_UCD_LOOKUP_METHODDEF \ - {"lookup", (PyCFunction)unicodedata_UCD_lookup, METH_O, unicodedata_UCD_lookup__doc__}, + {"lookup", (PyCFunction)(void(*)(void))unicodedata_UCD_lookup, METH_METHOD|METH_FASTCALL|METH_KEYWORDS, unicodedata_UCD_lookup__doc__}, + +static PyObject * +unicodedata_UCD_lookup_impl(PyObject *self, PyTypeObject *cls, + const char *name, Py_ssize_clean_t name_length); + +static PyObject * +unicodedata_UCD_lookup(PyObject *self, PyTypeObject *cls, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + static const char * const _keywords[] = {"", NULL}; + static _PyArg_Parser _parser = {"s#:lookup", _keywords, 0}; + const char *name; + Py_ssize_clean_t name_length; + + if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser, + &name, &name_length)) { + goto exit; + } + return_value = unicodedata_UCD_lookup_impl(self, cls, name, name_length); + +exit: + return return_value; +} + +PyDoc_STRVAR(unicodedata_lookup__doc__, 
+"lookup($module, name, /)\n" +"--\n" +"\n" +"Look up character by name.\n" +"\n" +"If a character with the given name is found, return the\n" +"corresponding character. If not found, KeyError is raised."); + +#define UNICODEDATA_LOOKUP_METHODDEF \ + {"lookup", (PyCFunction)unicodedata_lookup, METH_O, unicodedata_lookup__doc__}, static PyObject * -unicodedata_UCD_lookup_impl(PyObject *self, const char *name, - Py_ssize_clean_t name_length); +unicodedata_lookup_impl(PyObject *module, const char *name, + Py_ssize_clean_t name_length); static PyObject * -unicodedata_UCD_lookup(PyObject *self, PyObject *arg) +unicodedata_lookup(PyObject *module, PyObject *arg) { PyObject *return_value = NULL; const char *name; @@ -554,9 +963,9 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg) if (!PyArg_Parse(arg, "s#:lookup", &name, &name_length)) { goto exit; } - return_value = unicodedata_UCD_lookup_impl(self, name, name_length); + return_value = unicodedata_lookup_impl(module, name, name_length); exit: return return_value; } -/*[clinic end generated code: output=10c23477dbe8a202 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=33a45f2827e2d12f input=a9049054013a1b77]*/ diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 8a1198a2b712d9..ff140b9ea519fd 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -26,6 +26,19 @@ _Py_IDENTIFIER(NFD); _Py_IDENTIFIER(NFKC); _Py_IDENTIFIER(NFKD); +typedef struct { + PyTypeObject *ucd_type; + _PyUnicode_Name_CAPI capsule_api; +} unicodedata_state; + +static inline unicodedata_state* +unicodedata_get_state(PyObject *module) +{ + void *state = PyModule_GetState(module); + assert(state != NULL); + return (unicodedata_state *)state; +} + /*[clinic input] module unicodedata class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type' @@ -91,51 +104,33 @@ static PyMemberDef DB_members[] = { }; /* forward declaration */ -static PyTypeObject UCD_Type; -#define UCD_Check(o) Py_IS_TYPE(o, &UCD_Type) +#define 
UCD_Check(state, o) Py_IS_TYPE(o, state->ucd_type) static PyObject* -new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4), +new_previous_version(unicodedata_state *state, const char*name, + const change_record* (*getrecord)(Py_UCS4), Py_UCS4 (*normalization)(Py_UCS4)) { - PreviousDBVersion *self; - self = PyObject_New(PreviousDBVersion, &UCD_Type); - if (self == NULL) - return NULL; - self->name = name; - self->getrecord = getrecord; - self->normalization = normalization; - return (PyObject*)self; + PreviousDBVersion *self; + self = PyObject_New(PreviousDBVersion, state->ucd_type); + if (self == NULL) { + return NULL; + } + self->name = name; + self->getrecord = getrecord; + self->normalization = normalization; + return (PyObject*)self; } - -/* --- Module API --------------------------------------------------------- */ - -/*[clinic input] -unicodedata.UCD.decimal - - self: self - chr: int(accept={str}) - default: object=NULL - / - -Converts a Unicode character into its equivalent decimal value. - -Returns the decimal value assigned to the character chr as integer. -If no such value is defined, default is returned, or, if not given, -ValueError is raised. 
-[clinic start generated code]*/ - static PyObject * -unicodedata_UCD_decimal_impl(PyObject *self, int chr, - PyObject *default_value) -/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/ +unicodedata_UCD_decimal_internal(PyObject *self, unicodedata_state *state, + int chr, PyObject *default_value) { int have_old = 0; long rc; Py_UCS4 c = (Py_UCS4)chr; - if (self && UCD_Check(self)) { + if (self && UCD_Check(state, self)) { const change_record *old = get_old_record(self, c); if (old->category_changed == 0) { /* unassigned */ @@ -148,8 +143,9 @@ unicodedata_UCD_decimal_impl(PyObject *self, int chr, } } - if (!have_old) + if (!have_old) { rc = Py_UNICODE_TODECIMAL(c); + } if (rc < 0) { if (default_value == NULL) { PyErr_SetString(PyExc_ValueError, @@ -164,6 +160,231 @@ unicodedata_UCD_decimal_impl(PyObject *self, int chr, return PyLong_FromLong(rc); } +static PyObject * +unicodedata_UCD_numeric_internal(PyObject *self, unicodedata_state *state, int chr, + PyObject *default_value) +{ + int have_old = 0; + double rc; + Py_UCS4 c = (Py_UCS4)chr; + + if (self && UCD_Check(state, self)) { + const change_record *old = get_old_record(self, c); + if (old->category_changed == 0) { + /* unassigned */ + have_old = 1; + rc = -1.0; + } + else if (old->decimal_changed != 0xFF) { + have_old = 1; + rc = old->decimal_changed; + } + } + + if (!have_old) { + rc = Py_UNICODE_TONUMERIC(c); + } + if (rc == -1.0) { + if (default_value == NULL) { + PyErr_SetString(PyExc_ValueError, "not a numeric character"); + return NULL; + } + else { + Py_INCREF(default_value); + return default_value; + } + } + return PyFloat_FromDouble(rc); +} + +static PyObject * +unicodedata_UCD_category_internal(PyObject *self, unicodedata_state *state, int chr) +{ + int index; + Py_UCS4 c = (Py_UCS4)chr; + index = (int) _getrecord_ex(c)->category; + + if (self && UCD_Check(state, self)) { + const change_record *old = get_old_record(self, c); + if (old->category_changed != 0xFF) { + index 
= old->category_changed; + } + } + return PyUnicode_FromString(_PyUnicode_CategoryNames[index]); +} + +static PyObject * +unicodedata_UCD_bidirectional_internal(PyObject *self, unicodedata_state *state, + int chr) +{ + int index; + Py_UCS4 c = (Py_UCS4)chr; + index = (int) _getrecord_ex(c)->bidirectional; + + if (self && UCD_Check(state, self)) { + const change_record *old = get_old_record(self, c); + if (old->category_changed == 0) + index = 0; /* unassigned */ + else if (old->bidir_changed != 0xFF) + index = old->bidir_changed; + } + return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]); +} + +static int +unicodedata_UCD_combining_internal(PyObject *self, unicodedata_state *state, int chr) +{ + int index; + Py_UCS4 c = (Py_UCS4)chr; + index = (int) _getrecord_ex(c)->combining; + + if (self && UCD_Check(state, self)) { + const change_record *old = get_old_record(self, c); + if (old->category_changed == 0) + index = 0; /* unassigned */ + } + return index; +} + +static int +unicodedata_UCD_mirrored_internal(PyObject *self, unicodedata_state *state, int chr) +{ + Py_UCS4 c = (Py_UCS4)chr; + int index = (int) _getrecord_ex(c)->mirrored; + + if (self && UCD_Check(state, self)) { + const change_record *old = get_old_record(self, c); + if (old->category_changed == 0) { + index = 0; /* unassigned */ + } + else if (old->mirrored_changed != 0xFF) { + index = old->mirrored_changed; + } + } + return index; +} + +static PyObject * +unicodedata_UCD_east_asian_width_internal(PyObject *self, unicodedata_state *state, + int chr) +{ + int index; + Py_UCS4 c = (Py_UCS4)chr; + index = (int) _getrecord_ex(c)->east_asian_width; + + if (self && UCD_Check(state, self)) { + const change_record *old = get_old_record(self, c); + if (old->category_changed == 0) + index = 0; /* unassigned */ + else if (old->east_asian_width_changed != 0xFF) + index = old->east_asian_width_changed; + } + return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]); +} + +static PyObject * 
+unicodedata_UCD_decomposition_internal(PyObject *self, + unicodedata_state *state, int chr) +{ + char decomp[256]; + int code, index, count; + size_t i; + unsigned int prefix_index; + Py_UCS4 c = (Py_UCS4)chr; + + code = (int)c; + + if (self && UCD_Check(state, self)) { + const change_record *old = get_old_record(self, c); + if (old->category_changed == 0) + return PyUnicode_FromString(""); /* unassigned */ + } + + if (code < 0 || code >= 0x110000) + index = 0; + else { + index = decomp_index1[(code>>DECOMP_SHIFT)]; + index = decomp_index2[(index<> 8; + + /* XXX: could allocate the PyString up front instead + (strlen(prefix) + 5 * count + 1 bytes) */ + + /* Based on how index is calculated above and decomp_data is generated + from Tools/unicode/makeunicodedata.py, it should not be possible + to overflow decomp_prefix. */ + prefix_index = decomp_data[index] & 255; + assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix)); + + /* copy prefix */ + i = strlen(decomp_prefix[prefix_index]); + memcpy(decomp, decomp_prefix[prefix_index], i); + + while (count-- > 0) { + if (i) + decomp[i++] = ' '; + assert(i < sizeof(decomp)); + PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X", + decomp_data[++index]); + i += strlen(decomp + i); + } + return PyUnicode_FromStringAndSize(decomp, i); +} + +/* --- Module/UCD API ----------------------------------------------------- */ + +/*[clinic input] +unicodedata.UCD.decimal + + self: self + cls: defining_class + chr: int(accept={str}) + default: object=NULL + / + +Converts a Unicode character into its equivalent decimal value. + +Returns the decimal value assigned to the character chr as integer. +If no such value is defined, default is returned, or, if not given, +ValueError is raised. 
+[clinic start generated code]*/ + +static PyObject * +unicodedata_UCD_decimal_impl(PyObject *self, PyTypeObject *cls, int chr, + PyObject *default_value) +/*[clinic end generated code: output=21beac933d111d6c input=6ef987e727b5751f]*/ +{ + unicodedata_state *state = PyType_GetModuleState(cls); + return unicodedata_UCD_decimal_internal(self, state, chr, default_value); +} + +/*[clinic input] +unicodedata.decimal + + chr: int(accept={str}) + default: object=NULL + / + +Converts a Unicode character into its equivalent decimal value. + +Returns the decimal value assigned to the character chr as integer. +If no such value is defined, default is returned, or, if not given, +ValueError is raised. +[clinic start generated code]*/ + +static PyObject * +unicodedata_decimal_impl(PyObject *module, int chr, PyObject *default_value) +/*[clinic end generated code: output=f2757a863feb4d17 input=3170514558644556]*/ +{ + unicodedata_state *state = unicodedata_get_state(module); + return unicodedata_UCD_decimal_internal(module, state, chr, default_value); +} + /*[clinic input] unicodedata.UCD.digit @@ -203,6 +424,7 @@ unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value) unicodedata.UCD.numeric self: self + cls: defining_class chr: int(accept={str}) default: object=NULL / @@ -215,46 +437,41 @@ ValueError is raised. 
[clinic start generated code]*/ static PyObject * -unicodedata_UCD_numeric_impl(PyObject *self, int chr, +unicodedata_UCD_numeric_impl(PyObject *self, PyTypeObject *cls, int chr, PyObject *default_value) -/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/ +/*[clinic end generated code: output=7ce9a0ce2bcc3896 input=bee4424dc4286ca6]*/ { - int have_old = 0; - double rc; - Py_UCS4 c = (Py_UCS4)chr; + unicodedata_state *state = PyType_GetModuleState(cls); + return unicodedata_UCD_numeric_internal(self, state, chr, default_value); +} - if (self && UCD_Check(self)) { - const change_record *old = get_old_record(self, c); - if (old->category_changed == 0) { - /* unassigned */ - have_old = 1; - rc = -1.0; - } - else if (old->decimal_changed != 0xFF) { - have_old = 1; - rc = old->decimal_changed; - } - } +/*[clinic input] +unicodedata.numeric - if (!have_old) - rc = Py_UNICODE_TONUMERIC(c); - if (rc == -1.0) { - if (default_value == NULL) { - PyErr_SetString(PyExc_ValueError, "not a numeric character"); - return NULL; - } - else { - Py_INCREF(default_value); - return default_value; - } - } - return PyFloat_FromDouble(rc); + chr: int(accept={str}) + default: object=NULL + / + +Converts a Unicode character into its equivalent numeric value. + +Returns the numeric value assigned to the character chr as float. +If no such value is defined, default is returned, or, if not given, +ValueError is raised. +[clinic start generated code]*/ + +static PyObject * +unicodedata_numeric_impl(PyObject *module, int chr, PyObject *default_value) +/*[clinic end generated code: output=538e938e0facd290 input=aec71384740357d1]*/ +{ + unicodedata_state *state = unicodedata_get_state(module); + return unicodedata_UCD_numeric_internal(module, state, chr, default_value); } /*[clinic input] unicodedata.UCD.category self: self + cls: defining_class chr: int(accept={str}) / @@ -262,24 +479,35 @@ Returns the general category assigned to the character chr as string. 
[clinic start generated code]*/ static PyObject * -unicodedata_UCD_category_impl(PyObject *self, int chr) -/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/ +unicodedata_UCD_category_impl(PyObject *self, PyTypeObject *cls, int chr) +/*[clinic end generated code: output=c1fdfb5f4d7aa1ad input=3e6bb75b6cea115e]*/ { - int index; - Py_UCS4 c = (Py_UCS4)chr; - index = (int) _getrecord_ex(c)->category; - if (self && UCD_Check(self)) { - const change_record *old = get_old_record(self, c); - if (old->category_changed != 0xFF) - index = old->category_changed; - } - return PyUnicode_FromString(_PyUnicode_CategoryNames[index]); + unicodedata_state *state = PyType_GetModuleState(cls); + return unicodedata_UCD_category_internal(self, state, chr); +} + +/*[clinic input] +unicodedata.category + + chr: int(accept={str}) + / + +Returns the general category assigned to the character chr as string. +[clinic start generated code]*/ + +static PyObject * +unicodedata_category_impl(PyObject *module, int chr) +/*[clinic end generated code: output=24feeb93a07837e8 input=3cf6a92fc05dd6af]*/ +{ + unicodedata_state *state = unicodedata_get_state(module); + return unicodedata_UCD_category_internal(module, state, chr); } /*[clinic input] unicodedata.UCD.bidirectional self: self + cls: defining_class chr: int(accept={str}) / @@ -289,26 +517,38 @@ If no such value is defined, an empty string is returned. 
[clinic start generated code]*/ static PyObject * -unicodedata_UCD_bidirectional_impl(PyObject *self, int chr) -/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/ +unicodedata_UCD_bidirectional_impl(PyObject *self, PyTypeObject *cls, + int chr) +/*[clinic end generated code: output=b4cd9a1cb78fe1cb input=4184a5fc83d31de7]*/ { - int index; - Py_UCS4 c = (Py_UCS4)chr; - index = (int) _getrecord_ex(c)->bidirectional; - if (self && UCD_Check(self)) { - const change_record *old = get_old_record(self, c); - if (old->category_changed == 0) - index = 0; /* unassigned */ - else if (old->bidir_changed != 0xFF) - index = old->bidir_changed; - } - return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]); + unicodedata_state *state = PyType_GetModuleState(cls); + return unicodedata_UCD_bidirectional_internal(self, state, chr); +} + +/*[clinic input] +unicodedata.bidirectional + + chr: int(accept={str}) + / + +Returns the bidirectional class assigned to the character chr as string. + +If no such value is defined, an empty string is returned. +[clinic start generated code]*/ + +static PyObject * +unicodedata_bidirectional_impl(PyObject *module, int chr) +/*[clinic end generated code: output=81effc4e7de8afc3 input=63d0e38342ad789d]*/ +{ + unicodedata_state *state = unicodedata_get_state(module); + return unicodedata_UCD_bidirectional_internal(module, state, chr); } /*[clinic input] unicodedata.UCD.combining -> int self: self + cls: defining_class chr: int(accept={str}) / @@ -318,24 +558,37 @@ Returns 0 if no combining class is defined. 
[clinic start generated code]*/ static int -unicodedata_UCD_combining_impl(PyObject *self, int chr) -/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/ +unicodedata_UCD_combining_impl(PyObject *self, PyTypeObject *cls, int chr) +/*[clinic end generated code: output=43b2352436ced66c input=f6eda9295714f1e0]*/ { - int index; - Py_UCS4 c = (Py_UCS4)chr; - index = (int) _getrecord_ex(c)->combining; - if (self && UCD_Check(self)) { - const change_record *old = get_old_record(self, c); - if (old->category_changed == 0) - index = 0; /* unassigned */ - } - return index; + unicodedata_state *state = PyType_GetModuleState(cls); + return unicodedata_UCD_combining_internal(self, state, chr); +} + +/*[clinic input] +unicodedata.combining -> int + + chr: int(accept={str}) + / + +Returns the canonical combining class assigned to the character chr as integer. + +Returns 0 if no combining class is defined. +[clinic start generated code]*/ + +static int +unicodedata_combining_impl(PyObject *module, int chr) +/*[clinic end generated code: output=501777a9e7f48488 input=65d73ad166e024a6]*/ +{ + unicodedata_state *state = unicodedata_get_state(module); + return unicodedata_UCD_combining_internal(module, state, chr); } /*[clinic input] unicodedata.UCD.mirrored -> int self: self + cls: defining_class chr: int(accept={str}) / @@ -346,26 +599,38 @@ character in bidirectional text, 0 otherwise. 
[clinic start generated code]*/ static int -unicodedata_UCD_mirrored_impl(PyObject *self, int chr) -/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/ +unicodedata_UCD_mirrored_impl(PyObject *self, PyTypeObject *cls, int chr) +/*[clinic end generated code: output=c8ff22143c8548b1 input=d84dcae1be10c642]*/ { - int index; - Py_UCS4 c = (Py_UCS4)chr; - index = (int) _getrecord_ex(c)->mirrored; - if (self && UCD_Check(self)) { - const change_record *old = get_old_record(self, c); - if (old->category_changed == 0) - index = 0; /* unassigned */ - else if (old->mirrored_changed != 0xFF) - index = old->mirrored_changed; - } - return index; + unicodedata_state *state = PyType_GetModuleState(cls); + return unicodedata_UCD_mirrored_internal(self, state, chr); +} + +/*[clinic input] +unicodedata.mirrored -> int + + chr: int(accept={str}) + / + +Returns the mirrored property assigned to the character chr as integer. + +Returns 1 if the character has been identified as a "mirrored" +character in bidirectional text, 0 otherwise. +[clinic start generated code]*/ + +static int +unicodedata_mirrored_impl(PyObject *module, int chr) +/*[clinic end generated code: output=ffcfa6e780a8149f input=b33633d27be8d34c]*/ +{ + unicodedata_state *state = unicodedata_get_state(module); + return unicodedata_UCD_mirrored_internal(module, state, chr); } /*[clinic input] unicodedata.UCD.east_asian_width self: self + cls: defining_class chr: int(accept={str}) / @@ -373,26 +638,36 @@ Returns the east asian width assigned to the character chr as string. 
[clinic start generated code]*/ static PyObject * -unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr) -/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/ +unicodedata_UCD_east_asian_width_impl(PyObject *self, PyTypeObject *cls, + int chr) +/*[clinic end generated code: output=3817138a13d41ea2 input=b988424574ea3a4f]*/ { - int index; - Py_UCS4 c = (Py_UCS4)chr; - index = (int) _getrecord_ex(c)->east_asian_width; - if (self && UCD_Check(self)) { - const change_record *old = get_old_record(self, c); - if (old->category_changed == 0) - index = 0; /* unassigned */ - else if (old->east_asian_width_changed != 0xFF) - index = old->east_asian_width_changed; - } - return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]); + unicodedata_state *state = PyType_GetModuleState(cls); + return unicodedata_UCD_east_asian_width_internal(self, state, chr); +} + +/*[clinic input] +unicodedata.east_asian_width + + chr: int(accept={str}) + / + +Returns the east asian width assigned to the character chr as string. +[clinic start generated code]*/ + +static PyObject * +unicodedata_east_asian_width_impl(PyObject *module, int chr) +/*[clinic end generated code: output=5cd4d8eb5ce35c2e input=c157026634b52b79]*/ +{ + unicodedata_state *state = unicodedata_get_state(module); + return unicodedata_UCD_east_asian_width_internal(module, state, chr); } /*[clinic input] unicodedata.UCD.decomposition self: self + cls: defining_class chr: int(accept={str}) / @@ -402,65 +677,41 @@ An empty string is returned in case no such mapping is defined. 
[clinic start generated code]*/ static PyObject * -unicodedata_UCD_decomposition_impl(PyObject *self, int chr) -/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/ +unicodedata_UCD_decomposition_impl(PyObject *self, PyTypeObject *cls, + int chr) +/*[clinic end generated code: output=2b57177cbf535413 input=21fb28e1b0862856]*/ { - char decomp[256]; - int code, index, count; - size_t i; - unsigned int prefix_index; - Py_UCS4 c = (Py_UCS4)chr; - - code = (int)c; - - if (self && UCD_Check(self)) { - const change_record *old = get_old_record(self, c); - if (old->category_changed == 0) - return PyUnicode_FromString(""); /* unassigned */ - } - - if (code < 0 || code >= 0x110000) - index = 0; - else { - index = decomp_index1[(code>>DECOMP_SHIFT)]; - index = decomp_index2[(index<> 8; +/*[clinic input] +unicodedata.decomposition - /* XXX: could allocate the PyString up front instead - (strlen(prefix) + 5 * count + 1 bytes) */ + chr: int(accept={str}) + / - /* Based on how index is calculated above and decomp_data is generated - from Tools/unicode/makeunicodedata.py, it should not be possible - to overflow decomp_prefix. */ - prefix_index = decomp_data[index] & 255; - assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix)); +Returns the character decomposition mapping assigned to the character chr as string. - /* copy prefix */ - i = strlen(decomp_prefix[prefix_index]); - memcpy(decomp, decomp_prefix[prefix_index], i); +An empty string is returned in case no such mapping is defined. 
+[clinic start generated code]*/ - while (count-- > 0) { - if (i) - decomp[i++] = ' '; - assert(i < sizeof(decomp)); - PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X", - decomp_data[++index]); - i += strlen(decomp + i); - } - return PyUnicode_FromStringAndSize(decomp, i); +static PyObject * +unicodedata_decomposition_impl(PyObject *module, int chr) +/*[clinic end generated code: output=7c0251de3485e5cf input=5718e7a4543bdec5]*/ +{ + unicodedata_state *state = unicodedata_get_state(module); + return unicodedata_UCD_decomposition_internal(module, state, chr); } static void -get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count) +get_decomp_record(unicodedata_state *state, PyObject *self, Py_UCS4 code, + int *index, int *prefix, int *count) { if (code >= 0x110000) { *index = 0; - } else if (self && UCD_Check(self) && + } + else if (self && UCD_Check(state, self) && get_old_record(self, code)->category_changed==0) { /* unassigned in old version */ *index = 0; @@ -490,7 +741,7 @@ get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *co #define SCount (LCount*NCount) static PyObject* -nfd_nfkd(PyObject *self, PyObject *input, int k) +nfd_nfkd(unicodedata_state *state, PyObject *self, PyObject *input, int k) { PyObject *result; Py_UCS4 *output; @@ -558,7 +809,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) continue; } /* normalization changes */ - if (self && UCD_Check(self)) { + if (self && UCD_Check(state, self)) { Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code); if (value != 0) { stack[stackptr++] = value; @@ -567,7 +818,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) } /* Other decompositions. */ - get_decomp_record(self, code, &index, &prefix, &count); + get_decomp_record(state, self, code, &index, &prefix, &count); /* Copy character if it is not decomposable, or has a compatibility decomposition, but we do NFD. 
*/ @@ -639,7 +890,7 @@ find_nfc_index(const struct reindex* nfc, Py_UCS4 code) } static PyObject* -nfc_nfkc(PyObject *self, PyObject *input, int k) +nfc_nfkc(unicodedata_state *state, PyObject *self, PyObject *input, int k) { PyObject *result; int kind; @@ -651,7 +902,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) Py_ssize_t skipped[20]; int cskipped = 0; - result = nfd_nfkd(self, input, k); + result = nfd_nfkd(state, self, input, k); if (!result) return NULL; /* result will be "ready". */ @@ -794,12 +1045,12 @@ typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult; * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms */ static QuickcheckResult -is_normalized_quickcheck(PyObject *self, PyObject *input, - bool nfc, bool k, bool yes_only) +is_normalized_quickcheck(unicodedata_state *state, PyObject *self, + PyObject *input, bool nfc, bool k, bool yes_only) { /* An older version of the database is requested, quickchecks must be disabled. */ - if (self && UCD_Check(self)) + if (self && UCD_Check(state, self)) return NO; Py_ssize_t i, len; @@ -841,23 +1092,10 @@ is_normalized_quickcheck(PyObject *self, PyObject *input, return result; } -/*[clinic input] -unicodedata.UCD.is_normalized - - self: self - form: unicode - unistr as input: unicode - / - -Return whether the Unicode string unistr is in the normal form 'form'. - -Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. 
-[clinic start generated code]*/ - static PyObject * -unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, - PyObject *input) -/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/ +unicodedata_UCD_is_normalized_internal(PyObject *self, + unicodedata_state *state, + PyObject *form, PyObject *input) { if (PyUnicode_READY(input) == -1) { return NULL; @@ -894,10 +1132,10 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, return NULL; } - m = is_normalized_quickcheck(self, input, nfc, k, false); + m = is_normalized_quickcheck(state, self, input, nfc, k, false); if (m == MAYBE) { - cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k); + cmp = (nfc ? nfc_nfkc : nfd_nfkd)(state, self, input, k); if (cmp == NULL) { return NULL; } @@ -913,24 +1151,9 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, return result; } - -/*[clinic input] -unicodedata.UCD.normalize - - self: self - form: unicode - unistr as input: unicode - / - -Return the normal form 'form' for the Unicode string unistr. - -Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. 
-[clinic start generated code]*/ - static PyObject * -unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, - PyObject *input) -/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/ +unicodedata_UCD_normalize_internal(PyObject *self, unicodedata_state *state, + PyObject *form, PyObject *input) { if (PyUnicode_GET_LENGTH(input) == 0) { /* Special case empty input strings, since resizing @@ -940,37 +1163,126 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { - if (is_normalized_quickcheck(self, input, true, false, true) == YES) { + if (is_normalized_quickcheck(state, self, input, true, false, true) == YES) { Py_INCREF(input); return input; } - return nfc_nfkc(self, input, 0); + return nfc_nfkc(state, self, input, 0); } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { - if (is_normalized_quickcheck(self, input, true, true, true) == YES) { + if (is_normalized_quickcheck(state, self, input, true, true, true) == YES) { Py_INCREF(input); return input; } - return nfc_nfkc(self, input, 1); + return nfc_nfkc(state, self, input, 1); } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { - if (is_normalized_quickcheck(self, input, false, false, true) == YES) { + if (is_normalized_quickcheck(state, self, input, false, false, true) == YES) { Py_INCREF(input); return input; } - return nfd_nfkd(self, input, 0); + return nfd_nfkd(state, self, input, 0); } if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { - if (is_normalized_quickcheck(self, input, false, true, true) == YES) { + if (is_normalized_quickcheck(state, self, input, false, true, true) == YES) { Py_INCREF(input); return input; } - return nfd_nfkd(self, input, 1); + return nfd_nfkd(state, self, input, 1); } PyErr_SetString(PyExc_ValueError, "invalid normalization form"); return NULL; } + +/*[clinic input] +unicodedata.UCD.is_normalized + + self: self + cls: defining_class + form: unicode + unistr as input: unicode + / + 
+Return whether the Unicode string unistr is in the normal form 'form'. + +Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. +[clinic start generated code]*/ + +static PyObject * +unicodedata_UCD_is_normalized_impl(PyObject *self, PyTypeObject *cls, + PyObject *form, PyObject *input) +/*[clinic end generated code: output=dfb6c41b51532d04 input=44afbe160a4acc8b]*/ +{ + unicodedata_state *state = PyType_GetModuleState(cls); + return unicodedata_UCD_is_normalized_internal(self, state, form, input); +} + +/*[clinic input] +unicodedata.is_normalized + + form: unicode + unistr as input: unicode + / + +Return whether the Unicode string unistr is in the normal form 'form'. + +Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. +[clinic start generated code]*/ + +static PyObject * +unicodedata_is_normalized_impl(PyObject *module, PyObject *form, + PyObject *input) +/*[clinic end generated code: output=714e732c01f43e1f input=f25094e2feebba3a]*/ +{ + unicodedata_state *state = unicodedata_get_state(module); + return unicodedata_UCD_is_normalized_internal(module, state, form, input); +} + +/*[clinic input] +unicodedata.UCD.normalize + + self: self + cls: defining_class + form: unicode + unistr as input: unicode + / + +Return the normal form 'form' for the Unicode string unistr. + +Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. +[clinic start generated code]*/ + +static PyObject * +unicodedata_UCD_normalize_impl(PyObject *self, PyTypeObject *cls, + PyObject *form, PyObject *input) +/*[clinic end generated code: output=24303e3f127c20ff input=34ff7064912867c8]*/ +{ + unicodedata_state *state = PyType_GetModuleState(cls); + return unicodedata_UCD_normalize_internal(self, state, form, input); +} + +/*[clinic input] +unicodedata.normalize + + form: unicode + unistr as input: unicode + / + +Return the normal form 'form' for the Unicode string unistr. + +Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. 
+[clinic start generated code]*/ + +static PyObject * +unicodedata_normalize_impl(PyObject *module, PyObject *form, PyObject *input) +/*[clinic end generated code: output=f4f1011395e64a7a input=eadfbef9a7ddb2ed]*/ +{ + unicodedata_state *state = unicodedata_get_state(module); + return unicodedata_UCD_normalize_internal(module, state, form, input); +} + + /* -------------------------------------------------------------------- */ /* unicode character name tables */ @@ -1048,8 +1360,8 @@ is_unified_ideograph(Py_UCS4 code) (cp < named_sequences_end)) static int -_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen, - int with_alias_and_seq) +_getucname_internal(unicodedata_state *state, PyObject *self, Py_UCS4 code, + char* buffer, int buflen, int with_alias_and_seq) { /* Find the name associated with the given code point. * If with_alias_and_seq is 1, check for names in the Private Use Area 15 @@ -1066,7 +1378,7 @@ _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen, if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code))) return 0; - if (self && UCD_Check(self)) { + if (state && self && UCD_Check(state, self)) { /* in 3.2.0 there are no aliases and named sequences */ const change_record *old; if (IS_ALIAS(code) || IS_NAMED_SEQ(code)) @@ -1149,13 +1461,23 @@ _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen, return 1; } +static int +_getucname(PyObject *module, Py_UCS4 code, char* buffer, int buflen, + int with_alias_and_seq) +{ + unicodedata_state *state = unicodedata_get_state(module); + return _getucname_internal(state, module, code, buffer, buflen, + with_alias_and_seq); +} + static int -_cmpname(PyObject *self, int code, const char* name, int namelen) +_cmpname(unicodedata_state *state, PyObject *self, int code, const char* name, + int namelen) { /* check if code corresponds to the given name */ int i; char buffer[NAME_MAXLEN+1]; - if (!_getucname(self, code, buffer, NAME_MAXLEN, 1)) + if 
(!_getucname_internal(state, self, code, buffer, NAME_MAXLEN, 1)) return 0; for (i = 0; i < namelen; i++) { if (Py_TOUPPER(name[i]) != buffer[i]) @@ -1200,8 +1522,8 @@ _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq) } static int -_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code, - int with_named_seq) +_getcode_internal(unicodedata_state *state, PyObject* self, const char* name, + int namelen, Py_UCS4* code, int with_named_seq) { /* Return the code point associated with the given name. * Named aliases are resolved too (unless self != NULL (i.e. we are using @@ -1262,7 +1584,7 @@ _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code, v = code_hash[i]; if (!v) return 0; - if (_cmpname(self, v, name, namelen)) + if (_cmpname(state, self, v, name, namelen)) return _check_alias_and_seq(v, code, with_named_seq); incr = (h ^ (h >> 3)) & mask; if (!incr) @@ -1272,7 +1594,7 @@ _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code, v = code_hash[i]; if (!v) return 0; - if (_cmpname(self, v, name, namelen)) + if (_cmpname(state, self, v, name, namelen)) return _check_alias_and_seq(v, code, with_named_seq); incr = incr << 1; if (incr > mask) @@ -1280,38 +1602,22 @@ _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code, } } -static const _PyUnicode_Name_CAPI hashAPI = +static int +_getcode(PyObject* module, const char* name, + int namelen, Py_UCS4* code, int with_named_seq) { - sizeof(_PyUnicode_Name_CAPI), - _getucname, - _getcode -}; - -/* -------------------------------------------------------------------- */ -/* Python bindings */ - -/*[clinic input] -unicodedata.UCD.name - - self: self - chr: int(accept={str}) - default: object=NULL - / - -Returns the name assigned to the character chr as a string. - -If no name is defined, default is returned, or, if not given, -ValueError is raised. 
-[clinic start generated code]*/ + unicodedata_state *state = unicodedata_get_state(module); + return _getcode_internal(state, module, name, namelen, code, with_named_seq); +} static PyObject * -unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value) -/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/ +unicodedata_UCD_name_internal(PyObject *self, unicodedata_state *state, + int chr, PyObject *default_value) { char name[NAME_MAXLEN+1]; Py_UCS4 c = (Py_UCS4)chr; - if (!_getucname(self, c, name, NAME_MAXLEN, 0)) { + if (!_getucname_internal(state, self, c, name, NAME_MAXLEN, 0)) { if (default_value == NULL) { PyErr_SetString(PyExc_ValueError, "no such name"); return NULL; @@ -1325,23 +1631,9 @@ unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value) return PyUnicode_FromString(name); } -/*[clinic input] -unicodedata.UCD.lookup - - self: self - name: str(accept={str, robuffer}, zeroes=True) - / - -Look up character by name. - -If a character with the given name is found, return the -corresponding character. If not found, KeyError is raised. 
-[clinic start generated code]*/ - static PyObject * -unicodedata_UCD_lookup_impl(PyObject *self, const char *name, - Py_ssize_clean_t name_length) -/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/ +unicodedata_UCD_lookup_internal(PyObject *self,unicodedata_state *state, + const char *name, Py_ssize_clean_t name_length) { Py_UCS4 code; unsigned int index; @@ -1350,7 +1642,7 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name, return NULL; } - if (!_getcode(self, name, (int)name_length, &code, 1)) { + if (!_getcode_internal(state, self, name, (int)name_length, &code, 1)) { PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name); return NULL; } @@ -1365,6 +1657,98 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name, return PyUnicode_FromOrdinal(code); } +/* -------------------------------------------------------------------- */ +/* Python bindings */ + +/*[clinic input] +unicodedata.UCD.name + + self: self + cls: defining_class + chr: int(accept={str}) + default: object=NULL + / + +Returns the name assigned to the character chr as a string. + +If no name is defined, default is returned, or, if not given, +ValueError is raised. +[clinic start generated code]*/ + +static PyObject * +unicodedata_UCD_name_impl(PyObject *self, PyTypeObject *cls, int chr, + PyObject *default_value) +/*[clinic end generated code: output=0e8f826834e0d2e6 input=c8b850637aa4b6f9]*/ +{ + unicodedata_state *state = PyType_GetModuleState(cls); + return unicodedata_UCD_name_internal(self, state, chr, default_value); +} + +/*[clinic input] +unicodedata.name + + chr: int(accept={str}) + default: object=NULL + / + +Returns the name assigned to the character chr as a string. + +If no name is defined, default is returned, or, if not given, +ValueError is raised. 
+[clinic start generated code]*/ + +static PyObject * +unicodedata_name_impl(PyObject *module, int chr, PyObject *default_value) +/*[clinic end generated code: output=f272f2b4c6ce69a2 input=4518b54cd9861c18]*/ +{ + unicodedata_state *state = unicodedata_get_state(module); + return unicodedata_UCD_name_internal(module, state, chr, default_value); +} + +/*[clinic input] +unicodedata.UCD.lookup + + self: self + cls: defining_class + name: str(accept={str, robuffer}, zeroes=True) + / + +Look up character by name. + +If a character with the given name is found, return the +corresponding character. If not found, KeyError is raised. +[clinic start generated code]*/ + +static PyObject * +unicodedata_UCD_lookup_impl(PyObject *self, PyTypeObject *cls, + const char *name, Py_ssize_clean_t name_length) +/*[clinic end generated code: output=1c48ff36ee68f769 input=a62833021fd9342b]*/ +{ + unicodedata_state *state = PyType_GetModuleState(cls); + return unicodedata_UCD_lookup_internal(self, state, name, name_length); +} + +/*[clinic input] +unicodedata.lookup + + name: str(accept={str, robuffer}, zeroes=True) + / + +Look up character by name. + +If a character with the given name is found, return the +corresponding character. If not found, KeyError is raised. +[clinic start generated code]*/ + +static PyObject * +unicodedata_lookup_impl(PyObject *module, const char *name, + Py_ssize_clean_t name_length) +/*[clinic end generated code: output=1d8f4308412419ec input=5562a658a372be05]*/ +{ + unicodedata_state *state = unicodedata_get_state(module); + return unicodedata_UCD_lookup_internal(module, state, name, name_length); +} + /* XXX Add doc strings. */ static PyMethodDef unicodedata_functions[] = { @@ -1384,50 +1768,44 @@ static PyMethodDef unicodedata_functions[] = { {NULL, NULL} /* sentinel */ }; -static PyTypeObject UCD_Type = { - /* The ob_type field must be initialized in the module init function - * to be portable to Windows without using C++. 
*/ - PyVarObject_HEAD_INIT(NULL, 0) - "unicodedata.UCD", /*tp_name*/ - sizeof(PreviousDBVersion), /*tp_basicsize*/ - 0, /*tp_itemsize*/ - /* methods */ - (destructor)PyObject_Del, /*tp_dealloc*/ - 0, /*tp_vectorcall_offset*/ - 0, /*tp_getattr*/ - 0, /*tp_setattr*/ - 0, /*tp_as_async*/ - 0, /*tp_repr*/ - 0, /*tp_as_number*/ - 0, /*tp_as_sequence*/ - 0, /*tp_as_mapping*/ - 0, /*tp_hash*/ - 0, /*tp_call*/ - 0, /*tp_str*/ - PyObject_GenericGetAttr,/*tp_getattro*/ - 0, /*tp_setattro*/ - 0, /*tp_as_buffer*/ - Py_TPFLAGS_DEFAULT, /*tp_flags*/ - 0, /*tp_doc*/ - 0, /*tp_traverse*/ - 0, /*tp_clear*/ - 0, /*tp_richcompare*/ - 0, /*tp_weaklistoffset*/ - 0, /*tp_iter*/ - 0, /*tp_iternext*/ - unicodedata_functions, /*tp_methods*/ - DB_members, /*tp_members*/ - 0, /*tp_getset*/ - 0, /*tp_base*/ - 0, /*tp_dict*/ - 0, /*tp_descr_get*/ - 0, /*tp_descr_set*/ - 0, /*tp_dictoffset*/ - 0, /*tp_init*/ - 0, /*tp_alloc*/ - 0, /*tp_new*/ - 0, /*tp_free*/ - 0, /*tp_is_gc*/ +static PyMethodDef unicodedata_module_functions[] = { + UNICODEDATA_DECIMAL_METHODDEF + UNICODEDATA_UCD_DIGIT_METHODDEF //doesn't require module state + UNICODEDATA_NUMERIC_METHODDEF + UNICODEDATA_CATEGORY_METHODDEF + UNICODEDATA_BIDIRECTIONAL_METHODDEF + UNICODEDATA_COMBINING_METHODDEF + UNICODEDATA_MIRRORED_METHODDEF + UNICODEDATA_EAST_ASIAN_WIDTH_METHODDEF + UNICODEDATA_DECOMPOSITION_METHODDEF + UNICODEDATA_NAME_METHODDEF + UNICODEDATA_LOOKUP_METHODDEF + UNICODEDATA_IS_NORMALIZED_METHODDEF + UNICODEDATA_NORMALIZE_METHODDEF + {NULL, NULL} /* sentinel */ +}; + +static void +ucd_dealloc(PreviousDBVersion *self) +{ + PyTypeObject *tp = Py_TYPE(self); + PyObject_Del(self); + Py_DECREF(tp); +} + +static PyType_Slot unicodedata_ucd_type_slots[] = { + {Py_tp_dealloc, ucd_dealloc}, + {Py_tp_getattro, PyObject_GenericGetAttr}, + {Py_tp_methods, unicodedata_functions}, + {Py_tp_members, DB_members}, + {0,0} +}; + +static PyType_Spec unicodedata_ucd_type_spec = { + .name = "unicodedata.UCD", + .basicsize = 
sizeof(PreviousDBVersion), + .flags = Py_TPFLAGS_DEFAULT, + .slots = unicodedata_ucd_type_slots }; PyDoc_STRVAR(unicodedata_docstring, @@ -1439,43 +1817,92 @@ this database is based on the UnicodeData.txt file version\n\ The module uses the same names and symbols as defined by the\n\ UnicodeData File Format " UNIDATA_VERSION "."); -static struct PyModuleDef unicodedatamodule = { - PyModuleDef_HEAD_INIT, - "unicodedata", - unicodedata_docstring, - -1, - unicodedata_functions, - NULL, - NULL, - NULL, - NULL -}; +static int +unicodedata_traverse(PyObject *module, visitproc visit, void *arg) +{ + unicodedata_state *state = unicodedata_get_state(module); + Py_VISIT(state->ucd_type); + return 0; +} -PyMODINIT_FUNC -PyInit_unicodedata(void) +static int +unicodedata_clear(PyObject *module) { - PyObject *m, *v; + unicodedata_state *state = unicodedata_get_state(module); + Py_CLEAR(state->ucd_type); + return 0; +} - Py_SET_TYPE(&UCD_Type, &PyType_Type); +static void +unicodedata_free(void *module) +{ + unicodedata_clear((PyObject *)module); +} - m = PyModule_Create(&unicodedatamodule); - if (!m) - return NULL; +static int unicodedata_exec(PyObject *mod) +{ + unicodedata_state *state = unicodedata_get_state(mod); + state->ucd_type = (PyTypeObject *)PyType_FromModuleAndSpec( + mod, &unicodedata_ucd_type_spec, NULL); + if (state->ucd_type == NULL) { + return -1; + } - PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION); - Py_INCREF(&UCD_Type); - PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type); + if (PyModule_AddStringConstant(mod, "unidata_version", + UNIDATA_VERSION) < 0) { + return -1; + } /* Previous versions */ - v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0); - if (v != NULL) - PyModule_AddObject(m, "ucd_3_2_0", v); + PyObject *v = new_previous_version( + state, "3.2.0", get_change_3_2_0, normalization_3_2_0); + if (v == NULL) { + return -1; + } + if (PyModule_AddObject(mod, "ucd_3_2_0", v) < 0) { + Py_DECREF(v); + return -1; + } 
+ + state->capsule_api.size = sizeof(_PyUnicode_Name_CAPI); + state->capsule_api.getname = _getucname; + state->capsule_api.getcode = _getcode; + state->capsule_api.module = mod; /* Export C API */ - v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL); - if (v != NULL) - PyModule_AddObject(m, "ucnhash_CAPI", v); - return m; + v = PyCapsule_New((void *)&state->capsule_api, + PyUnicodeData_CAPSULE_NAME, NULL); + if (v == NULL) { + return -1; + } + if (PyModule_AddObject(mod, "ucnhash_CAPI", v) < 0) { + Py_DECREF(v); + return -1; + } + return 0; +} + +static PyModuleDef_Slot unicodedata_slots[] = { + {Py_mod_exec, unicodedata_exec}, + {0, NULL} +}; + +static struct PyModuleDef unicodedata_module = { + PyModuleDef_HEAD_INIT, + .m_name = "unicodedata", + .m_doc = unicodedata_docstring, + .m_size = sizeof(unicodedata_state), + .m_methods = unicodedata_module_functions, + .m_slots = unicodedata_slots, + .m_traverse = unicodedata_traverse, + .m_clear = unicodedata_clear, + .m_free = unicodedata_free, +}; + +PyMODINIT_FUNC +PyInit_unicodedata(void) +{ + return PyModuleDef_Init(&unicodedata_module); } /* diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 82e09ad05fcd13..095bd4fa60e806 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6523,8 +6523,8 @@ _PyUnicode_DecodeUnicodeEscape(const char *s, s++; ch = 0xffffffff; /* in case 'getcode' messes up */ if (namelen <= INT_MAX && - ucnhash_CAPI->getcode(NULL, start, (int)namelen, - &ch, 0)) { + ucnhash_CAPI->getcode(ucnhash_CAPI->module, start, + (int)namelen, &ch, 0)) { assert(ch <= MAX_UNICODE); WRITE_CHAR(ch); continue; diff --git a/Python/codecs.c b/Python/codecs.c index 0f18c27e5fe461..f644a3023f512c 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -986,7 +986,8 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc) for (i = start, ressize = 0; i < end; ++i) { /* object is guaranteed to be "ready" */ c = PyUnicode_READ_CHAR(object, i); - if 
(ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { + if (ucnhash_CAPI->getname(ucnhash_CAPI->module, c, + buffer, sizeof(buffer), 1)) { replsize = 1+1+1+(int)strlen(buffer)+1; } else if (c >= 0x10000) { @@ -1009,7 +1010,8 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc) i < end; ++i) { c = PyUnicode_READ_CHAR(object, i); *outp++ = '\\'; - if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { + if (ucnhash_CAPI->getname(ucnhash_CAPI->module, c, + buffer, sizeof(buffer), 1)) { *outp++ = 'N'; *outp++ = '{'; strcpy((char *)outp, buffer);