@@ -755,100 +755,113 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
755755
756756PyObject * PyCodec_XMLCharRefReplaceErrors (PyObject * exc )
757757{
758- if (PyObject_TypeCheck (exc , (PyTypeObject * )PyExc_UnicodeEncodeError )) {
759- PyObject * restuple ;
760- PyObject * object ;
761- Py_ssize_t i ;
762- Py_ssize_t start ;
763- Py_ssize_t end ;
764- PyObject * res ;
765- Py_UCS1 * outp ;
766- Py_ssize_t ressize ;
767- Py_UCS4 ch ;
768- if (PyUnicodeEncodeError_GetStart (exc , & start ))
769- return NULL ;
770- if (PyUnicodeEncodeError_GetEnd (exc , & end ))
771- return NULL ;
772- if (!(object = PyUnicodeEncodeError_GetObject (exc )))
773- return NULL ;
774- if (end - start > PY_SSIZE_T_MAX / (2 + 7 + 1 ))
775- end = start + PY_SSIZE_T_MAX / (2 + 7 + 1 );
776- for (i = start , ressize = 0 ; i < end ; ++ i ) {
777- /* object is guaranteed to be "ready" */
778- ch = PyUnicode_READ_CHAR (object , i );
779- if (ch < 10 )
780- ressize += 2 + 1 + 1 ;
781- else if (ch < 100 )
782- ressize += 2 + 2 + 1 ;
783- else if (ch < 1000 )
784- ressize += 2 + 3 + 1 ;
785- else if (ch < 10000 )
786- ressize += 2 + 4 + 1 ;
787- else if (ch < 100000 )
788- ressize += 2 + 5 + 1 ;
789- else if (ch < 1000000 )
790- ressize += 2 + 6 + 1 ;
791- else
792- ressize += 2 + 7 + 1 ;
758+ if (!PyObject_TypeCheck (exc , (PyTypeObject * )PyExc_UnicodeEncodeError )) {
759+ wrong_exception_type (exc );
760+ return NULL ;
761+ }
762+
763+ PyObject * obj ;
764+ Py_ssize_t objlen , start , end , slen ;
765+ if (_PyUnicodeError_GetParams (exc ,
766+ & obj , & objlen ,
767+ & start , & end , & slen , false) < 0 )
768+ {
769+ return NULL ;
770+ }
771+
772+ // The number of characters that each character 'ch' contributes
773+ // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
774+ // and will be formatted as "&#" + DIGITS + ";". Since the Unicode
775+ // range is below 10^7, each "block" requires at most 2 + 7 + 1
776+ // characters.
777+ if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1 )) {
778+ end = start + PY_SSIZE_T_MAX / (2 + 7 + 1 );
779+ end = Py_MIN (end , objlen );
780+ slen = Py_MAX (0 , end - start );
781+ }
782+
783+ Py_ssize_t ressize = 0 ;
784+ for (Py_ssize_t i = start ; i < end ; ++ i ) {
785+ /* object is guaranteed to be "ready" */
786+ Py_UCS4 ch = PyUnicode_READ_CHAR (obj , i );
787+ if (ch < 10 ) {
788+ ressize += 2 + 1 + 1 ;
793789 }
794- /* allocate replacement */
795- res = PyUnicode_New (ressize , 127 );
796- if (res == NULL ) {
797- Py_DECREF (object );
798- return NULL ;
790+ else if (ch < 100 ) {
791+ ressize += 2 + 2 + 1 ;
799792 }
800- outp = PyUnicode_1BYTE_DATA (res );
801- /* generate replacement */
802- for (i = start ; i < end ; ++ i ) {
803- int digits ;
804- int base ;
805- ch = PyUnicode_READ_CHAR (object , i );
806- * outp ++ = '&' ;
807- * outp ++ = '#' ;
808- if (ch < 10 ) {
809- digits = 1 ;
810- base = 1 ;
811- }
812- else if (ch < 100 ) {
813- digits = 2 ;
814- base = 10 ;
815- }
816- else if (ch < 1000 ) {
817- digits = 3 ;
818- base = 100 ;
819- }
820- else if (ch < 10000 ) {
821- digits = 4 ;
822- base = 1000 ;
823- }
824- else if (ch < 100000 ) {
825- digits = 5 ;
826- base = 10000 ;
827- }
828- else if (ch < 1000000 ) {
829- digits = 6 ;
830- base = 100000 ;
831- }
832- else {
833- digits = 7 ;
834- base = 1000000 ;
835- }
836- while (digits -- > 0 ) {
837- * outp ++ = '0' + ch /base ;
838- ch %= base ;
839- base /= 10 ;
840- }
841- * outp ++ = ';' ;
793+ else if (ch < 1000 ) {
794+ ressize += 2 + 3 + 1 ;
795+ }
796+ else if (ch < 10000 ) {
797+ ressize += 2 + 4 + 1 ;
798+ }
799+ else if (ch < 100000 ) {
800+ ressize += 2 + 5 + 1 ;
801+ }
802+ else if (ch < 1000000 ) {
803+ ressize += 2 + 6 + 1 ;
804+ }
805+ else {
806+ assert (ch < 10000000 );
807+ ressize += 2 + 7 + 1 ;
842808 }
843- assert (_PyUnicode_CheckConsistency (res , 1 ));
844- restuple = Py_BuildValue ("(Nn)" , res , end );
845- Py_DECREF (object );
846- return restuple ;
847809 }
848- else {
849- wrong_exception_type (exc );
810+
811+ /* allocate replacement */
812+ PyObject * res = PyUnicode_New (ressize , 127 );
813+ if (res == NULL ) {
814+ Py_DECREF (obj );
850815 return NULL ;
851816 }
817+ Py_UCS1 * outp = PyUnicode_1BYTE_DATA (res );
818+ /* generate replacement */
819+ for (Py_ssize_t i = start ; i < end ; ++ i ) {
820+ int digits , base ;
821+ Py_UCS4 ch = PyUnicode_READ_CHAR (obj , i );
822+ if (ch < 10 ) {
823+ digits = 1 ;
824+ base = 1 ;
825+ }
826+ else if (ch < 100 ) {
827+ digits = 2 ;
828+ base = 10 ;
829+ }
830+ else if (ch < 1000 ) {
831+ digits = 3 ;
832+ base = 100 ;
833+ }
834+ else if (ch < 10000 ) {
835+ digits = 4 ;
836+ base = 1000 ;
837+ }
838+ else if (ch < 100000 ) {
839+ digits = 5 ;
840+ base = 10000 ;
841+ }
842+ else if (ch < 1000000 ) {
843+ digits = 6 ;
844+ base = 100000 ;
845+ }
846+ else {
847+ assert (ch < 10000000 );
848+ digits = 7 ;
849+ base = 1000000 ;
850+ }
851+ * outp ++ = '&' ;
852+ * outp ++ = '#' ;
853+ while (digits -- > 0 ) {
854+ assert (base >= 1 );
855+ * outp ++ = '0' + ch / base ;
856+ ch %= base ;
857+ base /= 10 ;
858+ }
859+ * outp ++ = ';' ;
860+ }
861+ assert (_PyUnicode_CheckConsistency (res , 1 ));
862+ PyObject * restuple = Py_BuildValue ("(Nn)" , res , end );
863+ Py_DECREF (obj );
864+ return restuple ;
852865}
853866
854867PyObject * PyCodec_BackslashReplaceErrors (PyObject * exc )
0 commit comments