1515# error C 'long' size should be either 4 or 8!
1616#endif
1717
18- Py_LOCAL_INLINE (int )
19- STRINGLIB (utf8_try_decode )(const char * start , const char * end ,
20- STRINGLIB_CHAR * dest ,
21- const char * * src_pos , Py_ssize_t * dest_index )
18+ Py_LOCAL_INLINE (Py_UCS4 )
19+ STRINGLIB (utf8_decode )(const char * * inptr , const char * end ,
20+ STRINGLIB_CHAR * dest ,
21+ Py_ssize_t * outpos )
2222{
23- int ret ;
24- Py_ssize_t n ;
25- const char * s = start ;
23+ Py_UCS4 ch ;
24+ const char * s = * inptr ;
2625 const char * aligned_end = (const char * ) ((size_t ) end & ~LONG_PTR_MASK );
27- STRINGLIB_CHAR * p = dest ;
26+ STRINGLIB_CHAR * p = dest + * outpos ;
2827
2928 while (s < end ) {
30- Py_UCS4 ch = (unsigned char )* s ;
29+ ch = (unsigned char )* s ;
3130
3231 if (ch < 0x80 ) {
3332 /* Fast path for runs of ASCII characters. Given that common UTF-8
@@ -48,15 +47,33 @@ STRINGLIB(utf8_try_decode)(const char *start, const char *end,
4847 unsigned long value = * (unsigned long * ) _s ;
4948 if (value & ASCII_CHAR_MASK )
5049 break ;
51- _p [0 ] = _s [0 ];
52- _p [1 ] = _s [1 ];
53- _p [2 ] = _s [2 ];
54- _p [3 ] = _s [3 ];
55- #if (SIZEOF_LONG == 8 )
56- _p [4 ] = _s [4 ];
57- _p [5 ] = _s [5 ];
58- _p [6 ] = _s [6 ];
59- _p [7 ] = _s [7 ];
50+ #ifdef BYTEORDER_IS_LITTLE_ENDIAN
51+ _p [0 ] = (STRINGLIB_CHAR )(value & 0xFFu );
52+ _p [1 ] = (STRINGLIB_CHAR )((value >> 8 ) & 0xFFu );
53+ _p [2 ] = (STRINGLIB_CHAR )((value >> 16 ) & 0xFFu );
54+ _p [3 ] = (STRINGLIB_CHAR )((value >> 24 ) & 0xFFu );
55+ # if SIZEOF_LONG == 8
56+ _p [4 ] = (STRINGLIB_CHAR )((value >> 32 ) & 0xFFu );
57+ _p [5 ] = (STRINGLIB_CHAR )((value >> 40 ) & 0xFFu );
58+ _p [6 ] = (STRINGLIB_CHAR )((value >> 48 ) & 0xFFu );
59+ _p [7 ] = (STRINGLIB_CHAR )((value >> 56 ) & 0xFFu );
60+ # endif
61+ #else
62+ # if SIZEOF_LONG == 8
63+ _p [0 ] = (STRINGLIB_CHAR )((value >> 56 ) & 0xFFu );
64+ _p [1 ] = (STRINGLIB_CHAR )((value >> 48 ) & 0xFFu );
65+ _p [2 ] = (STRINGLIB_CHAR )((value >> 40 ) & 0xFFu );
66+ _p [3 ] = (STRINGLIB_CHAR )((value >> 32 ) & 0xFFu );
67+ _p [4 ] = (STRINGLIB_CHAR )((value >> 24 ) & 0xFFu );
68+ _p [5 ] = (STRINGLIB_CHAR )((value >> 16 ) & 0xFFu );
69+ _p [6 ] = (STRINGLIB_CHAR )((value >> 8 ) & 0xFFu );
70+ _p [7 ] = (STRINGLIB_CHAR )(value & 0xFFu );
71+ # else
72+ _p [0 ] = (STRINGLIB_CHAR )((value >> 24 ) & 0xFFu );
73+ _p [1 ] = (STRINGLIB_CHAR )((value >> 16 ) & 0xFFu );
74+ _p [2 ] = (STRINGLIB_CHAR )((value >> 8 ) & 0xFFu );
75+ _p [3 ] = (STRINGLIB_CHAR )(value & 0xFFu );
76+ # endif
6077#endif
6178 _s += SIZEOF_LONG ;
6279 _p += SIZEOF_LONG ;
@@ -67,87 +84,135 @@ STRINGLIB(utf8_try_decode)(const char *start, const char *end,
6784 break ;
6885 ch = (unsigned char )* s ;
6986 }
87+ if (ch < 0x80 ) {
88+ s ++ ;
89+ * p ++ = ch ;
90+ continue ;
91+ }
7092 }
7193
72- if (ch < 0x80 ) {
73- s ++ ;
74- * p ++ = ch ;
75- continue ;
76- }
77-
78- n = utf8_code_length [ch ];
79-
80- if (s + n > end ) {
81- /* unexpected end of data: the caller will decide whether
82- it's an error or not */
83- goto _error ;
94+ if (ch < 0xC2 ) {
95+ /* invalid sequence
96+ \x80-\xBF -- continuation byte
97+ \xC0-\xC1 -- fake 0000-007F */
98+ goto InvalidStart ;
8499 }
85100
86- switch (n ) {
87- case 0 :
88- /* invalid start byte */
89- goto _error ;
90- case 1 :
91- /* internal error */
92- goto _error ;
93- case 2 :
94- if ((s [1 ] & 0xc0 ) != 0x80 )
101+ if (ch < 0xE0 ) {
102+ /* \xC2\x80-\xDF\xBF -- 0080-07FF */
103+ Py_UCS4 ch2 ;
104+ if (end - s < 2 ) {
105+ /* unexpected end of data: the caller will decide whether
106+ it's an error or not */
107+ break ;
108+ }
109+ ch2 = (unsigned char )s [1 ];
110+ if ((ch2 & 0xC0 ) != 0x80 )
95111 /* invalid continuation byte */
96- goto _error ;
97- ch = ((s [0 ] & 0x1f ) << 6 ) + (s [1 ] & 0x3f );
112+ goto InvalidContinuation ;
113+ ch = (ch << 6 ) + ch2 -
114+ ((0xC0 << 6 ) + 0x80 );
98115 assert ((ch > 0x007F ) && (ch <= 0x07FF ));
99116 s += 2 ;
117+ if (STRINGLIB_MAX_CHAR <= 0x007F ||
118+ (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR ))
119+ goto Overflow ;
100120 * p ++ = ch ;
101- break ;
121+ continue ;
122+ }
102123
103- case 3 :
104- /* Decoding UTF-8 sequences in range \xed\xa0 \x80-\xed\xbf\xbf
105- will result in surrogates in range d800-dfff. Surrogates are
106- not valid UTF-8 so they are rejected.
107- See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
108- (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
109- if (( s [ 1 ] & 0xc0 ) != 0x80 ||
110- ( s [ 2 ] & 0xc0 ) != 0x80 ||
111- (( unsigned char )s [0 ] == 0xE0 &&
112- (unsigned char )s [1 ] < 0xA0 ) ||
113- (( unsigned char ) s [ 0 ] == 0xED &&
114- ( unsigned char ) s [ 1 ] > 0x9F ) ) {
124+ if ( ch < 0xF0 ) {
125+ /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
126+ Py_UCS4 ch2 , ch3 ;
127+ if ( end - s < 3 ) {
128+ /* unexpected end of data: the caller will decide whether
129+ it's an error or not */
130+ break ;
131+ }
132+ ch2 = ( unsigned char )s [1 ];
133+ ch3 = (unsigned char )s [2 ];
134+ if (( ch2 & 0xC0 ) != 0x80 ||
135+ ( ch3 & 0xC0 ) != 0x80 ) {
115136 /* invalid continuation byte */
116- goto _error ;
137+ goto InvalidContinuation ;
138+ }
139+ if (ch == 0xE0 ) {
140+ if (ch2 < 0xA0 )
141+ /* invalid sequence
142+ \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-07FF */
143+ goto InvalidContinuation ;
117144 }
118- ch = ((s [0 ] & 0x0f ) << 12 ) + ((s [1 ] & 0x3f ) << 6 ) + (s [2 ] & 0x3f );
145+ else if (ch == 0xED && ch2 > 0x9F ) {
146+ /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
147+ will result in surrogates in range D800-DFFF. Surrogates are
148+ not valid UTF-8 so they are rejected.
149+ See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
150+ (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
151+ goto InvalidContinuation ;
152+ }
153+ ch = (ch << 12 ) + (ch2 << 6 ) + ch3 -
154+ ((0xE0 << 12 ) + (0x80 << 6 ) + 0x80 );
119155 assert ((ch > 0x07FF ) && (ch <= 0xFFFF ));
120156 s += 3 ;
157+ if (STRINGLIB_MAX_CHAR <= 0x07FF ||
158+ (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR ))
159+ goto Overflow ;
121160 * p ++ = ch ;
122- break ;
161+ continue ;
162+ }
123163
124- case 4 :
125- if ((s [1 ] & 0xc0 ) != 0x80 ||
126- (s [2 ] & 0xc0 ) != 0x80 ||
127- (s [3 ] & 0xc0 ) != 0x80 ||
128- ((unsigned char )s [0 ] == 0xF0 &&
129- (unsigned char )s [1 ] < 0x90 ) ||
130- ((unsigned char )s [0 ] == 0xF4 &&
131- (unsigned char )s [1 ] > 0x8F )) {
164+ if (ch < 0xF5 ) {
165+ /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
166+ Py_UCS4 ch2 , ch3 , ch4 ;
167+ if (end - s < 4 ) {
168+ /* unexpected end of data: the caller will decide whether
169+ it's an error or not */
170+ break ;
171+ }
172+ ch2 = (unsigned char )s [1 ];
173+ ch3 = (unsigned char )s [2 ];
174+ ch4 = (unsigned char )s [3 ];
175+ if ((ch2 & 0xC0 ) != 0x80 ||
176+ (ch3 & 0xC0 ) != 0x80 ||
177+ (ch4 & 0xC0 ) != 0x80 ) {
132178 /* invalid continuation byte */
133- goto _error ;
179+ goto InvalidContinuation ;
180+ }
181+ if (ch == 0xF0 ) {
182+ if (ch2 < 0x90 )
183+ /* invalid sequence
184+ \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
185+ goto InvalidContinuation ;
134186 }
135- ch = ((s [0 ] & 0x7 ) << 18 ) + ((s [1 ] & 0x3f ) << 12 ) +
136- ((s [2 ] & 0x3f ) << 6 ) + (s [3 ] & 0x3f );
137- assert ((ch > 0xFFFF ) && (ch <= 0x10ffff ));
187+ else if (ch == 0xF4 && ch2 > 0x8F ) {
188+ /* invalid sequence
189+ \xF4\x90\x80\x80- -- 110000- overflow */
190+ goto InvalidContinuation ;
191+ }
192+ ch = (ch << 18 ) + (ch2 << 12 ) + (ch3 << 6 ) + ch4 -
193+ ((0xF0 << 18 ) + (0x80 << 12 ) + (0x80 << 6 ) + 0x80 );
194+ assert ((ch > 0xFFFF ) && (ch <= 0x10FFFF ));
138195 s += 4 ;
196+ if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
197+ (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR ))
198+ goto Overflow ;
139199 * p ++ = ch ;
140- break ;
200+ continue ;
141201 }
202+ goto InvalidStart ;
142203 }
143- ret = 0 ;
144- goto _ok ;
145- _error :
146- ret = -1 ;
147- _ok :
148- * src_pos = s ;
149- * dest_index = p - dest ;
150- return ret ;
204+ ch = 0 ;
205+ Overflow :
206+ Return :
207+ * inptr = s ;
208+ * outpos = p - dest ;
209+ return ch ;
210+ InvalidStart :
211+ ch = 1 ;
212+ goto Return ;
213+ InvalidContinuation :
214+ ch = 2 ;
215+ goto Return ;
151216}
152217
153218#undef LONG_PTR_MASK
0 commit comments