From 97bcc4bd5cf5c6dc1352d9b5e680db165c937fd1 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google LLC]" Date: Tue, 9 Apr 2019 17:35:18 -0700 Subject: [PATCH 01/13] bpo-14826 bpo-36276: Disallow control chars in http URLs. Example possible fix for those issues. --- Lib/urllib/request.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index df2ff06f0fc9a2..01177856730eb8 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -350,6 +350,14 @@ def full_url(self): def full_url(self, url): # unwrap('') --> 'type://host/path' self._full_url = _unwrap(url) + # Sanity check self._full_url to avoid control characters in HTTP. + # https://bugs.python.org/issue14826 + # https://bugs.python.org/issue36276 + # The same control characters check was adopted by Golang in: + # https://go-review.googlesource.com/c/go/+/159157 + if (self._full_url.startswith('http') and + re.search("[\x00- \x7f-\x9f]", self._full_url)): + raise ValueError("URL can't contain control characters. %r" % (self._full_url,)) self._full_url, self.fragment = _splittag(self._full_url) self._parse() From bba966c73ef32b0c98f9030401744e74f0353dfb Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Tue, 9 Apr 2019 23:44:30 -0700 Subject: [PATCH 02/13] Move validation to _splittype & correct boundary. --- Lib/urllib/parse.py | 23 ++++++++++++++++++++++- Lib/urllib/request.py | 8 -------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 8b6c9b10609152..308cf24bae983f 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -991,6 +991,8 @@ def splittype(url): _typeprog = None +_control_char_re = None +_schemes_disallowing_control_chars = frozenset({'http', 'https', 'ftp'}) def _splittype(url): """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" global _typeprog @@ -1000,7 +1002,26 @@ def _splittype(url): match = _typeprog.match(url) if match: scheme, data = match.groups() - return scheme.lower(), data + scheme = scheme.lower() + if scheme in _schemes_disallowing_control_chars: + # Sanity check url data to avoid control characters. + # https://bugs.python.org/issue14826 + # https://bugs.python.org/issue36276 + # The same control characters check was adopted by Golang in: + # https://go-review.googlesource.com/c/go/+/159157 + # Isn't it odd to be performing validation within this utility + # function? Yes... but it is in wide use in all of the right + # places where URLs need a sanity check to avoid potential security + # issues in newline delimited text based protocol implementations. + # This way many things get it for free without every use needing to + # be updated to explicitly sanity check the path contents. + global _control_char_re + if _control_char_re is None: + _control_char_re = re.compile('[\x00-\x1f\x7f-\x9f]') + if _control_char_re.search(data): + raise ValueError(f"{scheme} URL can't contain control " + f"characters. {data!r}") + return scheme, data return None, url diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 01177856730eb8..df2ff06f0fc9a2 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -350,14 +350,6 @@ def full_url(self): def full_url(self, url): # unwrap('') --> 'type://host/path' self._full_url = _unwrap(url) - # Sanity check self._full_url to avoid control characters in HTTP. - # https://bugs.python.org/issue14826 - # https://bugs.python.org/issue36276 - # The same control characters check was adopted by Golang in: - # https://go-review.googlesource.com/c/go/+/159157 - if (self._full_url.startswith('http') and - re.search("[\x00- \x7f-\x9f]", self._full_url)): - raise ValueError("URL can't contain control characters. %r" % (self._full_url,)) self._full_url, self.fragment = _splittag(self._full_url) self._parse() From bde613c092c9836ad9756aba96c251c99fc16d91 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Wed, 10 Apr 2019 01:25:11 -0700 Subject: [PATCH 03/13] Move the check to http.client where it belongs. --- Lib/http/client.py | 11 +++++++++++ Lib/test/test_urllib.py | 21 +++++++++++++++++++++ Lib/urllib/parse.py | 23 +---------------------- 3 files changed, 33 insertions(+), 22 deletions(-) diff --git a/Lib/http/client.py b/Lib/http/client.py index 5a2225276b1acd..113af5c4f547e2 100644 --- a/Lib/http/client.py +++ b/Lib/http/client.py @@ -137,6 +137,14 @@ _is_legal_header_name = re.compile(rb'[^:\s][^:\r\n]*').fullmatch _is_illegal_header_value = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])').search +# These characters are not allowed within http URL paths. +# https://tools.ietf.org/html/rfc3986#section-3.3 +# in order to prevent CVE-2019-9740. +_contains_disallowed_url_pchar_re = re.compile('[\x00-\x20\x7f-\x9f]') +# Arguably only these _should_ allowed: +# _is_allowed_url_pchars_re = re.compile(r"^[/!$&'()*+,;=:@%a-zA-Z0-9._~-]+$") +# We are more lenient for assumed real world compatibility purposes. + # We always set the Content-Length header for these methods because some # servers will otherwise respond with a 411 _METHODS_EXPECTING_BODY = {'PATCH', 'POST', 'PUT'} @@ -1079,6 +1087,9 @@ def putrequest(self, method, url, skip_host=False, self._method = method if not url: url = '/' + # Prevent CVE-2019-9740. + if _contains_disallowed_url_pchar_re.search(url): + raise ValueError("URL can't contain control characters. {url!r}") request = '%s %s %s' % (method, url, self._http_vsn_str) # Non-ASCII characters should have been eliminated earlier diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 2ac73b58d83206..a5889d79fa52e3 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -329,6 +329,27 @@ def test_willclose(self): finally: self.unfakehttp() + def test_url_with_newline_header_injection_rejected(self): + self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.") + host = "localhost:7777?a=1 HTTP/1.1\r\nX-injected: header\r\nTEST: 123" + schemeless_url = "//" + host + ":8080/test/?test=a" + try: + # We explicitly test urllib.request.urlopen() instead of the top + # level 'def urlopen()' function defined in this... (quite ugly) + # test suite. they use different url opening codepaths. plain + # urlopen uses FancyURLOpener which goes via a codepath that + # calls urllib.parse.quote() on the URL which makes all of the + # above attempts at injection within the url _path_ safe. + with self.assertRaisesRegex(ValueError, "can't contain control"): + urllib.request.urlopen(f"http:{schemeless_url}") + with self.assertRaisesRegex(ValueError, "can't contain control"): + urllib.request.urlopen(f"https:{schemeless_url}") + resp = urlopen(f"http:{schemeless_url}") + self.assertNotIn('\r', resp.geturl()) + self.assertNotIn('\n', resp.geturl()) + finally: + self.unfakehttp() + def test_read_0_9(self): # "0.9" response accepted (but not "simple responses" without # a status line) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 308cf24bae983f..8b6c9b10609152 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -991,8 +991,6 @@ def splittype(url): _typeprog = None -_control_char_re = None -_schemes_disallowing_control_chars = frozenset({'http', 'https', 'ftp'}) def _splittype(url): """splittype('type:opaquestring') --> 'type', 'opaquestring'.""" global _typeprog @@ -1002,26 +1000,7 @@ def _splittype(url): match = _typeprog.match(url) if match: scheme, data = match.groups() - scheme = scheme.lower() - if scheme in _schemes_disallowing_control_chars: - # Sanity check url data to avoid control characters. - # https://bugs.python.org/issue14826 - # https://bugs.python.org/issue36276 - # The same control characters check was adopted by Golang in: - # https://go-review.googlesource.com/c/go/+/159157 - # Isn't it odd to be performing validation within this utility - # function? Yes... but it is in wide use in all of the right - # places where URLs need a sanity check to avoid potential security - # issues in newline delimited text based protocol implementations. - # This way many things get it for free without every use needing to - # be updated to explicitly sanity check the path contents. - global _control_char_re - if _control_char_re is None: - _control_char_re = re.compile('[\x00-\x1f\x7f-\x9f]') - if _control_char_re.search(data): - raise ValueError(f"{scheme} URL can't contain control " - f"characters. {data!r}") - return scheme, data + return scheme.lower(), data return None, url From 448a5413073e6738799c25b459f15a26c0ce6cab Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Wed, 10 Apr 2019 01:33:07 -0700 Subject: [PATCH 04/13] don't bother checking for chars above ascii. --- Lib/http/client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/http/client.py b/Lib/http/client.py index 113af5c4f547e2..33c56542a7632f 100644 --- a/Lib/http/client.py +++ b/Lib/http/client.py @@ -140,7 +140,8 @@ # These characters are not allowed within http URL paths. # https://tools.ietf.org/html/rfc3986#section-3.3 # in order to prevent CVE-2019-9740. -_contains_disallowed_url_pchar_re = re.compile('[\x00-\x20\x7f-\x9f]') +# We don't restrict chars above \x7f as putrequest() limits us to ASCII. +_contains_disallowed_url_pchar_re = re.compile('[\x00-\x20\x7f]') # Arguably only these _should_ allowed: # _is_allowed_url_pchars_re = re.compile(r"^[/!$&'()*+,;=:@%a-zA-Z0-9._~-]+$") # We are more lenient for assumed real world compatibility purposes. From 944c5fa3f708c7d57126fcc4c54f0db574bf69e6 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Wed, 10 Apr 2019 01:35:06 -0700 Subject: [PATCH 05/13] missing f on f-string --- Lib/http/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/http/client.py b/Lib/http/client.py index 33c56542a7632f..0fdcd9bcf1d308 100644 --- a/Lib/http/client.py +++ b/Lib/http/client.py @@ -1090,7 +1090,7 @@ def putrequest(self, method, url, skip_host=False, url = '/' # Prevent CVE-2019-9740. if _contains_disallowed_url_pchar_re.search(url): - raise ValueError("URL can't contain control characters. {url!r}") + raise ValueError(f"URL can't contain control characters. {url!r}") request = '%s %s %s' % (method, url, self._http_vsn_str) # Non-ASCII characters should have been eliminated earlier From f039a807e3802eaa2ae23b90510b9c324706f64e Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Wed, 10 Apr 2019 01:38:18 -0700 Subject: [PATCH 06/13] confirm that repr of the url is in the ValueError. --- Lib/test/test_urllib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index a5889d79fa52e3..83f6110a12f62f 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -340,9 +340,9 @@ def test_url_with_newline_header_injection_rejected(self): # urlopen uses FancyURLOpener which goes via a codepath that # calls urllib.parse.quote() on the URL which makes all of the # above attempts at injection within the url _path_ safe. - with self.assertRaisesRegex(ValueError, "can't contain control"): + with self.assertRaisesRegex(ValueError, r"contain control.*\\r"): urllib.request.urlopen(f"http:{schemeless_url}") - with self.assertRaisesRegex(ValueError, "can't contain control"): + with self.assertRaisesRegex(ValueError, r"contain control.*\\n"): urllib.request.urlopen(f"https:{schemeless_url}") resp = urlopen(f"http:{schemeless_url}") self.assertNotIn('\r', resp.geturl()) From 6686d6614aab86407a2aab88dd7149a6e0facf1a Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Wed, 10 Apr 2019 01:43:54 -0700 Subject: [PATCH 07/13] also assert no ' ' in the quoted url --- Lib/test/test_urllib.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 83f6110a12f62f..ab563a99bf0799 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -344,7 +344,9 @@ def test_url_with_newline_header_injection_rejected(self): urllib.request.urlopen(f"http:{schemeless_url}") with self.assertRaisesRegex(ValueError, r"contain control.*\\n"): urllib.request.urlopen(f"https:{schemeless_url}") + # This code path quotes the URL so there is no injection. resp = urlopen(f"http:{schemeless_url}") + self.assertNotIn(' ', resp.geturl()) self.assertNotIn('\r', resp.geturl()) self.assertNotIn('\n', resp.geturl()) finally: From a754bd7a56295e30bc8d34448b88b4f0f5ce5964 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" Date: Wed, 10 Apr 2019 08:53:32 +0000 Subject: [PATCH 08/13] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/Security/2019-04-10-08-53-30.bpo-36276.51E-DA.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Security/2019-04-10-08-53-30.bpo-36276.51E-DA.rst diff --git a/Misc/NEWS.d/next/Security/2019-04-10-08-53-30.bpo-36276.51E-DA.rst b/Misc/NEWS.d/next/Security/2019-04-10-08-53-30.bpo-36276.51E-DA.rst new file mode 100644 index 00000000000000..4fed4d545040e9 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2019-04-10-08-53-30.bpo-36276.51E-DA.rst @@ -0,0 +1 @@ +Address CVE-2019-9740 by disallowing URL paths with embedded whitespace or control characters through into the underlying http client request. Such potentially malicious header injection URLs now cause a ValueError to be raised. \ No newline at end of file From 95039e672471737faa07bffa59a653769cf6fc25 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Wed, 10 Apr 2019 02:55:28 -0700 Subject: [PATCH 09/13] Don't abuse http.client in text_xmlrpc. Don't abuse http.client to send a bad request via the url parameter. --- Lib/test/test_xmlrpc.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_xmlrpc.py b/Lib/test/test_xmlrpc.py index 9c8b6958c620ce..f04e84417b7ca0 100644 --- a/Lib/test/test_xmlrpc.py +++ b/Lib/test/test_xmlrpc.py @@ -943,8 +943,13 @@ def test_unicode_host(self): def test_partial_post(self): # Check that a partial POST doesn't make the server loop: issue #14001. - with contextlib.closing(http.client.HTTPConnection(ADDR, PORT)) as conn: - conn.request('POST', '/RPC2 HTTP/1.0\r\nContent-Length: 100\r\n\r\nbye') + with contextlib.closing(socket.create_connection((ADDR, PORT))) as conn: + conn.send('POST /RPC2 HTTP/1.0\r\n' + 'Content-Length: 100\r\n\r\n' + f'bye HTTP/1.1\r\n' + 'Host: {ADDR}:{PORT}\r\n' + 'Accept-Encoding: identity\r\n' + 'Content-Length: 0\r\n\r\n'.encode('ascii')) def test_context_manager(self): with xmlrpclib.ServerProxy(URL) as server: From 5589519991bcad73fadf12d6d718eef66f9eed31 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Wed, 10 Apr 2019 12:29:00 -0700 Subject: [PATCH 10/13] fix misplaced f-str --- Lib/test/test_xmlrpc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_xmlrpc.py b/Lib/test/test_xmlrpc.py index f04e84417b7ca0..52bacc1eafa789 100644 --- a/Lib/test/test_xmlrpc.py +++ b/Lib/test/test_xmlrpc.py @@ -946,8 +946,8 @@ def test_partial_post(self): with contextlib.closing(socket.create_connection((ADDR, PORT))) as conn: conn.send('POST /RPC2 HTTP/1.0\r\n' 'Content-Length: 100\r\n\r\n' - f'bye HTTP/1.1\r\n' - 'Host: {ADDR}:{PORT}\r\n' + 'bye HTTP/1.1\r\n' + f'Host: {ADDR}:{PORT}\r\n' 'Accept-Encoding: identity\r\n' 'Content-Length: 0\r\n\r\n'.encode('ascii')) From 54f7fca15f150d9ce558761692b2107c04f786be Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google LLC]" Date: Tue, 30 Apr 2019 11:34:52 -0700 Subject: [PATCH 11/13] Address code review comments. Adds some comments and expands an error message per vstinner's PR review. (authored and pushed from 32,000ft on the plane to CLE for PyCon US) --- Lib/http/client.py | 12 +++++++----- Lib/test/test_urllib.py | 3 ++- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/Lib/http/client.py b/Lib/http/client.py index 0fdcd9bcf1d308..6becd0922581ee 100644 --- a/Lib/http/client.py +++ b/Lib/http/client.py @@ -137,9 +137,10 @@ _is_legal_header_name = re.compile(rb'[^:\s][^:\r\n]*').fullmatch _is_illegal_header_value = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])').search -# These characters are not allowed within http URL paths. -# https://tools.ietf.org/html/rfc3986#section-3.3 -# in order to prevent CVE-2019-9740. +# These characters are not allowed within HTTP URL paths. +# See https://tools.ietf.org/html/rfc3986#section-3.3 and the +# https://tools.ietf.org/html/rfc3986#appendix-A pchar definition. +# Prevents CVE-2019-9740. Includes control characters such as \r\n. # We don't restrict chars above \x7f as putrequest() limits us to ASCII. _contains_disallowed_url_pchar_re = re.compile('[\x00-\x20\x7f]') # Arguably only these _should_ allowed: @@ -1089,8 +1090,9 @@ def putrequest(self, method, url, skip_host=False, if not url: url = '/' # Prevent CVE-2019-9740. - if _contains_disallowed_url_pchar_re.search(url): - raise ValueError(f"URL can't contain control characters. {url!r}") + if match := _contains_disallowed_url_pchar_re.search(url): + raise ValueError(f"URL can't contain control characters. {url!r} " + f"(found at least {match.group()!r})") request = '%s %s %s' % (method, url, self._http_vsn_str) # Non-ASCII characters should have been eliminated earlier diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index ab563a99bf0799..a09edab7639756 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -340,7 +340,8 @@ def test_url_with_newline_header_injection_rejected(self): # urlopen uses FancyURLOpener which goes via a codepath that # calls urllib.parse.quote() on the URL which makes all of the # above attempts at injection within the url _path_ safe. - with self.assertRaisesRegex(ValueError, r"contain control.*\\r"): + with self.assertRaisesRegex( + ValueError, r"contain control.*\\r.*(found at least . .)"): urllib.request.urlopen(f"http:{schemeless_url}") with self.assertRaisesRegex(ValueError, r"contain control.*\\n"): urllib.request.urlopen(f"https:{schemeless_url}") From 73db8e16a07964ec5c0120ca28950e594e424582 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google LLC]" Date: Tue, 30 Apr 2019 11:58:10 -0700 Subject: [PATCH 12/13] fix indentation --- Lib/http/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/http/client.py b/Lib/http/client.py index 6becd0922581ee..99d6a68cf42823 100644 --- a/Lib/http/client.py +++ b/Lib/http/client.py @@ -1091,8 +1091,8 @@ def putrequest(self, method, url, skip_host=False, url = '/' # Prevent CVE-2019-9740. if match := _contains_disallowed_url_pchar_re.search(url): - raise ValueError(f"URL can't contain control characters. {url!r} " - f"(found at least {match.group()!r})") + raise ValueError(f"URL can't contain control characters. {url!r} " + f"(found at least {match.group()!r})") request = '%s %s %s' % (method, url, self._http_vsn_str) # Non-ASCII characters should have been eliminated earlier From aeb2be872ae3b6e5313fa8d642b39f7a99be442d Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google LLC]" Date: Tue, 30 Apr 2019 12:23:15 -0700 Subject: [PATCH 13/13] Expand the tests to cover every char. all rejected chars are explicitly tested now. --- Lib/test/test_urllib.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index a09edab7639756..e87c85b92876ab 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -329,6 +329,31 @@ def test_willclose(self): finally: self.unfakehttp() + def test_url_with_control_char_rejected(self): + for char_no in list(range(0, 0x21)) + [0x7f]: + char = chr(char_no) + schemeless_url = f"//localhost:7777/test{char}/" + self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.") + try: + # We explicitly test urllib.request.urlopen() instead of the top + # level 'def urlopen()' function defined in this... (quite ugly) + # test suite. They use different url opening codepaths. Plain + # urlopen uses FancyURLOpener which goes via a codepath that + # calls urllib.parse.quote() on the URL which makes all of the + # above attempts at injection within the url _path_ safe. + escaped_char_repr = repr(char).replace('\\', r'\\') + with self.assertRaisesRegex( + ValueError, f"contain control.*{escaped_char_repr}"): + urllib.request.urlopen(f"http:{schemeless_url}") + with self.assertRaisesRegex( + ValueError, f"contain control.*{escaped_char_repr}"): + urllib.request.urlopen(f"https:{schemeless_url}") + # This code path quotes the URL so there is no injection. + resp = urlopen(f"http:{schemeless_url}") + self.assertNotIn(char, resp.geturl()) + finally: + self.unfakehttp() + def test_url_with_newline_header_injection_rejected(self): self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.") host = "localhost:7777?a=1 HTTP/1.1\r\nX-injected: header\r\nTEST: 123" @@ -336,7 +361,7 @@ def test_url_with_newline_header_injection_rejected(self): try: # We explicitly test urllib.request.urlopen() instead of the top # level 'def urlopen()' function defined in this... (quite ugly) - # test suite. they use different url opening codepaths. plain + # test suite. They use different url opening codepaths. Plain # urlopen uses FancyURLOpener which goes via a codepath that # calls urllib.parse.quote() on the URL which makes all of the # above attempts at injection within the url _path_ safe.