Move the check to http.client where it belongs.

python · gpshead · May 1, 2019 · Apr 10, 2019 · Apr 10, 2019 · Apr 10, 2019
commit bde613c092c9836ad9756aba96c251c99fc16d91
diff --git a/Lib/http/client.py b/Lib/http/client.py
@@ -137,6 +137,14 @@
 _is_legal_header_name = re.compile(rb'[^:\s][^:\r\n]*').fullmatch
 _is_illegal_header_value = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])').search
 
+# These characters are not allowed within http URL paths.
+#  https://tools.ietf.org/html/rfc3986#section-3.3
+# in order to prevent CVE-2019-9740.
+_contains_disallowed_url_pchar_re = re.compile('[\x00-\x20\x7f-\x9f]')
+# Arguably only these _should_ allowed:
+#  _is_allowed_url_pchars_re = re.compile(r"^[/!$&'()*+,;=:@%a-zA-Z0-9._~-]+$")
+# We are more lenient for assumed real world compatibility purposes.
+
 # We always set the Content-Length header for these methods because some
 # servers will otherwise respond with a 411
 _METHODS_EXPECTING_BODY = {'PATCH', 'POST', 'PUT'}
@@ -1079,6 +1087,9 @@ def putrequest(self, method, url, skip_host=False,
         self._method = method
         if not url:
             url = '/'
+        # Prevent CVE-2019-9740.
+        if _contains_disallowed_url_pchar_re.search(url):
+            raise ValueError("URL can't contain control characters. {url!r}")
         request = '%s %s %s' % (method, url, self._http_vsn_str)
 
         # Non-ASCII characters should have been eliminated earlier

diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py
@@ -329,6 +329,27 @@ def test_willclose(self):
         finally:
             self.unfakehttp()
 
+    def test_url_with_newline_header_injection_rejected(self):
+        self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.")
+        host = "localhost:7777?a=1 HTTP/1.1\r\nX-injected: header\r\nTEST: 123"
+        schemeless_url = "//" + host + ":8080/test/?test=a"
+        try:
+            # We explicitly test urllib.request.urlopen() instead of the top
+            # level 'def urlopen()' function defined in this... (quite ugly)
+            # test suite.  they use different url opening codepaths.  plain
+            # urlopen uses FancyURLOpener which goes via a codepath that
+            # calls urllib.parse.quote() on the URL which makes all of the
+            # above attempts at injection within the url _path_ safe.
+            with self.assertRaisesRegex(ValueError, "can't contain control"):
+                urllib.request.urlopen(f"http:{schemeless_url}")
+            with self.assertRaisesRegex(ValueError, "can't contain control"):
+                urllib.request.urlopen(f"https:{schemeless_url}")
+            resp = urlopen(f"http:{schemeless_url}")
+            self.assertNotIn('\r', resp.geturl())
+            self.assertNotIn('\n', resp.geturl())
+        finally:
+            self.unfakehttp()
+
     def test_read_0_9(self):
         # "0.9" response accepted (but not "simple responses" without
         # a status line)

diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
@@ -991,8 +991,6 @@ def splittype(url):
 
 
 _typeprog = None
-_control_char_re = None
-_schemes_disallowing_control_chars = frozenset({'http', 'https', 'ftp'})
 def _splittype(url):
     """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
     global _typeprog
@@ -1002,26 +1000,7 @@ def _splittype(url):
     match = _typeprog.match(url)
     if match:
         scheme, data = match.groups()
-        scheme = scheme.lower()
-        if scheme in _schemes_disallowing_control_chars:
-            # Sanity check url data to avoid control characters.
-            #  https://bugs.python.org/issue14826
-            #  https://bugs.python.org/issue36276
-            # The same control characters check was adopted by Golang in:
-            #  https://go-review.googlesource.com/c/go/+/159157
-            # Isn't it odd to be performing validation within this utility
-            # function?  Yes... but it is in wide use in all of the right
-            # places where URLs need a sanity check to avoid potential security
-            # issues in newline delimited text based protocol implementations.
-            # This way many things get it for free without every use needing to
-            # be updated to explicitly sanity check the path contents.
-            global _control_char_re
-            if _control_char_re is None:
-                _control_char_re = re.compile('[\x00-\x1f\x7f-\x9f]')
-            if _control_char_re.search(data):
-                raise ValueError(f"{scheme} URL can't contain control "
-                                 f"characters. {data!r}")
-        return scheme, data
+        return scheme.lower(), data
     return None, url