Only lstrip the URL to avoid breaking applications.

Many existing applications rely (for better or worse) on the trailing spaces being preserved by this API. So this moves more conservative and keeps those. The issue this change is addressing is triggered by leading spaces. One example library relyong on behavior: Django's URL validator library (at least in Django 3.2 and earlier; I have not checked later versions). If trailing spaces are stripped, its logic that involves urllib.parse for one logic path within its checks can fail to reject some URLs as invalid.
python · gpshead · May 17, 2023 · Mar 7, 2023 · Mar 7, 2023 · Mar 8, 2023
commit c863a8107ba2d03c4039fe04b451607f3bc1103f
diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst
@@ -324,9 +324,9 @@ or on combining URL components into a URL string.
    ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
    decomposed before parsing, no error will be raised.
 
-   Following the `WHATWG spec`_ that updates RFC 3986, leading and trailing C0
-   control and space characters are stripped from the URL. ``\n``, ``\r`` and
-   tab ``\t`` characters are removed from the URL at any position.
+   Following some of the `WHATWG spec`_ that updates RFC 3986, leading C0
+   control control and space characters are stripped from the URL. ``\n``,
+   ``\r`` and tab ``\t`` characters are removed from the URL at any position.
 
    .. versionchanged:: 3.6
       Out-of-range port numbers now raise :exc:`ValueError`, instead of
@@ -340,8 +340,7 @@ or on combining URL components into a URL string.
       ASCII newline and tab characters are stripped from the URL.
 
    .. versionchanged:: 3.12
-      Leading and trailing C0 control and space characters are stripped from
-      the URL.
+      Leading WHATWG C0 control and space characters are stripped from the URL.
 
 .. _WHATWG spec: https://url.spec.whatwg.org/#concept-basic-url-parser
 

diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
@@ -679,6 +679,27 @@ def test_urlsplit_strip_url(self):
         self.assertEqual(p.port, 80)
         self.assertEqual(p.geturl(), base_url.encode())
 
+        # Test that trailing space is preserved as some applications rely on
+        # this within query strings.
+        query_spaces_url = "https://www.python.org:88/doc/?query=    "
+        p = urllib.parse.urlsplit(noise.decode("utf-8") + query_spaces_url)
+        self.assertEqual(p.scheme, "https")
+        self.assertEqual(p.netloc, "www.python.org:88")
+        self.assertEqual(p.path, "/doc/")
+        self.assertEqual(p.query, "query=    ")
+        self.assertEqual(p.port, 88)
+        self.assertEqual(p.geturl(), query_spaces_url)
+
+        p = urllib.parse.urlsplit("www.pypi.org ")
+        # That "hostname" gets considered a "path" due to the
+        # trailing space and our existing logic...  YUCK...
+        # and re-assembles via geturl aka unurlsplit into the original.
+        # django.core.validators.URLValidator (at least through v3.2) relies on
+        # this, for better or worse, to catch it in a ValidationError via its
+        # regular expressions.
+        # Here we test the basic round trip concept of such a trailing space.
+        self.assertEqual(urllib.parse.urlunsplit(p), "www.pypi.org ")
+
         # with scheme as cache-key
         url = "//www.python.org/"
         scheme = noise.decode() + "https" + noise.decode()

diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
@@ -25,6 +25,10 @@
 scenarios for parsing, and for backward compatibility purposes, some
 parsing quirks from older RFCs are retained. The testcases in
 test_urlparse.py provides a good indicator of parsing behavior.
+
+The WHATWG URL Parser spec should also be considered.  We are not compliant with
+it either due to existing user code API behavior expectations (Hyrum's Law).
+It serves as a useful guide when making changes.
 """
 
 from collections import namedtuple
@@ -79,9 +83,9 @@
                 '0123456789'
                 '+-.')
 
-# Leading and trailing C0 control and space to be stripped per WHATWG spec
+# Leading and trailing C0 control and space to be stripped per WHATWG spec.
 # == "".join([chr(i) for i in range(0, 0x20 + 1)])
-_URL_CHARS_TO_STRIP = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f '
+_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f '
 
 # Unsafe bytes to be removed per WHATWG spec
 _UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']
@@ -456,8 +460,10 @@ def urlsplit(url, scheme='', allow_fragments=True):
     """
 
     url, scheme, _coerce_result = _coerce_args(url, scheme)
-    url = url.strip(_URL_CHARS_TO_STRIP)
-    scheme = scheme.strip(_URL_CHARS_TO_STRIP)
+    # Only lstrip url as some applications rely on preserving trailing space.
+    # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
+    url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
+    scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)
 
     for b in _UNSAFE_URL_BYTES_TO_REMOVE:
         url = url.replace(b, "")