From 0630a2dfaa97591b11b897863824fa111c5ff172 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 16 Apr 2025 19:50:19 +0100 Subject: [PATCH 1/9] GH-123599: `url2pathname()`: don't call `gethostbyname()` by default Follow-up to 0879ebc. Add *resolve_netloc* keyword-only argument to `url2pathname()`, defaulting to false. When set to true, we call `socket.gethostbyname()` to resolve the URL authority (netloc). --- Doc/library/pathlib.rst | 8 ++++---- Doc/library/urllib.request.rst | 20 +++++++++++++------ Doc/whatsnew/3.14.rst | 8 +++++--- Lib/test/test_pathlib/test_pathlib.py | 1 - Lib/test/test_urllib.py | 15 ++++++++++---- Lib/urllib/request.py | 15 +++++++++----- ...-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst | 9 +++++---- 7 files changed, 49 insertions(+), 27 deletions(-) diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index b82986902861b2..4fb8709261612a 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -872,10 +872,10 @@ conforming to :rfc:`8089`. .. versionadded:: 3.13 .. versionchanged:: next - If a URL authority (e.g. a hostname) is present and resolves to a local - address, it is discarded. If an authority is present and *doesn't* - resolve to a local address, then on Windows a UNC path is returned (as - before), and on other platforms a :exc:`ValueError` is raised. + If a URL authority matches the current hostname, it is discarded. + Otherwise, if the authority isn't empty or ``localhost``, then on + Windows a UNC path is returned (as before), and on other platforms a + :exc:`ValueError` is raised. .. method:: Path.as_uri() diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index b7c0c7d5099806..b471ce49a1be6a 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -175,7 +175,7 @@ The :mod:`urllib.request` module defines the following functions: The *add_scheme* argument was added. -.. function:: url2pathname(url, *, require_scheme=False) +.. function:: url2pathname(url, *, require_scheme=False, resolve_netloc=False) Convert the given ``file:`` URL to a local path. This function uses :func:`~urllib.parse.unquote` to decode the URL. @@ -185,6 +185,13 @@ The :mod:`urllib.request` module defines the following functions: value should include the prefix; a :exc:`~urllib.error.URLError` is raised if it doesn't. + The URL authority is discarded if it empty, ``localhost``, or the current + hostname. Otherwise, if *resolve_netloc* is set to true, the authority is + resolved to an IP address using :func:`socket.gethostbyname`, and discarded + if it matches a local IP. If the authority is still unhandled, then on + Windows a UNC path is returned, and on other platforms a + :exc:`~urllib.error.URLError` is raised. + This example shows the function being used on Windows:: >>> from urllib.request import url2pathname @@ -198,15 +205,16 @@ The :mod:`urllib.request` module defines the following functions: :exc:`OSError` exception to be raised on Windows. .. versionchanged:: next - This function calls :func:`socket.gethostbyname` if the URL authority - isn't empty, ``localhost``, or the machine hostname. If the authority - resolves to a local IP address then it is discarded; otherwise, on - Windows a UNC path is returned (as before), and on other platforms a - :exc:`~urllib.error.URLError` is raised. + The URL authority is discarded if it matches the machine hostname. + Otherwise, on Windows a UNC path is returned (as before), and on other + platforms a :exc:`~urllib.error.URLError` is raised. .. versionchanged:: next The *require_scheme* argument was added. + .. versionchanged:: next + The *resolve_netloc* argument was added. + .. function:: getproxies() diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index c50d1669fef84c..4d5b6717f3c416 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -1230,9 +1230,11 @@ urllib - Accept a complete URL when the new *require_scheme* argument is set to true. - - Discard URL authorities that resolve to a local IP address. - - Raise :exc:`~urllib.error.URLError` if a URL authority doesn't resolve - to a local IP address, except on Windows where we return a UNC path. + - Discard URL authority if it matches the current hostname. + - Discard URL authority if it resolves to a local IP address when the new + *resolve_netloc* argument is set to true. + - Raise :exc:`~urllib.error.URLError` if a URL authority isn't local, + except on Windows where we return a UNC path as before. In :func:`urllib.request.pathname2url`: diff --git a/Lib/test/test_pathlib/test_pathlib.py b/Lib/test/test_pathlib/test_pathlib.py index 41a79d0dceb0eb..e23dac3c25fa35 100644 --- a/Lib/test/test_pathlib/test_pathlib.py +++ b/Lib/test/test_pathlib/test_pathlib.py @@ -3290,7 +3290,6 @@ def test_from_uri_posix(self): self.assertEqual(P.from_uri('file:////foo/bar'), P('//foo/bar')) self.assertEqual(P.from_uri('file://localhost/foo/bar'), P('/foo/bar')) if not is_wasi: - self.assertEqual(P.from_uri('file://127.0.0.1/foo/bar'), P('/foo/bar')) self.assertEqual(P.from_uri(f'file://{socket.gethostname()}/foo/bar'), P('/foo/bar')) self.assertRaises(ValueError, P.from_uri, 'foo/bar') diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 90de828cc71249..2720cbabd21e44 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1551,7 +1551,8 @@ def test_url2pathname_require_scheme(self): urllib.request.url2pathname(url, require_scheme=True), expected_path) - error_subtests = [ + def test_url2pathname_require_scheme_errors(self): + subtests = [ '', ':', 'foo', @@ -1561,13 +1562,21 @@ def test_url2pathname_require_scheme(self): 'data:file:foo', 'data:file://foo', ] - for url in error_subtests: + for url in subtests: with self.subTest(url=url): self.assertRaises( urllib.error.URLError, urllib.request.url2pathname, url, require_scheme=True) + def test_url2pathname_resolve_netloc(self): + fn = urllib.request.url2pathname + sep = os.path.sep + self.assertRaises(urllib.error.URLError, fn, '//127.0.0.1/foo/bar') + self.assertEqual(fn('//127.0.0.1/foo/bar', resolve_netloc=True), f'{sep}foo{sep}bar') + self.assertEqual(fn(f'//{socket.gethostname()}/foo/bar'), f'{sep}foo{sep}bar') + self.assertEqual(fn(f'//{socket.gethostname()}/foo/bar', resolve_netloc=True), f'{sep}foo{sep}bar') + @unittest.skipUnless(sys.platform == 'win32', 'test specific to Windows pathnames.') def test_url2pathname_win(self): @@ -1622,8 +1631,6 @@ def test_url2pathname_posix(self): self.assertRaises(urllib.error.URLError, fn, '//:80/foo/bar') self.assertRaises(urllib.error.URLError, fn, '//:/foo/bar') self.assertRaises(urllib.error.URLError, fn, '//c:80/foo/bar') - self.assertEqual(fn('//127.0.0.1/foo/bar'), '/foo/bar') - self.assertEqual(fn(f'//{socket.gethostname()}/foo/bar'), '/foo/bar') @unittest.skipUnless(os_helper.FS_NONASCII, 'need os_helper.FS_NONASCII') def test_url2pathname_nonascii(self): diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 9a6b29a90a2968..2f9505527bb08a 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -1466,7 +1466,7 @@ def get_names(self): def open_local_file(self, req): import email.utils import mimetypes - localfile = url2pathname(req.full_url, require_scheme=True) + localfile = url2pathname(req.full_url, require_scheme=True, resolve_netloc=True) try: stats = os.stat(localfile) size = stats.st_size @@ -1482,7 +1482,7 @@ def open_local_file(self, req): file_open = open_local_file -def _is_local_authority(authority): +def _is_local_authority(authority, resolve): # Compare hostnames if not authority or authority == 'localhost': return True @@ -1494,6 +1494,8 @@ def _is_local_authority(authority): if authority == hostname: return True # Compare IP addresses + if not resolve: + return False try: address = socket.gethostbyname(authority) except (socket.gaierror, AttributeError): @@ -1643,11 +1645,14 @@ def data_open(self, req): # Code move from the old urllib module -def url2pathname(url, *, require_scheme=False): +def url2pathname(url, *, require_scheme=False, resolve_netloc=False): """Convert the given file URL to a local file system path. The 'file:' scheme prefix must be omitted unless *require_scheme* is set to true. + + The URL authority may be resolved with gethostbyname() if + *resolve_netloc* is set to true. """ if require_scheme: scheme, url = _splittype(url) @@ -1655,7 +1660,7 @@ def url2pathname(url, *, require_scheme=False): raise URLError("URL is missing a 'file:' scheme") authority, url = _splithost(url) if os.name == 'nt': - if not _is_local_authority(authority): + if not _is_local_authority(authority, resolve_netloc): # e.g. file://server/share/file.txt url = '//' + authority + url elif url[:3] == '///': @@ -1669,7 +1674,7 @@ def url2pathname(url, *, require_scheme=False): # Older URLs use a pipe after a drive letter url = url[:1] + ':' + url[2:] url = url.replace('/', '\\') - elif not _is_local_authority(authority): + elif not _is_local_authority(authority, resolve_netloc): raise URLError("file:// scheme is supported only on localhost") encoding = sys.getfilesystemencoding() errors = sys.getfilesystemencodeerrors() diff --git a/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst index 857cc359229daa..b13011f8927053 100644 --- a/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst +++ b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst @@ -1,5 +1,6 @@ Fix issue where :func:`urllib.request.url2pathname` mishandled file URLs with -authorities. If an authority is present and resolves to ``localhost``, it is -now discarded. If an authority is present but *doesn't* resolve to -``localhost``, then on Windows a UNC path is returned (as before), and on -other platforms a :exc:`urllib.error.URLError` is now raised. +authorities. If an authority matches the current hostname it is now discarded. +Otherwise, if the new *resolve_netloc* keyword-only argument is set to true, +the authority is resolved and discarded if it matches a local IP address. If +the authority is still unhandled, then then on Windows a UNC path is returned +(as before), and on other platforms a :exc:`urllib.error.URLError` is now raised. From 26c8688a9f971379151ef228a28ae66bb2f754c7 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 16 Apr 2025 20:25:55 +0100 Subject: [PATCH 2/9] Docs + tests fixes --- Doc/library/pathlib.rst | 7 +++---- Doc/library/urllib.request.rst | 6 +++--- Lib/test/test_urllib.py | 3 ++- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 4fb8709261612a..a7864bd2e23ec6 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -872,10 +872,9 @@ conforming to :rfc:`8089`. .. versionadded:: 3.13 .. versionchanged:: next - If a URL authority matches the current hostname, it is discarded. - Otherwise, if the authority isn't empty or ``localhost``, then on - Windows a UNC path is returned (as before), and on other platforms a - :exc:`ValueError` is raised. + The URL authority is discarded if it matches the machine hostname. + Otherwise, on Windows a UNC path is returned (as before), and on other + platforms a :exc:`ValueError` is raised. .. method:: Path.as_uri() diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index b471ce49a1be6a..dc0e5fb442884b 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -187,9 +187,9 @@ The :mod:`urllib.request` module defines the following functions: The URL authority is discarded if it empty, ``localhost``, or the current hostname. Otherwise, if *resolve_netloc* is set to true, the authority is - resolved to an IP address using :func:`socket.gethostbyname`, and discarded - if it matches a local IP. If the authority is still unhandled, then on - Windows a UNC path is returned, and on other platforms a + resolved using :func:`socket.gethostbyname`, and discarded if it matches a + local IP address. If the authority is still unhandled, then on Windows a + UNC path is returned, and on other platforms a :exc:`~urllib.error.URLError` is raised. This example shows the function being used on Windows:: diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 2720cbabd21e44..00a20f92c199f2 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1572,7 +1572,6 @@ def test_url2pathname_require_scheme_errors(self): def test_url2pathname_resolve_netloc(self): fn = urllib.request.url2pathname sep = os.path.sep - self.assertRaises(urllib.error.URLError, fn, '//127.0.0.1/foo/bar') self.assertEqual(fn('//127.0.0.1/foo/bar', resolve_netloc=True), f'{sep}foo{sep}bar') self.assertEqual(fn(f'//{socket.gethostname()}/foo/bar'), f'{sep}foo{sep}bar') self.assertEqual(fn(f'//{socket.gethostname()}/foo/bar', resolve_netloc=True), f'{sep}foo{sep}bar') @@ -1607,6 +1606,7 @@ def test_url2pathname_win(self): self.assertEqual(fn('//server/path/to/file'), '\\\\server\\path\\to\\file') self.assertEqual(fn('////server/path/to/file'), '\\\\server\\path\\to\\file') self.assertEqual(fn('/////server/path/to/file'), '\\\\server\\path\\to\\file') + self.assertEqual(fn('//127.0.0.1/path/to/file'), '\\\\127.0.0.1\\path\\to\\file') # Localhost paths self.assertEqual(fn('//localhost/C:/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn('//localhost/C|/path/to/file'), 'C:\\path\\to\\file') @@ -1631,6 +1631,7 @@ def test_url2pathname_posix(self): self.assertRaises(urllib.error.URLError, fn, '//:80/foo/bar') self.assertRaises(urllib.error.URLError, fn, '//:/foo/bar') self.assertRaises(urllib.error.URLError, fn, '//c:80/foo/bar') + self.assertRaises(urllib.error.URLError, fn, '//127.0.0.1/foo/bar') @unittest.skipUnless(os_helper.FS_NONASCII, 'need os_helper.FS_NONASCII') def test_url2pathname_nonascii(self): From 1bff8b944a74a755b47cdbf3c2f9248200515044 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 16 Apr 2025 20:55:27 +0100 Subject: [PATCH 3/9] This is hard to explain D: --- Doc/library/pathlib.rst | 5 +++-- Doc/library/urllib.request.rst | 5 +++-- .../2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst | 13 ++++++++----- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index a7864bd2e23ec6..04175ea028e310 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -873,8 +873,9 @@ conforming to :rfc:`8089`. .. versionchanged:: next The URL authority is discarded if it matches the machine hostname. - Otherwise, on Windows a UNC path is returned (as before), and on other - platforms a :exc:`ValueError` is raised. + Otherwise, if the authority isn't empty or ``localhost``, then on + Windows a UNC path is returned (as before), and on other platforms a + :exc:`ValueError` is raised. .. method:: Path.as_uri() diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index dc0e5fb442884b..6580661fa4c5d3 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -206,8 +206,9 @@ The :mod:`urllib.request` module defines the following functions: .. versionchanged:: next The URL authority is discarded if it matches the machine hostname. - Otherwise, on Windows a UNC path is returned (as before), and on other - platforms a :exc:`~urllib.error.URLError` is raised. + Otherwise, if the authority isn't empty or ``localhost``, then on + Windows a UNC path is returned (as before), and on other platforms a + :exc:`~urllib.error.URLError` is raised. .. versionchanged:: next The *require_scheme* argument was added. diff --git a/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst index b13011f8927053..61282f4ac8fce0 100644 --- a/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst +++ b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst @@ -1,6 +1,9 @@ Fix issue where :func:`urllib.request.url2pathname` mishandled file URLs with -authorities. If an authority matches the current hostname it is now discarded. -Otherwise, if the new *resolve_netloc* keyword-only argument is set to true, -the authority is resolved and discarded if it matches a local IP address. If -the authority is still unhandled, then then on Windows a UNC path is returned -(as before), and on other platforms a :exc:`urllib.error.URLError` is now raised. +authorities. The process now works as follows: + +1. Discard authority if it is empty or ``localhost``; otherwise +2. (New) Discard authority if it matches the current hostname; otherwise +3. (New) If the new *resolve_netloc* keyword-only argument is set to true, + discard authority if it resolves to a local IP address; otherwise +4. On Windows, return a UNC path; otherwise +5. (New) Raise :exc:`urllib.error.URLError`. From 4ec391f0e75fad24a58fd74972bd9a61d51aeaa8 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 16 Apr 2025 21:04:16 +0100 Subject: [PATCH 4/9] Terminology --- Doc/library/pathlib.rst | 2 +- Doc/library/urllib.request.rst | 4 ++-- .../Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 04175ea028e310..acb91812818dab 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -872,7 +872,7 @@ conforming to :rfc:`8089`. .. versionadded:: 3.13 .. versionchanged:: next - The URL authority is discarded if it matches the machine hostname. + The URL authority is discarded if it matches the local hostname. Otherwise, if the authority isn't empty or ``localhost``, then on Windows a UNC path is returned (as before), and on other platforms a :exc:`ValueError` is raised. diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index 6580661fa4c5d3..46dc7a204815c0 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -185,7 +185,7 @@ The :mod:`urllib.request` module defines the following functions: value should include the prefix; a :exc:`~urllib.error.URLError` is raised if it doesn't. - The URL authority is discarded if it empty, ``localhost``, or the current + The URL authority is discarded if it empty, ``localhost``, or the local hostname. Otherwise, if *resolve_netloc* is set to true, the authority is resolved using :func:`socket.gethostbyname`, and discarded if it matches a local IP address. If the authority is still unhandled, then on Windows a @@ -205,7 +205,7 @@ The :mod:`urllib.request` module defines the following functions: :exc:`OSError` exception to be raised on Windows. .. versionchanged:: next - The URL authority is discarded if it matches the machine hostname. + The URL authority is discarded if it matches the local hostname. Otherwise, if the authority isn't empty or ``localhost``, then on Windows a UNC path is returned (as before), and on other platforms a :exc:`~urllib.error.URLError` is raised. diff --git a/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst index 61282f4ac8fce0..9b24fa2f4afe49 100644 --- a/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst +++ b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst @@ -2,7 +2,7 @@ Fix issue where :func:`urllib.request.url2pathname` mishandled file URLs with authorities. The process now works as follows: 1. Discard authority if it is empty or ``localhost``; otherwise -2. (New) Discard authority if it matches the current hostname; otherwise +2. (New) Discard authority if it matches the local hostname; otherwise 3. (New) If the new *resolve_netloc* keyword-only argument is set to true, discard authority if it resolves to a local IP address; otherwise 4. On Windows, return a UNC path; otherwise From 8df17b414f1e49e1955714b5e8bc95b783c4c816 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 16 Apr 2025 21:06:44 +0100 Subject: [PATCH 5/9] Faffing --- Doc/library/urllib.request.rst | 2 +- Doc/whatsnew/3.14.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index 46dc7a204815c0..8bd2e6833f354e 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -187,7 +187,7 @@ The :mod:`urllib.request` module defines the following functions: The URL authority is discarded if it empty, ``localhost``, or the local hostname. Otherwise, if *resolve_netloc* is set to true, the authority is - resolved using :func:`socket.gethostbyname`, and discarded if it matches a + resolved using :func:`socket.gethostbyname` and discarded if it matches a local IP address. If the authority is still unhandled, then on Windows a UNC path is returned, and on other platforms a :exc:`~urllib.error.URLError` is raised. diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 4d5b6717f3c416..d2d4bad82894a0 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -1230,7 +1230,7 @@ urllib - Accept a complete URL when the new *require_scheme* argument is set to true. - - Discard URL authority if it matches the current hostname. + - Discard URL authority if it matches the local hostname. - Discard URL authority if it resolves to a local IP address when the new *resolve_netloc* argument is set to true. - Raise :exc:`~urllib.error.URLError` if a URL authority isn't local, From 9e7535759072c3bd85f7b33be53f7729e6d19bad Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Mon, 28 Apr 2025 17:48:46 +0100 Subject: [PATCH 6/9] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com> --- Doc/library/urllib.request.rst | 2 +- Lib/urllib/request.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index 8bd2e6833f354e..fe952f343e564f 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -185,7 +185,7 @@ The :mod:`urllib.request` module defines the following functions: value should include the prefix; a :exc:`~urllib.error.URLError` is raised if it doesn't. - The URL authority is discarded if it empty, ``localhost``, or the local + The URL authority is discarded if it is empty, ``localhost``, or the local hostname. Otherwise, if *resolve_netloc* is set to true, the authority is resolved using :func:`socket.gethostbyname` and discarded if it matches a local IP address. If the authority is still unhandled, then on Windows a diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 2f9505527bb08a..390a172877e760 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -1643,7 +1643,7 @@ def data_open(self, req): return addinfourl(io.BytesIO(data), headers, url) -# Code move from the old urllib module +# Code moved from the old urllib module def url2pathname(url, *, require_scheme=False, resolve_netloc=False): """Convert the given file URL to a local file system path. From 873c19a4d1b8588ba1063572c07195cfe74835c3 Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 28 Apr 2025 18:05:03 +0100 Subject: [PATCH 7/9] Address some review feedback --- Doc/library/urllib.request.rst | 15 ++++++--------- Doc/whatsnew/3.14.rst | 2 +- Lib/test/test_urllib.py | 6 +++--- Lib/urllib/request.py | 10 +++++----- ...2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst | 12 +++--------- 5 files changed, 18 insertions(+), 27 deletions(-) diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index fe952f343e564f..53eddad0fd03bb 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -175,7 +175,7 @@ The :mod:`urllib.request` module defines the following functions: The *add_scheme* argument was added. -.. function:: url2pathname(url, *, require_scheme=False, resolve_netloc=False) +.. function:: url2pathname(url, *, require_scheme=False, resolve_host=False) Convert the given ``file:`` URL to a local path. This function uses :func:`~urllib.parse.unquote` to decode the URL. @@ -186,11 +186,11 @@ The :mod:`urllib.request` module defines the following functions: if it doesn't. The URL authority is discarded if it is empty, ``localhost``, or the local - hostname. Otherwise, if *resolve_netloc* is set to true, the authority is + hostname. Otherwise, if *resolve_host* is set to true, the authority is resolved using :func:`socket.gethostbyname` and discarded if it matches a - local IP address. If the authority is still unhandled, then on Windows a - UNC path is returned, and on other platforms a - :exc:`~urllib.error.URLError` is raised. + local IP address (as per :rfc:`RFC 8089 §3 <8089#section-3>`). If the + authority is still unhandled, then on Windows a UNC path is returned, and + on other platforms a :exc:`~urllib.error.URLError` is raised. This example shows the function being used on Windows:: @@ -211,10 +211,7 @@ The :mod:`urllib.request` module defines the following functions: :exc:`~urllib.error.URLError` is raised. .. versionchanged:: next - The *require_scheme* argument was added. - - .. versionchanged:: next - The *resolve_netloc* argument was added. + The *require_scheme* and *resolve_host* arguments were added. .. function:: getproxies() diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index d2d4bad82894a0..47382d764175fe 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -1232,7 +1232,7 @@ urllib true. - Discard URL authority if it matches the local hostname. - Discard URL authority if it resolves to a local IP address when the new - *resolve_netloc* argument is set to true. + *resolve_host* argument is set to true. - Raise :exc:`~urllib.error.URLError` if a URL authority isn't local, except on Windows where we return a UNC path as before. diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 00a20f92c199f2..c965860fbb10ef 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1569,12 +1569,12 @@ def test_url2pathname_require_scheme_errors(self): urllib.request.url2pathname, url, require_scheme=True) - def test_url2pathname_resolve_netloc(self): + def test_url2pathname_resolve_host(self): fn = urllib.request.url2pathname sep = os.path.sep - self.assertEqual(fn('//127.0.0.1/foo/bar', resolve_netloc=True), f'{sep}foo{sep}bar') + self.assertEqual(fn('//127.0.0.1/foo/bar', resolve_host=True), f'{sep}foo{sep}bar') self.assertEqual(fn(f'//{socket.gethostname()}/foo/bar'), f'{sep}foo{sep}bar') - self.assertEqual(fn(f'//{socket.gethostname()}/foo/bar', resolve_netloc=True), f'{sep}foo{sep}bar') + self.assertEqual(fn(f'//{socket.gethostname()}/foo/bar', resolve_host=True), f'{sep}foo{sep}bar') @unittest.skipUnless(sys.platform == 'win32', 'test specific to Windows pathnames.') diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 390a172877e760..32d5bdd5fdf687 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -1466,7 +1466,7 @@ def get_names(self): def open_local_file(self, req): import email.utils import mimetypes - localfile = url2pathname(req.full_url, require_scheme=True, resolve_netloc=True) + localfile = url2pathname(req.full_url, require_scheme=True, resolve_host=True) try: stats = os.stat(localfile) size = stats.st_size @@ -1645,14 +1645,14 @@ def data_open(self, req): # Code moved from the old urllib module -def url2pathname(url, *, require_scheme=False, resolve_netloc=False): +def url2pathname(url, *, require_scheme=False, resolve_host=False): """Convert the given file URL to a local file system path. The 'file:' scheme prefix must be omitted unless *require_scheme* is set to true. The URL authority may be resolved with gethostbyname() if - *resolve_netloc* is set to true. + *resolve_host* is set to true. """ if require_scheme: scheme, url = _splittype(url) @@ -1660,7 +1660,7 @@ def url2pathname(url, *, require_scheme=False, resolve_netloc=False): raise URLError("URL is missing a 'file:' scheme") authority, url = _splithost(url) if os.name == 'nt': - if not _is_local_authority(authority, resolve_netloc): + if not _is_local_authority(authority, resolve_host): # e.g. file://server/share/file.txt url = '//' + authority + url elif url[:3] == '///': @@ -1674,7 +1674,7 @@ def url2pathname(url, *, require_scheme=False, resolve_netloc=False): # Older URLs use a pipe after a drive letter url = url[:1] + ':' + url[2:] url = url.replace('/', '\\') - elif not _is_local_authority(authority, resolve_netloc): + elif not _is_local_authority(authority, resolve_host): raise URLError("file:// scheme is supported only on localhost") encoding = sys.getfilesystemencoding() errors = sys.getfilesystemencodeerrors() diff --git a/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst index 9b24fa2f4afe49..aac6b41b5e87f3 100644 --- a/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst +++ b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst @@ -1,9 +1,3 @@ -Fix issue where :func:`urllib.request.url2pathname` mishandled file URLs with -authorities. The process now works as follows: - -1. Discard authority if it is empty or ``localhost``; otherwise -2. (New) Discard authority if it matches the local hostname; otherwise -3. (New) If the new *resolve_netloc* keyword-only argument is set to true, - discard authority if it resolves to a local IP address; otherwise -4. On Windows, return a UNC path; otherwise -5. (New) Raise :exc:`urllib.error.URLError`. +Add *resolve_host* keyword-only argument to +:func:`urllib.request.url2pathname`, and fix handling of file URLs with +authorities. From 4bbeef68abb1213e972f4916c085c04cf6adadef Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Mon, 5 May 2025 12:52:54 +0100 Subject: [PATCH 8/9] Update Lib/urllib/request.py Co-authored-by: Steve Dower --- Lib/urllib/request.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 32d5bdd5fdf687..41dc5d7b35dedb 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -1498,7 +1498,7 @@ def _is_local_authority(authority, resolve): return False try: address = socket.gethostbyname(authority) - except (socket.gaierror, AttributeError): + except (socket.gaierror, AttributeError, UnicodeEncodeError): return False return address in FileHandler().get_names() From 2a42d88012dc9c7131dba8ec02db9a1365804c00 Mon Sep 17 00:00:00 2001 From: barneygale Date: Mon, 5 May 2025 12:56:04 +0100 Subject: [PATCH 9/9] argument -> parameter --- Doc/library/urllib.request.rst | 4 ++-- .../Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index 53eddad0fd03bb..234827132bfaf0 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -172,7 +172,7 @@ The :mod:`urllib.request` module defines the following functions: the URL ``///etc/hosts``. .. versionchanged:: next - The *add_scheme* argument was added. + The *add_scheme* parameter was added. .. function:: url2pathname(url, *, require_scheme=False, resolve_host=False) @@ -211,7 +211,7 @@ The :mod:`urllib.request` module defines the following functions: :exc:`~urllib.error.URLError` is raised. .. versionchanged:: next - The *require_scheme* and *resolve_host* arguments were added. + The *require_scheme* and *resolve_host* parameters were added. .. function:: getproxies() diff --git a/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst index aac6b41b5e87f3..c45c22e73448db 100644 --- a/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst +++ b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst @@ -1,3 +1,3 @@ -Add *resolve_host* keyword-only argument to +Add *resolve_host* keyword-only parameter to :func:`urllib.request.url2pathname`, and fix handling of file URLs with authorities.