[3.3] bpo-30500: urllib: Simplify splithost by calling into urlparse.… · python/cpython@052f9d6 · GitHub
[go: up one dir, main page]

Skip to content

Commit 052f9d6

Browse files
vstinner authored and ned-deily committed
[3.3] bpo-30500: urllib: Simplify splithost by calling into urlparse. (#1849) (#2292)
The current regex-based splitting produces a wrong result. For example:: http://abc#@def Web browsers parse that URL as ``http://abc/#@def``, that is, the host is ``abc``, the path is ``/``, and the fragment is ``@def``. (cherry picked from commit 90e01e5) (cherry picked from commit cc54c1c)
1 parent b5f20ea commit 052f9d6

File tree

4 files changed

+208
-52
lines changed

4 files changed

+208
-52
lines changed

Lib/test/test_urlparse.py

Lines changed: 200 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,52 @@ def test_anyscheme(self):
664664
self.assertEqual(urllib.parse.urlparse(b"x-newscheme://foo.com/stuff?query"),
665665
(b'x-newscheme', b'foo.com', b'/stuff', b'', b'query', b''))
666666

667+
def test_default_scheme(self):
668+
# Exercise the scheme parameter of urlparse() and urlsplit()
669+
for func in (urllib.parse.urlparse, urllib.parse.urlsplit):
670+
result = func("http://example.net/", "ftp")
671+
self.assertEqual(result.scheme, "http")
672+
result = func(b"http://example.net/", b"ftp")
673+
self.assertEqual(result.scheme, b"http")
674+
self.assertEqual(func("path", "ftp").scheme, "ftp")
675+
self.assertEqual(func("path", scheme="ftp").scheme, "ftp")
676+
self.assertEqual(func(b"path", scheme=b"ftp").scheme, b"ftp")
677+
self.assertEqual(func("path").scheme, "")
678+
self.assertEqual(func(b"path").scheme, b"")
679+
self.assertEqual(func(b"path", "").scheme, b"")
680+
681+
def test_parse_fragments(self):
682+
# Exercise the allow_fragments parameter of urlparse() and urlsplit()
683+
tests = (
684+
("http:#frag", "path", "frag"),
685+
("//example.net#frag", "path", "frag"),
686+
("index.html#frag", "path", "frag"),
687+
(";a=b#frag", "params", "frag"),
688+
("?a=b#frag", "query", "frag"),
689+
("#frag", "path", "frag"),
690+
("abc#@frag", "path", "@frag"),
691+
("//abc#@frag", "path", "@frag"),
692+
("//abc:80#@frag", "path", "@frag"),
693+
("//abc#@frag:80", "path", "@frag:80"),
694+
)
695+
for url, attr, expected_frag in tests:
696+
for func in (urllib.parse.urlparse, urllib.parse.urlsplit):
697+
if attr == "params" and func is urllib.parse.urlsplit:
698+
attr = "path"
699+
result = func(url, allow_fragments=False)
700+
self.assertEqual(result.fragment, "")
701+
self.assertTrue(
702+
getattr(result, attr).endswith("#" + expected_frag))
703+
self.assertEqual(func(url, "", False).fragment, "")
704+
705+
result = func(url, allow_fragments=True)
706+
self.assertEqual(result.fragment, expected_frag)
707+
self.assertFalse(
708+
getattr(result, attr).endswith(expected_frag))
709+
self.assertEqual(func(url, "", True).fragment,
710+
expected_frag)
711+
self.assertEqual(func(url).fragment, expected_frag)
712+
667713
def test_mixed_types_rejected(self):
668714
# Several functions that process either strings or ASCII encoded bytes
669715
# accept multiple arguments. Check they reject mixed type input
@@ -749,52 +795,6 @@ def test_parse_qsl_encoding(self):
749795
errors="ignore")
750796
self.assertEqual(result, [('key', '\u0141-')])
751797

752-
def test_splitport(self):
753-
splitport = urllib.parse.splitport
754-
self.assertEqual(splitport('parrot:88'), ('parrot', '88'))
755-
self.assertEqual(splitport('parrot'), ('parrot', None))
756-
self.assertEqual(splitport('parrot:'), ('parrot', None))
757-
self.assertEqual(splitport('127.0.0.1'), ('127.0.0.1', None))
758-
self.assertEqual(splitport('parrot:cheese'), ('parrot:cheese', None))
759-
760-
def test_splitnport(self):
761-
splitnport = urllib.parse.splitnport
762-
self.assertEqual(splitnport('parrot:88'), ('parrot', 88))
763-
self.assertEqual(splitnport('parrot'), ('parrot', -1))
764-
self.assertEqual(splitnport('parrot', 55), ('parrot', 55))
765-
self.assertEqual(splitnport('parrot:'), ('parrot', -1))
766-
self.assertEqual(splitnport('parrot:', 55), ('parrot', 55))
767-
self.assertEqual(splitnport('127.0.0.1'), ('127.0.0.1', -1))
768-
self.assertEqual(splitnport('127.0.0.1', 55), ('127.0.0.1', 55))
769-
self.assertEqual(splitnport('parrot:cheese'), ('parrot', None))
770-
self.assertEqual(splitnport('parrot:cheese', 55), ('parrot', None))
771-
772-
def test_splitquery(self):
773-
# Normal cases are exercised by other tests; ensure that we also
774-
# catch cases with no port specified (testcase ensuring coverage)
775-
result = urllib.parse.splitquery('http://python.org/fake?foo=bar')
776-
self.assertEqual(result, ('http://python.org/fake', 'foo=bar'))
777-
result = urllib.parse.splitquery('http://python.org/fake?foo=bar?')
778-
self.assertEqual(result, ('http://python.org/fake?foo=bar', ''))
779-
result = urllib.parse.splitquery('http://python.org/fake')
780-
self.assertEqual(result, ('http://python.org/fake', None))
781-
782-
def test_splitvalue(self):
783-
# Normal cases are exercised by other tests; test pathological cases
784-
# with no key/value pairs. (testcase ensuring coverage)
785-
result = urllib.parse.splitvalue('foo=bar')
786-
self.assertEqual(result, ('foo', 'bar'))
787-
result = urllib.parse.splitvalue('foo=')
788-
self.assertEqual(result, ('foo', ''))
789-
result = urllib.parse.splitvalue('foobar')
790-
self.assertEqual(result, ('foobar', None))
791-
792-
def test_to_bytes(self):
793-
result = urllib.parse.to_bytes('http://www.python.org')
794-
self.assertEqual(result, 'http://www.python.org')
795-
self.assertRaises(UnicodeError, urllib.parse.to_bytes,
796-
'http://www.python.org/medi\u00e6val')
797-
798798
def test_urlencode_sequences(self):
799799
# Other tests incidentally urlencode things; test non-covered cases:
800800
# Sequence and object values.
@@ -863,9 +863,162 @@ def test_telurl_params(self):
863863
self.assertEqual(p1.path, '863-1234')
864864
self.assertEqual(p1.params, 'phone-context=+1-914-555')
865865

866+
def test_Quoter_repr(self):
867+
quoter = urllib.parse.Quoter(urllib.parse._ALWAYS_SAFE)
868+
self.assertIn('Quoter', repr(quoter))
869+
870+
871+
class Utility_Tests(unittest.TestCase):
872+
"""Testcase to test the various utility functions in the urllib."""
873+
# In Python 2 this test class was in test_urllib.
874+
875+
def test_splittype(self):
876+
splittype = urllib.parse.splittype
877+
self.assertEqual(splittype('type:opaquestring'), ('type', 'opaquestring'))
878+
self.assertEqual(splittype('opaquestring'), (None, 'opaquestring'))
879+
self.assertEqual(splittype(':opaquestring'), (None, ':opaquestring'))
880+
self.assertEqual(splittype('type:'), ('type', ''))
881+
self.assertEqual(splittype('type:opaque:string'), ('type', 'opaque:string'))
882+
883+
def test_splithost(self):
884+
splithost = urllib.parse.splithost
885+
self.assertEqual(splithost('//www.example.org:80/foo/bar/baz.html'),
886+
('www.example.org:80', '/foo/bar/baz.html'))
887+
self.assertEqual(splithost('//www.example.org:80'),
888+
('www.example.org:80', ''))
889+
self.assertEqual(splithost('/foo/bar/baz.html'),
890+
(None, '/foo/bar/baz.html'))
891+
892+
# bpo-30500: # starts a fragment.
893+
self.assertEqual(splithost('//127.0.0.1#@host.com'),
894+
('127.0.0.1', '/#@host.com'))
895+
self.assertEqual(splithost('//127.0.0.1#@host.com:80'),
896+
('127.0.0.1', '/#@host.com:80'))
897+
self.assertEqual(splithost('//127.0.0.1:80#@host.com'),
898+
('127.0.0.1:80', '/#@host.com'))
899+
900+
# Empty host is returned as empty string.
901+
self.assertEqual(splithost("///file"),
902+
('', '/file'))
903+
904+
# Trailing semicolon, question mark and hash symbol are kept.
905+
self.assertEqual(splithost("//example.net/file;"),
906+
('example.net', '/file;'))
907+
self.assertEqual(splithost("//example.net/file?"),
908+
('example.net', '/file?'))
909+
self.assertEqual(splithost("//example.net/file#"),
910+
('example.net', '/file#'))
911+
912+
def test_splituser(self):
913+
splituser = urllib.parse.splituser
914+
self.assertEqual(splituser('User:Pass@www.python.org:080'),
915+
('User:Pass', 'www.python.org:080'))
916+
self.assertEqual(splituser('@www.python.org:080'),
917+
('', 'www.python.org:080'))
918+
self.assertEqual(splituser('www.python.org:080'),
919+
(None, 'www.python.org:080'))
920+
self.assertEqual(splituser('User:Pass@'),
921+
('User:Pass', ''))
922+
self.assertEqual(splituser('User@example.com:Pass@www.python.org:080'),
923+
('User@example.com:Pass', 'www.python.org:080'))
924+
925+
def test_splitpasswd(self):
926+
# Some of the password examples are not sensible, but it is added to
927+
# confirming to RFC2617 and addressing issue4675.
928+
splitpasswd = urllib.parse.splitpasswd
929+
self.assertEqual(splitpasswd('user:ab'), ('user', 'ab'))
930+
self.assertEqual(splitpasswd('user:a\nb'), ('user', 'a\nb'))
931+
self.assertEqual(splitpasswd('user:a\tb'), ('user', 'a\tb'))
932+
self.assertEqual(splitpasswd('user:a\rb'), ('user', 'a\rb'))
933+
self.assertEqual(splitpasswd('user:a\fb'), ('user', 'a\fb'))
934+
self.assertEqual(splitpasswd('user:a\vb'), ('user', 'a\vb'))
935+
self.assertEqual(splitpasswd('user:a:b'), ('user', 'a:b'))
936+
self.assertEqual(splitpasswd('user:a b'), ('user', 'a b'))
937+
self.assertEqual(splitpasswd('user 2:ab'), ('user 2', 'ab'))
938+
self.assertEqual(splitpasswd('user+1:a+b'), ('user+1', 'a+b'))
939+
self.assertEqual(splitpasswd('user:'), ('user', ''))
940+
self.assertEqual(splitpasswd('user'), ('user', None))
941+
self.assertEqual(splitpasswd(':ab'), ('', 'ab'))
942+
943+
def test_splitport(self):
944+
splitport = urllib.parse.splitport
945+
self.assertEqual(splitport('parrot:88'), ('parrot', '88'))
946+
self.assertEqual(splitport('parrot'), ('parrot', None))
947+
self.assertEqual(splitport('parrot:'), ('parrot', None))
948+
self.assertEqual(splitport('127.0.0.1'), ('127.0.0.1', None))
949+
self.assertEqual(splitport('parrot:cheese'), ('parrot:cheese', None))
950+
self.assertEqual(splitport('[::1]:88'), ('[::1]', '88'))
951+
self.assertEqual(splitport('[::1]'), ('[::1]', None))
952+
self.assertEqual(splitport(':88'), ('', '88'))
953+
954+
def test_splitnport(self):
955+
splitnport = urllib.parse.splitnport
956+
self.assertEqual(splitnport('parrot:88'), ('parrot', 88))
957+
self.assertEqual(splitnport('parrot'), ('parrot', -1))
958+
self.assertEqual(splitnport('parrot', 55), ('parrot', 55))
959+
self.assertEqual(splitnport('parrot:'), ('parrot', -1))
960+
self.assertEqual(splitnport('parrot:', 55), ('parrot', 55))
961+
self.assertEqual(splitnport('127.0.0.1'), ('127.0.0.1', -1))
962+
self.assertEqual(splitnport('127.0.0.1', 55), ('127.0.0.1', 55))
963+
self.assertEqual(splitnport('parrot:cheese'), ('parrot', None))
964+
self.assertEqual(splitnport('parrot:cheese', 55), ('parrot', None))
965+
966+
def test_splitquery(self):
967+
# Normal cases are exercised by other tests; ensure that we also
968+
# catch cases with no port specified (testcase ensuring coverage)
969+
splitquery = urllib.parse.splitquery
970+
self.assertEqual(splitquery('http://python.org/fake?foo=bar'),
971+
('http://python.org/fake', 'foo=bar'))
972+
self.assertEqual(splitquery('http://python.org/fake?foo=bar?'),
973+
('http://python.org/fake?foo=bar', ''))
974+
self.assertEqual(splitquery('http://python.org/fake'),
975+
('http://python.org/fake', None))
976+
self.assertEqual(splitquery('?foo=bar'), ('', 'foo=bar'))
977+
978+
def test_splittag(self):
979+
splittag = urllib.parse.splittag
980+
self.assertEqual(splittag('http://example.com?foo=bar#baz'),
981+
('http://example.com?foo=bar', 'baz'))
982+
self.assertEqual(splittag('http://example.com?foo=bar#'),
983+
('http://example.com?foo=bar', ''))
984+
self.assertEqual(splittag('#baz'), ('', 'baz'))
985+
self.assertEqual(splittag('http://example.com?foo=bar'),
986+
('http://example.com?foo=bar', None))
987+
self.assertEqual(splittag('http://example.com?foo=bar#baz#boo'),
988+
('http://example.com?foo=bar#baz', 'boo'))
989+
990+
def test_splitattr(self):
991+
splitattr = urllib.parse.splitattr
992+
self.assertEqual(splitattr('/path;attr1=value1;attr2=value2'),
993+
('/path', ['attr1=value1', 'attr2=value2']))
994+
self.assertEqual(splitattr('/path;'), ('/path', ['']))
995+
self.assertEqual(splitattr(';attr1=value1;attr2=value2'),
996+
('', ['attr1=value1', 'attr2=value2']))
997+
self.assertEqual(splitattr('/path'), ('/path', []))
998+
999+
def test_splitvalue(self):
1000+
# Normal cases are exercised by other tests; test pathological cases
1001+
# with no key/value pairs. (testcase ensuring coverage)
1002+
splitvalue = urllib.parse.splitvalue
1003+
self.assertEqual(splitvalue('foo=bar'), ('foo', 'bar'))
1004+
self.assertEqual(splitvalue('foo='), ('foo', ''))
1005+
self.assertEqual(splitvalue('=bar'), ('', 'bar'))
1006+
self.assertEqual(splitvalue('foobar'), ('foobar', None))
1007+
self.assertEqual(splitvalue('foo=bar=baz'), ('foo', 'bar=baz'))
1008+
1009+
def test_to_bytes(self):
1010+
result = urllib.parse.to_bytes('http://www.python.org')
1011+
self.assertEqual(result, 'http://www.python.org')
1012+
self.assertRaises(UnicodeError, urllib.parse.to_bytes,
1013+
'http://www.python.org/medi\u00e6val')
1014+
1015+
def test_unwrap(self):
1016+
url = urllib.parse.unwrap('<URL:type://host/path>')
1017+
self.assertEqual(url, 'type://host/path')
1018+
8661019

8671020
def test_main():
868-
support.run_unittest(UrlParseTestCase)
1021+
support.run_unittest(UrlParseTestCase, Utility_Tests)
8691022

8701023
if __name__ == "__main__":
8711024
test_main()

Lib/urllib/parse.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -860,14 +860,12 @@ def splithost(url):
860860
"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
861861
global _hostprog
862862
if _hostprog is None:
863-
import re
864-
_hostprog = re.compile('^//([^/?]*)(.*)$')
863+
_hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)
865864

866865
match = _hostprog.match(url)
867866
if match:
868-
host_port = match.group(1)
869-
path = match.group(2)
870-
if path and not path.startswith('/'):
867+
host_port, path = match.groups()
868+
if path and path[0] != '/':
871869
path = '/' + path
872870
return host_port, path
873871
return None, url

Misc/ACKS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -892,6 +892,7 @@ Chad Netzer
892892
Max Neunhöffer
893893
George Neville-Neil
894894
Hieu Nguyen
895+
Nam Nguyen
895896
Johannes Nicolai
896897
Samuel Nicolary
897898
Jonathan Niehof
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix urllib.parse.splithost() to correctly parse fragments. For example,
2+
``splithost('//127.0.0.1#@evil.com/')`` now correctly returns the
3+
``127.0.0.1`` host, instead of treating ``@evil.com`` as the host in an
4+
authentication (``login@host``).

0 commit comments

Comments
 (0)
0