From b49b425080fc18e93111e45b563bc9a5f54c71ba Mon Sep 17 00:00:00 2001 From: Fidget-Spinner <28750310+Fidget-Spinner@users.noreply.github.com> Date: Wed, 20 Jan 2021 23:19:28 +0800 Subject: [PATCH 1/5] conform to HTML5 recommendations --- Lib/test/test_urlparse.py | 131 ++++++++++++++++++++++++++++++-------- Lib/urllib/parse.py | 28 ++++++-- 2 files changed, 127 insertions(+), 32 deletions(-) diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 762500789f73ac..4cbc6b108de65f 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -11,7 +11,33 @@ # Each parse_qsl testcase is a two-tuple that contains # a string with the query and a list with the expected result. -parse_qsl_test_cases = [ +parse_qsl_test_cases_allow_semicolon = [ + (";", []), + (";;", []), + (";a=b", [('a', 'b')]), + ("a=a+b;b=b+c", [('a', 'a b'), ('b', 'b c')]), + ("a=1;a=2", [('a', '1'), ('a', '2')]), + (b";", []), + (b";;", []), + (b";a=b", [(b'a', b'b')]), + (b"a=a+b;b=b+c", [(b'a', b'a b'), (b'b', b'b c')]), + (b"a=1;a=2", [(b'a', b'1'), (b'a', b'2')]), +] + +parse_qsl_test_cases_deny_semicolon = [ + (";", [(';', '')]), + (";;", [(';;', '')]), + (";a=b", [(';a', 'b')]), + ("a=a+b;b=b+c", [('a', 'a b;b=b c')]), + ("a=1;a=2", [('a', '1;a=2')]), + (b";", [(b';', b'')]), + (b";;", [(b';;', b'')]), + (b";a=b", [(b';a', b'b')]), + (b"a=a+b;b=b+c", [(b'a', b'a b;b=b c')]), + (b"a=1;a=2", [(b'a', b'1;a=2')]), +] + +parse_qsl_test_cases_ampersand = [ ("", []), ("&", []), ("&&", []), @@ -32,22 +58,43 @@ (b"&a=b", [(b'a', b'b')]), (b"a=a+b&b=b+c", [(b'a', b'a b'), (b'b', b'b c')]), (b"a=1&a=2", [(b'a', b'1'), (b'a', b'2')]), - (";", []), - (";;", []), - (";a=b", [('a', 'b')]), - ("a=a+b;b=b+c", [('a', 'a b'), ('b', 'b c')]), - ("a=1;a=2", [('a', '1'), ('a', '2')]), - (b";", []), - (b";;", []), - (b";a=b", [(b'a', b'b')]), - (b"a=a+b;b=b+c", [(b'a', b'a b'), (b'b', b'b c')]), - (b"a=1;a=2", [(b'a', b'1'), (b'a', b'2')]), ] +parse_qsl_test_cases = (parse_qsl_test_cases_ampersand + + parse_qsl_test_cases_allow_semicolon) + +parse_qsl_test_cases_semicolon = (parse_qsl_test_cases_ampersand + + parse_qsl_test_cases_deny_semicolon) + # Each parse_qs testcase is a two-tuple that contains # a string with the query and a dictionary with the expected result. +parse_qs_test_cases_allow_semicolon = [ + (";", {}), + (";;", {}), + (";a=b", {'a': ['b']}), + ("a=a+b;b=b+c", {'a': ['a b'], 'b': ['b c']}), + ("a=1;a=2", {'a': ['1', '2']}), + (b";", {}), + (b";;", {}), + (b";a=b", {b'a': [b'b']}), + (b"a=a+b;b=b+c", {b'a': [b'a b'], b'b': [b'b c']}), + (b"a=1;a=2", {b'a': [b'1', b'2']}), +] -parse_qs_test_cases = [ +parse_qs_test_cases_deny_semicolon = [ + (";", {';': ['']}), + (";;", {';;': ['']}), + (";a=b", {';a': ['b']}), + ("a=a+b;b=b+c", {'a': ['a b;b=b c']}), + ("a=1;a=2", {'a': ['1;a=2']}), + (b";", {b';': [b'']}), + (b";;", {b';;': [b'']}), + (b";a=b", {b';a': [b'b']}), + (b"a=a+b;b=b+c", {b'a': [b'a b;b=b c']}), + (b"a=1;a=2", {b'a': [b'1;a=2']}), +] + +parse_qs_test_cases_ampersand = [ ("", {}), ("&", {}), ("&&", {}), @@ -68,18 +115,15 @@ (b"&a=b", {b'a': [b'b']}), (b"a=a+b&b=b+c", {b'a': [b'a b'], b'b': [b'b c']}), (b"a=1&a=2", {b'a': [b'1', b'2']}), - (";", {}), - (";;", {}), - (";a=b", {'a': ['b']}), - ("a=a+b;b=b+c", {'a': ['a b'], 'b': ['b c']}), - ("a=1;a=2", {'a': ['1', '2']}), - (b";", {}), - (b";;", {}), - (b";a=b", {b'a': [b'b']}), - (b"a=a+b;b=b+c", {b'a': [b'a b'], b'b': [b'b c']}), - (b"a=1;a=2", {b'a': [b'1', b'2']}), ] +parse_qs_test_cases = (parse_qs_test_cases_ampersand + + parse_qs_test_cases_allow_semicolon) + +parse_qs_test_cases_semicolon = (parse_qs_test_cases_ampersand + + parse_qs_test_cases_deny_semicolon) + + class UrlParseTestCase(unittest.TestCase): def checkRoundtrips(self, url, parsed, split): @@ -135,23 +179,55 @@ def checkRoundtrips(self, url, parsed, split): def test_qsl(self): for orig, expect in parse_qsl_test_cases: - result = urllib.parse.parse_qsl(orig, keep_blank_values=True) + result = urllib.parse.parse_qsl(orig, keep_blank_values=True, + semicolon_sep=True) self.assertEqual(result, expect, "Error parsing %r" % orig) expect_without_blanks = [v for v in expect if len(v[1])] - result = urllib.parse.parse_qsl(orig, keep_blank_values=False) + result = urllib.parse.parse_qsl(orig, keep_blank_values=False, + semicolon_sep=True) self.assertEqual(result, expect_without_blanks, "Error parsing %r" % orig) def test_qs(self): for orig, expect in parse_qs_test_cases: - result = urllib.parse.parse_qs(orig, keep_blank_values=True) + result = urllib.parse.parse_qs(orig, keep_blank_values=True, + semicolon_sep=True) self.assertEqual(result, expect, "Error parsing %r" % orig) expect_without_blanks = {v: expect[v] for v in expect if len(expect[v][0])} - result = urllib.parse.parse_qs(orig, keep_blank_values=False) + result = urllib.parse.parse_qs(orig, keep_blank_values=False, + semicolon_sep=True) self.assertEqual(result, expect_without_blanks, "Error parsing %r" % orig) + def test_qsl_no_semicolon(self): + # See bpo-42967 for more information. + for orig, expect in parse_qsl_test_cases_semicolon: + with self.subTest(f"Original: {orig}, Expected: {expect}"): + result = urllib.parse.parse_qsl(orig, keep_blank_values=True, + semicolon_sep=False) + self.assertEqual(result, expect, + "Error parsing %r" % orig) + expect_without_blanks = [v for v in expect if len(v[1])] + result = urllib.parse.parse_qsl(orig, keep_blank_values=False, + semicolon_sep=False) + self.assertEqual(result, expect_without_blanks, + "Error parsing %r" % orig) + + def test_qs_no_semicolon(self): + + for orig, expect in parse_qs_test_cases_semicolon: + with self.subTest(f"Original: {orig}, Expected: {expect}"): + result = urllib.parse.parse_qs(orig, keep_blank_values=True, + semicolon_sep=False) + self.assertEqual(result, expect, "Error parsing %r" % orig) + expect_without_blanks = {v: expect[v] + for v in expect if len(expect[v][0])} + result = urllib.parse.parse_qs(orig, keep_blank_values=False, + semicolon_sep=False) + self.assertEqual(result, expect_without_blanks, + "Error parsing %r" % orig) + def test_roundtrips(self): str_cases = [ ('file:///tmp/junk.txt', @@ -887,7 +963,8 @@ def test_parse_qsl_max_num_fields(self): with self.assertRaises(ValueError): urllib.parse.parse_qs('&'.join(['a=a']*11), max_num_fields=10) with self.assertRaises(ValueError): - urllib.parse.parse_qs(';'.join(['a=a']*11), max_num_fields=10) + urllib.parse.parse_qs(';'.join(['a=a']*11), max_num_fields=10, + semicolon_sep=True) urllib.parse.parse_qs('&'.join(['a=a']*10), max_num_fields=10) def test_urlencode_sequences(self): diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index ea897c3032257b..4a1f7b25c0a19b 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -662,7 +662,8 @@ def unquote(string, encoding='utf-8', errors='replace'): def parse_qs(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace', max_num_fields=None): + encoding='utf-8', errors='replace', max_num_fields=None, + semicolon_sep=False): """Parse a query given as a string argument. Arguments: @@ -686,12 +687,17 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False, max_num_fields: int. If set, then throws a ValueError if there are more than n fields read by parse_qsl(). + semicolon_sep: flag indicating whether ``;`` should be treated as a + valid separator. Defaults to False due to recommendation by the W3C + (see https://www.w3.org/TR/2014/REC-html5-20141028/forms.html#url-encoded-form-data) + Returns a dictionary. """ parsed_result = {} pairs = parse_qsl(qs, keep_blank_values, strict_parsing, encoding=encoding, errors=errors, - max_num_fields=max_num_fields) + max_num_fields=max_num_fields, + semicolon_sep=semicolon_sep) for name, value in pairs: if name in parsed_result: parsed_result[name].append(value) @@ -701,7 +707,8 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False, def parse_qsl(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace', max_num_fields=None): + encoding='utf-8', errors='replace', max_num_fields=None, + semicolon_sep=False): """Parse a query given as a string argument. Arguments: @@ -724,6 +731,10 @@ def parse_qsl(qs, keep_blank_values=False, strict_parsing=False, max_num_fields: int. If set, then throws a ValueError if there are more than n fields read by parse_qsl(). + semicolon_sep: flag indicating whether ``;`` should be treated as a + valid separator. Defaults to False due to recommendation by the W3C + (see https://www.w3.org/TR/2014/REC-html5-20141028/forms.html#url-encoded-form-data) + Returns a list, as G-d intended. """ qs, _coerce_result = _coerce_args(qs) @@ -732,11 +743,18 @@ def parse_qsl(qs, keep_blank_values=False, strict_parsing=False, # is less than max_num_fields. This prevents a memory exhaustion DOS # attack via post bodies with many fields. if max_num_fields is not None: - num_fields = 1 + qs.count('&') + qs.count(';') + if semicolon_sep: + num_fields = 1 + qs.count('&') + qs.count(';') + else: + num_fields = 1 + qs.count('&') if max_num_fields < num_fields: raise ValueError('Max number of fields exceeded') - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + if semicolon_sep: + pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + else: + pairs = [s for s in qs.split('&')] + r = [] for name_value in pairs: if not name_value and not strict_parsing: From 74a365c540299f31fe3be3cd1611291b8d647a86 Mon Sep 17 00:00:00 2001 From: Fidget-Spinner <28750310+Fidget-Spinner@users.noreply.github.com> Date: Wed, 20 Jan 2021 23:46:21 +0800 Subject: [PATCH 2/5] fix tests, default to True since too much code relies on that already --- Lib/cgi.py | 8 ++++++-- Lib/test/test_urlparse.py | 6 +++--- Lib/urllib/parse.py | 8 ++++---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/Lib/cgi.py b/Lib/cgi.py index 6018c3608697af..c1de0cc82d0fb4 100755 --- a/Lib/cgi.py +++ b/Lib/cgi.py @@ -115,7 +115,8 @@ def closelog(): # 0 ==> unlimited input maxlen = 0 -def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0): +def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0, + semicolon_sep=True): """Parse a query in the environment or from a file (default stdin) Arguments, all optional: @@ -134,6 +135,9 @@ def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0): strict_parsing: flag indicating what to do with parsing errors. If false (the default), errors are silently ignored. If true, errors raise a ValueError exception. + + semicolon_sep: flag indicating whether ``;`` should be treated as a + valid separator. """ if fp is None: fp = sys.stdin @@ -178,7 +182,7 @@ def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0): qs = "" environ['QUERY_STRING'] = qs # XXX Shouldn't, really return urllib.parse.parse_qs(qs, keep_blank_values, strict_parsing, - encoding=encoding) + encoding=encoding, semicolon_sep=semicolon_sep) def parse_multipart(fp, pdict, encoding="utf-8", errors="replace"): diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 4cbc6b108de65f..86b04f4379ea4d 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -203,7 +203,7 @@ def test_qs(self): def test_qsl_no_semicolon(self): # See bpo-42967 for more information. for orig, expect in parse_qsl_test_cases_semicolon: - with self.subTest(f"Original: {orig}, Expected: {expect}"): + with self.subTest(f"Original: {orig!r}, Expected: {expect!r}"): result = urllib.parse.parse_qsl(orig, keep_blank_values=True, semicolon_sep=False) self.assertEqual(result, expect, @@ -215,9 +215,9 @@ def test_qsl_no_semicolon(self): "Error parsing %r" % orig) def test_qs_no_semicolon(self): - + # See bpo-42967 for more information. for orig, expect in parse_qs_test_cases_semicolon: - with self.subTest(f"Original: {orig}, Expected: {expect}"): + with self.subTest(f"Original: {orig!r}, Expected: {expect!r}"): result = urllib.parse.parse_qs(orig, keep_blank_values=True, semicolon_sep=False) self.assertEqual(result, expect, "Error parsing %r" % orig) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 4a1f7b25c0a19b..8c86cc11629a25 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -663,7 +663,7 @@ def unquote(string, encoding='utf-8', errors='replace'): def parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace', max_num_fields=None, - semicolon_sep=False): + semicolon_sep=True): """Parse a query given as a string argument. Arguments: @@ -688,7 +688,7 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False, are more than n fields read by parse_qsl(). semicolon_sep: flag indicating whether ``;`` should be treated as a - valid separator. Defaults to False due to recommendation by the W3C + valid separator. Recommended to set to False. (see https://www.w3.org/TR/2014/REC-html5-20141028/forms.html#url-encoded-form-data) Returns a dictionary. @@ -708,7 +708,7 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False, def parse_qsl(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace', max_num_fields=None, - semicolon_sep=False): + semicolon_sep=True): """Parse a query given as a string argument. Arguments: @@ -732,7 +732,7 @@ def parse_qsl(qs, keep_blank_values=False, strict_parsing=False, if there are more than n fields read by parse_qsl(). semicolon_sep: flag indicating whether ``;`` should be treated as a - valid separator. Defaults to False due to recommendation by the W3C + valid separator. Recommended to set to False. (see https://www.w3.org/TR/2014/REC-html5-20141028/forms.html#url-encoded-form-data) Returns a list, as G-d intended. From d23323a76e1f4b95a867cc9c41f48a7795511ef2 Mon Sep 17 00:00:00 2001 From: Fidget-Spinner <28750310+Fidget-Spinner@users.noreply.github.com> Date: Thu, 21 Jan 2021 00:29:40 +0800 Subject: [PATCH 3/5] expose in cgi's public API --- Lib/cgi.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/Lib/cgi.py b/Lib/cgi.py index c1de0cc82d0fb4..628eeec67ccdc7 100755 --- a/Lib/cgi.py +++ b/Lib/cgi.py @@ -319,7 +319,7 @@ class FieldStorage: def __init__(self, fp=None, headers=None, outerboundary=b'', environ=os.environ, keep_blank_values=0, strict_parsing=0, limit=None, encoding='utf-8', errors='replace', - max_num_fields=None): + max_num_fields=None, semicolon_sep=True): """Constructor. Read multipart/* until last part. Arguments, all optional: @@ -362,11 +362,16 @@ def __init__(self, fp=None, headers=None, outerboundary=b'', max_num_fields: int. If set, then __init__ throws a ValueError if there are more than n fields read by parse_qsl(). + semicolon_sep: flag indicating whether ``;`` should be treated as a + valid separator. Passed directly to urllib.parse.parse_qsl + internally. + """ method = 'GET' self.keep_blank_values = keep_blank_values self.strict_parsing = strict_parsing self.max_num_fields = max_num_fields + self.semicolon_sep = semicolon_sep if 'REQUEST_METHOD' in environ: method = environ['REQUEST_METHOD'].upper() self.qs_on_post = None @@ -476,7 +481,7 @@ def __init__(self, fp=None, headers=None, outerboundary=b'', if ctype == 'application/x-www-form-urlencoded': self.read_urlencoded() elif ctype[:10] == 'multipart/': - self.read_multi(environ, keep_blank_values, strict_parsing) + self.read_multi(environ, keep_blank_values, strict_parsing, semicolon_sep) else: self.read_single() @@ -593,13 +598,14 @@ def read_urlencoded(self): query = urllib.parse.parse_qsl( qs, self.keep_blank_values, self.strict_parsing, encoding=self.encoding, errors=self.errors, - max_num_fields=self.max_num_fields) + max_num_fields=self.max_num_fields, + semicolon_sep=self.semicolon_sep) self.list = [MiniFieldStorage(key, value) for key, value in query] self.skip_lines() FieldStorageClass = None - def read_multi(self, environ, keep_blank_values, strict_parsing): + def read_multi(self, environ, keep_blank_values, strict_parsing, semicolon_sep): """Internal: read a part that is itself multipart.""" ib = self.innerboundary if not valid_boundary(ib): @@ -609,7 +615,8 @@ def read_multi(self, environ, keep_blank_values, strict_parsing): query = urllib.parse.parse_qsl( self.qs_on_post, self.keep_blank_values, self.strict_parsing, encoding=self.encoding, errors=self.errors, - max_num_fields=self.max_num_fields) + max_num_fields=self.max_num_fields, + semicolon_sep=self.semicolon_sep) self.list.extend(MiniFieldStorage(key, value) for key, value in query) klass = self.FieldStorageClass or self.__class__ @@ -653,7 +660,8 @@ def read_multi(self, environ, keep_blank_values, strict_parsing): else self.limit - self.bytes_read part = klass(self.fp, headers, ib, environ, keep_blank_values, strict_parsing, limit, - self.encoding, self.errors, max_num_fields) + self.encoding, self.errors, max_num_fields, + semicolon_sep) if max_num_fields is not None: max_num_fields -= 1 From 99e86a1af2de11869d5bd204a3d247d299295ade Mon Sep 17 00:00:00 2001 From: Fidget-Spinner <28750310+Fidget-Spinner@users.noreply.github.com> Date: Thu, 21 Jan 2021 00:33:26 +0800 Subject: [PATCH 4/5] expose even more in public API --- Lib/cgi.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Lib/cgi.py b/Lib/cgi.py index 628eeec67ccdc7..88b2c3712937af 100755 --- a/Lib/cgi.py +++ b/Lib/cgi.py @@ -158,7 +158,7 @@ def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0, if environ['REQUEST_METHOD'] == 'POST': ctype, pdict = parse_header(environ['CONTENT_TYPE']) if ctype == 'multipart/form-data': - return parse_multipart(fp, pdict) + return parse_multipart(fp, pdict, semicolon_sep=semicolon_sep) elif ctype == 'application/x-www-form-urlencoded': clength = int(environ['CONTENT_LENGTH']) if maxlen and clength > maxlen: @@ -185,7 +185,8 @@ def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0, encoding=encoding, semicolon_sep=semicolon_sep) -def parse_multipart(fp, pdict, encoding="utf-8", errors="replace"): +def parse_multipart(fp, pdict, encoding="utf-8", errors="replace", + semicolon_sep=True): """Parse multipart input. Arguments: @@ -209,7 +210,7 @@ def parse_multipart(fp, pdict, encoding="utf-8", errors="replace"): except KeyError: pass fs = FieldStorage(fp, headers=headers, encoding=encoding, errors=errors, - environ={'REQUEST_METHOD': 'POST'}) + environ={'REQUEST_METHOD': 'POST'}, semicolon_sep=semicolon_sep) return {k: fs.getlist(k) for k in fs} def _parseparam(s): @@ -605,7 +606,7 @@ def read_urlencoded(self): FieldStorageClass = None - def read_multi(self, environ, keep_blank_values, strict_parsing, semicolon_sep): + def read_multi(self, environ, keep_blank_values, strict_parsing, semicolon_sep=True): """Internal: read a part that is itself multipart.""" ib = self.innerboundary if not valid_boundary(ib): From 8a1180d9f60815471a054e3d3b01309cc6a8ab13 Mon Sep 17 00:00:00 2001 From: Fidget-Spinner <28750310+Fidget-Spinner@users.noreply.github.com> Date: Sun, 24 Jan 2021 00:08:49 +0800 Subject: [PATCH 5/5] allow choosing of separator, add adam to acks --- Lib/cgi.py | 33 ++++++++++++++++----------------- Lib/test/test_urlparse.py | 18 +++++++++--------- Lib/urllib/parse.py | 28 +++++++++++----------------- Misc/ACKS | 1 + 4 files changed, 37 insertions(+), 43 deletions(-) diff --git a/Lib/cgi.py b/Lib/cgi.py index 88b2c3712937af..4e227aa05bd554 100755 --- a/Lib/cgi.py +++ b/Lib/cgi.py @@ -116,7 +116,7 @@ def closelog(): maxlen = 0 def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0, - semicolon_sep=True): + separators=('&', ';')): """Parse a query in the environment or from a file (default stdin) Arguments, all optional: @@ -136,8 +136,7 @@ def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0, If false (the default), errors are silently ignored. If true, errors raise a ValueError exception. - semicolon_sep: flag indicating whether ``;`` should be treated as a - valid separator. + separators: valid separators in the query string, e.g. ('&',) """ if fp is None: fp = sys.stdin @@ -158,7 +157,7 @@ def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0, if environ['REQUEST_METHOD'] == 'POST': ctype, pdict = parse_header(environ['CONTENT_TYPE']) if ctype == 'multipart/form-data': - return parse_multipart(fp, pdict, semicolon_sep=semicolon_sep) + return parse_multipart(fp, pdict, separators=separators) elif ctype == 'application/x-www-form-urlencoded': clength = int(environ['CONTENT_LENGTH']) if maxlen and clength > maxlen: @@ -182,11 +181,11 @@ def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0, qs = "" environ['QUERY_STRING'] = qs # XXX Shouldn't, really return urllib.parse.parse_qs(qs, keep_blank_values, strict_parsing, - encoding=encoding, semicolon_sep=semicolon_sep) + encoding=encoding, separators=separators) def parse_multipart(fp, pdict, encoding="utf-8", errors="replace", - semicolon_sep=True): + separators=('&', ';')): """Parse multipart input. Arguments: @@ -210,7 +209,7 @@ def parse_multipart(fp, pdict, encoding="utf-8", errors="replace", except KeyError: pass fs = FieldStorage(fp, headers=headers, encoding=encoding, errors=errors, - environ={'REQUEST_METHOD': 'POST'}, semicolon_sep=semicolon_sep) + environ={'REQUEST_METHOD': 'POST'}, separators=separators) return {k: fs.getlist(k) for k in fs} def _parseparam(s): @@ -320,7 +319,7 @@ class FieldStorage: def __init__(self, fp=None, headers=None, outerboundary=b'', environ=os.environ, keep_blank_values=0, strict_parsing=0, limit=None, encoding='utf-8', errors='replace', - max_num_fields=None, semicolon_sep=True): + max_num_fields=None, separators=('&', ';')): """Constructor. Read multipart/* until last part. Arguments, all optional: @@ -363,16 +362,15 @@ def __init__(self, fp=None, headers=None, outerboundary=b'', max_num_fields: int. If set, then __init__ throws a ValueError if there are more than n fields read by parse_qsl(). - semicolon_sep: flag indicating whether ``;`` should be treated as a - valid separator. Passed directly to urllib.parse.parse_qsl - internally. + separators: valid separators in the query string, e.g. ('&',). + Passed directly to urllib.parse.parse_qsl internally. """ method = 'GET' self.keep_blank_values = keep_blank_values self.strict_parsing = strict_parsing self.max_num_fields = max_num_fields - self.semicolon_sep = semicolon_sep + self.separators = separators if 'REQUEST_METHOD' in environ: method = environ['REQUEST_METHOD'].upper() self.qs_on_post = None @@ -482,7 +480,7 @@ def __init__(self, fp=None, headers=None, outerboundary=b'', if ctype == 'application/x-www-form-urlencoded': self.read_urlencoded() elif ctype[:10] == 'multipart/': - self.read_multi(environ, keep_blank_values, strict_parsing, semicolon_sep) + self.read_multi(environ, keep_blank_values, strict_parsing, separators) else: self.read_single() @@ -600,13 +598,14 @@ def read_urlencoded(self): qs, self.keep_blank_values, self.strict_parsing, encoding=self.encoding, errors=self.errors, max_num_fields=self.max_num_fields, - semicolon_sep=self.semicolon_sep) + separators=self.separators) self.list = [MiniFieldStorage(key, value) for key, value in query] self.skip_lines() FieldStorageClass = None - def read_multi(self, environ, keep_blank_values, strict_parsing, semicolon_sep=True): + def read_multi(self, environ, keep_blank_values, strict_parsing, + separators=('&', ';')): """Internal: read a part that is itself multipart.""" ib = self.innerboundary if not valid_boundary(ib): @@ -617,7 +616,7 @@ def read_multi(self, environ, keep_blank_values, strict_parsing, semicolon_sep=T self.qs_on_post, self.keep_blank_values, self.strict_parsing, encoding=self.encoding, errors=self.errors, max_num_fields=self.max_num_fields, - semicolon_sep=self.semicolon_sep) + separators=self.separators) self.list.extend(MiniFieldStorage(key, value) for key, value in query) klass = self.FieldStorageClass or self.__class__ @@ -662,7 +661,7 @@ def read_multi(self, environ, keep_blank_values, strict_parsing, semicolon_sep=T part = klass(self.fp, headers, ib, environ, keep_blank_values, strict_parsing, limit, self.encoding, self.errors, max_num_fields, - semicolon_sep) + separators) if max_num_fields is not None: max_num_fields -= 1 diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 86b04f4379ea4d..9c3ab771714a06 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -180,23 +180,23 @@ def checkRoundtrips(self, url, parsed, split): def test_qsl(self): for orig, expect in parse_qsl_test_cases: result = urllib.parse.parse_qsl(orig, keep_blank_values=True, - semicolon_sep=True) + separators=('&', ';')) self.assertEqual(result, expect, "Error parsing %r" % orig) expect_without_blanks = [v for v in expect if len(v[1])] result = urllib.parse.parse_qsl(orig, keep_blank_values=False, - semicolon_sep=True) + separators=('&', ';')) self.assertEqual(result, expect_without_blanks, "Error parsing %r" % orig) def test_qs(self): for orig, expect in parse_qs_test_cases: result = urllib.parse.parse_qs(orig, keep_blank_values=True, - semicolon_sep=True) + separators=('&', ';')) self.assertEqual(result, expect, "Error parsing %r" % orig) expect_without_blanks = {v: expect[v] for v in expect if len(expect[v][0])} result = urllib.parse.parse_qs(orig, keep_blank_values=False, - semicolon_sep=True) + separators=('&', ';')) self.assertEqual(result, expect_without_blanks, "Error parsing %r" % orig) @@ -205,12 +205,12 @@ def test_qsl_no_semicolon(self): for orig, expect in parse_qsl_test_cases_semicolon: with self.subTest(f"Original: {orig!r}, Expected: {expect!r}"): result = urllib.parse.parse_qsl(orig, keep_blank_values=True, - semicolon_sep=False) + separators=('&',)) self.assertEqual(result, expect, "Error parsing %r" % orig) expect_without_blanks = [v for v in expect if len(v[1])] result = urllib.parse.parse_qsl(orig, keep_blank_values=False, - semicolon_sep=False) + separators=('&',)) self.assertEqual(result, expect_without_blanks, "Error parsing %r" % orig) @@ -219,12 +219,12 @@ def test_qs_no_semicolon(self): for orig, expect in parse_qs_test_cases_semicolon: with self.subTest(f"Original: {orig!r}, Expected: {expect!r}"): result = urllib.parse.parse_qs(orig, keep_blank_values=True, - semicolon_sep=False) + separators=('&',)) self.assertEqual(result, expect, "Error parsing %r" % orig) expect_without_blanks = {v: expect[v] for v in expect if len(expect[v][0])} result = urllib.parse.parse_qs(orig, keep_blank_values=False, - semicolon_sep=False) + separators=('&',)) self.assertEqual(result, expect_without_blanks, "Error parsing %r" % orig) @@ -964,7 +964,7 @@ def test_parse_qsl_max_num_fields(self): urllib.parse.parse_qs('&'.join(['a=a']*11), max_num_fields=10) with self.assertRaises(ValueError): urllib.parse.parse_qs(';'.join(['a=a']*11), max_num_fields=10, - semicolon_sep=True) + separators=('&', ';')) urllib.parse.parse_qs('&'.join(['a=a']*10), max_num_fields=10) def test_urlencode_sequences(self): diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 8c86cc11629a25..745312cbef1a33 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -663,7 +663,7 @@ def unquote(string, encoding='utf-8', errors='replace'): def parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace', max_num_fields=None, - semicolon_sep=True): + separators=('&', ';')): """Parse a query given as a string argument. Arguments: @@ -687,9 +687,7 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False, max_num_fields: int. If set, then throws a ValueError if there are more than n fields read by parse_qsl(). - semicolon_sep: flag indicating whether ``;`` should be treated as a - valid separator. Recommended to set to False. - (see https://www.w3.org/TR/2014/REC-html5-20141028/forms.html#url-encoded-form-data) + separators: valid separators in the query string, e.g. ('&',) Returns a dictionary. """ @@ -697,7 +695,7 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False, pairs = parse_qsl(qs, keep_blank_values, strict_parsing, encoding=encoding, errors=errors, max_num_fields=max_num_fields, - semicolon_sep=semicolon_sep) + separators=separators) for name, value in pairs: if name in parsed_result: parsed_result[name].append(value) @@ -708,7 +706,7 @@ def parse_qs(qs, keep_blank_values=False, strict_parsing=False, def parse_qsl(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace', max_num_fields=None, - semicolon_sep=True): + separators=('&', ';')): """Parse a query given as a string argument. Arguments: @@ -731,9 +729,7 @@ def parse_qsl(qs, keep_blank_values=False, strict_parsing=False, max_num_fields: int. If set, then throws a ValueError if there are more than n fields read by parse_qsl(). - semicolon_sep: flag indicating whether ``;`` should be treated as a - valid separator. Recommended to set to False. - (see https://www.w3.org/TR/2014/REC-html5-20141028/forms.html#url-encoded-form-data) + separators: valid separators in the query string, e.g. ('&',) Returns a list, as G-d intended. """ @@ -743,17 +739,15 @@ def parse_qsl(qs, keep_blank_values=False, strict_parsing=False, # is less than max_num_fields. This prevents a memory exhaustion DOS # attack via post bodies with many fields. if max_num_fields is not None: - if semicolon_sep: - num_fields = 1 + qs.count('&') + qs.count(';') - else: - num_fields = 1 + qs.count('&') + num_fields = 1 + sum(qs.count(sep) for sep in separators) if max_num_fields < num_fields: raise ValueError('Max number of fields exceeded') - if semicolon_sep: - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] - else: - pairs = [s for s in qs.split('&')] + if not separators: + raise ValueError('Separators must be a tuple or list of strings') + for sep in separators[1:]: + qs = qs.replace(sep, separators[0]) + pairs = qs.split(separators[0]) r = [] for name_value in pairs: diff --git a/Misc/ACKS b/Misc/ACKS index 136266965a869b..41d65a6f7712a9 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -613,6 +613,7 @@ Karan Goel Jeroen Van Goey Christoph Gohlke Tim Golden +Adam Goldschmidt Yonatan Goldschmidt Mark Gollahon Mikhail Golubev