8000 gh-121285: Remove backtracking when parsing tarfile headers by sethmlarson · Pull Request #121286 · python/cpython · GitHub
[go: up one dir, main page]

Skip to content

gh-121285: Remove backtracking when parsing tarfile headers #121286

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Aug 31, 2024
Merged
Prev Previous commit
Next Next commit
Rewrite PAX header parsing to be stricter
  • Loading branch information
sethmlarson committed Jul 9, 2024
commit 39a419cbc6e345da202546af3af4d1be08aeb0a7
78 changes: 43 additions & 35 deletions Lib/tarfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -845,6 +845,9 @@ def data_filter(member, dest_path):
# Sentinel for replace() defaults, meaning "don't change the attribute"
_KEEP = object()

# Length prefix of a pax header record: 1 to 20 ASCII digits (captured as
# group 1) followed by a single space.  The pattern carries no "^" anchor,
# so it is meant to be applied with .match(buf, pos), which anchors the
# match at the given offset without slicing the buffer.
_header_length_prefix_re = re.compile(br"([0-9]{1,20}) ")

class TarInfo(object):
"""Informational class which holds the details about an
archive member given by a tar header block.
Expand Down Expand Up @@ -1432,61 +1435,66 @@ def _proc_pax(self, tarfile):
else:
pax_headers = tarfile.pax_headers.copy()

# Check if the pax header contains a hdrcharset field. This tells us
# the encoding of the path, linkpath, uname and gname fields. Normally,
# these fields are UTF-8 encoded but since POSIX.1-2008 tar
# implementations are allowed to store them as raw binary strings if
# the translation to UTF-8 fails.
if (
# Statement is both a contains check (!=-1) and a bounds check (>0)
(hdrcharset_offset := buf.find(b" hdrcharset=") - 1) > -1
# Check that the character before is a digit (0x30-0x39 is 0-9)
and 0x30 <= buf[hdrcharset_offset] <= 0x39
):
match = re.match(br"^\d{1,20} hdrcharset=([^\n]+)\n", buf[hdrcharset_offset:])
if match is not None:
pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

# For the time being, we don't care about anything other than "BINARY".
# The only other value that is currently allowed by the standard is
# "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
hdrcharset = pax_headers.get("hdrcharset")
if hdrcharset == "BINARY":
encoding = tarfile.encoding
else:
encoding = "utf-8"

# Parse pax header information. A record looks like this:
# "%d %s=%s\n" % (length, keyword, value). length is the size
# of the complete record including the length field itself and
# the newline. keyword and value are both UTF-8 encoded strings.
regex = re.compile(br"^(\d{1,20}) ([^=]+)=")
# the newline.
pos = 0
while match := regex.match(buf[pos:]):
length, keyword = match.groups()
length = int(length)
if length == 0:
encoding = "utf-8"
raw_headers = []
while len(buf) > pos and buf[pos] != 0x00:
if not (match := _header_length_prefix_re.match(buf, pos)):
raise InvalidHeaderError("invalid header")
try:
length = int(match.group(1))
except ValueError:
raise InvalidHeaderError("invalid header")
# Headers must be at least 3 bytes, one for each of keyword, '=', and '\n'.
# Value is allowed to be empty.
if length < 3:
raise InvalidHeaderError("invalid header")
value = buf[match.end(2) + pos + 1:match.start(1) + pos + length - 1]
if pos + length > len(buf):
raise InvalidHeaderError("invalid header")

keyword_and_value = buf[match.end(1) + 1:match.start(1) + length - 1]
raw_keyword, equals, raw_value = keyword_and_value.partition(b"=")

# Check the framing of the header. The last character must be '\n' (0x0A)
if not raw_keyword or equals != b"=" or buf[match.start(1) + length - 1] != 0x0A:
raise InvalidHeaderError("invalid header")
raw_headers.append((length, raw_keyword, raw_value))

# Check if the pax header contains a hdrcharset field. This tells us
# the encoding of the path, linkpath, uname and gname fields. Normally,
# these fields are UTF-8 encoded but since POSIX.1-2008 tar
# implementations are allowed to store them as raw binary strings if
# the translation to UTF-8 fails. For the time being, we don't care about
# anything other than "BINARY". The only other value that is currently
# allowed by the standard is "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
if raw_keyword == b"hdrcharset" and raw_value == b"BINARY":
encoding = tarfile.encoding

pos += length

# After parsing the raw headers we can decode them to text.
for length, raw_keyword, raw_value in raw_headers:
# Normally, we could just use "utf-8" as the encoding and "strict"
# as the error handler, but we better not take the risk. For
# example, GNU tar <= 1.23 is known to store filenames it cannot
# translate to UTF-8 as raw strings (unfortunately without a
# hdrcharset=BINARY header).
# We first try the strict standard encoding, and if that fails we
# fall back on the user's encoding and error handler.
keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
keyword = self._decode_pax_field(raw_keyword, "utf-8", "utf-8",
tarfile.errors)
if keyword in PAX_NAME_FIELDS:
value = self._decode_pax_field(value, encoding, tarfile.encoding,
value = self._decode_pax_field(raw_value, encoding, tarfile.encoding,
tarfile.errors)
else:
value = self._decode_pax_field(value, "utf-8", "utf-8",
value = self._decode_pax_field(raw_value, "utf-8", "utf-8",
tarfile.errors)

pax_headers[keyword] = value
pos += length

# Fetch the next header.
try:
Expand Down
42 changes: 42 additions & 0 deletions Lib/test/test_tarfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -1268,6 +1268,48 @@ def test_pax_number_fields(self):
finally:
tar.close()

def test_pax_header_bad_formats(self):
    """Check that malformed pax header records are rejected on open.

    A valid archive containing the record b"11 foo=bar\\n" is written,
    that record is then overwritten on disk with each malformed variant,
    and reopening the archive must fail with an 'invalid header'
    ReadError.
    """
    # Each entry stands in for the well-formed record b"11 foo=bar\n".
    pax_header_replacements = (
        # Length prefix missing entirely.
        b" foo=bar\n",
        # Declared length too small to hold keyword, '=' and newline.
        b"0 \n",
        b"1 \n",
        b"2 \n",
        # Empty keyword before '='.
        b"3 =\n",
        b"4 =a\n",
        # Declared length far larger than the available data.
        b"1000000 foo=bar\n",
        # Zero and negative lengths.
        b"0 foo=bar\n",
        b"-12 foo=bar\n",
        # Length prefix with more than 20 digits.
        b"000000000000000000000000036 foo=bar\n",
    )
    pax_headers = {"foo": "bar"}

    for replacement in pax_header_replacements:
        with self.subTest(header=replacement):
            # Write a well-formed archive with a single pax record.
            with tarfile.open(tmpname, "w", format=tarfile.PAX_FORMAT,
                              encoding="iso8859-1") as tar:
                t = tarfile.TarInfo()
                t.name = "pax"
                t.uid = 1
                t.pax_headers = pax_headers
                tar.addfile(t)

            # Corrupt the record in place on disk.
            with open(tmpname, "rb") as f:
                data = f.read()
            self.assertIn(b"11 foo=bar\n", data)
            data = data.replace(b"11 foo=bar\n", replacement)

            # Mode "wb" already truncates the file, so just write.
            with open(tmpname, "wb") as f:
                f.write(data)

            # Raw string: "\(" and "\)" are regex escapes, not string
            # escapes, and would otherwise trigger an invalid-escape
            # warning at compile time.
            with self.assertRaisesRegex(
                    tarfile.ReadError,
                    r"method tar: ReadError\('invalid header'\)"):
                tarfile.open(tmpname, encoding="iso8859-1")


class WriteTestBase(TarTest):
# Put all write tests in here that are supposed to be tested
Expand Down
Loading
0