gh-91810: ElementTree: Use text file's encoding by default in XML declaration · serhiy-storchaka/cpython@ca0f8e9 · GitHub
[go: up one dir, main page]

Skip to content

Commit ca0f8e9

Browse files
gh-91810: ElementTree: Use text file's encoding by default in XML declaration
The ElementTree method write() and the function tostring() now use the text file's encoding ("UTF-8" if not available) instead of the locale encoding in the XML declaration when encoding="unicode" is specified.
1 parent 4403320 commit ca0f8e9

File tree

3 files changed

+101
-40
lines changed

3 files changed

+101
-40
lines changed

Lib/test/test_xml_etree.py

Lines changed: 87 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import html
1111
import io
1212
import itertools
13-
import locale
1413
import operator
1514
import os
1615
import pickle
@@ -975,15 +974,13 @@ def test_tostring_xml_declaration(self):
975974

976975
def test_tostring_xml_declaration_unicode_encoding(self):
977976
elem = ET.XML('<body><tag/></body>')
978-
preferredencoding = locale.getpreferredencoding()
979977
self.assertEqual(
980-
f"<?xml version='1.0' encoding='{preferredencoding}'?>\n<body><tag /></body>",
981-
ET.tostring(elem, encoding='unicode', xml_declaration=True)
978+
ET.tostring(elem, encoding='unicode', xml_declaration=True),
979+
"<?xml version='1.0' encoding='utf-8'?>\n<body><tag /></body>"
982980
)
983981

984982
def test_tostring_xml_declaration_cases(self):
985983
elem = ET.XML('<body><tag>ø</tag></body>')
986-
preferredencoding = locale.getpreferredencoding()
987984
TESTCASES = [
988985
# (expected_retval, encoding, xml_declaration)
989986
# ... xml_declaration = None
@@ -1010,7 +1007,7 @@ def test_tostring_xml_declaration_cases(self):
10101007
b"<body><tag>&#248;</tag></body>", 'US-ASCII', True),
10111008
(b"<?xml version='1.0' encoding='ISO-8859-1'?>\n"
10121009
b"<body><tag>\xf8</tag></body>", 'ISO-8859-1', True),
1013-
(f"<?xml version='1.0' encoding='{preferredencoding}'?>\n"
1010+
("<?xml version='1.0' encoding='utf-8'?>\n"
10141011
"<body><tag>ø</tag></body>", 'unicode', True),
10151012

10161013
]
@@ -1048,11 +1045,10 @@ def test_tostringlist_xml_declaration(self):
10481045
b"<?xml version='1.0' encoding='us-ascii'?>\n<body><tag /></body>"
10491046
)
10501047

1051-
preferredencoding = locale.getpreferredencoding()
10521048
stringlist = ET.tostringlist(elem, encoding='unicode', xml_declaration=True)
10531049
self.assertEqual(
10541050
''.join(stringlist),
1055-
f"<?xml version='1.0' encoding='{preferredencoding}'?>\n<body><tag /></body>"
1051+
"<?xml version='1.0' encoding='utf-8'?>\n<body><tag /></body>"
10561052
)
10571053
self.assertRegex(stringlist[0], r"^<\?xml version='1.0' encoding='.+'?>")
10581054
self.assertEqual(['<body', '>', '<tag', ' />', '</body>'], stringlist[1:])
@@ -3712,49 +3708,114 @@ def test_encoding(self):
37123708
"<tag key=\"åöö&lt;&gt;\" />" % enc).encode(enc))
37133709

37143710
def test_write_to_filename(self):
3715-
self.addCleanup(os_helper.unlink, TESTFN)
3716-
tree = ET.ElementTree(ET.XML('''<site />'''))
3711+
tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
37173712
tree.write(TESTFN)
37183713
with open(TESTFN, 'rb') as f:
3719-
self.assertEqual(f.read(), b'''<site />''')
3714+
self.assertEqual(f.read(), b'''<site>&#248;</site>''')
3715+
3716+
def test_write_to_filename_with_encoding(self):
3717+
self.addCleanup(os_helper.unlink, TESTFN)
3718+
tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
3719+
tree.write(TESTFN, encoding='utf-8')
3720+
with open(TESTFN, 'rb') as f:
3721+
self.assertEqual(f.read(), b'''<site>\xc3\xb8</site>''')
3722+
3723+
tree.write(TESTFN, encoding='ISO-8859-1')
3724+
with open(TESTFN, 'rb') as f:
3725+
self.assertEqual(f.read(),
3726+
b'''<?xml version='1.0' encoding='ISO-8859-1'?>\n'''
3727+
b'''<site>\xf8</site>''')
3728+
3729+
def test_write_to_filename_as_unicode(self):
3730+
self.addCleanup(os_helper.unlink, TESTFN)
3731+
with open(TESTFN, 'w') as f:
3732+
encoding = f.encoding
3733+
os_helper.unlink(TESTFN)
3734+
3735+
tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
3736+
tree.write(TESTFN, encoding='unicode')
3737+
with open(TESTFN, 'rb') as f:
3738+
data = f.read()
3739+
expected = "<site>\xf8</site>".encode(encoding, 'xmlcharrefreplace')
3740+
self.assertIn(
3741+
"<site>\xf8</site>".encode(encoding, 'xmlcharrefreplace'),
3742+
data)
3743+
if encoding.lower() in ('utf-8', 'ascii'):
3744+
self.assertEqual(data, expected)
3745+
else:
3746+
self.assertIn(b"<?xml version='1.0' encoding=", data)
3747+
self.assertIn(expected, data)
37203748

37213749
def test_write_to_text_file(self):
37223750
self.addCleanup(os_helper.unlink, TESTFN)
3723-
tree = ET.ElementTree(ET.XML('''<site />'''))
3751+
tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
37243752
with open(TESTFN, 'w', encoding='utf-8') as f:
37253753
tree.write(f, encoding='unicode')
37263754
self.assertFalse(f.closed)
37273755
with open(TESTFN, 'rb') as f:
3728-
self.assertEqual(f.read(), b'''<site />''')
3756+
self.assertEqual(f.read(), b'''<site>\xc3\xb8</site>''')
3757+
3758+
with open(TESTFN, 'w', encoding='ascii', errors='xmlcharrefreplace') as f:
3759+
tree.write(f, encoding='unicode')
3760+
self.assertFalse(f.closed)
3761+
with open(TESTFN, 'rb') as f:
3762+
self.assertEqual(f.read(),
3763+
b'''<?xml version='1.0' encoding='ascii'?>\n'''
3764+
b'''<site>&#248;</site>''')
3765+
3766+
with open(TESTFN, 'w', encoding='ISO-8859-1') as f:
3767+
tree.write(f, encoding='unicode')
3768+
self.assertFalse(f.closed)
3769+
with open(TESTFN, 'rb') as f:
3770+
self.assertEqual(f.read(),
3771+
b'''<?xml version='1.0' encoding='ISO-8859-1'?>\n'''
3772+
b'''<site>\xf8</site>''')
37293773

37303774
def test_write_to_binary_file(self):
37313775
self.addCleanup(os_helper.unlink, TESTFN)
3732-
tree = ET.ElementTree(ET.XML('''<site />'''))
3776+
tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
37333777
with open(TESTFN, 'wb') as f:
37343778
tree.write(f)
37353779
self.assertFalse(f.closed)
37363780
with open(TESTFN, 'rb') as f:
3737-
self.assertEqual(f.read(), b'''<site />''')
3781+
self.assertEqual(f.read(), b'''<site>&#248;</site>''')
3782+
3783+
def test_write_to_binary_file_with_encoding(self):
3784+
self.addCleanup(os_helper.unlink, TESTFN)
3785+
tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
3786+
with open(TESTFN, 'wb') as f:
3787+
tree.write(f, encoding='utf-8')
3788+
self.assertFalse(f.closed)
3789+
with open(TESTFN, 'rb') as f:
3790+
self.assertEqual(f.read(), b'''<site>\xc3\xb8</site>''')
3791+
3792+
with open(TESTFN, 'wb') as f:
3793+
tree.write(f, encoding='ISO-8859-1')
3794+
self.assertFalse(f.closed)
3795+
with open(TESTFN, 'rb') as f:
3796+
self.assertEqual(f.read(),
3797+
b'''<?xml version='1.0' encoding='ISO-8859-1'?>\n'''
3798+
b'''<site>\xf8</site>''')
37383799

37393800
def test_write_to_binary_file_with_bom(self):
37403801
self.addCleanup(os_helper.unlink, TESTFN)
3741-
tree = ET.ElementTree(ET.XML('''<site />'''))
3802+
tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
37423803
# test BOM writing to buffered file
37433804
with open(TESTFN, 'wb') as f:
37443805
tree.write(f, encoding='utf-16')
37453806
self.assertFalse(f.closed)
37463807
with open(TESTFN, 'rb') as f:
37473808
self.assertEqual(f.read(),
37483809
'''<?xml version='1.0' encoding='utf-16'?>\n'''
3749-
'''<site />'''.encode("utf-16"))
3810+
'''<site>\xf8</site>'''.encode("utf-16"))
37503811
# test BOM writing to non-buffered file
37513812
with open(TESTFN, 'wb', buffering=0) as f:
37523813
tree.write(f, encoding='utf-16')
37533814
self.assertFalse(f.closed)
37543815
with open(TESTFN, 'rb') as f:
37553816
self.assertEqual(f.read(),
37563817
'''<?xml version='1.0' encoding='utf-16'?>\n'''
3757-
'''<site />'''.encode("utf-16"))
3818+
'''<site>\xf8</site>'''.encode("utf-16"))
37583819

37593820
def test_read_from_stringio(self):
37603821
tree = ET.ElementTree()
@@ -3763,10 +3824,10 @@ def test_read_from_stringio(self):
37633824
self.assertEqual(tree.getroot().tag, 'site')
37643825

37653826
def test_write_to_stringio(self):
3766-
tree = ET.ElementTree(ET.XML('''<site />'''))
3827+
tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
37673828
stream = io.StringIO()
37683829
tree.write(stream, encoding='unicode')
3769-
self.assertEqual(stream.getvalue(), '''<site />''')
3830+
self.assertEqual(stream.getvalue(), '''<site>\xf8</site>''')
37703831

37713832
def test_read_from_bytesio(self):
37723833
tree = ET.ElementTree()
@@ -3775,10 +3836,10 @@ def test_read_from_bytesio(self):
37753836
self.assertEqual(tree.getroot().tag, 'site')
37763837

37773838
def test_write_to_bytesio(self):
3778-
tree = ET.ElementTree(ET.XML('''<site />'''))
3839+
tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
37793840
raw = io.BytesIO()
37803841
tree.write(raw)
3781-
self.assertEqual(raw.getvalue(), b'''<site />''')
3842+
self.assertEqual(raw.getvalue(), b'''<site>&#248;</site>''')
37823843

37833844
class dummy:
37843845
pass
@@ -3792,12 +3853,12 @@ def test_read_from_user_text_reader(self):
37923853
self.assertEqual(tree.getroot().tag, 'site')
37933854

37943855
def test_write_to_user_text_writer(self):
3795-
tree = ET.ElementTree(ET.XML('''<site />'''))
3856+
tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
37963857
stream = io.StringIO()
37973858
writer = self.dummy()
37983859
writer.write = stream.write
37993860
tree.write(writer, encoding='unicode')
3800-
self.assertEqual(stream.getvalue(), '''<site />''')
3861+
self.assertEqual(stream.getvalue(), '''<site>\xf8</site>''')
38013862

38023863
def test_read_from_user_binary_reader(self):
38033864
raw = io.BytesIO(b'''<?xml version="1.0"?><site></site>''')
@@ -3809,12 +3870,12 @@ def test_read_from_user_binary_reader(self):
38093870
tree = ET.ElementTree()
38103871

38113872
def test_write_to_user_binary_writer(self):
3812-
tree = ET.ElementTree(ET.XML('''<site />'''))
3873+
tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
38133874
raw = io.BytesIO()
38143875
writer = self.dummy()
38153876
writer.write = raw.write
38163877
tree.write(writer)
3817-
self.assertEqual(raw.getvalue(), b'''<site />''')
3878+
self.assertEqual(raw.getvalue(), b'''<site>&#248;</site>''')
38183879

38193880
def test_write_to_user_binary_writer_with_bom(self):
38203881
tree = ET.ElementTree(ET.XML('''<site />'''))

Lib/xml/etree/ElementTree.py

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -728,16 +728,10 @@ def write(self, file_or_filename,
728728
encoding = "utf-8"
729729
else:
730730
encoding = "us-ascii"
731-
enc_lower = encoding.lower()
732-
with _get_writer(file_or_filename, enc_lower) as write:
731+
with _get_writer(file_or_filename, encoding) as (write, declared_encoding):
733732
if method == "xml" and (xml_declaration or
734733
(xml_declaration is None and
735-
enc_lower not in ("utf-8", "us-ascii", "unicode"))):
736-
declared_encoding = encoding
737-
if enc_lower == "unicode":
738-
# Retrieve the default encoding for the xml declaration
739-
import locale
740-
declared_encoding = locale.getpreferredencoding()
734+
declared_encoding.lower() not in ("utf-8", "us-ascii"))):
741735
write("<?xml version='1.0' encoding='%s'?>\n" % (
742736
declared_encoding,))
743737
if method == "text":
@@ -762,19 +756,20 @@ def _get_writer(file_or_filename, encoding):
762756
write = file_or_filename.write
763757
except AttributeError:
764758
# file_or_filename is a file name
765-
if encoding == "unicode":
766-
file = open(file_or_filename, "w")
759+
if encoding.lower() == "unicode":
760+
file = open(file_or_filename, "w",
761+
errors="xmlcharrefreplace")
767762
else:
768763
file = open(file_or_filename, "w", encoding=encoding,
769764
errors="xmlcharrefreplace")
770765
with file:
771-
yield file.write
766+
yield file.write, file.encoding
772767
else:
773768
# file_or_filename is a file-like object
774769
# encoding determines if it is a text or binary writer
775-
if encoding == "unicode":
770+
if encoding.lower() == "unicode":
776771
# use a text writer as is
777-
yield write
772+
yield write, getattr(file_or_filename, "encoding", None) or "utf-8"
778773
else:
779774
# wrap a binary writer with TextIOWrapper
780775
with contextlib.ExitStack() as stack:
@@ -805,7 +800,7 @@ def _get_writer(file_or_filename, encoding):
805800
# Keep the original file open when the TextIOWrapper is
806801
# destroyed
807802
stack.callback(file.detach)
808-
yield file.write
803+
yield file.write, encoding
809804

810805
def _namespaces(elem, default_namespace=None):
811806
# identify namespaces used in this tree
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
:class:`~xml.etree.ElementTree.ElementTree` method
2+
:meth:`~xml.etree.ElementTree.ElementTree.write` and function
3+
:func:`~xml.etree.ElementTree.tostring` now use the text file's encoding
4+
("UTF-8" if not available) instead of locale encoding in XML declaration
5+
when ``encoding="unicode"`` is specified.

0 commit comments

Comments
 (0)
0