handle zip files which contain non-UTF-8 encoded files

pyexcel · chfw · Sep 19, 2020 · Sep 16, 2020 · Sep 16, 2020 · 61c1195cd1672057c65d0e5135d7d3dbc48edf19
commit 61c1195cd1672057c65d0e5135d7d3dbc48edf19
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -4,6 +4,10 @@ Change log
 0.6.0 - tbd
 --------------------------------------------------------------------------------
 
+#. `#74 <https://github.com/pyexcel/pyexcel-io/issues/74>`_: handle zip files which
+   contain non-UTF-8 encoded files.
+
+
 **removed**
 
 #. python 3.6 lower versions are no longer supported

diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst
@@ -5,6 +5,7 @@
 In alphabetical order:
 
 * `Antherkiv <https://api.github.com/users/antherkiv>`_
+* `Craig Anderson <https://api.github.com/users/craiga>`_
 * `John Vandenberg <https://api.github.com/users/jayvdb>`_
 * `Stephen J. Fuhry <https://api.github.com/users/fuhrysteve>`_
 * `Stephen Rauch <https://api.github.com/users/stephenrauch>`_
diff --git a/pyexcel_io/readers/csvz.py b/pyexcel_io/readers/csvz.py
@@ -9,6 +9,8 @@
 """
 import zipfile
 
+import chardet
+
 from pyexcel_io.sheet import NamedContent
 from pyexcel_io._compact import StringIO
 from pyexcel_io.readers.csvr import CSVinMemoryReader
@@ -43,7 +45,8 @@ def close(self):
     def read_sheet(self, index):
         name = self.content_array[index].name
         content = self.zipfile.read(self.content_array[index].payload)
-        sheet = StringIO(content.decode("utf-8"))
+        encoding_guess = chardet.detect(content)
+        sheet = StringIO(content.decode(encoding_guess["encoding"]))
 
         return CSVinMemoryReader(NamedContent(name, sheet), **self.keywords)
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,3 @@
 ordereddict;python_version<"2.7"
 lml>=0.0.4
+chardet
diff --git a/setup.py b/setup.py
@@ -74,6 +74,7 @@
 
 INSTALL_REQUIRES = [
     "lml>=0.0.4",
+    "chardet",
 ]
 SETUP_COMMANDS = {}
 

diff --git a/test.sh b/test.sh
diff --git a/tests/test_new_csvz_book.py b/tests/test_new_csvz_book.py
@@ -53,6 +53,16 @@ def test_reading(self):
         self.assertEqual(list(data["pyexcel_sheet1"]), [[u"中", u"文", 1, 2, 3]])
         zipreader.close()
 
+    def test_reading_utf32(self):
+        zip = zipfile.ZipFile(self.file, "w")
+        zip.writestr("something.ext", self.result.encode("utf-32"))
+        zip.close()
+        zipreader = self.reader_class()
+        zipreader.open(self.file)
+        data = zipreader.read_all()
+        self.assertEqual(list(data["something"]), [[u"中", u"文", 1, 2, 3]])
+        zipreader.close()
+
     def tearDown(self):
         os.unlink(self.file)