BUG: Use whole file for encoding checks with `charset_normalizer` [f2py] · numpy/numpy@fe73a84 · GitHub
[go: up one dir, main page]

Skip to content

Commit fe73a84

Browse files
HaoZeke and melissawm authored
BUG: Use whole file for encoding checks with charset_normalizer [f2py] (#22872)
* BUG: Use whole file for encoding checks [f2py]
* DOC: Add a code comment
  Co-authored-by: melissawm <melissawm@gmail.com>
* TST: Add a conditional unicode f2py test
* MAINT: Add chardet as a test requirement
* ENH: Cleanup and switch f2py to charset_normalizer
* MAINT: Remove chardet for charset_normalizer
* TST: Simplify UTF-8 encoding [f2py]
  Co-authored-by: melissawm <melissawm@gmail.com>
1 parent 235dbe1 commit fe73a84

File tree

4 files changed

+45
-25
lines changed

4 files changed

+45
-25
lines changed

numpy/f2py/crackfortran.py

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,9 @@
148148
import platform
149149
import codecs
150150
try:
151-
import chardet
151+
import charset_normalizer
152152
except ImportError:
153-
chardet = None
153+
charset_normalizer = None
154154

155155
from . import __version__
156156

@@ -309,26 +309,31 @@ def getextension(name):
309309
def openhook(filename, mode):
310310
"""Ensures that filename is opened with correct encoding parameter.
311311
312-
This function uses chardet package, when available, for
313-
determining the encoding of the file to be opened. When chardet is
314-
not available, the function detects only UTF encodings, otherwise,
315-
ASCII encoding is used as fallback.
312+
This function uses charset_normalizer package, when available, for
313+
determining the encoding of the file to be opened. When charset_normalizer
314+
is not available, the function detects only UTF encodings, otherwise, ASCII
315+
encoding is used as fallback.
316316
"""
317-
bytes = min(32, os.path.getsize(filename))
318-
with open(filename, 'rb') as f:
319-
raw = f.read(bytes)
320-
if raw.startswith(codecs.BOM_UTF8):
321-
encoding = 'UTF-8-SIG'
322-
elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
323-
encoding = 'UTF-32'
324-
elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)):
325-
encoding = 'UTF-16'
317+
# Reads in the entire file. Robust detection of encoding.
318+
# Correctly handles comments or late stage unicode characters
319+
# gh-22871
320+
if charset_normalizer is not None:
321+
encoding = charset_normalizer.from_path(filename).best().encoding
326322
else:
327-
if chardet is not None:
328-
encoding = chardet.detect(raw)['encoding']
329-
else:
330-
# hint: install chardet to ensure correct encoding handling
331-
encoding = 'ascii'
323+
# hint: install charset_normalizer for correct encoding handling
324+
# No need to read the whole file for trying with startswith
325+
nbytes = min(32, os.path.getsize(filename))
326+
with open(filename, 'rb') as fhandle:
327+
raw = fhandle.read(nbytes)
328+
if raw.startswith(codecs.BOM_UTF8):
329+
encoding = 'UTF-8-SIG'
330+
elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
331+
encoding = 'UTF-32'
332+
elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)):
333+
encoding = 'UTF-16'
334+
else:
335+
# Fallback, without charset_normalizer
336+
encoding = 'ascii'
332337
return open(filename, mode, encoding=encoding)
333338

334339

@@ -394,7 +399,7 @@ def readfortrancode(ffile, dowithline=show, istop=1):
394399
except UnicodeDecodeError as msg:
395400
raise Exception(
396401
f'readfortrancode: reading {fin.filename()}#{fin.lineno()}'
397-
f' failed with\n{msg}.\nIt is likely that installing chardet'
402+
f' failed with\n{msg}.\nIt is likely that installing charset_normalizer'
398403
' package will help f2py determine the input file encoding'
399404
' correctly.')
400405
if not l:
numpy/f2py/tests/src/crackfortran/unicode_comment.f90

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
subroutine foo(x)
2+
real(8), intent(in) :: x
3+
! Écrit à l'écran la valeur de x
4+
end subroutine

numpy/f2py/tests/test_crackfortran.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
import importlib
12
import codecs
3+
import unicodedata
24
import pytest
35
import numpy as np
46
from numpy.f2py.crackfortran import markinnerspaces
@@ -257,13 +259,20 @@ class TestFortranReader(util.F2PyTest):
257259
def test_input_encoding(self, tmp_path, encoding):
258260
# gh-635
259261
f_path = tmp_path / f"input_with_{encoding}_encoding.f90"
260-
# explicit BOM is required for UTF8
261-
bom = {'utf-8': codecs.BOM_UTF8}.get(encoding, b'')
262262
with f_path.open('w', encoding=encoding) as ff:
263-
ff.write(bom.decode(encoding) +
264-
"""
263+
ff.write("""
265264
subroutine foo()
266265
end subroutine foo
267266
""")
268267
mod = crackfortran.crackfortran([str(f_path)])
269268
assert mod[0]['name'] == 'foo'
269+
270+
class TestUnicodeComment(util.F2PyTest):
271+
sources = [util.getpath("tests", "src", "crackfortran", "unicode_comment.f90")]
272+
273+
@pytest.mark.skipif(
274+
(importlib.util.find_spec("charset_normalizer") is None),
275+
reason="test requires charset_normalizer which is not installed",
276+
)
277+
def test_encoding_comment(self):
278+
self.module.foo(3)

test_requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,5 @@ cffi; python_version < '3.10'
1212
# NOTE: Keep mypy in sync with environment.yml
1313
mypy==0.981; platform_python_implementation != "PyPy"
1414
typing_extensions>=4.2.0
15+
# for optional f2py encoding detection
16+
charset-normalizer

0 commit comments

Comments
 (0)
0