|
148 | 148 | import platform
|
149 | 149 | import codecs
|
150 | 150 | try:
|
151 |
| - import chardet |
| 151 | + import charset_normalizer |
152 | 152 | except ImportError:
|
153 |
| - chardet = None |
| 153 | + charset_normalizer = None |
154 | 154 |
|
155 | 155 | from . import __version__
|
156 | 156 |
|
@@ -309,26 +309,31 @@ def getextension(name):
|
def openhook(filename, mode):
    """Ensures that filename is opened with correct encoding parameter.

    This function uses charset_normalizer package, when available, for
    determining the encoding of the file to be opened. When
    charset_normalizer is not available, or when it cannot settle on an
    encoding for the file, the function detects only UTF encodings (from a
    leading byte order mark); otherwise, ASCII encoding is used as fallback.

    Parameters
    ----------
    filename
        Path of the file to open.
    mode : str
        Mode, passed through unchanged to the built-in ``open``.

    Returns
    -------
    file object
        The result of ``open(filename, mode, encoding=encoding)``.
    """
    encoding = None
    if charset_normalizer is not None:
        # Reads in the entire file. Robust detection of encoding.
        # Correctly handles comments or late stage unicode characters
        # gh-22871
        best_match = charset_normalizer.from_path(filename).best()
        # best() yields None when no candidate encoding fits the content;
        # in that case fall through to the BOM check below instead of
        # failing with AttributeError on ``None.encoding``.
        if best_match is not None:
            encoding = best_match.encoding

    if encoding is None:
        # hint: install charset_normalizer for correct encoding handling
        # No need to read the whole file for trying with startswith;
        # read() simply returns fewer bytes when the file is shorter.
        with open(filename, 'rb') as fhandle:
            raw = fhandle.read(32)
        if raw.startswith(codecs.BOM_UTF8):
            encoding = 'UTF-8-SIG'
        # UTF-32 must be tested before UTF-16: the UTF-32-LE BOM begins
        # with the same two bytes as the UTF-16-LE BOM.
        elif raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
            encoding = 'UTF-32'
        elif raw.startswith((codecs.BOM_LE, codecs.BOM_BE)):
            encoding = 'UTF-16'
        else:
            # Fallback, without a usable charset_normalizer result
            encoding = 'ascii'
    return open(filename, mode, encoding=encoding)
|
333 | 338 |
|
334 | 339 |
|
@@ -394,7 +399,7 @@ def readfortrancode(ffile, dowithline=show, istop=1):
|
394 | 399 | except UnicodeDecodeError as msg:
|
395 | 400 | raise Exception(
|
396 | 401 | f'readfortrancode: reading {fin.filename()}#{fin.lineno()}'
|
397 |
| - f' failed with\n{msg}.\nIt is likely that installing chardet' |
| 402 | + f' failed with\n{msg}.\nIt is likely that installing charset_normalizer' |
398 | 403 | ' package will help f2py determine the input file encoding'
|
399 | 404 | ' correctly.')
|
400 | 405 | if not l:
|
|
0 commit comments