8000 ENH: Add encoding option to numpy text IO by juliantaylor · Pull Request #4208 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: Add encoding option to numpy text IO #4208

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 27 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
298748a
attempt to salvage loadtxt and genfromtxt
juliantaylor Apr 3, 2017
2c821c0
update some internal tests
juliantaylor Apr 4, 2017
749dbe7
update datasource
juliantaylor Apr 8, 2017
e6fceb7
less ugly dtype=S hack
juliantaylor Apr 8, 2017
0d066e8
use special value encoding="bytes" for converter value
juliantaylor Apr 8, 2017
49d143d
windows test fix
juliantaylor Apr 8, 2017
3fb3794
add gzip line separator test
juliantaylor Apr 8, 2017
c612568
try to keep genfromtxt backward compat
juliantaylor Apr 8, 2017
1f44b3c
add decoding converter tests
juliantaylor Apr 8, 2017
03e827d
support unicode user dtype in genfromtxt
juliantaylor Apr 8, 2017
9cdd203
add unicode support to fromregexp
juliantaylor Apr 9, 2017
760c1a1
add docstrings
juliantaylor Apr 9, 2017
dcfc2c7
move line decoding to a function
juliantaylor Apr 14, 2017
1b23544
add binary stream decode test
juliantaylor Apr 14, 2017
67cd094
only use writewrap in savetxt when necessary
juliantaylor Apr 14, 2017
38fa80f
don't convert data to lists unnecessarily in genfromtxt
juliantaylor Apr 14, 2017
06725fd
avoid the file encoding workaround if encoding is provided by user
juliantaylor Apr 14, 2017
eea935d
add xz support and add tests
juliantaylor Apr 14, 2017
7f0d6f7
cleanup compressed file handling in datasource
juliantaylor Apr 16, 2017
088f4b3
remove two now unnecessary abstractions
juliantaylor Apr 16, 2017
097f7c0
fix encoding argument not being passed to Linesplitter
juliantaylor Apr 16, 2017
053449d
move decoding into Linesplitter's handyman function
juliantaylor Apr 16, 2017
3aba208
cleanup
juliantaylor Apr 16, 2017
bec193e
ENH: change loadtxt to use a generator to load data
juliantaylor Apr 16, 2017
1fe69f3
DOC: add release notes for text IO changes
juliantaylor Jul 11, 2017
e9ae400
DEPR: add a deprecation warning when reading strings without encoding
juliantaylor Jul 11, 2017
c482a5b
add test for savetxt into StringIO
juliantaylor Jul 11, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
cleanup
  • Loading branch information
juliantaylor committed Nov 6, 2017
commit 3aba20831b763f80d976fcf26532aea2fb5120bd
2 changes: 1 addition & 1 deletion numpy/lib/_iotools.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

def _decode_line(line, encoding=None):
""" decode bytes from binary input streams, default to latin1 """
if type(line) == bytes:
if type(line) is bytes:
if encoding is None:
line = line.decode('latin1')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than hardcode latin1 even more, maybe just do line = asstr(line) (where asstr should be imported from numpy.compat).

p.s. Better might be to add the encoding to asstr, but this perhaps is more work than is worth it, as it means also defining it properly for python2

else:
Expand Down
39 changes: 20 additions & 19 deletions numpy/lib/npyio.py
Original file line number Diff line number Diff line change
Expand Up @@ -883,10 +883,11 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
regex_comments = re.compile('|'.join(comments))
user_converters = converters

byte_converters = False
if encoding == 'bytes':
encoding = None
byte_converters = True
else:
byte_converters = False

if usecols is not None:
# Allow usecols to be a single int or a sequence of ints
Expand Down Expand Up @@ -924,7 +925,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
X = []

# input may be a python2 io stream
if encoding is not None and encoding != 'bytes':
if encoding is not None:
fencoding = encoding
# we must assume local encoding
# TODO emit portability warning?
Expand Down Expand Up @@ -1036,15 +1037,17 @@ def split_line(line):
# converters may use decode to work around numpy's old behaviour,
# so encode the string again before passing to the user converter
def tobytes_first(x, conv):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to redefine asbytes from py3k.py; can one just use that? I.e., something like

converters[i] = lambda x: conv(asbytes(x))

if type(x) == bytes:
if type(x) is bytes:
return conv(x)
return conv(x.encode("latin1"))
import functools
user_conv = functools.partial(tobytes_first, conv=conv)
converters[i] = user_conv
converters[i] = functools.partial(tobytes_first, conv=conv)
else:
converters[i] = conv
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe here put the check on conv is bytes from below? It seems nicer to just ensure the converters are all set up correctly at this stage.


converters = [conv if conv is not bytes else
lambda x: x.encode(fencoding) for conv in converters]

# Parse each line, including the first
for i, line in enumerate(itertools.chain([first_line], fh)):
vals = split_line(line)
Expand All @@ -1057,8 +1060,6 @@ def tobytes_first(x, conv):
raise ValueError("Wrong number of columns at line %d"
% line_num)

converters = [conv if conv != bytes else
lambda x: x.encode(fencoding) for conv in converters]
# Convert each value according to its column and store
items = [conv(val) for (conv, val) in zip(converters, vals)]

Expand Down Expand Up @@ -1228,7 +1229,7 @@ class WriteWrap(object):
""" convert to unicode in py2 or to bytes on bytestream inputs """
def __init__(self, fh, encoding):
self.fh = fh
self.encoding = encoding if encoding else 'latin1'
self.encoding = encoding
self.do_write = self.first_write

def close(self):
Expand Down Expand Up @@ -1265,10 +1266,10 @@ def first_write(self, v):
own_fh = True
# need to convert str to unicode for text io output
if sys.version_info[0] == 2:
fh = WriteWrap(fh, encoding)
fh = WriteWrap(fh, encoding or 'latin1')
elif hasattr(fname, 'write'):
# wrap to handle byte output streams
fh = WriteWrap(fname, encoding)
fh = WriteWrap(fname, encoding or 'latin1')
else:
raise ValueError('fname must be a string or file handle')

Expand Down Expand Up @@ -1416,7 +1417,7 @@ def fromregex(file, regexp, dtype, encoding=None):
if isinstance(content, bytes) and not isinstance(regexp, bytes):
regexp = asbytes(regexp)
elif not isinstance(content, bytes) and isinstance(regexp, bytes):
regexp = regexp.decode('latin1')
regexp = asstr(regexp)

if not hasattr(regexp, 'match'):
regexp = re.compile(regexp)
Expand Down Expand Up @@ -1630,10 +1631,11 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
"The input argument 'converter' should be a valid dictionary "
"(got '%s' instead)" % type(user_converters))

byte_converters = False
if encoding == 'bytes':
encoding = None
byte_converters = True
else:
byte_converters = False

# Initialize the filehandle, the LineSplitter and the NameValidator
own_fhd = False
Expand Down Expand Up @@ -1866,18 +1868,19 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
testing_value = first_values[j]
else:
testing_value = None
user_conv = conv
if conv == bytes:
if conv is bytes:
user_conv = asbytes
elif byte_converters:
# converters may use decode to work around numpy's old behaviour,
# so encode the string again before passing to the user converter
def tobytes_first(x, conv):
if type(x) == bytes:
if type(x) is bytes:
return conv(x)
return conv(x.encode("latin1"))
import functools
user_conv = functools.partial(tobytes_first, conv=conv)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as for loadtxt: user_conv = lambda x: conv(asbytes(x)) (if that indeed works)

else:
user_conv = conv
converters[i].update(user_conv, locked=True,
testing_value=testing_value,
default=filling_values[i],
Expand Down Expand Up @@ -2080,10 +2083,8 @@ def tobytes_first(x, conv):
# Keep the dtype of the current converter
if i in user_converters:
ishomogeneous &= (ttype == dtype.type)
if ttype == np.string_:
ttype = "|S%i" % max(len(row[i]) for row in data)
elif ttype == np.unicode_:
ttype = "|U%i" % max(len(row[i]) for row in data)
if np.issubdtype(ttype, np.character):
ttype = (ttype, max(len(row[i]) for row in data))
descr.append(('', ttype))
else:
descr.append(('', dtype))
Expand Down
0