ENH: Add encoding option to numpy text IO #4208
Changes from 1 commit
@@ -883,10 +883,11 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         regex_comments = re.compile('|'.join(comments))
     user_converters = converters

-    byte_converters = False
     if encoding == 'bytes':
         encoding = None
         byte_converters = True
+    else:
+        byte_converters = False

     if usecols is not None:
         # Allow usecols to be a single int or a sequence of ints
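The `encoding='bytes'` default acts as a backward-compatibility sentinel: the file is still decoded as latin1 internally, and user converters keep receiving byte strings. A minimal sketch of that mapping (the helper name and structure are illustrative, not part of the PR):

```python
# Illustrative sketch of the 'bytes' sentinel handled in the hunk above;
# the helper itself is hypothetical and not part of numpy.
def resolve_encoding(encoding):
    """Map the public `encoding` argument to (real_encoding, byte_converters)."""
    if encoding == 'bytes':
        # Legacy behaviour: decode as latin1 and hand bytes to converters.
        return None, True
    # Anything else (including None) is used as-is; converters receive str.
    return encoding, False

print(resolve_encoding('bytes'))   # (None, True)
print(resolve_encoding('utf-8'))   # ('utf-8', False)
```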
@@ -924,7 +925,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
     X = []

     # input may be a python2 io stream
-    if encoding is not None and encoding != 'bytes':
+    if encoding is not None:
         fencoding = encoding
     # we must assume local encoding
     # TOOD emit portability warning?
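When no encoding is given, the reader has to pick one for the open file handle; the comment above hints at falling back to the stream's own encoding and then to the local default. A hedged sketch of that fallback chain (illustrative only, not the PR's exact code):

```python
import io
import locale

# Illustrative fallback chain for choosing a decoding, mirroring the
# "we must assume local encoding" comment in the diff above.
def guess_file_encoding(fh, encoding=None):
    if encoding is not None:
        return encoding                      # caller was explicit
    enc = getattr(fh, 'encoding', None)      # a text handle may already know
    if enc:
        return enc
    return locale.getpreferredencoding()     # otherwise assume local encoding

print(guess_file_encoding(io.StringIO("1 2 3\n")))  # falls back to the locale
```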
@@ -1036,15 +1037,17 @@ def split_line(line):
                 # converters may use decode to workaround numpy's oldd behaviour,
                 # so encode the string again before passing to the user converter
                 def tobytes_first(x, conv):

[review comment] This seems to redefine …

-                    if type(x) == bytes:
+                    if type(x) is bytes:
                         return conv(x)
                     return conv(x.encode("latin1"))
                 import functools
-                user_conv = functools.partial(tobytes_first, conv=conv)
-                converters[i] = user_conv
+                converters[i] = functools.partial(tobytes_first, conv=conv)
             else:
                 converters[i] = conv

[review comment] Maybe here put the check on …

+        converters = [conv if conv is not bytes else
+                      lambda x: x.encode(fencoding) for conv in converters]

         # Parse each line, including the first
         for i, line in enumerate(itertools.chain([first_line], fh)):
             vals = split_line(line)
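The `functools.partial` wrapper is what keeps old byte-oriented converters working now that the parser hands them decoded strings: the value is re-encoded as latin1 before the user converter sees it. Roughly, with a converter written for the legacy bytes behaviour:

```python
import functools

# A legacy converter written against the old behaviour: it expects bytes.
def legacy_conv(x):
    return float(x.decode('ascii').strip())

# Same shape as tobytes_first in the diff: re-encode str input as latin1
# so the byte-oriented converter still works.
def tobytes_first(x, conv):
    if type(x) is bytes:
        return conv(x)
    return conv(x.encode('latin1'))

wrapped = functools.partial(tobytes_first, conv=legacy_conv)
print(wrapped(b' 1.5 '))   # bytes pass straight through -> 1.5
print(wrapped(' 2.5 '))    # str is re-encoded first     -> 2.5
```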
@@ -1057,8 +1060,6 @@ def tobytes_first(x, conv):
                 raise ValueError("Wrong number of columns at line %d"
                                  % line_num)

-            converters = [conv if conv != bytes else
-                          lambda x: x.encode(fencoding) for conv in converters]
             # Convert each value according to its column and store
             items = [conv(val) for (conv, val) in zip(converters, vals)]
@@ -1228,7 +1229,7 @@ class WriteWrap(object):
         """ convert to unicode in py2 or to bytes on bytestream inputs """
         def __init__(self, fh, encoding):
             self.fh = fh
-            self.encoding = encoding if encoding else 'latin1'
+            self.encoding = encoding
             self.do_write = self.first_write

         def close(self):
@@ -1265,10 +1266,10 @@ def first_write(self, v):
         own_fh = True
         # need to convert str to unicode for text io output
         if sys.version_info[0] == 2:
-            fh = WriteWrap(fh, encoding)
+            fh = WriteWrap(fh, encoding or 'latin1')
     elif hasattr(fname, 'write'):
         # wrap to handle byte output streams
-        fh = WriteWrap(fname, encoding)
+        fh = WriteWrap(fname, encoding or 'latin1')
     else:
         raise ValueError('fname must be a string or file handle')
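`WriteWrap` exists because `savetxt` may be handed a byte stream (or run on Python 2), where each formatted line must be encoded before writing; the change above moves the `'latin1'` fallback out of the class and into the two call sites that need it. A simplified stand-in for the wrapping idea (not the PR's class):

```python
import io

class EncodingWriter(object):
    """Simplified stand-in for WriteWrap: encode text when the target
    stream turns out to accept only bytes."""
    def __init__(self, fh, encoding):
        self.fh = fh
        self.encoding = encoding

    def write(self, v):
        try:
            self.fh.write(v)                           # text stream: plain str
        except TypeError:
            self.fh.write(v.encode(self.encoding))     # byte stream: encode first

buf = io.BytesIO()
w = EncodingWriter(buf, 'latin1')
w.write(u'1.0 2.0\n')
print(buf.getvalue())   # b'1.0 2.0\n'
```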
@@ -1416,7 +1417,7 @@ def fromregex(file, regexp, dtype, encoding=None):
         if isinstance(content, bytes) and not isinstance(regexp, bytes):
             regexp = asbytes(regexp)
         elif not isinstance(content, bytes) and isinstance(regexp, bytes):
-            regexp = regexp.decode('latin1')
+            regexp = asstr(regexp)

         if not hasattr(regexp, 'match'):
             regexp = re.compile(regexp)
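The `fromregex` change keeps the pattern's type aligned with the file content, since `re` refuses to match a str pattern against bytes (and vice versa):

```python
import re

pattern = r"name: (\d+)"
content_bytes = b"name: 42"

# A str pattern cannot be used on bytes ...
try:
    re.search(pattern, content_bytes)
except TypeError as exc:
    print("mismatch:", exc)

# ... so the pattern is converted to the content's type before matching.
print(re.search(pattern.encode('latin1'), content_bytes).group(1))  # b'42'
print(re.search(pattern, u"name: 42").group(1))                     # '42'
```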
@@ -1630,10 +1631,11 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
             "The input argument 'converter' should be a valid dictionary "
             "(got '%s' instead)" % type(user_converters))

-    byte_converters = False
     if encoding == 'bytes':
         encoding = None
         byte_converters = True
+    else:
+        byte_converters = False

     # Initialize the filehandle, the LineSplitter and the NameValidator
     own_fhd = False
@@ -1866,18 +1868,19 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
                 testing_value = first_values[j]
             else:
                 testing_value = None
-            user_conv = conv
-            if conv == bytes:
+            if conv is bytes:
                 user_conv = asbytes
             elif byte_converters:
                 # converters may use decode to workaround numpy's oldd behaviour,
                 # so encode the string again before passing to the user converter
                 def tobytes_first(x, conv):
-                    if type(x) == bytes:
+                    if type(x) is bytes:
                         return conv(x)
                     return conv(x.encode("latin1"))
                 import functools
                 user_conv = functools.partial(tobytes_first, conv=conv)

[review comment] Same as for …

+            else:
+                user_conv = conv
             converters[i].update(user_conv, locked=True,
                                  testing_value=testing_value,
                                  default=filling_values[i],
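In practice this branch decides what a user-supplied converter receives: with `encoding='bytes'` it gets latin1-encoded byte strings (the pre-existing behaviour), while an explicit encoding hands it decoded `str`. A small usage sketch of that contract as this PR introduces it (later numpy releases may have changed or deprecated the `'bytes'` default):

```python
import io
import numpy as np

data = u"1.5 2.5\n3.5 4.5\n"
seen = []

def conv(s):
    seen.append(type(s).__name__)   # record what the converter was given
    return float(s)

# 'bytes' mode: the converter is handed latin1-encoded byte strings.
np.loadtxt(io.StringIO(data), converters={0: conv}, encoding='bytes')
# Explicit encoding: the converter is handed already-decoded str.
np.loadtxt(io.StringIO(data), converters={0: conv}, encoding='utf-8')

print(seen)   # expected: ['bytes', 'bytes', 'str', 'str']
```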
@@ -2080,10 +2083,8 @@ def tobytes_first(x, conv):
             # Keep the dtype of the current converter
             if i in user_converters:
                 ishomogeneous &= (ttype == dtype.type)
-                if ttype == np.string_:
-                    ttype = "|S%i" % max(len(row[i]) for row in data)
-                elif ttype == np.unicode_:
-                    ttype = "|U%i" % max(len(row[i]) for row in data)
+                if np.issubdtype(ttype, np.character):
+                    ttype = (ttype, max(len(row[i]) for row in data))
                 descr.append(('', ttype))
             else:
                 descr.append(('', dtype))
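`np.character` is the common parent of both the bytes and the unicode string scalar types, so the single `issubdtype` check replaces the two explicit branches, and a `(type, length)` tuple is enough for `np.dtype` to build the sized string type:

```python
import numpy as np

# np.character covers both byte strings and unicode strings.
print(np.issubdtype(np.bytes_, np.character))   # True
print(np.issubdtype(np.str_, np.character))     # True
print(np.issubdtype(np.float64, np.character))  # False

# A (type, length) tuple builds the sized dtype that the old "|S%i" / "|U%i"
# format strings used to spell out.
print(np.dtype((np.bytes_, 5)))   # |S5
print(np.dtype((np.str_, 5)))     # <U5
```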
[review comment] Rather than hardcode `latin1` even more, maybe just do `line = asstr(line)` (where `asstr` should be imported from `numpy.compat`).

p.s. Better might be to add the encoding to `asstr`, but this perhaps is more work than is worth it, as it means also defining it properly for python2.
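For reference, `asstr` in `numpy.compat` simply decodes bytes (as latin1 on Python 3). The postscript floats an encoding-aware variant; a rough sketch of what that could look like (purely illustrative, such a function does not exist in numpy):

```python
# Hypothetical encoding-aware variant of numpy.compat.asstr, as floated in
# the review comment above; not an existing numpy function.
def asstr_encoded(s, encoding='latin1'):
    if isinstance(s, bytes):
        return s.decode(encoding)
    return str(s)

print(asstr_encoded(b'caf\xc3\xa9', 'utf-8'))   # 'café'
print(asstr_encoded(b'caf\xc3\xa9'))            # latin1 default: 'cafÃ©'
print(asstr_encoded(3.14))                      # '3.14'
```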