BUG: Resize bytes_ columns in genfromtxt · eric-wieser/numpy@0a87861 · GitHub
[go: up one dir, main page]

Skip to content

Commit 0a87861

Browse files
committed
BUG: Resize bytes_ columns in genfromtxt
Fixes numpy gh-10394, due to a regression in numpy gh-10054
1 parent 5f01e54 commit 0a87861

File tree

2 files changed

+27
-17
lines changed

2 files changed

+27
-17
lines changed

numpy/lib/npyio.py

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2042,7 +2042,6 @@ def tobytes_first(x, conv):
20422042
strcolidx = [i for (i, v) in enumerate(column_types)
20432043
if v == np.unicode_]
20442044

2045-
type_str = np.unicode_
20462045
if byte_converters and strcolidx:
20472046
# convert strings back to bytes for backward compatibility
20482047
warnings.warn(
@@ -2058,33 +2057,37 @@ def encode_unicode_cols(row_tup):
20582057

20592058
try:
20602059
data = [encode_unicode_cols(r) for r in data]
2061-
type_str = np.bytes_
20622060
except UnicodeEncodeError:
20632061
pass
2062+
else:
2063+
for i in strcolidx:
2064+
column_types[i] = np.bytes_
20642065

2066+
# Update string types to be the right length
2067+
sized_column_types = column_types[:]
2068+
for i, col_type in enumerate(column_types):
2069+
if np.issubdtype(col_type, np.character):
2070+
n_chars = max(len(row[i]) for row in data)
2071+
sized_column_types[i] = (col_type, n_chars)
20652072

2066-
# ... and take the largest number of chars.
2067-
for i in strcolidx:
2068-
max_line_length = max(len(row[i]) for row in data)
2069-
column_types[i] = np.dtype((type_str, max_line_length))
2070-
#
20712073
if names is None:
2072-
# If the dtype is uniform, don't define names, else use ''
2073-
base = set([c.type for c in converters if c._checked])
2074+
# If the dtype is uniform (before sizing strings)
2075+
base = set([
2076+
c_type
2077+
for c, c_type in zip(converters, column_types)
2078+
if c._checked])
20742079
if len(base) == 1:
2075-
if strcolidx:
2076-
(ddtype, mdtype) = (type_str, bool)
2077-
else:
2078-
(ddtype, mdtype) = (list(base)[0], bool)
2080+
uniform_type, = base
2081+
(ddtype, mdtype) = (uniform_type, bool)
20792082
else:
20802083
ddtype = [(defaultfmt % i, dt)
2081-
for (i, dt) in enumerate(column_types)]
2084+
for (i, dt) in enumerate(sized_column_types)]
20822085
if usemask:
20832086
mdtype = [(defaultfmt % i, bool)
2084-
for (i, dt) in enumerate(column_types)]
2087+
for (i, dt) in enumerate(sized_column_types)]
20852088
else:
2086-
ddtype = list(zip(names, column_types))
2087-
mdtype = list(zip(names, [bool] * len(column_types)))
2089+
ddtype = list(zip(names, sized_column_types))
2090+
mdtype = list(zip(names, [bool] * len(sized_column_types)))
20882091
output = np.array(data, dtype=ddtype)
20892092
if usemask:
20902093
outputmask = np.array(masks, dtype=mdtype)

numpy/lib/tests/test_io.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2056,6 +2056,13 @@ def test_recfromcsv(self):
20562056
assert_(isinstance(test, np.recarray))
20572057
assert_equal(test, control)
20582058

2059+
#gh-10394
2060+
data = TextIO('color\n"red"\n"blue"')
2061+
test = np.recfromcsv(data, converters={0: lambda x: x.strip(b'\"')})
2062+
control = np.array([('red',), ('blue',)], dtype=[('color', (bytes, 4))])
2063+
assert_equal(test.dtype, control.dtype)
2064+
assert_equal(test, control)
2065+
20592066
def test_max_rows(self):
20602067
# Test the `max_rows` keyword argument.
20612068
data = '1 2\n3 4\n5 6\n7 8\n9 10\n'

0 commit comments

Comments
 (0)
0