@@ -2042,7 +2042,6 @@ def tobytes_first(x, conv):
2042
2042
strcolidx = [i for (i , v ) in enumerate (column_types )
2043
2043
if v == np .unicode_ ]
2044
2044
2045
- type_str = np .unicode_
2046
2045
if byte_converters and strcolidx :
2047
2046
# convert strings back to bytes for backward compatibility
2048
2047
warnings .warn (
@@ -2058,33 +2057,37 @@ def encode_unicode_cols(row_tup):
2058
2057
2059
2058
try :
2060
2059
data = [encode_unicode_cols (r ) for r in data ]
2061
- type_str = np .bytes_
2062
2060
except UnicodeEncodeError :
2063
2061
pass
2062
+ else :
2063
+ for i in strcolidx :
2064
+ column_types [i ] = np .bytes_
2064
2065
2066
+ # Update string types to be the right length
2067
+ sized_column_types = column_types [:]
2068
+ for i , col_type in enumerate (column_types ):
2069
+ if np .issubdtype (col_type , np .character ):
2070
+ n_chars = max (len (row [i ]) for row in data )
2071
+ sized_column_types [i ] = (col_type , n_chars )
2065
2072
2066
- # ... and take the largest number of chars.
2067
- for i in strcolidx :
2068
- max_line_length = max (len (row [i ]) for row in data )
2069
- column_types [i ] = np .dtype ((type_str , max_line_length ))
2070
- #
2071
2073
if names is None :
2072
- # If the dtype is uniform, don't define names, else use ''
2073
- base = set ([c .type for c in converters if c ._checked ])
2074
+ # If the dtype is uniform (before sizing strings)
2075
+ base = set ([
2076
+ c_type
2077
+ for c , c_type in zip (converters , column_types )
2078
+ if c ._checked ])
2074
2079
if len (base ) == 1 :
2075
- if strcolidx :
2076
- (ddtype , mdtype ) = (type_str , bool )
2077
- else :
2078
- (ddtype , mdtype ) = (list (base )[0 ], bool )
2080
+ uniform_type , = base
2081
+ (ddtype , mdtype ) = (uniform_type , bool )
2079
2082
else :
2080
2083
ddtype = [(defaultfmt % i , dt )
2081
- for (i , dt ) in enumerate (column_types )]
2084
+ for (i , dt ) in enumerate (sized_column_types )]
2082
2085
if usemask :
2083
2086
mdtype = [(defaultfmt % i , bool )
2084
- for (i , dt ) in enumerate (column_types )]
2087
+ for (i , dt ) in enumerate (sized_column_types )]
2085
2088
else :
2086
- ddtype = list (zip (names , column_types ))
2087
- mdtype = list (zip (names , [bool ] * len (column_types )))
2089
+ ddtype = list (zip (names , sized_column_types ))
2090
+ mdtype = list (zip (names , [bool ] * len (sized_column_types )))
2088
2091
output = np .array (data , dtype = ddtype )
2089
2092
if usemask :
2090
2093
outputmask = np .array (masks , dtype = mdtype )
0 commit comments