-
-
Notifications
You must be signed in to change notification settings - Fork 11k
ENH: Quoting support in np.genfromtxt(...) #4594
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -188,7 +188,8 @@ def autostrip(self, method): | |
""" | ||
return lambda input: [_.strip() for _ in method(input)] | ||
# | ||
def __init__(self, delimiter=None, comments=asbytes('#'), autostrip=True): | ||
def __init__(self, delimiter=None, quoter=None, comments=asbytes('#'), | ||
autostrip=True): | ||
self.comments = comments | ||
# Delimiter is a character | ||
if isinstance(delimiter, unicode): | ||
|
@@ -207,6 +208,9 @@ def __init__(self, delimiter=None, comments=asbytes('#'), autostrip=True): | |
else: | ||
(_handyman, delimiter) = (self._delimited_splitter, None) | ||
self.delimiter = delimiter | ||
if isinstance(quoter, unicode): | ||
quoter = quoter.encode('ascii') | ||
self.quoter = quoter | ||
if autostrip: | ||
self._handyman = self.autostrip(_handyman) | ||
else: | ||
|
@@ -218,7 +222,35 @@ def _delimited_splitter(self, line): | |
line = line.strip(asbytes(" \r\n")) | ||
if not line: | ||
return [] | ||
return line.split(self.delimiter) | ||
if self.quoter is None: | ||
return line.split(self.delimiter) | ||
else: | ||
out = [] | ||
word = asbytes('') | ||
in_quote = False | ||
is_escaped = False | ||
|
||
# py3 bytes compat | ||
chars = [line[i:i+1] for i in range(len(line))] | ||
|
||
for char in chars: | ||
if is_escaped: | ||
word += char | ||
is_escaped = False | ||
elif char == asbytes('\\'): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Another symptom of this broken function: there should be no bytes here or in the following lines; we are loading text, after all. |
||
is_escaped = True | ||
elif char == self.quoter: | ||
in_quote = not in_quote | ||
elif in_quote: | ||
word += char | ||
elif char == self.delimiter: | ||
out.append(word) | ||
word = asbytes('') | ||
else: | ||
word += char | ||
if word: | ||
out.append(word) | ||
return out | ||
# | ||
def _fixedwidth_splitter(self, line): | ||
if self.comments is not None: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1174,7 +1174,7 @@ def fromregex(file, regexp, dtype): | |
#####-------------------------------------------------------------------------- | ||
|
||
|
||
def genfromtxt(fname, dtype=float, comments='#', delimiter=None, | ||
def genfromtxt(fname, dtype=float, comments='#', delimiter=None, quoter=None, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The new argument has to be at the end of the argument list for backward compatibility; unfortunately, this is not a keyword-only function. |
||
skiprows=0, skip_header=0, skip_footer=0, converters=None, | ||
missing='', missing_values=None, filling_values=None, | ||
usecols=None, names=None, | ||
|
@@ -1207,6 +1207,9 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, | |
skip_rows : int, optional | ||
`skip_rows` was deprecated in numpy 1.5, and will be removed in | ||
numpy 2.0. Please use `skip_header` instead. | ||
quoter : str, optional | ||
The string used as a quoting character. By default it's assumed that | ||
the values are not quoted. | ||
skip_header : int, optional | ||
The number of lines to skip at the beginning of the file. | ||
skip_footer : int, optional | ||
|
@@ -1334,6 +1337,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, | |
comments = asbytes(comments) | ||
if isinstance(delimiter, unicode): | ||
delimiter = asbytes(delimiter) | ||
if isinstance(quoter, unicode): | ||
quoter = asbytes(quoter) | ||
if isinstance(missing, unicode): | ||
missing = asbytes(missing) | ||
if isinstance(missing_values, (unicode, list, tuple)): | ||
|
@@ -1365,7 +1370,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, | |
"fname must be a string, filehandle, or generator. " | ||
"(got %s instead)" % type(fname)) | ||
|
||
split_line = LineSplitter(delimiter=delimiter, comments=comments, | ||
split_line = LineSplitter(delimiter=delimiter, quoter=quoter, | ||
comments=comments, | ||
autostrip=autostrip)._handyman | ||
validate_names = NameValidator(excludelist=excludelist, | ||
deletechars=deletechars, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There should be no reason the quoter must be ASCII, though fixing that probably requires reworking the full machinery to operate properly on strings rather than bytes.