ENH: Quoting support in np.genfromtxt(...) by ddasilva · Pull Request #4594 · numpy/numpy · GitHub
[go: up one dir, main page]

Skip to content

ENH: Quoting support in np.genfromtxt(...) #4594

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 34 additions & 2 deletions numpy/lib/_iotools.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,8 @@ def autostrip(self, method):
"""
return lambda input: [_.strip() for _ in method(input)]
#
def __init__(self, delimiter=None, comments=asbytes('#'), autostrip=True):
def __init__(self, delimiter=None, quoter=None, comments=asbytes('#'),
autostrip=True):
self.comments = comments
# Delimiter is a character
if isinstance(delimiter, unicode):
Expand All @@ -207,6 +208,9 @@ def __init__(self, delimiter=None, comments=asbytes('#'), autostrip=True):
else:
(_handyman, delimiter) = (self._delimited_splitter, None)
self.delimiter = delimiter
if isinstance(quoter, unicode):
quoter = quoter.encode('ascii')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be no reason the quoter must be ASCII, though to fix that the full machinery probably has to be fixed to work properly with strings and not bytes.

self.quoter = quoter
if autostrip:
self._handyman = self.autostrip(_handyman)
else:
Expand All @@ -218,7 +222,35 @@ def _delimited_splitter(self, line):
line = line.strip(asbytes(" \r\n"))
if not line:
return []
return line.split(self.delimiter)
if self.quoter is None:
return line.split(self.delimiter)
else:
out = []
word = asbytes('')
in_quote = False
is_escaped = False

# py3 bytes compat
chars = [line[i:i+1] for i in range(len(line))]

for char in chars:
if is_escaped:
word += char
is_escaped = False
elif char == asbytes('\\'):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another symptom of this broken function: there should be no bytes here or in the following lines; we are loading text, after all.

is_escaped = True
elif char == self.quoter:
in_quote = not in_quote
elif in_quote:
word += char
elif char == self.delimiter:
out.append(word)
word = asbytes('')
else:
word += char
if word:
out.append(word)
return out
#
def _fixedwidth_splitter(self, line):
if self.comments is not None:
Expand Down
10 changes: 8 additions & 2 deletions numpy/lib/npyio.py
Original file line number Diff line number Diff line change
Expand Up @@ -1174,7 +1174,7 @@ def fromregex(file, regexp, dtype):
#####--------------------------------------------------------------------------


def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
def genfromtxt(fname, dtype=float, comments='#', delimiter=None, quoter=None,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new argument has to be at the end of the argument list for backward compatibility; unfortunately, this is not a keyword-only function.

skiprows=0, skip_header=0, skip_footer=0, converters=None,
missing='', missing_values=None, filling_values=None,
usecols=None, names=None,
Expand Down Expand Up @@ -1207,6 +1207,9 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
skip_rows : int, optional
`skip_rows` was deprecated in numpy 1.5, and will be removed in
numpy 2.0. Please use `skip_header` instead.
quoter : str, optional
The string used as a quoting character. By default it's assumed that
the values are not quoted.
skip_header : int, optional
The number of lines to skip at the beginning of the file.
skip_footer : int, optional
Expand Down Expand Up @@ -1334,6 +1337,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
comments = asbytes(comments)
if isinstance(delimiter, unicode):
delimiter = asbytes(delimiter)
if isinstance(quoter, unicode):
quoter = asbytes(quoter)
if isinstance(missing, unicode):
missing = asbytes(missing)
if isinstance(missing_values, (unicode, list, tuple)):
Expand Down Expand Up @@ -1365,7 +1370,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
"fname must be a string, filehandle, or generator. "
"(got %s instead)" % type(fname))

split_line = LineSplitter(delimiter=delimiter, comments=comments,
split_line = LineSplitter(delimiter=delimiter, quoter=quoter,
comments=comments,
autostrip=autostrip)._handyman
validate_names = NameValidator(excludelist=excludelist,
deletechars=deletechars,
Expand Down
28 changes: 28 additions & 0 deletions numpy/lib/tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -986,6 +986,34 @@ def test_commented_header(self):
test = np.genfromtxt(data, names=True, dtype=None)
assert_equal(test, ctrl)

def check_quoter(self, quoter):
    """Check that ``np.genfromtxt`` honours `quoter` for quoted fields.

    Builds a two-row, comma-delimited text buffer in which every field is
    wrapped in `quoter`.  Several fields contain the delimiter itself, and
    one contains the quote character, which is written backslash-escaped.
    The parsed result must equal the original, unquoted field values.

    Parameters
    ----------
    quoter : str
        The single quoting character to exercise (e.g. ``'"'`` or ``"'"``).
    """
    data = [["a, b c d", "e f", "g" + quoter + ' x'],
            ["h, i jk", "lm no, p q", "r"]]

    # Expected output: the raw field values as fixed-width byte strings.
    ctrl = np.array([[asbytes(el) for el in row]
                     for row in data],
                    dtype='|S10')

    tio = TextIO()
    for row in data:
        quoted = []
        for el in row:
            # Escape embedded quote characters so they stay inside the field.
            text = el.replace(quoter, '\\' + quoter)
            quoted.append(quoter + text + quoter)
        line = ','.join(quoted)
        tio.write(line)
        tio.write('\n')
    tio.seek(0)

    test = np.genfromtxt(tio, quoter=quoter, delimiter=",",
                         dtype='|S10')

    assert_equal(test, ctrl)

def test_quote(self):
    "Tests quoting support with double and single quote characters"
    for quote_char in ('"', "'"):
        self.check_quoter(quote_char)

def test_autonames_and_usecols(self):
"Tests names and usecols"
data = TextIO('A B C D\n aaaa 121 45 9.1')
Expand Down
0