ENH/BUG: ignore line comments in CSV files GH2685 by holocronweaver · Pull Request #4505 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

ENH/BUG: ignore line comments in CSV files GH2685 #4505

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
ENH/BUG: ignore line comments in CSV files GH2685
* also fix bug in CSV format sniffer
  • Loading branch information
Jesse Johnson committed Aug 12, 2013
commit e4fb9ed16442beace0f0b431a1e198799dc008bb
42 changes: 32 additions & 10 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -991,7 +991,6 @@ def __init__(self, src, **kwds):
self._name_processed = True
(index_names, self.names,
self.index_col) = _clean_index_names(self.names, self.index_col)

if self.index_names is None:
self.index_names = index_names

Expand Down Expand Up @@ -1100,7 +1099,6 @@ def _get_index_names(self):
if self._reader.leading_cols == 0 and self.index_col is not None:
(idx_names, names,
self.index_col) = _clean_index_names(names, self.index_col)

return names, idx_names

def _maybe_parse_dates(self, values, index, try_parse_dates=True):
Expand Down Expand Up @@ -1282,21 +1280,30 @@ class MyDialect(csv.Dialect):

sniff_sep = True

if sep is not None:
if (sep is not None) and (dia.quotechar is not None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need for parens here — `is not None` binds tighter than `and`.

sniff_sep = False
dia.delimiter = sep
# attempt to sniff the delimiter
if sniff_sep:
line = f.readline()
while self.pos in self.skiprows:
self.pos += 1
line = f.readline()

line = self._check_comments([line])[0]
line = self._check_comments([[line]])

while not line:
self.pos += 1
line = f.readline()
line = self._check_comments([[line]])

line = line[0][0]

self.pos += 1
sniffed = csv.Sniffer().sniff(line)
dia.delimiter = sniffed.delimiter
if not dia.delimiter:
dia.delimiter = sniffed.delimiter
if not dia.quotechar:
dia.quotechar = sniffed.quotechar
if self.encoding is not None:
self.buf.extend(list(
com.UnicodeReader(StringIO(line),
Expand Down Expand Up @@ -1466,14 +1473,26 @@ def _next_line(self):
line = self.data[self.pos]
except IndexError:
raise StopIteration

line = self._check_comments([line])

while not line:
self.pos += 1
try:
line = self.data[self.pos]
except IndexError:
raise StopIteration
line = self._check_comments([line])

line = line[0]
else:
while self.pos in self.skiprows:
next(self.data)
self.pos += 1

line = next(self.data)
line = self._check_comments([line])[0]

line = self._check_comments([line])[0]
line = self._check_thousands([line])[0]

self.pos += 1
Expand All @@ -1496,7 +1515,10 @@ def _check_comments(self, lines):
if len(x) > 0:
rl.append(x)
break
ret.append(rl)
if rl:
ret.append(rl)
if not ret:
ret = [[]];
return ret

def _check_thousands(self, lines):
Expand Down Expand Up @@ -1524,7 +1546,7 @@ def _clear_buffer(self):
def _get_index_name(self, columns):
orig_names = list(columns)
columns = list(columns)

try:
line = self._next_line()
except StopIteration:
Expand All @@ -1539,7 +1561,7 @@ def _get_index_name(self, columns):

# implicitly index_col=0 b/c 1 fewer column names
implicit_first_cols = 0
if line is not None:
if line and (line is not None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here

# leave it 0, #2442
if self.index_col is not False:
implicit_first_cols = len(line) - len(columns)
Expand Down
5 changes: 2 additions & 3 deletions pandas/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -823,7 +823,6 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
}
else if (c == self->delimiter) {
// End of field. End of line not reached yet

END_FIELD();
self->state = START_FIELD;
}
Expand Down Expand Up @@ -866,7 +865,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
} else {
/* \r line terminator */

/* UGH. we don't actually want to consume the token. fix this later */
/*FIXME UGH. we don't actually want to consume the token. */
self->stream_len = slen;
if (end_line(self) < 0) {
goto parsingerror;
Expand All @@ -875,7 +874,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
slen = self->stream_len;
self->state = START_RECORD;

/* HACK, let's try this one again */
/*FIXME let's try this one again */
--i; buf--;
if (line_limit > 0 && self->lines == start_lines + line_limit) {
goto linelimit;
Expand Down
0