ENH/BUG: ignore line comments in CSV files GH2685 by holocronweaver · Pull Request #4505 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

ENH/BUG: ignore line comments in CSV files GH2685 #4505

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
ENH/BUG: ignore line comments in CSV files GH2685
* also fix bug in CSV format sniffer
  • Loading branch information
Jesse Johnson committed Aug 12, 2013
commit e4fb9ed16442beace0f0b431a1e198799dc008bb
42 changes: 32 additions & 10 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -991,7 +991,6 @@ def __init__(self, src, **kwds):
self._name_processed = True
(index_names, self.names,
self.index_col) = _clean_index_names(self.names, self.index_col)

if self.index_names is None:
self.index_names = index_names

Expand Down Expand Up @@ -1100,7 +1099,6 @@ def _get_index_names(self):
if self._reader.leading_cols == 0 and self.index_col is not None:
(idx_names, names,
self.index_col) = _clean_index_names(names, self.index_col)

return names, idx_names

def _maybe_parse_dates(self, values, index, try_parse_dates=True):
Expand Down Expand Up @@ -1282,21 +1280,30 @@ class MyDialect(csv.Dialect):

sniff_sep = True

if sep is not None:
if (sep is not None) and (dia.quotechar is not None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need for parens here — `is not None` binds tighter than `and`.

sniff_sep = False
dia.delimiter = sep
# attempt to sniff the delimiter
if sniff_sep:
line = f.readline()
while self.pos in self.skiprows:
self.pos += 1
line = f.readline()

line = self._check_comments([line])[0]
line = self._check_comments([[line]])

while not line:
self.pos += 1
line = f.readline()
line = self._check_comments([[line]])

line = line[0][0]

self.pos += 1
sniffed = csv.Sniffer().sniff(line)
dia.delimiter = sniffed.delimiter
if not dia.delimiter:
dia.delimiter = sniffed.delimiter
if not dia.quotechar:
dia.quotechar = sniffed.quotechar
if self.encoding is not None:
self.buf.extend(list(
com.UnicodeReader(StringIO(line),
Expand Down Expand Up @@ -1466,14 +1473,26 @@ def _next_line(self):
line = self.data[self.pos]
except IndexError:
raise StopIteration

line = self._check_comments([line])

while not line:
self.pos += 1
try:
line = self.data[self.pos]
except IndexError:
raise StopIteration
line = self._check_comments([line])

line = line[0]
else:
while self.pos in self.skiprows:
next(self.data)
self.pos += 1

line = next(self.data)
line = self._check_comments([line])[0]

line = self._check_comments([line])[0]
line = self._check_thousands([line])[0]

self.pos += 1
Expand All @@ -1496,7 +1515,10 @@ def _check_comments(self, lines):
if len(x) > 0:
rl.append(x)
break
ret.append(rl)
if rl:
ret.append(rl)
if not ret:
ret = [[]];
return ret

def _check_thousands(self, lines):
Expand Down Expand Up @@ -1524,7 +1546,7 @@ def _clear_buffer(self):
def _get_index_name(self, columns):
orig_names = list(columns)
columns = list(columns)

try:
line = self._next_line()
except StopIteration:
Expand All @@ -1539,7 +1561,7 @@ def _get_index_name(self, columns):

# implicitly index_col=0 b/c 1 fewer column names
implicit_first_cols = 0
if line is not None:
if line and (line is not None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here

# leave it 0, #2442
if self.index_col is not False:
implicit_first_cols = len(line) - len(columns)
Expand Down
5 changes: 2 additions & 3 deletions pandas/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -823,7 +823,6 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
}
else if (c == self->delimiter) {
// End of field. End of line not reached yet

END_FIELD();
self->state = START_FIELD;
}
Expand Down Expand Up @@ -866,7 +865,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
} else {
/* \r line terminator */

/* UGH. we don't actually want to consume the token. fix this later */
/*FIXME UGH. we don't actually want to consume the token. */
self->stream_len = slen;
if (end_line(self) < 0) {
goto parsingerror;
Expand All @@ -875,7 +874,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
slen = self->stream_len;
self->state = START_RECORD;

/* HACK, let's try this one again */
/*FIXME let's try this one again */
--i; buf--;
if (line_limit > 0 && self->lines == start_lines + line_limit) {
goto linelimit;
Expand Down
0