ENH: enable regular expressions / long delimiters in read_table/csv, GH #364 · pandas-dev/pandas@ab92792 · GitHub

Commit ab92792

ENH: enable regular expressions / long delimiters in read_table/csv, GH #364
1 parent 3e2e7af commit ab92792

File tree: 3 files changed, +34 -16 lines

  RELEASE.rst
  pandas/io/parsers.py
  pandas/io/tests/test_parsers.py

RELEASE.rst

Lines changed: 2 additions & 0 deletions
@@ -65,6 +65,8 @@ pandas 0.5.1
     should work across platforms (GH #300)
   - Add `nunique` function to Series for counting unique elements (GH #297)
   - DataFrame constructor will use Series name if no columns passed (GH #373)
+  - Support regular expressions and longer delimiters in read_table/read_csv,
+    but does not handle quoted strings yet (GH #364)
 
 **Improvements to existing features**
 
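The release entry above describes the new behavior: a `sep` longer than one character is treated as a regular expression rather than a literal delimiter. A minimal usage sketch against the 0.5.1-era API shown in this commit (the sample data mirrors the new test added below):

```python
from StringIO import StringIO
from pandas import read_table

# Whitespace-aligned text; '\s+' is longer than one character, so it is
# interpreted as a regular expression rather than a literal delimiter.
data = """   A   B   C   D
a   1   2   3   4
b   1   2   3   4
c   1   2   3   4
"""

df = read_table(StringIO(data), sep='\s+')
print df  # the unlabeled first column (a, b, c) becomes the index
```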

pandas/io/parsers.py

Lines changed: 19 additions & 16 deletions
@@ -2,6 +2,7 @@
 Module contains tools for processing files into DataFrames or other objects
 """
 from StringIO import StringIO
+import re
 import zipfile
 
 import numpy as np
@@ -25,22 +26,23 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None,
     except Exception: # pragma: no cover
         f = open(filepath_or_buffer, 'r')
 
-    sniff_sep = True
-    # default dialect
-    dia = csv.excel
-    if sep is not None:
-        sniff_sep = False
-        dia.delimiter = sep
-    # attempt to sniff the delimiter
-    if sniff_sep:
-        line = f.readline()
-        sniffed = csv.Sniffer().sniff(line)
-        dia.delimiter = sniffed.delimiter
-        buf = list(csv.reader(StringIO(line), dialect=dia))
+    buf = []
+    if sep is None or len(sep) == 1:
+        sniff_sep = True
+        # default dialect
+        dia = csv.excel
+        if sep is not None:
+            sniff_sep = False
+            dia.delimiter = sep
+        # attempt to sniff the delimiter
+        if sniff_sep:
+            line = f.readline()
+            sniffed = csv.Sniffer().sniff(line)
+            dia.delimiter = sniffed.delimiter
+            buf.extend(list(csv.reader(StringIO(line), dialect=dia)))
+        reader = csv.reader(f, dialect=dia)
     else:
-        buf = []
-
-    reader = csv.reader(f, dialect=dia)
+        reader = (re.split(sep, line.strip()) for line in f)
 
     if date_parser is not None:
         parse_dates = True
@@ -73,7 +75,7 @@ def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None,
                     nrows=nrows, iterator=iterator, chunksize=chunksize,
                     skip_footer=skip_footer, converters=converters)
 
-def read_clipboard(**kwargs):  # pragma: no cover
+def read_clipboard(sep='\s+', **kwargs):  # pragma: no cover
     """
     Read text from clipboard and pass to read_table. See read_table for the full
     argument list
@@ -84,6 +86,7 @@ def read_clipboard(**kwargs):  # pragma: no cover
     """
     from pandas.util.clipboard import clipboard_get
     text = clipboard_get()
+    kwargs['sep'] = sep
     return read_table(StringIO(text), **kwargs)
 
 _parser_params = """Also supports optionally iterating or breaking of the file
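The hunk above keeps the csv module (with optional delimiter sniffing) for separators of at most one character and falls back to a per-line re.split otherwise. A self-contained sketch of that dispatch follows; `make_reader` is a hypothetical helper name used only for illustration, not part of the pandas API:

```python
import csv
import re
from StringIO import StringIO

def make_reader(f, sep=None):
    """Sketch of the dispatch this commit adds to read_csv: return
    (buffered_rows, row_iterator) for a file-like object `f`."""
    buf = []
    if sep is None or len(sep) == 1:
        # Single-character (or unspecified) separator: let the csv module do
        # the work, sniffing the delimiter from the first line if needed.
        dia = csv.excel
        if sep is not None:
            dia.delimiter = sep
        else:
            line = f.readline()
            dia.delimiter = csv.Sniffer().sniff(line).delimiter
            # The sniffed line has already been consumed, so keep its rows.
            buf.extend(list(csv.reader(StringIO(line), dialect=dia)))
        return buf, csv.reader(f, dialect=dia)
    # Longer separator: treat it as a regular expression and split each line.
    return buf, (re.split(sep, line.strip()) for line in f)
```

As the release entry notes, the regex branch splits raw lines and therefore cannot respect CSV quoting; this also explains the new `read_clipboard(sep='\s+', ...)` default, which routes pasted whitespace-aligned tables through the regex path.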

pandas/io/tests/test_parsers.py

Lines changed: 13 additions & 0 deletions
@@ -2,6 +2,7 @@
 from datetime import datetime
 import csv
 import os
+import re
 import unittest
 
 import nose
@@ -428,6 +429,18 @@ def test_converters(self):
         assert_frame_equal(result, expected)
         assert_frame_equal(result2, expected)
 
+    def test_regex_separator(self):
+        data = """   A   B   C   D
+a   1   2   3   4
+b   1   2   3   4
+c   1   2   3   4
+"""
+        df = read_table(StringIO(data), sep='\s+')
+        expected = read_csv(StringIO(re.sub('[ ]+', ',', data)),
+                            index_col=0)
+        self.assert_(expected.index.name is None)
+        assert_frame_equal(df, expected)
+
 class TestParseSQL(unittest.TestCase):
 
     def test_convert_sql_column_floats(self):
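The new test covers the happy path; the caveat in the release entry ("does not handle quoted strings yet") follows directly from the per-line re.split shown above, since a regex split knows nothing about CSV quoting. A small illustration, using a made-up multi-character delimiter and the same splitting approach:

```python
import re

line = 'a|||"x|||y"|||b'

# Splitting on runs of '|' breaks apart the quoted field, whereas a
# quote-aware parser would keep "x|||y" together as one value.
print re.split('\|+', line.strip())
# ['a', '"x', 'y"', 'b']
```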
