ENH: enable regular expressions / long delimiters in read_table/csv, GH #364 · pandas-dev/pandas@ab92792 · GitHub

Commit ab92792

ENH: enable regular expressions / long delimiters in read_table/csv, GH #364
1 parent 3e2e7af commit ab92792

File tree: 3 files changed, +34 -16 lines

  RELEASE.rst
  pandas/io/parsers.py
  pandas/io/tests/test_parsers.py

RELEASE.rst

Lines changed: 2 additions & 0 deletions
@@ -65,6 +65,8 @@ pandas 0.5.1
     should work across platforms (GH #300)
   - Add `nunique` function to Series for counting unique elements (GH #297)
   - DataFrame constructor will use Series name if no columns passed (GH #373)
+  - Support regular expressions and longer delimiters in read_table/read_csv,
+    but does not handle quoted strings yet (GH #364)
 
 **Improvements to existing features**
 
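The release entry above describes the new behavior: a `sep` longer than one character is treated as a regular expression rather than a literal delimiter. A minimal usage sketch against the 0.5.1-era API shown in this commit (the sample data mirrors the new test added below):

```python
from StringIO import StringIO
from pandas import read_table

# Whitespace-aligned text; '\s+' is longer than one character, so it is
# interpreted as a regular expression rather than a literal delimiter.
data = """   A   B   C   D
a   1   2   3   4
b   1   2   3   4
c   1   2   3   4
"""

df = read_table(StringIO(data), sep='\s+')
print df  # the unlabeled first column (a, b, c) becomes the index
```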

pandas/io/parsers.py

Lines changed: 19 additions & 16 deletions
@@ -2,6 +2,7 @@
 Module contains tools for processing files into DataFrames or other objects
 """
 from StringIO import StringIO
+import re
 import zipfile
 
 import numpy as np
@@ -25,22 +26,23 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None,
     except Exception: # pragma: no cover
         f = open(filepath_or_buffer, 'r')
 
-    sniff_sep = True
-    # default dialect
-    dia = csv.excel
-    if sep is not None:
-        sniff_sep = False
-        dia.delimiter = sep
-    # attempt to sniff the delimiter
-    if sniff_sep:
-        line = f.readline()
-        sniffed = csv.Sniffer().sniff(line)
-        dia.delimiter = sniffed.delimiter
-        buf = list(csv.reader(StringIO(line), dialect=dia))
+    buf = []
+    if sep is None or len(sep) == 1:
+        sniff_sep = True
+        # default dialect
+        dia = csv.excel
+        if sep is not None:
+            sniff_sep = False
+            dia.delimiter = sep
+        # attempt to sniff the delimiter
+        if sniff_sep:
+            line = f.readline()
+            sniffed = csv.Sniffer().sniff(line)
+            dia.delimiter = sniffed.delimiter
+            buf.extend(list(csv.reader(StringIO(line), dialect=dia)))
+        reader = csv.reader(f, dialect=dia)
     else:
-        buf = []
-
-    reader = csv.reader(f, dialect=dia)
+        reader = (re.split(sep, line.strip()) for line in f)
 
     if date_parser is not None:
         parse_dates = True
@@ -73,7 +75,7 @@ def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None,
                     nrows=nrows, iterator=iterator, chunksize=chunksize,
                     skip_footer=skip_footer, converters=converters)
 
-def read_clipboard(**kwargs):  # pragma: no cover
+def read_clipboard(sep='\s+', **kwargs):  # pragma: no cover
     """
     Read text from clipboard and pass to read_table. See read_table for the full
     argument list
@@ -84,6 +86,7 @@ def read_clipboard(**kwargs):  # pragma: no cover
     """
     from pandas.util.clipboard import clipboard_get
     text = clipboard_get()
+    kwargs['sep'] = sep
     return read_table(StringIO(text), **kwargs)
 
 _parser_params = """Also supports optionally iterating or breaking of the file
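The hunk above keeps the csv module (with optional delimiter sniffing) for separators of at most one character and falls back to a per-line re.split otherwise. A self-contained sketch of that dispatch follows; `make_reader` is a hypothetical helper name used only for illustration, not part of the pandas API:

```python
import csv
import re
from StringIO import StringIO

def make_reader(f, sep=None):
    """Sketch of the dispatch this commit adds to read_csv: return
    (buffered_rows, row_iterator) for a file-like object `f`."""
    buf = []
    if sep is None or len(sep) == 1:
        # Single-character (or unspecified) separator: let the csv module do
        # the work, sniffing the delimiter from the first line if needed.
        dia = csv.excel
        if sep is not None:
            dia.delimiter = sep
        else:
            line = f.readline()
            dia.delimiter = csv.Sniffer().sniff(line).delimiter
            # The sniffed line has already been consumed, so keep its rows.
            buf.extend(list(csv.reader(StringIO(line), dialect=dia)))
        return buf, csv.reader(f, dialect=dia)
    # Longer separator: treat it as a regular expression and split each line.
    return buf, (re.split(sep, line.strip()) for line in f)
```

As the release entry notes, the regex branch splits raw lines and therefore cannot respect CSV quoting; this also explains the new `read_clipboard(sep='\s+', ...)` default, which routes pasted whitespace-aligned tables through the regex path.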

pandas/io/tests/test_parsers.py

Lines changed: 13 additions & 0 deletions
@@ -2,6 +2,7 @@
 from datetime import datetime
 import csv
 import os
+import re
 import unittest
 
 import nose
@@ -428,6 +429,18 @@ def test_converters(self):
         assert_frame_equal(result, expected)
         assert_frame_equal(result2, expected)
 
+    def test_regex_separator(self):
+        data = """   A   B   C   D
+a   1   2   3   4
+b   1   2   3   4
+c   1   2   3   4
+"""
+        df = read_table(StringIO(data), sep='\s+')
+        expected = read_csv(StringIO(re.sub('[ ]+', ',', data)),
+                            index_col=0)
+        self.assert_(expected.index.name is None)
+        assert_frame_equal(df, expected)
+
 class TestParseSQL(unittest.TestCase):
 
     def test_convert_sql_column_floats(self):
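The new test covers the happy path; the caveat in the release entry ("does not handle quoted strings yet") follows directly from the per-line re.split shown above, since a regex split knows nothing about CSV quoting. A small illustration, using a made-up multi-character delimiter and the same splitting approach:

```python
import re

line = 'a|||"x|||y"|||b'

# Splitting on runs of '|' breaks apart the quoted field, whereas a
# quote-aware parser would keep "x|||y" together as one value.
print re.split('\|+', line.strip())
# ['a', '"x', 'y"', 'b']
```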
