E579 Fix a delimiter detection problem in sniffer. Sniffing "a|b|c\r\n" was · python/cpython@39b29be · GitHub
[go: up one dir, main page]

Skip to content

Commit 39b29be

Browse files
author
Skip Montanaro
committed
Fix a delimiter detection problem in sniffer. Sniffing "a|b|c\r\n" was
returning 'a' as the delimiter. It now returns '|', but not because I have a better understanding of what the code is supposed to do. Would someone who understands the idea behind _guess_delimiter() (see its docstring) check whether my fallback choice is actually better than before, or whether it's just serendipity that I picked the proper delimiter?
1 parent 0174ddd commit 39b29be

File tree

2 files changed

+17
-3
lines changed

2 files changed

+17
-3
lines changed

Lib/csv.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -152,10 +152,13 @@ def sniff(self, sample, delimiters=None):
152152

153153
quotechar, delimiter, skipinitialspace = \
154154
self._guess_quote_and_delimiter(sample, delimiters)
155-
if delimiter is None:
155+
if not delimiter:
156156
delimiter, skipinitialspace = self._guess_delimiter(sample,
157157
delimiters)
158158

159+
if not delimiter:
160+
raise Error, "Could not determine delimiter"
161+
159162
class dialect(Dialect):
160163
_name = "sniffed"
161164
lineterminator = '\r\n'
@@ -329,8 +332,12 @@ def _guess_delimiter(self, data, delimiters):
329332
data[0].count("%c " % d))
330333
return (d, skipinitialspace)
331334

332-
# finally, just return the first damn character in the list
333-
delim = delims.keys()[0]
335+
# nothing else indicates a preference, pick the character that
336+
# dominates(?)
337+
items = [(v,k) for (k,v) in delims.items()]
338+
items.sort()
339+
delim = items[-1][1]
340+
334341
skipinitialspace = (data[0].count(delim) ==
335342
data[0].count("%c " % delim))
336343
return (delim, skipinitialspace)

Lib/test/test_csv.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -852,6 +852,8 @@ class TestSniffer(unittest.TestCase):
852852
'''
853853

854854
sample5 = "aaa\tbbb\r\nAAA\t\r\nBBB\t\r\n"
855+
sample6 = "a|b|c\r\nd|e|f\r\n"
856+
sample7 = "'a'|'b'|'c'\r\n'd'|e|f\r\n"
855857

856858
def test_has_header(self):
857859
sniffer = csv.Sniffer()
@@ -882,6 +884,11 @@ def test_delimiters(self):
882884
self.assertEqual(dialect.delimiter, ";")
883885
dialect = sniffer.sniff(self.sample5)
884886
self.assertEqual(dialect.delimiter, "\t")
887+
dialect = sniffer.sniff(self.sample6)
888+
self.assertEqual(dialect.delimiter, "|")
889+
dialect = sniffer.sniff(self.sample7)
890+
self.assertEqual(dialect.delimiter, "|")
891+
self.assertEqual(dialect.quotechar, "'")
885892

886893
if not hasattr(sys, "gettotalrefcount"):
887894
if test_support.verbose: print "*** skipping leakage tests ***"

0 commit comments

Comments
 (0)
0