ENH: allow to_csv to write multi-index columns, read_csv to read with header=list arg by jreback · Pull Request #3575 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

ENH: allow to_csv to write multi-index columns, read_csv to read with header=list arg #3575

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
May 19, 2013
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
ENH: Allow read_csv to handle multi-index in columns
     GH3571, GH1651, GH3141
  • Loading branch information
jreback committed May 19, 2013
commit cc93d614eaa3e3c46daf340a4aae58b56a0fa226
69 changes: 35 additions & 34 deletions pandas/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -963,48 +963,49 @@ def _save_header(self):
encoded_labels = []

has_aliases = isinstance(header, (tuple, list, np.ndarray))
if has_aliases or self.header:
if not (has_aliases or self.header):
return

if self.index:
# should write something for index label
if index_label is not False:
if index_label is None:
if isinstance(obj.index, MultiIndex):
index_label = []
for i, name in enumerate(obj.index.names):
if name is None:
name = ''
index_label.append(name)
if self.index:
# should write something for index label
if index_label is not False:
if index_label is None:
if isinstance(obj.index, MultiIndex):
index_label = []
for i, name in enumerate(obj.index.names):
if name is None:
name = ''
index_label.append(name)
else:
                        index_label = obj.index.name
if index_label is None:
index_label = ['']
else:
index_label = obj.index.name
if index_label is None:
index_label = ['']
else:
index_label = [index_label]
elif not isinstance(index_label, (list, tuple, np.ndarray)):
# given a string for a DF with Index
index_label = [index_label]
index_label = [index_label]
elif not isinstance(index_label, (list, tuple, np.ndarray)):
# given a string for a DF with Index
index_label = [index_label]

encoded_labels = list(index_label)
else:
encoded_labels = []
encoded_labels = list(index_label)
else:
encoded_labels = []

if has_aliases:
if len(header) != len(cols):
raise ValueError(('Writing %d cols but got %d aliases'
% (len(cols), len(header))))
else:
write_cols = header
if has_aliases:
if len(header) != len(cols):
raise ValueError(('Writing %d cols but got %d aliases'
% (len(cols), len(header))))
else:
write_cols = cols
write_cols = header
else:
write_cols = cols

if not has_mi_columns:
encoded_labels += list(write_cols)
if not has_mi_columns:
encoded_labels += list(write_cols)

else:
else:

if not has_mi_columns:
encoded_labels += list(cols)
if not has_mi_columns:
encoded_labels += list(cols)

# write out the mi
if has_mi_columns:
Expand Down
53 changes: 42 additions & 11 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,19 +677,18 @@ def read(self, nrows=None):
if self.options.get('as_recarray'):
return ret

index, columns, col_dict = ret

# May alter columns / col_dict
# index, columns, col_dict = self._create_index(col_dict, columns)
index, columns, col_dict = self._create_index(ret)

df = DataFrame(col_dict, columns=columns, index=index)

if self.squeeze and len(df.columns) == 1:
return df[df.columns[0]]
return df

def _create_index(self, col_dict, columns):
pass
def _create_index(self, ret):
index, columns, col_dict = ret
return index, columns, col_dict

def get_chunk(self, size=None):
if size is None:
Expand All @@ -709,6 +708,7 @@ def __init__(self, kwds):

self.index_col = kwds.pop('index_col', None)
self.index_names = None
self.col_names = None

self.parse_dates = kwds.pop('parse_dates', False)
self.date_parser = kwds.pop('date_parser', None)
Expand Down Expand Up @@ -942,7 +942,32 @@ def __init__(self, src, **kwds):
if self._reader.header is None:
self.names = None
else:
self.names = list(self._reader.header)
if len(self._reader.header) > 1:
# the names are the tuples of the header that are not the index cols
# 0 is the name of the index, assuming index_col is a list of column
# numbers
if (self._reader.leading_cols == 0 and
_is_index_col(self.index_col)):
ic = self.index_col
if not isinstance(ic, (list,tuple,np.ndarray)):
ic = [ ic ]
sic = set(ic)

header = list(self._reader.header)
index_names = header.pop(-1)
self.index_names = [ index_names[i] for i in ic ]
field_count = len(header[0])

def extract(r):
return tuple([ r[i] for i in range(field_count) if i not in sic ])

self.names = ic + zip(*[ extract(r) for r in header ])
self.col_names = [ r[0] if len(r[0]) else None for r in header ]
passed_names = True
else:
raise Exception("must have an index_col when have a multi-index specified")
else:
self.names = list(self._reader.header[0])

if self.names is None:
if self.prefix:
Expand All @@ -958,12 +983,14 @@ def __init__(self, src, **kwds):

if not self._has_complex_date_col:
if (self._reader.leading_cols == 0 and
_is_index_col(self.index_col)):
_is_index_col(self.index_col)):

self._name_processed = True
(self.index_names, self.names,
self.index_col) = _clean_index_names(self.names,
self.index_col)
(index_names, self.names,
self.index_col) = _clean_index_names(self.names, self.index_col)

if self.index_names is None:
self.index_names = index_names

if self._reader.header is None and not passed_names:
self.index_names = [None] * len(self.index_names)
Expand Down Expand Up @@ -1051,6 +1078,10 @@ def read(self, nrows=None):
names, data = self._do_date_conversions(names, data)
index = self._make_index(data, alldata, names)

# possibly create a column mi here
if all([ isinstance(c,tuple) for c in names]):
names = MultiIndex.from_tuples(names,names=self.col_names)

return index, names, data

def _filter_usecols(self, names):
Expand All @@ -1061,7 +1092,7 @@ def _filter_usecols(self, names):
return names

def _get_index_names(self):
names = list(self._reader.header)
names = list(self._reader.header[0])
idx_names = None

if self._reader.leading_cols == 0 and self.index_col is not None:
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/tests/test_cparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def test_header_not_enough_lines(self):
reader = TextReader(StringIO(data), delimiter=',', header=2,
as_recarray=True)
header = reader.header
expected = ['a', 'b', 'c']
expected = [['a', 'b', 'c']]
self.assertEquals(header, expected)

recs = reader.read()
Expand Down
113 changes: 70 additions & 43 deletions pandas/src/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ cdef extern from "parser/tokenizer.h":
char thousands

int header # Boolean: 1: has header, 0: no header
int header_start # header row start
int header_end # header row end

void *skipset
int skip_footer
Expand Down Expand Up @@ -242,7 +244,7 @@ cdef class TextReader:
object na_values, true_values, false_values
object memory_map
object as_recarray
object header, names
object header, names, header_start, header_end
object low_memory
object skiprows
object compact_ints, use_unsigned
Expand All @@ -256,6 +258,8 @@ cdef class TextReader:
delimiter=b',',

header=0,
header_start=0,
header_end=0,
names=None,

memory_map=False,
Expand Down Expand Up @@ -435,11 +439,28 @@ cdef class TextReader:
# TODO: no header vs. header is not the first row
if header is None:
# sentinel value
self.parser.header_start = -1
self.parser.header_end = -1
self.parser.header = -1
self.parser_start = 0
self.header = []
else:
self.parser.header = header
self.parser_start = header + 1
if isinstance(header, list) and len(header):
# need to artifically skip the final line
# which is still a header line
header.append(header[-1]+1)

self.parser.header_start = header[0]
self.parser.header_end = header[-1]
self.parser.header = header[0]
                self.parser_start = header[-1] + 1
self.header = header
else:
self.parser.header_start = header
self.parser.header_end = header
self.parser.header = header
self.parser_start = header + 1
self.header = [ header ]

self.names = names
self.header, self.table_width = self._get_header()
Expand Down Expand Up @@ -534,8 +555,10 @@ cdef class TextReader:
' got %s type' % type(source))

cdef _get_header(self):
# header is now a list of lists, so field_count should use header[0]

cdef:
size_t i, start, data_line, field_count, passed_count
size_t i, start, data_line, field_count, passed_count, hr
char *word
object name
int status
Expand All @@ -544,49 +567,53 @@ cdef class TextReader:

header = []

if self.parser.header >= 0:
# Header is in the file
if self.parser.header_start >= 0:

if self.parser.lines < self.parser.header + 1:
self._tokenize_rows(self.parser.header + 2)

# e.g., if header=3 and file only has 2 lines
if self.parser.lines < self.parser.header + 1:
raise CParserError('Passed header=%d but only %d lines in file'
% (self.parser.header, self.parser.lines))
# Header is in the file
for hr in self.header:

field_count = self.parser.line_fields[self.parser.header]
start = self.parser.line_start[self.parser.header]
this_header = []

# TODO: Py3 vs. Py2
counts = {}
for i in range(field_count):
word = self.parser.words[start + i]
if self.parser.lines < hr + 1:
self._tokenize_rows(hr + 2)

if self.c_encoding == NULL and not PY3:
name = PyBytes_FromString(word)
else:
if self.c_encoding == NULL or self.c_encoding == b'utf-8':
name = PyUnicode_FromString(word)
else:
name = PyUnicode_Decode(word, strlen(word),
self.c_encoding, errors)
# e.g., if header=3 and file only has 2 lines
if self.parser.lines < hr + 1:
raise CParserError('Passed header=%d but only %d lines in file'
% (self.parser.header, self.parser.lines))

if name == '':
name = 'Unnamed: %d' % i
field_count = self.parser.line_fields[hr]
start = self.parser.line_start[hr]

# TODO: Py3 vs. Py2
counts = {}
for i in range(field_count):
word = self.parser.words[start + i]

count = counts.get(name, 0)
if count > 0 and self.mangle_dupe_cols:
header.append('%s.%d' % (name, count))
else:
header.append(name)
counts[name] = count + 1
if self.c_encoding == NULL and not PY3:
name = PyBytes_FromString(word)
else:
if self.c_encoding == NULL or self.c_encoding == b'utf-8':
name = PyUnicode_FromString(word)
else:
name = PyUnicode_Decode(word, strlen(word),
self.c_encoding, errors)

if name == '':
name = 'Unnamed: %d' % i

count = counts.get(name, 0)
if count > 0 and self.mangle_dupe_cols:
this_header.append('%s.%d' % (name, count))
else:
this_header.append(name)
counts[name] = count + 1

data_line = self.parser.header + 1
data_line = hr + 1
header.append(this_header)

if self.names is not None:
header = self.names
header = [ self.names ]

elif self.names is not None:
# Enforce this unless usecols
Expand All @@ -597,11 +624,11 @@ cdef class TextReader:
if self.parser.lines < 1:
self._tokenize_rows(1)

header = self.names
header = [ self.names ]
data_line = 0

if self.parser.lines < 1:
field_count = len(header)
field_count = len(header[0])
else:
field_count = self.parser.line_fields[data_line]
else:
Expand All @@ -613,7 +640,7 @@ cdef class TextReader:

# Corner case, not enough lines in the file
if self.parser.lines < data_line + 1:
field_count = len(header)
field_count = len(header[0])
else: # not self.has_usecols:

field_count = self.parser.line_fields[data_line]
Expand All @@ -622,7 +649,7 @@ cdef class TextReader:
if self.names is not None:
field_count = max(field_count, len(self.names))

passed_count = len(header)
passed_count = len(header[0])

# if passed_count > field_count:
# raise CParserError('Column names have %d fields, '
Expand Down Expand Up @@ -1038,10 +1065,10 @@ cdef class TextReader:
if self.header is not None:
j = i - self.leading_cols
# hack for #2442
if j == len(self.header):
if j == len(self.header[0]):
return j
else:
return self.header[j]
return self.header[0][j]
else:
return None

Expand Down
Loading
0