PERF/API: concat improvements by jreback · Pull Request #6438 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

PERF/API: concat improvements #6438

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 21, 2014
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
API: concat will now concatenate mixed Series and DataFrames using th…
…e Series name

     or numbering columns as needed (GH2385)
  • Loading branch information
jreback committed Feb 21, 2014
commit c6b21b4742fe716ea6aa02b458682b5e0ee340f7
27 changes: 27 additions & 0 deletions doc/source/merging.rst
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,33 @@ This is also a valid argument to ``DataFrame.append``:

df1.append(df2, ignore_index=True)

.. _merging.mixed_ndims:

Concatenating with mixed ndims
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

You can concatenate a mix of Series and DataFrames. The
Series will be transformed to DataFrames with the column name as
the name of the Series.

.. ipython:: python

df1 = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D'])
s1 = Series(randn(6), name='foo')
concat([df1, s1],axis=1)

If unnamed Series are passed they will be numbered consecutively.

.. ipython:: python

s2 = Series(randn(6))
concat([df1, s2, s2, s2],axis=1)

Passing ``ignore_index=True`` will drop all name references.

.. ipython:: python

concat([df1, s1],axis=1,ignore_index=True)

More concatenating with group keys
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ API Changes
- The top-level :func:`pandas.eval` function does not allow you to use the
``'@'`` prefix and provides you with an error message telling you so.
- ``NameResolutionError`` was removed because it isn't necessary anymore.
- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
or numbering columns as needed (:issue:`2385`)

Experimental Features
~~~~~~~~~~~~~~~~~~~~~
Expand Down
2 changes: 2 additions & 0 deletions doc/source/v0.14.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ API changes
- The top-level :func:`pandas.eval` function does not allow you to use the
``'@'`` prefix and provides you with an error message telling you so.
- ``NameResolutionError`` was removed because it isn't necessary anymore.
- ``concat`` will now concatenate mixed Series and DataFrames using the Series name
or numbering columns as needed (:issue:`2385`). See :ref:`the docs <merging.mixed_ndims>`

MultiIndexing Using Slicers
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
76 changes: 61 additions & 15 deletions pandas/tools/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -970,40 +970,86 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
objs = clean_objs
keys = clean_keys

# consolidate data
self.objs = []
if len(objs) == 0:
raise Exception('All objects passed were None')

# consolidate data & figure out what our result ndim is going to be
ndims = set()
for obj in objs:
if not isinstance(obj, NDFrame):
raise TypeError("cannot concatenate a non-NDFrame object")

# skip completely empty
if not np.sum(obj.shape):
continue

# consolidate
obj.consolidate(inplace=True)
self.objs.append(obj)
ndims.add(obj.ndim)

# get the sample
# want the highest ndim that we have, and must be non-empty
# unless all objs are empty
sample = None
if len(ndims) > 1:
max_ndim = max(ndims)
for obj in objs:
if obj.ndim == max_ndim and np.sum(obj.shape):
sample = obj
break

if len(self.objs) == 0:
raise Exception('All objects passed were None')

# need the first non-empty object as a sample
sample = next(obj for obj in self.objs if np.prod(obj.shape))
else:
# filter out the empties
# if we have no multi-index possibilities
df = DataFrame([ obj.shape for obj in objs ]).sum(1)
non_empties = df[df!=0]
if len(non_empties) and (keys is None and names is None and levels is None and join_axes is None):
objs = [ objs[i] for i in non_empties.index ]
sample = objs[0]

if sample is None:
sample = objs[0]
self.objs = objs

# Need to flip BlockManager axis in the DataFrame special case
if isinstance(sample, DataFrame):
self._is_frame = isinstance(sample, DataFrame)
if self._is_frame:
axis = 1 if axis == 0 else 0

self._is_series = isinstance(sample, ABCSeries)
if not 0 <= axis <= sample.ndim:
raise AssertionError("axis must be between 0 and {0}, "
"input was {1}".format(sample.ndim, axis))

# if we have mixed ndims, then convert to highest ndim
# creating column numbers as needed
if len(ndims) > 1:
current_column = 0
max_ndim = sample.ndim
self.objs, objs = [], self.objs
for obj in objs:

ndim = obj.ndim
if ndim == max_ndim:
pass

elif ndim != max_ndim-1:
raise ValueError("cannot concatenate unaligned mixed "
"dimensional NDFrame objects")

else:
name = getattr(obj,'name',None)
if ignore_index or name is None:
name = current_column
current_column += 1

# doing a row-wise concatenation so need everything
# to line up
if self._is_frame and axis == 1:
name = 0
obj = sample._constructor({ name : obj })

self.objs.append(obj)

# note: this is the BlockManager axis (since DataFrame is transposed)
self.axis = axis

self.join_axes = join_axes

self.keys = keys
self.names = names
self.levels = levels
Expand Down
71 changes: 62 additions & 9 deletions pandas/tools/tests/test_merge.py
9E88
Original file line number Diff line number Diff line change
Expand Up @@ -1657,11 +1657,73 @@ def test_handle_empty_objects(self):
# GH3259
df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s'))
empty = DataFrame()
result = concat([df,empty],axis=1)
assert_frame_equal(result, df)
result = concat([empty,df],axis=1)
assert_frame_equal(result, df)

result = concat([df,empty])
assert_frame_equal(result, df)
result = concat([empty,df])
assert_frame_equal(result, df)

def test_concat_mixed_objs(self):

# concat mixed series/frames
# GH2385

# axis 1
index=date_range('01-Jan-2013', periods=10, freq='H')
arr = np.arange(10, dtype='int64')
s1 = Series(arr, index=index)
s2 = Series(arr, index=index)
df = DataFrame(arr.reshape(-1,1), index=index)

expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 0])
result = concat([df,df], axis=1)
assert_frame_equal(result, expected)

expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 1])
result = concat([s1,s2], axis=1)
assert_frame_equal(result, expected)

expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2])
result = concat([s1,s2,s1], axis=1)
assert_frame_equal(result, expected)

expected = DataFrame(np.repeat(arr,5).reshape(-1,5), index=index, columns = [0, 0, 1, 2, 3])
result = concat([s1,df,s2,s2,s1], axis=1)
assert_frame_equal(result, expected)

# with names
s1.name = 'foo'
expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 0])
result = concat([s1,df,s2], axis=1)
assert_frame_equal(result, expected)

s2.name = 'bar'
expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 'bar'])
result = concat([s1,df,s2], axis=1)
assert_frame_equal(result, expected)

# ignore index
expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2])
result = concat([s1,df,s2], axis=1, ignore_index=True)
assert_frame_equal(result, expected)

# axis 0
expected = DataFrame(np.tile(arr,3).reshape(-1,1), index=index.tolist() * 3, columns = [0])
result = concat([s1,df,s2])
assert_frame_equal(result, expected)

expected = DataFrame(np.tile(arr,3).reshape(-1,1), columns = [0])
result = concat([s1,df,s2], ignore_index=True)
assert_frame_equal(result, expected)

# invalid concatenation of mixed dims
panel = tm.makePanel()
self.assertRaises(ValueError, lambda : concat([panel,s1],axis=1))

def test_panel_join(self):
panel = tm.makePanel()
tm.add_nans(panel)
Expand Down Expand Up @@ -1991,15 +2053,6 @@ def test_concat_invalid_first_argument(self):
# generator ok though
concat(DataFrame(np.random.rand(5,5)) for _ in range(3))

def test_concat_mixed_types_fails(self):
df = DataFrame(randn(10, 1))

with tm.assertRaisesRegexp(TypeError, "Cannot concatenate.+"):
concat([df[0], df], axis=1)

with tm.assertRaisesRegexp(TypeError, "Cannot concatenate.+"):
concat([df, df[0]], axis=1)

class TestOrderedMerge(tm.TestCase):

def setUp(self):
Expand Down
4 changes: 2 additions & 2 deletions vb_suite/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,9 +195,9 @@ def sample(values, k):
empty = DataFrame()
"""

concat_empty_frames1 = Benchmark('concat([df,empty)', setup,
concat_empty_frames1 = Benchmark('concat([df,empty])', setup,
start_date=datetime(2012, 1, 1))
concat_empty_frames2 = Benchmark('concat([empty,df)', setup,
concat_empty_frames2 = Benchmark('concat([empty,df])', setup,
start_date=datetime(2012, 1, 1))


Expand Down
0