Issues #194 (cov function) and #92 (multi-column sort) by changhiskhan · Pull Request #362 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

Issues #194 (cov function) and #92 (multi-column sort) #362

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 44 additions & 22 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1491,11 +1491,7 @@ def sort(self, column=None, axis=0, ascending=True):
-------
sorted : DataFrame
"""
by = None
if column:
assert(axis == 0)
by = self[column].values
return self.sort_index(by=by, axis=axis, ascending=ascending)
return self.sort_index(by=column, axis=axis, ascending=ascending)

def sort_index(self, axis=0, by=None, ascending=True):
"""
Expand All @@ -1507,7 +1503,7 @@ def sort_index(self, axis=0, by=None, ascending=True):
axis : {0, 1}
Sort index/rows versus columns
by : object
Column name in frame
Column names in frame
ascending : boolean, default True
Sort ascending vs. descending

Expand All @@ -1516,17 +1512,20 @@ def sort_index(self, axis=0, by=None, ascending=True):
sorted : DataFrame
"""
labels = self._get_axis(axis)

order_list = None
if by is not None:
try:
if by in self.columns:
assert(axis == 0)
by = self[by].values
except Exception:
pass

assert(len(by) == len(labels))
sort_index = Series(by, index=labels).order().index
assert(axis == 0)
by = self[by]

if isinstance(by, Series):
assert(len(by) == len(labels))
by = by.values
sort_index = Series(by, index=labels).order().index
elif isinstance(by, DataFrame):
assert(len(by.index) == len(labels))
type_list = [(col_name, by[col_name].dtype) for col_name in by.columns]
sort_arr = np.array([tuple(r) for r in by.values], dtype=type_list)
sort_index = labels.take(sort_arr.argsort(order=by.columns.tolist()))
else:
sort_index = labels.take(labels.argsort())

Expand Down Expand Up @@ -2485,19 +2484,42 @@ def corr(self):
sigma = np.sqrt(np.diag(baseCov))
correl = baseCov / np.outer(sigma, sigma)

# Get the covariance with items that have NaN values
for i, j, ac, bc in self._cov_helper(mat):
c = np.corrcoef(ac, bc)[0, 1]
correl[i, j] = c
correl[j, i] = c

return self._constructor(correl, index=cols, columns=cols)

def cov(self):
    """
    Compute pairwise covariance of columns, excluding NA/null values

    Returns
    -------
    y : DataFrame
        Square frame indexed and labelled by this frame's columns
    """
    cols = self.columns
    mat = self.as_matrix(cols).T

    # np.cov squeezes its output, so a single-column frame would come back
    # as a 0-d array; force 2-D so item assignment and the constructor work.
    baseCov = np.atleast_2d(np.cov(mat))

    # Entries involving a column with missing values are NaN in baseCov;
    # recompute them from the observations both columns have in common.
    for i, j, ac, bc in self._cov_helper(mat):
        c = np.cov(ac, bc)[0, 1]
        baseCov[i, j] = c
        baseCov[j, i] = c

    return self._constructor(baseCov, index=cols, columns=cols)

def _cov_helper(self, mat):
    """
    Yield the jointly-finite observations for each pair of rows whose
    first row contains at least one non-finite value

    Parameters
    ----------
    mat : ndarray
        2-D array, one variable per row

    Yields
    ------
    (i, j, ac, bc) : tuple
        Row positions i and j, followed by the values of row i and row j
        restricted to the positions where both rows are finite. Pairs
        with no finite position in common are not yielded.
    """
    mask = np.isfinite(mat)
    for i, A in enumerate(mat):
        # Fully finite rows need no pairwise recomputation here.
        if mask[i].all():
            continue
        for j, B in enumerate(mat):
            in_common = mask[i] & mask[j]
            if in_common.any():
                yield i, j, A[in_common], B[in_common]

def corrwith(self, other, axis=0, drop=False):
"""
Expand Down
31 changes: 28 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -903,15 +903,40 @@ def corr(self, other):
-------
correlation : float
"""
this, that = self._get_nonna_aligned(other)
if this is None or that is None:
return nan
return np.corrcoef(this, that)[0, 1]

def cov(self, other):
    """
    Covariance between this Series and another, ignoring missing values

    Parameters
    ----------
    other : Series

    Returns
    -------
    covariance : float
        NaN when `_get_nonna_aligned` hands back None for either side
    """
    lhs, rhs = self._get_nonna_aligned(other)
    if lhs is not None and rhs is not None:
        # Off-diagonal entry of the 2x2 covariance matrix
        return np.cov(lhs, rhs)[0, 1]
    return nan

def _get_nonna_aligned(self, other):
    """
    Returns two sub-Series with the same index and only non-na values

    Parameters
    ----------
    other : Series

    Returns
    -------
    (this, that) : tuple of Series
        Both reindexed to the labels that are non-na in self and in other;
        (None, None) when there are no such labels
    """
    # NOTE(review): a reviewer on this change suggested the alternative
    #   this, that = self.dropna().align(other.dropna(), join='inner')
    commonIdx = self.dropna().index.intersection(other.dropna().index)

    # Always return a 2-tuple so callers can unpack unconditionally.
    if len(commonIdx) == 0:
        return None, None

    this = self.reindex(commonIdx)
    that = other.reindex(commonIdx)

    return this, that

def diff(self, periods=1):
"""
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1950,6 +1950,14 @@ def test_corr(self):

assert_almost_equal(correls['A']['C'],
self.frame['A'].corr(self.frame['C']))

def test_cov(self):
    # Blank out leading observations so the columns carry different
    # amounts of missing data.
    self.frame['A'][:5] = nan
    self.frame['B'][:10] = nan
    cov = self.frame.cov()

    # The frame-level pairwise value must agree with the Series-level one.
    assert_almost_equal(cov['A']['C'],
                        self.frame['A'].cov(self.frame['C']))

def test_corrwith(self):
a = self.tsframe
Expand Down Expand Up @@ -2698,6 +2706,28 @@ def test_sort_index(self):
expected = frame.ix[frame.index[indexer]]
assert_frame_equal(sorted_df, expected)

# by multiple columns
frame.values[1, 0] = frame.values[0, 0]
smaller, larger = min(frame.values[:1, 1]), max(frame.values[:1, 1])
if smaller == larger:
larger = smaller + 1
frame.values[0, 1] = larger
frame.values[1, 1] = smaller

sorted_df = frame.sort_index(by=['A', 'B'])
indexer = frame['A'].argsort().values
zero_mask = indexer == 0
one_mask = indexer == 1
indexer[zero_mask] = 1
indexer[one_mask] = 0
expected = frame.ix[frame.index[indexer]]
assert_frame_equal(sorted_df, expected)

sorted_df = frame.sort_index(by=['A', 'B'], ascending=False)
indexer = indexer[::-1]
expected = frame.ix[frame.index[indexer]]
assert_frame_equal(sorted_df, expected)

# check for now
sorted_df = frame.sort(column='A')
expected = frame.sort_index(by='A')
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -833,6 +833,16 @@ def test_corr(self):

# additional checks?

def test_cov(self):
    # full overlap: covariance with itself is the variance
    self.assertAlmostEqual(self.ts.cov(self.ts), self.ts.std()**2)

    # partial overlap: only the shared observations [5:15] contribute
    self.assertAlmostEqual(self.ts[:15].cov(self.ts[5:]),
                           self.ts[5:15].std()**2)

    # No overlap
    # assertTrue replaces the deprecated assert_ alias.
    self.assertTrue(np.isnan(self.ts[::2].cov(self.ts[1::2])))

def test_copy(self):
ts = self.ts.copy()

Expand Down
0