BUG: Dense ranking with percent now uses 100% basis by rouzazari · Pull Request #15639 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

BUG: Dense ranking with percent now uses 100% basis #15639

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 9, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
BUG: Dense ranking with percent now uses 100% basis
- `DataFrame.rank()` and `Series.rank()` when `method='dense'` and
  `pct=True` now scales to 100%.

See #15630
  • Loading branch information
rouzazari authored and gfyoung committed Mar 8, 2018
commit 62997900fa1cce38f05c9ba1e8b4910be2df4419
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1731,3 +1731,4 @@ Other
- Compat for 32-bit platforms for ``.qcut/cut``; bins will now be ``int64`` dtype (:issue:`14866`)
- Bug in interactions with ``Qt`` when a ``QtApplication`` already exists (:issue:`14372`)
- Avoid use of ``np.finfo()`` during ``import pandas`` removed to mitigate deadlock on Python GIL misuse (:issue:`14641`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you move to 0.21.0

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to the numeric section of bug fixes. I think this is ok as a bug fix, though if you think as a user it would warrant a bigger mention, a sub-section would be ok too.

- Bug in ``DataFrame.rank()`` and ``Series.rank()`` when ``method='dense'`` and ``pct=True`` in which percentile ranks were not scaled to a 100% basis (:issue:`15630`)
10 changes: 8 additions & 2 deletions pandas/_libs/algos_rank_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,10 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
sum_ranks = dups = 0
{{endif}}
if pct:
return ranks / count
if tiebreak == TIEBREAK_DENSE:
return ranks / total_tie_count
else:
return ranks / count
else:
return ranks

Expand Down Expand Up @@ -385,7 +388,10 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
ranks[i, argsorted[i, z]] = total_tie_count
sum_ranks = dups = 0
if pct:
ranks[i, :] /= count
if tiebreak == TIEBREAK_DENSE:
ranks[i, :] /= total_tie_count
else:
ranks[i, :] /= count
if axis == 0:
return ranks.T
else:
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/frame/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,14 @@ def test_rank_methods_frame(self):
expected = expected.astype('float64')
tm.assert_frame_equal(result, expected)

def test_rank_dense_(self):
df = DataFrame([['2012', 'B', 3], ['2012', 'A', 2], ['2012', 'A', 1]])
result = df.rank(method='dense', pct=True)
expected = DataFrame([[1., 1., 1.],
[1., 0.5, 2. / 3],
[1., 0.5, 1. / 3]])
assert_frame_equal(result, expected)

def test_rank_descending(self):
dtypes = ['O', 'f8', 'i8']

Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/series/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,25 @@ def test_rank_dense_method(self):
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)

def test_rank_dense_(self):
# GH15630, pct should be on 100% basis even when method='dense'
in_out = [([1], [1.]),
([2], [1.]),
([0], [1.]),
([2, 2], [1., 1.]),
([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]),
([-5, -4, -3, -2, -1],
[1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]

for ser, exp in in_out:
for dtype in dtypes:
s = Series(ser).astype(dtype)
result = s.rank(method='dense', pct=True)
expected = Series(exp).astype(result.dtype)
assert_series_equal(result, expected)

def test_rank_descending(self):
dtypes = ['O', 'f8', 'i8']

Expand Down
210 changes: 210 additions & 0 deletions pandas/tests/test_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
# -*- coding: utf-8 -*-
from pandas import compat

from distutils.version import LooseVersion
from numpy import nan
import numpy as np

from pandas import Series, DataFrame

from pandas.compat import product
from pandas.util.testing import (assert_frame_equal, assert_series_equal)
import pandas.util.testing as tm


class TestRank(tm.TestCase):
    """Tests for ``Series.rank`` and ``DataFrame.rank`` tie-breaking methods."""

    # Shared input: a series containing ties and two NaN entries, and a
    # frame whose two columns are both that series.
    s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3])
    df = DataFrame({'A': s, 'B': s})

    # Expected ranks of ``s`` for each tie method; NaN inputs rank as NaN.
    results = {
        'average': np.array([1.5, 5.5, 7.0, 3.5, nan,
                             3.5, 1.5, 8.0, nan, 5.5]),
        'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]),
        'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]),
        'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]),
        'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]),
    }

    def test_rank_tie_methods(self):
        """Rank ``self.s`` with every tie method, as float and as object."""
        s = self.s

        def _check(s, expected, method='average'):
            result = s.rank(method=method)
            tm.assert_series_equal(result, Series(expected))

        dtypes = [None, object]
        # (dtype, method) combinations that are deliberately not exercised.
        disabled = {(object, 'first')}
        results = self.results

        for method, dtype in product(results, dtypes):
            if (dtype, method) in disabled:
                continue
            series = s if dtype is None else s.astype(dtype)
            _check(series, results[method], method=method)

    def test_rank_methods_series(self):
        """Compare ``Series.rank`` against ``scipy.stats.rankdata``."""
        tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
        import scipy
        from scipy.stats import rankdata

        xs = np.random.randn(9)
        xs = np.concatenate([xs[i:] for i in range(0, 9, 2)])  # add duplicates
        np.random.shuffle(xs)

        index = [chr(ord('a') + i) for i in range(len(xs))]

        # Same values at three different magnitudes.
        for vals in [xs, xs + 1e6, xs * 1e-6]:
            ts = Series(vals, index=index)

            for m in ['average', 'min', 'max', 'first', 'dense']:
                result = ts.rank(method=m)
                # scipy calls pandas' 'first' method 'ordinal'.
                sprank = rankdata(vals, m if m != 'first' else 'ordinal')
                expected = Series(sprank, index=index)

                # NOTE(review): presumably rankdata's result dtype differs
                # from float64 on these scipy versions -- confirm.
                if LooseVersion(scipy.__version__) >= '0.17.0':
                    expected = expected.astype('float64')
                tm.assert_series_equal(result, expected)

    def test_rank_methods_frame(self):
        """Compare ``DataFrame.rank`` on both axes against scipy's rankdata."""
        tm.skip_if_no_package('scipy', '0.13', 'scipy.stats.rankdata')
        import scipy
        from scipy.stats import rankdata

        # Integers in [0, 21) rescaled to [-1.0, 1.0] in steps of 0.1.
        xs = np.random.randint(0, 21, (100, 26))
        xs = (xs - 10.0) / 10.0
        cols = [chr(ord('z') - i) for i in range(xs.shape[1])]

        for vals in [xs, xs + 1e6, xs * 1e-6]:
            df = DataFrame(vals, columns=cols)

            for ax in [0, 1]:
                for m in ['average', 'min', 'max', 'first', 'dense']:
                    result = df.rank(axis=ax, method=m)
                    sprank = np.apply_along_axis(
                        rankdata, ax, vals,
                        m if m != 'first' else 'ordinal')
                    sprank = sprank.astype(np.float64)
                    expected = DataFrame(sprank, columns=cols)

                    if LooseVersion(scipy.__version__) >= '0.17.0':
                        expected = expected.astype('float64')
                    tm.assert_frame_equal(result, expected)

    def test_rank_dense_method(self):
        """Check dense ranks, then dense percentile ranks, per dtype."""
        dtypes = ['O', 'f8', 'i8']
        in_out = [([1], [1]),
                  ([2], [1]),
                  ([0], [1]),
                  ([2, 2], [1, 1]),
                  ([1, 2, 3], [1, 2, 3]),
                  ([4, 2, 1], [3, 2, 1],),
                  ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]),
                  ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5])]

        for ser, exp in in_out:
            for dtype in dtypes:
                s = Series(ser).astype(dtype)
                result = s.rank(method='dense')
                expected = Series(exp).astype(result.dtype)
                assert_series_equal(result, expected)

        # GH15630, pct should be on 100% basis even when method='dense'
        # NOTE(review): a reviewer asked for the pct cases below to become
        # a parametrized test in a separate function outside this class.
        in_out = [([1], [1.]),
                  ([2], [1.]),
                  ([0], [1.]),
                  # Two tied values share the single dense rank, so both
                  # are 1.0; a percentile rank can never exceed 1.
                  ([2, 2], [1., 1.]),
                  ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]),
                  ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],),
                  ([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]),
                  ([-5, -4, -3, -2, -1],
                   [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]

        for ser, exp in in_out:
            for dtype in dtypes:
                s = Series(ser).astype(dtype)
                result = s.rank(method='dense', pct=True)
                expected = Series(exp).astype(result.dtype)
                assert_series_equal(result, expected)

        df = DataFrame([['2012', 'B', 3], ['2012', 'A', 2], ['2012', 'A', 1]])
        result = df.rank(method='dense', pct=True)
        expected = DataFrame([[1., 1., 1.],
                              [1., 0.5, 2. / 3],
                              [1., 0.5, 1. / 3]])
        assert_frame_equal(result, expected)

    def test_rank_descending(self):
        """Descending rank must equal the ascending rank of ``max - x``."""
        dtypes = ['O', 'f8', 'i8']

        for dtype, method in product(dtypes, self.results):
            if 'i' in dtype:
                # The integer case uses the data with NaN rows dropped.
                s = self.s.dropna()
                df = self.df.dropna()
            else:
                s = self.s.astype(dtype)
                df = self.df.astype(dtype)

            res = s.rank(ascending=False)
            expected = (s.max() - s).rank()
            assert_series_equal(res, expected)

            res = df.rank(ascending=False)
            expected = (df.max() - df).rank()
            assert_frame_equal(res, expected)

            # The object/'first' combination is skipped here, as in the
            # tie-method tests.
            if method == 'first' and dtype == 'O':
                continue

            expected = (s.max() - s).rank(method=method)
            res2 = s.rank(method=method, ascending=False)
            assert_series_equal(res2, expected)

            expected = (df.max() - df).rank(method=method)

            if dtype != 'O':
                res2 = df.rank(method=method, ascending=False,
                               numeric_only=True)
                assert_frame_equal(res2, expected)

            res3 = df.rank(method=method, ascending=False,
                           numeric_only=False)
            assert_frame_equal(res3, expected)

    def test_rank_2d_tie_methods(self):
        """Rank ``self.df`` along both axes with every tie method."""
        df = self.df

        def _check2d(df, expected, method='average', axis=0):
            exp_df = DataFrame({'A': expected, 'B': expected})

            # For axis=1 both the input and the expectation are
            # transposed, so the same expected values still apply.
            if axis == 1:
                df = df.T
                exp_df = exp_df.T

            result = df.rank(method=method, axis=axis)
            assert_frame_equal(result, exp_df)

        dtypes = [None, object]
        disabled = {(object, 'first')}
        results = self.results

        for method, axis, dtype in product(results, [0, 1], dtypes):
            if (dtype, method) in disabled:
                continue
            frame = df if dtype is None else df.astype(dtype)
            _check2d(frame, results[method], method=method, axis=axis)

    def test_rank_int(self):
        """Rank the NaN-free data cast to int64 with every tie method."""
        s = self.s.dropna().astype('i8')

        for method, res in compat.iteritems(self.results):
            result = s.rank(method=method)
            expected = Series(res).dropna()
            # Align labels with the result before comparing.
            expected.index = result.index
            assert_series_equal(result, expected)

    def test_rank_object_bug(self):
        """Ranking an all-NaN object series must not raise (GH 13445)."""
        # GH 13445

        # smoke tests
        Series([np.nan] * 32).astype(object).rank(ascending=True)
        Series([np.nan] * 32).astype(object).rank(ascending=False)
0