ENH: Add return_inverse to cython-unique; unify unique/factorize-code by h-vetinari · Pull Request #23400 · pandas-dev/pandas · GitHub
[go: up one dir, main page]

Skip to content

ENH: Add return_inverse to cython-unique; unify unique/factorize-code #23400

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 28 commits into from
Nov 29, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
858f54e
Unify unique/factorize, remove kwargs (perf); enable inverse for unique
h-vetinari Oct 15, 2018
4ed354a
Template over {return_inverse, ignore_na} for perf
h-vetinari Oct 26, 2018
906cd50
Re-add kwargs to method signature
h-vetinari Oct 26, 2018
0d6dad0
Merge remote-tracking branch 'upstream/master' into unique_inverse_cy…
h-vetinari Oct 28, 2018
19c7c1f
Fix small oversight
h-vetinari Oct 28, 2018
a8f079f
Simplify an if-condition
h-vetinari Oct 29, 2018
1c5b97a
Reword comment
h-vetinari Oct 29, 2018
d71d68c
Merge remote-tracking branch 'upstream/master' into unique_inverse_cy…
h-vetinari Oct 29, 2018
c7327fd
Revert templating over {return_inverse, ignore_na}
h-vetinari Oct 31, 2018
e09f6fe
Merge branch 'master' into unique_inverse_cython
h-vetinari Nov 2, 2018
f93a912
Merge branch 'master' into unique_inverse_cython
h-vetinari Nov 3, 2018
a06494e
Add new kwargs at the end (review jreback)
h-vetinari Nov 3, 2018
799fdfb
Merge branch 'master' into unique_inverse_cython
h-vetinari Nov 6, 2018
0ad2272
Merge remote-tracking branch 'upstream/master' into unique_inverse_cy…
h-vetinari Nov 11, 2018
906a2b9
Retrigger CircleCI
h-vetinari Nov 11, 2018
29aecdd
Retrigger CI after flaky hypothesis test
h-vetinari Nov 11, 2018
746c0e3
Retrigger CircleCI
h-vetinari Nov 11, 2018
8da33f4
Retrigger CI after timeout
h-vetinari Nov 11, 2018
ba9d8b8
Retrigger CircleCI
h-vetinari Nov 11, 2018
2423a05
Merge remote-tracking branch 'upstream/master' into unique_inverse_cy…
h-vetinari Nov 12, 2018
6079c26
Merge remote-tracking branch 'upstream/master' into unique_inverse_cy…
h-vetinari Nov 18, 2018
30de418
Always calculate inverse
h-vetinari Nov 18, 2018
feb32d6
Merge remote-tracking branch 'upstream/master' into unique_inverse_cy…
h-vetinari Nov 19, 2018
0b85759
Revert "Always calculate inverse"
h-vetinari Nov 19, 2018
aad4b91
Merge remote-tracking branch 'upstream/master' into unique_inverse_cy…
h-vetinari Nov 21, 2018
4451854
Add comments to ignore_na branches (review jreback)
h-vetinari Nov 21, 2018
e6b71b5
Merge remote-tracking branch 'upstream/master' into unique_inverse_cy…
h-vetinari Nov 27, 2018
00a304d
Switch signature of hashtable.factorize (review jreback)
h-vetinari Nov 27, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Switch signature of hashtable.factorize (review jreback)
  • Loading branch information
h-vetinari committed Nov 27, 2018
commit 00a304d467483135ad2c035d218a562fbdbeecd4
39 changes: 15 additions & 24 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -502,18 +502,15 @@ cdef class {{name}}HashTable(HashTable):

Returns
-------
labels : ndarray[int64]
The labels from values to uniques
uniques : ndarray[{{dtype}}]
Unique values of input, not sorted
labels : ndarray[int64]
The labels from values to uniques
"""
uniques_vector = {{name}}Vector()
uniques, labels = self._unique(values, uniques_vector,
na_sentinel=na_sentinel,
na_value=na_value, ignore_na=True,
return_inverse=True)
# factorize has reversed outputs compared to _unique
return labels, uniques
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
na_value=na_value, ignore_na=True,
return_inverse=True)

def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
Expand Down Expand Up @@ -858,18 +855,15 @@ cdef class StringHashTable(HashTable):

Returns
-------
labels : ndarray[int64]
The labels from values to uniques
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[int64]
The labels from values to uniques
"""
uniques_vector = ObjectVector()
uniques, labels = self._unique(values, uniques_vector,
na_sentinel=na_sentinel,
na_value=na_value, ignore_na=True,
return_inverse=True)
# factorize has reversed outputs compared to _unique
return labels, uniques
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
na_value=na_value, ignore_na=True,
return_inverse=True)

def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
Expand Down Expand Up @@ -1092,18 +1086,15 @@ cdef class PyObjectHashTable(HashTable):

Returns
-------
labels : ndarray[int64]
The labels from values to uniques
uniques : ndarray[object]
Unique values of input, not sorted
labels : ndarray[int64]
The labels from values to uniques
"""
uniques_vector = ObjectVector()
uniques, labels = self._unique(values, uniques_vector,
na_sentinel=na_sentinel,
na_value=na_value, ignore_na=True,
return_inverse=True)
# factorize has reversed outputs compared to _unique
return labels, uniques
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
na_value=na_value, ignore_na=True,
return_inverse=True)

def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
(hash_klass, _), values = _get_data_algo(values, _hashtables)

table = hash_klass(size_hint or len(values))
labels, uniques = table.factorize(values, na_sentinel=na_sentinel,
uniques, labels = table.factorize(values, na_sentinel=na_sentinel,
na_value=na_value)

labels = ensure_platform_int(labels)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -1391,7 +1391,7 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable):
s_duplicated.values.setflags(write=writable)
na_mask = s_duplicated.isna().values

result_inverse, result_unique = htable().factorize(s_duplicated.values)
result_unique, result_inverse = htable().factorize(s_duplicated.values)

# drop_duplicates has own cython code (hash_table_func_helper.pxi)
# and is tested separately; keeps first occurrence like ht.factorize()
Expand Down
0