8000
We read every piece of feedback, and take your input very seriously.
2 parents f0706b1 + 8bb17cb; commit 22af130 (Copy full SHA for 22af130)
pandas/core/algorithms.py
@@ -232,7 +232,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
232
values = PeriodIndex(values, name=name)
233
234
values = values.view(np.int64)
235
- keys, counts = htable.value_count_int64(values)
+ keys, counts = htable.value_count_scalar64(values, dropna)
236
237
if dropna:
238
from pandas.tslib import iNaT
@@ -244,10 +244,10 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
244
245
elif com.is_integer_dtype(dtype):
246
values = com._ensure_int64(values)
247
248
elif com.is_float_dtype(dtype):
249
values = com._ensure_float64(values)
250
- keys, counts = htable.value_count_float64(values, dropna)
251
252
else:
253
values = com._ensure_object(values)
pandas/core/categorical.py
@@ -1030,7 +1030,7 @@ def value_counts(self, dropna=True):
1030
from pandas.core.index import CategoricalIndex
1031
1032
cat = self.dropna() if dropna else self
1033
- keys, counts = htable.value_count_int64(com._ensure_int64(cat._codes))
+ keys, counts = htable.value_count_scalar64(com._ensure_int64(cat._codes), dropna)
1034
result = Series(counts, index=keys)
1035
1036
ix = np.arange(len(cat.categories), dtype='int64')
pandas/hashtable.pyx
@@ -866,96 +866,90 @@ cdef class Int64Factorizer:
866
self.count = len(self.uniques)
867
return labels
868
869
+ctypedef fused kh_scalar64:
870
+ kh_int64_t
871
+ kh_float64_t
872
+
873
@cython.boundscheck(False)
-cdef build_count_table_float64(float64_t[:] values, kh_float64_t *table, bint dropna):
874
+cdef build_count_table_scalar64(sixty_four_bit_scalar[:] values,
875
+ kh_scalar64 *table, bint dropna):
876
cdef:
877
khiter_t k
878
Py_ssize_t i, n = len(values)
- float64_t val
879
+ sixty_four_bit_scalar val
880
int ret = 0
881
- with nogil:
- kh_resize_float64(table, n)
882
+ if sixty_four_bit_scalar is float64_t and kh_scalar64 is kh_float64_t:
883
+ with nogil:
884
+ kh_resize_float64(table, n)
885
- for i in range(n):
- val = values[i]
- if val == val or not dropna:
- k = kh_get_float64(table, val)
886
+ for i in range(n):
887
+ val = values[i]
888
+ if val == val or not dropna:
889
+ k = kh_get_float64(table, val)
890
+ if k != table.n_buckets:
891
+ table.vals[k] += 1
892
+ else:
893
+ k = kh_put_float64(table, val, &ret)
894
+ table.vals[k] = 1
895
+ elif sixty_four_bit_scalar is int64_t and kh_scalar64 is kh_int64_t:
896
897
+ kh_resize_int64(table, n)
898
899
900
901
+ k = kh_get_int64(table, val)
902
if k != table.n_buckets:
903
table.vals[k] += 1
904
- k = kh_put_float64(table, val, &ret)
905
+ k = kh_put_int64(table, val, &ret)
906
table.vals[k] = 1
907
908
+ raise ValueError("Table type must match scalar type.")
909
910
911
912
-cpdef value_count_float64(float64_t[:] values, bint dropna):
913
+cpdef value_count_scalar64(sixty_four_bit_scalar[:] values, bint dropna):
914
915
Py_ssize_t i
- kh_float64_t * table
- float64_t[:] result_keys
916
+ kh_float64_t *ftable
917
+ kh_int64_t *itable
918
+ sixty_four_bit_scalar[:] result_keys
919
int64_t[:] result_counts
920
int k
921
- table = kh_init_float64()
- build_count_table_float64(values, table, dropna)
-
922
i = 0
- result_keys = np.empty(table.n_occupied, dtype=np.float64)
- result_counts = np.zeros(table.n_occupied, dtype=np.int64)
923
- for k in range(table.n_buckets):
- if kh_exist_float64(table, k):
- result_keys[i] = table.keys[k]
- result_counts[i] = table.vals[k]
- i += 1
- kh_destroy_float64(table)
924
+ if sixty_four_bit_scalar is float64_t:
925
+ ftable = kh_init_float64()
926
+ build_count_table_scalar64(values, ftable, dropna)
927
- return np.asarray(result_keys), np.asarray(result_counts)
928
+ result_keys = np.empty(ftable.n_occupied, dtype=np.float64)
929
+ result_counts = np.zeros(ftable.n_occupied, dtype=np.int64)
930
-@cython.boundscheck(False)
-cdef build_count_table_int64(int64_t[:] values, kh_int64_t *table):
- cdef:
- khiter_t k
- Py_ssize_t i, n = len(values)
- int64_t val
- int ret = 0
- kh_resize_int64(table, n)
- k = kh_get_int64(table, val)
- if k != table.n_buckets:
931
- table.vals[k] += 1
932
- else:
933
- k = kh_put_int64(table, val, &ret)
934
- table.vals[k] = 1
935
936
937
938
-cpdef value_count_int64(int64_t[:] values):
939
940
- Py_ssize_t i
941
- kh_int64_t *table
942
- int64_t[:] result_keys, result_counts
943
- int k
+ for k in range(ftable.n_buckets):
+ if kh_exist_float64(ftable, k):
+ result_keys[i] = ftable.keys[k]
+ result_counts[i] = ftable.vals[k]
+ i += 1
+ kh_destroy_float64(ftable)
944
945
- table = kh_init_int64()
946
- build_count_table_int64(values, table)
+ elif sixty_four_bit_scalar is int64_t:
+ itable = kh_init_int64()
+ build_count_table_scalar64(values, itable, dropna)
947
948
- i = 0
949
- result_keys = np.empty(table.n_occupied, dtype=np.int64)
950
+ result_keys = np.empty(itable.n_occupied, dtype=np.int64)
+ result_counts = np.zeros(itable.n_occupied, dtype=np.int64)
951
952
953
954
- if kh_exist_int64(table, k):
955
956
957
958
- kh_destroy_int64(table)
+ for k in range(itable.n_buckets):
+ if kh_exist_int64(itable, k):
+ result_keys[i] = itable.keys[k]
+ result_counts[i] = itable.vals[k]
+ kh_destroy_int64(itable)
959
960
return np.asarray(result_keys), np.asarray(result_counts)
961
@@ -1047,7 +1041,7 @@ def mode_int64(int64_t[:] values):
1047
1041
1048
1042
table = kh_init_int64()
1049
1043
1050
1044
+ build_count_table_scalar64(values, table, 0)
1051
1045
1052
1046
modes = np.empty(table.n_buckets, dtype=np.int64)
1053