48
48
from sdc .functions import numpy_like
49
49
from sdc .str_arr_type import string_array_type , StringArrayType
50
50
from sdc .datatypes .range_index_type import RangeIndexType
51
- from sdc .datatypes .int64_index_type import Int64IndexType
52
51
from sdc .str_arr_ext import (num_total_chars , append_string_array_to ,
53
52
str_arr_is_na , pre_alloc_string_array , str_arr_set_na , string_array_type ,
54
53
cp_str_list_to_array , create_str_arr_from_list , get_utf8_size ,
55
- str_arr_set_na_by_mask , str_arr_stable_argosort )
54
+ str_arr_set_na_by_mask )
56
55
from sdc .utilities .prange_utils import parallel_chunks
57
56
from sdc .utilities .utils import sdc_overload , sdc_register_jitable
58
- from sdc .utilities .sdc_typing_utils import (
59
- find_common_dtype_from_numpy_dtypes ,
60
- TypeChecker )
61
- from sdc .utilities .sdc_typing_utils import sdc_pandas_index_types
57
+ from sdc .utilities .sdc_typing_utils import (find_common_dtype_from_numpy_dtypes ,
58
+ TypeChecker )
62
59
63
60
64
61
class SDCLimitation (Exception ):
@@ -74,20 +71,18 @@ def hpat_arrays_append(A, B):
74
71
def hpat_arrays_append_overload (A , B ):
75
72
"""Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A"""
76
73
77
- use_A_array = isinstance (A , ( RangeIndexType , Int64IndexType ) )
78
- use_B_array = isinstance (B , ( RangeIndexType , Int64IndexType ) )
79
- if isinstance (A , (types .Array , RangeIndexType , Int64IndexType )):
80
- if isinstance (B , (types .Array , RangeIndexType , Int64IndexType )):
74
+ A_is_range_index = isinstance (A , RangeIndexType )
75
+ B_is_range_index = isinstance (B , RangeIndexType )
76
+ if isinstance (A , (types .Array , RangeIndexType )):
77
+ if isinstance (B , (types .Array , RangeIndexType )):
81
78
def _append_single_numeric_impl (A , B ):
82
- _A = A .values if use_A_array == True else A # noqa
83
- _B = B .values if use_B_array == True else B # noqa
79
+ _A = A .values if A_is_range_index == True else A # noqa
80
+ _B = B .values if B_is_range_index == True else B # noqa
84
81
return numpy .concatenate ((_A , _B ,))
85
82
86
83
return _append_single_numeric_impl
87
-
88
- elif (isinstance (B , (types .UniTuple , types .List ))
89
- and isinstance (B .dtype , (types .Array , RangeIndexType , Int64IndexType ))):
90
- B_dtype_is_index = isinstance (B .dtype , (RangeIndexType , Int64IndexType ))
84
+ elif isinstance (B , (types .UniTuple , types .List )) and isinstance (B .dtype , (types .Array , RangeIndexType )):
85
+ B_dtype_is_range_index = isinstance (B .dtype , RangeIndexType )
91
86
numba_common_dtype = find_common_dtype_from_numpy_dtypes ([A .dtype , B .dtype .dtype ], [])
92
87
93
88
# TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime
@@ -97,10 +92,10 @@ def _append_list_numeric_impl(A, B):
97
92
new_data = numpy .empty (total_length , numba_common_dtype )
98
93
99
94
stop = len (A )
100
- _A = numpy .array (A ) if use_A_array == True else A # noqa
95
+ _A = numpy .array (A ) if A_is_range_index == True else A # noqa
101
96
new_data [:stop ] = _A
102
97
for arr in B :
103
- _arr = arr . values if B_dtype_is_index == True else arr # noqa
98
+ _arr = numpy . array ( arr ) if B_dtype_is_range_index == True else arr # noqa
104
99
start = stop
105
100
stop = start + len (_arr )
106
101
new_data [start :stop ] = _arr
@@ -223,13 +218,12 @@ def sdc_join_series_indexes_overload(left, right):
223
218
"""Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm"""
224
219
225
220
# check that both operands are of types used for representing Pandas indexes
226
- if not (isinstance (left , sdc_pandas_index_types ) and isinstance (right , sdc_pandas_index_types )
227
- and not isinstance (left , types .NoneType )
228
- and not isinstance (right , types .NoneType )):
221
+ if not (isinstance (left , (types .Array , StringArrayType , RangeIndexType ))
222
+ and isinstance (right , (types .Array , StringArrayType , RangeIndexType ))):
229
223
return None
230
224
231
- convert_left = isinstance (left , ( RangeIndexType , Int64IndexType ) )
232
- convert_right = isinstance (right , ( RangeIndexType , Int64IndexType ) )
225
+ convert_left = isinstance (left , RangeIndexType )
226
+ convert_right = isinstance (right , RangeIndexType )
233
227
234
228
def _convert_to_arrays_impl (left , right ):
235
229
_left = left .values if convert_left == True else left # noqa
@@ -249,9 +243,10 @@ def sdc_join_range_indexes_impl(left, right):
249
243
250
244
return sdc_join_range_indexes_impl
251
245
252
- elif (isinstance (left , (RangeIndexType , Int64IndexType , types .Array ))
253
- and isinstance (right , (RangeIndexType , Int64IndexType , types .Array ))
254
- and not (isinstance (left , types .Array ) and isinstance (right , types .Array ))):
246
+ elif isinstance (left , RangeIndexType ) and isinstance (right , types .Array ):
247
+ return _convert_to_arrays_impl
248
+
249
+ elif isinstance (left , types .Array ) and isinstance (right , RangeIndexType ):
255
250
return _convert_to_arrays_impl
256
251
257
252
# TODO: remove code duplication below and merge numeric and StringArray impls into one
@@ -518,39 +513,41 @@ def sdc_arrays_argsort(A, kind='quicksort'):
518
513
519
514
520
515
@sdc_overload (sdc_arrays_argsort , jit_options = {'parallel' : False })
521
- def sdc_arrays_argsort_overload (A , kind = 'quicksort' , ascending = True ):
516
+ def sdc_arrays_argsort_overload (A , kind = 'quicksort' ):
522
517
"""Function providing pandas argsort implementation for different 1D array types"""
523
518
524
519
# kind is not known at compile time, so get this function here and use in impl if needed
525
520
quicksort_func = quicksort .make_jit_quicksort ().run_quicksort
526
521
527
522
kind_is_default = isinstance (kind , str )
528
523
if isinstance (A , types .Array ):
529
- def _sdc_arrays_argsort_array_impl (A , kind = 'quicksort' , ascending = True ):
524
+ def _sdc_arrays_argsort_array_impl (A , kind = 'quicksort' ):
530
525
_kind = 'quicksort' if kind_is_default == True else kind # noqa
531
- return numpy_like .argsort (A , kind = _kind , ascending = ascending )
526
+ return numpy_like .argsort (A , kind = _kind )
532
527
533
528
return _sdc_arrays_argsort_array_impl
534
529
535
530
elif A == string_array_type :
536
- def _sdc_arrays_argsort_str_arr_impl (A , kind = 'quicksort' , ascending = True ):
531
+ def _sdc_arrays_argsort_str_arr_impl (A , kind = 'quicksort' ):
537
532
533
+ nan_mask = sdc .hiframes .api .get_nan_mask (A )
534
+ idx = numpy .arange (len (A ))
535
+ old_nan_positions = idx [nan_mask ]
536
+
537
+ data = A [~ nan_mask ]
538
+ keys = idx [~ nan_mask ]
538
539
if kind == 'quicksort' :
539
- indexes = numpy .arange (len (A ))
540
- data_index_pairs = list (zip (list (A ), list (indexes )))
541
- zipped = quicksort_func (data_index_pairs )
542
- argsorted = [zipped [i ][1 ] for i in indexes ]
543
- res = numpy .array (argsorted , dtype = numpy .int64 )
544
- # for non-stable sort the order within groups does not matter
545
- # so just reverse the result when sorting in descending order
546
- if not ascending :
547
- res = res [::- 1 ]
540
+ zipped = list (zip (list (data ), list (keys )))
541
+ zipped = quicksort_func (zipped )
542
+ argsorted = [zipped [i ][1 ] for i in numpy .arange (len (data ))]
548
543
elif kind == 'mergesort' :
549
- res = str_arr_stable_argosort (A , ascending = ascending )
544
+ sdc .hiframes .sort .local_sort ((data , ), (keys , ))
545
+ argsorted = list (keys )
550
546
else :
551
547
raise ValueError ("Unrecognized kind of sort in sdc_arrays_argsort" )
552
548
553
- return res
549
+ argsorted .extend (old_nan_positions )
550
+ return numpy .asarray (argsorted , dtype = numpy .int32 )
554
551
555
552
return _sdc_arrays_argsort_str_arr_impl
556
553
@@ -621,16 +618,13 @@ def _sdc_take(data, indexes):
621
618
@sdc_overload (_sdc_take )
622
619
def _sdc_take_overload (data , indexes ):
623
620
624
- valid_data_types = (types .Array ,) + sdc_pandas_index_types
625
- if not (isinstance (data , valid_data_types ) and not isinstance (data , types .NoneType )):
621
+ if not isinstance (data , (types .Array , StringArrayType , RangeIndexType )):
626
622
return None
627
-
628
- if not (isinstance (indexes , (types .Array , types .List , Int64IndexType ))
623
+ if not (isinstance (indexes , (types .Array , types .List ))
629
624
and isinstance (indexes .dtype , (types .Integer , types .ListType ))):
630
625
return None
631
626
632
- if (isinstance (indexes .dtype , types .ListType )
633
- and isinstance (data , (types .Array , types .List , RangeIndexType , Int64IndexType ))):
627
+ if isinstance (indexes .dtype , types .ListType ) and isinstance (data , (types .Array , types .List , RangeIndexType )):
634
628
arr_dtype = data .dtype
635
629
636
630
def _sdc_take_list_impl (data , indexes ):
@@ -683,7 +677,7 @@ def _sdc_take_list_str_impl(data, indexes):
683
677
684
678
return _sdc_take_list_str_impl
685
679
686
- elif isinstance (data , (types .Array , RangeIndexType , Int64IndexType )):
680
+ elif isinstance (data , (types .Array , RangeIndexType )):
687
681
arr_dtype = data .dtype
688
682
689
683
def _sdc_take_array_impl (data , indexes ):
@@ -746,7 +740,6 @@ def sdc_reindex_series_overload(arr, index, name, by_index):
746
740
""" Reindexes series data by new index following the logic of pandas.core.indexing.check_bool_indexer """
747
741
748
742
range_indexes = isinstance (index , RangeIndexType ) and isinstance (by_index , RangeIndexType )
749
- int64_indexes = isinstance (index , Int64IndexType ) and isinstance (by_index , Int64IndexType )
750
743
data_dtype , index_dtype = arr .dtype , index .dtype
751
744
data_is_str_arr = isinstance (arr .dtype , types .UnicodeType )
752
745
@@ -755,8 +748,6 @@ def sdc_reindex_series_impl(arr, index, name, by_index):
755
748
# no reindexing is needed if indexes are equal
756
749
if range_indexes == True : # noqa
757
750
equal_indexes = numpy_like .array_equal (index , by_index )
758
- elif int64_indexes == True : # noqa
759
- equal_indexes = numpy_like .array_equal (index , by_index )
760
751
else :
761
752
equal_indexes = False
762
753
if (index is by_index or equal_indexes ):
@@ -781,10 +772,10 @@ def sdc_reindex_series_impl(arr, index, name, by_index):
781
772
map_index_to_position [value ] = i
782
773
783
774
index_mismatch = 0
784
- for i in numba . prange ( len ( by_index )):
785
- val = by_index [ i ]
786
- if val in map_index_to_position :
787
- pos_in_self = map_index_to_position [val ]
775
+ # FIXME: TypingError in parfor step (wrong promotion to float64?) if prange is used
776
+ for i in numpy . arange ( len ( by_index )):
777
+ if by_index [ i ] in map_index_to_position :
778
+ pos_in_self = map_index_to_position [by_index [ i ] ]
788
779
_res_data [i ] = arr [pos_in_self ]
789
780
if data_is_str_arr == True : # noqa
790
781
res_data_nan_mask [i ] = isna (arr , i )
0 commit comments