From b55020d306aa2b62c01cc530bae6d20b83498ba2 Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Tue, 21 Dec 2021 22:26:05 +0300 Subject: [PATCH] Fixes incorrect definition of layout for SeriesType Details: the underlying data type of a Series was defined from the PyObject dtype only and didn't take into account the layout of the original array. As a result, 'C' layout was always inferred even when the original array had a different layout, which broke iteration over such Series (DF columns). Fixes #996. --- sdc/hiframes/boxing.py | 33 ++++++++++++++++++------- sdc/tests/test_dataframe.py | 29 +++++++++++++++++++++- sdc/tests/test_series.py | 48 +++++++++++++++++++++++++++++++++---- 3 files changed, 95 insertions(+), 15 deletions(-) diff --git a/sdc/hiframes/boxing.py b/sdc/hiframes/boxing.py index 656b3833c..b8c7011b6 100644 --- a/sdc/hiframes/boxing.py +++ b/sdc/hiframes/boxing.py @@ -46,7 +46,6 @@ from sdc.datatypes.categorical.types import CategoricalDtypeType, Categorical from sdc.datatypes.categorical.boxing import unbox_Categorical, box_Categorical from sdc.hiframes.pd_series_ext import SeriesType -from sdc.hiframes.pd_series_type import _get_series_array_type from sdc.hiframes.pd_dataframe_ext import get_structure_maps from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types @@ -70,7 +69,7 @@ def typeof_pd_dataframe(val, c): col_names = tuple(val.columns.tolist()) # TODO: support other types like string and timestamp - col_types = get_hiframes_dtypes(val) + col_types = _infer_df_col_types(val) index_type = _infer_index_type(val.index) column_loc, _, _ = get_structure_maps(col_types, col_names) @@ -82,8 +81,24 @@ def typeof_pd_series(val, c): index_type = _infer_index_type(val.index) is_named = val.name is not None + + # attempt to define numba Series data type via Series values, + # if not successful, define it later via dtype in SeriesType init + underlying_type = None + try: + underlying_type = numba.typeof(val.values) + 
except ValueError: + pass + + if not (isinstance(underlying_type, types.Array) + and not isinstance(underlying_type.dtype, types.PyObject)): + underlying_type = None + return SeriesType( - _infer_series_dtype(val), index=index_type, is_named=is_named) + dtype=_infer_series_dtype(val), + data=underlying_type, + index=index_type, + is_named=is_named) @unbox(DataFrameType) @@ -140,13 +155,13 @@ def unbox_dataframe(typ, val, c): return NativeValue(dataframe._getvalue(), is_error=c.builder.load(errorptr)) -def get_hiframes_dtypes(df): - """get hiframe data types for a pandas dataframe - """ +def _infer_df_col_types(df): + """ Infer column data types for a pandas DataFrame """ + col_names = df.columns.tolist() - hi_typs = [_get_series_array_type(_infer_series_dtype(df[cname])) - for cname in col_names] - return tuple(hi_typs) + col_typs = [numba.typeof(df[cname]).data for cname in col_names] + + return tuple(col_typs) def _infer_series_dtype(S): diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index 23ec45639..17ea05a91 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -132,6 +132,7 @@ def test_impl(): self.assertEqual(hpat_func(), test_impl()) def test_create_with_series1(self): + """ Create pandas DataFrame from Series of different dtypes """ def test_impl(n): A = pd.Series(np.ones(n, dtype=np.int64)) B = pd.Series(np.zeros(n, dtype=np.float64)) @@ -143,7 +144,7 @@ def test_impl(n): pd.testing.assert_frame_equal(hpat_func(n), test_impl(n)) def test_create_with_series2(self): - # test creating dataframe from passed series + """ Test creating pandas DataFrame from passed Series """ def test_impl(A): df = pd.DataFrame({'A': A}) return (df.A == 2).sum() @@ -153,6 +154,18 @@ def test_impl(A): df = pd.DataFrame({'A': np.arange(n)}) self.assertEqual(hpat_func(df.A), test_impl(df.A)) + def test_create_with_series3(self): + """ Test creating pandas DataFrame from Series of different layouts """ + def test_impl(A, B): + df = 
pd.DataFrame({'A': A, 'B': B}) + return df.A.sum(), df.B.sum() + sdc_func = self.jit(test_impl) + + n = 11 + A = pd.Series(np.arange(n)) + B = pd.Series(np.arange(2 * n)[::2]) + self.assertEqual(sdc_func(A, B), test_impl(A, B)) + def test_df_create_param_index_default(self): def test_impl(): data = {'A': ['a', 'b'], 'B': [2, 3]} @@ -219,6 +232,8 @@ def test_impl(): pd.testing.assert_frame_equal(hpat_func(), test_impl()) def test_pass_df1(self): + """ Test passing df with contiguous data layout """ + def test_impl(df): return (df.A == 2).sum() hpat_func = self.jit(test_impl) @@ -227,6 +242,18 @@ def test_impl(df): df = pd.DataFrame({'A': np.arange(n)}) self.assertEqual(hpat_func(df), test_impl(df)) + def test_pass_df_2(self): + """ Test passing df with non-contiguous data layout """ + + def test_impl(df): + return df.B.sum() + sdc_func = self.jit(test_impl) + + n_rows, n_cols = 4, 6 + col_names = list(string.ascii_uppercase[:n_cols]) + df = pd.DataFrame(np.random.rand(n_rows, n_cols), columns=col_names) + self.assertAlmostEqual(sdc_func(df), test_impl(df)) + def test_pass_df_str(self): def test_impl(df): return (df.A == 'a').sum() diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index 4ec29885d..917d7d994 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -63,6 +63,7 @@ gen_strlist, _make_func_from_text) from sdc.utilities.sdc_typing_utils import SDCLimitation +from sdc.hiframes.pd_series_type import SeriesType _cov_corr_series = [(pd.Series(x), pd.Series(y)) for x, y in [ @@ -339,18 +340,35 @@ def test_impl(name): pd.testing.assert_series_equal(hpat_func('A'), test_impl('A')) - @skip_numba_jit + def test_create_series_data_layouts(self): + def test_impl(data): + vals = pd.Series(data).values + return vals[0], vals[-1] + sdc_func = self.jit(test_impl) + + n = 10 + arrays_to_test = [ + np.arange(n), # 'C' layout + np.arange(2 * n)[::2], # 'A' layout + # no 'F' layout for 1d arrays + ] + + for data in arrays_to_test: + with 
self.subTest(layout=numba.typeof(data).layout): + result = sdc_func(data) + result_ref = test_impl(data) + self.assertEqual(result, result_ref) + def test_pass_series1(self): - # TODO: check to make sure it is series type def test_impl(A): return (A == 2).sum() - hpat_func = self.jit(test_impl) + sdc_func = self.jit(test_impl) n = 11 S = pd.Series(np.arange(n), name='A') - self.assertEqual(hpat_func(S), test_impl(S)) + self.assertEqual(sdc_func(S), test_impl(S)) + self.assertIsInstance(numba.typeof(S), SeriesType) - @skip_numba_jit def test_pass_series_str(self): def test_impl(A): return (A == 'a').sum() @@ -358,6 +376,7 @@ def test_impl(A): S = pd.Series(['a', 'b', 'c'], name='A') self.assertEqual(hpat_func(S), test_impl(S)) + self.assertIsInstance(numba.typeof(S), SeriesType) def test_pass_series_all_indexes(self): def test_impl(A): @@ -378,6 +397,25 @@ def test_impl(A): S = pd.Series(np.arange(n), index, name='A') pd.testing.assert_series_equal(hpat_func(S), test_impl(S)) + def test_pass_series_data_layouts(self): + def test_impl(S): + vals = S.values + return vals[0], vals[-1] + sdc_func = self.jit(test_impl) + + n = 10 + series_to_test = [ + pd.Series(np.arange(n)), # 'C' layout + pd.Series(np.arange(n))[::2], # 'A' layout + # no 'F' layout for Series + ] + + for s in series_to_test: + with self.subTest(layout=numba.typeof(s).data.layout): + result = sdc_func(s) + result_ref = test_impl(s) + self.assertEqual(result, result_ref) + def test_series_getattr_size(self): def test_impl(S): return S.size