From b55020d306aa2b62c01cc530bae6d20b83498ba2 Mon Sep 17 00:00:00 2001
From: "Kozlov, Alexey" <alexey.kozlov@intel.com>
Date: Tue, 21 Dec 2021 22:26:05 +0300
Subject: [PATCH] Fixes incorrect definition of layout for SeriesType

Details: the underlying data type of a Series was derived from
the PyObject dtype only and did not take the layout of the original
array into account. As a result, 'C' layout was always inferred even
when the original array had a different layout, which broke
iteration over such Series (DF columns).

Fixes #996.
---
 sdc/hiframes/boxing.py      | 33 ++++++++++++++++++-------
 sdc/tests/test_dataframe.py | 29 +++++++++++++++++++++-
 sdc/tests/test_series.py    | 48 +++++++++++++++++++++++++++++++++----
 3 files changed, 95 insertions(+), 15 deletions(-)

diff --git a/sdc/hiframes/boxing.py b/sdc/hiframes/boxing.py
index 656b3833c..b8c7011b6 100644
--- a/sdc/hiframes/boxing.py
+++ b/sdc/hiframes/boxing.py
@@ -46,7 +46,6 @@
 from sdc.datatypes.categorical.types import CategoricalDtypeType, Categorical
 from sdc.datatypes.categorical.boxing import unbox_Categorical, box_Categorical
 from sdc.hiframes.pd_series_ext import SeriesType
-from sdc.hiframes.pd_series_type import _get_series_array_type
 from sdc.hiframes.pd_dataframe_ext import get_structure_maps
 from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types
 
@@ -70,7 +69,7 @@ def typeof_pd_dataframe(val, c):
 
     col_names = tuple(val.columns.tolist())
     # TODO: support other types like string and timestamp
-    col_types = get_hiframes_dtypes(val)
+    col_types = _infer_df_col_types(val)
     index_type = _infer_index_type(val.index)
     column_loc, _, _ = get_structure_maps(col_types, col_names)
 
@@ -82,8 +81,24 @@ def typeof_pd_dataframe(val, c):
 def typeof_pd_series(val, c):
     index_type = _infer_index_type(val.index)
     is_named = val.name is not None
+
+    # attempt to infer the numba Series data type from the Series values;
+    # if that fails, it is derived later from the dtype in SeriesType init
+    underlying_type = None
+    try:
+        underlying_type = numba.typeof(val.values)
+    except ValueError:
+        pass
+
+    if not (isinstance(underlying_type, types.Array)
+            and not isinstance(underlying_type.dtype, types.PyObject)):
+        underlying_type = None
+
     return SeriesType(
-        _infer_series_dtype(val), index=index_type, is_named=is_named)
+        dtype=_infer_series_dtype(val),
+        data=underlying_type,
+        index=index_type,
+        is_named=is_named)
 
 
 @unbox(DataFrameType)
@@ -140,13 +155,13 @@ def unbox_dataframe(typ, val, c):
     return NativeValue(dataframe._getvalue(), is_error=c.builder.load(errorptr))
 
 
-def get_hiframes_dtypes(df):
-    """get hiframe data types for a pandas dataframe
-    """
+def _infer_df_col_types(df):
+    """ Infer column data types for a pandas DataFrame """
+
     col_names = df.columns.tolist()
-    hi_typs = [_get_series_array_type(_infer_series_dtype(df[cname]))
-               for cname in col_names]
-    return tuple(hi_typs)
+    col_typs = [numba.typeof(df[cname]).data for cname in col_names]
+
+    return tuple(col_typs)
 
 
 def _infer_series_dtype(S):
diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py
index 23ec45639..17ea05a91 100644
--- a/sdc/tests/test_dataframe.py
+++ b/sdc/tests/test_dataframe.py
@@ -132,6 +132,7 @@ def test_impl():
         self.assertEqual(hpat_func(), test_impl())
 
     def test_create_with_series1(self):
+        """ Create pandas DataFrame from Series of different dtypes """
         def test_impl(n):
             A = pd.Series(np.ones(n, dtype=np.int64))
             B = pd.Series(np.zeros(n, dtype=np.float64))
@@ -143,7 +144,7 @@ def test_impl(n):
         pd.testing.assert_frame_equal(hpat_func(n), test_impl(n))
 
     def test_create_with_series2(self):
-        # test creating dataframe from passed series
+        """ Test creating pandas DataFrame from passed Series """
         def test_impl(A):
             df = pd.DataFrame({'A': A})
             return (df.A == 2).sum()
@@ -153,6 +154,18 @@ def test_impl(A):
         df = pd.DataFrame({'A': np.arange(n)})
         self.assertEqual(hpat_func(df.A), test_impl(df.A))
 
+    def test_create_with_series3(self):
+        """ Test creating pandas DataFrame from Series of different layouts """
+        def test_impl(A, B):
+            df = pd.DataFrame({'A': A, 'B': B})
+            return df.A.sum(), df.B.sum()
+        sdc_func = self.jit(test_impl)
+
+        n = 11
+        A = pd.Series(np.arange(n))
+        B = pd.Series(np.arange(2 * n)[::2])
+        self.assertEqual(sdc_func(A, B), test_impl(A, B))
+
     def test_df_create_param_index_default(self):
         def test_impl():
             data = {'A': ['a', 'b'], 'B': [2, 3]}
@@ -219,6 +232,8 @@ def test_impl():
         pd.testing.assert_frame_equal(hpat_func(), test_impl())
 
     def test_pass_df1(self):
+        """ Test passing df with contiguous data layout """
+
         def test_impl(df):
             return (df.A == 2).sum()
         hpat_func = self.jit(test_impl)
@@ -227,6 +242,18 @@ def test_impl(df):
         df = pd.DataFrame({'A': np.arange(n)})
         self.assertEqual(hpat_func(df), test_impl(df))
 
+    def test_pass_df_2(self):
+        """ Test passing df with non-contiguous data layout """
+
+        def test_impl(df):
+            return df.B.sum()
+        sdc_func = self.jit(test_impl)
+
+        n_rows, n_cols = 4, 6
+        col_names = list(string.ascii_uppercase[:n_cols])
+        df = pd.DataFrame(np.random.rand(n_rows, n_cols), columns=col_names)
+        self.assertAlmostEqual(sdc_func(df), test_impl(df))
+
     def test_pass_df_str(self):
         def test_impl(df):
             return (df.A == 'a').sum()
diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py
index 4ec29885d..917d7d994 100644
--- a/sdc/tests/test_series.py
+++ b/sdc/tests/test_series.py
@@ -63,6 +63,7 @@
                                   gen_strlist,
                                   _make_func_from_text)
 from sdc.utilities.sdc_typing_utils import SDCLimitation
+from sdc.hiframes.pd_series_type import SeriesType
 
 
 _cov_corr_series = [(pd.Series(x), pd.Series(y)) for x, y in [
@@ -339,18 +340,35 @@ def test_impl(name):
 
         pd.testing.assert_series_equal(hpat_func('A'), test_impl('A'))
 
-    @skip_numba_jit
+    def test_create_series_data_layouts(self):
+        def test_impl(data):
+            vals = pd.Series(data).values
+            return vals[0], vals[-1]
+        sdc_func = self.jit(test_impl)
+
+        n = 10
+        arrays_to_test = [
+            np.arange(n),           # 'C' layout
+            np.arange(2 * n)[::2],  # 'A' layout
+            # no 'F' layout for 1d arrays
+        ]
+
+        for data in arrays_to_test:
+            with self.subTest(layout=numba.typeof(data).layout):
+                result = sdc_func(data)
+                result_ref = test_impl(data)
+                self.assertEqual(result, result_ref)
+
     def test_pass_series1(self):
-        # TODO: check to make sure it is series type
         def test_impl(A):
             return (A == 2).sum()
-        hpat_func = self.jit(test_impl)
+        sdc_func = self.jit(test_impl)
 
         n = 11
         S = pd.Series(np.arange(n), name='A')
-        self.assertEqual(hpat_func(S), test_impl(S))
+        self.assertEqual(sdc_func(S), test_impl(S))
+        self.assertIsInstance(numba.typeof(S), SeriesType)
 
-    @skip_numba_jit
     def test_pass_series_str(self):
         def test_impl(A):
             return (A == 'a').sum()
@@ -358,6 +376,7 @@ def test_impl(A):
 
         S = pd.Series(['a', 'b', 'c'], name='A')
         self.assertEqual(hpat_func(S), test_impl(S))
+        self.assertIsInstance(numba.typeof(S), SeriesType)
 
     def test_pass_series_all_indexes(self):
         def test_impl(A):
@@ -378,6 +397,25 @@ def test_impl(A):
                 S = pd.Series(np.arange(n), index, name='A')
                 pd.testing.assert_series_equal(hpat_func(S), test_impl(S))
 
+    def test_pass_series_data_layouts(self):
+        def test_impl(S):
+            vals = S.values
+            return vals[0], vals[-1]
+        sdc_func = self.jit(test_impl)
+
+        n = 10
+        series_to_test = [
+            pd.Series(np.arange(n)),       # 'C' layout
+            pd.Series(np.arange(n))[::2],  # 'A' layout
+            # no 'F' layout for Series
+        ]
+
+        for s in series_to_test:
+            with self.subTest(layout=numba.typeof(s).data.layout):
+                result = sdc_func(s)
+                result_ref = test_impl(s)
+                self.assertEqual(result, result_ref)
+
     def test_series_getattr_size(self):
         def test_impl(S):
             return S.size