Redesign DataFrame structure by akharche · Pull Request #817 · IntelPython/sdc · GitHub
[go: up one dir, main page]

Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Redesign DataFrame structure #817

Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions sdc/datatypes/hpat_pandas_dataframe_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,13 +416,15 @@ def sdc_pandas_dataframe_append_impl(df, other, _func_name, ignore_index, indexe
# return pandas.Series([result_A, result_B], ['A', 'B'])


def _dataframe_reduce_columns_codegen(func_name, func_params, series_params, columns):
def _dataframe_reduce_columns_codegen(func_name, func_params, series_params, columns, df_structure):
result_name_list = []
joined = ', '.join(func_params)
func_lines = [f'def _df_{func_name}_impl({joined}):']
for i, c in enumerate(columns):
type_id = df_structure[c].type_id
col_id = df_structure[c].col_type_id
result_c = f'result_{i}'
func_lines += [f' series_{i} = pandas.Series({func_params[0]}._data[{i}])',
func_lines += [f' series_{i} = pandas.Series({func_params[0]}._data[{type_id}][{col_id}])',
f' {result_c} = series_{i}.{func_name}({series_params})']
result_name_list.append(result_c)
all_results = ', '.join(result_name_list)
Expand All @@ -449,7 +451,8 @@ def sdc_pandas_dataframe_reduce_columns(df, func_name, params, ser_params):

df_func_name = f'_df_{func_name}_impl'

func_text, global_vars = _dataframe_reduce_columns_codegen(func_name, all_params, s_par, df.columns)
func_text, global_vars = _dataframe_reduce_columns_codegen(func_name, all_params, s_par, df.columns,
df.df_structure)
loc_vars = {}
exec(func_text, global_vars, loc_vars)
_reduce_impl = loc_vars[df_func_name]
Expand Down
49 changes: 43 additions & 6 deletions sdc/hiframes/pd_dataframe_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import operator

import numba
from typing import NamedTuple
from numba import types, cgutils
from numba.extending import (models, register_model, lower_cast, infer_getattr,
type_callable, infer, overload, intrinsic,
Expand All @@ -52,6 +53,11 @@ def generic_resolve(self, df, attr):
return SeriesType(arr_typ.dtype, arr_typ, df.index, True)


class ColumnId(NamedTuple):
    """Position of one DataFrame column inside the type-grouped data storage.

    Columns are stored as a tuple of lists, one list per distinct column
    type, so a column is addressed as ``data[type_id][col_type_id]``.
    """

    # Index of the per-type list that holds this column.
    type_id: int
    # Index of this column within that per-type list.
    col_type_id: int


@intrinsic
def init_dataframe(typingctx, *args):
"""Create a DataFrame with provided data, index and columns values.
Expand All @@ -66,6 +72,29 @@ def init_dataframe(typingctx, *args):
index_typ = args[n_cols]
column_names = tuple(a.literal_value for a in args[n_cols + 1:])

# Define df structure, map column name to column position ex. {'A': (0,0), 'B': (1,0), 'C': (0,1)}
df_structure = {}
# Store unique types of columns ex. {'int64': (0, [0, 2]), 'float64': (1, [1])}
data_typs_map = {}
types_order = []
type_id = 0
for col_id, col_typ in enumerate(data_typs):
col_name = column_names[col_id]

if col_typ not in data_typs_map:
data_typs_map[col_typ] = (type_id, [col_id])
# The first column in each type always has 0 index
df_structure[col_name] = ColumnId(type_id, 0)
types_order.append(col_typ)
else:
# Get index of column in list of types
type_id, col_indices = data_typs_map[col_typ]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`type_id` looks like a counter. Is it OK that we overwrite it here? Consider using a different name for it in the else block and incrementing `type_id` only inside the if block.

col_idx_list = len(col_indices)
df_structure[col_name] = ColumnId(type_id, col_idx_list)
col_indices.append(col_id)

type_id += 1

def codegen(context, builder, signature, args):
in_tup = args[0]
data_arrs = [builder.extract_value(in_tup, i) for i in range(n_cols)]
Expand All @@ -76,15 +105,23 @@ def codegen(context, builder, signature, args):
dataframe = cgutils.create_struct_proxy(
signature.return_type)(context, builder)

data_list_type = [types.List(typ) for typ in types_order]

data_lists = []
for typ_id, typ in enumerate(data_typs_map.keys()):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could the order of the lists in the tuple be a problem here?

data_list_typ = context.build_list(builder, data_list_type[typ_id],
[data_arrs[data_id] for data_id in data_typs_map[typ][1]])
data_lists.append(data_list_typ)

data_tup = context.make_tuple(
builder, types.Tuple(data_typs), data_arrs)
column_tup = context.make_tuple(
builder, types.UniTuple(string_type, n_cols), column_strs)
zero = context.get_constant(types.int8, 0)
builder, types.Tuple(data_list_type), data_lists)

col_list_type = types.List(string_type)
column_list = context.build_list(builder, col_list_type, column_strs)

dataframe.data = data_tup
dataframe.index = index
dataframe.columns = column_tup
dataframe.columns = column_list
dataframe.parent = context.get_constant_null(types.pyobject)

# increase refcount of stored values
Expand All @@ -97,7 +134,7 @@ def codegen(context, builder, signature, args):

return dataframe._getvalue()

ret_typ = DataFrameType(data_typs, index_typ, column_names)
ret_typ = DataFrameType(data_typs, index_typ, column_names, df_structure=df_structure)
sig = signature(ret_typ, types.Tuple(args))
return sig, codegen

Expand Down
16 changes: 12 additions & 4 deletions sdc/hiframes/pd_dataframe_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class DataFrameType(types.Type): # TODO: IterableType over column names
"""Temporary type class for DataFrame objects.
"""

def __init__(self, data=None, index=None, columns=None, has_parent=False):
def __init__(self, data=None, index=None, columns=None, has_parent=False, df_structure=None):
self.data = data
if index is None:
index = types.none
Expand All @@ -45,6 +45,7 @@ def __init__(self, data=None, index=None, columns=None, has_parent=False):
# keeping whether it is unboxed from Python to enable reflection of new
# columns
self.has_parent = has_parent
self.df_structure = df_structure
super(DataFrameType, self).__init__(
name="dataframe({}, {}, {}, {})".format(data, index, columns, has_parent))

Expand Down Expand Up @@ -85,11 +86,18 @@ def is_precise(self):
@register_model(DataFrameType)
class DataFrameModel(models.StructModel):
def __init__(self, dmm, fe_type):
n_cols = len(fe_type.columns)
types_unique = set()
df_types = []
for col_type in fe_type.data:
if col_type in types_unique:
continue
types_unique.add(col_type)
df_types.append(col_type)

members = [
('data', types.Tuple(fe_type.data)),
('data', types.Tuple([types.List(typ) for typ in df_types])),
('index', fe_type.index),
('columns', types.UniTuple(string_type, n_cols)),
('columns', types.List(string_type)),
('parent', types.pyobject),
]
super(DataFrameModel, self).__init__(dmm, fe_type, members)
Expand Down
0