This repository was archived by the owner on Feb 2, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 65
Redesign DataFrame structure #817
Merged
AlexanderKalistratov
merged 9 commits into
IntelPython:feature/dataframe_model_refactoring
from
akharche:new_df_structure
Apr 30, 2020
Merged
Changes from 3 commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
83a4630
Redesign DataFrame structure
akharche efa54fc
Fix pep
akharche 677a673
Improved named tuple, added types ordering
akharche 2712d76
Fix incrementing
akharche 6468e82
Skip tests expected to fail
akharche 6b86a95
Merge branch 'feature/dataframe_model_refactoring' of https://github.…
akharche 0048c7c
Skip blocks of tests expected to fail due to new structure
akharche 6a42d51
Skip examples running + add decorator dfRefactoringNotImplemented
akharche 926c123
True negatives fix
akharche File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,7 @@ | |
import operator | ||
|
||
import numba | ||
from typing import NamedTuple | ||
from numba import types, cgutils | ||
from numba.extending import (models, register_model, lower_cast, infer_getattr, | ||
type_callable, infer, overload, intrinsic, | ||
|
@@ -52,6 +53,11 @@ def generic_resolve(self, df, attr): | |
return SeriesType(arr_typ.dtype, arr_typ, df.index, True) | ||
|
||
|
||
class ColumnId(NamedTuple): | ||
type_id: int | ||
col_type_id: int | ||
|
||
|
||
@intrinsic | ||
def init_dataframe(typingctx, *args): | ||
"""Create a DataFrame with provided data, index and columns values. | ||
|
@@ -66,6 +72,29 @@ def init_dataframe(typingctx, *args): | |
index_typ = args[n_cols] | ||
column_names = tuple(a.literal_value for a in args[n_cols + 1:]) | ||
|
||
# Define df structure, map column name to column position ex. {'A': (0,0), 'B': (1,0), 'C': (0,1)} | ||
df_structure = {} | ||
# Store unique types of columns ex. {'int64': (0, [0, 2]), 'float64': (1, [1])} | ||
data_typs_map = {} | ||
types_order = [] | ||
type_id = 0 | ||
for col_id, col_typ in enumerate(data_typs): | ||
col_name = column_names[col_id] | ||
|
||
if col_typ not in data_typs_map: | ||
data_typs_map[col_typ] = (type_id, [col_id]) | ||
# The first column in each type always has 0 index | ||
df_structure[col_name] = ColumnId(type_id, 0) | ||
types_order.append(col_typ) | ||
else: | ||
# Get index of column in list of types | ||
type_id, col_indices = data_typs_map[col_typ] | ||
col_idx_list = len(col_indices) | ||
df_structure[col_name] = ColumnId(type_id, col_idx_list) | ||
col_indices.append(col_id) | ||
|
||
type_id += 1 | ||
|
||
def codegen(context, builder, signature, args): | ||
in_tup = args[0] | ||
data_arrs = [builder.extract_value(in_tup, i) for i in range(n_cols)] | ||
|
@@ -76,15 +105,23 @@ def codegen(context, builder, signature, args): | |
dataframe = cgutils.create_struct_proxy( | ||
signature.return_type)(context, builder) | ||
|
||
data_list_type = [types.List(typ) for typ in types_order] | ||
|
||
data_lists = [] | ||
for typ_id, typ in enumerate(data_typs_map.keys()): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is there no problem with the order of lists in the tuple? |
||
data_list_typ = context.build_list(builder, data_list_type[typ_id], | ||
[data_arrs[data_id] for data_id in data_typs_map[typ][1]]) | ||
data_lists.append(data_list_typ) | ||
|
||
data_tup = context.make_tuple( | ||
builder, types.Tuple(data_typs), data_arrs) | ||
column_tup = context.make_tuple( | ||
builder, types.UniTuple(string_type, n_cols), column_strs) | ||
zero = context.get_constant(types.int8, 0) | ||
builder, types.Tuple(data_list_type), data_lists) | ||
|
||
col_list_type = types.List(string_type) | ||
column_list = context.build_list(builder, col_list_type, column_strs) | ||
|
||
dataframe.data = data_tup | ||
dataframe.index = index | ||
dataframe.columns = column_tup | ||
dataframe.columns = column_list | ||
dataframe.parent = context.get_constant_null(types.pyobject) | ||
|
||
# increase refcount of stored values | ||
|
@@ -97,7 +134,7 @@ def codegen(context, builder, signature, args): | |
|
||
return dataframe._getvalue() | ||
|
||
ret_typ = DataFrameType(data_typs, index_typ, column_names) | ||
ret_typ = DataFrameType(data_typs, index_typ, column_names, df_structure=df_structure) | ||
sig = signature(ret_typ, types.Tuple(args)) | ||
return sig, codegen | ||
|
||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`type_id` looks like a counter. Is it OK that we overwrite it here? Maybe use another name for `type_id` in the else-block and increment `type_id` only inside the if-block.