8000 refactor: Switch explode node to use column offsets by TrevorBergeron · Pull Request #978 · googleapis/python-bigquery-dataframes · GitHub
[go: up one dir, main page]

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions bigframes/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,9 +479,8 @@ def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue:
for column_id in column_ids:
assert bigframes.dtypes.is_array_like(self.get_column_type(column_id))

return ArrayValue(
nodes.ExplodeNode(child=self.node, column_ids=tuple(column_ids))
)
offsets = tuple(self.get_offset_for_name(id) for id in column_ids)
return ArrayValue(nodes.ExplodeNode(child=self.node, column_ids=offsets))

def _uniform_sampling(self, fraction: float) -> ArrayValue:
"""Sampling the table on given fraction.
Expand All @@ -490,3 +489,6 @@ def _uniform_sampling(self, fraction: float) -> ArrayValue:
The row numbers of result is non-deterministic, avoid to use.
"""
return ArrayValue(nodes.RandomSampleNode(self.node, fraction))

def get_offset_for_name(self, name: str):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: get_name_offset ?

return self.schema.names.index(name)
11 changes: 8 additions & 3 deletions bigframes/core/compile/compiled.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,8 +401,9 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR:
columns=columns,
)

def explode(self, column_ids: typing.Sequence[str]) -> UnorderedIR:
def explode(self, offsets: typing.Sequence[int]) -> UnorderedIR:
table = self._to_ibis_expr()
column_ids = tuple(table.columns[offset] for offset in offsets)

# The offset array ensures null represents empty arrays after unnesting.
offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
Expand Down Expand Up @@ -712,16 +713,20 @@ def _uniform_sampling(self, fraction: float) -> OrderedIR:
ordering=self._ordering,
)

def explode(self, column_ids: typing.Sequence[str]) -> OrderedIR:
def explode(self, offsets: typing.Sequence[int]) -> OrderedIR:
table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True)
column_ids = tuple(table.columns[offset] for offset in offsets)

offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
offset_array = (
vendored_ibis_ops.GenerateArray(
ibis.greatest(
0,
ibis.least(
*[table[column_id].length() - 1 for column_id in column_ids]
*[
table[table.columns[offset]].length() - 1
for offset in offsets
]
),
)
)
Expand Down
9 changes: 6 additions & 3 deletions bigframes/core/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@
OVERHEAD_VARIABLES = 5


COL_OFFSET = int


@dataclass(frozen=True)
class BigFrameNode:
"""
Expand Down Expand Up @@ -826,7 +829,7 @@ def variables_introduced(self) -> int:

@dataclass(frozen=True)
class ExplodeNode(UnaryNode):
column_ids: typing.Tuple[str, ...]
column_ids: typing.Tuple[COL_OFFSET, ...]

@property
def row_preserving(self) -> bool:
Expand All @@ -844,9 +847,9 @@ def schema(self) -> schemata.ArraySchema:
self.child.schema.get_type(name).pyarrow_dtype.value_type
),
)
if name in self.column_ids
if offset in self.column_ids
else schemata.SchemaItem(name, self.child.schema.get_type(name))
for name in self.child.schema.names
for offset, name in enumerate(self.child.schema.names)
)
return schemata.ArraySchema(items)

Expand Down
0