Refactor to return a dictionary of buffers · iskode/dataframe-api@9ec830c · GitHub
[go: up one dir, main page]

Skip to content

Commit 9ec830c

Browse files
committed
Refactor to return a dictionary of buffers
1 parent 0d0e94b commit 9ec830c

File tree

1 file changed

+46
-8
lines changed

1 file changed

+46
-8
lines changed

protocol/pandas_implementation.py

Lines changed: 46 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray:
9999
raise NotImplementedError("Null values represented as masks or "
100100
"sentinel values not handled yet")
101101

102-
_buffer, _dtype = col.get_data_buffer()
102+
_buffer, _dtype = col.get_buffers()["data"]
103103
return buffer_to_ndarray(_buffer, _dtype)
104104

105105

@@ -143,7 +143,7 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
143143
# categories = col._col.values.categories.values
144144
# codes = col._col.values.codes
145145
categories = np.asarray(list(mapping.values()))
146-
codes_buffer, codes_dtype = col.get_data_buffer()
146+
codes_buffer, codes_dtype = col.get_buffers()["data"]
147147
codes = buffer_to_ndarray(codes_buffer, codes_dtype)
148148
values = categories[codes]
149149

@@ -166,14 +166,17 @@ def convert_string_column(col : ColumnObject) -> np.ndarray:
166166
"""
167167
Convert a string column to a NumPy array.
168168
"""
169+
# Retrieve the data buffers:
170+
buffers = col.get_buffers()
171+
169172
# Retrieve the data buffer containing the UTF-8 code units
170-
dbuffer, bdtype = col.get_data_buffer()
173+
dbuffer, bdtype = buffers["data"]
171174

172175
# Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string
173-
obuffer, odtype = col.get_offsets_buffer()
176+
obuffer, odtype = buffers["offsets"]
174177

175178
# Retrieve the mask buffer indicating the presence of missing values:
176-
mbuffer, mdtype = col.get_validity_buffer()
179+
mbuffer, mdtype = buffers["validity"]
177180

178181
# Retrieve the missing value encoding:
179182
null_value = col.describe_null[1]
@@ -500,7 +503,42 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasColumn
500503
"""
501504
return (self,)
502505

503-
def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype tuple
506+
def get_buffers(self) -> Dict[str, Any]:
    """
    Return a dictionary containing the underlying buffers.

    The returned dictionary has the following contents:

        - "data": a two-element tuple whose first element is a buffer
                  containing the data and whose second element is the data
                  buffer's associated dtype.
        - "validity": a two-element tuple whose first element is a buffer
                      containing mask values indicating missing data and
                      whose second element is the mask value buffer's
                      associated dtype. None if the null representation is
                      not a bit or byte mask (i.e. whenever
                      ``_get_validity_buffer`` raises).
        - "offsets": a two-element tuple whose first element is a buffer
                     containing the offset values for variable-size binary
                     data (e.g., variable-length strings) and whose second
                     element is the offsets buffer's associated dtype. None
                     if the data buffer does not have an associated offsets
                     buffer (i.e. whenever ``_get_offsets_buffer`` raises).

    Errors raised while retrieving the "data" buffer are not caught and
    propagate to the caller.
    """
    buffers = {}
    buffers["data"] = self._get_data_buffer()

    # The private getters signal "no such buffer" by raising. Catch
    # ``Exception`` rather than using a bare ``except`` so that
    # KeyboardInterrupt / SystemExit are never silently swallowed.
    try:
        buffers["validity"] = self._get_validity_buffer()
    except Exception:
        buffers["validity"] = None

    try:
        buffers["offsets"] = self._get_offsets_buffer()
    except Exception:
        buffers["offsets"] = None

    return buffers
540+
541+
def _get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype tuple
504542
"""
505543
Return the buffer containing the data and the buffer's associated dtype.
506544
"""
@@ -532,7 +570,7 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype
532570

533571
return buffer, dtype
534572

535-
def get_validity_buffer(self) -> Tuple[_PandasBuffer, Any]:
573+
def _get_validity_buffer(self) -> Tuple[_PandasBuffer, Any]:
536574
"""
537575
Return the buffer containing the mask values indicating missing data and
538576
the buffer's associated dtype.
@@ -578,7 +616,7 @@ def get_validity_buffer(self) -> Tuple[_PandasBuffer, Any]:
578616

579617
raise RuntimeError(msg)
580618

581-
def get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]:
619+
def _get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]:
582620
"""
583621
Return the buffer containing the offset values for variable-size binary
584622
data (e.g., variable-length strings) and the buffer's associated dtype.

0 commit comments

Comments
 (0)
0