8000 Add string support in various methods and add todos · iskode/dataframe-api@c8000f7 · GitHub

Commit c8000f7

Add string support in various methods and add todos
1 parent c7728e2 commit c8000f7

1 file changed: protocol/pandas_implementation.py (+54 −15 lines)
@@ -23,6 +23,8 @@
 import ctypes
 from typing import Any, Optional, Tuple, Dict, Iterable, Sequence
 
+from io import StringIO
+
 import pandas as pd
 import numpy as np
 import pandas._testing as tm
@@ -70,6 +72,8 @@ def _from_dataframe(df : DataFrameObject) -> pd.DataFrame:
             columns[name] = convert_column_to_ndarray(col)
         elif col.dtype[0] == _k.CATEGORICAL:
             columns[name] = convert_categorical_column(col)
+        elif col.dtype[0] == _k.STRING:
+            columns[name] = convert_string_column(col)
         else:
             raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet")
 
@@ -88,7 +92,7 @@ class _DtypeKind(enum.IntEnum):
 
 def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray:
     """
-    Convert an int, uint, float or bool column to a numpy array
+    Convert an int, uint, float or bool column to a numpy array.
     """
     if col.offset != 0:
         raise NotImplementedError("column.offset > 0 not handled yet")
@@ -131,7 +135,7 @@ def buffer_to_ndarray(_buffer, _dtype) -> np.ndarray:
 
 def convert_categorical_column(col : ColumnObject) -> pd.Series:
     """
-    Convert a categorical column to a Series instance
+    Convert a categorical column to a Series instance.
     """
     ordered, is_dict, mapping = col.describe_categorical
     if not is_dict:
@@ -160,9 +164,19 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
     return series
 
 
+def convert_string_column(col : ColumnObject) -> pd.Series:
+    """
+    Convert a string column to a Series instance.
+    """
+    buffer, bdtype = col.get_data_buffer()
+    offsets, odtype = col.get_offsets()
+
+    # TODO: implementation
+
+
 def __dataframe__(cls, nan_as_null : bool = False) -> dict:
     """
-    The public method to attach to pd.DataFrame
+    The public method to attach to pd.DataFrame.
 
     We'll attach it via monkeypatching here for demo purposes. If Pandas adopt
     the protocol, this will be a regular method on pandas.DataFrame.
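The ``convert_string_column`` stub introduced above leaves the actual reconstruction as a TODO. A minimal sketch of one possible completion, assuming the data buffer ends up holding the column's UTF-8 bytes and the offsets buffer holds int64 boundaries (the Arrow-style layout that ``get_offsets`` further down alludes to); the helper name and the dtype tuples passed to ``buffer_to_ndarray`` are illustrative, not part of the commit:

import pandas as pd

def convert_string_column_sketch(col) -> pd.Series:
    """
    Hypothetical completion of convert_string_column(): decode every string
    from a flat UTF-8 byte buffer using consecutive offset pairs. Handling of
    missing values (via the mask / describe_null info) is left out here.
    """
    _k = _DtypeKind
    dbuffer, _ = col.get_data_buffer()   # flat UTF-8 bytes of all strings
    obuffer, _ = col.get_offsets()       # len(col) + 1 int64 offsets

    data = buffer_to_ndarray(dbuffer, (_k.UINT, 8, 'C', '='))
    offsets = buffer_to_ndarray(obuffer, (_k.INT, 64, 'l', '='))

    # String i occupies data[offsets[i]:offsets[i+1]].
    strings = [
        bytes(data[start:end]).decode("utf-8")
        for start, end in zip(offsets[:-1], offsets[1:])
    ]
    return pd.Series(strings)
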
@@ -205,20 +219,20 @@ def __init__(self, x : np.ndarray) -> None:
     @property
     def bufsize(self) -> int:
         """
-        Buffer size in bytes
+        Buffer size in bytes.
         """
         return self._x.size * self._x.dtype.itemsize
 
     @property
     def ptr(self) -> int:
         """
-        Pointer to start of the buffer as an integer
+        Pointer to start of the buffer as an integer.
         """
         return self._x.__array_interface__['data'][0]
 
     def __dlpack__(self):
         """
-        DLPack not implemented in NumPy yet, so leave it out here
+        DLPack not implemented in NumPy yet, so leave it out here.
         """
         raise NotImplementedError("__dlpack__")
 
@@ -242,9 +256,10 @@ class _PandasColumn:
     A column object, with only the methods and properties required by the
     interchange protocol defined.
 
-    A column can contain one or more chunks. Each chunk can contain either one
-    or two buffers - one data buffer and (depending on null representation) it
-    may have a mask buffer.
+    A column can contain one or more chunks. Each chunk can contain up to three
+    buffers - a data buffer, a mask buffer (depending on null representation),
+    and an offsets buffer (if variable-size binary; e.g., variable-length
+    strings).
 
     Note: this Column object can only be produced by ``__dataframe__``, so
     doesn't need its own version or ``__column__`` protocol.
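To make the expanded docstring concrete: for a variable-length string column such as ``["pandas", None, "df"]``, the three buffers could relate roughly as below (purely illustrative values, assuming a validity mask where 1 means present):

import numpy as np

data    = np.frombuffer(b"pandasdf", dtype=np.uint8)  # all UTF-8 bytes, concatenated
offsets = np.array([0, 6, 6, 8], dtype=np.int64)      # string i is data[offsets[i]:offsets[i+1]]
mask    = np.array([1, 0, 1], dtype=np.uint8)         # 1 = present, 0 = missing

assert bytes(data[offsets[0]:offsets[1]]) == b"pandas"
assert offsets[1] == offsets[2]                       # the missing entry is zero-length
assert bytes(data[offsets[2]:offsets[3]]) == b"df"
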
@@ -322,7 +337,7 @@ def dtype(self) -> Tuple[enum.IntEnum, int, str, str]:
 
     def _dtype_from_pandasdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]:
         """
-        See `self.dtype` for details
+        See `self.dtype` for details.
         """
         # Note: 'c' (complex) not handled yet (not in array spec v1).
         # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled
@@ -340,7 +355,7 @@ def _dtype_from_pandasdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]:
             raise ValueError(f"Data type {dtype} not supported by exchange"
                              "protocol")
 
-        if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL):
+        if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL, _k.STRING):
            raise NotImplementedError(f"Data type {dtype} not handled yet")
 
        bitwidth = dtype.itemsize * 8
@@ -407,13 +422,15 @@ def describe_null(self) -> Tuple[int, Any]:
             null = 1  # np.datetime64('NaT')
         elif kind in (_k.INT, _k.UINT, _k.BOOL):
             # TODO: check if extension dtypes are used once support for them is
-            # implemented in this procotol code
+            # implemented in this protocol code
             null = 0  # integer and boolean dtypes are non-nullable
         elif kind == _k.CATEGORICAL:
             # Null values for categoricals are stored as `-1` sentinel values
             # in the category date (e.g., `col.values.codes` is int8 np.ndarray)
             null = 2
             value = -1
+        elif kind == _k.STRING:
+            null = 1  # np.nan (object dtype)
         else:
             raise NotImplementedError(f'Data type {self.dtype} not yet supported')
 
@@ -442,7 +459,7 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasColumn
 
     def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]:  # Any is for self.dtype tuple
         """
-        Return the buffer containing the data.
+        Return the buffer containing the data and the buffer's associated dtype.
         """
         _k = _DtypeKind
         if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
@@ -452,14 +469,19 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]:  # Any is for self.dtype
             codes = self._col.values.codes
             buffer = _PandasBuffer(codes)
             dtype = self._dtype_from_pandasdtype(codes.dtype)
+        elif self.dtype[0] == _k.STRING:
+            buffer = _PandasBuffer(self._col.to_numpy())
+            bdtype = buffer.dtype  # should be object dtype
+            dtype = (_k.STRING, bdtype.itemsize*8, '|U', bdtype.byteorder)
         else:
             raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
 
         return buffer, dtype
 
-    def get_mask(self) -> _PandasBuffer:
+    def get_mask(self) -> Tuple[_PandasBuffer, Any]:
         """
-        Return the buffer containing the mask values indicating missing data.
+        Return the buffer containing the mask values indicating missing data and
+        the buffer's associated dtype.
 
         Raises RuntimeError if null representation is not a bit or byte mask.
         """
@@ -473,6 +495,23 @@ def get_mask(self) -> _PandasBuffer:
 
         raise RuntimeError(msg)
 
+    def get_offsets(self) -> Tuple[_PandasBuffer, Any]:
+        """
+        Return the buffer containing the offset values for variable-size binary
+        data (e.g., variable-length strings) and the buffer's associated dtype.
+
+        Raises RuntimeError if the data buffer does not have an associated
+        offsets buffer.
+        """
+        _k = _DtypeKind
+        if self.dtype[0] == _k.STRING:
+            # TODO: implementation => we need to manually create the offsets array
+
+        else:
+            raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer")
+
+        return buffer, dtype
+
 
 class _PandasDataFrame:
     """
