23
23
import ctypes
24
24
from typing import Any , Optional , Tuple , Dict , Iterable , Sequence
25
25
26
+ from io import StringIO
27
+
26
28
import pandas as pd
27
29
import numpy as np
28
30
import pandas ._testing as tm
@@ -70,6 +72,8 @@ def _from_dataframe(df : DataFrameObject) -> pd.DataFrame:
70
72
columns [name ] = convert_column_to_ndarray (col )
71
73
elif col .dtype [0 ] == _k .CATEGORICAL :
72
74
columns [name ] = convert_categorical_column (col )
75
+ elif col .dtype [0 ] == _k .STRING :
76
+ columns [name ] = convert_string_column (col )
73
77
else :
74
78
raise NotImplementedError (f"Data type { col .dtype [0 ]} not handled yet" )
75
79
@@ -88,7 +92,7 @@ class _DtypeKind(enum.IntEnum):
88
92
89
93
def convert_column_to_ndarray (col : ColumnObject ) -> np .ndarray :
90
94
"""
91
- Convert an int, uint, float or bool column to a numpy array
95
+ Convert an int, uint, float or bool column to a numpy array.
92
96
"""
93
97
if col .offset != 0 :
94
98
raise NotImplementedError ("column.offset > 0 not handled yet" )
@@ -131,7 +135,7 @@ def buffer_to_ndarray(_buffer, _dtype) -> np.ndarray:
131
135
132
136
def convert_categorical_column (col : ColumnObject ) -> pd .Series :
133
137
"""
134
- Convert a categorical column to a Series instance
138
+ Convert a categorical column to a Series instance.
135
139
"""
136
140
ordered , is_dict , mapping = col .describe_categorical
137
141
if not is_dict :
@@ -160,9 +164,19 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
160
164
return series
161
165
162
166
167
+ def convert_string_column (col : ColumnObject ) -> pd .Series :
168
+ """
169
+ Convert a string column to a Series instance.
170
+ """
171
+ buffer , bdtype = col .get_data_buffer ()
172
+ offsets , odtype = col .get_offsets ()
173
+
174
+ # TODO: implementation
175
+
176
+
163
177
def __dataframe__ (cls , nan_as_null : bool = False ) -> dict :
164
178
"""
165
- The public method to attach to pd.DataFrame
179
+ The public method to attach to pd.DataFrame.
166
180
167
181
We'll attach it via monkeypatching here for demo purposes. If Pandas adopts
168
182
the protocol, this will be a regular method on pandas.DataFrame.
@@ -205,20 +219,20 @@ def __init__(self, x : np.ndarray) -> None:
205
219
@property
206
220
def bufsize (self ) -> int :
207
221
"""
208
- Buffer size in bytes
222
+ Buffer size in bytes.
209
223
"""
210
224
return self ._x .size * self ._x .dtype .itemsize
211
225
212
226
@property
213
227
def ptr (self ) -> int :
214
228
"""
215
- Pointer to start of the buffer as an integer
229
+ Pointer to start of the buffer as an integer.
216
230
"""
217
231
return self ._x .__array_interface__ ['data' ][0 ]
218
232
219
233
def __dlpack__ (self ):
220
234
"""
221
- DLPack not implemented in NumPy yet, so leave it out here
235
+ DLPack not implemented in NumPy yet, so leave it out here.
222
236
"""
223
237
raise NotImplementedError ("__dlpack__" )
224
238
@@ -242,9 +256,10 @@ class _PandasColumn:
242
256
A column object, with only the methods and properties required by the
243
257
interchange protocol defined.
244
258
245
- A column can contain one or more chunks. Each chunk can contain either one
246
- or two buffers - one data buffer and (depending on null representation) it
247
- may have a mask buffer.
259
+ A column can contain one or more chunks. Each chunk can contain up to three
260
+ buffers - a data buffer, a mask buffer (depending on null representation),
261
+ and an offsets buffer (if variable-size binary; e.g., variable-length
262
+ strings).
248
263
249
264
Note: this Column object can only be produced by ``__dataframe__``, so
250
265
doesn't need its own version or ``__column__`` protocol.
@@ -322,7 +337,7 @@ def dtype(self) -> Tuple[enum.IntEnum, int, str, str]:
322
337
323
338
def _dtype_from_pandasdtype (self , dtype ) -> Tuple [enum .IntEnum , int , str , str ]:
324
339
"""
325
- See `self.dtype` for details
340
+ See `self.dtype` for details.
326
341
"""
327
342
# Note: 'c' (complex) not handled yet (not in array spec v1).
328
343
# 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled
@@ -340,7 +355,7 @@ def _dtype_from_pandasdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]:
340
355
raise ValueError (f"Data type { dtype } not supported by exchange"
341
356
"protocol" )
342
357
343
- if kind not in (_k .INT , _k .UINT , _k .FLOAT , _k .BOOL , _k .CATEGORICAL ):
358
+ if kind not in (_k .INT , _k .UINT , _k .FLOAT , _k .BOOL , _k .CATEGORICAL , _k . STRING ):
344
359
raise NotImplementedError (f"Data type { dtype } not handled yet" )
345
360
346
361
bitwidth = dtype .itemsize * 8
@@ -407,13 +422,15 @@ def describe_null(self) -> Tuple[int, Any]:
407
422
null = 1 # np.datetime64('NaT')
408
423
elif kind in (_k .INT , _k .UINT , _k .BOOL ):
409
424
# TODO: check if extension dtypes are used once support for them is
410
- # implemented in this procotol code
425
+ # implemented in this protocol code
411
426
null = 0 # integer and boolean dtypes are non-nullable
412
427
elif kind == _k .CATEGORICAL :
413
428
# Null values for categoricals are stored as `-1` sentinel values
414
429
# in the category data (e.g., `col.values.codes` is int8 np.ndarray)
415
430
null = 2
416
431
value = - 1
432
+ elif kind == _k .STRING :
433
+ null = 1 # np.nan (object dtype)
417
434
else :
418
435
raise NotImplementedError (f'Data type { self .dtype } not yet supported' )
419
436
@@ -442,7 +459,7 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasColumn
442
459
443
460
def get_data_buffer (self ) -> Tuple [_PandasBuffer , Any ]: # Any is for self.dtype tuple
444
461
"""
445
- Return the buffer containing the data.
462
+ Return the buffer containing the data and the buffer's associated dtype.
446
463
"""
447
464
_k = _DtypeKind
448
465
if self .dtype [0 ] in (_k .INT , _k .UINT , _k .FLOAT , _k .BOOL ):
@@ -452,14 +469,19 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype
452
469
codes = self ._col .values .codes
453
470
buffer = _PandasBuffer (codes )
454
471
dtype = self ._dtype_from_pandasdtype (codes .dtype )
472
+ elif self .dtype [0 ] == _k .STRING :
473
+ buffer = _PandasBuffer (self ._col .to_numpy ())
474
+ bdtype = buffer .dtype ; # should be object dtype
475
+ dtype = (_k .STRING , bdtype .itemsize * 8 , '|U' , bdtype .byteorder )
455
476
else :
456
477
raise NotImplementedError (f"Data type { self ._col .dtype } not handled yet" )
457
478
458
479
return buffer , dtype
459
480
460
- def get_mask (self ) -> _PandasBuffer :
481
+ def get_mask (self ) -> Tuple [ _PandasBuffer , Any ] :
461
482
"""
462
- Return the buffer containing the mask values indicating missing data.
483
+ Return the buffer containing the mask values indicating missing data and
484
+ the buffer's associated dtype.
463
485
464
486
Raises RuntimeError if null representation is not a bit or byte mask.
465
487
"""
@@ -473,6 +495,23 @@ def get_mask(self) -> _PandasBuffer:
473
495
474
496
raise RuntimeError (msg )
475
497
498
+ def get_offsets (self ) -> Tuple [_PandasBuffer , Any ]:
499
+ """
500
+ Return the buffer containing the offset values for variable-size binary
501
+ data (e.g., variable-length strings) and the buffer's associated dtype.
502
+
503
+ Raises RuntimeError if the data buffer does not have an associated
504
+ offsets buffer.
505
+ """
506
+ _k = _DtypeKind
507
+ if self .dtype [0 ] == _k .STRING :
508
+ # TODO: implementation => we need to manually create the offsets array
509
+
510
+ else :
511
+ raise RuntimeError ("This column has a fixed-length dtype so does not have an offsets buffer" )
512
+
513
+ return buffer , dtype
514
+
476
515
477
516
class _PandasDataFrame :
478
517
"""
0 commit comments