@@ -83,14 +83,14 @@ class Buffer:
83
83
@property
84
84
def bufsize (self ) -> int :
85
85
"""
86
- Buffer size in bytes
86
+ Buffer size in bytes.
87
87
"""
88
88
pass
89
89
90
90
@property
91
91
def ptr (self ) -> int :
92
92
"""
93
- Pointer to start of the buffer as an integer
93
+ Pointer to start of the buffer as an integer.
94
94
"""
95
95
pass
96
96
@@ -133,9 +133,10 @@ class Column:
133
133
A column object, with only the methods and properties required by the
134
134
interchange protocol defined.
135
135
136
- A column can contain one or more chunks. Each chunk can contain either one
137
- or two buffers - one data buffer and (depending on null representation) it
138
- may have a mask buffer.
136
+ A column can contain one or more chunks. Each chunk can contain up to three
137
+ buffers - a data buffer, a mask buffer (depending on null representation),
138
+ and an offsets buffer (if variable-size binary; e.g., variable-length
139
+ strings).
139
140
140
141
TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
141
142
Instead, it seems to use "children" for both columns with a bit mask,
@@ -185,7 +186,7 @@ def size(self) -> Optional[int]:
185
186
@property
186
187
def offset (self ) -> int :
187
188
"""
188
- Offset of first element
189
+ Offset of first element.
189
190
190
191
May be > 0 if using chunks; for example for a column with N chunks of
191
192
equal size M (only the last chunk may be shorter),
@@ -196,7 +197,7 @@ def offset(self) -> int:
196
197
@property
197
198
def dtype (self ) -> Tuple [enum .IntEnum , int , str , str ]:
198
199
"""
199
- Dtype description as a tuple ``(kind, bit-width, format string, endianness)``
200
+ Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.
200
201
201
202
Kind :
202
203
@@ -272,7 +273,9 @@ def describe_null(self) -> Tuple[int, Any]:
272
273
- 3 : bit mask
273
274
- 4 : byte mask
274
275
275
- Value : if kind is "sentinel value", the actual value. None otherwise.
276
+ Value : if kind is "sentinel value", the actual value. If kind is a bit
277
+ mask or a byte mask, the value (0 or 1) indicating a missing value. None
278
+ otherwise.
276
279
"""
277
280
pass
278
281
@@ -285,6 +288,13 @@ def null_count(self) -> Optional[int]:
285
288
"""
286
289
pass
287
290
291
+ @property
292
+ def metadata (self ) -> Dict [str , Any ]:
293
+ """
294
+ The metadata for the column. See `DataFrame.metadata` for more details.
295
+ """
296
+ pass
297
+
288
298
def num_chunks (self ) -> int :
289
299
"""
290
300
Return the number of chunks the column consists of.
@@ -299,24 +309,33 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[Column]:
299
309
"""
300
310
pass
301
311
302
- def get_data_buffer (self ) -> Buffer :
303
- """
304
- Return the buffer containing the data.
312
+ def get_buffers (self ) -> dict [Tuple [Buffer , Any ], Optional [Tuple [Buffer , Any ]], Optional [Tuple [Buffer , Any ]]]:
305
313
"""
306
- pass
314
+ Return a dictionary containing the underlying buffers.
307
315
308
- def get_mask (self ) -> Buffer :
309
- """
310
- Return the buffer containing the mask values indicating missing data.
316
+ The returned dictionary has the following contents:
311
317
312
- Raises RuntimeError if null representation is not a bit or byte mask.
318
+ - "data": a two-element tuple whose first element is a buffer
319
+ containing the data and whose second element is the data
320
+ buffer's associated dtype.
321
+ - "validity": a two-element tuple whose first element is a buffer
322
+ containing mask values indicating missing data and
323
+ whose second element is the mask value buffer's
324
+ associated dtype. None if the null representation is
325
+ not a bit or byte mask.
326
+ - "offsets": a two-element tuple whose first element is a buffer
327
+ containing the offset values for variable-size binary
328
+ data (e.g., variable-length strings) and whose second
329
+ element is the offsets buffer's associated dtype. None
330
+ if the data buffer does not have an associated offsets
331
+ buffer.
313
332
"""
314
333
pass
315
334
316
335
# def get_children(self) -> Iterable[Column]:
317
336
# """
318
337
# Children columns underneath the column, each object in this iterator
319
- # must adhere to the column specification
338
+ # must adhere to the column specification.
320
339
# """
321
340
# pass
322
341
@@ -335,24 +354,44 @@ class DataFrame:
335
354
``__dataframe__`` method of a public data frame class in a library adhering
336
355
to the dataframe interchange protocol specification.
337
356
"""
338
- def __dataframe__ (self , nan_as_null : bool = False ) -> dict :
357
+ def __dataframe__ (self , nan_as_null : bool = False ,
358
+ allow_copy : bool = True ) -> dict :
339
359
"""
340
- Produces a dictionary object following the dataframe protocol spec
360
+ Produces a dictionary object following the dataframe protocol specification.
341
361
342
362
``nan_as_null`` is a keyword intended for the consumer to tell the
343
363
producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
344
364
It is intended for cases where the consumer does not support the bit
345
365
mask or byte mask that is the producer's native representation.
366
+
367
+ ``allow_copy`` is a keyword that defines whether or not the library is
368
+ allowed to make a copy of the data. For example, copying data would be
369
+ necessary if a library supports strided buffers, given that this protocol
370
+ specifies contiguous buffers.
346
371
"""
347
372
self ._nan_as_null = nan_as_null
373
+ self ._allow_zero_zopy = allow_copy
348
374
return {
349
375
"dataframe" : self , # DataFrame object adhering to the protocol
350
376
"version" : 0 # Version number of the protocol
351
377
}
352
378
379
+ @property
380
+ def metadata (self ) -> Dict [str , Any ]:
381
+ """
382
+ The metadata for the data frame, as a dictionary with string keys. The
383
+ contents of `metadata` may be anything; they are meant for a library
384
+ to store information that it needs to, e.g., roundtrip losslessly or
385
+ for two implementations to share data that is not (yet) part of the
386
+ interchange protocol specification. To avoid collisions with other
387
+ entries, please name the keys with the name of the library
388
+ followed by a period and the desired name, e.g., ``pandas.indexcol``.
389
+ """
390
+ pass
391
+
353
392
def num_columns (self ) -> int :
354
393
"""
355
- Return the number of columns in the DataFrame
394
+ Return the number of columns in the DataFrame.
356
395
"""
357
396
pass
358
397
@@ -361,13 +400,13 @@ def num_rows(self) -> Optional[int]:
361
400
# why include it if it may be None - what do we expect consumers
362
401
# to do here?
363
402
"""
364
- Return the number of rows in the DataFrame, if available
403
+ Return the number of rows in the DataFrame, if available.
365
404
"""
366
405
pass
367
406
368
407
def num_chunks (self ) -> int :
369
408
"""
370
- Return the number of chunks the DataFrame consists of
409
+ Return the number of chunks the DataFrame consists of.
371
410
"""
372
411
pass
373
412
@@ -397,7 +436,7 @@ def get_columns(self) -> Iterable[Column]:
397
436
398
437
def select_columns (self , indices : Sequence [int ]) -> DataFrame :
399
438
"""
400
- Create a new DataFrame by selecting a subset of columns by index
439
+ Create a new DataFrame by selecting a subset of columns by index.
401
440
"""
402
441
pass
403
442
@@ -417,4 +456,3 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[DataFrame]:
417
456
before yielding it.
418
457
"""
419
458
pass
420
-
0 commit comments