Merge branch 'main' into more-tests · iskode/dataframe-api@7d7dcee · GitHub
[go: up one dir, main page]

Skip to content

Commit 7d7dcee

Browse files
committed
Merge branch 'main' into more-tests
update with recent merges.
2 parents 011dd5d + 698989b commit 7d7dcee

File tree

3 files changed

+379
-80
lines changed

3 files changed

+379
-80
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@
33
This repository contains documents, tooling and other content related to the
44
API standard for dataframes.
55

6+
- [Request For Comments (RFC) for the dataframe protocol (blog post)](https://data-apis.org/blog/dataframe_protocol_rfc/)

protocol/dataframe_protocol.py

Lines changed: 62 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -83,14 +83,14 @@ class Buffer:
8383
@property
8484
def bufsize(self) -> int:
8585
"""
86-
Buffer size in bytes
86+
Buffer size in bytes.
8787
"""
8888
pass
8989

9090
@property
9191
def ptr(self) -> int:
9292
"""
93-
Pointer to start of the buffer as an integer
93+
Pointer to start of the buffer as an integer.
9494
"""
9595
pass
9696

@@ -133,9 +133,10 @@ class Column:
133133
A column object, with only the methods and properties required by the
134134
interchange protocol defined.
135135
136-
A column can contain one or more chunks. Each chunk can contain either one
137-
or two buffers - one data buffer and (depending on null representation) it
138-
may have a mask buffer.
136+
A column can contain one or more chunks. Each chunk can contain up to three
137+
buffers - a data buffer, a mask buffer (depending on null representation),
138+
and an offsets buffer (if variable-size binary; e.g., variable-length
139+
strings).
139140
140141
TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
141142
Instead, it seems to use "children" for both columns with a bit mask,
@@ -185,7 +186,7 @@ def size(self) -> Optional[int]:
185186
@property
186187
def offset(self) -> int:
187188
"""
188-
Offset of first element
189+
Offset of first element.
189190
190191
May be > 0 if using chunks; for example for a column with N chunks of
191192
equal size M (only the last chunk may be shorter),
@@ -196,7 +197,7 @@ def offset(self) -> int:
196197
@property
197198
def dtype(self) -> Tuple[enum.IntEnum, int, str, str]:
198199
"""
199-
Dtype description as a tuple ``(kind, bit-width, format string, endianness)``
200+
Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.
200201
201202
Kind :
202203
@@ -272,7 +273,9 @@ def describe_null(self) -> Tuple[int, Any]:
272273
- 3 : bit mask
273274
- 4 : byte mask
274275
275-
Value : if kind is "sentinel value", the actual value. None otherwise.
276+
Value : if kind is "sentinel value", the actual value. If kind is a bit
277+
mask or a byte mask, the value (0 or 1) indicating a missing value. None
278+
otherwise.
276279
"""
277280
pass
278281

@@ -285,6 +288,13 @@ def null_count(self) -> Optional[int]:
285288
"""
286289
pass
287290

291+
@property
292+
def metadata(self) -> Dict[str, Any]:
293+
"""
294+
The metadata for the column. See `DataFrame.metadata` for more details.
295+
"""
296+
pass
297+
288298
def num_chunks(self) -> int:
289299
"""
290300
Return the number of chunks the column consists of.
@@ -299,24 +309,33 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[Column]:
299309
"""
300310
pass
301311

302-
def get_data_buffer(self) -> Buffer:
303-
"""
304-
Return the buffer containing the data.
312+
def get_buffers(self) -> dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], Optional[Tuple[Buffer, Any]]]:
305313
"""
306-
pass
314+
Return a dictionary containing the underlying buffers.
307315
308-
def get_mask(self) -> Buffer:
309-
"""
310-
Return the buffer containing the mask values indicating missing data.
316+
The returned dictionary has the following contents:
311317
312-
Raises RuntimeError if null representation is not a bit or byte mask.
318+
- "data": a two-element tuple whose first element is a buffer
319+
containing the data and whose second element is the data
320+
buffer's associated dtype.
321+
- "validity": a two-element tuple whose first element is a buffer
322+
containing mask values indicating missing data and
323+
whose second element is the mask value buffer's
324+
associated dtype. None if the null representation is
325+
not a bit or byte mask.
326+
- "offsets": a two-element tuple whose first element is a buffer
327+
containing the offset values for variable-size binary
328+
data (e.g., variable-length strings) and whose second
329+
element is the offsets buffer's associated dtype. None
330+
if the data buffer does not have an associated offsets
331+
buffer.
313332
"""
314333
pass
315334

316335
# def get_children(self) -> Iterable[Column]:
317336
# """
318337
# Children columns underneath the column, each object in this iterator
319-
# must adhere to the column specification
338+
# must adhere to the column specification.
320339
# """
321340
# pass
322341

@@ -335,24 +354,44 @@ class DataFrame:
335354
``__dataframe__`` method of a public data frame class in a library adhering
336355
to the dataframe interchange protocol specification.
337356
"""
338-
def __dataframe__(self, nan_as_null : bool = False) -> dict:
357+
def __dataframe__(self, nan_as_null : bool = False,
358+
allow_copy : bool = True) -> dict:
339359
"""
340-
Produces a dictionary object following the dataframe protocol spec
360+
Produces a dictionary object following the dataframe protocol specification.
341361
342362
``nan_as_null`` is a keyword intended for the consumer to tell the
343363
producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
344364
It is intended for cases where the consumer does not support the bit
345365
mask or byte mask that is the producer's native representation.
366+
367+
``allow_copy`` is a keyword that defines whether or not the library is
368+
allowed to make a copy of the data. For example, copying data would be
369+
necessary if a library supports strided buffers, given that this protocol
370+
specifies contiguous buffers.
346371
"""
347372
self._nan_as_null = nan_as_null
373+
self._allow_zero_zopy = allow_copy
348374
return {
349375
"dataframe": self, # DataFrame object adhering to the protocol
350376
"version": 0 # Version number of the protocol
351377
}
352378

379+
@property
380+
def metadata(self) -> Dict[str, Any]:
381+
"""
382+
The metadata for the data frame, as a dictionary with string keys. The
383+
contents of `metadata` may be anything; they are meant for a library
384+
to store information that it needs to, e.g., roundtrip losslessly or
385+
for two implementations to share data that is not (yet) part of the
386+
interchange protocol specification. For avoiding collisions with other
387+
entries, please name the keys with the name of the library
388+
followed by a period and the desired name, e.g., ``pandas.indexcol``.
389+
"""
390+
pass
391+
353392
def num_columns(self) -> int:
354393
"""
355-
Return the number of columns in the DataFrame
394+
Return the number of columns in the DataFrame.
356395
"""
357396
pass
358397

@@ -361,13 +400,13 @@ def num_rows(self) -> Optional[int]:
361400
# why include it if it may be None - what do we expect consumers
362401
# to do here?
363402
"""
364-
Return the number of rows in the DataFrame, if available
403+
Return the number of rows in the DataFrame, if available.
365404
"""
366405
pass
367406

368407
def num_chunks(self) -> int:
369408
"""
370-
Return the number of chunks the DataFrame consists of
409+
Return the number of chunks the DataFrame consists of.
371410
"""
372411
pass
373412

@@ -397,7 +436,7 @@ def get_columns(self) -> Iterable[Column]:
397436

398437
def select_columns(self, indices: Sequence[int]) -> DataFrame:
399438
"""
400-
Create a new DataFrame by selecting a subset of columns by index
439+
Create a new DataFrame by selecting a subset of columns by index.
401440
"""
402441
pass
403442

@@ -417,4 +456,3 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[DataFrame]:
417456
before yielding it.
418457
"""
419458
pass
420-

0 commit comments

Comments
 (0)
0