diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py
index 00cf5b12..b200285c 100644
--- a/protocol/dataframe_protocol.py
+++ b/protocol/dataframe_protocol.py
@@ -335,7 +335,8 @@ class DataFrame:
     ``__dataframe__`` method of a public data frame class in a library adhering
     to the dataframe interchange protocol specification.
     """
-    def __dataframe__(self, nan_as_null : bool = False) -> dict:
+    def __dataframe__(self, nan_as_null : bool = False,
+                      allow_copy : bool = True) -> dict:
         """
         Produces a dictionary object following the dataframe protocol spec
 
@@ -343,8 +344,13 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict:
         producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
         It is intended for cases where the consumer does not support the bit
         mask or byte mask that is the producer's native representation.
+
+        ``allow_copy`` is a keyword that defines whether or not the library
+        is allowed to make a copy of the data, e.g. when the data is stored
+        in non-contiguous (strided) buffers. Default: ``True``.
         """
         self._nan_as_null = nan_as_null
+        self._allow_copy = allow_copy
         return {
             "dataframe": self,  # DataFrame object adhering to the protocol
             "version": 0        # Version number of the protocol
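For context, here is a minimal producer-side sketch of the semantics the new keyword is meant to have. The `export_buffer` helper is hypothetical and not part of this patch; it only illustrates the rule "hand out contiguous memory, copying only when `allow_copy` permits it":

```python
import numpy as np

# Hypothetical helper (not part of this patch) illustrating the intended
# semantics of ``allow_copy``.
def export_buffer(data: np.ndarray, allow_copy: bool = True) -> np.ndarray:
    if data.flags['C_CONTIGUOUS']:
        # Already contiguous: the export is always zero-copy.
        return data
    if not allow_copy:
        raise RuntimeError("Exports cannot be zero-copy in the case "
                           "of a non-contiguous buffer")
    # Copy permitted: make the data contiguous before exporting.
    return np.ascontiguousarray(data)
```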
diff --git a/protocol/dataframe_protocol_summary.md b/protocol/dataframe_protocol_summary.md
index 91583bc4..57152b12 100644
--- a/protocol/dataframe_protocol_summary.md
+++ b/protocol/dataframe_protocol_summary.md
@@ -40,8 +40,8 @@ libraries, the example above can change to:
 def get_df_module(df):
     """Utility function to support programming against a dataframe API"""
     if hasattr(df, '__dataframe_namespace__'):
-       # Retrieve the namespace
-       pdx = df.__dataframe_namespace__()
+        # Retrieve the namespace
+        pdx = df.__dataframe_namespace__()
     else:
         # Here we can raise an exception if we only want to support compliant dataframes,
         # or convert to our default choice of dataframe if we want to accept (e.g.) dicts
@@ -168,13 +168,12 @@ We'll also list some things that were discussed but are not requirements:
 3. Extension dtypes, i.e. a way to extend the set of dtypes that is explicitly
    support, are out of scope.
    _Rationale: complex to support, not used enough to justify that complexity._
-4. "virtual columns", i.e. columns for which the data is not yet in memory
-   because it uses lazy evaluation, are not supported other than through
-   letting the producer materialize the data in memory when the consumer
-   calls `__dataframe__`.
-   _Rationale: the full dataframe API will support this use case by
-   "programming to an interface"; this data interchange protocol is
-   fundamentally built around describing data in memory_.
+4. Support for strided storage in buffers.
+   _Rationale: this is supported by a subset of dataframes only, mainly those
+   that use NumPy arrays. In many real-world use cases, strided arrays will
+   force a copy at some point, so requiring a contiguous memory layout (and
+   hence an extra copy at the moment `__dataframe__` is used) is considered
+   a good trade-off for reduced implementation complexity._
 
 ### To be decided
 
@@ -245,7 +244,7 @@
 library that implements `__array__` must depend (optionally at least) on
 NumPy, and call a NumPy `ndarray` constructor itself from within
 `__array__`.
-### What is wrong with `.to_numpy?` and `.to_arrow()`? 
+### What is wrong with `.to_numpy()` and `.to_arrow()`?
 
 Such methods ask the object it is attached to to turn itself into a NumPy or
 Arrow array. Which means each library must have at least an optional
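To make the strided-storage trade-off above concrete, here is a small self-contained illustration (plain NumPy, not part of the patch) of why a column sliced out of a row-major 2-D array cannot be exported zero-copy as a contiguous buffer:

```python
import numpy as np

# A column of a row-major 2-D array is a strided view, not contiguous.
arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int64)
col = arr[:, 0]
print(col.strides)     # (24,): elements are 3 * 8 bytes apart

# Making it contiguous, as the protocol requires, forces a copy.
contig = np.ascontiguousarray(col)
print(contig.strides)  # (8,): one itemsize apart, i.e. contiguous
assert not np.shares_memory(col, contig)
```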
diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
index e3e3e62e..867f86dd 100644
--- a/protocol/pandas_implementation.py
+++ b/protocol/pandas_implementation.py
@@ -35,7 +35,8 @@
 ColumnObject = Any
 
 
-def from_dataframe(df : DataFrameObject) -> pd.DataFrame:
+def from_dataframe(df : DataFrameObject,
+                   allow_copy : bool = True) -> pd.DataFrame:
     """
     Construct a pandas DataFrame from ``df`` if it supports ``__dataframe__``
     """
@@ -46,7 +47,7 @@ def from_dataframe(df : DataFrameObject) -> pd.DataFrame:
     if not hasattr(df, '__dataframe__'):
         raise ValueError("`df` does not support __dataframe__")
 
-    return _from_dataframe(df.__dataframe__())
+    return _from_dataframe(df.__dataframe__(allow_copy=allow_copy))
 
 
 def _from_dataframe(df : DataFrameObject) -> pd.DataFrame:
@@ -160,7 +161,8 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
     return series
 
 
-def __dataframe__(cls, nan_as_null : bool = False) -> dict:
+def __dataframe__(cls, nan_as_null : bool = False,
+                  allow_copy : bool = True) -> dict:
     """
     The public method to attach to pd.DataFrame
 
@@ -171,8 +173,14 @@ def __dataframe__(cls, nan_as_null : bool = False) -> dict:
     producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
     This currently has no effect; once support for nullable extension
     dtypes is added, this value should be propagated to columns.
+
+    ``allow_copy`` is a keyword that defines whether or not the library is
+    allowed to make a copy of the data, e.g. for non-contiguous (strided)
+    buffers. If it is set to ``False`` and a copy would be required, a
+    ``RuntimeError`` is raised.
     """
-    return _PandasDataFrame(cls, nan_as_null=nan_as_null)
+    return _PandasDataFrame(
+        cls, nan_as_null=nan_as_null, allow_copy=allow_copy)
 
 
 # Monkeypatch the Pandas DataFrame class to support the interchange protocol
@@ -187,16 +195,18 @@ class _PandasBuffer:
     Data in the buffer is guaranteed to be contiguous in memory.
     """
 
-    def __init__(self, x : np.ndarray) -> None:
+    def __init__(self, x : np.ndarray, allow_copy : bool = True) -> None:
         """
         Handle only regular columns (= numpy arrays) for now.
         """
         if not x.strides == (x.dtype.itemsize,):
-            # Array is not contiguous - this is possible to get in Pandas,
-            # there was some discussion on whether to support it. Som extra
-            # complexity for libraries that don't support it (e.g. Arrow),
-            # but would help with numpy-based libraries like Pandas.
-            raise RuntimeError("Design needs fixing - non-contiguous buffer")
+            # The protocol does not support strided buffers, so a copy is
+            # necessary. If that is not allowed, we need to raise an exception.
+            if allow_copy:
+                x = x.copy()
+            else:
+                raise RuntimeError(
+                    "Exports cannot be zero-copy in the case of a non-contiguous buffer")
 
         # Store the numpy array in which the data resides as a private
         # attribute, so we can use it to retrieve the public attributes
@@ -251,7 +261,8 @@ class _PandasColumn:
 
     """
 
-    def __init__(self, column : pd.Series) -> None:
+    def __init__(self, column : pd.Series,
+                 allow_copy : bool = True) -> None:
         """
         Note: doesn't deal with extension arrays yet, just assume a regular
         Series/ndarray for now.
@@ -262,6 +273,7 @@ def __init__(self, column : pd.Series) -> None:
 
         # Store the column as a private attribute
         self._col = column
+        self._allow_copy = allow_copy
 
     @property
     def size(self) -> int:
@@ -446,11 +458,13 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]:  # Any is for self.dtype
         """
         _k = _DtypeKind
         if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
-            buffer = _PandasBuffer(self._col.to_numpy())
+            buffer = _PandasBuffer(
+                self._col.to_numpy(), allow_copy=self._allow_copy)
             dtype = self.dtype
         elif self.dtype[0] == _k.CATEGORICAL:
             codes = self._col.values.codes
-            buffer = _PandasBuffer(codes)
+            buffer = _PandasBuffer(
+                codes, allow_copy=self._allow_copy)
             dtype = self._dtype_from_pandasdtype(codes.dtype)
         else:
             raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
@@ -483,7 +497,8 @@ class _PandasDataFrame:
     ``pd.DataFrame.__dataframe__`` as objects with the methods and attributes
     defined on this class.
    """
-    def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None:
+    def __init__(self, df : pd.DataFrame, nan_as_null : bool = False,
+                 allow_copy : bool = True) -> None:
         """
         Constructor - an instance of this (private) class is returned from
         `pd.DataFrame.__dataframe__`.
@@ -494,6 +509,7 @@ def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None:
         # This currently has no effect; once support for nullable extension
         # dtypes is added, this value should be propagated to columns.
         self._nan_as_null = nan_as_null
+        self._allow_copy = allow_copy
 
     def num_columns(self) -> int:
         return len(self._df.columns)
@@ -508,13 +524,16 @@ def column_names(self) -> Iterable[str]:
         return self._df.columns.tolist()
 
     def get_column(self, i: int) -> _PandasColumn:
-        return _PandasColumn(self._df.iloc[:, i])
+        return _PandasColumn(
+            self._df.iloc[:, i], allow_copy=self._allow_copy)
 
     def get_column_by_name(self, name: str) -> _PandasColumn:
-        return _PandasColumn(self._df[name])
+        return _PandasColumn(
+            self._df[name], allow_copy=self._allow_copy)
 
     def get_columns(self) -> Iterable[_PandasColumn]:
-        return [_PandasColumn(self._df[name]) for name in self._df.columns]
+        return [_PandasColumn(self._df[name], allow_copy=self._allow_copy)
+                for name in self._df.columns]
 
     def select_columns(self, indices: Sequence[int]) -> '_PandasDataFrame':
         if not isinstance(indices, collections.Sequence):
@@ -552,13 +571,12 @@ def test_mixed_intfloat():
 
 
 def test_noncontiguous_columns():
-    # Currently raises: TBD whether it should work or not, see code comment
-    # where the RuntimeError is raised.
+    # Raises if ``allow_copy=False``, since a copy cannot be avoided here.
     arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
     df = pd.DataFrame(arr)
     assert df[0].to_numpy().strides == (24,)
-    pytest.raises(RuntimeError, from_dataframe, df)
-    #df2 = from_dataframe(df)
+    with pytest.raises(RuntimeError):
+        from_dataframe(df, allow_copy=False)
 
 
 def test_categorical_dtype():
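Finally, a short usage sketch of the patched implementation. This assumes the module is importable as `pandas_implementation` (so that `pd.DataFrame` has been monkeypatched on import); it is an illustration, not part of the patch:

```python
import numpy as np
import pandas as pd
import pytest

from pandas_implementation import from_dataframe

# Columns of a DataFrame built from a 2-D array are strided views.
df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))

# Default: copies are allowed, so the exchange succeeds via a copy.
df2 = from_dataframe(df)
print(df2)

# Zero-copy requested: the producer must refuse the non-contiguous buffers.
with pytest.raises(RuntimeError):
    from_dataframe(df, allow_copy=False)
```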