diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 00cf5b12..41b2e6e4 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -83,14 +83,14 @@ class Buffer: @property def bufsize(self) -> int: """ - Buffer size in bytes + Buffer size in bytes. """ pass @property def ptr(self) -> int: """ - Pointer to start of the buffer as an integer + Pointer to start of the buffer as an integer. """ pass @@ -133,9 +133,10 @@ class Column: A column object, with only the methods and properties required by the interchange protocol defined. - A column can contain one or more chunks. Each chunk can contain either one - or two buffers - one data buffer and (depending on null representation) it - may have a mask buffer. + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). TBD: Arrow has a separate "null" dtype, and has no separate mask concept. Instead, it seems to use "children" for both columns with a bit mask, @@ -185,7 +186,7 @@ def size(self) -> Optional[int]: @property def offset(self) -> int: """ - Offset of first element + Offset of first element. May be > 0 if using chunks; for example for a column with N chunks of equal size M (only the last chunk may be shorter), @@ -196,7 +197,7 @@ def offset(self) -> int: @property def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: """ - Dtype description as a tuple ``(kind, bit-width, format string, endianness)`` + Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. Kind : @@ -272,7 +273,9 @@ def describe_null(self) -> Tuple[int, Any]: - 3 : bit mask - 4 : byte mask - Value : if kind is "sentinel value", the actual value. None otherwise. + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. None + otherwise. """ pass @@ -299,24 +302,33 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[Column]: """ pass - def get_data_buffer(self) -> Buffer: + def get_buffers(self) -> dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], Optional[Tuple[Buffer, Any]]]: """ - Return the buffer containing the data. - """ - pass + Return a dictionary containing the underlying buffers. - def get_mask(self) -> Buffer: - """ - Return the buffer containing the mask values indicating missing data. + The returned dictionary has the following contents: - Raises RuntimeError if null representation is not a bit or byte mask. + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. 
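+
+        As an illustration only (the names below are placeholders, not part of
+        the protocol), the returned dictionary has the shape:
+
+            {
+                "data": (data_buffer, data_dtype),
+                "validity": (validity_buffer, validity_dtype),   # or None
+                "offsets": (offsets_buffer, offsets_dtype),      # or None
+            }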
""" pass # def get_children(self) -> Iterable[Column]: # """ # Children columns underneath the column, each object in this iterator -# must adhere to the column specification +# must adhere to the column specification. # """ # pass @@ -337,7 +349,7 @@ class DataFrame: """ def __dataframe__(self, nan_as_null : bool = False) -> dict: """ - Produces a dictionary object following the dataframe protocol spec + Produces a dictionary object following the dataframe protocol specification. ``nan_as_null`` is a keyword intended for the consumer to tell the producer to overwrite null values in the data with ``NaN`` (or ``NaT``). @@ -352,7 +364,7 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict: def num_columns(self) -> int: """ - Return the number of columns in the DataFrame + Return the number of columns in the DataFrame. """ pass @@ -361,13 +373,13 @@ def num_rows(self) -> Optional[int]: # why include it if it may be None - what do we expect consumers # to do here? """ - Return the number of rows in the DataFrame, if available + Return the number of rows in the DataFrame, if available. """ pass def num_chunks(self) -> int: """ - Return the number of chunks the DataFrame consists of + Return the number of chunks the DataFrame consists of. """ pass @@ -397,7 +409,7 @@ def get_columns(self) -> Iterable[Column]: def select_columns(self, indices: Sequence[int]) -> DataFrame: """ - Create a new DataFrame by selecting a subset of columns by index + Create a new DataFrame by selecting a subset of columns by index. """ pass diff --git a/protocol/dataframe_protocol_summary.md b/protocol/dataframe_protocol_summary.md index 91583bc4..aa33b9a3 100644 --- a/protocol/dataframe_protocol_summary.md +++ b/protocol/dataframe_protocol_summary.md @@ -1,8 +1,9 @@ -# `__dataframe__` protocol - summary +# The `__dataframe__` protocol + +This document aims to describe the scope of the dataframe interchange protocol, +as well as its essential design requirements/principles and the functionality +it needs to support. -_We've had a lot of discussion in a couple of GitHub issues and in meetings. -This description attempts to summarize that, and extract the essential design -requirements/principles and functionality it needs to support._ ## Purpose of `__dataframe__` @@ -11,7 +12,8 @@ a way to convert one type of dataframe into another type (for example, convert a Koalas dataframe into a Pandas dataframe, or a cuDF dataframe into a Vaex dataframe). -Currently (Nov'20) there is no way to do this in an implementation-independent way. +Currently (June 2020) there is no way to do this in an +implementation-independent way. The main use case this protocol intends to enable is to make it possible to write code that can accept any type of dataframe instead of being tied to a @@ -30,7 +32,7 @@ def somefunc(df, ...): ### Non-goals -Providing a _complete standardized dataframe API_ is not a goal of the +Providing a _complete, standardized dataframe API_ is not a goal of the `__dataframe__` protocol. Instead, this is a goal of the full dataframe API standard, which the Consortium for Python Data API Standards aims to provide in the future. 
When that full API standard is implemented by dataframe @@ -40,8 +42,8 @@ libraries, the example above can change to: def get_df_module(df): """Utility function to support programming against a dataframe API""" if hasattr(df, '__dataframe_namespace__'): - # Retrieve the namespace - pdx = df.__dataframe_namespace__() + # Retrieve the namespace + pdx = df.__dataframe_namespace__() else: # Here we can raise an exception if we only want to support compliant dataframes, # or convert to our default choice of dataframe if we want to accept (e.g.) dicts @@ -57,6 +59,7 @@ def somefunc(df, ...): # From now on, use `df` methods and `pdx` functions/objects ``` + ### Constraints An important constraint on the `__dataframe__` protocol is that it should not @@ -94,13 +97,14 @@ For a protocol to exchange dataframes between libraries, we need both a model of what we mean by "dataframe" conceptually for the purposes of the protocol, and a model of how the data is represented in memory: -![Image of a dataframe model, containing chunks, columns and 1-D arrays](conceptual_model_df_memory.png) +![Conceptual model of a dataframe, containing chunks, columns and 1-D arrays](images/dataframe_conceptual_model.png) -The smallest building block are **1-D arrays**, which are contiguous in -memory and contain data with the same dtype. A **column** consists of one or -more 1-D arrays (if, e.g., missing data is represented with a boolean mask, -that's a separate array). A **chunk** contains a set of columns of uniform -length. A **dataframe** contains one or more chunks. +The smallest building blocks are **1-D arrays** (or "buffers"), which are +contiguous in memory and contain data with the same dtype. A **column** +consists of one or more 1-D arrays (if, e.g., missing data is represented with +a boolean mask, that's a separate array). A **dataframe** contains one or more columns. +A column or a dataframe can be "chunked"; a **chunk** is a subset of a column +or dataframe that contains a set of (neighboring) rows. ## Protocol design requirements @@ -121,7 +125,7 @@ length. A **dataframe** contains one or more chunks. 6. Must avoid device transfers by default (e.g. copy data from GPU to CPU), and provide an explicit way to force such transfers (e.g. a `force=` or `copy=` keyword that the caller can set to `True`). -7. Must be zero-copy if possible. +7. Must be zero-copy wherever possible. 8. Must support missing values (`NA`) for all supported dtypes. 9. Must supports string, categorical and datetime dtypes. 10. Must allow the consumer to inspect the representation for missing values @@ -141,7 +145,7 @@ length. A **dataframe** contains one or more chunks. _Rationale: prescribing a single in-memory representation in this protocol would lead to unnecessary copies being made if that represention isn't the native one a library uses._ - _Note: the memory layout is columnnar. Row-major dataframes can use this + _Note: the memory layout is columnar. Row-major dataframes can use this protocol, but not in a zero-copy fashion (see requirement 2 above)._ 12. Must support chunking, i.e. accessing the data in "batches" of rows. There must be metadata the consumer can access to learn in how many @@ -168,7 +172,13 @@ We'll also list some things that were discussed but are not requirements: 3. Extension dtypes, i.e. a way to extend the set of dtypes that is explicitly support, are out of scope. _Rationale: complex to support, not used enough to justify that complexity._ -4. "virtual columns", i.e. 
columns for which the data is not yet in memory +4. Support for strided storage in buffers. + _Rationale: this is supported by a subset of dataframes only, mainly those + that use NumPy arrays. In many real-world use cases, strided arrays will + force a copy at some point, so requiring contiguous memory layout (and hence + an extra copy at the moment `__dataframe__` is used) is considered a good + trade-off for reduced implementation complexity._ +5. "virtual columns", i.e. columns for which the data is not yet in memory because it uses lazy evaluation, are not supported other than through letting the producer materialize the data in memory when the consumer calls `__dataframe__`. @@ -176,6 +186,7 @@ We'll also list some things that were discussed but are not requirements: "programming to an interface"; this data interchange protocol is fundamentally built around describing data in memory_. + ### To be decided _The connection between dataframe and array interchange protocols_. If we @@ -194,7 +205,7 @@ _Should there be a standard `from_dataframe` constructor function?_ This isn't completely necessary, however it's expected that a full dataframe API standard will have such a function. The array API standard also has such a function, namely `from_dlpack`. Adding at least a recommendation on syntax -for this function would make sense, e.g., `from_dataframe(df, stream=None)`. +for this function makes sense, e.g., simply `from_dataframe(df)`. Discussion at https://github.com/data-apis/dataframe-api/issues/29#issuecomment-685903651 is relevant. @@ -209,14 +220,16 @@ except `__dataframe__` is a Python-level rather than C-level interface. The data types format specification of that interface is something that could be used unchanged. -The main (only?) limitation seems to be that it does not have device support -- @kkraus14 will bring this up on the Arrow dev mailing list. Also note that -that interface only talks about arrays; dataframes, chunking and the metadata -inspection can all be layered on top in this Python-level protocol, but are -not discussed in the interface itself. +The main limitation is that it does not have device support +-- `@kkraus14` will bring this up on the Arrow dev mailing list. Another +identified issue is that the "deleter" on the Arrow C struct is present at the +column level, and there are use cases for having it at the buffer level +(mixed-device dataframes, more granular control over memory). Note that categoricals are supported, Arrow uses the phrasing -"dictionary-encoded types" for categorical. +"dictionary-encoded types" for categorical. Also, what it calls "array" means +"column" in the terminology of this document (and every Python dataframe +library). The Arrow C Data Interface says specifically it was inspired by [Python's buffer protocol](https://docs.python.org/3/c-api/buffer.html), which is also @@ -245,7 +258,7 @@ library that implements `__array__` must depend (optionally at least) on NumPy, and call a NumPy `ndarray` constructor itself from within `__array__`. -### What is wrong with `.to_numpy?` and `.to_arrow()`? +### What is wrong with `.to_numpy?` and `.to_arrow()`? Such methods ask the object it is attached to to turn itself into a NumPy or Arrow array. Which means each library must have at least an optional @@ -261,7 +274,7 @@ constructor it needs. For example, `x = np.asarray(df['colname'])` (where ### Does an interface describing memory work for virtual columns? 
-Vaex is an example of a library that can have "virtual columns" (see @maartenbreddels +Vaex is an example of a library that can have "virtual columns" (see `@maartenbreddels` [comment here](https://github.com/data-apis/dataframe-api/issues/29#issuecomment-686373569)). If the protocol includes a description of data layout in memory, does that work for such a virtual column? @@ -285,17 +298,15 @@ computational graph approach like Dask uses, etc.)._ ## Possible direction for implementation -### Rough prototypes +### Rough initial prototypes (historical) The `cuDFDataFrame`, `cuDFColumn` and `cuDFBuffer` sketched out by @kkraus14 [here](https://github.com/data-apis/dataframe-api/issues/29#issuecomment-685123386) -seems to be in the right direction. +looked like it was in the right direction. [This prototype](https://github.com/wesm/dataframe-protocol/pull/1) by Wes McKinney was the first attempt, and has some useful features. -TODO: work this out after making sure we're all on the same page regarding requirements. - ### Relevant existing protocols diff --git a/protocol/images/dataframe_conceptual_model.png b/protocol/images/dataframe_conceptual_model.png new file mode 100644 index 00000000..3643b389 Binary files /dev/null and b/protocol/images/dataframe_conceptual_model.png differ diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index e3e3e62e..0eea3aae 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -25,7 +25,7 @@ import pandas as pd import numpy as np -import pandas._testing as tm +import pandas.testing as tm import pytest @@ -70,6 +70,8 @@ def _from_dataframe(df : DataFrameObject) -> pd.DataFrame: columns[name] = convert_column_to_ndarray(col) elif col.dtype[0] == _k.CATEGORICAL: columns[name] = convert_categorical_column(col) + elif col.dtype[0] == _k.STRING: + columns[name] = convert_string_column(col) else: raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") @@ -88,7 +90,7 @@ class _DtypeKind(enum.IntEnum): def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: """ - Convert an int, uint, float or bool column to a numpy array + Convert an int, uint, float or bool column to a numpy array. """ if col.offset != 0: raise NotImplementedError("column.offset > 0 not handled yet") @@ -97,7 +99,7 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: raise NotImplementedError("Null values represented as masks or " "sentinel values not handled yet") - _buffer, _dtype = col.get_data_buffer() + _buffer, _dtype = col.get_buffers()["data"] return buffer_to_ndarray(_buffer, _dtype) @@ -131,7 +133,7 @@ def buffer_to_ndarray(_buffer, _dtype) -> np.ndarray: def convert_categorical_column(col : ColumnObject) -> pd.Series: """ - Convert a categorical column to a Series instance + Convert a categorical column to a Series instance. 
""" ordered, is_dict, mapping = col.describe_categorical if not is_dict: @@ -141,7 +143,7 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series: # categories = col._col.values.categories.values # codes = col._col.values.codes categories = np.asarray(list(mapping.values())) - codes_buffer, codes_dtype = col.get_data_buffer() + codes_buffer, codes_dtype = col.get_buffers()["data"] codes = buffer_to_ndarray(codes_buffer, codes_dtype) values = categories[codes] @@ -160,11 +162,61 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series: return series +def convert_string_column(col : ColumnObject) -> np.ndarray: + """ + Convert a string column to a NumPy array. + """ + # Retrieve the data buffers + buffers = col.get_buffers() + + # Retrieve the data buffer containing the UTF-8 code units + dbuffer, bdtype = buffers["data"] + + # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string + obuffer, odtype = buffers["offsets"] + + # Retrieve the mask buffer indicating the presence of missing values + mbuffer, mdtype = buffers["validity"] + + # Retrieve the missing value encoding + null_value = col.describe_null[1] + + # Convert the buffers to NumPy arrays + dt = (_DtypeKind.UINT, 8, None, None) # note: in order to go from STRING to an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) + dbuf = buffer_to_ndarray(dbuffer, dt) + + obuf = buffer_to_ndarray(obuffer, odtype) + mbuf = buffer_to_ndarray(mbuffer, mdtype) + + # Assemble the strings from the code units + str_list = [] + for i in range(obuf.size-1): + # Check for missing values + if mbuf[i] == null_value: # FIXME: we need to account for a mask buffer which is a bit array + str_list.append(np.nan) + continue + + # Extract a range of code units + units = dbuf[obuf[i]:obuf[i+1]]; + + # Convert the list of code units to bytes + b = bytes(units) + + # Create the string + s = b.decode(encoding="utf-8") + + # Add to our list of strings + str_list.append(s) + + # Convert the string list to a NumPy array + return np.asarray(str_list, dtype="object") + + def __dataframe__(cls, nan_as_null : bool = False) -> dict: """ - The public method to attach to pd.DataFrame + The public method to attach to pd.DataFrame. - We'll attach it via monkeypatching here for demo purposes. If Pandas adopt + We'll attach it via monkey-patching here for demo purposes. If Pandas adopts the protocol, this will be a regular method on pandas.DataFrame. ``nan_as_null`` is a keyword intended for the consumer to tell the @@ -205,20 +257,20 @@ def __init__(self, x : np.ndarray) -> None: @property def bufsize(self) -> int: """ - Buffer size in bytes + Buffer size in bytes. """ return self._x.size * self._x.dtype.itemsize @property def ptr(self) -> int: """ - Pointer to start of the buffer as an integer + Pointer to start of the buffer as an integer. """ return self._x.__array_interface__['data'][0] def __dlpack__(self): """ - DLPack not implemented in NumPy yet, so leave it out here + DLPack not implemented in NumPy yet, so leave it out here. """ raise NotImplementedError("__dlpack__") @@ -242,9 +294,10 @@ class _PandasColumn: A column object, with only the methods and properties required by the interchange protocol defined. - A column can contain one or more chunks. Each chunk can contain either one - or two buffers - one data buffer and (depending on null representation) it - may have a mask buffer. + A column can contain one or more chunks. 
Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). Note: this Column object can only be produced by ``__dataframe__``, so doesn't need its own version or ``__column__`` protocol. @@ -318,19 +371,24 @@ def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: and nested (list, struct, map, union) dtypes. """ dtype = self._col.dtype + + # For now, assume that, if the column dtype is 'O' (i.e., `object`), then we have an array of strings + if not isinstance(dtype, pd.CategoricalDtype) and dtype.kind == 'O': + return (_DtypeKind.STRING, 8, 'u', '=') + return self._dtype_from_pandasdtype(dtype) def _dtype_from_pandasdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: """ - See `self.dtype` for details + See `self.dtype` for details. """ # Note: 'c' (complex) not handled yet (not in array spec v1). # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled # datetime and timedelta both map to datetime (is timedelta handled?) _k = _DtypeKind - _np_kinds = {'i': _k.INT, 'u': _k.UINT, 'f': _k.FLOAT, 'b': _k.BOOL, - 'U': _k.STRING, - 'M': _k.DATETIME, 'm': _k.DATETIME} + _np_kinds = {"i": _k.INT, "u": _k.UINT, "f": _k.FLOAT, "b": _k.BOOL, + "U": _k.STRING, + "M": _k.DATETIME, "m": _k.DATETIME} kind = _np_kinds.get(dtype.kind, None) if kind is None: # Not a NumPy dtype. Check if it's a categorical maybe @@ -340,7 +398,7 @@ def _dtype_from_pandasdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: raise ValueError(f"Data type {dtype} not supported by exchange" "protocol") - if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL): + if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL, _k.STRING): raise NotImplementedError(f"Data type {dtype} not handled yet") bitwidth = dtype.itemsize * 8 @@ -396,7 +454,9 @@ def describe_null(self) -> Tuple[int, Any]: - 3 : bit mask - 4 : byte mask - Value : if kind is "sentinel value", the actual value. None otherwise. + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. None + otherwise. """ _k = _DtypeKind kind = self.dtype[0] @@ -407,15 +467,18 @@ def describe_null(self) -> Tuple[int, Any]: null = 1 # np.datetime64('NaT') elif kind in (_k.INT, _k.UINT, _k.BOOL): # TODO: check if extension dtypes are used once support for them is - # implemented in this procotol code + # implemented in this protocol code null = 0 # integer and boolean dtypes are non-nullable elif kind == _k.CATEGORICAL: # Null values for categoricals are stored as `-1` sentinel values # in the category date (e.g., `col.values.codes` is int8 np.ndarray) null = 2 value = -1 + elif kind == _k.STRING: + null = 4 + value = 0 # follow Arrow in using 1 as valid value and 0 for missing/null value else: - raise NotImplementedError(f'Data type {self.dtype} not yet supported') + raise NotImplementedError(f"Data type {self.dtype} not yet supported") return null, value @@ -440,9 +503,44 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasColumn """ return (self,) - def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype tuple + def get_buffers(self) -> Dict[str, Any]: """ - Return the buffer containing the data. + Return a dictionary containing the underlying buffers. 
+ + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + buffers = {} + buffers["data"] = self._get_data_buffer() + try: + buffers["validity"] = self._get_validity_buffer() + except: + buffers["validity"] = None + + try: + buffers["offsets"] = self._get_offsets_buffer() + except: + buffers["offsets"] = None + + return buffers + + def _get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype tuple + """ + Return the buffer containing the data and the buffer's associated dtype. """ _k = _DtypeKind if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): @@ -452,27 +550,107 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype codes = self._col.values.codes buffer = _PandasBuffer(codes) dtype = self._dtype_from_pandasdtype(codes.dtype) + elif self.dtype[0] == _k.STRING: + # Marshal the strings from a NumPy object array into a byte array + buf = self._col.to_numpy() + b = bytearray() + + # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later + for i in range(buf.size): + if type(buf[i]) == str: + b.extend(buf[i].encode(encoding="utf-8")) + + # Convert the byte array to a Pandas "buffer" using a NumPy array as the backing store + buffer = _PandasBuffer(np.frombuffer(b, dtype="uint8")) + + # Define the dtype for the returned buffer + dtype = (_k.STRING, 8, "u", "=") # note: currently only support native endianness else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") return buffer, dtype - def get_mask(self) -> _PandasBuffer: + def _get_validity_buffer(self) -> Tuple[_PandasBuffer, Any]: """ - Return the buffer containing the mask values indicating missing data. + Return the buffer containing the mask values indicating missing data and + the buffer's associated dtype. Raises RuntimeError if null representation is not a bit or byte mask. 
""" - null, value = self.describe_null + null, invalid = self.describe_null + + _k = _DtypeKind + if self.dtype[0] == _k.STRING: + # For now, have the mask array be comprised of bytes, rather than a bit array + buf = self._col.to_numpy() + mask = [] + + # Determine the encoding for valid values + if invalid == 0: + valid = 1 + else: + valid = 0 + + for i in range(buf.size): + if type(buf[i]) == str: + v = valid; + else: + v = invalid; + + mask.append(v) + + # Convert the mask array to a Pandas "buffer" using a NumPy array as the backing store + buffer = _PandasBuffer(np.asarray(mask, dtype="uint8")) + + # Define the dtype of the returned buffer + dtype = (_k.UINT, 8, "C", "=") + + return buffer, dtype + if null == 0: msg = "This column is non-nullable so does not have a mask" elif null == 1: msg = "This column uses NaN as null so does not have a separate mask" else: - raise NotImplementedError('See self.describe_null') + raise NotImplementedError("See self.describe_null") raise RuntimeError(msg) + def _get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]: + """ + Return the buffer containing the offset values for variable-size binary + data (e.g., variable-length strings) and the buffer's associated dtype. + + Raises RuntimeError if the data buffer does not have an associated + offsets buffer. + """ + _k = _DtypeKind + if self.dtype[0] == _k.STRING: + # For each string, we need to manually determine the next offset + values = self._col.to_numpy() + ptr = 0 + offsets = [ptr] + for v in values: + # For missing values (in this case, `np.nan` values), we don't increment the pointer) + if type(v) == str: + b = v.encode(encoding="utf-8") + ptr += len(b) + + offsets.append(ptr) + + # Convert the list of offsets to a NumPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter) + buf = np.asarray(offsets, dtype="int64") + + # Convert the offsets to a Pandas "buffer" using the NumPy array as the backing store + buffer = _PandasBuffer(buf) + + # Assemble the buffer dtype info + dtype = (_k.INT, 64, 'l', "=") # note: currently only support native endianness + else: + raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer") + + return buffer, dtype + class _PandasDataFrame: """ @@ -578,9 +756,26 @@ def test_categorical_dtype(): tm.assert_frame_equal(df, df2) +def test_string_dtype(): + df = pd.DataFrame({"A": ["a", "b", "cdef", "", "g"]}) + df["B"] = df["A"].astype("object") + df.at[1, "B"] = np.nan # Set one item to null + + # Test for correctness and null handling: + col = df.__dataframe__().get_column_by_name("B") + assert col.dtype[0] == _DtypeKind.STRING + assert col.null_count == 1 + assert col.describe_null == (4, 0) + assert col.num_chunks() == 1 + + df2 = from_dataframe(df) + tm.assert_frame_equal(df, df2) + + if __name__ == '__main__': test_categorical_dtype() test_float_only() test_mixed_intfloat() test_noncontiguous_columns() + test_string_dtype()