Add variable-length string support by kgryte · Pull Request #45 · data-apis/dataframe-api · GitHub
[go: up one dir, main page]

Skip to content

Add variable-length string support #45

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Changes from 1 commit
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
5a8d030
Update protocol to support returning an offsets buffer
kgryte Jun 23, 2021
3aac477
Add punctuation
kgryte Jun 23, 2021
c7728e2
Update protocol to return dtype along with buffer
kgryte Jun 24, 2021
c8000f7
Add string support in various methods and add todos
kgryte Jun 24, 2021
040f928
Add support for resolving an offsets buffer
kgryte Jun 24, 2021
a982987
Add support for returning a data buffer for string dtypes
kgryte Jun 24, 2021
fd4d71b
Update offsets buffer accessor
kgryte Jun 24, 2021
e40f902
Add implementation to convert a string column
kgryte Jun 24, 2021
c122b3c
Add tests
kgryte Jun 24, 2021
0d04af3
Handle missing values
kgryte Jun 24, 2021
58fee89
Update typing and docs
kgryte Jun 24, 2021
2c4a846
Add comment
kgryte Jun 24, 2021
2e3914f
Requirements document for the dataframe interchange protocol (#35)
rgommers Jun 25, 2021
f9f259c
Remove outdated figures
rgommers Jun 25, 2021
a545faa
Document that strided buffers do not need to be supported
rgommers Jun 25, 2021
52abf7a
Merge pull request #38 from data-apis/protocol-impl
rgommers Jun 25, 2021
6010ae7
Add todo
kgryte Jun 28, 2021
ac1a5ca
Merge branch 'main' of https://github.com/data-apis/dataframe-api int…
kgryte Jun 28, 2021
89a7996
Remove colons
kgryte Jun 28, 2021
a3ff4e7
Fix grammar
kgryte Jul 8, 2021
ff84e8c
Rename methods
kgryte Jul 8, 2021
c954f3c
Rename methods
kgryte Jul 8, 2021
ed64fb7
Update describe_null to indicate a byte array for string dtype
kgryte Jul 8, 2021
9b9aecf
Return encoding for missing values
kgryte Jul 19, 2021
4026900
Update test
kgryte Jul 19, 2021
87d7143
Use invalid value encoding
kgryte Jul 19, 2021
56ee2da
Update copy
kgryte Jul 19, 2021
0035c90
Use Arrow format strings
kgryte Jul 19, 2021
91ed6a1
Add `get_buffers` method to the protocol
kgryte Jul 19, 2021
26fb48d
Remove individual methods
kgryte Jul 19, 2021
0d0e94b
Update copy
kgryte Jul 19, 2021
9ec830c
Refactor to return a dictionary of buffers
kgryte Jul 19, 2021
0dd4e2c
Update comments
kgryte Jul 19, 2021
ade0d76
Fix copy
kgryte Jul 19, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Refactor to return a dictionary of buffers
  • Loading branch information
kgryte committed Jul 19, 2021
commit 9ec830cc495199c438a832e16ade227ded6ed75b
54 changes: 46 additions & 8 deletions protocol/pandas_implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray:
raise NotImplementedError("Null values represented as masks or "
"sentinel values not handled yet")

_buffer, _dtype = col.get_data_buffer()
_buffer, _dtype = col.get_buffers()["data"]
return buffer_to_ndarray(_buffer, _dtype)


Expand Down Expand Up @@ -143,7 +143,7 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
# categories = col._col.values.categories.values
# codes = col._col.values.codes
categories = np.asarray(list(mapping.values()))
codes_buffer, codes_dtype = col.get_data_buffer()
codes_buffer, codes_dtype = col.get_buffers()["data"]
codes = buffer_to_ndarray(codes_buffer, codes_dtype)
values = categories[codes]

Expand All @@ -166,14 +166,17 @@ def convert_string_column(col : ColumnObject) -> np.ndarray:
"""
Convert a string column to a NumPy array.
"""
# Retrieve the data buffers:
buffers = col.get_buffers()

# Retrieve the data buffer containing the UTF-8 code units
dbuffer, bdtype = col.get_data_buffer()
dbuffer, bdtype = buffers["data"]

# Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string
obuffer, odtype = col.get_offsets_buffer()
obuffer, odtype = buffers["offsets"]

# Retrieve the mask buffer indicating the presence of missing values:
mbuffer, mdtype = col.get_validity_buffer()
mbuffer, mdtype = buffers["validity"]

# Retrieve the missing value encoding:
null_value = col.describe_null[1]
Expand Down Expand Up @@ -500,7 +503,42 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasColumn
"""
return (self,)

def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype tuple
def get_buffers(self) -> Dict[str, Any]:
    """
    Return a dictionary containing the underlying buffers.

    The returned dictionary has the following contents:

        - "data": a two-element tuple whose first element is the buffer
                  containing the data and whose second element is the data
                  buffer's associated dtype.
        - "validity": a two-element tuple whose first element is the buffer
                      containing mask values indicating missing data and
                      whose second element is the mask value buffer's
                      associated dtype. None if the null representation is
                      not a bit or byte mask.
        - "offsets": a two-element tuple whose first element is the buffer
                     containing the offset values for variable-size binary
                     data (e.g., variable-length strings) and whose second
                     element is the offsets buffer's associated dtype. None
                     if the data buffer does not have an associated offsets
                     buffer.

    Errors raised while resolving the data buffer propagate to the caller.
    An ordinary exception raised while resolving the validity or offsets
    buffer is treated as "this column has no such buffer" and is reported
    as None for that key.
    """
    buffers = {}
    buffers["data"] = self._get_data_buffer()

    # Catch ``Exception`` rather than using a bare ``except`` so that
    # KeyboardInterrupt, SystemExit and other BaseException subclasses are
    # never silently converted into a missing buffer.
    try:
        buffers["validity"] = self._get_validity_buffer()
    except Exception:
        buffers["validity"] = None

    try:
        buffers["offsets"] = self._get_offsets_buffer()
    except Exception:
        buffers["offsets"] = None

    return buffers

def _get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype tuple
"""
Return the buffer containing the data and the buffer's associated dtype.
"""
Expand Down Expand Up @@ -532,7 +570,7 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype

return buffer, dtype

def get_validity_buffer(self) -> Tuple[_PandasBuffer, Any]:
def _get_validity_buffer(self) -> Tuple[_PandasBuffer, Any]:
"""
Return the buffer containing the mask values indicating missing data and
the buffer's associated dtype.
Expand Down Expand Up @@ -578,7 +616,7 @@ def get_validity_buffer(self) -> Tuple[_PandasBuffer, Any]:

raise RuntimeError(msg)

def get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]:
def _get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]:
"""
Return the buffer containing the offset values for variable-size binary
data (e.g., variable-length strings) and the buffer's associated dtype.
Expand Down
0