feat: remove DataFusion pyarrow feat (#1000) · chenkovsky/datafusion-python@8b51390 · GitHub
[go: up one dir, main page]

Skip to content

Commit 8b51390

Browse files
authored
feat: remove DataFusion pyarrow feat (apache#1000)
* Add developer instructions to speed up build processes * Remove pyarrow dep from datafusion. Add in PyScalarValue wrapper and rename DataFusionError to PyDataFusionError to be less confusing * Removed unnecessary cloning of scalar value when going from rust to python. Also removed the rust unit tests copied over from upstream repo that were failing due to apache#941 in pyo3 * Change return types to PyDataFusionError to simplify code * Update exception handling to fix build errors in recent rust toolchains
1 parent 78e72c9 commit 8b51390

27 files changed

+524
-348
lines changed

Cargo.lock

Lines changed: 87 additions & 58 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ tokio = { version = "1.42", features = ["macros", "rt", "rt-multi-thread", "sync
3838
pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] }
3939
pyo3-async-runtimes = { version = "0.22", features = ["tokio-runtime"]}
4040
arrow = { version = "53", features = ["pyarrow"] }
41-
datafusion = { version = "44.0.0", features = ["pyarrow", "avro", "unicode_expressions"] }
41+
datafusion = { version = "44.0.0", features = ["avro", "unicode_expressions"] }
4242
datafusion-substrait = { version = "44.0.0", optional = true }
4343
datafusion-proto = { version = "44.0.0" }
4444
datafusion-ffi = { version = "44.0.0" }

docs/source/contributor-guide/introduction.rst

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,3 +95,56 @@ To update dependencies, run
9595
.. code-block:: shell
9696
9797
uv sync --dev --no-install-package datafusion
98+
99+
Improving Build Speed
100+
---------------------
101+
102+
The `pyo3 <https://github.com/PyO3/pyo3>`_ dependency of this project contains a ``build.rs`` file which
103+
can cause it to rebuild frequently. You can prevent this from happening by defining a ``PYO3_CONFIG_FILE``
104+
environment variable that points to a file with your build configuration. Whenever your build configuration
105+
changes, such as during some major version updates, you will need to regenerate this file. This variable
106+
should point to a fully resolved path on your build machine.
107+
108+
To generate this file, use the following command:
109+
110+
.. code-block:: shell
111+
112+
PYO3_PRINT_CONFIG=1 cargo build
113+
114+
This will generate some output that looks like the following. You will want to copy these contents into
115+
a file. If you place this file in your project directory with filename ``.pyo3_build_config`` it will
116+
be ignored by ``git``.
117+
118+
.. code-block::
119+
120+
implementation=CPython
121+
version=3.8
122+
shared=true
123+
abi3=true
124+
lib_name=python3.12
125+
lib_dir=/opt/homebrew/opt/python@3.12/Frameworks/Python.framework/Versions/3.12/lib
126+
executable=/Users/myusername/src/datafusion-python/.venv/bin/python
127+
pointer_width=64
128+
build_flags=
129+
suppress_build_script_link_lines=false
130+
131+
Add the environment variable to your system.
132+
133+
.. code-block:: shell
134+
135+
export PYO3_CONFIG_FILE="/Users/myusername/src/datafusion-python/.pyo3_build_config"
136+
137+
If you are on a Mac and you use VS Code for your IDE, you will want to add these variables
138+
to your settings. You can find the appropriate rust flags by looking in the
139+
``.cargo/config.toml`` file.
140+
141+
.. code-block::
142+
143+
"rust-analyzer.cargo.extraEnv": {
144+
"RUSTFLAGS": "-C link-arg=-undefined -C link-arg=dynamic_lookup",
145+
"PYO3_CONFIG_FILE": "/Users/myusername/src/datafusion-python/.pyo3_build_config"
146+
},
147+
"rust-analyzer.runnables.extraEnv": {
148+
"RUSTFLAGS": "-C link-arg=-undefined -C link-arg=dynamic_lookup",
149+
"PYO3_CONFIG_FILE": "/Users/myusername/src/datafusion-python/.pyo3_build_config"
150+
}

python/tests/test_indexing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ def test_err(df):
4343
with pytest.raises(Exception) as e_info:
4444
df["c"]
4545

46-
assert "Schema error: No field named c." in e_info.value.args[0]
46+
for e in ["SchemaError", "FieldNotFound", 'name: "c"']:
47+
assert e in e_info.value.args[0]
4748

4849
with pytest.raises(Exception) as e_info:
4950
df[1]

src/catalog.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use std::sync::Arc;
2121
use pyo3::exceptions::PyKeyError;
2222
use pyo3::prelude::*;
2323

24-
use crate::errors::DataFusionError;
24+
use crate::errors::{PyDataFusionError, PyDataFusionResult};
2525
use crate::utils::wait_for_future;
2626
use datafusion::{
2727
arrow::pyarrow::ToPyArrow,
@@ -96,11 +96,13 @@ impl PyDatabase {
9696
self.database.table_names().into_iter().collect()
9797
}
9898

99-
fn table(&self, name: &str, py: Python) -> PyResult<PyTable> {
99+
fn table(&self, name: &str, py: Python) -> PyDataFusionResult<PyTable> {
100100
if let Some(table) = wait_for_future(py, self.database.table(name))? {
101101
Ok(PyTable::new(table))
102102
} else {
103-
Err(DataFusionError::Common(format!("Table not found: {name}")).into())
103+
Err(PyDataFusionError::Common(format!(
104+
"Table not found: {name}"
105+
)))
104106
}
105107
}
106108

src/common/data_type.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,20 @@ use pyo3::{exceptions::PyValueError, prelude::*};
2323

2424
use crate::errors::py_datafusion_err;
2525

26+
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)]
27+
pub struct PyScalarValue(pub ScalarValue);
28+
29+
impl From<ScalarValue> for PyScalarValue {
30+
fn from(value: ScalarValue) -> Self {
31+
Self(value)
32+
}
33+
}
34+
impl From<PyScalarValue> for ScalarValue {
35+
fn from(value: PyScalarValue) -> Self {
36+
value.0
37+
}
38+
}
39+
2640
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
2741
#[pyclass(eq, eq_int, name = "RexType", module = "datafusion.common")]
2842
pub enum RexType {

src/config.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ use pyo3::types::*;
2121
use datafusion::common::ScalarValue;
2222
use datafusion::config::ConfigOptions;
2323

24+
use crate::errors::PyDataFusionResult;
25+
2426
#[pyclass(name = "Config", module = "datafusion", subclass)]
2527
#[derive(Clone)]
2628
pub(crate) struct PyConfig {
@@ -38,7 +40,7 @@ impl PyConfig {
3840

3941
/// Get configurations from environment variables
4042
#[staticmethod]
41-
pub fn from_env() -> PyResult<Self> {
43+
pub fn from_env() -> PyDataFusionResult<Self> {
4244
Ok(Self {
4345
config: ConfigOptions::from_env()?,
4446
})
@@ -56,11 +58,10 @@ impl PyConfig {
5658
}
5759

5860
/// Set a configuration option
59-
pub fn set(&mut self, key: &str, value: PyObject, py: Python) -> PyResult<()> {
61+
pub fn set(&mut self, key: &str, value: PyObject, py: Python) -> PyDataFusionResult<()> {
6062
let scalar_value = py_obj_to_scalar_value(py, value);
61-
self.config
62-
.set(key, scalar_value.to_string().as_str())
63-
.map_err(|e| e.into())
63+
self.config.set(key, scalar_value.to_string().as_str())?;
64+
Ok(())
6465
}
6566

6667
/// Get all configuration options

0 commit comments

Comments
 (0)
0