Implement PyArrow Dataset TableProvider by kdbrooks · Pull Request #9 · apache/datafusion-python
Merged · 6 commits · Jul 26, 2022
Changes from 1 commit
Improved DatasetExec physical plan printing.
Added nested filter test.

kdbrooks committed Jul 25, 2022
commit 34e3df07cd3d5f76a2633a94305f1bb1865d053d
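
For context, here is how the improved plan printing surfaces from Python. This is a minimal sketch, not code from the PR: it assumes `SessionContext` is the public entry point (the tests below use a `ctx` fixture), and the plan line in the comment is inferred from the format string introduced in this commit, not copied from actual output.

```python
import pyarrow as pa
import pyarrow.dataset as ds
from datafusion import SessionContext  # assumed entry point; the tests use a `ctx` fixture

ctx = SessionContext()
batch = pa.RecordBatch.from_arrays(
    [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
    names=["a", "b"],
)
ctx.register_dataset("t", ds.dataset([batch]))

df = ctx.sql("SELECT a + b FROM t WHERE b > 5")
# With this commit, explain() should print a DatasetExec line such as:
#   DatasetExec: number_of_fragments=1, filter_expr=(b > 5), projection=[a, b]
# instead of the old files=... form.
df.explain()
```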
41 changes: 39 additions & 2 deletions datafusion/tests/test_context.py
@@ -18,6 +18,8 @@
 import pyarrow as pa
 import pyarrow.dataset as ds
 
+from datafusion import column, literal
+
 
 def test_register_record_batches(ctx):
     # create a RecordBatch and register it as memtable
@@ -90,7 +92,7 @@ def test_register_dataset(ctx):
     assert result[0].column(0) == pa.array([5, 7, 9])
     assert result[0].column(1) == pa.array([-3, -3, -3])
 
-def test_dataset_filter(ctx):
+def test_dataset_filter(ctx, capfd):
     # create a RecordBatch and register it as a pyarrow.dataset.Dataset
     batch = pa.RecordBatch.from_arrays(
         [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
@@ -100,8 +102,43 @@ def test_dataset_filter(ctx):
     ctx.register_dataset("t", dataset)
 
     assert ctx.tables() == {"t"}
+    df = ctx.sql("SELECT a+b, a-b FROM t WHERE a BETWEEN 2 and 3 AND b > 5")
 
+    # Make sure the filter was pushed down in Physical Plan
+    df.explain()
+    captured = capfd.readouterr()
+    assert "filter_expr=(((2 <= a) and (a <= 3)) and (b > 5))" in captured.out
+
+    result = df.collect()
+
+    assert result[0].column(0) == pa.array([9])
+    assert result[0].column(1) == pa.array([-3])
+
+
+def test_dataset_filter_nested_data(ctx):
+    # create Arrow StructArrays to test nested data types
+    data = pa.StructArray.from_arrays(
+        [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
+        names=["a", "b"],
+    )
+    batch = pa.RecordBatch.from_arrays(
+        [data],
+        names=["nested_data"],
+    )
+    dataset = ds.dataset([batch])
+    ctx.register_dataset("t", dataset)
+
+    assert ctx.tables() == {"t"}
+
+    df = ctx.table("t")
+
+    # This filter will not be pushed down to DatasetExec since it isn't supported
+    df = df.select(
+        column("nested_data")["a"] + column("nested_data")["b"],
+        column("nested_data")["a"] - column("nested_data")["b"],
+    ).filter(column("nested_data")["b"] > literal(5))
+
-    result = ctx.sql("SELECT a+b, a-b FROM t WHERE a BETWEEN 2 and 3 AND b > 5").collect()
+    result = df.collect()
 
     assert result[0].column(0) == pa.array([9])
     assert result[0].column(1) == pa.array([-3])
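
The pushed-down predicate asserted above is a PyArrow compute expression. The sketch below builds the equivalent expression directly with the `pyarrow.dataset` API; the exact `filter_expr` string in the plan is whatever PyArrow renders for the expression DatasetExec constructs, so operand order may differ from this hand-written form.

```python
import pyarrow.dataset as ds

# Equivalent of "a BETWEEN 2 AND 3 AND b > 5" as a dataset expression;
# DatasetExec hands an expression of this shape to the PyArrow scanner.
filter_expr = ((ds.field("a") >= 2) & (ds.field("a") <= 3)) & (ds.field("b") > 5)
table = dataset.to_table(filter=filter_expr)  # `dataset` as registered in the test
```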
34 changes: 24 additions & 10 deletions src/dataset_exec.rs
@@ -233,18 +233,32 @@ impl ExecutionPlan for DatasetExec
 
     fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result {
         Python::with_gil(|py| {
-            let fragments = self.fragments.as_ref(py);
-            let files: Result<Vec<String>, PyErr> = fragments
-                .iter()
-                .map(|fragment| -> Result<String, PyErr> { fragment.extract() })
-                .collect();
+            let number_of_fragments = self.fragments.as_ref(py).len();
             match t {
                 DisplayFormatType::Default => {
-                    write!(
-                        f,
-                        "DatasetExec: files={:?}, projection={:?}",
-                        files, self.columns,
-                    )
+                    let projected_columns: Vec<String> = self
+                        .schema
+                        .fields()
+                        .iter()
+                        .map(|x| x.name().to_owned())
+                        .collect();
+                    if let Some(filter_expr) = &self.filter_expr {
+                        let filter_expr = filter_expr.as_ref(py).str().or(Err(std::fmt::Error))?;
+                        write!(
+                            f,
+                            "DatasetExec: number_of_fragments={}, filter_expr={}, projection=[{}]",
+                            number_of_fragments,
+                            filter_expr,
+                            projected_columns.join(", "),
+                        )
+                    } else {
+                        write!(
+                            f,
+                            "DatasetExec: number_of_fragments={}, projection=[{}]",
+                            number_of_fragments,
+                            projected_columns.join(", "),
+                        )
+                    }
                 }
             }
         })
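
With the two branches above, the operator line takes one of two shapes. A hypothetical rendering for the tests in this commit: the filter text is whatever PyArrow's expression `str()` returns (the first line's expression matches the test assertion), and the projections are inferred from the queries rather than taken from actual output.

```
DatasetExec: number_of_fragments=1, filter_expr=(((2 <= a) and (a <= 3)) and (b > 5)), projection=[a, b]
DatasetExec: number_of_fragments=1, projection=[nested_data]
```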