From f9b78fa3180c5d6c20eaa3b6d0af7426d7084093 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sun, 27 Apr 2025 18:58:05 +0800 Subject: [PATCH 01/40] feat: add configurable max table bytes and min table rows for DataFrame display --- python/datafusion/html_formatter.py | 19 +++++++- src/dataframe.rs | 69 ++++++++++++++++------------- 2 files changed, 57 insertions(+), 31 deletions(-) diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py index a50e14fd5..2eb116cab 100644 --- a/python/datafusion/html_formatter.py +++ b/python/datafusion/html_formatter.py @@ -98,6 +98,8 @@ class DataFrameHtmlFormatter: style_provider: Custom provider for cell and header styles use_shared_styles: Whether to load styles and scripts only once per notebook session + max_table_bytes: Maximum bytes to display for table presentation (default: 2MB) + min_table_rows: Minimum number of table rows to display (default: 20) """ # Class variable to track if styles have been loaded in the notebook @@ -113,6 +115,8 @@ def __init__( show_truncation_message: bool = True, style_provider: Optional[StyleProvider] = None, use_shared_styles: bool = True, + max_table_bytes: int = 2 * 1024 * 1024, # 2 MB + min_table_rows: int = 20, ) -> None: """Initialize the HTML formatter. @@ -135,11 +139,16 @@ def __init__( is used. use_shared_styles : bool, default True Whether to use shared styles across multiple tables. + max_table_bytes : int, default 2MB (2 * 1024 * 1024) + Maximum bytes to display for table presentation. + min_table_rows : int, default 20 + Minimum number of table rows to display. Raises: ------ ValueError - If max_cell_length, max_width, or max_height is not a positive integer. + If max_cell_length, max_width, max_height, max_table_bytes, or min_table_rows + is not a positive integer. TypeError If enable_cell_expansion, show_truncation_message, or use_shared_styles is not a boolean, @@ -158,6 +167,12 @@ def __init__( if not isinstance(max_height, int) or max_height <= 0: msg = "max_height must be a positive integer" raise ValueError(msg) + if not isinstance(max_table_bytes, int) or max_table_bytes <= 0: + msg = "max_table_bytes must be a positive integer" + raise ValueError(msg) + if not isinstance(min_table_rows, int) or min_table_rows <= 0: + msg = "min_table_rows must be a positive integer" + raise ValueError(msg) # Validate boolean parameters if not isinstance(enable_cell_expansion, bool): @@ -188,6 +203,8 @@ def __init__( self.show_truncation_message = show_truncation_message self.style_provider = style_provider or DefaultStyleProvider() self.use_shared_styles = use_shared_styles + self.max_table_bytes = max_table_bytes + self.min_table_rows = min_table_rows # Registry for custom type formatters self._type_formatters: dict[type, CellFormatter] = {} # Custom cell builders diff --git a/src/dataframe.rs b/src/dataframe.rs index 9b610b5d7..e9f73a70d 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -71,8 +71,6 @@ impl PyTableProvider { PyTable::new(table_provider) } } -const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB -const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20; /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. 
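PATCH 01 threads two new display knobs through DataFrameHtmlFormatter and validates them the same way as the existing size limits. A minimal usage sketch of the added validation, assuming a build with PATCH 01 applied (note this commit is reverted in PATCH 02 and superseded by the max_memory_bytes / min_rows_display / repr_rows options of PATCH 07):

    from datafusion.html_formatter import DataFrameHtmlFormatter

    # Defaults mirror the removed Rust constants: 2 MB byte budget, 20-row floor.
    formatter = DataFrameHtmlFormatter(max_table_bytes=1024 * 1024, min_table_rows=10)
    assert formatter.max_table_bytes == 1024 * 1024

    # Non-positive values trip the new checks in __init__.
    try:
        DataFrameHtmlFormatter(min_table_rows=0)
    except ValueError as exc:
        print(exc)  # "min_table_rows must be a positive integer"
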
@@ -81,12 +79,16 @@ const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20; #[derive(Clone)] pub struct PyDataFrame { df: Arc, + display_config: Arc, } impl PyDataFrame { /// creates a new PyDataFrame - pub fn new(df: DataFrame) -> Self { - Self { df: Arc::new(df) } + pub fn new(df: DataFrame, display_config: PyDataframeDisplayConfig) -> Self { + Self { + df: Arc::new(df), + display_config: Arc::new(display_config), + } } } @@ -116,7 +118,12 @@ impl PyDataFrame { fn __repr__(&self, py: Python) -> PyDataFusionResult { let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10), + collect_record_batches_to_display( + self.df.as_ref().clone(), + 10, + 10, + self.display_config.max_table_bytes, + ), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below @@ -139,8 +146,9 @@ impl PyDataFrame { py, collect_record_batches_to_display( self.df.as_ref().clone(), - MIN_TABLE_ROWS_TO_DISPLAY, + self.display_config.min_table_rows, usize::MAX, + self.display_config.max_table_bytes, ), )?; if batches.is_empty() { @@ -181,7 +189,7 @@ impl PyDataFrame { fn describe(&self, py: Python) -> PyDataFusionResult { let df = self.df.as_ref().clone(); let stat_df = wait_for_future(py, df.describe())?; - Ok(Self::new(stat_df)) + Ok(Self::new(stat_df, (*self.display_config).clone())) } /// Returns the schema from the logical plan @@ -211,31 +219,31 @@ impl PyDataFrame { fn select_columns(&self, args: Vec) -> PyDataFusionResult { let args = args.iter().map(|s| s.as_ref()).collect::>(); let df = self.df.as_ref().clone().select_columns(&args)?; - Ok(Self::new(df)) + Ok(Self::new(df, (*self.display_config).clone())) } #[pyo3(signature = (*args))] fn select(&self, args: Vec) -> PyDataFusionResult { let expr = args.into_iter().map(|e| e.into()).collect(); let df = self.df.as_ref().clone().select(expr)?; - Ok(Self::new(df)) + Ok(Self::new(df, (*self.display_config).clone())) } #[pyo3(signature = (*args))] fn drop(&self, args: Vec) -> PyDataFusionResult { let cols = args.iter().map(|s| s.as_ref()).collect::>(); let df = self.df.as_ref().clone().drop_columns(&cols)?; - Ok(Self::new(df)) + Ok(Self::new(df, (*self.display_config).clone())) } fn filter(&self, predicate: PyExpr) -> PyDataFusionResult { let df = self.df.as_ref().clone().filter(predicate.into())?; - Ok(Self::new(df)) + Ok(Self::new(df, (*self.display_config).clone())) } fn with_column(&self, name: &str, expr: PyExpr) -> PyDataFusionResult { let df = self.df.as_ref().clone().with_column(name, expr.into())?; - Ok(Self::new(df)) + Ok(Self::new(df, (*self.display_config).clone())) } fn with_columns(&self, exprs: Vec) -> PyDataFusionResult { @@ -245,7 +253,7 @@ impl PyDataFrame { let name = format!("{}", expr.schema_name()); df = df.with_column(name.as_str(), expr)? } - Ok(Self::new(df)) + Ok(Self::new(df, (*self.display_config).clone())) } /// Rename one column by applying a new projection. 
This is a no-op if the column to be @@ -256,27 +264,27 @@ impl PyDataFrame { .as_ref() .clone() .with_column_renamed(old_name, new_name)?; - Ok(Self::new(df)) + Ok(Self::new(df, (*self.display_config).clone())) } fn aggregate(&self, group_by: Vec, aggs: Vec) -> PyDataFusionResult { let group_by = group_by.into_iter().map(|e| e.into()).collect(); let aggs = aggs.into_iter().map(|e| e.into()).collect(); let df = self.df.as_ref().clone().aggregate(group_by, aggs)?; - Ok(Self::new(df)) + Ok(Self::new(df, (*self.display_config).clone())) } #[pyo3(signature = (*exprs))] fn sort(&self, exprs: Vec) -> PyDataFusionResult { let exprs = to_sort_expressions(exprs); let df = self.df.as_ref().clone().sort(exprs)?; - Ok(Self::new(df)) + Ok(Self::new(df, (*self.display_config).clone())) } #[pyo3(signature = (count, offset=0))] fn limit(&self, count: usize, offset: usize) -> PyDataFusionResult { let df = self.df.as_ref().clone().limit(offset, Some(count))?; - Ok(Self::new(df)) + Ok(Self::new(df, (*self.display_config).clone())) } /// Executes the plan, returning a list of `RecordBatch`es. @@ -293,7 +301,7 @@ impl PyDataFrame { /// Cache DataFrame. fn cache(&self, py: Python) -> PyDataFusionResult { let df = wait_for_future(py, self.df.as_ref().clone().cache())?; - Ok(Self::new(df)) + Ok(Self::new(df, (*self.display_config).clone())) } /// Executes this DataFrame and collects all results into a vector of vector of RecordBatch @@ -318,7 +326,7 @@ impl PyDataFrame { /// Filter out duplicate rows fn distinct(&self) -> PyDataFusionResult { let df = self.df.as_ref().clone().distinct()?; - Ok(Self::new(df)) + Ok(Self::new(df, (*self.display_config).clone())) } fn join( @@ -352,7 +360,7 @@ impl PyDataFrame { &right_keys, None, )?; - Ok(Self::new(df)) + Ok(Self::new(df, (*self.display_config).clone())) } fn join_on( @@ -381,7 +389,7 @@ impl PyDataFrame { .as_ref() .clone() .join_on(right.df.as_ref().clone(), join_type, exprs)?; - Ok(Self::new(df)) + Ok(Self::new(df, (*self.display_config).clone())) } /// Print the query plan @@ -414,7 +422,7 @@ impl PyDataFrame { .as_ref() .clone() .repartition(Partitioning::RoundRobinBatch(num))?; - Ok(Self::new(new_df)) + Ok(Self::new(new_df, (*self.display_config).clone())) } /// Repartition a `DataFrame` based on a logical partitioning scheme. @@ -426,7 +434,7 @@ impl PyDataFrame { .as_ref() .clone() .repartition(Partitioning::Hash(expr, num))?; - Ok(Self::new(new_df)) + Ok(Self::new(new_df, (*self.display_config).clone())) } /// Calculate the union of two `DataFrame`s, preserving duplicate rows.The @@ -442,7 +450,7 @@ impl PyDataFrame { self.df.as_ref().clone().union(py_df.df.as_ref().clone())? }; - Ok(Self::new(new_df)) + Ok(Self::new(new_df, (*self.display_config).clone())) } /// Calculate the distinct union of two `DataFrame`s. The @@ -453,7 +461,7 @@ impl PyDataFrame { .as_ref() .clone() .union_distinct(py_df.df.as_ref().clone())?; - Ok(Self::new(new_df)) + Ok(Self::new(new_df, (*self.display_config).clone())) } #[pyo3(signature = (column, preserve_nulls=true))] @@ -494,13 +502,13 @@ impl PyDataFrame { .as_ref() .clone() .intersect(py_df.df.as_ref().clone())?; - Ok(Self::new(new_df)) + Ok(Self::new(new_df, (*self.display_config).clone())) } /// Calculate the exception of two `DataFrame`s. 
The two `DataFrame`s must have exactly the same schema fn except_all(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?; - Ok(Self::new(new_df)) + Ok(Self::new(new_df, (*self.display_config).clone())) } /// Write a `DataFrame` to a CSV file. @@ -798,6 +806,7 @@ async fn collect_record_batches_to_display( df: DataFrame, min_rows: usize, max_rows: usize, + max_table_bytes: usize, ) -> Result<(Vec, bool), DataFusionError> { let partitioned_stream = df.execute_stream_partitioned().await?; let mut stream = futures::stream::iter(partitioned_stream).flatten(); @@ -806,7 +815,7 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows) + while (size_estimate_so_far < max_table_bytes && rows_so_far < max_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { @@ -821,8 +830,8 @@ async fn collect_record_batches_to_display( if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); - if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY { - let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32; + if size_estimate_so_far > max_table_bytes { + let ratio = max_table_bytes as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; From 4d8fa38007b7dfe689344fc44d5392a8734c64f5 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sun, 27 Apr 2025 18:58:23 +0800 Subject: [PATCH 02/40] Revert "feat: add configurable max table bytes and min table rows for DataFrame display" This reverts commit f9b78fa3180c5d6c20eaa3b6d0af7426d7084093. --- python/datafusion/html_formatter.py | 19 +------- src/dataframe.rs | 69 +++++++++++++---------------- 2 files changed, 31 insertions(+), 57 deletions(-) diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py index 2eb116cab..a50e14fd5 100644 --- a/python/datafusion/html_formatter.py +++ b/python/datafusion/html_formatter.py @@ -98,8 +98,6 @@ class DataFrameHtmlFormatter: style_provider: Custom provider for cell and header styles use_shared_styles: Whether to load styles and scripts only once per notebook session - max_table_bytes: Maximum bytes to display for table presentation (default: 2MB) - min_table_rows: Minimum number of table rows to display (default: 20) """ # Class variable to track if styles have been loaded in the notebook @@ -115,8 +113,6 @@ def __init__( show_truncation_message: bool = True, style_provider: Optional[StyleProvider] = None, use_shared_styles: bool = True, - max_table_bytes: int = 2 * 1024 * 1024, # 2 MB - min_table_rows: int = 20, ) -> None: """Initialize the HTML formatter. @@ -139,16 +135,11 @@ def __init__( is used. use_shared_styles : bool, default True Whether to use shared styles across multiple tables. - max_table_bytes : int, default 2MB (2 * 1024 * 1024) - Maximum bytes to display for table presentation. - min_table_rows : int, default 20 - Minimum number of table rows to display. Raises: ------ ValueError - If max_cell_length, max_width, max_height, max_table_bytes, or min_table_rows - is not a positive integer. + If max_cell_length, max_width, or max_height is not a positive integer. 
TypeError If enable_cell_expansion, show_truncation_message, or use_shared_styles is not a boolean, @@ -167,12 +158,6 @@ def __init__( if not isinstance(max_height, int) or max_height <= 0: msg = "max_height must be a positive integer" raise ValueError(msg) - if not isinstance(max_table_bytes, int) or max_table_bytes <= 0: - msg = "max_table_bytes must be a positive integer" - raise ValueError(msg) - if not isinstance(min_table_rows, int) or min_table_rows <= 0: - msg = "min_table_rows must be a positive integer" - raise ValueError(msg) # Validate boolean parameters if not isinstance(enable_cell_expansion, bool): @@ -203,8 +188,6 @@ def __init__( self.show_truncation_message = show_truncation_message self.style_provider = style_provider or DefaultStyleProvider() self.use_shared_styles = use_shared_styles - self.max_table_bytes = max_table_bytes - self.min_table_rows = min_table_rows # Registry for custom type formatters self._type_formatters: dict[type, CellFormatter] = {} # Custom cell builders diff --git a/src/dataframe.rs b/src/dataframe.rs index e9f73a70d..9b610b5d7 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -71,6 +71,8 @@ impl PyTableProvider { PyTable::new(table_provider) } } +const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB +const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20; /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. @@ -79,16 +81,12 @@ impl PyTableProvider { #[derive(Clone)] pub struct PyDataFrame { df: Arc, - display_config: Arc, } impl PyDataFrame { /// creates a new PyDataFrame - pub fn new(df: DataFrame, display_config: PyDataframeDisplayConfig) -> Self { - Self { - df: Arc::new(df), - display_config: Arc::new(display_config), - } + pub fn new(df: DataFrame) -> Self { + Self { df: Arc::new(df) } } } @@ -118,12 +116,7 @@ impl PyDataFrame { fn __repr__(&self, py: Python) -> PyDataFusionResult { let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display( - self.df.as_ref().clone(), - 10, - 10, - self.display_config.max_table_bytes, - ), + collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below @@ -146,9 +139,8 @@ impl PyDataFrame { py, collect_record_batches_to_display( self.df.as_ref().clone(), - self.display_config.min_table_rows, + MIN_TABLE_ROWS_TO_DISPLAY, usize::MAX, - self.display_config.max_table_bytes, ), )?; if batches.is_empty() { @@ -189,7 +181,7 @@ impl PyDataFrame { fn describe(&self, py: Python) -> PyDataFusionResult { let df = self.df.as_ref().clone(); let stat_df = wait_for_future(py, df.describe())?; - Ok(Self::new(stat_df, (*self.display_config).clone())) + Ok(Self::new(stat_df)) } /// Returns the schema from the logical plan @@ -219,31 +211,31 @@ impl PyDataFrame { fn select_columns(&self, args: Vec) -> PyDataFusionResult { let args = args.iter().map(|s| s.as_ref()).collect::>(); let df = self.df.as_ref().clone().select_columns(&args)?; - Ok(Self::new(df, (*self.display_config).clone())) + Ok(Self::new(df)) } #[pyo3(signature = (*args))] fn select(&self, args: Vec) -> PyDataFusionResult { let expr = args.into_iter().map(|e| e.into()).collect(); let df = self.df.as_ref().clone().select(expr)?; - Ok(Self::new(df, (*self.display_config).clone())) + Ok(Self::new(df)) } #[pyo3(signature = (*args))] fn drop(&self, args: Vec) -> PyDataFusionResult { let cols = 
args.iter().map(|s| s.as_ref()).collect::>(); let df = self.df.as_ref().clone().drop_columns(&cols)?; - Ok(Self::new(df, (*self.display_config).clone())) + Ok(Self::new(df)) } fn filter(&self, predicate: PyExpr) -> PyDataFusionResult { let df = self.df.as_ref().clone().filter(predicate.into())?; - Ok(Self::new(df, (*self.display_config).clone())) + Ok(Self::new(df)) } fn with_column(&self, name: &str, expr: PyExpr) -> PyDataFusionResult { let df = self.df.as_ref().clone().with_column(name, expr.into())?; - Ok(Self::new(df, (*self.display_config).clone())) + Ok(Self::new(df)) } fn with_columns(&self, exprs: Vec) -> PyDataFusionResult { @@ -253,7 +245,7 @@ impl PyDataFrame { let name = format!("{}", expr.schema_name()); df = df.with_column(name.as_str(), expr)? } - Ok(Self::new(df, (*self.display_config).clone())) + Ok(Self::new(df)) } /// Rename one column by applying a new projection. This is a no-op if the column to be @@ -264,27 +256,27 @@ impl PyDataFrame { .as_ref() .clone() .with_column_renamed(old_name, new_name)?; - Ok(Self::new(df, (*self.display_config).clone())) + Ok(Self::new(df)) } fn aggregate(&self, group_by: Vec, aggs: Vec) -> PyDataFusionResult { let group_by = group_by.into_iter().map(|e| e.into()).collect(); let aggs = aggs.into_iter().map(|e| e.into()).collect(); let df = self.df.as_ref().clone().aggregate(group_by, aggs)?; - Ok(Self::new(df, (*self.display_config).clone())) + Ok(Self::new(df)) } #[pyo3(signature = (*exprs))] fn sort(&self, exprs: Vec) -> PyDataFusionResult { let exprs = to_sort_expressions(exprs); let df = self.df.as_ref().clone().sort(exprs)?; - Ok(Self::new(df, (*self.display_config).clone())) + Ok(Self::new(df)) } #[pyo3(signature = (count, offset=0))] fn limit(&self, count: usize, offset: usize) -> PyDataFusionResult { let df = self.df.as_ref().clone().limit(offset, Some(count))?; - Ok(Self::new(df, (*self.display_config).clone())) + Ok(Self::new(df)) } /// Executes the plan, returning a list of `RecordBatch`es. @@ -301,7 +293,7 @@ impl PyDataFrame { /// Cache DataFrame. fn cache(&self, py: Python) -> PyDataFusionResult { let df = wait_for_future(py, self.df.as_ref().clone().cache())?; - Ok(Self::new(df, (*self.display_config).clone())) + Ok(Self::new(df)) } /// Executes this DataFrame and collects all results into a vector of vector of RecordBatch @@ -326,7 +318,7 @@ impl PyDataFrame { /// Filter out duplicate rows fn distinct(&self) -> PyDataFusionResult { let df = self.df.as_ref().clone().distinct()?; - Ok(Self::new(df, (*self.display_config).clone())) + Ok(Self::new(df)) } fn join( @@ -360,7 +352,7 @@ impl PyDataFrame { &right_keys, None, )?; - Ok(Self::new(df, (*self.display_config).clone())) + Ok(Self::new(df)) } fn join_on( @@ -389,7 +381,7 @@ impl PyDataFrame { .as_ref() .clone() .join_on(right.df.as_ref().clone(), join_type, exprs)?; - Ok(Self::new(df, (*self.display_config).clone())) + Ok(Self::new(df)) } /// Print the query plan @@ -422,7 +414,7 @@ impl PyDataFrame { .as_ref() .clone() .repartition(Partitioning::RoundRobinBatch(num))?; - Ok(Self::new(new_df, (*self.display_config).clone())) + Ok(Self::new(new_df)) } /// Repartition a `DataFrame` based on a logical partitioning scheme. @@ -434,7 +426,7 @@ impl PyDataFrame { .as_ref() .clone() .repartition(Partitioning::Hash(expr, num))?; - Ok(Self::new(new_df, (*self.display_config).clone())) + Ok(Self::new(new_df)) } /// Calculate the union of two `DataFrame`s, preserving duplicate rows.The @@ -450,7 +442,7 @@ impl PyDataFrame { self.df.as_ref().clone().union(py_df.df.as_ref().clone())? 
}; - Ok(Self::new(new_df, (*self.display_config).clone())) + Ok(Self::new(new_df)) } /// Calculate the distinct union of two `DataFrame`s. The @@ -461,7 +453,7 @@ impl PyDataFrame { .as_ref() .clone() .union_distinct(py_df.df.as_ref().clone())?; - Ok(Self::new(new_df, (*self.display_config).clone())) + Ok(Self::new(new_df)) } #[pyo3(signature = (column, preserve_nulls=true))] @@ -502,13 +494,13 @@ impl PyDataFrame { .as_ref() .clone() .intersect(py_df.df.as_ref().clone())?; - Ok(Self::new(new_df, (*self.display_config).clone())) + Ok(Self::new(new_df)) } /// Calculate the exception of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema fn except_all(&self, py_df: PyDataFrame) -> PyDataFusionResult { let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?; - Ok(Self::new(new_df, (*self.display_config).clone())) + Ok(Self::new(new_df)) } /// Write a `DataFrame` to a CSV file. @@ -806,7 +798,6 @@ async fn collect_record_batches_to_display( df: DataFrame, min_rows: usize, max_rows: usize, - max_table_bytes: usize, ) -> Result<(Vec, bool), DataFusionError> { let partitioned_stream = df.execute_stream_partitioned().await?; let mut stream = futures::stream::iter(partitioned_stream).flatten(); @@ -815,7 +806,7 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - while (size_estimate_so_far < max_table_bytes && rows_so_far < max_rows) + while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { @@ -830,8 +821,8 @@ async fn collect_record_batches_to_display( if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); - if size_estimate_so_far > max_table_bytes { - let ratio = max_table_bytes as f32 / size_estimate_so_far as f32; + if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY { + let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; From a9178feb501c11c3c9ee0a20f71418a8ea4168f7 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sun, 27 Apr 2025 19:20:13 +0800 Subject: [PATCH 03/40] feat: add FormatterConfig for configurable DataFrame display options --- src/dataframe.rs | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/src/dataframe.rs b/src/dataframe.rs index 9b610b5d7..cef950988 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -71,9 +71,62 @@ impl PyTableProvider { PyTable::new(table_provider) } } + +/// Configuration for DataFrame display formatting +#[derive(Debug, Clone)] +pub struct FormatterConfig { + /// Maximum memory in bytes to use for display (default: 2MB) + pub max_bytes: usize, + /// Minimum number of rows to display (default: 20) + pub min_rows: usize, + /// Number of rows to include in __repr__ output (default: 10) + pub repr_rows: usize, +} + +impl Default for FormatterConfig { + fn default() -> Self { + Self { + max_bytes: 2 * 1024 * 1024, // 2MB + min_rows: 20, + repr_rows: 10, + } + } +} + +// Keep constants for backward compatibility const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20; +fn get_formatter_config(py: Python) -> PyResult { + let formatter_module = py.import("datafusion.html_formatter")?; + let get_formatter = formatter_module.getattr("get_formatter")?; + let formatter = get_formatter.call0()?; + + // Get 
max_memory_bytes (or fallback to default) + let max_bytes = formatter + .getattr("max_memory_bytes") + .and_then(|v| v.extract::<usize>()) + .unwrap_or(FormatterConfig::default().max_bytes); + + // Get min_rows_display (or fallback to default) + let min_rows = formatter + .getattr("min_rows_display") + .and_then(|v| v.extract::<usize>()) + .unwrap_or(FormatterConfig::default().min_rows); + + // Get repr_rows (or fallback to default) + let repr_rows = formatter + .getattr("repr_rows") + .and_then(|v| v.extract::<usize>()) + .unwrap_or(FormatterConfig::default().repr_rows); + + Ok(FormatterConfig { + max_bytes, + min_rows, + repr_rows, + }) +} + /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. /// The actual execution of a plan runs natively on Rust and Arrow on a multi-threaded environment. From d0209cf7d90400675f09b490cad0ca700d74f4c7 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sun, 27 Apr 2025 19:23:42 +0800 Subject: [PATCH 04/40] refactor: simplify attribute extraction in get_formatter_config function --- src/dataframe.rs | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index cef950988..ea838d845 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -102,23 +102,22 @@ fn get_formatter_config(py: Python) -> PyResult<FormatterConfig> { let get_formatter = formatter_module.getattr("get_formatter")?; let formatter = get_formatter.call0()?; - // Get max_memory_bytes (or fallback to default) - let max_bytes = formatter - .getattr("max_memory_bytes") - .and_then(|v| v.extract::<usize>()) - .unwrap_or(FormatterConfig::default().max_bytes); - - // Get min_rows_display (or fallback to default) - let min_rows = formatter - .getattr("min_rows_display") - .and_then(|v| v.extract::<usize>()) - .unwrap_or(FormatterConfig::default().min_rows); - - // Get repr_rows (or fallback to default) - let repr_rows = formatter - .getattr("repr_rows") - .and_then(|v| v.extract::<usize>()) - .unwrap_or(FormatterConfig::default().repr_rows); + // Helper function to extract attributes with fallback to default + fn get_attr<'a>( + formatter: &'a Bound<'a, PyAny>, + attr_name: &str, + default_value: usize, + ) -> usize { + formatter + .getattr(attr_name) + .and_then(|v| v.extract::<usize>()) + .unwrap_or(default_value) + } + + let default_config = FormatterConfig::default(); + let max_bytes = get_attr(&formatter, "max_memory_bytes", default_config.max_bytes); + let min_rows = get_attr(&formatter, "min_rows_display", default_config.min_rows); + let repr_rows = get_attr(&formatter, "repr_rows", default_config.repr_rows); Ok(FormatterConfig { max_bytes, min_rows, repr_rows, From 2ef013f1d9f9af3113e2a16b5e92d2274f9cd3e3 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sun, 27 Apr 2025 19:33:01 +0800 Subject: [PATCH 05/40] refactor: remove hardcoded constants and use FormatterConfig for display options --- src/dataframe.rs | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index ea838d845..e6dd4f70d 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -93,10 +93,6 @@ impl Default for FormatterConfig { } } -// Keep constants for backward compatibility -const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB -const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20; fn get_formatter_config(py: Python) -> PyResult<FormatterConfig> { let formatter_module = py.import("datafusion.html_formatter")?; let get_formatter =
formatter_module.getattr("get_formatter")?; @@ -166,9 +162,14 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult { + let config = get_formatter_config(py)?; let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10), + collect_record_batches_to_display( + self.df.as_ref().clone(), + config.repr_rows, + config.repr_rows, + ), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below @@ -187,11 +188,12 @@ impl PyDataFrame { } fn _repr_html_(&self, py: Python) -> PyDataFusionResult { + let config = get_formatter_config(py)?; let (batches, has_more) = wait_for_future( py, collect_record_batches_to_display( self.df.as_ref().clone(), - MIN_TABLE_ROWS_TO_DISPLAY, + config.min_rows, usize::MAX, ), )?; @@ -851,6 +853,9 @@ async fn collect_record_batches_to_display( min_rows: usize, max_rows: usize, ) -> Result<(Vec, bool), DataFusionError> { + let config = FormatterConfig::default(); + let max_bytes = config.max_bytes; + let partitioned_stream = df.execute_stream_partitioned().await?; let mut stream = futures::stream::iter(partitioned_stream).flatten(); let mut size_estimate_so_far = 0; @@ -858,9 +863,7 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows) - || rows_so_far < min_rows - { + while (size_estimate_so_far < max_bytes && rows_so_far < max_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { None => { break; @@ -873,8 +876,8 @@ async fn collect_record_batches_to_display( if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); - if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY { - let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32; + if size_estimate_so_far > max_bytes { + let ratio = max_bytes as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; From bea52a31a3b6c2ee481a9d21d28ffa00674e9dd6 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sun, 27 Apr 2025 21:02:20 +0800 Subject: [PATCH 06/40] refactor: simplify record batch collection by using FormatterConfig for display options --- src/dataframe.rs | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index e6dd4f70d..62069461a 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -165,11 +165,7 @@ impl PyDataFrame { let config = get_formatter_config(py)?; let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display( - self.df.as_ref().clone(), - config.repr_rows, - config.repr_rows, - ), + collect_record_batches_to_display(self.df.as_ref().clone(), config), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below @@ -191,11 +187,7 @@ impl PyDataFrame { let config = get_formatter_config(py)?; let (batches, has_more) = wait_for_future( py, - collect_record_batches_to_display( - self.df.as_ref().clone(), - config.min_rows, - usize::MAX, - ), + collect_record_batches_to_display(self.df.as_ref().clone(), config), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below @@ -850,11 +842,11 @@ fn record_batch_into_schema( /// rows, set min_rows == max_rows. 
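In PATCHES 03-06, get_formatter_config reads the display limits off the Python formatter object and falls back to the FormatterConfig defaults when an attribute is missing or cannot be extracted as an integer. A rough Python rendering of that fallback logic, for orientation only: the attribute names and the 2 MB / 20 / 10 defaults come from the diffs above, and the helper name simply mirrors the Rust get_attr.

    from datafusion.html_formatter import get_formatter

    DEFAULTS = {"max_memory_bytes": 2 * 1024 * 1024, "min_rows_display": 20, "repr_rows": 10}

    def get_attr(formatter, name):
        # Fall back to the FormatterConfig default, as the Rust unwrap_or does.
        value = getattr(formatter, name, None)
        return value if isinstance(value, int) else DEFAULTS[name]

    config = {name: get_attr(get_formatter(), name) for name in DEFAULTS}
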
async fn collect_record_batches_to_display( df: DataFrame, - min_rows: usize, - max_rows: usize, + config: FormatterConfig, ) -> Result<(Vec, bool), DataFusionError> { - let config = FormatterConfig::default(); let max_bytes = config.max_bytes; + let min_rows = config.min_rows; + let max_rows = config.repr_rows; let partitioned_stream = df.execute_stream_partitioned().await?; let mut stream = futures::stream::iter(partitioned_stream).flatten(); From ce15f1dcf8d595044cd0a90f76f3612871cbd80e Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sun, 27 Apr 2025 21:02:30 +0800 Subject: [PATCH 07/40] feat: add max_memory_bytes, min_rows_display, and repr_rows parameters to DataFrameHtmlFormatter --- python/datafusion/html_formatter.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py index a50e14fd5..065b7262c 100644 --- a/python/datafusion/html_formatter.py +++ b/python/datafusion/html_formatter.py @@ -91,6 +91,9 @@ class DataFrameHtmlFormatter: max_cell_length: Maximum characters to display in a cell before truncation max_width: Maximum width of the HTML table in pixels max_height: Maximum height of the HTML table in pixels + max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB) + min_rows_display: Minimum number of rows to display + repr_rows: Default number of rows to display in repr output enable_cell_expansion: Whether to add expand/collapse buttons for long cell values custom_css: Additional CSS to include in the HTML output @@ -108,6 +111,9 @@ def __init__( max_cell_length: int = 25, max_width: int = 1000, max_height: int = 300, + max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB + min_rows_display: int = 20, + repr_rows: int = 10, enable_cell_expansion: bool = True, custom_css: Optional[str] = None, show_truncation_message: bool = True, @@ -124,6 +130,12 @@ def __init__( Maximum width of the displayed table in pixels. max_height : int, default 300 Maximum height of the displayed table in pixels. + max_memory_bytes : int, default 2097152 (2MB) + Maximum memory in bytes for rendered data. + min_rows_display : int, default 20 + Minimum number of rows to display. + repr_rows : int, default 10 + Default number of rows to display in repr output. enable_cell_expansion : bool, default True Whether to allow cells to expand when clicked. custom_css : str, optional @@ -139,7 +151,8 @@ def __init__( Raises: ------ ValueError - If max_cell_length, max_width, or max_height is not a positive integer. + If max_cell_length, max_width, max_height, max_memory_bytes, + min_rows_display, or repr_rows is not a positive integer. 
TypeError If enable_cell_expansion, show_truncation_message, or use_shared_styles is not a boolean, @@ -158,6 +171,15 @@ def __init__( if not isinstance(max_height, int) or max_height <= 0: msg = "max_height must be a positive integer" raise ValueError(msg) + if not isinstance(max_memory_bytes, int) or max_memory_bytes <= 0: + msg = "max_memory_bytes must be a positive integer" + raise ValueError(msg) + if not isinstance(min_rows_display, int) or min_rows_display <= 0: + msg = "min_rows_display must be a positive integer" + raise ValueError(msg) + if not isinstance(repr_rows, int) or repr_rows <= 0: + msg = "repr_rows must be a positive integer" + raise ValueError(msg) # Validate boolean parameters if not isinstance(enable_cell_expansion, bool): @@ -183,6 +205,9 @@ def __init__( self.max_cell_length = max_cell_length self.max_width = max_width self.max_height = max_height + self.max_memory_bytes = max_memory_bytes + self.min_rows_display = min_rows_display + self.repr_rows = repr_rows self.enable_cell_expansion = enable_cell_expansion self.custom_css = custom_css self.show_truncation_message = show_truncation_message From e089d7b282e53e587116b11d92760e6d292ec871 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sun, 27 Apr 2025 21:15:50 +0800 Subject: [PATCH 08/40] feat: add tests for HTML formatter row display settings and memory limit --- python/tests/test_dataframe.py | 136 ++++++++++++++++----------------- 1 file changed, 68 insertions(+), 68 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 464b884db..2a6f7ec5a 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -679,6 +679,9 @@ def test_html_formatter_configuration(df, clean_formatter_state): max_width=500, max_height=200, enable_cell_expansion=False, + max_memory_bytes=1024 * 1024, # 1 MB + min_rows_display=15, + repr_rows=5, ) html_output = df._repr_html_() @@ -690,6 +693,71 @@ def test_html_formatter_configuration(df, clean_formatter_state): assert "expandable-container" not in html_output +def test_html_formatter_row_display_settings(clean_formatter_state): + """Test that min_rows_display and repr_rows affect the output.""" + ctx = SessionContext() + + # Create a dataframe with 30 rows + data = list(range(30)) + batch = pa.RecordBatch.from_arrays( + [pa.array(data)], + names=["value"], + ) + df = ctx.create_dataframe([[batch]]) + + # Test with default settings (should use repr_rows) + configure_formatter(repr_rows=7, min_rows_display=20) + html_default = df._repr_html_() + + # Verify we only show repr_rows (7) rows in the output + # by counting the number of value cells + value_cells = re.findall(r"]*>\s*\d+\s*", html_default) + assert len(value_cells) == 7 + assert "... with 23 more rows" in html_default + + # Configure to show all rows since it's below min_rows_display + reset_formatter() + configure_formatter(repr_rows=5, min_rows_display=50) + html_all = df._repr_html_() + + # Verify we show all rows + value_cells = re.findall(r"]*>\s*\d+\s*", html_all) + assert len(value_cells) == 30 + assert "... 
with" not in html_all + + +def test_html_formatter_memory_limit(clean_formatter_state): + """Test that max_memory_bytes limits the HTML rendering.""" + ctx = SessionContext() + + # Create a large string that will consume substantial memory when rendered + large_string = "x" * 100000 + + # Create a dataframe with 10 rows of large strings + batch = pa.RecordBatch.from_arrays( + [pa.array([large_string] * 10)], + names=["large_value"], + ) + df = ctx.create_dataframe([[batch]]) + + # Set very small memory limit + configure_formatter(max_memory_bytes=1000) # 1KB + + html_limited = df._repr_html_() + + # Verify that memory limit warning is included in the output + assert "Memory usage limit reached" in html_limited + + # Now with larger limit, should display normally + reset_formatter() + configure_formatter(max_memory_bytes=10 * 1024 * 1024) # 10MB + + html_full = df._repr_html_() + + # Verify no memory limit warning + assert "Memory usage limit reached" not in html_full + + def test_html_formatter_custom_style_provider(df, clean_formatter_state): """Test using custom style providers with the HTML formatter.""" @@ -771,74 +839,6 @@ def custom_cell_builder(value, row, col, table_id): r']*>(\d+)-low', html_output ) mid_cells = re.findall( - r']*>(\d+)-mid', html_output - ) - high_cells = re.findall( - r']*>(\d+)-high', html_output - ) - - # Sort the extracted values for consistent comparison - low_cells = sorted(map(int, low_cells)) - mid_cells = sorted(map(int, mid_cells)) - high_cells = sorted(map(int, high_cells)) - - # Verify specific values have the correct styling applied - assert low_cells == [1, 2] # Values < 3 - assert mid_cells == [3, 4, 5, 5] # Values 3-5 - assert high_cells == [6, 8, 8] # Values > 5 - - # Verify the exact content with styling appears in the output - assert ( - '1-low' - in html_output - ) - assert ( - '2-low' - in html_output - ) - assert ( - '3-mid' in html_output - ) - assert ( - '4-mid' in html_output - ) - assert ( - '6-high' - in html_output - ) - assert ( - '8-high' - in html_output - ) - - # Count occurrences to ensure all cells are properly styled - assert html_output.count("-low") == 2 # Two low values (1, 2) - assert html_output.count("-mid") == 4 # Four mid values (3, 4, 5, 5) - assert html_output.count("-high") == 3 # Three high values (6, 8, 8) - - # Create a custom cell builder that changes background color based on value - def custom_cell_builder(value, row, col, table_id): - # Handle numeric values regardless of their exact type - try: - num_value = int(value) - if num_value > 5: # Values > 5 get green background - return f'{value}' - if num_value < 3: # Values < 3 get light blue background - return f'{value}' - except (ValueError, TypeError): - pass - - # Default styling for other cells - return f'{value}' - - # Set our custom cell builder - formatter = get_formatter() - formatter.set_custom_cell_builder(custom_cell_builder) - - html_output = df._repr_html_() - - # Verify our custom cell styling was applied - assert "background-color: #d3e9f0" in html_output # For values 1,2 def test_html_formatter_custom_header_builder(df, clean_formatter_state): From a6792c9379c677f0a2456d1c886524136b5489de Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sun, 27 Apr 2025 21:19:43 +0800 Subject: [PATCH 09/40] refactor: extract Python formatter retrieval into a separate function --- src/dataframe.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index 62069461a..98983473a 100644 --- 
a/src/dataframe.rs +++ b/src/dataframe.rs @@ -93,10 +93,15 @@ impl Default for FormatterConfig { } } -fn get_formatter_config(py: Python) -> PyResult { +/// Get the Python formatter from the datafusion.html_formatter module +fn get_python_formatter(py: Python) -> PyResult> { let formatter_module = py.import("datafusion.html_formatter")?; let get_formatter = formatter_module.getattr("get_formatter")?; - let formatter = get_formatter.call0()?; + get_formatter.call0() +} + +fn get_formatter_config(py: Python) -> PyResult { + let formatter = get_python_formatter(py)?; // Helper function to extract attributes with fallback to default fn get_attr<'a>( @@ -205,9 +210,7 @@ impl PyDataFrame { let py_schema = self.schema().into_pyobject(py)?; // Get the Python formatter module and call format_html - let formatter_module = py.import("datafusion.html_formatter")?; - let get_formatter = formatter_module.getattr("get_formatter")?; - let formatter = get_formatter.call0()?; + let formatter = get_python_formatter(py)?; // Call format_html method on the formatter let kwargs = pyo3::types::PyDict::new(py); From af678b526d0f6f735ce4f06232a30c54775e95fd Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sun, 27 Apr 2025 21:23:15 +0800 Subject: [PATCH 10/40] Revert "feat: add tests for HTML formatter row display settings and memory limit" This reverts commit e089d7b282e53e587116b11d92760e6d292ec871. --- python/tests/test_dataframe.py | 136 ++++++++++++++++----------------- 1 file changed, 68 insertions(+), 68 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 2a6f7ec5a..464b884db 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -679,9 +679,6 @@ def test_html_formatter_configuration(df, clean_formatter_state): max_width=500, max_height=200, enable_cell_expansion=False, - max_memory_bytes=1024 * 1024, # 1 MB - min_rows_display=15, - repr_rows=5, ) html_output = df._repr_html_() @@ -693,71 +690,6 @@ def test_html_formatter_configuration(df, clean_formatter_state): assert "expandable-container" not in html_output -def test_html_formatter_row_display_settings(clean_formatter_state): - """Test that min_rows_display and repr_rows affect the output.""" - ctx = SessionContext() - - # Create a dataframe with 30 rows - data = list(range(30)) - batch = pa.RecordBatch.from_arrays( - [pa.array(data)], - names=["value"], - ) - df = ctx.create_dataframe([[batch]]) - - # Test with default settings (should use repr_rows) - configure_formatter(repr_rows=7, min_rows_display=20) - html_default = df._repr_html_() - - # Verify we only show repr_rows (7) rows in the output - # by counting the number of value cells - value_cells = re.findall(r"]*>\s*\d+\s*", html_default) - assert len(value_cells) == 7 - assert "... with 23 more rows" in html_default - - # Configure to show all rows since it's below min_rows_display - reset_formatter() - configure_formatter(repr_rows=5, min_rows_display=50) - html_all = df._repr_html_() - - # Verify we show all rows - value_cells = re.findall(r"]*>\s*\d+\s*", html_all) - assert len(value_cells) == 30 - assert "... 
with" not in html_all - - -def test_html_formatter_memory_limit(clean_formatter_state): - """Test that max_memory_bytes limits the HTML rendering.""" - ctx = SessionContext() - - # Create a large string that will consume substantial memory when rendered - large_string = "x" * 100000 - - # Create a dataframe with 10 rows of large strings - batch = pa.RecordBatch.from_arrays( - [pa.array([large_string] * 10)], - names=["large_value"], - ) - df = ctx.create_dataframe([[batch]]) - - # Set very small memory limit - configure_formatter(max_memory_bytes=1000) # 1KB - - html_limited = df._repr_html_() - - # Verify that memory limit warning is included in the output - assert "Memory usage limit reached" in html_limited - - # Now with larger limit, should display normally - reset_formatter() - configure_formatter(max_memory_bytes=10 * 1024 * 1024) # 10MB - - html_full = df._repr_html_() - - # Verify no memory limit warning - assert "Memory usage limit reached" not in html_full - - def test_html_formatter_custom_style_provider(df, clean_formatter_state): """Test using custom style providers with the HTML formatter.""" @@ -839,6 +771,74 @@ def custom_cell_builder(value, row, col, table_id): r']*>(\d+)-low', html_output ) mid_cells = re.findall( + r']*>(\d+)-mid', html_output + ) + high_cells = re.findall( + r']*>(\d+)-high', html_output + ) + + # Sort the extracted values for consistent comparison + low_cells = sorted(map(int, low_cells)) + mid_cells = sorted(map(int, mid_cells)) + high_cells = sorted(map(int, high_cells)) + + # Verify specific values have the correct styling applied + assert low_cells == [1, 2] # Values < 3 + assert mid_cells == [3, 4, 5, 5] # Values 3-5 + assert high_cells == [6, 8, 8] # Values > 5 + + # Verify the exact content with styling appears in the output + assert ( + '1-low' + in html_output + ) + assert ( + '2-low' + in html_output + ) + assert ( + '3-mid' in html_output + ) + assert ( + '4-mid' in html_output + ) + assert ( + '6-high' + in html_output + ) + assert ( + '8-high' + in html_output + ) + + # Count occurrences to ensure all cells are properly styled + assert html_output.count("-low") == 2 # Two low values (1, 2) + assert html_output.count("-mid") == 4 # Four mid values (3, 4, 5, 5) + assert html_output.count("-high") == 3 # Three high values (6, 8, 8) + + # Create a custom cell builder that changes background color based on value + def custom_cell_builder(value, row, col, table_id): + # Handle numeric values regardless of their exact type + try: + num_value = int(value) + if num_value > 5: # Values > 5 get green background + return f'{value}' + if num_value < 3: # Values < 3 get light blue background + return f'{value}' + except (ValueError, TypeError): + pass + + # Default styling for other cells + return f'{value}' + + # Set our custom cell builder + formatter = get_formatter() + formatter.set_custom_cell_builder(custom_cell_builder) + + html_output = df._repr_html_() + + # Verify our custom cell styling was applied + assert "background-color: #d3e9f0" in html_output # For values 1,2 def test_html_formatter_custom_header_builder(df, clean_formatter_state): From 4090fd2f7378855b045d6bfd1368d088cc9ada75 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sun, 27 Apr 2025 21:26:31 +0800 Subject: [PATCH 11/40] feat: add tests for HTML formatter row and memory limit configurations --- python/tests/test_dataframe.py | 1107 ++++++++++++++++++++++++++++++++ 1 file changed, 1107 insertions(+) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py 
index 464b884db..64b53f491 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -690,6 +690,1063 @@ def test_html_formatter_configuration(df, clean_formatter_state): assert "expandable-container" not in html_output +def test_html_formatter_row_memory_limits(clean_formatter_state): + """Test the HTML formatter's row and memory limit parameters.""" + ctx = SessionContext() + + # Create a DataFrame with 50 rows and some wide string data + wide_data = ["x" * 1000] * 50 # 1000 character strings to test memory limits + ids = list(range(50)) + + batch = pa.RecordBatch.from_arrays( + [pa.array(ids), pa.array(wide_data)], + names=["id", "wide_data"], + ) + df = ctx.create_dataframe([[batch]]) + + # Test with custom repr_rows (show only 5 rows in repr) + configure_formatter( + repr_rows=5, + min_rows_display=20, # This should not override repr_rows + max_memory_bytes=2 * 1024 * 1024, # Default 2MB + ) + + html_output = df._repr_html_() + + # Only 5 rows should be rendered (first few rows + last few rows) + # The string "id: 4" should appear (last of first chunk) + # The string "id: 45" should appear (first of last chunk) + row_matches = re.findall(r"]*?>(\d+)", html_output) + assert len(row_matches) <= 10 # Should have at most 10 (5 from top, 5 from bottom) + + # Test with smaller memory limit + configure_formatter( + repr_rows=50, # Try to show all rows + max_memory_bytes=10 * 1000, # Only ~10 rows of our data should fit + ) + + html_output = df._repr_html_() + + # Memory limit should cause truncation despite higher repr_rows + truncation_message = re.search(r"Output truncated|memory limit", html_output, re.IGNORECASE) + assert truncation_message is not None + + # Test with min_rows_display + small_batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array(["a", "b", "c"])], + names=["id", "value"], + ) + small_df = ctx.create_dataframe([[small_batch]]) + + # Set min_rows_display higher than actual rows + configure_formatter( + min_rows_display=10, + repr_rows=5, + ) + + html_output = small_df._repr_html_() + + # All rows should be shown without truncation since fewer than min_rows_display + assert "truncated" not in html_output.lower() + + # All 3 rows should be present + row_count = len(re.findall(r" str: + return ( + "background-color: #f5f5f5; color: #333; padding: 8px; border: " + "1px solid #ddd;" + ) + + def get_header_style(self) -> str: + return ( + "background-color: #4285f4; color: white; font-weight: bold; " + "padding: 10px; border: 1px solid #3367d6;" + ) + + # Configure with custom style provider + configure_formatter(style_provider=CustomStyleProvider()) + + html_output = df._repr_html_() + + # Verify our custom styles were applied + assert "background-color: #4285f4" in html_output + assert "color: white" in html_output + assert "background-color: #f5f5f5" in html_output + + +def test_html_formatter_type_formatters(df, clean_formatter_state): + """Test registering custom type formatters for specific data types.""" + + # Get current formatter and register custom formatters + formatter = get_formatter() + + # Format integers with color based on value + # Using int as the type for the formatter will work since we convert + # Arrow scalar values to Python native types in _get_cell_value + def format_int(value): + return f' 2 else "blue"}">{value}' + + formatter.register_formatter(int, format_int) + + html_output = df._repr_html_() + + # Our test dataframe has values 1,2,3 so we should see: + assert '1' in html_output + + +def 
test_html_formatter_custom_cell_builder(df, clean_formatter_state): + """Test using a custom cell builder function.""" + + # Create a custom cell builder with distinct styling for different value ranges + def custom_cell_builder(value, row, col, table_id): + try: + num_value = int(value) + if num_value > 5: # Values > 5 get green background with indicator + return ( + '{value}-high' + ) + if num_value < 3: # Values < 3 get blue background with indicator + return ( + '{value}-low' + ) + except (ValueError, TypeError): + pass + + # Default styling for other cells (3, 4, 5) + return f'{value}-mid' + + # Set our custom cell builder + formatter = get_formatter() + formatter.set_custom_cell_builder(custom_cell_builder) + + html_output = df._repr_html_() + + # Extract cells with specific styling using regex + low_cells = re.findall( + r']*>(\d+)-low', html_output + ) + mid_cells = re.findall( + r']*>(\d+)-mid', html_output + ) + high_cells = re.findall( + r']*>(\d+)-high', html_output + ) + + # Sort the extracted values for consistent comparison + low_cells = sorted(map(int, low_cells)) + mid_cells = sorted(map(int, mid_cells)) + high_cells = sorted(map(int, high_cells)) + + # Verify specific values have the correct styling applied + assert low_cells == [1, 2] # Values < 3 + assert mid_cells == [3, 4, 5, 5] # Values 3-5 + assert high_cells == [6, 8, 8] # Values > 5 + + # Verify the exact content with styling appears in the output + assert ( + '1-low' + in html_output + ) + assert ( + '2-low' + in html_output + ) + assert ( + '3-mid' in html_output + ) + assert ( + '4-mid' in html_output + ) + assert ( + '6-high' + in html_output + ) + assert ( + '8-high' + in html_output + ) + + # Count occurrences to ensure all cells are properly styled + assert html_output.count("-low") == 2 # Two low values (1, 2) + assert html_output.count("-mid") == 4 # Four mid values (3, 4, 5, 5) + assert html_output.count("-high") == 3 # Three high values (6, 8, 8) + + # Create a custom cell builder that changes background color based on value + def custom_cell_builder(value, row, col, table_id): + # Handle numeric values regardless of their exact type + try: + num_value = int(value) + if num_value > 5: # Values > 5 get green background + return f'{value}' + if num_value < 3: # Values < 3 get light blue background + return f'{value}' + except (ValueError, TypeError): + pass + + # Default styling for other cells + return f'{value}' + + # Set our custom cell builder + formatter = get_formatter() + formatter.set_custom_cell_builder(custom_cell_builder) + + html_output = df._repr_html_() + + # Verify our custom cell styling was applied + assert "background-color: #d3e9f0" in html_output # For values 1,2 + + +def test_html_formatter_custom_header_builder(df, clean_formatter_state): + """Test using a custom header builder function.""" + + # Create a custom header builder with tooltips + def custom_header_builder(field): + tooltips = { + "a": "Primary key column", + "b": "Secondary values", + "c": "Additional data", + } + tooltip = tooltips.get(field.name, "") + return ( + f'{field.name}' + ) + + # Set our custom header builder + formatter = get_formatter() + formatter.set_custom_header_builder(custom_header_builder) + + html_output = df._repr_html_() + + # Verify our custom headers were applied + assert 'title="Primary key column"' in html_output + assert 'title="Secondary values"' in html_output + assert "background-color: #333; color: white" in html_output + + +def test_html_formatter_complex_customization(df, 
clean_formatter_state): + """Test combining multiple customization options together.""" + + # Create a dark mode style provider + class DarkModeStyleProvider: + def get_cell_style(self) -> str: + return ( + "background-color: #222; color: #eee; " + "padding: 8px; border: 1px solid #444;" + ) + + def get_header_style(self) -> str: + return ( + "background-color: #111; color: #fff; padding: 10px; " + "border: 1px solid #333;" + ) + + # Configure with dark mode style + configure_formatter( + max_cell_length=10, + style_provider=DarkModeStyleProvider(), + custom_css=""" + .datafusion-table { + font-family: monospace; + border-collapse: collapse; + } + .datafusion-table tr:hover td { + background-color: #444 !important; + } + """, + ) + + # Add type formatters for special formatting - now working with native int values + formatter = get_formatter() + formatter.register_formatter( + int, + lambda n: f'{n}', + ) + + html_output = df._repr_html_() + + # Verify our customizations were applied + assert "background-color: #222" in html_output + assert "background-color: #111" in html_output + assert ".datafusion-table" in html_output + assert "color: #5af" in html_output # Even numbers + + +def test_get_dataframe(tmp_path): + ctx = SessionContext() + + path = tmp_path / "test.csv" + table = pa.Table.from_arrays( + [ + [1, 2, 3, 4], + ["a", "b", "c", "d"], + [1.1, 2.2, 3.3, 4.4], + ], + names=["int", "str", "float"], + ) + write_csv(table, path) + + ctx.register_csv("csv", path) + + df = ctx.table("csv") + assert isinstance(df, DataFrame) + + +def test_struct_select(struct_df): + df = struct_df.select( + column("a")["c"] + column("b"), + column("a")["c"] - column("b"), + ) + + # execute and collect the first (and only) batch + result = df.collect()[0] + + assert result.column(0) == pa.array([5, 7, 9]) + assert result.column(1) == pa.array([-3, -3, -3]) + + +def test_explain(df): + df = df.select( + column("a") + column("b"), + column("a") - column("b"), + ) + df.explain() + + +def test_logical_plan(aggregate_df): + plan = aggregate_df.logical_plan() + + expected = "Projection: test.c1, sum(test.c2)" + + assert expected == plan.display() + + expected = ( + "Projection: test.c1, sum(test.c2)\n" + " Aggregate: groupBy=[[test.c1]], aggr=[[sum(test.c2)]]\n" + " TableScan: test" + ) + + assert expected == plan.display_indent() + + +def test_optimized_logical_plan(aggregate_df): + plan = aggregate_df.optimized_logical_plan() + + expected = "Aggregate: groupBy=[[test.c1]], aggr=[[sum(test.c2)]]" + + assert expected == plan.display() + + expected = ( + "Aggregate: groupBy=[[test.c1]], aggr=[[sum(test.c2)]]\n" + " TableScan: test projection=[c1, c2]" + ) + + assert expected == plan.display_indent() + + +def test_execution_plan(aggregate_df): + plan = aggregate_df.execution_plan() + + expected = ( + "AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[sum(test.c2)]\n" + ) + + assert expected == plan.display() + + # Check the number of partitions is as expected. 
+ assert isinstance(plan.partition_count, int) + + expected = ( + "ProjectionExec: expr=[c1@0 as c1, SUM(test.c2)@1 as SUM(test.c2)]\n" + " Aggregate: groupBy=[[test.c1]], aggr=[[SUM(test.c2)]]\n" + " TableScan: test projection=[c1, c2]" + ) + + indent = plan.display_indent() + + # indent plan will be different for everyone due to absolute path + # to filename, so we just check for some expected content + assert "AggregateExec:" in indent + assert "CoalesceBatchesExec:" in indent + assert "RepartitionExec:" in indent + assert "DataSourceExec:" in indent + assert "file_type=csv" in indent + + ctx = SessionContext() + rows_returned = 0 + for idx in range(plan.partition_count): + stream = ctx.execute(plan, idx) + try: + batch = stream.next() + assert batch is not None + rows_returned += len(batch.to_pyarrow()[0]) + except StopIteration: + # This is one of the partitions with no values + pass + with pytest.raises(StopIteration): + stream.next() + + assert rows_returned == 5 + + +@pytest.mark.asyncio +async def test_async_iteration_of_df(aggregate_df): + rows_returned = 0 + async for batch in aggregate_df.execute_stream(): + assert batch is not None + rows_returned += len(batch.to_pyarrow()[0]) + + assert rows_returned == 5 + + +def test_repartition(df): + df.repartition(2) + + +def test_repartition_by_hash(df): + df.repartition_by_hash(column("a"), num=2) + + +def test_intersect(): + ctx = SessionContext() + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + df_a = ctx.create_dataframe([[batch]]) + + batch = pa.RecordBatch.from_arrays( + [pa.array([3, 4, 5]), pa.array([6, 7, 8])], + names=["a", "b"], + ) + df_b = ctx.create_dataframe([[batch]]) + + batch = pa.RecordBatch.from_arrays( + [pa.array([3]), pa.array([6])], + names=["a", "b"], + ) + df_c = ctx.create_dataframe([[batch]]).sort(column("a")) + + df_a_i_b = df_a.intersect(df_b).sort(column("a")) + + assert df_c.collect() == df_a_i_b.collect() + + +def test_except_all(): + ctx = SessionContext() + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + df_a = ctx.create_dataframe([[batch]]) + + batch = pa.RecordBatch.from_arrays( + [pa.array([3, 4, 5]), pa.array([6, 7, 8])], + names=["a", "b"], + ) + df_b = ctx.create_dataframe([[batch]]) + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2]), pa.array([4, 5])], + names=["a", "b"], + ) + df_c = ctx.create_dataframe([[batch]]).sort(column("a")) + + df_a_e_b = df_a.except_all(df_b).sort(column("a")) + + assert df_c.collect() == df_a_e_b.collect() + + +def test_collect_partitioned(): + ctx = SessionContext() + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + + assert [[batch]] == ctx.create_dataframe([[batch]]).collect_partitioned() + + +def test_union(ctx): + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"], + ) + df_a = ctx.create_dataframe([[batch]]) + + batch = pa.RecordBatch.from_arrays( + [pa.array([3, 4, 5]), pa.array([6, 7, 8])], + names=["a", "b"], + ) + df_b = ctx.create_dataframe([[batch]]) + + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3, 3, 4, 5]), pa.array([4, 5, 6, 6, 7, 8])], + names=["a", "b"], + ) + df_c = ctx.create_dataframe([[batch]]).sort(column("a")) + + df_a_u_b = df_a.union(df_b).sort(column("a")) + + assert df_c.collect() == df_a_u_b.collect() + + +def test_union_distinct(ctx): + batch = pa.RecordBatch.from_arrays( + 
+
+
+def test_union_distinct(ctx):
+    batch = pa.RecordBatch.from_arrays(
+        [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
+        names=["a", "b"],
+    )
+    df_a = ctx.create_dataframe([[batch]])
+
+    batch = pa.RecordBatch.from_arrays(
+        [pa.array([3, 4, 5]), pa.array([6, 7, 8])],
+        names=["a", "b"],
+    )
+    df_b = ctx.create_dataframe([[batch]])
+
+    batch = pa.RecordBatch.from_arrays(
+        [pa.array([1, 2, 3, 4, 5]), pa.array([4, 5, 6, 7, 8])],
+        names=["a", "b"],
+    )
+    df_c = ctx.create_dataframe([[batch]]).sort(column("a"))
+
+    df_a_u_b = df_a.union(df_b, distinct=True).sort(column("a"))
+
+    assert df_c.collect() == df_a_u_b.collect()
+
+
+def test_cache(df):
+    assert df.cache().collect() == df.collect()
+
+
+def test_count(df):
+    # Get number of rows
+    assert df.count() == 3
+
+
+def test_to_pandas(df):
+    # Skip test if pandas is not installed
+    pd = pytest.importorskip("pandas")
+
+    # Convert datafusion dataframe to pandas dataframe
+    pandas_df = df.to_pandas()
+    assert isinstance(pandas_df, pd.DataFrame)
+    assert pandas_df.shape == (3, 3)
+    assert set(pandas_df.columns) == {"a", "b", "c"}
+
+
+def test_empty_to_pandas(df):
+    # Skip test if pandas is not installed
+    pd = pytest.importorskip("pandas")
+
+    # Convert empty datafusion dataframe to pandas dataframe
+    pandas_df = df.limit(0).to_pandas()
+    assert isinstance(pandas_df, pd.DataFrame)
+    assert pandas_df.shape == (0, 3)
+    assert set(pandas_df.columns) == {"a", "b", "c"}
+
+
+def test_to_polars(df):
+    # Skip test if polars is not installed
+    pl = pytest.importorskip("polars")
+
+    # Convert datafusion dataframe to polars dataframe
+    polars_df = df.to_polars()
+    assert isinstance(polars_df, pl.DataFrame)
+    assert polars_df.shape == (3, 3)
+    assert set(polars_df.columns) == {"a", "b", "c"}
+
+
+def test_empty_to_polars(df):
+    # Skip test if polars is not installed
+    pl = pytest.importorskip("polars")
+
+    # Convert empty datafusion dataframe to polars dataframe
+    polars_df = df.limit(0).to_polars()
+    assert isinstance(polars_df, pl.DataFrame)
+    assert polars_df.shape == (0, 3)
+    assert set(polars_df.columns) == {"a", "b", "c"}
+
+
+def test_to_arrow_table(df):
+    # Convert datafusion dataframe to pyarrow Table
+    pyarrow_table = df.to_arrow_table()
+    assert isinstance(pyarrow_table, pa.Table)
+    assert pyarrow_table.shape == (3, 3)
+    assert set(pyarrow_table.column_names) == {"a", "b", "c"}
+
+
+def test_execute_stream(df):
+    stream = df.execute_stream()
+    assert all(batch is not None for batch in stream)
+    assert not list(stream)  # after one iteration the generator must be exhausted
+
+
+@pytest.mark.asyncio
+async def test_execute_stream_async(df):
+    stream = df.execute_stream()
+    batches = [batch async for batch in stream]
+
+    assert all(batch is not None for batch in batches)
+
+    # After consuming all batches, the stream should be exhausted
+    remaining_batches = [batch async for batch in stream]
+    assert not remaining_batches
+
+
+@pytest.mark.parametrize("schema", [True, False])
+def test_execute_stream_to_arrow_table(df, schema):
+    stream = df.execute_stream()
+
+    if schema:
+        pyarrow_table = pa.Table.from_batches(
+            (batch.to_pyarrow() for batch in stream), schema=df.schema()
+        )
+    else:
+        pyarrow_table = pa.Table.from_batches(batch.to_pyarrow() for batch in stream)
+
+    assert isinstance(pyarrow_table, pa.Table)
+    assert pyarrow_table.shape == (3, 3)
+    assert set(pyarrow_table.column_names) == {"a", "b", "c"}
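+
+
+# Illustrative sketch (an addition, not part of the original patch): batches
+# from execute_stream() can also be reduced incrementally, e.g. counting rows
+# without materializing a full Table.
+def test_execute_stream_row_count(df):
+    total_rows = sum(batch.to_pyarrow().num_rows for batch in df.execute_stream())
+    assert total_rows == 3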
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("schema", [True, False])
+async def test_execute_stream_to_arrow_table_async(df, schema):
+    stream = df.execute_stream()
+
+    if schema:
+        pyarrow_table = pa.Table.from_batches(
+            [batch.to_pyarrow() async for batch in stream], schema=df.schema()
+        )
+    else:
+        pyarrow_table = pa.Table.from_batches(
+            [batch.to_pyarrow() async for batch in stream]
+        )
+
+    assert isinstance(pyarrow_table, pa.Table)
+    assert pyarrow_table.shape == (3, 3)
+    assert set(pyarrow_table.column_names) == {"a", "b", "c"}
+
+
+def test_execute_stream_partitioned(df):
+    streams = df.execute_stream_partitioned()
+    assert all(batch is not None for stream in streams for batch in stream)
+    assert all(
+        not list(stream) for stream in streams
+    )  # after one iteration all generators must be exhausted
+
+
+@pytest.mark.asyncio
+async def test_execute_stream_partitioned_async(df):
+    streams = df.execute_stream_partitioned()
+
+    for stream in streams:
+        batches = [batch async for batch in stream]
+        assert all(batch is not None for batch in batches)
+
+        # Ensure the stream is exhausted after iteration
+        remaining_batches = [batch async for batch in stream]
+        assert not remaining_batches
+
+
+def test_empty_to_arrow_table(df):
+    # Convert empty datafusion dataframe to pyarrow Table
+    pyarrow_table = df.limit(0).to_arrow_table()
+    assert isinstance(pyarrow_table, pa.Table)
+    assert pyarrow_table.shape == (0, 3)
+    assert set(pyarrow_table.column_names) == {"a", "b", "c"}
+
+
+def test_to_pylist(df):
+    # Convert datafusion dataframe to Python list
+    pylist = df.to_pylist()
+    assert isinstance(pylist, list)
+    assert pylist == [
+        {"a": 1, "b": 4, "c": 8},
+        {"a": 2, "b": 5, "c": 5},
+        {"a": 3, "b": 6, "c": 8},
+    ]
+
+
+def test_to_pydict(df):
+    # Convert datafusion dataframe to Python dictionary
+    pydict = df.to_pydict()
+    assert isinstance(pydict, dict)
+    assert pydict == {"a": [1, 2, 3], "b": [4, 5, 6], "c": [8, 5, 8]}
+
+
+def test_describe(df):
+    # Calculate statistics
+    df = df.describe()
+
+    # Collect the result
+    result = df.to_pydict()
+
+    assert result == {
+        "describe": [
+            "count",
+            "null_count",
+            "mean",
+            "std",
+            "min",
+            "max",
+            "median",
+        ],
+        "a": [3.0, 0.0, 2.0, 1.0, 1.0, 3.0, 2.0],
+        "b": [3.0, 0.0, 5.0, 1.0, 4.0, 6.0, 5.0],
+        "c": [3.0, 0.0, 7.0, 1.7320508075688772, 5.0, 8.0, 8.0],
+    }
+
+
+@pytest.mark.parametrize("path_to_str", [True, False])
+def test_write_csv(ctx, df, tmp_path, path_to_str):
+    path = str(tmp_path) if path_to_str else tmp_path
+
+    df.write_csv(path, with_header=True)
+
+    ctx.register_csv("csv", path)
+    result = ctx.table("csv").to_pydict()
+    expected = df.to_pydict()
+
+    assert result == expected
+
+
+@pytest.mark.parametrize("path_to_str", [True, False])
+def test_write_json(ctx, df, tmp_path, path_to_str):
+    path = str(tmp_path) if path_to_str else tmp_path
+
+    df.write_json(path)
+
+    ctx.register_json("json", path)
+    result = ctx.table("json").to_pydict()
+    expected = df.to_pydict()
+
+    assert result == expected
+
+
+@pytest.mark.parametrize("path_to_str", [True, False])
+def test_write_parquet(df, tmp_path, path_to_str):
+    path = str(tmp_path) if path_to_str else tmp_path
+
+    df.write_parquet(str(path))
+    result = pq.read_table(str(path)).to_pydict()
+    expected = df.to_pydict()
+
+    assert result == expected
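+
+
+# Illustrative sketch (an addition, not part of the original patch): the
+# written Parquet can also be read back through the SessionContext rather
+# than pyarrow, assuming read_parquet accepts the output directory.
+def test_write_parquet_read_back_via_ctx(ctx, df, tmp_path):
+    df.write_parquet(str(tmp_path))
+
+    result = ctx.read_parquet(str(tmp_path)).to_pydict()
+    assert result == df.to_pydict()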
+
+
+@pytest.mark.parametrize(
+    ("compression", "compression_level"),
+    [("gzip", 6), ("brotli", 7), ("zstd", 15)],
+)
+def test_write_compressed_parquet(df, tmp_path, compression, compression_level):
+    path = tmp_path
+
+    df.write_parquet(
+        str(path), compression=compression, compression_level=compression_level
+    )
+
+    # test that the actual compression scheme is the one written
+    for _root, _dirs, files in os.walk(path):
+        for file in files:
+            if file.endswith(".parquet"):
+                metadata = pq.ParquetFile(tmp_path / file).metadata.to_dict()
+                for row_group in metadata["row_groups"]:
+                    for columns in row_group["columns"]:
+                        assert columns["compression"].lower() == compression
+
+    result = pq.read_table(str(path)).to_pydict()
+    expected = df.to_pydict()
+
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    ("compression", "compression_level"),
+    [("gzip", 12), ("brotli", 15), ("zstd", 23), ("wrong", 12)],
+)
+def test_write_compressed_parquet_wrong_compression_level(
+    df, tmp_path, compression, compression_level
+):
+    path = tmp_path
+
+    with pytest.raises(ValueError):
+        df.write_parquet(
+            str(path),
+            compression=compression,
+            compression_level=compression_level,
+        )
+
+
+@pytest.mark.parametrize("compression", ["wrong"])
+def test_write_compressed_parquet_invalid_compression(df, tmp_path, compression):
+    path = tmp_path
+
+    with pytest.raises(ValueError):
+        df.write_parquet(str(path), compression=compression)
+
+
+# not testing lzo because it is not implemented yet
+# https://github.com/apache/arrow-rs/issues/6970
+@pytest.mark.parametrize("compression", ["zstd", "brotli", "gzip"])
+def test_write_compressed_parquet_default_compression_level(df, tmp_path, compression):
+    # Test write_parquet with the zstd, brotli, and gzip default compression
+    # levels, i.e. without specifying a compression level; it should complete
+    # without error.
+    path = tmp_path
+
+    df.write_parquet(str(path), compression=compression)
+
+
+def test_dataframe_export(df) -> None:
+    # Guarantees that we have the canonical implementation
+    # reading our dataframe export
+    table = pa.table(df)
+    assert table.num_columns == 3
+    assert table.num_rows == 3
+
+    desired_schema = pa.schema([("a", pa.int64())])
+
+    # Verify we can request a schema
+    table = pa.table(df, schema=desired_schema)
+    assert table.num_columns == 1
+    assert table.num_rows == 3
+
+    # Expect a table of nulls if the schemas don't overlap
+    desired_schema = pa.schema([("g", pa.string())])
+    table = pa.table(df, schema=desired_schema)
+    assert table.num_columns == 1
+    assert table.num_rows == 3
+    for i in range(3):
+        assert table[0][i].as_py() is None
+
+    # Expect an error when we cannot convert the schema
+    desired_schema = pa.schema([("a", pa.float32())])
+    failed_convert = False
+    try:
+        table = pa.table(df, schema=desired_schema)
+    except Exception:
+        failed_convert = True
+    assert failed_convert
+
+    # Expect an error for a non-nullable column that is not set
+    desired_schema = pa.schema([("g", pa.string(), False)])
+    failed_convert = False
+    try:
+        table = pa.table(df, schema=desired_schema)
+    except Exception:
+        failed_convert = True
+    assert failed_convert
+
+
+def test_dataframe_transform(df):
+    def add_string_col(df_internal) -> DataFrame:
+        return df_internal.with_column("string_col", literal("string data"))
+
+    def add_with_parameter(df_internal, value: Any) -> DataFrame:
+        return df_internal.with_column("new_col", literal(value))
+
+    df = df.transform(add_string_col).transform(add_with_parameter, 3)
+
+    result = df.to_pydict()
+
+    assert result["a"] == [1, 2, 3]
+    assert result["string_col"] == ["string data" for _i in range(3)]
+    assert result["new_col"] == [3 for _i in range(3)]
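+
+
+# Illustrative sketch (an addition, not part of the original patch): the
+# max_table_bytes and min_table_rows options introduced by this change are
+# assumed to validate like the other size options and to be accepted by
+# configure_formatter.
+def test_formatter_size_options(df, clean_formatter_state):
+    from datafusion.html_formatter import DataFrameHtmlFormatter
+
+    with pytest.raises(ValueError):
+        DataFrameHtmlFormatter(max_table_bytes=0)
+    with pytest.raises(ValueError):
+        DataFrameHtmlFormatter(min_table_rows=0)
+
+    configure_formatter(max_table_bytes=1024 * 1024, min_table_rows=10)
+    assert df._repr_html_() is not None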
+
+
+def test_dataframe_repr_html_structure(df) -> None:
+    """Test that DataFrame._repr_html_ produces expected HTML output structure."""
+    import re
+
+    output = df._repr_html_()
+
+    # Since we've added a fair bit of processing to the html output, let's just
+    # verify the values we are expecting in the table exist. Use regex and ignore
+    # everything between the <td> and </td>. We also don't want the closing > on
+    # the td and th segments because that is where the formatting data is written.
+
+    headers = ["a", "b", "c"]
+    headers = [f"<th(.*?)>{v}</th>" for v in headers]
+    header_pattern = "(.*?)".join(headers)
+    header_matches = re.findall(header_pattern, output, re.DOTALL)
+    assert len(header_matches) == 1
+
+    # Update the pattern to handle values that may be wrapped in spans
+    body_data = [[1, 4, 8], [2, 5, 5], [3, 6, 8]]
+
+    body_lines = [
+        f"<td(.*?)>(?:<span[^>]*?>)?{v}(?:</span>)?</td>"
+        for inner in body_data
+        for v in inner
+    ]
+    body_pattern = "(.*?)".join(body_lines)
+
+    body_matches = re.findall(body_pattern, output, re.DOTALL)
+
+    assert len(body_matches) == 1, "Expected pattern of values not found in HTML output"
+
+
+def test_dataframe_repr_html_values(df):
+    """Test that DataFrame._repr_html_ contains the expected data values."""
+    html = df._repr_html_()
+    assert html is not None
+
+    # Create a more flexible pattern that handles values being wrapped in spans.
+    # This pattern will match the sequence of values 1,4,8,2,5,5,3,6,8 regardless
+    # of formatting
+    pattern = re.compile(
+        r"<td[^>]*?>(?:<span[^>]*?>)?1(?:</span>)?</td>.*?"
+        r"<td[^>]*?>(?:<span[^>]*?>)?4(?:</span>)?</td>.*?"
+        r"<td[^>]*?>(?:<span[^>]*?>)?8(?:</span>)?</td>.*?"
+        r"<td[^>]*?>(?:<span[^>]*?>)?2(?:</span>)?</td>.*?"
+        r"<td[^>]*?>(?:<span[^>]*?>)?5(?:</span>)?</td>.*?"
+        r"<td[^>]*?>(?:<span[^>]*?>)?5(?:</span>)?</td>.*?"
+        r"<td[^>]*?>(?:<span[^>]*?>)?3(?:</span>)?</td>.*?"
+        r"<td[^>]*?>(?:<span[^>]*?>)?6(?:</span>)?</td>.*?"
+        r"<td[^>]*?>(?:<span[^>]*?>)?8(?:</span>)?</td>",
+        re.DOTALL,
+    )
+
+    # Print debug info if the test fails
+    matches = re.findall(pattern, html)
+    if not matches:
+        print(f"HTML output snippet: {html[:500]}...")  # noqa: T201
+
+    assert len(matches) > 0, "Expected pattern of values not found in HTML output"
+
+
+def test_html_formatter_shared_styles(df, clean_formatter_state):
+    """Test that shared styles work correctly across multiple tables."""
+
+    # First, ensure we're using shared styles
+    configure_formatter(use_shared_styles=True)
+
+    # Get HTML output for first table - should include styles
+    html_first = df._repr_html_()
+
+    # Verify styles are included in first render
+    assert "