feat: add configurable max table bytes and min table rows for DataFrame display · kosiew/datafusion-python@f9b78fa · GitHub
[go: up one dir, main page]

Skip to content

Commit f9b78fa

Browse files
committed
feat: add configurable max table bytes and min table rows for DataFrame display
1 parent 818975b commit f9b78fa

File tree

2 files changed

+57
-31
lines changed

2 files changed

+57
-31
lines changed

python/datafusion/html_formatter.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ class DataFrameHtmlFormatter:
9898
style_provider: Custom provider for cell and header styles
9999
use_shared_styles: Whether to load styles and scripts only once per notebook
100100
session
101+
max_table_bytes: Maximum bytes to display for table presentation (default: 2MB)
102+
min_table_rows: Minimum number of table rows to display (default: 20)
101103
"""
102104

103105
# Class variable to track if styles have been loaded in the notebook
@@ -113,6 +115,8 @@ def __init__(
113115
show_truncation_message: bool = True,
114116
style_provider: Optional[StyleProvider] = None,
115117
use_shared_styles: bool = True,
118+
max_table_bytes: int = 2 * 1024 * 1024, # 2 MB
119+
min_table_rows: int = 20,
116120
) -> None:
117121
"""Initialize the HTML formatter.
118122
@@ -135,11 +139,16 @@ def __init__(
135139
is used.
136140
use_shared_styles : bool, default True
137141
Whether to use shared styles across multiple tables.
142+
max_table_bytes : int, default 2MB (2 * 1024 * 1024)
143+
Maximum bytes to display for table presentation.
144+
min_table_rows : int, default 20
145+
Minimum number of table rows to display.
138146
139147
Raises:
140148
------
141149
ValueError
142-
If max_cell_length, max_width, or max_height is not a positive integer.
150+
If max_cell_length, max_width, max_height, max_table_bytes, or min_table_rows
151+
is not a positive integer.
143152
TypeError
144153
If enable_cell_expansion, show_truncation_message, or use_shared_styles is
145154
not a boolean,
@@ -158,6 +167,12 @@ def __init__(
158167
if not isinstance(max_height, int) or max_height <= 0:
159168
msg = "max_height must be a positive integer"
160169
raise ValueError(msg)
170+
if not isinstance(max_table_bytes, int) or max_table_bytes <= 0:
171+
msg = "max_table_bytes must be a positive integer"
172+
raise ValueError(msg)
173+
if not isinstance(min_table_rows, int) or min_table_rows <= 0:
174+
msg = "min_table_rows must be a positive integer"
175+
raise ValueError(msg)
161176

162177
# Validate boolean parameters
163178
if not isinstance(enable_cell_expansion, bool):
@@ -188,6 +203,8 @@ def __init__(
188203
self.show_truncation_message = show_truncation_message
189204
self.style_provider = style_provider or DefaultStyleProvider()
190205
self.use_shared_styles = use_shared_styles
206+
self.max_table_bytes = max_table_bytes
207+
self.min_table_rows = min_table_rows
191208
# Registry for custom type formatters
192209
self._type_formatters: dict[type, CellFormatter] = {}
193210
# Custom cell builders

src/dataframe.rs

Lines changed: 39 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,6 @@ impl PyTableProvider {
7171
PyTable::new(table_provider)
7272
}
7373
}
74-
const MAX_TABLE_BYTES_TO_DISPLAY: usize = 2 * 1024 * 1024; // 2 MB
75-
const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20;
7674

7775
/// A PyDataFrame is a representation of a logical plan and an API to compose statements.
7876
/// Use it to build a plan and `.collect()` to execute the plan and collect the result.
@@ -81,12 +79,16 @@ const MIN_TABLE_ROWS_TO_DISPLAY: usize = 20;
8179
#[derive(Clone)]
8280
pub struct PyDataFrame {
8381
df: Arc<DataFrame>,
82+
display_config: Arc<PyDataframeDisplayConfig>,
8483
}
8584

8685
impl PyDataFrame {
8786
/// creates a new PyDataFrame
88-
pub fn new(df: DataFrame) -> Self {
89-
Self { df: Arc::new(df) }
87+
pub fn new(df: DataFrame, display_config: PyDataframeDisplayConfig) -> Self {
88+
Self {
89+
df: Arc::new(df),
90+
display_config: Arc::new(display_config),
91+
}
9092
}
9193
}
9294

@@ -116,7 +118,12 @@ impl PyDataFrame {
116118
fn __repr__(&self, py: Python) -> PyDataFusionResult<String> {
117119
let (batches, has_more) = wait_for_future(
118120
py,
119-
collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10),
121+
collect_record_batches_to_display(
122+
self.df.as_ref().clone(),
123+
10,
124+
10,
125+
self.display_config.max_table_bytes,
126+
),
120127
)?;
121128
if batches.is_empty() {
122129
// This should not be reached, but do it for safety since we index into the vector below
@@ -139,8 +146,9 @@ impl PyDataFrame {
139146
py,
140147
collect_record_batches_to_display(
141148
self.df.as_ref().clone(),
142-
MIN_TABLE_ROWS_TO_DISPLAY,
149+
self.display_config.min_table_rows,
143150
usize::MAX,
151+
self.display_config.max_table_bytes,
144152
),
145153
)?;
146154
if batches.is_empty() {
@@ -181,7 +189,7 @@ impl PyDataFrame {
181189
fn describe(&self, py: Python) -> PyDataFusionResult<Self> {
182190
let df = self.df.as_ref().clone();
183191
let stat_df = wait_for_future(py, df.describe())?;
184-
Ok(Self::new(stat_df))
192+
Ok(Self::new(stat_df, (*self.display_config).clone()))
185193
}
186194

187195
/// Returns the schema from the logical plan
@@ -211,31 +219,31 @@ impl PyDataFrame {
211219
fn select_columns(&self, args: Vec<PyBackedStr>) -> PyDataFusionResult<Self> {
212220
let args = args.iter().map(|s| s.as_ref()).collect::<Vec<&str>>();
213221
let df = self.df.as_ref().clone().select_columns(&args)?;
214-
Ok(Self::new(df))
222+
Ok(Self::new(df, (*self.display_config).clone()))
215223
}
216224

217225
#[pyo3(signature = (*args))]
218226
fn select(&self, args: Vec<PyExpr>) -> PyDataFusionResult<Self> {
219227
let expr = args.into_iter().map(|e| e.into()).collect();
220228
let df = self.df.as_ref().clone().select(expr)?;
221-
Ok(Self::new(df))
229+
Ok(Self::new(df, (*self.display_config).clone()))
222230
}
223231

224232
#[pyo3(signature = (*args))]
225233
fn drop(&self, args: Vec<PyBackedStr>) -> PyDataFusionResult<Self> {
226234
let cols = args.iter().map(|s| s.as_ref()).collect::<Vec<&str>>();
227235
let df = self.df.as_ref().clone().drop_columns(&cols)?;
228-
Ok(Self::new(df))
236+
Ok(Self::new(df, (*self.display_config).clone()))
229237
}
230238

231239
fn filter(&self, predicate: PyExpr) -> PyDataFusionResult<Self> {
232240
let df = self.df.as_ref().clone().filter(predicate.into())?;
233-
Ok(Self::new(df))
241+
Ok(Self::new(df, (*self.display_config).clone()))
234242
}
235243

236244
fn with_column(&self, name: &str, expr: PyExpr) -> PyDataFusionResult<Self> {
237245
let df = self.df.as_ref().clone().with_column(name, expr.into())?;
238-
Ok(Self::new(df))
246+
Ok(Self::new(df, (*self.display_config).clone()))
239247
}
240248

241249
fn with_columns(&self, exprs: Vec<PyExpr>) -> PyDataFusionResult<Self> {
@@ -245,7 +253,7 @@ impl PyDataFrame {
245253
let name = format!("{}", expr.schema_name());
246254
df = df.with_column(name.as_str(), expr)?
247255
}
248-
Ok(Self::new(df))
256+
Ok(Self::new(df, (*self.display_config).clone()))
249257
}
250258

251259
/// Rename one column by applying a new projection. This is a no-op if the column to be
@@ -256,27 +264,27 @@ impl PyDataFrame {
256264
.as_ref()
257265
.clone()
258266
.with_column_renamed(old_name, new_name)?;
259-
Ok(Self::new(df))
267+
Ok(Self::new(df, (*self.display_config).clone()))
260268
}
261269

262270
fn aggregate(&self, group_by: Vec<PyExpr>, aggs: Vec<PyExpr>) -> PyDataFusionResult<Self> {
263271
let group_by = group_by.into_iter().map(|e| e.into()).collect();
264272
let aggs = aggs.into_iter().map(|e| e.into()).collect();
265273
let df = self.df.as_ref().clone().aggregate(group_by, aggs)?;
266-
Ok(Self::new(df))
274+
Ok(Self::new(df, (*self.display_config).clone()))
267275
}
268276

269277
#[pyo3(signature = (*exprs))]
270278
fn sort(&self, exprs: Vec<PySortExpr>) -> PyDataFusionResult<Self> {
271279
let exprs = to_sort_expressions(exprs);
272280
let df = self.df.as_ref().clone().sort(exprs)?;
273-
Ok(Self::new(df))
281+
Ok(Self::new(df, (*self.display_config).clone()))
274282
}
275283

276284
#[pyo3(signature = (count, offset=0))]
277285
fn limit(&self, count: usize, offset: usize) -> PyDataFusionResult<Self> {
278286
let df = self.df.as_ref().clone().limit(offset, Some(count))?;
279-
Ok(Self::new(df))
287+
Ok(Self::new(df, (*self.display_config).clone()))
280288
}
281289

282290
/// Executes the plan, returning a list of `RecordBatch`es.
@@ -293,7 +301,7 @@ impl PyDataFrame {
293301
/// Cache DataFrame.
294302
fn cache(&self, py: Python) -> PyDataFusionResult<Self> {
295303
let df = wait_for_future(py, self.df.as_ref().clone().cache())?;
296-
Ok(Self::new(df))
304+
Ok(Self::new(df, (*self.display_config).clone()))
297305
}
298306

299307
/// Executes this DataFrame and collects all results into a vector of vector of RecordBatch
@@ -318,7 +326,7 @@ impl PyDataFrame {
318326
/// Filter out duplicate rows
319327
fn distinct(&self) -> PyDataFusionResult<Self> {
320328
let df = self.df.as_ref().clone().distinct()?;
321-
Ok(Self::new(df))
329+
Ok(Self::new(df, (*self.display_config).clone()))
322330
}
323331

324332
fn join(
@@ -352,7 +360,7 @@ impl PyDataFrame {
352360
&right_keys,
353361
None,
354362
)?;
355-
Ok(Self::new(df))
363+
Ok(Self::new(df, (*self.display_config).clone()))
356364
}
357365

358366
fn join_on(
@@ -381,7 +389,7 @@ impl PyDataFrame {
381389
.as_ref()
382390
.clone()
383391
.join_on(right.df.as_ref().clone(), join_type, exprs)?;
384-
Ok(Self::new(df))
392+
Ok(Self::new(df, (*self.display_config).clone()))
385393
}
386394

387395
/// Print the query plan
@@ -414,7 +422,7 @@ impl PyDataFrame {
414422
.as_ref()
415423
.clone()
416424
.repartition(Partitioning::RoundRobinBatch(num))?;
417-
Ok(Self::new(new_df))
425+
Ok(Self::new(new_df, (*self.display_config).clone()))
418426
}
419427

420428
/// Repartition a `DataFrame` based on a logical partitioning scheme.
@@ -426,7 +434,7 @@ impl PyDataFrame {
426434
.as_ref()
427435
.clone()
428436
.repartition(Partitioning::Hash(expr, num))?;
429-
Ok(Self::new(new_df))
437+
Ok(Self::new(new_df, (*self.display_config).clone()))
430438
}
431439

432440
/// Calculate the union of two `DataFrame`s, preserving duplicate rows.The
@@ -442,7 +450,7 @@ impl PyDataFrame {
442450
self.df.as_ref().clone().union(py_df.df.as_ref().clone())?
443451
};
444452

445-
Ok(Self::new(new_df))
453+
Ok(Self::new(new_df, (*self.display_config).clone()))
446454
}
447455

448456
/// Calculate the distinct union of two `DataFrame`s. The
@@ -453,7 +461,7 @@ impl PyDataFrame {
453461
.as_ref()
454462
.clone()
455463
.union_distinct(py_df.df.as_ref().clone())?;
456-
Ok(Self::new(new_df))
464+
Ok(Self::new(new_df, (*self.display_config).clone()))
457465
}
458466

459467
#[pyo3(signature = (column, preserve_nulls=true))]
@@ -494,13 +502,13 @@ impl PyDataFrame {
494502
.as_ref()
495503
.clone()
496504
.intersect(py_df.df.as_ref().clone())?;
497-
Ok(Self::new(new_df))
505+
Ok(Self::new(new_df, (*self.display_config).clone()))
498506
}
499507

500508
/// Calculate the exception of two `DataFrame`s. The two `DataFrame`s must have exactly the same schema
501509
fn except_all(&self, py_df: PyDataFrame) -> PyDataFusionResult<Self> {
502510
let new_df = self.df.as_ref().clone().except(py_df.df.as_ref().clone())?;
503-
Ok(Self::new(new_df))
511+
Ok(Self::new(new_df, (*self.display_config).clone()))
504512
}
505513

506514
/// Write a `DataFrame` to a CSV file.
@@ -798,6 +806,7 @@ async fn collect_record_batches_to_display(
798806
df: DataFrame,
799807
min_rows: usize,
800808
max_rows: usize,
809+
max_table_bytes: usize,
801810
) -> Result<(Vec<RecordBatch>, bool), DataFusionError> {
802811
let partitioned_stream = df.execute_stream_partitioned().await?;
803812
let mut stream = futures::stream::iter(partitioned_stream).flatten();
@@ -806,7 +815,7 @@ async fn collect_record_batches_to_display(
806815
let mut record_batches = Vec::default();
807816
let mut has_more = false;
808817

809-
while (size_estimate_so_far < MAX_TABLE_BYTES_TO_DISPLAY && rows_so_far < max_rows)
818+
while (size_estimate_so_far < max_table_bytes && rows_so_far < max_rows)
810819
|| rows_so_far < min_rows
811820
{
812821
let mut rb = match stream.next().await {
@@ -821,8 +830,8 @@ async fn collect_record_batches_to_display(
821830
if rows_in_rb > 0 {
822831
size_estimate_so_far += rb.get_array_memory_size();
823832

824-
if size_estimate_so_far > MAX_TABLE_BYTES_TO_DISPLAY {
825-
let ratio = MAX_TABLE_BYTES_TO_DISPLAY as f32 / size_estimate_so_far as f32;
833+
if size_estimate_so_far > max_table_bytes {
834+
let ratio = max_table_bytes as f32 / size_estimate_so_far as f32;
826835
let total_rows = rows_in_rb + rows_so_far;
827836

828837
let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize;

0 commit comments

Comments (0)