refactor: add debug utilities for HTML formatter integration testing … · kosiew/datafusion-python@0208862 · GitHub
[go: up one dir, main page]

Skip to content

Commit 0208862

Browse files
committed
refactor: add debug utilities for HTML formatter integration testing and enhance debugging output in DataFrameHtmlFormatter
1 parent 2c3bd60 commit 0208862

File tree

3 files changed

+136
-1
lines changed

3 files changed

+136
-1
lines changed

python/datafusion/debug_utils.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
"""Debug utilities for DataFusion."""
2+
3+
4+
def check_html_formatter_integration():
    """Print diagnostics showing whether DataFrame rendering uses the HTML formatter.

    Registers an ``int`` formatter on the object returned by
    ``get_formatter()``, renders a literal ``SELECT`` query through
    ``_repr_html_`` and reports whether the marker text ``INT:1`` shows up in
    the result. If it does not, ``DataFrame._repr_html_`` is replaced with a
    function that calls ``format_html`` on the formatter directly, and the
    check is repeated.

    All findings are reported with ``print``; nothing is returned.

    NOTE(review): the registered ``int`` formatter and the monkeypatch are
    left installed when this function returns -- confirm that is intended.
    """
    import importlib

    from datafusion import SessionContext
    from datafusion.html_formatter import get_formatter, configure_formatter

    query = "SELECT 1 as a, 2 as b, 3 as c"
    marker = "INT:1"

    # Report which formatter instance is active and how many type formatters it holds.
    html_formatter = get_formatter()
    print(f"Default formatter ID: {id(html_formatter)}")
    print(f"Has type formatters: {len(html_formatter._type_formatters)}")

    # Build a small DataFrame to render.
    session = SessionContext()
    frame = session.sql(query)

    # Without a _repr_html_ hook there is nothing further to check.
    if not hasattr(frame, "_repr_html_"):
        print("ERROR: DataFrame does not have _repr_html_ method")
        return

    print(f"DataFrame _repr_html_ method: {frame._repr_html_}")

    # Install a formatter whose output is easy to spot in the rendered HTML.
    html_formatter.register_formatter(int, lambda n: f"INT:{n}")
    print("Registered formatter for integers")

    rendered = frame._repr_html_()
    print(f"HTML contains our formatter output (INT:1): {marker in rendered}")

    # The formatter was picked up, so no monkeypatch is needed.
    if marker in rendered:
        return

    print("Installing monkeypatch for DataFrame._repr_html_")
    dataframe_cls = importlib.import_module("datafusion.dataframe").DataFrame

    def patched_repr_html(self):
        """Render ``self`` via the formatter returned by ``get_formatter()``."""
        from datafusion.html_formatter import get_formatter

        active = get_formatter()
        print(f"Patched _repr_html_ using formatter ID: {id(active)}")
        return active.format_html(self.collect(), self.schema())

    dataframe_cls._repr_html_ = patched_repr_html

    # Render a fresh DataFrame through the patched method and report again.
    frame = session.sql(query)
    rendered = frame._repr_html_()
    print(
        f"After monkeypatch, HTML contains our formatter output (INT:1): {marker in rendered}"
    )

python/datafusion/html_formatter.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""HTML formatting utilities for DataFusion DataFrames."""
22

3+
import sys
34
from typing import Dict, Optional, Any, Union, List, Callable, Type, Protocol
45

56

@@ -140,6 +141,14 @@ def format_html(
140141
Returns:
141142
HTML string representation of the data
142143
"""
144+
print("DEBUG format_html: Called with batches:", len(batches) if batches else 0)
145+
print(
146+
f"DEBUG format_html: Type formatters registered: {len(self._type_formatters)}"
147+
)
148+
print(
149+
f"DEBUG format_html: Has custom cell builder: {self._custom_cell_builder is not None}"
150+
)
151+
143152
if not batches:
144153
return "No data to display"
145154

@@ -215,16 +224,25 @@ def _build_table_body(self, batches: list, table_uuid: str) -> List[str]:
215224
for col_idx, column in enumerate(batch.columns):
216225
# Get the raw value from the column
217226
raw_value = self._get_cell_value(column, row_idx)
227+
print(
228+
f"DEBUG row {row_count}, col {col_idx}: raw_value = {raw_value} ({type(raw_value).__name__})"
229+
)
218230

219231
# Always check for type formatters first to format the value
220232
formatted_value = self._format_cell_value(raw_value)
233+
print(
234+
f"DEBUG row {row_count}, col {col_idx}: formatted_value = {formatted_value}"
235+
)
221236

222237
# Then apply either custom cell builder or standard cell formatting
223238
if self._custom_cell_builder:
224239
# Pass both the raw value and formatted value to let the builder decide
225240
cell_html = self._custom_cell_builder(
226241
raw_value, row_count, col_idx, table_uuid
227242
)
243+
print(
244+
f"DEBUG custom cell builder returned: {cell_html[:50]}..."
245+
)
228246
html.append(cell_html)
229247
else:
230248
# Standard cell formatting with formatted value
@@ -273,7 +291,10 @@ def _format_cell_value(self, value: Any) -> str:
273291
# Check for custom type formatters
274292
for type_cls, formatter in self._type_formatters.items():
275293
if isinstance(value, type_cls):
276-
return formatter(value)
294+
print(f"DEBUG formatter match for {type_cls.__name__}: {value}")
295+
result = formatter(value)
296+
print(f"DEBUG formatter returned: {result}")
297+
return result
277298

278299
# If no formatter matched, return string representation
279300
return str(value)
@@ -383,6 +404,10 @@ def get_formatter() -> DataFrameHtmlFormatter:
383404
Returns:
384405
The global HTML formatter instance
385406
"""
407+
print(f"DEBUG get_formatter: returning instance id={id(_default_formatter)}")
408+
print(
409+
f"DEBUG get_formatter: type formatters: {len(_default_formatter._type_formatters)}"
410+
)
386411
return _default_formatter
387412

388413

python/tests/test_dataframe.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -740,6 +740,56 @@ def test_html_formatter_type_formatters(df, reset_formatter):
740740
assert '<span style="color: red">3</span>' in html_output
741741

742742

743+
def test_html_formatter_type_formatters_debug(df, reset_formatter):
    """Trace how a registered ``int`` formatter reaches ``df._repr_html_()``.

    Diagnostic variant of the type-formatter test: it runs the optional
    ``check_html_formatter_integration`` helper, registers a colouring
    formatter for ``int``, and prints details about the DataFrame and the
    rendered HTML. It makes no assertions.
    """
    import inspect

    from datafusion.html_formatter import get_formatter

    print("\n\n==== STARTING test_html_formatter_type_formatters_debug ====")

    # The debug helper is optional; carry on if it cannot be imported.
    try:
        from datafusion.debug_utils import check_html_formatter_integration

        check_html_formatter_integration()
    except ImportError:
        print("Could not import debug_utils, continuing...")

    # Colour integers red when greater than 2, blue otherwise.
    html_formatter = get_formatter()
    html_formatter.register_formatter(
        int, lambda n: f'<span style="color: {"red" if n > 2 else "blue"}">{n}</span>'
    )
    print(f"Registered formatter for int: {html_formatter._type_formatters}")

    # Describe the DataFrame object: its type and public attributes (plus _repr_html_).
    print(f"DataFrame type: {type(df).__name__}")
    visible_members = [
        m for m in dir(df) if not m.startswith('_') or m == '_repr_html_'
    ]
    print(f"DataFrame dir: {visible_members}")

    # Show the implementation that will produce the HTML, when one exists.
    if not hasattr(df, "_repr_html_"):
        print("No _repr_html_ method found")
    else:
        print(f"_repr_html_ source: {inspect.getsource(df._repr_html_)}")

    rendered = df._repr_html_()

    # Report whether the formatted cell for the value 1 is present.
    expected = '<span style="color: blue">1</span>'
    found = expected in rendered
    print(f"Expected string '{expected}' in output: {found}")

    # Only the leading part of the output is shown.
    print(f"HTML snippet: {rendered[:500]}...")

    print("==== END test_html_formatter_type_formatters_debug ====\n\n")
791+
792+
743793
def test_html_formatter_custom_cell_builder(df, reset_formatter):
744794
"""Test using a custom cell builder function."""
745795
from datafusion.html_formatter import get_formatter

0 commit comments

Comments
 (0)
0