diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py
index 16d65f68..9586ae26 100644
--- a/python/datafusion/__init__.py
+++ b/python/datafusion/__init__.py
@@ -47,11 +47,11 @@
SQLOptions,
)
from .dataframe import DataFrame, ParquetColumnOptions, ParquetWriterOptions
+from .dataframe_formatter import configure_formatter
from .expr import (
Expr,
WindowFrame,
)
-from .html_formatter import configure_formatter
from .io import read_avro, read_csv, read_json, read_parquet
from .plan import ExecutionPlan, LogicalPlan
from .record_batch import RecordBatch, RecordBatchStream
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
index 1fd63bdc..034c4cb7 100644
--- a/python/datafusion/dataframe.py
+++ b/python/datafusion/dataframe.py
@@ -52,7 +52,6 @@
import polars as pl
import pyarrow as pa
- from datafusion._internal import DataFrame as DataFrameInternal
from datafusion._internal import expr as expr_internal
from enum import Enum
@@ -323,6 +322,20 @@ def __repr__(self) -> str:
def _repr_html_(self) -> str:
return self.df._repr_html_()
+ @staticmethod
+ def default_str_repr(
+ batches: list[pa.RecordBatch],
+ schema: pa.Schema,
+ has_more: bool,
+ table_uuid: str | None = None,
+ ) -> str:
+ """Return the default string representation of a DataFrame.
+
+ This method is used by the default formatter and implemented in Rust for
+ performance reasons.
+ """
+ return DataFrameInternal.default_str_repr(batches, schema, has_more, table_uuid)
+
def describe(self) -> DataFrame:
"""Return the statistics for this DataFrame.
diff --git a/python/datafusion/dataframe_formatter.py b/python/datafusion/dataframe_formatter.py
new file mode 100644
index 00000000..27f00f9c
--- /dev/null
+++ b/python/datafusion/dataframe_formatter.py
@@ -0,0 +1,739 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""HTML formatting utilities for DataFusion DataFrames."""
+
+from __future__ import annotations
+
+from typing import (
+ Any,
+ Callable,
+ Optional,
+ Protocol,
+ runtime_checkable,
+)
+
+from datafusion._internal import DataFrame as DataFrameInternal
+
+
+def _validate_positive_int(value: Any, param_name: str) -> None:
+ """Validate that a parameter is a positive integer.
+
+ Args:
+ value: The value to validate
+ param_name: Name of the parameter (used in error message)
+
+ Raises:
+ ValueError: If the value is not a positive integer
+ """
+ if not isinstance(value, int) or value <= 0:
+ msg = f"{param_name} must be a positive integer"
+ raise ValueError(msg)
+
+
+def _validate_bool(value: Any, param_name: str) -> None:
+ """Validate that a parameter is a boolean.
+
+ Args:
+ value: The value to validate
+ param_name: Name of the parameter (used in error message)
+
+ Raises:
+ TypeError: If the value is not a boolean
+ """
+ if not isinstance(value, bool):
+ msg = f"{param_name} must be a boolean"
+ raise TypeError(msg)
+
+
+@runtime_checkable
+class CellFormatter(Protocol):
+ """Protocol for cell value formatters."""
+
+ def __call__(self, value: Any) -> str:
+ """Format a cell value to string representation."""
+ ...
+
+
+@runtime_checkable
+class StyleProvider(Protocol):
+ """Protocol for HTML style providers."""
+
+ def get_cell_style(self) -> str:
+ """Get the CSS style for table cells."""
+ ...
+
+ def get_header_style(self) -> str:
+ """Get the CSS style for header cells."""
+ ...
+
+
+class DefaultStyleProvider:
+ """Default implementation of StyleProvider."""
+
+ def get_cell_style(self) -> str:
+ """Get the CSS style for table cells.
+
+ Returns:
+ CSS style string
+ """
+ return (
+ "border: 1px solid black; padding: 8px; text-align: left; "
+ "white-space: nowrap;"
+ )
+
+ def get_header_style(self) -> str:
+ """Get the CSS style for header cells.
+
+ Returns:
+ CSS style string
+ """
+ return (
+ "border: 1px solid black; padding: 8px; text-align: left; "
+ "background-color: #f2f2f2; white-space: nowrap; min-width: fit-content; "
+ "max-width: fit-content;"
+ )
+
+
+class DataFrameHtmlFormatter:
+ """Configurable HTML formatter for DataFusion DataFrames.
+
+ This class handles the HTML rendering of DataFrames for display in
+ Jupyter notebooks and other rich display contexts.
+
+ This class supports extension through composition. Key extension points:
+ - Provide a custom StyleProvider for styling cells and headers
+ - Register custom formatters for specific types
+ - Provide custom cell builders for specialized cell rendering
+
+ Args:
+ max_cell_length: Maximum characters to display in a cell before truncation
+ max_width: Maximum width of the HTML table in pixels
+ max_height: Maximum height of the HTML table in pixels
+ max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB)
+ min_rows_display: Minimum number of rows to display
+ repr_rows: Default number of rows to display in repr output
+ enable_cell_expansion: Whether to add expand/collapse buttons for long cell
+ values
+ custom_css: Additional CSS to include in the HTML output
+ show_truncation_message: Whether to display a message when data is truncated
+ style_provider: Custom provider for cell and header styles
+ use_shared_styles: Whether to load styles and scripts only once per notebook
+ session
+ """
+
+ # Class variable to track if styles have been loaded in the notebook
+ _styles_loaded = False
+
+ def __init__(
+ self,
+ max_cell_length: int = 25,
+ max_width: int = 1000,
+ max_height: int = 300,
+ max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB
+ min_rows_display: int = 20,
+ repr_rows: int = 10,
+ enable_cell_expansion: bool = True,
+ custom_css: Optional[str] = None,
+ show_truncation_message: bool = True,
+ style_provider: Optional[StyleProvider] = None,
+ use_shared_styles: bool = True,
+ ) -> None:
+ """Initialize the HTML formatter.
+
+ Parameters
+ ----------
+ max_cell_length : int, default 25
+ Maximum length of cell content before truncation.
+ max_width : int, default 1000
+ Maximum width of the displayed table in pixels.
+ max_height : int, default 300
+ Maximum height of the displayed table in pixels.
+ max_memory_bytes : int, default 2097152 (2MB)
+ Maximum memory in bytes for rendered data.
+ min_rows_display : int, default 20
+ Minimum number of rows to display.
+ repr_rows : int, default 10
+ Default number of rows to display in repr output.
+ enable_cell_expansion : bool, default True
+ Whether to allow cells to expand when clicked.
+ custom_css : str, optional
+ Custom CSS to apply to the HTML table.
+ show_truncation_message : bool, default True
+ Whether to show a message indicating that content has been truncated.
+ style_provider : StyleProvider, optional
+ Provider of CSS styles for the HTML table. If None, DefaultStyleProvider
+ is used.
+ use_shared_styles : bool, default True
+ Whether to use shared styles across multiple tables.
+
+ Raises:
+ ------
+ ValueError
+ If max_cell_length, max_width, max_height, max_memory_bytes,
+ min_rows_display, or repr_rows is not a positive integer.
+ TypeError
+ If enable_cell_expansion, show_truncation_message, or use_shared_styles is
+ not a boolean,
+ or if custom_css is provided but is not a string,
+ or if style_provider is provided but does not implement the StyleProvider
+ protocol.
+ """
+ # Validate numeric parameters
+ _validate_positive_int(max_cell_length, "max_cell_length")
+ _validate_positive_int(max_width, "max_width")
+ _validate_positive_int(max_height, "max_height")
+ _validate_positive_int(max_memory_bytes, "max_memory_bytes")
+ _validate_positive_int(min_rows_display, "min_rows_display")
+ _validate_positive_int(repr_rows, "repr_rows")
+
+ # Validate boolean parameters
+ _validate_bool(enable_cell_expansion, "enable_cell_expansion")
+ _validate_bool(show_truncation_message, "show_truncation_message")
+ _validate_bool(use_shared_styles, "use_shared_styles")
+
+ # Validate custom_css
+ if custom_css is not None and not isinstance(custom_css, str):
+ msg = "custom_css must be None or a string"
+ raise TypeError(msg)
+
+ # Validate style_provider
+ if style_provider is not None and not isinstance(style_provider, StyleProvider):
+ msg = "style_provider must implement the StyleProvider protocol"
+ raise TypeError(msg)
+
+ self.max_cell_length = max_cell_length
+ self.max_width = max_width
+ self.max_height = max_height
+ self.max_memory_bytes = max_memory_bytes
+ self.min_rows_display = min_rows_display
+ self.repr_rows = repr_rows
+ self.enable_cell_expansion = enable_cell_expansion
+ self.custom_css = custom_css
+ self.show_truncation_message = show_truncation_message
+ self.style_provider = style_provider or DefaultStyleProvider()
+ self.use_shared_styles = use_shared_styles
+ # Registry for custom type formatters
+ self._type_formatters: dict[type, CellFormatter] = {}
+ # Custom cell builders
+ self._custom_cell_builder: Optional[Callable[[Any, int, int, str], str]] = None
+ self._custom_header_builder: Optional[Callable[[Any], str]] = None
+
+ def register_formatter(self, type_class: type, formatter: CellFormatter) -> None:
+ """Register a custom formatter for a specific data type.
+
+ Args:
+ type_class: The type to register a formatter for
+ formatter: Function that takes a value of the given type and returns
+ a formatted string
+ """
+ self._type_formatters[type_class] = formatter
+
+ def set_custom_cell_builder(
+ self, builder: Callable[[Any, int, int, str], str]
+ ) -> None:
+ """Set a custom cell builder function.
+
+ Args:
+ builder: Function that takes (value, row, col, table_id) and returns HTML
+ """
+ self._custom_cell_builder = builder
+
+ def set_custom_header_builder(self, builder: Callable[[Any], str]) -> None:
+ """Set a custom header builder function.
+
+ Args:
+ builder: Function that takes a field and returns HTML
+ """
+ self._custom_header_builder = builder
+
+ @classmethod
+ def is_styles_loaded(cls) -> bool:
+ """Check if HTML styles have been loaded in the current session.
+
+ This method is primarily intended for debugging UI rendering issues
+ related to style loading.
+
+ Returns:
+ True if styles have been loaded, False otherwise
+
+ Example:
+ >>> from datafusion.dataframe_formatter import DataFrameHtmlFormatter
+ >>> DataFrameHtmlFormatter.is_styles_loaded()
+ False
+ """
+ return cls._styles_loaded
+
+ def format_html(
+ self,
+ batches: list,
+ schema: Any,
+ has_more: bool = False,
+ table_uuid: str | None = None,
+ ) -> str:
+ """Format record batches as HTML.
+
+ This method is used by DataFrame's _repr_html_ implementation and can be
+ called directly when custom HTML rendering is needed.
+
+ Args:
+ batches: List of Arrow RecordBatch objects
+ schema: Arrow Schema object
+ has_more: Whether there are more batches not shown
+ table_uuid: Unique ID for the table, used for JavaScript interactions
+
+ Returns:
+ HTML string for the data, or "No data to display" if batches is empty
+
+ Raises:
+ TypeError: If batches are non-empty but schema is None or not iterable
+ """
+ if not batches:
+ return "No data to display"
+
+ # Validate schema
+ if schema is None or not hasattr(schema, "__iter__"):
+ msg = "Schema must be provided"
+ raise TypeError(msg)
+
+ # Generate a unique ID if none provided
+ table_uuid = table_uuid or f"df-{id(batches)}"
+
+ # Build HTML components
+ html = []
+
+ # Only include styles and scripts if:
+ # 1. Not using shared styles, OR
+ # 2. Using shared styles but they haven't been loaded yet
+ include_styles = (
+ not self.use_shared_styles or not DataFrameHtmlFormatter._styles_loaded
+ )
+
+ if include_styles:
+ html.extend(self._build_html_header())
+ # If we're using shared styles, mark them as loaded
+ if self.use_shared_styles:
+ DataFrameHtmlFormatter._styles_loaded = True
+
+ html.extend(self._build_table_container_start())
+
+ # Add table header and body
+ html.extend(self._build_table_header(schema))
+ html.extend(self._build_table_body(batches, table_uuid))
+
+ html.append("")
+ html.append("")
+
+ # Add footer (JavaScript and messages)
+ if include_styles and self.enable_cell_expansion:
+ html.append(self._get_javascript())
+
+ # Always add truncation message if needed (independent of styles)
+ if has_more and self.show_truncation_message:
+ html.append("
Data truncated due to size.
")
+
+ return "\n".join(html)
+
+ def format_str(
+ self,
+ batches: list,
+ schema: Any,
+ has_more: bool = False,
+ table_uuid: str | None = None,
+ ) -> str:
+ """Format record batches as a string.
+
+ This method is used by DataFrame's __repr__ implementation and can be
+ called directly when string rendering is needed.
+
+ Args:
+ batches: List of Arrow RecordBatch objects
+ schema: Arrow Schema object
+ has_more: Whether there are more batches not shown
+ table_uuid: Optional table ID, forwarded to default_str_repr unchanged
+
+ Returns:
+ String representation of the data
+
+ Note:
+ No validation happens here; arguments go straight to default_str_repr
+ """
+ return DataFrameInternal.default_str_repr(batches, schema, has_more, table_uuid)
+
+ def _build_html_header(self) -> list[str]:
+ """Build the HTML header with CSS styles."""
+ html = []
+ html.append("")
+ return html
+
+ def _build_table_container_start(self) -> list[str]:
+ """Build the opening tags for the table container."""
+ html = []
+ html.append(
+ f'
'
+ )
+ html.append('
')
+ return html
+
+ def _build_table_header(self, schema: Any) -> list[str]:
+ """Build the HTML table header with column names."""
+ html = []
+ html.append("")
+ html.append("
")
+ for field in schema:
+ if self._custom_header_builder:
+ html.append(self._custom_header_builder(field))
+ else:
+ html.append(
+ f"
"
+ f"{field.name}
"
+ )
+ html.append("
")
+ html.append("")
+ return html
+
+ def _build_table_body(self, batches: list, table_uuid: str) -> list[str]:
+ """Build the HTML table body with data rows."""
+ html = []
+ html.append("")
+
+ row_count = 0
+ for batch in batches:
+ for row_idx in range(batch.num_rows):
+ row_count += 1
+ html.append("
")
+
+ for col_idx, column in enumerate(batch.columns):
+ # Get the raw value from the column
+ raw_value = self._get_cell_value(column, row_idx)
+
+ # Always check for type formatters first to format the value
+ formatted_value = self._format_cell_value(raw_value)
+
+ # Then apply either custom cell builder or standard cell formatting
+ if self._custom_cell_builder:
+ # Only raw_value is handed to the builder; formatted_value is
+ # not passed along
+ cell_html = self._custom_cell_builder(
+ raw_value, row_count, col_idx, table_uuid
+ )
+ html.append(cell_html)
+ else:
+ # Standard cell; expandability is decided on the raw value
+ if (
+ len(str(raw_value)) > self.max_cell_length
+ and self.enable_cell_expansion
+ ):
+ cell_html = self._build_expandable_cell(
+ formatted_value, row_count, col_idx, table_uuid
+ )
+ else:
+ cell_html = self._build_regular_cell(formatted_value)
+ html.append(cell_html)
+
+ html.append("
")
+
+ html.append("")
+ return html
+
+ def _get_cell_value(self, column: Any, row_idx: int) -> Any:
+ """Extract a cell value from a column.
+
+ Args:
+ column: Arrow array
+ row_idx: Row index
+
+ Returns:
+ The raw cell value (None if the lookup raises AttributeError/TypeError)
+ """
+ try:
+ value = column[row_idx]
+
+ if hasattr(value, "as_py"):
+ return value.as_py()
+ except (AttributeError, TypeError):
+ pass
+ else:
+ return value
+
+ def _format_cell_value(self, value: Any) -> str:
+ """Format a cell value for display.
+
+ Uses registered type formatters if available.
+
+ Args:
+ value: The cell value to format
+
+ Returns:
+ Formatted cell value as string
+ """
+ # Check for custom type formatters
+ for type_cls, formatter in self._type_formatters.items():
+ if isinstance(value, type_cls):
+ return formatter(value)
+
+ # If no formatter matched, return string representation
+ return str(value)
+
+ def _build_expandable_cell(
+ self, formatted_value: str, row_count: int, col_idx: int, table_uuid: str
+ ) -> str:
+ """Build an expandable cell for long content."""
+ short_value = str(formatted_value)[: self.max_cell_length]
+ return (
+ f"
"
+ )
+
+ def _build_html_footer(self, has_more: bool) -> list[str]:
+ """Build the HTML footer with JavaScript and messages."""
+ html = []
+
+ # Add JavaScript for interactivity only if cell expansion is enabled
+ # and we're not using the shared styles approach
+ if self.enable_cell_expansion and not self.use_shared_styles:
+ html.append(self._get_javascript())
+
+ # Add truncation message if needed
+ if has_more and self.show_truncation_message:
+ html.append("
Data truncated due to size.
")
+
+ return html
+
+ def _get_default_css(self) -> str:
+ """Get default CSS styles for the HTML table."""
+ return """
+ .expandable-container {
+ display: inline-block;
+ max-width: 200px;
+ }
+ .expandable {
+ white-space: nowrap;
+ overflow: hidden;
+ text-overflow: ellipsis;
+ display: block;
+ }
+ .full-text {
+ display: none;
+ white-space: normal;
+ }
+ .expand-btn {
+ cursor: pointer;
+ color: blue;
+ text-decoration: underline;
+ border: none;
+ background: none;
+ font-size: inherit;
+ display: block;
+ margin-top: 5px;
+ }
+ """
+
+ def _get_javascript(self) -> str:
+ """Get JavaScript code for interactive elements."""
+ return """
+
+ """
+
+
+class FormatterManager:
+ """Manager class for the global DataFrame HTML formatter instance."""
+
+ _default_formatter: DataFrameHtmlFormatter = DataFrameHtmlFormatter()
+
+ @classmethod
+ def set_formatter(cls, formatter: DataFrameHtmlFormatter) -> None:
+ """Set the global DataFrame HTML formatter.
+
+ Args:
+ formatter: The formatter instance to use globally
+ """
+ cls._default_formatter = formatter
+ _refresh_formatter_reference()
+
+ @classmethod
+ def get_formatter(cls) -> DataFrameHtmlFormatter:
+ """Get the current global DataFrame HTML formatter.
+
+ Returns:
+ The global HTML formatter instance
+ """
+ return cls._default_formatter
+
+
+def get_formatter() -> DataFrameHtmlFormatter:
+ """Get the current global DataFrame HTML formatter.
+
+ This function is used by the DataFrame._repr_html_ implementation to access
+ the shared formatter instance. It can also be used directly when custom
+ HTML rendering is needed.
+
+ Returns:
+ The global HTML formatter instance
+
+ Example:
+ >>> from datafusion.dataframe_formatter import get_formatter
+ >>> formatter = get_formatter()
+ >>> formatter.max_cell_length = 50 # Increase cell length
+ """
+ return FormatterManager.get_formatter()
+
+
+def set_formatter(formatter: DataFrameHtmlFormatter) -> None:
+ """Set the global DataFrame HTML formatter.
+
+ Args:
+ formatter: The formatter instance to use globally
+
+ Example:
+ >>> from datafusion import dataframe_formatter as fmt
+ >>> custom_formatter = fmt.DataFrameHtmlFormatter(max_cell_length=100)
+ >>> fmt.set_formatter(custom_formatter)
+ """
+ FormatterManager.set_formatter(formatter)
+
+
+def configure_formatter(**kwargs: Any) -> None:
+ """Configure the global DataFrame HTML formatter.
+
+ This function creates a new formatter with the provided configuration
+ and sets it as the global formatter for all DataFrames.
+
+ Args:
+ **kwargs: Formatter configuration parameters like max_cell_length,
+ max_width, max_height, enable_cell_expansion, etc.
+
+ Raises:
+ ValueError: If any invalid parameters are provided
+
+ Example:
+ >>> from datafusion.dataframe_formatter import configure_formatter
+ >>> configure_formatter(
+ ... max_cell_length=50,
+ ... max_height=500,
+ ... enable_cell_expansion=True,
+ ... use_shared_styles=True
+ ... )
+ """
+ # Valid parameters accepted by DataFrameHtmlFormatter
+ valid_params = {
+ "max_cell_length",
+ "max_width",
+ "max_height",
+ "max_memory_bytes",
+ "min_rows_display",
+ "repr_rows",
+ "enable_cell_expansion",
+ "custom_css",
+ "show_truncation_message",
+ "style_provider",
+ "use_shared_styles",
+ }
+
+ # Check for invalid parameters
+ invalid_params = set(kwargs) - valid_params
+ if invalid_params:
+ msg = (
+ f"Invalid formatter parameters: {', '.join(invalid_params)}. "
+ f"Valid parameters are: {', '.join(valid_params)}"
+ )
+ raise ValueError(msg)
+
+ # Create and set formatter with validated parameters
+ set_formatter(DataFrameHtmlFormatter(**kwargs))
+
+
+def reset_formatter() -> None:
+ """Reset the global DataFrame HTML formatter to default settings.
+
+ This function creates a new formatter with default configuration
+ and sets it as the global formatter for all DataFrames.
+
+ Example:
+ >>> from datafusion.dataframe_formatter import reset_formatter
+ >>> reset_formatter() # Reset formatter to default settings
+ """
+ formatter = DataFrameHtmlFormatter()
+ # Reset the styles_loaded flag to ensure styles will be reloaded
+ DataFrameHtmlFormatter._styles_loaded = False
+ set_formatter(formatter)
+
+
+def reset_styles_loaded_state() -> None:
+ """Reset the styles loaded state to force reloading of styles.
+
+ This can be useful when switching between notebook sessions or
+ when styles need to be refreshed.
+
+ Example:
+ >>> from datafusion.dataframe_formatter import reset_styles_loaded_state
+ >>> reset_styles_loaded_state() # Force styles to reload in next render
+ """
+ DataFrameHtmlFormatter._styles_loaded = False
+
+
+def _refresh_formatter_reference() -> None:
+ """Placeholder hook invoked after the global formatter is replaced.
+
+ Currently does nothing: the body holds no statements, so no module
+ state is touched. FormatterManager.set_formatter calls it on every swap.
+ """
+ # Intentionally empty; kept as a hook for modules that cache the formatter
diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py
index 12a7e455..65eb1f04 100644
--- a/python/datafusion/html_formatter.py
+++ b/python/datafusion/html_formatter.py
@@ -14,698 +14,16 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-"""HTML formatting utilities for DataFusion DataFrames."""
-from __future__ import annotations
+"""Deprecated module for dataframe formatting."""
-from typing import (
- Any,
- Callable,
- Optional,
- Protocol,
- runtime_checkable,
-)
-
-
-def _validate_positive_int(value: Any, param_name: str) -> None:
- """Validate that a parameter is a positive integer.
-
- Args:
- value: The value to validate
- param_name: Name of the parameter (used in error message)
-
- Raises:
- ValueError: If the value is not a positive integer
- """
- if not isinstance(value, int) or value <= 0:
- msg = f"{param_name} must be a positive integer"
- raise ValueError(msg)
-
-
-def _validate_bool(value: Any, param_name: str) -> None:
- """Validate that a parameter is a boolean.
-
- Args:
- value: The value to validate
- param_name: Name of the parameter (used in error message)
-
- Raises:
- TypeError: If the value is not a boolean
- """
- if not isinstance(value, bool):
- msg = f"{param_name} must be a boolean"
- raise TypeError(msg)
-
-
-@runtime_checkable
-class CellFormatter(Protocol):
- """Protocol for cell value formatters."""
-
- def __call__(self, value: Any) -> str:
- """Format a cell value to string representation."""
- ...
-
-
-@runtime_checkable
-class StyleProvider(Protocol):
- """Protocol for HTML style providers."""
-
- def get_cell_style(self) -> str:
- """Get the CSS style for table cells."""
- ...
-
- def get_header_style(self) -> str:
- """Get the CSS style for header cells."""
- ...
-
-
-class DefaultStyleProvider:
- """Default implementation of StyleProvider."""
-
- def get_cell_style(self) -> str:
- """Get the CSS style for table cells.
-
- Returns:
- CSS style string
- """
- return (
- "border: 1px solid black; padding: 8px; text-align: left; "
- "white-space: nowrap;"
- )
-
- def get_header_style(self) -> str:
- """Get the CSS style for header cells.
-
- Returns:
- CSS style string
- """
- return (
- "border: 1px solid black; padding: 8px; text-align: left; "
- "background-color: #f2f2f2; white-space: nowrap; min-width: fit-content; "
- "max-width: fit-content;"
- )
-
-
-class DataFrameHtmlFormatter:
- """Configurable HTML formatter for DataFusion DataFrames.
-
- This class handles the HTML rendering of DataFrames for display in
- Jupyter notebooks and other rich display contexts.
-
- This class supports extension through composition. Key extension points:
- - Provide a custom StyleProvider for styling cells and headers
- - Register custom formatters for specific types
- - Provide custom cell builders for specialized cell rendering
-
- Args:
- max_cell_length: Maximum characters to display in a cell before truncation
- max_width: Maximum width of the HTML table in pixels
- max_height: Maximum height of the HTML table in pixels
- max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB)
- min_rows_display: Minimum number of rows to display
- repr_rows: Default number of rows to display in repr output
- enable_cell_expansion: Whether to add expand/collapse buttons for long cell
- values
- custom_css: Additional CSS to include in the HTML output
- show_truncation_message: Whether to display a message when data is truncated
- style_provider: Custom provider for cell and header styles
- use_shared_styles: Whether to load styles and scripts only once per notebook
- session
- """
-
- # Class variable to track if styles have been loaded in the notebook
- _styles_loaded = False
-
- def __init__(
- self,
- max_cell_length: int = 25,
- max_width: int = 1000,
- max_height: int = 300,
- max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB
- min_rows_display: int = 20,
- repr_rows: int = 10,
- enable_cell_expansion: bool = True,
- custom_css: Optional[str] = None,
- show_truncation_message: bool = True,
- style_provider: Optional[StyleProvider] = None,
- use_shared_styles: bool = True,
- ) -> None:
- """Initialize the HTML formatter.
-
- Parameters
- ----------
- max_cell_length : int, default 25
- Maximum length of cell content before truncation.
- max_width : int, default 1000
- Maximum width of the displayed table in pixels.
- max_height : int, default 300
- Maximum height of the displayed table in pixels.
- max_memory_bytes : int, default 2097152 (2MB)
- Maximum memory in bytes for rendered data.
- min_rows_display : int, default 20
- Minimum number of rows to display.
- repr_rows : int, default 10
- Default number of rows to display in repr output.
- enable_cell_expansion : bool, default True
- Whether to allow cells to expand when clicked.
- custom_css : str, optional
- Custom CSS to apply to the HTML table.
- show_truncation_message : bool, default True
- Whether to show a message indicating that content has been truncated.
- style_provider : StyleProvider, optional
- Provider of CSS styles for the HTML table. If None, DefaultStyleProvider
- is used.
- use_shared_styles : bool, default True
- Whether to use shared styles across multiple tables.
-
- Raises:
- ------
- ValueError
- If max_cell_length, max_width, max_height, max_memory_bytes,
- min_rows_display, or repr_rows is not a positive integer.
- TypeError
- If enable_cell_expansion, show_truncation_message, or use_shared_styles is
- not a boolean,
- or if custom_css is provided but is not a string,
- or if style_provider is provided but does not implement the StyleProvider
- protocol.
- """
- # Validate numeric parameters
- _validate_positive_int(max_cell_length, "max_cell_length")
- _validate_positive_int(max_width, "max_width")
- _validate_positive_int(max_height, "max_height")
- _validate_positive_int(max_memory_bytes, "max_memory_bytes")
- _validate_positive_int(min_rows_display, "min_rows_display")
- _validate_positive_int(repr_rows, "repr_rows")
-
- # Validate boolean parameters
- _validate_bool(enable_cell_expansion, "enable_cell_expansion")
- _validate_bool(show_truncation_message, "show_truncation_message")
- _validate_bool(use_shared_styles, "use_shared_styles")
-
- # Validate custom_css
- if custom_css is not None and not isinstance(custom_css, str):
- msg = "custom_css must be None or a string"
- raise TypeError(msg)
-
- # Validate style_provider
- if style_provider is not None and not isinstance(style_provider, StyleProvider):
- msg = "style_provider must implement the StyleProvider protocol"
- raise TypeError(msg)
-
- self.max_cell_length = max_cell_length
- self.max_width = max_width
- self.max_height = max_height
- self.max_memory_bytes = max_memory_bytes
- self.min_rows_display = min_rows_display
- self.repr_rows = repr_rows
- self.enable_cell_expansion = enable_cell_expansion
- self.custom_css = custom_css
- self.show_truncation_message = show_truncation_message
- self.style_provider = style_provider or DefaultStyleProvider()
- self.use_shared_styles = use_shared_styles
- # Registry for custom type formatters
- self._type_formatters: dict[type, CellFormatter] = {}
- # Custom cell builders
- self._custom_cell_builder: Optional[Callable[[Any, int, int, str], str]] = None
- self._custom_header_builder: Optional[Callable[[Any], str]] = None
-
- def register_formatter(self, type_class: type, formatter: CellFormatter) -> None:
- """Register a custom formatter for a specific data type.
-
- Args:
- type_class: The type to register a formatter for
- formatter: Function that takes a value of the given type and returns
- a formatted string
- """
- self._type_formatters[type_class] = formatter
-
- def set_custom_cell_builder(
- self, builder: Callable[[Any, int, int, str], str]
- ) -> None:
- """Set a custom cell builder function.
-
- Args:
- builder: Function that takes (value, row, col, table_id) and returns HTML
- """
- self._custom_cell_builder = builder
-
- def set_custom_header_builder(self, builder: Callable[[Any], str]) -> None:
- """Set a custom header builder function.
-
- Args:
- builder: Function that takes a field and returns HTML
- """
- self._custom_header_builder = builder
-
- @classmethod
- def is_styles_loaded(cls) -> bool:
- """Check if HTML styles have been loaded in the current session.
-
- This method is primarily intended for debugging UI rendering issues
- related to style loading.
-
- Returns:
- True if styles have been loaded, False otherwise
-
- Example:
- >>> from datafusion.html_formatter import DataFrameHtmlFormatter
- >>> DataFrameHtmlFormatter.is_styles_loaded()
- False
- """
- return cls._styles_loaded
-
- def format_html(
- self,
- batches: list,
- schema: Any,
- has_more: bool = False,
- table_uuid: str | None = None,
- ) -> str:
- """Format record batches as HTML.
-
- This method is used by DataFrame's _repr_html_ implementation and can be
- called directly when custom HTML rendering is needed.
-
- Args:
- batches: List of Arrow RecordBatch objects
- schema: Arrow Schema object
- has_more: Whether there are more batches not shown
- table_uuid: Unique ID for the table, used for JavaScript interactions
-
- Returns:
- HTML string representation of the data
-
- Raises:
- TypeError: If schema is invalid and no batches are provided
- """
- if not batches:
- return "No data to display"
-
- # Validate schema
- if schema is None or not hasattr(schema, "__iter__"):
- msg = "Schema must be provided"
- raise TypeError(msg)
-
- # Generate a unique ID if none provided
- table_uuid = table_uuid or f"df-{id(batches)}"
-
- # Build HTML components
- html = []
-
- # Only include styles and scripts if:
- # 1. Not using shared styles, OR
- # 2. Using shared styles but they haven't been loaded yet
- include_styles = (
- not self.use_shared_styles or not DataFrameHtmlFormatter._styles_loaded
- )
-
- if include_styles:
- html.extend(self._build_html_header())
- # If we're using shared styles, mark them as loaded
- if self.use_shared_styles:
- DataFrameHtmlFormatter._styles_loaded = True
-
- html.extend(self._build_table_container_start())
-
- # Add table header and body
- html.extend(self._build_table_header(schema))
- html.extend(self._build_table_body(batches, table_uuid))
-
- html.append("
")
- html.append("
")
-
- # Add footer (JavaScript and messages)
- if include_styles and self.enable_cell_expansion:
- html.append(self._get_javascript())
-
- # Always add truncation message if needed (independent of styles)
- if has_more and self.show_truncation_message:
- html.append("
Data truncated due to size.
")
-
- return "\n".join(html)
-
- def _build_html_header(self) -> list[str]:
- """Build the HTML header with CSS styles."""
- html = []
- html.append("")
- return html
+import warnings
- def _build_table_container_start(self) -> list[str]:
- """Build the opening tags for the table container."""
- html = []
- html.append(
- f'
'
- )
- html.append('
')
- return html
+from datafusion.dataframe_formatter import * # noqa: F403
- def _build_table_header(self, schema: Any) -> list[str]:
- """Build the HTML table header with column names."""
- html = []
- html.append("")
- html.append("
")
- for field in schema:
- if self._custom_header_builder:
- html.append(self._custom_header_builder(field))
- else:
- html.append(
- f"
"
- f"{field.name}
"
- )
- html.append("
")
- html.append("")
- return html
-
- def _build_table_body(self, batches: list, table_uuid: str) -> list[str]:
- """Build the HTML table body with data rows."""
- html = []
- html.append("")
-
- row_count = 0
- for batch in batches:
- for row_idx in range(batch.num_rows):
- row_count += 1
- html.append("
")
-
- for col_idx, column in enumerate(batch.columns):
- # Get the raw value from the column
- raw_value = self._get_cell_value(column, row_idx)
-
- # Always check for type formatters first to format the value
- formatted_value = self._format_cell_value(raw_value)
-
- # Then apply either custom cell builder or standard cell formatting
- if self._custom_cell_builder:
- # Pass both the raw value and formatted value to let the
- # builder decide
- cell_html = self._custom_cell_builder(
- raw_value, row_count, col_idx, table_uuid
- )
- html.append(cell_html)
- else:
- # Standard cell formatting with formatted value
- if (
- len(str(raw_value)) > self.max_cell_length
- and self.enable_cell_expansion
- ):
- cell_html = self._build_expandable_cell(
- formatted_value, row_count, col_idx, table_uuid
- )
- else:
- cell_html = self._build_regular_cell(formatted_value)
- html.append(cell_html)
-
- html.append("
")
-
- html.append("")
- return html
-
- def _get_cell_value(self, column: Any, row_idx: int) -> Any:
- """Extract a cell value from a column.
-
- Args:
- column: Arrow array
- row_idx: Row index
-
- Returns:
- The raw cell value
- """
- try:
- value = column[row_idx]
-
- if hasattr(value, "as_py"):
- return value.as_py()
- except (AttributeError, TypeError):
- pass
- else:
- return value
-
- def _format_cell_value(self, value: Any) -> str:
- """Format a cell value for display.
-
- Uses registered type formatters if available.
-
- Args:
- value: The cell value to format
-
- Returns:
- Formatted cell value as string
- """
- # Check for custom type formatters
- for type_cls, formatter in self._type_formatters.items():
- if isinstance(value, type_cls):
- return formatter(value)
-
- # If no formatter matched, return string representation
- return str(value)
-
- def _build_expandable_cell(
- self, formatted_value: str, row_count: int, col_idx: int, table_uuid: str
- ) -> str:
- """Build an expandable cell for long content."""
- short_value = str(formatted_value)[: self.max_cell_length]
- return (
- f"
"
- )
-
- def _build_html_footer(self, has_more: bool) -> list[str]:
- """Build the HTML footer with JavaScript and messages."""
- html = []
-
- # Add JavaScript for interactivity only if cell expansion is enabled
- # and we're not using the shared styles approach
- if self.enable_cell_expansion and not self.use_shared_styles:
- html.append(self._get_javascript())
-
- # Add truncation message if needed
- if has_more and self.show_truncation_message:
- html.append("
Data truncated due to size.
")
-
- return html
-
- def _get_default_css(self) -> str:
- """Get default CSS styles for the HTML table."""
- return """
- .expandable-container {
- display: inline-block;
- max-width: 200px;
- }
- .expandable {
- white-space: nowrap;
- overflow: hidden;
- text-overflow: ellipsis;
- display: block;
- }
- .full-text {
- display: none;
- white-space: normal;
- }
- .expand-btn {
- cursor: pointer;
- color: blue;
- text-decoration: underline;
- border: none;
- background: none;
- font-size: inherit;
- display: block;
- margin-top: 5px;
- }
- """
-
- def _get_javascript(self) -> str:
- """Get JavaScript code for interactive elements."""
- return """
-
- """
-
-
-class FormatterManager:
- """Manager class for the global DataFrame HTML formatter instance."""
-
- _default_formatter: DataFrameHtmlFormatter = DataFrameHtmlFormatter()
-
- @classmethod
- def set_formatter(cls, formatter: DataFrameHtmlFormatter) -> None:
- """Set the global DataFrame HTML formatter.
-
- Args:
- formatter: The formatter instance to use globally
- """
- cls._default_formatter = formatter
- _refresh_formatter_reference()
-
- @classmethod
- def get_formatter(cls) -> DataFrameHtmlFormatter:
- """Get the current global DataFrame HTML formatter.
-
- Returns:
- The global HTML formatter instance
- """
- return cls._default_formatter
-
-
-def get_formatter() -> DataFrameHtmlFormatter:
- """Get the current global DataFrame HTML formatter.
-
- This function is used by the DataFrame._repr_html_ implementation to access
- the shared formatter instance. It can also be used directly when custom
- HTML rendering is needed.
-
- Returns:
- The global HTML formatter instance
-
- Example:
- >>> from datafusion.html_formatter import get_formatter
- >>> formatter = get_formatter()
- >>> formatter.max_cell_length = 50 # Increase cell length
- """
- return FormatterManager.get_formatter()
-
-
-def set_formatter(formatter: DataFrameHtmlFormatter) -> None:
- """Set the global DataFrame HTML formatter.
-
- Args:
- formatter: The formatter instance to use globally
-
- Example:
- >>> from datafusion.html_formatter import get_formatter, set_formatter
- >>> custom_formatter = DataFrameHtmlFormatter(max_cell_length=100)
- >>> set_formatter(custom_formatter)
- """
- FormatterManager.set_formatter(formatter)
-
-
-def configure_formatter(**kwargs: Any) -> None:
- """Configure the global DataFrame HTML formatter.
-
- This function creates a new formatter with the provided configuration
- and sets it as the global formatter for all DataFrames.
-
- Args:
- **kwargs: Formatter configuration parameters like max_cell_length,
- max_width, max_height, enable_cell_expansion, etc.
-
- Raises:
- ValueError: If any invalid parameters are provided
-
- Example:
- >>> from datafusion.html_formatter import configure_formatter
- >>> configure_formatter(
- ... max_cell_length=50,
- ... max_height=500,
- ... enable_cell_expansion=True,
- ... use_shared_styles=True
- ... )
- """
- # Valid parameters accepted by DataFrameHtmlFormatter
- valid_params = {
- "max_cell_length",
- "max_width",
- "max_height",
- "max_memory_bytes",
- "min_rows_display",
- "repr_rows",
- "enable_cell_expansion",
- "custom_css",
- "show_truncation_message",
- "style_provider",
- "use_shared_styles",
- }
-
- # Check for invalid parameters
- invalid_params = set(kwargs) - valid_params
- if invalid_params:
- msg = (
- f"Invalid formatter parameters: {', '.join(invalid_params)}. "
- f"Valid parameters are: {', '.join(valid_params)}"
- )
- raise ValueError(msg)
-
- # Create and set formatter with validated parameters
- set_formatter(DataFrameHtmlFormatter(**kwargs))
-
-
-def reset_formatter() -> None:
- """Reset the global DataFrame HTML formatter to default settings.
-
- This function creates a new formatter with default configuration
- and sets it as the global formatter for all DataFrames.
-
- Example:
- >>> from datafusion.html_formatter import reset_formatter
- >>> reset_formatter() # Reset formatter to default settings
- """
- formatter = DataFrameHtmlFormatter()
- # Reset the styles_loaded flag to ensure styles will be reloaded
- DataFrameHtmlFormatter._styles_loaded = False
- set_formatter(formatter)
-
-
-def reset_styles_loaded_state() -> None:
- """Reset the styles loaded state to force reloading of styles.
-
- This can be useful when switching between notebook sessions or
- when styles need to be refreshed.
-
- Example:
- >>> from datafusion.html_formatter import reset_styles_loaded_state
- >>> reset_styles_loaded_state() # Force styles to reload in next render
- """
- DataFrameHtmlFormatter._styles_loaded = False
-
-
-def _refresh_formatter_reference() -> None:
- """Refresh formatter reference in any modules using it.
-
- This helps ensure that changes to the formatter are reflected in existing
- DataFrames that might be caching the formatter reference.
- """
- # This is a no-op but signals modules to refresh their reference
+# Emitted once, when this compatibility module is first imported.
+warnings.warn(
+    "The module 'html_formatter' is deprecated and will be removed in the next release."
+    " Please use 'dataframe_formatter' instead.",
+    DeprecationWarning,
+    stacklevel=3,
+)
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
index 3c9b97f2..3b816bc8 100644
--- a/python/tests/test_dataframe.py
+++ b/python/tests/test_dataframe.py
@@ -37,14 +37,14 @@
from datafusion import (
functions as f,
)
-from datafusion.expr import Window
-from datafusion.html_formatter import (
+from datafusion.dataframe_formatter import (
DataFrameHtmlFormatter,
configure_formatter,
get_formatter,
reset_formatter,
reset_styles_loaded_state,
)
+from datafusion.expr import Window
from pyarrow.csv import write_csv
MB = 1024 * 1024
diff --git a/src/dataframe.rs b/src/dataframe.rs
index 3d68db27..c2ad4771 100644
--- a/src/dataframe.rs
+++ b/src/dataframe.rs
@@ -24,6 +24,7 @@ use arrow::compute::can_cast_types;
use arrow::error::ArrowError;
use arrow::ffi::FFI_ArrowSchema;
use arrow::ffi_stream::FFI_ArrowArrayStream;
+use arrow::pyarrow::FromPyArrow;
use datafusion::arrow::datatypes::Schema;
use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow};
use datafusion::arrow::util::pretty;
@@ -150,9 +151,9 @@ fn get_python_formatter_with_config(py: Python) -> PyResult {
Ok(PythonFormatter { formatter, config })
}
-/// Get the Python formatter from the datafusion.html_formatter module
+/// Get the Python formatter from the datafusion.dataframe_formatter module
fn import_python_formatter(py: Python) -> PyResult> {
- let formatter_module = py.import("datafusion.html_formatter")?;
+ let formatter_module = py.import("datafusion.dataframe_formatter")?;
let get_formatter = formatter_module.getattr("get_formatter")?;
get_formatter.call0()
}
@@ -295,6 +296,46 @@ impl PyDataFrame {
pub fn new(df: DataFrame) -> Self {
Self { df: Arc::new(df) }
}
+
+    /// Render this DataFrame through the Python-side formatter.
+    ///
+    /// Collects the record batches selected for display, converts them to
+    /// PyArrow objects and calls `format_html` (when `as_html` is true) or
+    /// `format_str` on the formatter, passing the keyword arguments
+    /// `batches`, `schema`, `has_more` and `table_uuid`. The formatter's
+    /// return value is extracted as a `String`; errors from collection,
+    /// conversion or the Python call are propagated to the caller.
+    fn prepare_repr_string(&self, py: Python, as_html: bool) -> PyDataFusionResult<String> {
+        // Get the Python formatter and its display configuration
+        let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?;
+        let (batches, has_more) = wait_for_future(
+            py,
+            collect_record_batches_to_display(self.df.as_ref().clone(), config),
+        )??;
+        if batches.is_empty() {
+            // This should not be reached, but short-circuit so the formatter is
+            // never invoked with nothing to render
+            return Ok("No data to display".to_string());
+        }
+
+        // A fresh UUID is passed to the formatter as the table identifier
+        let table_uuid = uuid::Uuid::new_v4().to_string();
+
+        // Convert record batches to a list of PyArrow objects
+        let py_batches = batches
+            .into_iter()
+            .map(|rb| rb.to_pyarrow(py))
+            .collect::<PyResult<Vec<_>>>()?;
+
+        let py_schema = self.schema().into_pyobject(py)?;
+
+        let kwargs = pyo3::types::PyDict::new(py);
+        let py_batches_list = PyList::new(py, py_batches.as_slice())?;
+        kwargs.set_item("batches", py_batches_list)?;
+        kwargs.set_item("schema", py_schema)?;
+        kwargs.set_item("has_more", has_more)?;
+        kwargs.set_item("table_uuid", table_uuid)?;
+
+        // The same keyword arguments are passed to either formatter method
+        let method_name = if as_html { "format_html" } else { "format_str" };
+
+        let repr_result = formatter.call_method(method_name, (), Some(&kwargs))?;
+        let repr_str: String = repr_result.extract()?;
+
+        Ok(repr_str)
+    }
}
#[pymethods]
@@ -321,18 +362,27 @@ impl PyDataFrame {
}
fn __repr__(&self, py: Python) -> PyDataFusionResult {
- // Get the Python formatter config
- let PythonFormatter {
- formatter: _,
- config,
- } = get_python_formatter_with_config(py)?;
- let (batches, has_more) = wait_for_future(
- py,
- collect_record_batches_to_display(self.df.as_ref().clone(), config),
- )??;
+ self.prepare_repr_string(py, false)
+ }
+
+ #[staticmethod]
+ #[expect(unused_variables)]
+ fn default_str_repr<'py>(
+ batches: Vec>,
+ schema: &Bound<'py, PyAny>,
+ has_more: bool,
+ table_uuid: &str,
+ ) -> PyResult {
+ let batches = batches
+ .into_iter()
+ .map(|batch| RecordBatch::from_pyarrow_bound(&batch))
+ .collect::>>()?
+ .into_iter()
+ .filter(|batch| batch.num_rows() > 0)
+ .collect::>();
+
if batches.is_empty() {
- // This should not be reached, but do it for safety since we index into the vector below
- return Ok("No data to display".to_string());
+ return Ok("No data to display".to_owned());
}
let batches_as_displ =
@@ -347,38 +397,7 @@ impl PyDataFrame {
}
fn _repr_html_(&self, py: Python) -> PyDataFusionResult {
- // Get the Python formatter and config
- let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?;
- let (batches, has_more) = wait_for_future(
- py,
- collect_record_batches_to_display(self.df.as_ref().clone(), config),
- )??;
- if batches.is_empty() {
- // This should not be reached, but do it for safety since we index into the vector below
- return Ok("No data to display".to_string());
- }
-
- let table_uuid = uuid::Uuid::new_v4().to_string();
-
- // Convert record batches to PyObject list
- let py_batches = batches
- .into_iter()
- .map(|rb| rb.to_pyarrow(py))
- .collect::>>()?;
-
- let py_schema = self.schema().into_pyobject(py)?;
-
- let kwargs = pyo3::types::PyDict::new(py);
- let py_batches_list = PyList::new(py, py_batches.as_slice())?;
- kwargs.set_item("batches", py_batches_list)?;
- kwargs.set_item("schema", py_schema)?;
- kwargs.set_item("has_more", has_more)?;
- kwargs.set_item("table_uuid", table_uuid)?;
-
- let html_result = formatter.call_method("format_html", (), Some(&kwargs))?;
- let html_str: String = html_result.extract()?;
-
- Ok(html_str)
+ self.prepare_repr_string(py, true)
}
/// Calculate summary statistics for a DataFrame