From f3c8c018d0ee4b51c1fd51ab2de48a1ece53fdc3 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 24 Jun 2025 15:02:48 -0400 Subject: [PATCH 1/4] Add string formatter --- python/datafusion/html_formatter.py | 28 ++++++++ src/dataframe.rs | 105 ++++++++++++++++------------ 2 files changed, 90 insertions(+), 43 deletions(-) diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py index 12a7e455..e26537db 100644 --- a/python/datafusion/html_formatter.py +++ b/python/datafusion/html_formatter.py @@ -26,6 +26,8 @@ runtime_checkable, ) +from datafusion._internal import DataFrame as DataFrameInternal + def _validate_positive_int(value: Any, param_name: str) -> None: """Validate that a parameter is a positive integer. @@ -345,6 +347,32 @@ def format_html( return "\n".join(html) + def format_str( + self, + batches: list, + schema: Any, + has_more: bool = False, + table_uuid: str | None = None, + ) -> str: + """Format record batches as a string. + + This method is used by DataFrame's __repr__ implementation and can be + called directly when string rendering is needed. + + Args: + batches: List of Arrow RecordBatch objects + schema: Arrow Schema object + has_more: Whether there are more batches not shown + table_uuid: Unique ID for the table, used for JavaScript interactions + + Returns: + String representation of the data + + Raises: + TypeError: If schema is invalid and no batches are provided + """ + return DataFrameInternal.default_str_repr(batches, schema, has_more, table_uuid) + def _build_html_header(self) -> list[str]: """Build the HTML header with CSS styles.""" html = [] diff --git a/src/dataframe.rs b/src/dataframe.rs index 3d68db27..0b29f83b 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -24,6 +24,7 @@ use arrow::compute::can_cast_types; use arrow::error::ArrowError; use arrow::ffi::FFI_ArrowSchema; use arrow::ffi_stream::FFI_ArrowArrayStream; +use arrow::pyarrow::FromPyArrow; use datafusion::arrow::datatypes::Schema; use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow}; use datafusion::arrow::util::pretty; @@ -295,6 +296,46 @@ impl PyDataFrame { pub fn new(df: DataFrame) -> Self { Self { df: Arc::new(df) } } + + fn prepare_repr_string(&self, py: Python, as_html: bool) -> PyDataFusionResult { + // Get the Python formatter and config + let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?; + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display(self.df.as_ref().clone(), config), + )??; + if batches.is_empty() { + // This should not be reached, but do it for safety since we index into the vector below + return Ok("No data to display".to_string()); + } + + let table_uuid = uuid::Uuid::new_v4().to_string(); + + // Convert record batches to PyObject list + let py_batches = batches + .into_iter() + .map(|rb| rb.to_pyarrow(py)) + .collect::>>()?; + + let py_schema = self.schema().into_pyobject(py)?; + + let kwargs = pyo3::types::PyDict::new(py); + let py_batches_list = PyList::new(py, py_batches.as_slice())?; + kwargs.set_item("batches", py_batches_list)?; + kwargs.set_item("schema", py_schema)?; + kwargs.set_item("has_more", has_more)?; + kwargs.set_item("table_uuid", table_uuid)?; + + let method_name = match as_html { + true => "format_html", + false => "format_str", + }; + + let html_result = formatter.call_method(method_name, (), Some(&kwargs))?; + let html_str: String = html_result.extract()?; + + Ok(html_str) + } } #[pymethods] @@ -321,18 +362,27 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult { - // Get the Python formatter config - let PythonFormatter { - formatter: _, - config, - } = get_python_formatter_with_config(py)?; - let (batches, has_more) = wait_for_future( - py, - collect_record_batches_to_display(self.df.as_ref().clone(), config), - )??; + self.prepare_repr_string(py, false) + } + + #[staticmethod] + #[expect(unused_variables)] + fn default_str_repr<'py>( + batches: Vec>, + schema: &Bound<'py, PyAny>, + has_more: bool, + table_uuid: &str, + ) -> PyResult { + let batches = batches + .into_iter() + .map(|batch| RecordBatch::from_pyarrow_bound(&batch)) + .collect::>>()? + .into_iter() + .filter(|batch| batch.num_rows() > 0) + .collect::>(); + if batches.is_empty() { - // This should not be reached, but do it for safety since we index into the vector below - return Ok("No data to display".to_string()); + return Ok("No data to display".to_owned()); } let batches_as_displ = @@ -347,38 +397,7 @@ impl PyDataFrame { } fn _repr_html_(&self, py: Python) -> PyDataFusionResult { - // Get the Python formatter and config - let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?; - let (batches, has_more) = wait_for_future( - py, - collect_record_batches_to_display(self.df.as_ref().clone(), config), - )??; - if batches.is_empty() { - // This should not be reached, but do it for safety since we index into the vector below - return Ok("No data to display".to_string()); - } - - let table_uuid = uuid::Uuid::new_v4().to_string(); - - // Convert record batches to PyObject list - let py_batches = batches - .into_iter() - .map(|rb| rb.to_pyarrow(py)) - .collect::>>()?; - - let py_schema = self.schema().into_pyobject(py)?; - - let kwargs = pyo3::types::PyDict::new(py); - let py_batches_list = PyList::new(py, py_batches.as_slice())?; - kwargs.set_item("batches", py_batches_list)?; - kwargs.set_item("schema", py_schema)?; - kwargs.set_item("has_more", has_more)?; - kwargs.set_item("table_uuid", table_uuid)?; - - let html_result = formatter.call_method("format_html", (), Some(&kwargs))?; - let html_str: String = html_result.extract()?; - - Ok(html_str) + self.prepare_repr_string(py, true) } /// Calculate summary statistics for a DataFrame From eed890754b53fe748ec7e330e83f863a3dc70f6e Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 24 Jun 2025 15:12:52 -0400 Subject: [PATCH 2/4] Rename html_formatter to dataframe_formatter --- python/datafusion/__init__.py | 2 +- python/datafusion/dataframe.py | 15 ++++++++++++++- .../{html_formatter.py => dataframe_formatter.py} | 2 +- python/tests/test_dataframe.py | 4 ++-- src/dataframe.rs | 4 ++-- 5 files changed, 20 insertions(+), 7 deletions(-) rename python/datafusion/{html_formatter.py => dataframe_formatter.py} (99%) diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 16d65f68..9586ae26 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -47,11 +47,11 @@ SQLOptions, ) from .dataframe import DataFrame, ParquetColumnOptions, ParquetWriterOptions +from .dataframe_formatter import configure_formatter from .expr import ( Expr, WindowFrame, ) -from .html_formatter import configure_formatter from .io import read_avro, read_csv, read_json, read_parquet from .plan import ExecutionPlan, LogicalPlan from .record_batch import RecordBatch, RecordBatchStream diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index 1fd63bdc..c747c24d 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -52,7 +52,6 @@ import polars as pl import pyarrow as pa - from datafusion._internal import DataFrame as DataFrameInternal from datafusion._internal import expr as expr_internal from enum import Enum @@ -1112,3 +1111,17 @@ def fill_null(self, value: Any, subset: list[str] | None = None) -> DataFrame: - For columns not in subset, the original column is kept unchanged """ return DataFrame(self.df.fill_null(value, subset)) + + @staticmethod + def default_str_repr( + batches: list[pa.RecordBatch], + schema: pa.Schema, + has_more: bool, + table_uuid: str | None = None, + ) -> str: + """Return the default string representation of a DataFrame. + + This method is used by the default formatter and implemented in Rust for + performance reasons. + """ + return DataFrameInternal.default_str_repr(batches, schema, has_more, table_uuid) diff --git a/python/datafusion/html_formatter.py b/python/datafusion/dataframe_formatter.py similarity index 99% rename from python/datafusion/html_formatter.py rename to python/datafusion/dataframe_formatter.py index e26537db..27f00f9c 100644 --- a/python/datafusion/html_formatter.py +++ b/python/datafusion/dataframe_formatter.py @@ -271,7 +271,7 @@ def is_styles_loaded(cls) -> bool: True if styles have been loaded, False otherwise Example: - >>> from datafusion.html_formatter import DataFrameHtmlFormatter + >>> from datafusion.dataframe_formatter import DataFrameHtmlFormatter >>> DataFrameHtmlFormatter.is_styles_loaded() False """ diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 3c9b97f2..3b816bc8 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -37,14 +37,14 @@ from datafusion import ( functions as f, ) -from datafusion.expr import Window -from datafusion.html_formatter import ( +from datafusion.dataframe_formatter import ( DataFrameHtmlFormatter, configure_formatter, get_formatter, reset_formatter, reset_styles_loaded_state, ) +from datafusion.expr import Window from pyarrow.csv import write_csv MB = 1024 * 1024 diff --git a/src/dataframe.rs b/src/dataframe.rs index 0b29f83b..c2ad4771 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -151,9 +151,9 @@ fn get_python_formatter_with_config(py: Python) -> PyResult { Ok(PythonFormatter { formatter, config }) } -/// Get the Python formatter from the datafusion.html_formatter module +/// Get the Python formatter from the datafusion.dataframe_formatter module fn import_python_formatter(py: Python) -> PyResult> { - let formatter_module = py.import("datafusion.html_formatter")?; + let formatter_module = py.import("datafusion.dataframe_formatter")?; let get_formatter = formatter_module.getattr("get_formatter")?; get_formatter.call0() } From f04b97300814f4bcf019a01770f69416b65d6ece Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 24 Jun 2025 15:55:56 -0400 Subject: [PATCH 3/4] Add deprecation warning --- python/datafusion/html_formatter.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 python/datafusion/html_formatter.py diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py new file mode 100644 index 00000000..37558b91 --- /dev/null +++ b/python/datafusion/html_formatter.py @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Deprecated module for dataframe formatting.""" + +import warnings + +from datafusion.dataframe_formatter import * # noqa: F403 + +warnings.warn( + "The module 'html_formatter' is deprecated and will be removed in the next release." + "Please use 'dataframe_formatter' instead.", + DeprecationWarning, + stacklevel=2, +) From c36425af4c5cdadbe3feff69ec1e5c423c8b5525 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 25 Jun 2025 07:25:28 -0400 Subject: [PATCH 4/4] Small adjustments based on user feedback --- python/datafusion/dataframe.py | 28 ++++++++++++++-------------- python/datafusion/html_formatter.py | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index c747c24d..034c4cb7 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -322,6 +322,20 @@ def __repr__(self) -> str: def _repr_html_(self) -> str: return self.df._repr_html_() + @staticmethod + def default_str_repr( + batches: list[pa.RecordBatch], + schema: pa.Schema, + has_more: bool, + table_uuid: str | None = None, + ) -> str: + """Return the default string representation of a DataFrame. + + This method is used by the default formatter and implemented in Rust for + performance reasons. + """ + return DataFrameInternal.default_str_repr(batches, schema, has_more, table_uuid) + def describe(self) -> DataFrame: """Return the statistics for this DataFrame. @@ -1111,17 +1125,3 @@ def fill_null(self, value: Any, subset: list[str] | None = None) -> DataFrame: - For columns not in subset, the original column is kept unchanged """ return DataFrame(self.df.fill_null(value, subset)) - - @staticmethod - def default_str_repr( - batches: list[pa.RecordBatch], - schema: pa.Schema, - has_more: bool, - table_uuid: str | None = None, - ) -> str: - """Return the default string representation of a DataFrame. - - This method is used by the default formatter and implemented in Rust for - performance reasons. - """ - return DataFrameInternal.default_str_repr(batches, schema, has_more, table_uuid) diff --git a/python/datafusion/html_formatter.py b/python/datafusion/html_formatter.py index 37558b91..65eb1f04 100644 --- a/python/datafusion/html_formatter.py +++ b/python/datafusion/html_formatter.py @@ -25,5 +25,5 @@ "The module 'html_formatter' is deprecated and will be removed in the next release." "Please use 'dataframe_formatter' instead.", DeprecationWarning, - stacklevel=2, + stacklevel=3, )