Skip to content

Commit f3c8c01

Browse files
committed
Add string formatter
1 parent 0d3c37f commit f3c8c01

File tree

2 files changed

+90
-43
lines changed

2 files changed

+90
-43
lines changed

python/datafusion/html_formatter.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
runtime_checkable,
2727
)
2828

29+
from datafusion._internal import DataFrame as DataFrameInternal
30+
2931

3032
def _validate_positive_int(value: Any, param_name: str) -> None:
3133
"""Validate that a parameter is a positive integer.
@@ -345,6 +347,32 @@ def format_html(
345347

346348
return "\n".join(html)
347349

350+
def format_str(
351+
self,
352+
batches: list,
353+
schema: Any,
354+
has_more: bool = False,
355+
table_uuid: str | None = None,
356+
) -> str:
357+
"""Format record batches as a string.
358+
359+
This method is used by DataFrame's __repr__ implementation and can be
360+
called directly when string rendering is needed.
361+
362+
Args:
363+
batches: List of Arrow RecordBatch objects
364+
schema: Arrow Schema object
365+
has_more: Whether there are more batches not shown
366+
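table_uuid: Unique ID for the table; forwarded unchanged to DataFrameInternal.default_str_repr (declared there as a plain string, so the None default may be rejected - TODO confirm)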
367+
368+
Returns:
369+
String representation of the data
370+
371+
Raises:
372+
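TypeError: Presumably if an item in batches cannot be converted to an Arrow RecordBatch (TODO confirm exception type); an empty batch list returns "No data to display" instead of raising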
373+
"""
374+
return DataFrameInternal.default_str_repr(batches, schema, has_more, table_uuid)
375+
348376
def _build_html_header(self) -> list[str]:
349377
"""Build the HTML header with CSS styles."""
350378
html = []

src/dataframe.rs

Lines changed: 62 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ use arrow::compute::can_cast_types;
2424
use arrow::error::ArrowError;
2525
use arrow::ffi::FFI_ArrowSchema;
2626
use arrow::ffi_stream::FFI_ArrowArrayStream;
27+
use arrow::pyarrow::FromPyArrow;
2728
use datafusion::arrow::datatypes::Schema;
2829
use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow};
2930
use datafusion::arrow::util::pretty;
@@ -295,6 +296,46 @@ impl PyDataFrame {
295296
pub fn new(df: DataFrame) -> Self {
296297
Self { df: Arc::new(df) }
297298
}
299+
300+
fn prepare_repr_string(&self, py: Python, as_html: bool) -> PyDataFusionResult<String> {
301+
// Get the Python formatter and config
302+
let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?;
303+
let (batches, has_more) = wait_for_future(
304+
py,
305+
collect_record_batches_to_display(self.df.as_ref().clone(), config),
306+
)??;
307+
if batches.is_empty() {
308+
// This should not be reached, but guard anyway so the Python formatter is never called with an empty batch list
309+
return Ok("No data to display".to_string());
310+
}
311+
312+
let table_uuid = uuid::Uuid::new_v4().to_string();
313+
314+
// Convert record batches to PyObject list
315+
let py_batches = batches
316+
.into_iter()
317+
.map(|rb| rb.to_pyarrow(py))
318+
.collect::<PyResult<Vec<PyObject>>>()?;
319+
320+
let py_schema = self.schema().into_pyobject(py)?;
321+
322+
let kwargs = pyo3::types::PyDict::new(py);
323+
let py_batches_list = PyList::new(py, py_batches.as_slice())?;
324+
kwargs.set_item("batches", py_batches_list)?;
325+
kwargs.set_item("schema", py_schema)?;
326+
kwargs.set_item("has_more", has_more)?;
327+
kwargs.set_item("table_uuid", table_uuid)?;
328+
329+
let method_name = match as_html {
330+
true => "format_html",
331+
false => "format_str",
332+
};
333+
334+
let html_result = formatter.call_method(method_name, (), Some(&kwargs))?;
335+
let html_str: String = html_result.extract()?;
336+
337+
Ok(html_str)
338+
}
298339
}
299340

300341
#[pymethods]
@@ -321,18 +362,27 @@ impl PyDataFrame {
321362
}
322363

323364
fn __repr__(&self, py: Python) -> PyDataFusionResult<String> {
324-
// Get the Python formatter config
325-
let PythonFormatter {
326-
formatter: _,
327-
config,
328-
} = get_python_formatter_with_config(py)?;
329-
let (batches, has_more) = wait_for_future(
330-
py,
331-
collect_record_batches_to_display(self.df.as_ref().clone(), config),
332-
)??;
365+
self.prepare_repr_string(py, false)
366+
}
367+
368+
#[staticmethod]
369+
#[expect(unused_variables)]
370+
fn default_str_repr<'py>(
371+
batches: Vec<Bound<'py, PyAny>>,
372+
schema: &Bound<'py, PyAny>,
373+
has_more: bool,
374+
table_uuid: &str,
375+
) -> PyResult<String> {
376+
let batches = batches
377+
.into_iter()
378+
.map(|batch| RecordBatch::from_pyarrow_bound(&batch))
379+
.collect::<PyResult<Vec<RecordBatch>>>()?
380+
.into_iter()
381+
.filter(|batch| batch.num_rows() > 0)
382+
.collect::<Vec<_>>();
383+
333384
if batches.is_empty() {
334-
// This should not be reached, but do it for safety since we index into the vector below
335-
return Ok("No data to display".to_string());
385+
return Ok("No data to display".to_owned());
336386
}
337387

338388
let batches_as_displ =
@@ -347,38 +397,7 @@ impl PyDataFrame {
347397
}
348398

349399
fn _repr_html_(&self, py: Python) -> PyDataFusionResult<String> {
350-
// Get the Python formatter and config
351-
let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?;
352-
let (batches, has_more) = wait_for_future(
353-
py,
354-
collect_record_batches_to_display(self.df.as_ref().clone(), config),
355-
)??;
356-
if batches.is_empty() {
357-
// This should not be reached, but do it for safety since we index into the vector below
358-
return Ok("No data to display".to_string());
359-
}
360-
361-
let table_uuid = uuid::Uuid::new_v4().to_string();
362-
363-
// Convert record batches to PyObject list
364-
let py_batches = batches
365-
.into_iter()
366-
.map(|rb| rb.to_pyarrow(py))
367-
.collect::<PyResult<Vec<PyObject>>>()?;
368-
369-
let py_schema = self.schema().into_pyobject(py)?;
370-
371-
let kwargs = pyo3::types::PyDict::new(py);
372-
let py_batches_list = PyList::new(py, py_batches.as_slice())?;
373-
kwargs.set_item("batches", py_batches_list)?;
374-
kwargs.set_item("schema", py_schema)?;
375-
kwargs.set_item("has_more", has_more)?;
376-
kwargs.set_item("table_uuid", table_uuid)?;
377-
378-
let html_result = formatter.call_method("format_html", (), Some(&kwargs))?;
379-
let html_str: String = html_result.extract()?;
380-
381-
Ok(html_str)
400+
self.prepare_repr_string(py, true)
382401
}
383402

384403
/// Calculate summary statistics for a DataFrame

0 commit comments

Comments
 (0)