
Commit 5a5eb39 (1 parent: 98dc06b)

feat: Add Parquet writer option autodetection

File tree: 3 files changed, +30 -3 lines

python/datafusion/__init__.py (5 additions, 1 deletion)

@@ -46,7 +46,11 @@
     SessionContext,
     SQLOptions,
 )
-from .dataframe import DataFrame, ParquetColumnOptions, ParquetWriterOptions
+from .dataframe import (
+    DataFrame,
+    ParquetColumnOptions,
+    ParquetWriterOptions,
+)
 from .expr import (
     Expr,
     WindowFrame,

python/datafusion/dataframe.py (9 additions, 2 deletions)

@@ -55,6 +55,7 @@
 from datafusion._internal import DataFrame as DataFrameInternal
 from datafusion._internal import expr as expr_internal
 
+from dataclasses import dataclass
 from enum import Enum
 
 
@@ -873,7 +874,7 @@ def write_csv(self, path: str | pathlib.Path, with_header: bool = False) -> None
     def write_parquet(
         self,
         path: str | pathlib.Path,
-        compression: Union[str, Compression] = Compression.ZSTD,
+        compression: Union[str, Compression, ParquetWriterOptions] = Compression.ZSTD,
         compression_level: int | None = None,
     ) -> None:
         """Execute the :py:class:`DataFrame` and write the results to a Parquet file.
@@ -894,7 +895,13 @@ def write_parquet(
                 recommended range is 1 to 22, with the default being 4. Higher levels
                 provide better compression but slower speed.
         """
-        # Convert string to Compression enum if necessary
+        if isinstance(compression, ParquetWriterOptions):
+            if compression_level is not None:
+                msg = "compression_level should be None when using ParquetWriterOptions"
+                raise ValueError(msg)
+            self.write_parquet_with_options(path, compression)
+            return
+
         if isinstance(compression, str):
             compression = Compression.from_str(compression)
 
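
In short, write_parquet now detects when a ParquetWriterOptions object is passed as the compression argument and delegates to write_parquet_with_options, rejecting an explicit compression_level in that case. A minimal usage sketch of the new path (the sample data and the output path "out_parquet" are illustrative, not part of this commit):

    from datafusion import SessionContext, ParquetWriterOptions

    ctx = SessionContext()
    df = ctx.from_pydict({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    # Passing writer options instead of a Compression value routes the call
    # through write_parquet_with_options.
    options = ParquetWriterOptions(compression="gzip", compression_level=6)
    df.write_parquet("out_parquet", options)

    # Combining options with an explicit compression_level raises ValueError
    # per the guard added in this commit:
    # df.write_parquet("out_parquet", options, compression_level=1)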

python/tests/test_dataframe.py (16 additions, 0 deletions)

@@ -2038,6 +2038,22 @@ def test_write_parquet_with_options_column_options(df, tmp_path):
     assert col["encodings"] == result["encodings"]
 
 
+def test_write_parquet_options(df, tmp_path):
+    options = ParquetWriterOptions(compression="gzip", compression_level=6)
+    df.write_parquet(str(tmp_path), options)
+
+    result = pq.read_table(str(tmp_path)).to_pydict()
+    expected = df.to_pydict()
+
+    assert result == expected
+
+
+def test_write_parquet_options_error(df, tmp_path):
+    options = ParquetWriterOptions(compression="gzip", compression_level=6)
+    with pytest.raises(ValueError):
+        df.write_parquet(str(tmp_path), options, compression_level=1)
+
+
 def test_dataframe_export(df) -> None:
     # Guarantees that we have the canonical implementation
     # reading our dataframe export
