feat: Add experimental auto mode for COMET_PARQUET_SCAN_IMPL #1747


Merged (56 commits, Jun 13, 2025)

Changes from 38 commits
- e47fc0a Move some logic into scan execs (andygrove, May 16, 2025)
- 7a50855 improve type checking for sinks (andygrove, May 16, 2025)
- 2a78694 move usingDataSourceExecWithIncompatTypes* to CometTestBase (andygrove, May 16, 2025)
- 8592b8f scalastyle (andygrove, May 16, 2025)
- 52875ff scalastyle (andygrove, May 16, 2025)
- f5fd69b scalastyle (andygrove, May 16, 2025)
- 0d11599 fix regression (andygrove, May 16, 2025)
- 4d5c4b0 add shuffle fuzz test: (andygrove, May 16, 2025)
- 332be9b scalastyle (andygrove, May 16, 2025)
- ada181a scalastyle (andygrove, May 16, 2025)
- e8b95d4 oops (andygrove, May 16, 2025)
- e3c5b09 fix? (andygrove, May 16, 2025)
- 0920360 remove some config uses and add TODO comments for others (andygrove, May 16, 2025)
- ddcee36 fix? (andygrove, May 16, 2025)
- 1a106dc scalastyle (andygrove, May 16, 2025)
- 3634fc1 fix? (andygrove, May 16, 2025)
- 13eea7a Merge branch 'scan-refactor-2' into scan-refactor-3 (andygrove, May 17, 2025)
- e5e89f3 improve (andygrove, May 17, 2025)
- a6fd752 improve (andygrove, May 17, 2025)
- 6e32787 fix (andygrove, May 17, 2025)
- 773743e fix (andygrove, May 17, 2025)
- ead2362 refactor (andygrove, May 17, 2025)
- d963d31 improve (andygrove, May 17, 2025)
- 0f49dc0 improve (andygrove, May 17, 2025)
- f20f846 fix (andygrove, May 17, 2025)
- ac3c99e update diffs (andygrove, May 17, 2025)
- deccbfe update diffs (andygrove, May 17, 2025)
- 64876cd add auto scan impl mode (andygrove, May 17, 2025)
- 9bf4aeb fix (andygrove, May 17, 2025)
- 728d8bd Merge branch 'scan-refactor-3' into scan-auto-mode (andygrove, May 17, 2025)
- 8868276 update test (andygrove, May 17, 2025)
- 82a930c update test (andygrove, May 17, 2025)
- e6ae57b scalastyle (andygrove, May 17, 2025)
- 86351d1 fix miri (andygrove, May 18, 2025)
- 5638dda Merge branch 'scan-refactor-3' into scan-auto-mode (andygrove, May 18, 2025)
- 28f13ea upmerge (andygrove, May 20, 2025)
- be4bac2 update docs (andygrove, May 20, 2025)
- 580fdeb update docs (andygrove, May 20, 2025)
- 1dd4f08 add CI workflow (andygrove, May 20, 2025)
- 77e7e39 experimenting (andygrove, May 20, 2025)
- 987608f scalastyle (andygrove, May 20, 2025)
- fbde112 upmerge (andygrove, May 29, 2025)
- 6e2ac70 revert change to default value (andygrove, Jun 2, 2025)
- e765956 upmerge (andygrove, Jun 2, 2025)
- 5719a38 fix (andygrove, Jun 2, 2025)
- 972d446 address feedback (andygrove, Jun 3, 2025)
- 9a39b23 Merge remote-tracking branch 'apache/main' into scan-auto-mode (andygrove, Jun 12, 2025)
- 4203798 explicitly set scan impl in tests (andygrove, Jun 13, 2025)
- 255c8d9 Merge remote-tracking branch 'apache/main' into scan-auto-mode (andygrove, Jun 13, 2025)
- eee6ff9 run some tests in auto mode (andygrove, Jun 13, 2025)
- 292cf31 fix (andygrove, Jun 13, 2025)
- fad4e53 fix one test (andygrove, Jun 13, 2025)
- cc044b8 fix (andygrove, Jun 13, 2025)
- bd1858f update compat docs (andygrove, Jun 13, 2025)
- a852569 address feedback (andygrove, Jun 13, 2025)
- 630b774 fmt (andygrove, Jun 13, 2025)
4 changes: 3 additions & 1 deletion common/src/main/scala/org/apache/comet/CometConf.scala
@@ -86,6 +86,7 @@ object CometConf extends ShimCometConf {
val SCAN_NATIVE_COMET = "native_comet"
val SCAN_NATIVE_DATAFUSION = "native_datafusion"
val SCAN_NATIVE_ICEBERG_COMPAT = "native_iceberg_compat"
val SCAN_AUTO = "auto"

val COMET_NATIVE_SCAN_IMPL: ConfigEntry[String] = conf("spark.comet.scan.impl")
.doc(
@@ -99,7 +100,8 @@ object CometConf extends ShimCometConf {
.internal()
.stringConf
.transform(_.toLowerCase(Locale.ROOT))
.checkValues(Set(SCAN_NATIVE_COMET, SCAN_NATIVE_DATAFUSION, SCAN_NATIVE_ICEBERG_COMPAT))
.checkValues(
Set(SCAN_NATIVE_COMET, SCAN_NATIVE_DATAFUSION, SCAN_NATIVE_ICEBERG_COMPAT, SCAN_AUTO))
.createWithDefault(sys.env
.getOrElse("COMET_PARQUET_SCAN_IMPL", SCAN_NATIVE_COMET)
.toLowerCase(Locale.ROOT))
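The validation behavior in the `CometConf` change above can be sketched in plain Scala. This is a hypothetical standalone version for illustration only; the real code uses Comet's `ConfigEntry` builder with `.transform` and `.checkValues`, and the names `validScanImpls` and `normalizeScanImpl` are invented here:

```scala
import java.util.Locale

// Allowed scan implementations, now including the new "auto" mode.
val validScanImpls: Set[String] =
  Set("native_comet", "native_datafusion", "native_iceberg_compat", "auto")

// Mirrors .transform(_.toLowerCase(Locale.ROOT)) followed by .checkValues(...):
// lower-case the supplied value, then reject anything outside the allowed set.
def normalizeScanImpl(value: String): String = {
  val normalized = value.toLowerCase(Locale.ROOT)
  require(validScanImpls.contains(normalized), s"Invalid scan implementation: $value")
  normalized
}
```

Because the transform runs before the check, mixed-case values such as `"AUTO"` are accepted and normalized rather than rejected.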
17 changes: 11 additions & 6 deletions docs/source/user-guide/compatibility.md
@@ -29,12 +29,6 @@ Comet aims to provide consistent results with the version of Apache Spark that i

This guide offers information about areas of functionality where there are known differences.

# Compatibility Guide

Comet aims to provide consistent results with the version of Apache Spark that is being used.

This guide offers information about areas of functionality where there are known differences.

Review comment (Member, Author): This section appeared twice

## Parquet Scans

Comet currently has three distinct implementations of the Parquet scan operator. The configuration property
@@ -56,6 +50,8 @@ implementation:

The new scans currently have the following limitations:

Issues common to both `native_datafusion` and `native_iceberg_compat`:

- When reading Parquet files written by systems other than Spark that contain columns with the logical types `UINT_8`
or `UINT_16`, Comet will produce different results than Spark because Spark does not preserve or understand these
logical types. Arrow-based readers, such as DataFusion and Comet, do respect these types and read the data as unsigned
@@ -65,10 +61,19 @@ types (regardless of the logical type). This behavior can be disabled by setting
- Reading legacy INT96 timestamps contained within complex types can produce different results from Spark
- There is a known performance issue when pushing filters down to Parquet. See the [Comet Tuning Guide] for more
information.
- Reading maps containing complex types can result in errors or incorrect results [#1754]
- `PARQUET_FIELD_ID_READ_ENABLED` is not respected [#1758]
- There are failures in the Spark SQL test suite when enabling these new scans (tracking issues: [#1542] and [#1545]).

Issues specific to `native_datafusion`:

- Bucketed scans are not supported
- No support for row indexes

[#1545]: https://github.com/apache/datafusion-comet/issues/1545
[#1542]: https://github.com/apache/datafusion-comet/issues/1542
[#1754]: https://github.com/apache/datafusion-comet/issues/1754
[#1758]: https://github.com/apache/datafusion-comet/issues/1758
[Comet Tuning Guide]: tuning.md
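The `UINT_8`/`UINT_16` difference described in the limitations above comes down to signed versus unsigned interpretation of the same physical bytes. A minimal illustration (not Comet code): the byte `0xFF` reads as `-1` under Spark's signed view but as `255` when the unsigned logical type is honored, as Arrow-based readers do.

```scala
// The same physical byte, interpreted two ways.
val physicalByte: Byte = 0xFF.toByte

// Spark-style: the logical UINT_8 annotation is ignored, so the byte is signed.
val signedValue: Int = physicalByte.toInt

// Arrow-style: the unsigned logical type is respected, yielding 0..255.
val unsignedValue: Int = physicalByte & 0xFF
```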

## ANSI mode
17 changes: 11 additions & 6 deletions docs/templates/compatibility-template.md
@@ -29,12 +29,6 @@ Comet aims to provide consistent results with the version of Apache Spark that i

This guide offers information about areas of functionality where there are known differences.

# Compatibility Guide

Comet aims to provide consistent results with the version of Apache Spark that is being used.

This guide offers information about areas of functionality where there are known differences.

## Parquet Scans

Comet currently has three distinct implementations of the Parquet scan operator. The configuration property
@@ -56,6 +50,8 @@ implementation:

The new scans currently have the following limitations:

Issues common to both `native_datafusion` and `native_iceberg_compat`:

- When reading Parquet files written by systems other than Spark that contain columns with the logical types `UINT_8`
or `UINT_16`, Comet will produce different results than Spark because Spark does not preserve or understand these
logical types. Arrow-based readers, such as DataFusion and Comet, do respect these types and read the data as unsigned
@@ -65,10 +61,19 @@ The new scans currently have the following limitations:
- Reading legacy INT96 timestamps contained within complex types can produce different results from Spark
- There is a known performance issue when pushing filters down to Parquet. See the [Comet Tuning Guide] for more
information.
- Reading maps containing complex types can result in errors or incorrect results [#1754]
- `PARQUET_FIELD_ID_READ_ENABLED` is not respected [#1758]
- There are failures in the Spark SQL test suite when enabling these new scans (tracking issues: [#1542] and [#1545]).

Issues specific to `native_datafusion`:

- Bucketed scans are not supported
- No support for row indexes

[#1545]: https://github.com/apache/datafusion-comet/issues/1545
[#1542]: https://github.com/apache/datafusion-comet/issues/1542
[#1754]: https://github.com/apache/datafusion-comet/issues/1754
[#1758]: https://github.com/apache/datafusion-comet/issues/1758
[Comet Tuning Guide]: tuning.md

## ANSI mode
28 changes: 24 additions & 4 deletions spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala
@@ -93,21 +93,41 @@ case class CometScanRule(session: SparkSession) extends Rule[SparkPlan] {
return withInfos(scanExec, fallbackReasons.toSet)
}

val scanImpl = COMET_NATIVE_SCAN_IMPL.get()
if (scanImpl == CometConf.SCAN_NATIVE_DATAFUSION && !COMET_EXEC_ENABLED.get()) {
var scanImpl = COMET_NATIVE_SCAN_IMPL.get()
Review comment (Contributor): Should we also log a message to indicate that auto scan was enabled (and the scan implementation it selected)?

Reply (Member, Author): Yes, that is a good idea. I will add that today.
// if scan is auto then pick best available scan
if (scanImpl == SCAN_AUTO) {
val typeChecker = CometScanTypeChecker(SCAN_NATIVE_DATAFUSION)
val schemaSupported =
typeChecker.isSchemaSupported(scanExec.requiredSchema, fallbackReasons)
val partitionSchemaSupported =
typeChecker.isSchemaSupported(r.partitionSchema, fallbackReasons)

// TODO these checks are not yet exhaustive. For example, native_datafusion does
// not support reading from object stores such as S3 yet

if (COMET_EXEC_ENABLED
.get() && schemaSupported && partitionSchemaSupported && !scanExec.bucketedScan) {
scanImpl = SCAN_NATIVE_DATAFUSION
} else {
scanImpl = SCAN_NATIVE_COMET
}
}

if (scanImpl == SCAN_NATIVE_DATAFUSION && !COMET_EXEC_ENABLED.get()) {
fallbackReasons +=
s"Full native scan disabled because ${COMET_EXEC_ENABLED.key} disabled"
return withInfos(scanExec, fallbackReasons.toSet)
}

if (scanImpl == CometConf.SCAN_NATIVE_DATAFUSION && scanExec.bucketedScan) {
if (scanImpl == SCAN_NATIVE_DATAFUSION && scanExec.bucketedScan) {
// https://github.com/apache/datafusion-comet/issues/1719
fallbackReasons +=
"Full native scan disabled because bucketed scan is not supported"
return withInfos(scanExec, fallbackReasons.toSet)
}

val typeChecker = new CometScanTypeChecker(scanImpl)
val typeChecker = CometScanTypeChecker(scanImpl)
val schemaSupported =
typeChecker.isSchemaSupported(scanExec.requiredSchema, fallbackReasons)
val partitionSchemaSupported =
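The auto-mode decision added in `CometScanRule` above can be condensed into a pure function for illustration. This is a simplified sketch, not the actual Comet code: the real implementation computes schema support via `CometScanTypeChecker` against the scan's required and partition schemas, collects fallback reasons, and (as the PR notes) the checks are not yet exhaustive.

```scala
// Simplified sketch of auto-mode scan selection: prefer native_datafusion only
// when full native exec is enabled, both the required schema and the partition
// schema are supported, and the scan is not bucketed; otherwise fall back to
// the default native_comet scan.
def selectScanImpl(
    execEnabled: Boolean,
    schemaSupported: Boolean,
    partitionSchemaSupported: Boolean,
    bucketedScan: Boolean): String =
  if (execEnabled && schemaSupported && partitionSchemaSupported && !bucketedScan)
    "native_datafusion"
  else
    "native_comet"
```

Note that the fallback branch deliberately chooses `native_comet` rather than failing, so `auto` always resolves to a working scan implementation.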