
feat: Add aggregate expression fuzz testing in CI #1374


Status: Draft
Wants to merge 11 commits into main
Changes from 7 commits
17 changes: 16 additions & 1 deletion .github/workflows/pr_build.yml
@@ -75,6 +75,8 @@ jobs:
maven_opts: -Pspark-${{ matrix.spark-version }},scala-${{ matrix.scala-version }}
# upload test reports only for java 17
upload-test-reports: ${{ matrix.java_version == '17' }}
env:
COMET_FUZZ_TEST: "true"

linux-test-with-spark4_0:
strategy:
@@ -102,6 +104,8 @@ jobs:
with:
maven_opts: -Pspark-${{ matrix.spark-version }}
upload-test-reports: true
env:
COMET_FUZZ_TEST: "true"

linux-test-with-old-spark:
strategy:
@@ -127,6 +131,8 @@
uses: ./.github/actions/java-test
with:
maven_opts: -Pspark-${{ matrix.spark-version }},scala-${{ matrix.scala-version }}
env:
COMET_FUZZ_TEST: "true"

macos-test:
strategy:
@@ -155,6 +161,8 @@ jobs:
uses: ./.github/actions/java-test
with:
maven_opts: -Pspark-${{ matrix.spark-version }},scala-${{ matrix.scala-version }}
env:
COMET_FUZZ_TEST: "true"

macos-aarch64-test:
strategy:
@@ -188,6 +196,8 @@ jobs:
uses: ./.github/actions/java-test
with:
maven_opts: -Pspark-${{ matrix.spark-version }},scala-${{ matrix.scala-version }}
env:
COMET_FUZZ_TEST: "true"

macos-test-with-spark4_0:
strategy:
@@ -212,6 +222,8 @@
with:
maven_opts: -Pspark-${{ matrix.spark-version }}
upload-test-reports: true
env:
COMET_FUZZ_TEST: "true"

macos-aarch64-test-with-spark4_0:
strategy:
@@ -241,6 +253,8 @@ jobs:
with:
maven_opts: -Pspark-${{ matrix.spark-version }}
upload-test-reports: true
env:
COMET_FUZZ_TEST: "true"

macos-aarch64-test-with-old-spark:
strategy:
@@ -269,4 +283,5 @@ jobs:
uses: ./.github/actions/java-test
with:
maven_opts: -Pspark-${{ matrix.spark-version }},scala-${{ matrix.scala-version }}

env:
COMET_FUZZ_TEST: "true"
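Each of these jobs now exports COMET_FUZZ_TEST, which the new suite below keys off. A minimal ScalaTest sketch of that gating pattern (class and test names hypothetical):

import org.scalatest.funsuite.AnyFunSuite

class EnvGatedSuite extends AnyFunSuite {

  private val fuzzTestEnabled: Boolean = sys.env.contains("COMET_FUZZ_TEST")

  test("fuzz-only check") {
    // assume() cancels rather than fails the test when the variable is absent,
    // so builds without COMET_FUZZ_TEST stay green.
    assume(fuzzTestEnabled)
    // ... fuzzing assertions go here ...
  }
}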
ParquetGenerator.scala
@@ -212,8 +212,8 @@ object ParquetGenerator {
}

 case class DataGenOptions(
-    allowNull: Boolean,
-    generateNegativeZero: Boolean,
-    generateArray: Boolean,
-    generateStruct: Boolean,
-    generateMap: Boolean)
+    allowNull: Boolean = true,
+    generateNegativeZero: Boolean = true,
+    generateArray: Boolean = false,
+    generateStruct: Boolean = false,
+    generateMap: Boolean = false)
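With defaults in place, callers that want the plain configuration can simply write DataGenOptions(), as the new suite below does. A usage sketch (both call sites hypothetical):

// All defaults: nulls and negative zero enabled, complex types disabled.
val basic = DataGenOptions()

// Complex types remain opt-in per test.
val nested = DataGenOptions(generateArray = true, generateStruct = true, generateMap = true)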
165 changes: 165 additions & 0 deletions spark/src/test/scala/org/apache/comet/CometFuzzTestSuite.scala
@@ -0,0 +1,165 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.comet

import scala.util.Random

import org.apache.hadoop.fs.Path
import org.apache.spark.sql.CometTestBase
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
import org.apache.spark.sql.types._

import org.apache.comet.testing.{DataGenOptions, ParquetGenerator}

class CometFuzzTestSuite extends CometTestBase with AdaptiveSparkPlanHelper {

private val fuzzTestEnabled: Boolean = sys.env.contains("COMET_FUZZ_TEST")

test("aggregates") {
assume(fuzzTestEnabled)
withTempDir { dir =>
val path = new Path(dir.toURI.toString, "test.parquet")
val filename = path.toString
val random = new Random(42)
withSQLConf(CometConf.COMET_ENABLED.key -> "false") {
ParquetGenerator.makeParquetFile(random, spark, filename, 10000, DataGenOptions())
}
val table = spark.read.parquet(filename).coalesce(1)
table.createOrReplaceTempView("t1")

val groupingFields: Array[StructField] = table.schema.fields.filterNot(isNumeric)

// test grouping by each non-numeric column, grouping by all non-numeric columns, and no grouping
val groupByIndividualCols: Seq[Seq[String]] = groupingFields.map(f => Seq(f.name)).toSeq
val groupByAllCols: Seq[Seq[String]] = Seq(groupingFields.map(_.name).toSeq)
val noGroup: Seq[Seq[String]] = Seq(Seq.empty)
val groupings: Seq[Seq[String]] = groupByIndividualCols ++ groupByAllCols ++ noGroup
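// Illustration with hypothetical non-numeric columns c9 and c10: groupings ==
// Seq(Seq("c9"), Seq("c10"), Seq("c9", "c10"), Seq.empty), i.e. each column alone,
// all columns together, and a global aggregate with no GROUP BY.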

val scanTypes = Seq(
  CometConf.SCAN_NATIVE_COMET
  /*CometConf.SCAN_NATIVE_DATAFUSION,
  CometConf.SCAN_NATIVE_ICEBERG_COMPAT*/ )

Contributor: Do we plan to have these scan implementations also added to the fuzz testing at some point?

Member Author: Yes. These are commented out for now because they fail due to timestamp issues.

for (scan <- scanTypes) {
for (shuffleMode <- Seq("auto", "jvm", "native")) {
withSQLConf(
CometConf.COMET_NATIVE_SCAN_IMPL.key -> scan,
CometConf.COMET_SHUFFLE_MODE.key -> shuffleMode) {
for (group <- groupings) {
for (agg <- Exprs.aggregate) {

// pick all compatible columns for all input args
val argFields: Seq[Array[StructField]] = agg.args.map(argType =>
table.schema.fields.filter(f => isMatch(f.dataType, argType)))

// For now, just pick the first compatible column for each argument; this should be
// randomized or expanded to test all combinations
val args: Seq[StructField] = argFields.map(_.head)

if (agg.name == "avg" && args.head.dataType.isInstanceOf[DecimalType]) {
  // skip known issue with avg on decimal inputs
} else {

val aggSql = s"${agg.name}(${args.map(_.name).mkString(",")})"

val sql = if (group.isEmpty) {
s"SELECT $aggSql FROM t1"
} else {
val groupCols = group.mkString(", ")
s"SELECT $groupCols, $aggSql FROM t1 GROUP BY $groupCols ORDER BY $groupCols"
}
println(sql)
checkSparkAnswerWithTol(sql)
// TODO check operators
}
}
}
}
}
}
}
}

private def isNumeric(field: StructField) = {
field.dataType match {
case _: ByteType | _: ShortType | _: IntegerType | _: LongType => true
case _: FloatType | _: DoubleType => true
case _: DecimalType => true
case _ => false
}
}

def isMatch(dt: DataType, at: ArgType): Boolean = {
at match {
case AnyType => true
case NumericType =>
dt match {
case _: ByteType | _: ShortType | _: IntegerType | _: LongType => true
case _: FloatType | _: DoubleType => true
case _: DecimalType => true
case _ => false
}
case OrderedTypes =>
// TODO exclude map or other complex types that contain maps
true
case _ => false
}
}
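// Illustration: isMatch(DecimalType(10, 2), NumericType) == true, while
// isMatch(StringType, NumericType) == false; OrderedTypes currently accepts everything.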

}

object Exprs {

/**
* Aggregate expressions. Note that `first` and `last` are excluded because they are
* non-deterministic.
*/
val aggregate: Seq[ExprMeta] = Seq(
ExprMeta("min", Seq(OrderedTypes)),
ExprMeta("max", Seq(OrderedTypes)),
ExprMeta("count", Seq(AnyType)),
ExprMeta("sum", Seq(NumericType)),
ExprMeta("avg", Seq(NumericType)),
ExprMeta("median", Seq(NumericType)),
ExprMeta("stddev", Seq(NumericType)),
ExprMeta("stddev_pop", Seq(NumericType)),
ExprMeta("stddev_samp", Seq(NumericType)),
ExprMeta("variance", Seq(NumericType)),
ExprMeta("var_pop", Seq(NumericType)),
ExprMeta("var_samp", Seq(NumericType)),
ExprMeta("corr", Seq(NumericType, NumericType)),
ExprMeta("covar_pop", Seq(NumericType, NumericType)),
ExprMeta("covar_samp", Seq(NumericType, NumericType)))
}

/** Metadata about a Spark expression */
case class ExprMeta(name: String, args: Seq[ArgType])

/** Represents the data type(s) that an argument accepts */
sealed trait ArgType

/** Supports any input type */
case object AnyType extends ArgType

/** Integral, floating-point, and decimal */
case object NumericType extends ArgType

/** Types that can be ordered. Includes struct and array but excludes maps. */
case object OrderedTypes extends ArgType
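For reference, a single iteration of the nested loops above produces SQL of the following shape (column names hypothetical, picked by the first-compatible-column rule):

// group = Seq("c9"), agg = ExprMeta("corr", Seq(NumericType, NumericType)) yields:
//   SELECT c9, corr(c2,c3) FROM t1 GROUP BY c9 ORDER BY c9
// while an empty group yields: SELECT corr(c2,c3) FROM t1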
41 changes: 39 additions & 2 deletions spark/src/test/scala/org/apache/spark/sql/CometTestBase.scala
@@ -116,12 +116,49 @@
require(absTol > 0 && absTol <= 1e-6, s"absTol $absTol is out of range (0, 1e-6]")

actualAnswer.toSeq.zip(expectedAnswer.toSeq).foreach {
case (actual: Float, expected: Float) =>
          def isPosInfinity(value: Float): Boolean = {
            // Float.MaxValue is treated like positive infinity here
            value.isPosInfinity || value == Float.MaxValue
          }

if ((actual.isNaN && expected.isNaN) ||
(isPosInfinity(actual) && isPosInfinity(expected)) ||
(actual.isNegInfinity && expected.isNegInfinity)) {
// ok
} else {

            def almostEqual(a: Double, b: Double, tolerance: Double = 1e-6): Boolean = {
              Math.abs(a - b) <= tolerance * Math.max(Math.abs(a), Math.abs(b))
            }

Contributor: Looks like there are multiple identical definitions of almostEqual?

Member Author: Yes, the code is hacky and experimental. Will clean up before marking as ready for review.

Member Author: Actually, the two implementations of almostEqual have different input types (float vs double).

assert(
almostEqual(actual, expected),
s"actual answer $actual not within $absTol of correct answer $expected")
}

case (actual: Double, expected: Double) =>
-         if (!actual.isNaN && !expected.isNaN) {
          def isPosInfinity(value: Double): Boolean = {
            // Float.MaxValue (widened) is treated like positive infinity here
            value.isPosInfinity || value == Float.MaxValue.toDouble
          }

if ((actual.isNaN && expected.isNaN) ||
(isPosInfinity(actual) && isPosInfinity(expected)) ||
(actual.isNegInfinity && expected.isNegInfinity)) {
// ok
} else {

            def almostEqual(a: Double, b: Double, tolerance: Double = 1e-6): Boolean = {
              Math.abs(a - b) <= tolerance * Math.max(Math.abs(a), Math.abs(b))
            }

Contributor: Same implementation as above

            assert(
-             math.abs(actual - expected) < absTol,
+             almostEqual(actual, expected),
              s"actual answer $actual not within $absTol of correct answer $expected")
}
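          // Example of the relative check: actual = 1000000.0 and expected = 1000000.5
          // pass, since |a - b| = 0.5 <= 1e-6 * 1000000.5, although the old absolute
          // check (|a - b| < 1e-6) would have failed.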

case (actual: Array[_], expected: Array[_]) =>
assert(actual.sameElements(expected), s"$actualAnswer did not equal $expectedAnswer")

case (actual, expected) =>
assert(actual == expected, s"$actualAnswer did not equal $expectedAnswer")
}