apache · andygrove · May 21, 2024 · May 16, 2024 · May 21, 2024 · May 21, 2024
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -111,3 +111,23 @@ jobs:
           source venv/bin/activate
           pip install -e . -vv
           pytest -v .
+
+      - name: Cache the generated dataset
+        id: cache-tpch-dataset
+        uses: actions/cache@v3
+        with:
+          path: benchmarks/tpch/data
+          key: tpch-data-2.18.0
+
+      - name: Run dbgen to create 1 GB dataset
+        if: ${{ steps.cache-tpch-dataset.outputs.cache-hit != 'true' }}
+        run: |
+          cd benchmarks/tpch
+          RUN_IN_CI=TRUE ./tpch-gen.sh 1
+
+      - name: Run TPC-H examples
+        run: |
+          source venv/bin/activate
+          cd examples/tpch
+          python convert_data_to_parquet.py
+          pytest _tests.py
diff --git a/benchmarks/tpch/tpch-gen.sh b/benchmarks/tpch/tpch-gen.sh
@@ -20,6 +20,15 @@ mkdir -p data/answers 2>/dev/null
 
 set -e
 
+# When RUN_IN_CI is set: no interactive terminal for docker and no verbose generator output
+if [[ -n "${RUN_IN_CI}" ]]; then
+  TERMINAL_FLAG=""
+  VERBOSE_OUTPUT="-f"
+else
+  TERMINAL_FLAG="-it"
+  VERBOSE_OUTPUT="-vf"
+fi
+
 #pushd ..
 #. ./dev/build-set-env.sh
 #popd
@@ -29,7 +38,7 @@ FILE=./data/supplier.tbl
 if test -f "$FILE"; then
     echo "$FILE exists."
 else
-  docker run -v `pwd`/data:/data -it --rm ghcr.io/scalytics/tpch-docker:main -vf -s $1
+  docker run -v `pwd`/data:/data $TERMINAL_FLAG --rm ghcr.io/scalytics/tpch-docker:main $VERBOSE_OUTPUT -s $1
 
   # workaround for https://github.com/apache/arrow-datafusion/issues/6147
   mv data/customer.tbl data/customer.csv
@@ -49,5 +58,5 @@ FILE=./data/answers/q1.out
 if test -f "$FILE"; then
     echo "$FILE exists."
 else
-  docker run -v `pwd`/data:/data -it --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
+  docker run -v `pwd`/data:/data $TERMINAL_FLAG --entrypoint /bin/bash --rm ghcr.io/scalytics/tpch-docker:main -c "cp /opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
 fi
diff --git a/examples/tpch/_tests.py b/examples/tpch/_tests.py
@@ -0,0 +1,111 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+from importlib import import_module
+import pyarrow as pa
+from datafusion import col, lit, functions as F
+from util import get_answer_file
+
+def df_selection(col_name, col_type):
+    """Build the query-side projection: round float64/decimal to 2 places, trim strings."""
+    column = col(col_name)
+    if col_type == pa.float64() or isinstance(col_type, pa.Decimal128Type):
+        return F.round(column, lit(2)).alias(col_name)
+    is_text = col_type == pa.string()
+    return F.trim(column).alias(col_name) if is_text else column
+
+def load_schema(col_name, col_type):
+    """Return (name, type) for reading the answer file: ints as string, decimals as float64."""
+    if col_type in (pa.int64(), pa.int32()):
+        read_type = pa.string()
+    else:
+        read_type = pa.float64() if isinstance(col_type, pa.Decimal128Type) else col_type
+    return col_name, read_type
+
+def expected_selection(col_name, col_type):
+    """Build the answer-side projection: trim strings, and cast trimmed ints to their type."""
+    column = col(col_name)
+    if col_type in (pa.int64(), pa.int32()):
+        return F.trim(column).cast(col_type).alias(col_name)
+    trim_only = col_type == pa.string()
+    return F.trim(column).alias(col_name) if trim_only else column
+
+def selections_and_schema(original_schema):
+    """Derive query projections, answer-file read schema and answer projections per column."""
+    typed = [(name, original_schema.field(name).type) for name in original_schema.names]
+    query_exprs = [df_selection(name, dtype) for name, dtype in typed]
+    read_fields = [load_schema(name, dtype) for name, dtype in typed]
+    answer_exprs = [expected_selection(name, dtype) for name, dtype in typed]
+
+    return query_exprs, read_fields, answer_exprs
+
+def check_q17(df):
+    """Assert that q17's avg_yearly, rounded to 2 places, is 348406.05 (within 0.001)."""
+    avg_yearly = round(float(df.collect()[0]["avg_yearly"][0].as_py()), 2)
+    assert abs(avg_yearly - 348406.05) < 0.001
+
+@pytest.mark.parametrize(
+    ("query_code", "answer_file"),
+    [
+        ("q01_pricing_summary_report", "q1"),
+        ("q02_minimum_cost_supplier", "q2"),
+        ("q03_shipping_priority", "q3"),
+        ("q04_order_priority_checking", "q4"),
+        ("q05_local_supplier_volume", "q5"),
+        ("q06_forecasting_revenue_change", "q6"),
+        ("q07_volume_shipping", "q7"),
+        ("q08_market_share", "q8"),
+        ("q09_product_type_profit_measure", "q9"),
+        ("q10_returned_item_reporting", "q10"),
+        ("q11_important_stock_identification", "q11"),
+        ("q12_ship_mode_order_priority", "q12"),
+        ("q13_customer_distribution", "q13"),
+        ("q14_promotion_effect", "q14"),
+        ("q15_top_supplier", "q15"),
+        ("q16_part_supplier_relationship", "q16"),
+        ("q17_small_quantity_order", "q17"),
+        ("q18_large_volume_customer", "q18"),
+        ("q19_discounted_revenue", "q19"),
+        ("q20_potential_part_promotion", "q20"),
+        ("q21_suppliers_kept_orders_waiting", "q21"),
+        ("q22_global_sales_opportunity", "q22"),
+    ],
+)
+def test_tpch_query_vs_answer_file(query_code: str, answer_file: str):
+    """Run one TPC-H example query and compare its rows with the reference answer file."""
+    query_module = import_module(query_code)
+    actual = query_module.df
+
+    # q17 is special-cased because its answer file does not match the spec. The value
+    # checked instead was manually verified against a run at scale factor 1.
+    if answer_file == "q17":
+        return check_q17(actual)
+
+    query_exprs, read_fields, answer_exprs = selections_and_schema(actual.schema())
+    actual = actual.select(*query_exprs)
+
+    # Answer files are read as "|"-delimited CSV with a ".out" extension.
+    read_schema = pa.schema(read_fields)
+    expected = query_module.ctx.read_csv(
+        get_answer_file(answer_file), schema=read_schema, delimiter="|", file_extension=".out"
+    ).select(*answer_exprs)
+
+    # Every query row must have a match in the answers, and the row counts must agree.
+    key_names = list(read_schema.names)
+    assert actual.join(expected, (key_names, key_names), "anti").count() == 0
+    assert actual.count() == expected.count()
diff --git a/examples/tpch/convert_data_to_parquet.py b/examples/tpch/convert_data_to_parquet.py
@@ -36,7 +36,7 @@
     ("C_ADDRESS", pyarrow.string()),
     ("C_NATIONKEY", pyarrow.int32()),
     ("C_PHONE", pyarrow.string()),
-    ("C_ACCTBAL", pyarrow.float32()),
+    ("C_ACCTBAL", pyarrow.decimal128(15, 2)),
     ("C_MKTSEGMENT", pyarrow.string()),
     ("C_COMMENT", pyarrow.string()),
 ]
@@ -46,10 +46,10 @@
     ("L_PARTKEY", pyarrow.int32()),
     ("L_SUPPKEY", pyarrow.int32()),
     ("L_LINENUMBER", pyarrow.int32()),
-    ("L_QUANTITY", pyarrow.float32()),
-    ("L_EXTENDEDPRICE", pyarrow.float32()),
-    ("L_DISCOUNT", pyarrow.float32()),
-    ("L_TAX", pyarrow.float32()),
+    ("L_QUANTITY", pyarrow.decimal128(15, 2)),
+    ("L_EXTENDEDPRICE", pyarrow.decimal128(15, 2)),
+    ("L_DISCOUNT", pyarrow.decimal128(15, 2)),
+    ("L_TAX", pyarrow.decimal128(15, 2)),
     ("L_RETURNFLAG", pyarrow.string()),
     ("L_LINESTATUS", pyarrow.string()),
     ("L_SHIPDATE", pyarrow.date32()),
@@ -71,7 +71,7 @@
     ("O_ORDERKEY", pyarrow.int32()),
     ("O_CUSTKEY", pyarrow.int32()),
     ("O_ORDERSTATUS", pyarrow.string()),
-    ("O_TOTALPRICE", pyarrow.float32()),
+    ("O_TOTALPRICE", pyarrow.decimal128(15, 2)),
     ("O_ORDERDATE", pyarrow.date32()),
     ("O_ORDERPRIORITY", pyarrow.string()),
     ("O_CLERK", pyarrow.string()),
@@ -87,15 +87,15 @@
     ("P_TYPE", pyarrow.string()),
     ("P_SIZE", pyarrow.int32()),
     ("P_CONTAINER", pyarrow.string()),
-    ("P_RETAILPRICE", pyarrow.float32()),
+    ("P_RETAILPRICE", pyarrow.decimal128(15, 2)),
     ("P_COMMENT", pyarrow.string()),
 ]
 
 all_schemas["partsupp"] = [
     ("PS_PARTKEY", pyarrow.int32()),
     ("PS_SUPPKEY", pyarrow.int32()),
     ("PS_AVAILQTY", pyarrow.int32()),
-    ("PS_SUPPLYCOST", pyarrow.float32()),
+    ("PS_SUPPLYCOST", pyarrow.decimal128(15, 2)),
     ("PS_COMMENT", pyarrow.string()),
 ]
 
@@ -111,7 +111,7 @@
     ("S_ADDRESS", pyarrow.string()),
     ("S_NATIONKEY", pyarrow.int32()),
     ("S_PHONE", pyarrow.string()),
-    ("S_ACCTBAL", pyarrow.float32()),
+    ("S_ACCTBAL", pyarrow.decimal128(15, 2)),
     ("S_COMMENT", pyarrow.string()),
 ]
 

diff --git a/examples/tpch/q01_pricing_summary_report.py b/examples/tpch/q01_pricing_summary_report.py
@@ -31,10 +31,11 @@
 
 import pyarrow as pa
 from datafusion import SessionContext, col, lit, functions as F
+from util import get_data_path
 
 ctx = SessionContext()
 
-df = ctx.read_parquet("data/lineitem.parquet")
+df = ctx.read_parquet(get_data_path("lineitem.parquet"))
 
 # It may be that the date can be hard coded, based on examples shown.
 # This approach will work with any date range in the provided data set.
@@ -45,7 +46,7 @@
 
 # From the given problem, this is how close to the last date in the database we
 # want to report results for. It should be between 60-120 days before the end.
-DAYS_BEFORE_FINAL = 68
+DAYS_BEFORE_FINAL = 90
 
 # Note: this is a hack on setting the values. It should be set differently once
 # https://github.com/apache/datafusion-python/issues/665 is resolved.
@@ -63,13 +64,13 @@
     [
         F.sum(col("l_quantity")).alias("sum_qty"),
         F.sum(col("l_extendedprice")).alias("sum_base_price"),
-        F.sum(col("l_extendedprice") * (lit(1.0) - col("l_discount"))).alias(
+        F.sum(col("l_extendedprice") * (lit(1) - col("l_discount"))).alias(
             "sum_disc_price"
         ),
         F.sum(
             col("l_extendedprice")
-            * (lit(1.0) - col("l_discount"))
-            * (lit(1.0) + col("l_tax"))
+            * (lit(1) - col("l_discount"))
+            * (lit(1) + col("l_tax"))
         ).alias("sum_charge"),
         F.avg(col("l_quantity")).alias("avg_qty"),
         F.avg(col("l_extendedprice")).alias("avg_price"),

diff --git a/examples/tpch/q02_minimum_cost_supplier.py b/examples/tpch/q02_minimum_cost_supplier.py
@@ -31,8 +31,10 @@
 
 import datafusion
 from datafusion import SessionContext, col, lit, functions as F
+from util import get_data_path
 
-# This is the part we're looking for
+# This is the part we're looking for. Values selected here differ from the spec in order to run
+# unit tests on a small data set.
 SIZE_OF_INTEREST = 15
 TYPE_OF_INTEREST = "BRASS"
 REGION_OF_INTEREST = "EUROPE"
@@ -41,10 +43,10 @@
 
 ctx = SessionContext()
 
-df_part = ctx.read_parquet("data/part.parquet").select_columns(
+df_part = ctx.read_parquet(get_data_path("part.parquet")).select_columns(
     "p_partkey", "p_mfgr", "p_type", "p_size"
 )
-df_supplier = ctx.read_parquet("data/supplier.parquet").select_columns(
+df_supplier = ctx.read_parquet(get_data_path("supplier.parquet")).select_columns(
     "s_acctbal",
     "s_name",
     "s_address",
@@ -53,13 +55,13 @@
     "s_nationkey",
     "s_suppkey",
 )
-df_partsupp = ctx.read_parquet("data/partsupp.parquet").select_columns(
+df_partsupp = ctx.read_parquet(get_data_path("partsupp.parquet")).select_columns(
     "ps_partkey", "ps_suppkey", "ps_supplycost"
 )
-df_nation = ctx.read_parquet("data/nation.parquet").select_columns(
+df_nation = ctx.read_parquet(get_data_path("nation.parquet")).select_columns(
     "n_nationkey", "n_regionkey", "n_name"
 )
-df_region = ctx.read_parquet("data/region.parquet").select_columns(
+df_region = ctx.read_parquet(get_data_path("region.parquet")).select_columns(
     "r_regionkey", "r_name"
 )
 

diff --git a/examples/tpch/q03_shipping_priority.py b/examples/tpch/q03_shipping_priority.py
@@ -28,6 +28,7 @@
 """
 
 from datafusion import SessionContext, col, lit, functions as F
+from util import get_data_path
 
 SEGMENT_OF_INTEREST = "BUILDING"
 DATE_OF_INTEREST = "1995-03-15"
@@ -36,13 +37,13 @@
 
 ctx = SessionContext()
 
-df_customer = ctx.read_parquet("data/customer.parquet").select_columns(
+df_customer = ctx.read_parquet(get_data_path("customer.parquet")).select_columns(
     "c_mktsegment", "c_custkey"
 )
-df_orders = ctx.read_parquet("data/orders.parquet").select_columns(
+df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
     "o_orderdate", "o_shippriority", "o_custkey", "o_orderkey"
 )
-df_lineitem = ctx.read_parquet("data/lineitem.parquet").select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
     "l_orderkey", "l_extendedprice", "l_discount", "l_shipdate"
 )
 
@@ -73,9 +74,9 @@
 
 df = df.sort(col("revenue").sort(ascending=False), col("o_orderdate").sort())
 
-# Only return 100 results
+# Only return 10 results
 
-df = df.limit(100)
+df = df.limit(10)
 
 # Change the order that the columns are reported in just to match the spec
 

diff --git a/examples/tpch/q04_order_priority_checking.py b/examples/tpch/q04_order_priority_checking.py
@@ -29,6 +29,7 @@
 from datetime import datetime
 import pyarrow as pa
 from datafusion import SessionContext, col, lit, functions as F
+from util import get_data_path
 
 # Ideally we could put 3 months into the interval. See note below.
 INTERVAL_DAYS = 92
@@ -38,10 +39,10 @@
 
 ctx = SessionContext()
 
-df_orders = ctx.read_parquet("data/orders.parquet").select_columns(
+df_orders = ctx.read_parquet(get_data_path("orders.parquet")).select_columns(
     "o_orderdate", "o_orderpriority", "o_orderkey"
 )
-df_lineitem = ctx.read_parquet("data/lineitem.parquet").select_columns(
+df_lineitem = ctx.read_parquet(get_data_path("lineitem.parquet")).select_columns(
     "l_orderkey", "l_commitdate", "l_receiptdate"
 )