Skip to content

Commit 7a20e39

Browse files
committed
Initial commit of q20 in tph-c, potential part promotion
1 parent c9ca2ba commit 7a20e39

File tree

1 file changed

+92
-0
lines changed

1 file changed

+92
-0
lines changed
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
"""
19+
The Potential Part Promotion query identifies suppliers who have an excess of a given part
20+
available; an excess is defined to be more than 50% of the parts like the given part that the
21+
supplier shipped in a given year for a given nation. Only parts whose names share a certain naming
22+
convention are considered.
23+
"""
24+
25+
from datetime import datetime
26+
import pyarrow as pa
27+
from datafusion import SessionContext, col, lit, functions as F
28+
29+
COLOR_OF_INTEREST = "forest"
30+
DATE_OF_INTEREST = "1994-01-01"
31+
NATION_OF_INTEREST = "CANADA"
32+
33+
# Load the dataframes we need
34+
35+
ctx = SessionContext()
36+
37+
df_part = ctx.read_parquet("data/part.parquet").select_columns("p_partkey", "p_name")
38+
df_lineitem = ctx.read_parquet("data/lineitem.parquet").select_columns(
39+
"l_shipdate", "l_partkey", "l_suppkey", "l_quantity"
40+
)
41+
df_partsupp = ctx.read_parquet("data/partsupp.parquet").select_columns(
42+
"ps_partkey", "ps_suppkey", "ps_availqty"
43+
)
44+
df_supplier = ctx.read_parquet("data/supplier.parquet").select_columns(
45+
"s_suppkey", "s_address", "s_name", "s_nationkey"
46+
)
47+
df_nation = ctx.read_parquet("data/nation.parquet").select_columns(
48+
"n_nationkey", "n_name"
49+
)
50+
51+
date = datetime.strptime(DATE_OF_INTEREST, "%Y-%m-%d").date()
52+
53+
# Note: this is a hack on setting the values. It should be set differently once
54+
# https://github.com/apache/datafusion-python/issues/665 is resolved.
55+
interval = pa.scalar((0, 0, 365), type=pa.month_day_nano_interval())
56+
57+
# Filter down dataframes
58+
df_nation = df_nation.filter(col("n_name") == lit(NATION_OF_INTEREST))
59+
df_part = df_part.filter(
60+
F.substr(col("p_name"), lit(0), lit(len(COLOR_OF_INTEREST) + 1))
61+
== lit(COLOR_OF_INTEREST)
62+
)
63+
64+
df = df_lineitem.filter(col("l_shipdate") >= lit(date)).filter(
65+
col("l_shipdate") < lit(date) + lit(interval)
66+
)
67+
68+
# This will filter down the line items to the parts of interest
69+
df = df.join(df_part, (["l_partkey"], ["p_partkey"]), "inner")
70+
71+
# Compute the total sold and limit ourselves to indivdual supplier/part combinations
72+
df = df.aggregate(
73+
[col("l_partkey"), col("l_suppkey")], [F.sum(col("l_quantity")).alias("total_sold")]
74+
)
75+
76+
df = df.join(
77+
df_partsupp, (["l_partkey", "l_suppkey"], ["ps_partkey", "ps_suppkey"]), "inner"
78+
)
79+
80+
# Find cases of excess quantity
81+
df.filter(col("ps_availqty") > lit(0.5) * col("total_sold"))
82+
83+
# We could do these joins earlier, but now limit to the nation of interest suppliers
84+
df = df.join(df_supplier, (["ps_suppkey"], ["s_suppkey"]), "inner")
85+
df = df.join(df_nation, (["s_nationkey"], ["n_nationkey"]), "inner")
86+
87+
# Restrict to the requested data per the problem statement
88+
df = df.select_columns("s_name", "s_address")
89+
90+
df = df.sort(col("s_name").sort())
91+
92+
df.show()

0 commit comments

Comments
 (0)