Skip to content

Commit 33ecff3

Browse files
committed
Initial commit of q22 in tph-c, global sales opportunity
1 parent 7497829 commit 33ecff3

File tree

1 file changed

+71
-0
lines changed

1 file changed

+71
-0
lines changed
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
"""
19+
This query counts how many customers within a specific range of country codes have not placed
20+
orders for 7 years but who have a greater than average “positive” account balance. It also reflects
21+
the magnitude of that balance. Country code is defined as the first two characters of c_phone.
22+
"""
23+
24+
from datafusion import SessionContext, WindowFrame, col, lit, functions as F
25+
26+
NATION_CODE = 13
27+
28+
# Load the dataframes we need
29+
30+
ctx = SessionContext()
31+
32+
df_customer = ctx.read_parquet("data/customer.parquet").select_columns(
33+
"c_phone", "c_acctbal", "c_custkey"
34+
)
35+
df_orders = ctx.read_parquet("data/orders.parquet").select_columns("o_custkey")
36+
37+
# The nation code is a two digit number, but we need to convert it to a string literal
38+
nation_code = lit(str(NATION_CODE))
39+
40+
# Use the substring operation to extract the first two charaters of the phone number
41+
df = df_customer.with_column("cntrycode", F.substr(col("c_phone"), lit(0), lit(3)))
42+
43+
# Limit our search to customers with some balance and in the country code above
44+
df = df.filter(col("c_acctbal") > lit(0.0))
45+
df = df.filter(nation_code == col("cntrycode"))
46+
47+
# Compute the average balance. By default, the window frame is from unbounded preceeding to the
48+
# current row. We want our frame to cover the entire data frame.
49+
window_frame = WindowFrame("rows", None, None)
50+
df = df.with_column(
51+
"avg_balance", F.window("avg", [col("c_acctbal")], window_frame=window_frame)
52+
)
53+
54+
# Limit results to customers with above average balance
55+
df = df.filter(col("c_acctbal") > col("avg_balance"))
56+
57+
# Limit results to customers with no orders
58+
df = df.join(df_orders, (["c_custkey"], ["o_custkey"]), "anti")
59+
60+
# Count up the customers and the balances
61+
df = df.aggregate(
62+
[col("cntrycode")],
63+
[
64+
F.count(col("c_custkey")).alias("numcust"),
65+
F.sum(col("c_acctbal")).alias("totacctbal"),
66+
],
67+
)
68+
69+
df = df.sort(col("cntrycode").sort())
70+
71+
df.show()

0 commit comments

Comments
 (0)