Commit 32bcb8b

New run duckdb 1.1 (#96)
Update many other libraries as well
1 parent e54b17f commit 32bcb8b

32 files changed: +5817 −46 lines changed

.github/workflows/regression.yml

Lines changed: 2 additions & 2 deletions
@@ -48,7 +48,7 @@ jobs:
 
       - name: Install all solutions
         shell: bash
-        run: source path.env && python3 _utils/install_all_solutions.py ${{ matrix.solution }}
+        run: source path.env && python3 _setup_utils/install_all_solutions.py ${{ matrix.solution }}
 
       - name: Turn swap off
         shell: bash
@@ -135,7 +135,7 @@ jobs:
 
       - name: Install all solutions
         shell: bash
-        run: source path.env && python3 _utils/install_all_solutions.py all
+        run: source path.env && python3 _setup_utils/install_all_solutions.py all
 
       - name: Turn swap off
         shell: bash
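
For a local run outside CI, the renamed installer can presumably be invoked the same way the workflow does, with a concrete solution name in place of the matrix variable (a sketch, not taken from the commit):

    source path.env && python3 _setup_utils/install_all_solutions.py duckdb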

R-arrow/VERSION

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+17.0.0.1

_benchplot/benchplot-dict.R

Lines changed: 8 additions & 4 deletions
@@ -39,7 +39,7 @@ solution.dict = {list(
   "spark" = list(name=c(short="spark", long="spark"), color=c(strong="#8000FFFF", light="#CC66FF")),
   "dask" = list(name=c(short="dask", long="dask"), color=c(strong="slategrey", light="lightgrey")),
   "juliadf" = list(name=c(short="DF.jl", long="DataFrames.jl"), color=c(strong="deepskyblue", light="darkturquoise")),
-  "juliads" = list(name=c(short="IMD.jl", long="InMemoryDatasets.jl"), color=c(strong="#b80000", light="#ff1f1f")),
+  "juliads" = list(name=c(short="IMD.jl", long="InMemData.jl"), color=c(strong="#b80000", light="#ff1f1f")),
   "clickhouse" = list(name=c(short="clickhouse", long="ClickHouse"), color=c(strong="hotpink4", light="hotpink1")),
   "polars" = list(name=c(short="polars", long="Polars"), color=c(strong="deepskyblue4", light="deepskyblue3")),
   "R-arrow" = list(name=c(short="R-arrow", long="R-arrow"), color=c(strong="aquamarine3", light="aquamarine1")),
@@ -299,10 +299,12 @@ groupby.data.exceptions = {list(
     "G1_1e8_2e0_0_0") # q3
   )},
   "juliadf" = {list(
-    "timeout" = "G1_1e8_2e0_0_0",
-    "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0") # CSV.File
+    # "timeout" = "G1_1e8_2e0_0_0",
+    # "out of memory" = c("G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0"), # CSV.File
+    "CSV import Segfault: JuliaLang#55765" = c("G1_1e7_1e2_0_0","G1_1e7_1e1_0_0","G1_1e7_2e0_0_0","G1_1e7_1e2_0_1","G1_1e7_1e2_5_0","G1_1e8_1e2_0_0","G1_1e8_1e1_0_0","G1_1e8_2e0_0_0","G1_1e8_1e2_0_1","G1_1e8_1e2_5_0","G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0")
   )},
   "juliads" = {list(
+    "CSV import Segfault: JuliaLang#55765" = c("G1_1e7_1e2_0_0","G1_1e7_1e1_0_0","G1_1e7_2e0_0_0","G1_1e7_1e2_0_1","G1_1e7_1e2_5_0","G1_1e8_1e2_0_0","G1_1e8_1e1_0_0","G1_1e8_2e0_0_0","G1_1e8_1e2_0_1","G1_1e8_1e2_5_0","G1_1e9_1e2_0_0","G1_1e9_1e1_0_0","G1_1e9_2e0_0_0","G1_1e9_1e2_0_1","G1_1e9_1e2_5_0")
   )},
   "clickhouse" = {list(
   )},
@@ -485,9 +487,11 @@ join.data.exceptions = {list(
     "out of memory" = c("J1_1e9_NA_0_0") # q1 even when using on-disk, after 47m (480m timeout)
   )},
   "juliadf" = {list(
-    "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1") # CSV.File
+    # "out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1") # CSV.File
+    "CSV import Segfault: JuliaLang#55765" = c("J1_1e7_NA_0_0", "J1_1e7_NA_5_0", "J1_1e7_NA_0_1", "J1_1e8_NA_0_0", "J1_1e8_NA_5_0", "J1_1e8_NA_0_1", "J1_1e9_NA_0_0")
   )},
   "juliads" = {list(
+    "CSV import Segfault: JuliaLang#55765" = c("J1_1e7_NA_0_0", "J1_1e7_NA_5_0", "J1_1e7_NA_0_1", "J1_1e8_NA_0_0", "J1_1e8_NA_5_0", "J1_1e8_NA_0_1", "J1_1e9_NA_0_0")
   )},
   "clickhouse" = {list(
   )},

_report/index.Rmd

Lines changed: 2 additions & 2 deletions
@@ -145,7 +145,7 @@ loop_benchplot(dt_join, report_name="join", syntax.dict=join.syntax.dict, except
 ![](./join/J1_1e7_NA_0_0_advanced.png)
 -->
 
-#### 5 GB {.active}
+#### 5 GB
 
 ##### **basic questions**
@@ -158,7 +158,7 @@ loop_benchplot(dt_join, report_name="join", syntax.dict=join.syntax.dict, except
 ![](./join/J1_1e8_NA_0_0_advanced.png)
 -->
 
-#### 50 GB
+#### 50 GB {.active}
 
 ##### **basic questions**

_run/partitioned_run.sh

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+# set machine type
+./_run/run_small_medium_groupby_join.sh
+
+./_run/run_large_groupby_join.sh
+
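
As a hedged usage sketch (not part of the commit), this wrapper would be run from the repository root so the relative _run/ and data/ paths in the two scripts resolve:

    ./_run/partitioned_run.sh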

_run/run_large_groupby_join.sh

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+# download and expand large data
+
+# get groupby large (50GB dataset)
+aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_large.duckdb data/groupby_large.duckdb
+# get join large (50GB dataset)
+aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_large.duckdb data/join_large.duckdb
+
+
+# expand groupby-large datasets to csv
+duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_0 to 'data/G1_1e9_1e2_0_0.csv' (FORMAT CSV)"
+duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e1_0_0 to 'data/G1_1e9_1e1_0_0.csv' (FORMAT CSV)"
+duckdb data/groupby_large.duckdb -c "copy G1_1e9_2e0_0_0 to 'data/G1_1e9_2e0_0_0.csv' (FORMAT CSV)"
+duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_0_1 to 'data/G1_1e9_1e2_0_1.csv' (FORMAT CSV)"
+duckdb data/groupby_large.duckdb -c "copy G1_1e9_1e2_5_0 to 'data/G1_1e9_1e2_5_0.csv' (FORMAT CSV)"
+
+# expand join-large datasets to csv
+duckdb data/join_large.duckdb -c "copy J1_1e9_NA_0_0 to 'data/J1_1e9_NA_0_0.csv' (FORMAT CSV)"
+duckdb data/join_large.duckdb -c "copy J1_1e9_1e9_0_0 to 'data/J1_1e9_1e9_0_0.csv' (FORMAT CSV)"
+duckdb data/join_large.duckdb -c "copy J1_1e9_1e6_0_0 to 'data/J1_1e9_1e6_0_0.csv' (FORMAT CSV)"
+duckdb data/join_large.duckdb -c "copy J1_1e9_1e3_0_0 to 'data/J1_1e9_1e3_0_0.csv' (FORMAT CSV)"
+
+
+echo "Running all solutions on large (50GB) datasets"
+./run.sh
+
+
+###
+echo "done..."
+echo "removing data files"
+#rm data/*.csv
+#rm data/*.duckdb
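
All of the COPY invocations above follow one pattern, so as a minimal sketch (an alternative formulation, not part of the commit) the groupby expansion could equally be written as a loop over the table names:

    for t in G1_1e9_1e2_0_0 G1_1e9_1e1_0_0 G1_1e9_2e0_0_0 G1_1e9_1e2_0_1 G1_1e9_1e2_5_0; do
        duckdb data/groupby_large.duckdb -c "copy $t to 'data/$t.csv' (FORMAT CSV)"
    done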

_run/run_small_medium_groupby_join.sh

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+# first download and expand small data
+
+# get groupby small (0.5GB and 5GB datasets)
+aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/groupby_small.duckdb data/groupby_small.duckdb
+# get join small (0.5GB and 5GB datasets)
+aws s3 cp s3://duckdb-blobs/data/db-benchmark-data/join_small.duckdb data/join_small.duckdb
+
+
+# expand groupby-small datasets to csv
+duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_0_0 to 'data/G1_1e7_1e2_0_0.csv' (FORMAT CSV)"
+duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e1_0_0 to 'data/G1_1e7_1e1_0_0.csv' (FORMAT CSV)"
+duckdb data/groupby_small.duckdb -c "copy G1_1e7_2e0_0_0 to 'data/G1_1e7_2e0_0_0.csv' (FORMAT CSV)"
+duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_0_1 to 'data/G1_1e7_1e2_0_1.csv' (FORMAT CSV)"
+duckdb data/groupby_small.duckdb -c "copy G1_1e7_1e2_5_0 to 'data/G1_1e7_1e2_5_0.csv' (FORMAT CSV)"
+duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_0_0 to 'data/G1_1e8_1e2_0_0.csv' (FORMAT CSV)"
+duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e1_0_0 to 'data/G1_1e8_1e1_0_0.csv' (FORMAT CSV)"
+duckdb data/groupby_small.duckdb -c "copy G1_1e8_2e0_0_0 to 'data/G1_1e8_2e0_0_0.csv' (FORMAT CSV)"
+duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_0_1 to 'data/G1_1e8_1e2_0_1.csv' (FORMAT CSV)"
+duckdb data/groupby_small.duckdb -c "copy G1_1e8_1e2_5_0 to 'data/G1_1e8_1e2_5_0.csv' (FORMAT CSV)"
+
+# expand join-small datasets to csv
+duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_0_0 to 'data/J1_1e7_1e1_0_0.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_5_0 to 'data/J1_1e7_1e4_5_0.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e7_NA_0_1 to 'data/J1_1e7_NA_0_1.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_0_0 to 'data/J1_1e8_1e5_0_0.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_5_0 to 'data/J1_1e8_1e8_5_0.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_0_1 to 'data/J1_1e7_1e1_0_1.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_0_0 to 'data/J1_1e7_1e7_0_0.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e7_NA_5_0 to 'data/J1_1e7_NA_5_0.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_0_1 to 'data/J1_1e8_1e5_0_1.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e8_NA_0_0 to 'data/J1_1e8_NA_0_0.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e7_1e1_5_0 to 'data/J1_1e7_1e1_5_0.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_0_1 to 'data/J1_1e7_1e7_0_1.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_0_0 to 'data/J1_1e8_1e2_0_0.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e8_1e5_5_0 to 'data/J1_1e8_1e5_5_0.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e8_NA_0_1 to 'data/J1_1e8_NA_0_1.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_0_0 to 'data/J1_1e7_1e4_0_0.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e7_1e7_5_0 to 'data/J1_1e7_1e7_5_0.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_0_1 to 'data/J1_1e8_1e2_0_1.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_0_0 to 'data/J1_1e8_1e8_0_0.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e8_NA_5_0 to 'data/J1_1e8_NA_5_0.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e7_1e4_0_1 to 'data/J1_1e7_1e4_0_1.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e7_NA_0_0 to 'data/J1_1e7_NA_0_0.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e8_1e2_5_0 to 'data/J1_1e8_1e2_5_0.csv' (FORMAT CSV)"
+duckdb data/join_small.duckdb -c "copy J1_1e8_1e8_0_1 to 'data/J1_1e8_1e8_0_1.csv' (FORMAT CSV)"
+
+
+cp _control/data_small.csv _control/data.csv
+
+
+echo "Running all solutions on small (0.5GB and 5GB) datasets"
+./run.sh
+
+
+###
+echo "done..."
+echo "removing small data files"
+rm data/*.csv
+rm data/*.duckdb

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

clickhouse/VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-23.10.4.25
+24.8.4.13

collapse/VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.0.3
+2.0.16

dask/VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2023.10.0
+2024.9.0

dask/setup-dask.sh

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -e
 
-virtualenv dask/py-dask --python=python3.10
+virtualenv dask/py-dask --python=python3.12
 source dask/py-dask/bin/activate
 
 # install binaries

datafusion/VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-31.0.0
+41.0.0

datatable/VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-1.14.9
+1.16.99

dplyr/VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-1.1.3
+1.1.4

duckdb-latest/VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-0.9.1.1
+1.0.99.9000

duckdb/VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-1.0.0
+1.1.0

duckdb/setup-duckdb.sh

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ Rscript -e 'install.packages("DBI", lib="./duckdb/r-duckdb", repos = "http://clo
 cd duckdb
 git clone https://github.com/duckdb/duckdb-r.git
 cd duckdb-r
-git checkout v1.0.0
+git checkout v1.1.0
 cd ..
 ncores=`python3 -c 'import multiprocessing as mp; print(mp.cpu_count())'`
 MAKE="make -j$ncores" R CMD INSTALL -l "./r-duckdb" duckdb-r
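
To confirm the pinned tag is what actually got installed, a hedged sanity check (not part of the commit; assumes the library path used above):

    Rscript -e '.libPaths("./duckdb/r-duckdb"); cat(as.character(packageVersion("duckdb")))'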

juliadf/setup-juliadf.sh

Lines changed: 5 additions & 5 deletions
@@ -1,11 +1,11 @@
 # install julia
 
-wget https://julialang-s3.julialang.org/bin/linux/x64/1.10/julia-1.10.4-linux-x86_64.tar.gz
-tar -xvf julia-1.10.4-linux-x86_64.tar.gz
-sudo mv julia-1.10.4 /opt
-rm julia-1.10.4-linux-x86_64.tar.gz
+wget https://julialang-s3.julialang.org/bin/linux/x64/1.10/julia-1.10.5-linux-x86_64.tar.gz
+tar -xvf julia-1.10.5-linux-x86_64.tar.gz
+sudo mv julia-1.10.5 /opt
+rm julia-1.10.5-linux-x86_64.tar.gz
 # put to paths
-echo 'export JULIA_HOME=/opt/julia-1.10.4' >> path.env
+echo 'export JULIA_HOME=/opt/julia-1.10.5' >> path.env
 echo 'export PATH=$PATH:$JULIA_HOME/bin' >> path.env
 # note that cron job must have path updated as well

juliads/VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-0.7.18
+0.7.21

juliads/setup-juliads.sh

Lines changed: 7 additions & 9 deletions
@@ -1,20 +1,18 @@
-
 # install julia
-wget https://julialang-s3.julialang.org/bin/linux/x64/1.10/julia-1.10.4-linux-x86_64.tar.gz
-tar -xvf julia-1.10.4-linux-x86_64.tar.gz
-sudo mv julia-1.10.4 /opt
-rm julia-1.10.4-linux-x86_64.tar.gz
 
+wget https://julialang-s3.julialang.org/bin/linux/x64/1.10/julia-1.10.5-linux-x86_64.tar.gz
+tar -xvf julia-1.10.5-linux-x86_64.tar.gz
+sudo mv julia-1.10.5 /opt
+rm julia-1.10.5-linux-x86_64.tar.gz
 # put to paths
-echo 'export JULIA_HOME=/opt/julia-1.10.4' >> path.env
+echo 'export JULIA_HOME=/opt/julia-1.10.5' >> path.env
 echo 'export PATH=$PATH:$JULIA_HOME/bin' >> path.env
-echo "export JULIA_NUM_THREADS=40" >> path.env
 # note that cron job must have path updated as well
 
 source path.env
 
 # install julia InMemoryDatasets and csv packages
-julia -q -e 'using Pkg; Pkg.add(["InMemoryDatasets","DLMReader", "PooledArrays", "Arrow"])'
+julia -q -e 'using Pkg; Pkg.add(["InMemoryDatasets","DLMReader", "PooledArrays", "Arrow", "CSV"])'
 julia -q -e 'include("$(pwd())/_helpers/helpersds.jl"); pkgmeta = getpkgmeta("InMemoryDatasets"); println(string(pkgmeta["version"])); pkgmeta = getpkgmeta("DLMReader"); println(string(pkgmeta["version"]))'
 
-./juliadf/ver-juliads.sh
+./juliadf/ver-juliadf.sh
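
After sourcing the updated path.env, the Julia install can be sanity-checked (a sketch, not part of the commit):

    source path.env && julia --version   # expect julia version 1.10.5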
