Skip to content

Commit 7ae9789

Browse files
committed
fix other solutions to use machine type
1 parent 1493560 commit 7ae9789

24 files changed

+363
-336
lines changed

R-arrow/groupby-R-arrow.R

Lines changed: 21 additions & 20 deletions
Large diffs are not rendered by default.

R-arrow/join-R-arrow.R

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ cache = TRUE
1717
on_disk = FALSE
1818

1919
data_name = Sys.getenv("SRC_DATANAME")
20+
machine_type = Sys.getenv("MACHINE_TYPE")
2021
src_jn_x = file.path("data", paste(data_name, "csv", sep="."))
2122
y_data_name = join_to_tbls(data_name)
2223
src_jn_y = setNames(file.path("data", paste(y_data_name, "csv", sep=".")), names(y_data_name))
@@ -46,15 +47,15 @@ t = system.time({
4647
})[["elapsed"]]
4748
m = memory_usage()
4849
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
49-
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
50+
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
5051
rm(ans)
5152
t = system.time({
5253
ans<-collect(inner_join(x, small, by="id1"))
5354
print(dim(ans))
5455
})[["elapsed"]]
5556
m = memory_usage()
5657
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
57-
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
58+
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
5859
ans <- collect(ans)
5960
print(head(ans, 3))
6061
print(tail(ans, 3))
@@ -68,15 +69,15 @@ t = system.time({
6869
})[["elapsed"]]
6970
m = memory_usage()
7071
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
71-
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
72+
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
7273
rm(ans)
7374
t = system.time({
7475
ans<-collect(inner_join(x, medium, by="id2"))
7576
print(dim(ans))
7677
})[["elapsed"]]
7778
m = memory_usage()
7879
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
79-
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
80+
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
8081
ans <- collect(ans)
8182
print(head(ans, 3))
8283
print(tail(ans, 3))
@@ -90,15 +91,15 @@ t = system.time({
9091
})[["elapsed"]]
9192
m = memory_usage()
9293
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
93-
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
94+
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
9495
rm(ans)
9596
t = system.time({
9697
ans<-collect(left_join(x, medium, by="id2"))
9798
print(dim(ans))
9899
})[["elapsed"]]
99100
m = memory_usage()
100101
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
101-
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
102+
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
102103
ans <- collect(ans)
103104
print(head(ans, 3))
104105
print(tail(ans, 3))
@@ -112,15 +113,15 @@ t = system.time({
112113
})[["elapsed"]]
113114
m = memory_usage()
114115
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
115-
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
116+
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
116117
rm(ans)
117118
t = system.time({
118119
ans <- collect(inner_join(x, medium, by="id5"))
119120
print(dim(ans))
120121
})[["elapsed"]]
121122
m = memory_usage()
122123
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
123-
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
124+
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
124125
ans <- collect(ans)
125126
print(head(ans, 3))
126127
print(tail(ans, 3))
@@ -134,15 +135,15 @@ t = system.time({
134135
})[["elapsed"]]
135136
m = memory_usage()
136137
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
137-
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
138+
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
138139
rm(ans)
139140
t = system.time({
140141
ans<-collect(inner_join(x, big, by="id3"))
141142
print(dim(ans))
142143
})[["elapsed"]]
143144
m = memory_usage()
144145
chkt = system.time(chk <- collect(summarise(ans, sum(v1, na.rm=TRUE), sum(v2, na.rm=TRUE))))[["elapsed"]]
145-
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
146+
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk, machine_type=machine_type)
146147
ans <- collect(ans)
147148
print(head(ans, 3))
148149
print(tail(ans, 3))

_helpers/helpers.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ function getpkgmeta(name::AbstractString)
77
Pkg.TOML.parse(read(fname, String))["deps"][name][1]
88
end;
99

10-
function write_log(run, task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, time_sec, mem_gb, cache, chk, chk_time_sec, on_disk)
10+
function write_log(run, task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, time_sec, mem_gb, cache, chk, chk_time_sec, on_disk, machine_type)
1111
file=try
1212
ENV["CSV_TIME_FILE"]
1313
catch
@@ -31,7 +31,7 @@ function write_log(run, task, data, in_rows, question, out_rows, out_cols, solut
3131
chk_time_sec=round(chk_time_sec, digits=3)
3232
timestamp=@sprintf("%0.6f", time())
3333
csv_verbose = false # hardcoded for now, TODO ENV["CSV_VERBOSE"] and print
34-
log = DataFrame(nodename=nodename, batch=batch, timestamp=timestamp, task=task, data=data, in_rows=in_rows, question=question, out_rows=out_rows, out_cols=out_cols, solution=solution, version=version, git=git, fun=fun, run=run, time_sec=time_sec, mem_gb=mem_gb, cache=uppercase(string(cache)), chk=chk, chk_time_sec=chk_time_sec, comment=comment, on_disk=uppercase(string(on_disk)))
34+
log = DataFrame(nodename=nodename, batch=batch, timestamp=timestamp, task=task, data=data, in_rows=in_rows, question=question, out_rows=out_rows, out_cols=out_cols, solution=solution, version=version, git=git, fun=fun, run=run, time_sec=time_sec, mem_gb=mem_gb, cache=uppercase(string(cache)), chk=chk, chk_time_sec=chk_time_sec, comment=comment, on_disk=uppercase(string(on_disk)), machine_type=machine_type)
3535
CSV.write(file, log, append=isfile(file))
3636
end;
3737

_helpers/helpers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import os
66
import platform
77

8-
def write_log(task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, run, time_sec, mem_gb, cache, chk, chk_time_sec, on_disk):
8+
def write_log(task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, run, time_sec, mem_gb, cache, chk, chk_time_sec, on_disk, machine_type='xlarge'):
99
batch = os.getenv('BATCH', "")
1010
timestamp = time.time()
1111
csv_file = os.getenv('CSV_TIME_FILE', "time.csv")
@@ -18,7 +18,7 @@ def write_log(task, data, in_rows, question, out_rows, out_cols, solution, versi
1818
time_sec = ""
1919
if math.isnan(mem_gb):
2020
mem_gb = ""
21-
log_row = [nodename, batch, timestamp, task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, run, time_sec, mem_gb, cache, chk, chk_time_sec, comment, on_disk]
21+
log_row = [nodename, batch, timestamp, task, data, in_rows, question, out_rows, out_cols, solution, version, git, fun, run, time_sec, mem_gb, cache, chk, chk_time_sec, comment, on_disk, machine_type]
2222
log_header = ["nodename","batch","timestamp","task","data","in_rows","question","out_rows","out_cols","solution","version","git","fun","run","time_sec","mem_gb","cache","chk","chk_time_sec","comment","on_disk"]
2323
if os.path.isfile(csv_file) and not(os.path.getsize(csv_file)):
2424
os.remove(csv_file)

clickhouse/clickhouse-parse-log.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ args = commandArgs(TRUE) # args = c("groupby","G1_1e6_1e2_0_0")
77
stopifnot(length(args)==2L)
88
task = args[1L]
99
data_name = args[2L]
10+
machine_type = Sys.getenv("MACHINE_TYPE", "large")
1011

1112
library(data.table)
1213
# sort files according to question and run
@@ -34,7 +35,7 @@ stopifnot(all(d$task==task), all(d$data_name==data_name))
3435
d[,
3536
write.log(run=as.integer(run), timestamp=as.numeric(timestamp), task=as.character(task), data=as.character(data_name), in_rows=as.numeric(.in_rows), question=as.character(question),
3637
out_rows=as.numeric(NA), out_cols=as.integer(NA), solution=as.character(solution), version=as.character(version), git=as.character(NA), fun=as.character(fun),
37-
time_sec=as.numeric(time_sec), mem_gb=as.numeric(NA), cache=as.logical(cache), chk=as.character(NA), chk_time_sec=as.numeric(NA), on_disk=as.logical(on_disk)),
38+
time_sec=as.numeric(time_sec), mem_gb=as.numeric(NA), cache=as.logical(cache), chk=as.character(NA), chk_time_sec=as.numeric(NA), on_disk=as.logical(on_disk), machine_type=as.character(machine_type)),
3839
by = seq_len(nrow(d))] -> nul
3940

4041
cat("# clickhouse-parse-log.R: parsing timings to time.csv finished\n")

clickhouse/exec.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ IS_SORTED=$(clickhouse-client --query "SELECT splitByChar('_','$SRC_DATANAME')[5
3434
ON_DISK=0
3535

3636
if [ $1 == 'groupby' ]; then
37-
ON_DISK=$(clickhouse-client --query "SELECT (splitByChar('_','$SRC_DATANAME')[2])::Float32 >= 1e10::Float32 FORMAT TSV")
37+
ON_DISK=$(clickhouse-client --query "SELECT (splitByChar('_','$SRC_DATANAME')[2])::Float32 >= 1e10::Float32 FORMAT TSV") || ($(clickhouse-client --query "SELECT (splitByChar('_','$SRC_DATANAME')[2])::Float32 >= 1e9::Float32 || '$MACHINE_TYPE' == 'small') FORMAT TSV"))
3838
clickhouse-client --query "DROP TABLE IF EXISTS $SRC_DATANAME"
3939
if [ $HAS_NULL -eq 1 ]; then
4040
if [ $IS_SORTED -eq 1 ]; then

0 commit comments

Comments
 (0)