@@ -23,8 +23,19 @@ def fix_duplicate_runs(df):
23
23
df = pl .read_parquet ("results/master_list.parquet" )
24
24
df = fix_duplicate_runs (df )
25
25
df = df .filter (~ pl .col ("estimator" ).is_in (["nojoin" , "top_k_full_join" ]))
26
- #%%
27
- df = df .with_columns (base_table = pl .col ("base_table" ).str .split ("-" ).list .first ())
26
+
27
+ df_ram = pl .read_csv ("stats/dummy_peak_ram.csv" )  # joined onto df by "jd_method" further down
28
+ df_query_time_retrieval = pl .read_csv (  # joined onto the retrieval results by "jd_method" in a later cell
29
+ "stats/avg_query_time_for_pareto_plot_retrieval.csv"
30
+ )
31
+ df_query_time_all_datalakes = pl .read_csv (  # joined by "jd_method" in the general and aggregation cells
32
+ "stats/avg_query_time_for_pareto_plot_all_datalakes.csv"
33
+ )
34
+
35
+ # %%
36
+ df = df .with_columns (base_table = pl .col ("base_table" ).str .split ("-" ).list .first ()).join (  # keep only the part of base_table before the first "-"
37
+ df_ram , on = "jd_method"  # no `how` given, so the library's default join type applies; NOTE(review): confirm every jd_method in df is present in df_ram
38
+ ).with_columns (peak_ram = pl .max_horizontal ("peak_ram" , "peak_fit" , "peak_predict" , "peak_test" ))  # peak_ram becomes the row-wise maximum of the four listed columns
28
39
# %%
29
40
# General configuration (all data lakes, no Starmie)
30
41
config_general = json .load (
@@ -33,6 +44,9 @@ def fix_duplicate_runs(df):
33
44
df_config = prepare_config (config_general )
34
45
group_keys = df_config .columns
35
46
df_test = df_config .join (df , on = group_keys , how = "inner" )
47
+ df_test = df_test .join (df_query_time_all_datalakes , on = "jd_method" ).with_columns (  # rebind: join/with_columns return a new frame, so without the assignment total_runtime never reaches the parquet written next
48
+ total_runtime = pl .col ("time_run" ) + pl .col ("time_query" )  # time_run + time_query; time_query presumably comes from the joined CSV — TODO confirm
49
+ )
36
50
df_test .write_parquet ("results/results_general.parquet" )
37
51
# %%
38
52
# Retrieval method configuration (Starmie, no 50k/open data)
@@ -42,6 +56,10 @@ def fix_duplicate_runs(df):
42
56
df_config = prepare_config (config_general )
43
57
group_keys = df_config .columns
44
58
df_test = df_config .join (df , on = group_keys , how = "inner" )
59
+ df_test = df_test .join (df_query_time_retrieval , on = "jd_method" ).with_columns (  # bring in the retrieval query-time table, matched on "jd_method"
60
+ total_runtime = pl .col ("time_run" ) + pl .col ("time_query" )  # time_run + time_query; time_query presumably comes from the joined CSV — TODO confirm
61
+ )
62
+
45
63
df_test .write_parquet ("results/results_retrieval.parquet" )
46
64
47
65
# %%
@@ -52,6 +70,10 @@ def fix_duplicate_runs(df):
52
70
df_config = prepare_config (config_general )
53
71
group_keys = df_config .columns
54
72
df_test = df_config .join (df , on = group_keys , how = "inner" )
73
+ df_test = df_test .join (df_query_time_all_datalakes , on = "jd_method" ).with_columns (  # bring in the all-datalakes query-time table, matched on "jd_method"
74
+ total_runtime = pl .col ("time_run" ) + pl .col ("time_query" )  # time_run + time_query; time_query presumably comes from the joined CSV — TODO confirm
75
+ )
76
+
55
77
df_test .write_parquet ("results/results_aggregation.parquet" )
56
78
57
79
# %%
0 commit comments