
Commit b36ef94 (1 parent: 435b686)

Updating plotting scripts
6 files changed: +146, -29 lines

prepare_results.py

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ def fix_duplicate_runs(df):
 df_config = prepare_config(config_general)
 group_keys = df_config.columns
 df_test = df_config.join(df, on=group_keys, how="inner")
-df_test.join(df_query_time_all_datalakes, on="jd_method").with_columns(
+df_test = df_test.join(df_query_time_all_datalakes, on="jd_method").with_columns(
     total_runtime=pl.col("time_run") + pl.col("time_query")
 )
 df_test.write_parquet("results/results_general.parquet")
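
Why this fix matters: polars DataFrames are immutable, so the old line built the joined frame with total_runtime and then discarded it, and the parquet written on the next line never contained the column. A minimal sketch of the pitfall (toy values, not the benchmark data):

import polars as pl

df = pl.DataFrame({"time_run": [1.0], "time_query": [2.0]})

# Without assignment: with_columns returns a NEW frame; df is unchanged.
df.with_columns(total_runtime=pl.col("time_run") + pl.col("time_query"))
print(df.columns)  # ['time_run', 'time_query']

# With assignment, the derived column is kept.
df = df.with_columns(total_runtime=pl.col("time_run") + pl.col("time_query"))
print(df.columns)  # ['time_run', 'time_query', 'total_runtime']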

results_pivot.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
# %%
import polars as pl

from src.utils.constants import LABEL_MAPPING

# %%
df_aggregation = pl.read_parquet("results/results_aggregation.parquet")
df_general = pl.read_parquet("results/results_general.parquet")
df_retrieval = pl.read_parquet("results/results_retrieval.parquet")
df_master = pl.read_parquet("results/master_list.parquet")

# %%
variables = ["chosen_model", "jd_method", "estimator", "target_dl", "base_table"]

# %%
df_general.pivot(
    on="estimator",
    index="chosen_model",
    values="prediction_metric",
    aggregate_function="median",
)
# %%
for var_1 in variables:
    df_list = []
    for var_2 in variables:
        if var_1 == var_2:
            continue
        _this_df = df_general.pivot(
            on=var_2,
            index=var_1,
            values="prediction_metric",
            aggregate_function="median",
        )
        _index = _this_df.get_column(var_1).replace(LABEL_MAPPING[var_1])
        _this_df.drop_in_place(var_1)
        _this_df = _this_df.rename(lambda c: LABEL_MAPPING[var_2][c])
        _col_order = [var_1] + _this_df.columns
        _this_df = _this_df.with_columns(_index.alias(var_1)).select(_col_order)
        df_list.append(_this_df)

    df_aligned = pl.concat(df_list, how="align")
    df_aligned.write_csv(f"results/results_pivot_{var_1}.csv")

# %%
# %%
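
Two polars features carry this new script: pivot spreads the distinct values of one column into new columns, collapsing collisions with the given aggregate, and pl.concat(..., how="align") joins frames on their shared columns instead of stacking rows, which is what lets the per-variable pivots share one index column. A toy sketch with made-up estimator and model names:

import polars as pl

df = pl.DataFrame({
    "estimator": ["full_join", "full_join", "best_single", "best_single"],
    "chosen_model": ["catboost", "linear", "catboost", "linear"],
    "prediction_metric": [0.61, 0.48, 0.57, 0.44],
})

# One column per estimator, one row per model; duplicate cells are
# reduced with the median, as in the script above.
wide = df.pivot(
    on="estimator",
    index="chosen_model",
    values="prediction_metric",
    aggregate_function="median",
)
print(wide)

# how="align" outer-joins on the common column "k" instead of stacking.
left = pl.DataFrame({"k": ["a", "b"], "x": [1, 2]})
right = pl.DataFrame({"k": ["b", "a"], "y": [3, 4]})
print(pl.concat([left, right], how="align"))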

scripts/plotting/plot_comparison_large.py

Lines changed: 17 additions & 8 deletions
@@ -4,9 +4,7 @@

 # %%
 # %cd ~/bench
-# %load_ext autoreload
-# %autoreload 2
-#%%
+# %%
 import matplotlib.pyplot as plt
 import polars as pl

@@ -16,6 +14,7 @@
 plot_case = "dep"
 savefig = False

+
 # %%
 def read_and_format(file_path):
     return (
@@ -35,14 +34,24 @@ def read_and_format(file_path):
     )


-#%%
+# %%
 _results_general = read_and_format("results/results_general.parquet")
 _results_aggr = read_and_format("results/results_aggregation.parquet")
-
-_results_aggr = _results_aggr.filter(pl.col("estimator") != "nojoin")
 _results_retrieval = read_and_format("results/results_retrieval.parquet")
-# _results_aggr = _results_aggr.filter(pl.col("jd_method") == "exact_matching")
-# _results_general = _results_general.filter(pl.col("jd_method") == "exact_matching")
+
+# _results_aggr = _results_aggr.with_columns(time_run = pl.col("time_run")*10 + pl.col("time_query"))
+# _results_general = _results_general.with_columns(time_run = pl.col("time_run")*10 + pl.col("time_query"))
+# _results_retrieval = _results_retrieval.with_columns(time_run = pl.col("time_run")*10 + pl.col("time_query"))
+
+_results_aggr = _results_aggr.filter(pl.col("estimator") != "nojoin").with_columns(
+    time_run=pl.col("time_run") * 10 + pl.col("time_query")
+)
+_results_general = _results_general.filter(
+    pl.col("estimator") != "nojoin"
+).with_columns(time_run=pl.col("time_run") * 10 + pl.col("time_query"))
+_results_retrieval = _results_retrieval.filter(
+    pl.col("estimator") != "nojoin"
+).with_columns(time_run=pl.col("time_run") * 10 + pl.col("time_query"))


 # %%
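
The pattern now applied to all three frames drops the no-join baseline and folds the query time into the runtime as time_run * 10 + time_query. The factor 10 is copied from the diff as-is; a plausible but unconfirmed reading is that one retrieval query is amortized over ten runs. A self-contained sketch with toy numbers:

import polars as pl

results = pl.DataFrame({
    "estimator": ["best_single", "nojoin"],  # toy rows
    "time_run": [12.0, 3.0],
    "time_query": [5.0, 5.0],
})

adjusted = results.filter(pl.col("estimator") != "nojoin").with_columns(
    # Ten runs' worth of training time plus a single query cost.
    time_run=pl.col("time_run") * 10 + pl.col("time_query")
)
print(adjusted)  # time_run -> 12.0 * 10 + 5.0 = 125.0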

scripts/plotting/plot_pareto_topk.py

Lines changed: 10 additions & 6 deletions
@@ -1,3 +1,6 @@
+"""This script is used to prepare the Pareto plots that compare the performance
+by value of top-k, considering both the run time and the peak RAM.
+"""
 # %%
 # %cd ~/bench
 # %%
@@ -7,6 +10,9 @@
 from src.utils import constants
 from src.utils.plotting import pareto_frontier_plot

+plt.style.use("seaborn-v0_8-talk")
+plt.rc("font", family="sans-serif")
+
 # %%
 df = pl.read_csv("results/results_topk.csv")
 df = df.group_by(constants.GROUPING_KEYS + ["top_k"]).agg(
@@ -47,7 +53,7 @@ def prepare_sem_df(df, variable):
 xerr = df_sem["sem_time_run"].to_numpy()
 yerr = df_sem["sem_pred"].to_numpy()

-fig, ax = plt.subplots(1, 1, squeeze=True, figsize=(6, 4), layout="constrained")
+fig, ax = plt.subplots(1, 1, squeeze=True, figsize=(5, 3.5), layout="constrained")
 (h, l), _ = pareto_frontier_plot(
     df_pareto.to_pandas(),
     x_var="time_run",
@@ -78,14 +84,12 @@ def prepare_sem_df(df, variable):
     ecolor=c,
 )

-ax.legend(h, l, title="Value of k", loc="upper right", bbox_to_anchor=(1.30, 1))
-
+ax.legend(h, l, title="Value of k", loc="upper right", bbox_to_anchor=(1.35, 1.05), frameon=False)
 _x, _y = df_pareto.filter(top_k=30).select("time_run", "prediction_metric")

 x_text = _x.item()
 y_text = _y.item()

-# Annotate the point (36, 0.52)
 ax.annotate(
     "k used in experiments",  # Annotation text
     xy=(x_text, y_text),  # Point to annotate
@@ -104,7 +108,7 @@ def prepare_sem_df(df, variable):
 xerr = df_sem["sem_peak_fit"].to_numpy()
 yerr = df_sem["sem_pred"].to_numpy()

-fig, ax = plt.subplots(1, 1, squeeze=True, figsize=(6, 4), layout="constrained")
+fig, ax = plt.subplots(1, 1, squeeze=True, figsize=(5, 3.5), layout="constrained")
 (h, l), _ = pareto_frontier_plot(
     df_pareto.to_pandas(),
     x_var="peak_fit",
@@ -135,7 +139,7 @@ def prepare_sem_df(df, variable):
     ecolor=c,
 )

-ax.legend(h, l, title="Value of k", loc="upper right", bbox_to_anchor=(1.30, 1))
+ax.legend(h, l, title="Value of k", loc="upper right", bbox_to_anchor=(1.35, 1.05), frameon=False)
 fig.savefig("images/pareto_topk_ram.png", bbox_inches="tight")
 fig.savefig("images/pareto_topk_ram.pdf", bbox_inches="tight")
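
On the legend tweak: bbox_to_anchor is interpreted in axes coordinates, so (1.35, 1.05) places the legend just outside the top-right corner of the axes, and frameon=False drops its border; bbox_inches="tight" at save time then grows the canvas so the legend is not clipped. A standalone sketch with placeholder curves (not the Pareto data):

import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1, figsize=(5, 3.5), layout="constrained")
for k in (5, 10, 30):
    ax.plot([0, 1, 2], [0.0, k / 30, k / 20], label=str(k))  # placeholder curves

h, l = ax.get_legend_handles_labels()
# x > 1 in axes coordinates lies to the right of the plotting area.
ax.legend(h, l, title="Value of k", loc="upper right",
          bbox_to_anchor=(1.35, 1.05), frameon=False)
fig.savefig("legend_demo.png", bbox_inches="tight")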

stats/compile_stats.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
# %%
import polars as pl
import pandas as pd

# %%
df_starmie = (
    pl.read_csv("stats_retrieval_starmie.csv")
    .drop("time_save")
    .with_columns(index_name=pl.lit("starmie"))
)
df_others = pl.read_csv("stats_retrieval_others.csv").drop("time_save", "n_candidates")
# %%
val_starmie = (
    df_starmie.group_by("data_lake_version", "base_table", "index_name")
    .agg(pl.mean("time_create", "time_load", "time_query"))
    .with_columns(total_query=pl.col("time_load") + pl.col("time_query"))["total_query"]
    .mean()
) / 6

# %% Include only the data lakes that starmie works on
_d = (
    df_others.filter(~pl.col("data_lake_version").is_in(["wordnet_vldb_50", "open_data_us"])).with_columns(
        total_query=pl.when(pl.col("index_name") == "exact_matching")
        .then(pl.sum_horizontal("time_create", "time_load", "time_query"))
        .otherwise(pl.sum_horizontal("time_load", "time_query"))
    )
    .group_by("index_name")
    .agg(pl.mean("total_query"))
    .with_columns(total_query=pl.col("total_query") / 6)
)

r_dict = dict(_d.rows())
r_dict["starmie"] = val_starmie

pl.from_dict({"jd_method": r_dict.keys(), "time_query": r_dict.values()}).write_csv(
    "avg_query_time_for_pareto_plot_retrieval.csv"
)
# %% Now all data lakes and no starmie
_d = (
    df_others.with_columns(
        total_query=pl.when(pl.col("index_name") == "exact_matching")
        .then(pl.sum_horizontal("time_create", "time_load", "time_query"))
        .otherwise(pl.sum_horizontal("time_load", "time_query"))
    )
    .group_by("index_name")
    .agg(pl.mean("total_query"))
    .with_columns(total_query=pl.col("total_query") / 6)
)

r_dict = dict(_d.rows())

pl.from_dict({"jd_method": r_dict.keys(), "time_query": r_dict.values()}).write_csv(
    "avg_query_time_for_pareto_plot_all_datalakes.csv"
)

# %%
df_others_max_ram = df_others.filter(~pl.col("data_lake_version").is_in(["wordnet_vldb_50", "open_data_us"])).with_columns(
    max_ram=pl.max_horizontal("peak_create", "peak_query")
)
# %%
import seaborn as sns

sns.displot(data=df_others_max_ram.to_pandas(), x="max_ram", col="index_name", binwidth=200)
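
The per-method query cost here is a conditional sum: for exact_matching the index-creation time is charged to every query, while the other methods pay only load plus query time (the branch suggests exact_matching has no reusable persisted index, though the commit does not say so). A toy sketch of the when/then/otherwise with sum_horizontal pattern, using a made-up second method:

import polars as pl

stats = pl.DataFrame({
    "index_name": ["exact_matching", "minhash"],  # "minhash" is made up
    "time_create": [30.0, 300.0],
    "time_load": [1.0, 2.0],
    "time_query": [4.0, 6.0],
})

totals = stats.with_columns(
    # exact_matching: create + load + query; everything else: load + query.
    total_query=pl.when(pl.col("index_name") == "exact_matching")
    .then(pl.sum_horizontal("time_create", "time_load", "time_query"))
    .otherwise(pl.sum_horizontal("time_load", "time_query"))
)
print(totals)  # total_query: 35.0 and 8.0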

summary_results.py

Lines changed: 10 additions & 14 deletions
@@ -1,3 +1,7 @@
+"""This script is used to build the ablation table to compare the performance of each
+configuration against the reference.
+"""
+
 # %%
 # %cd ~/bench
 # %%
@@ -64,23 +68,15 @@
 df_reference.write_csv("results/results_reference.csv")

 # %%
-query_times_retrieval = pl.read_csv(
-    "stats/avg_query_time_for_pareto_plot_retrieval.csv"
-)
-query_times_all_datalakes = pl.read_csv(
-    "stats/avg_query_time_for_pareto_plot_all_datalakes.csv"
+df_retrieval = df_retrieval.filter(pl.col("estimator") != "nojoin").with_columns(
+    time_run=pl.col("time_run") * 10 + pl.col("time_query")
 )
-
-# %%
-df_retrieval = df_retrieval.join(query_times_retrieval, on="jd_method").with_columns(
-    time_run=pl.col("time_run")*10 + pl.col("time_query")
+df_general = df_general.filter(pl.col("estimator") != "nojoin").with_columns(
+    time_run=pl.col("time_run") * 10 + pl.col("time_query")
 )
-df_general = df_general.join(query_times_retrieval, on="jd_method").with_columns(
-    time_run=pl.col("time_run")*10 + pl.col("time_query")
+df_aggregation = df_aggregation.filter(pl.col("estimator") != "nojoin").with_columns(
+    time_run=pl.col("time_run") * 10 + pl.col("time_query")
 )
-df_aggregation = df_aggregation.join(
-    query_times_retrieval, on="jd_method"
-).with_columns(time_run=pl.col("time_run")*10 + pl.col("time_query"))


 # %%