Skip to content

Commit 9c493c5

Browse files
committed
Adding scripts and updating git ignore to exclude results
1 parent c937811 commit 9c493c5

File tree

5 files changed

+381
-0
lines changed

5 files changed

+381
-0
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,7 @@ dmypy.json
127127

128128
# Pyre type checker
129129
.pyre/
130+
131+
# results
132+
plots/
133+
results/

dataset_plots.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
import json
2+
from pathlib import Path
3+
from utils import get_plot_folder
4+
5+
import matplotlib.pyplot as plt
6+
import pandas as pd
7+
import seaborn as sns
8+
9+
10+
def build_info(file_name, meta_features):
    """Build a flat, presentation-ready record for one dataset.

    Args:
        file_name: Name of the dataset's JSON file; the part before the
            extension must be the integer dataset id (e.g. ``'42.json'``).
        meta_features: Mapping of meta-feature name to value.

    Returns:
        A dict with the dataset id under ``'dataset'`` followed by every
        meta-feature, with underscores in the names replaced by spaces.

    Raises:
        ValueError: If the file stem is not an integer.
    """
    # Take the stem instead of slicing off a fixed five characters, so the
    # id is still parsed when the extension is not exactly ".json".
    info = {'dataset': int(Path(file_name).stem)}
    for feat_name, value in meta_features.items():
        info[feat_name.replace('_', ' ')] = value
    return info
17+
18+
19+
def extract_feature(task):
    """Load the meta-features of every dataset recorded for a task.

    Reads each ``*.json`` file under ``results/95_datasets_info/<task>`` and
    turns its ``'meta_features'`` entry into one row.

    Args:
        task: Name of the task sub-folder (e.g. ``'Classification'``).

    Returns:
        A DataFrame with one row per dataset file; empty when the folder
        holds no JSON files.
    """
    info_folder = Path('results/95_datasets_info') / task
    rows = []
    for fn in info_folder.glob('*.json'):
        # Open read-only and close promptly; nothing is written back.
        with open(fn, 'r') as fd:
            info = json.load(fd)
        rows.append(build_info(fn.name, info['meta_features']))
    return pd.DataFrame(rows)
27+
28+
29+
def plot_numerical_values(data, prop_name):
    """Draw a filled KDE of one numerical column on its own figure.

    Args:
        data: DataFrame holding the column.
        prop_name: Column to plot; also used as the figure identifier.
    """
    plt.figure(prop_name)
    # `fill` is the supported spelling of the deprecated `shade` keyword.
    sns.kdeplot(x=prop_name, data=data, fill=True)
34+
35+
36+
def plot_categorical_values(data, prop_name):
    """Draw a per-category count plot of one column on its own figure.

    The figure is identified by ``prop_name`` and is left open for the caller
    to save and close.
    """
    plt.figure(prop_name)
    sns.countplot(data=data, x=prop_name, palette='rainbow')
40+
41+
42+
def plot_values(data, values, plot_folder, type_):
    """Plot each listed column and save it as ``<column>.pdf``.

    Args:
        data: DataFrame holding the columns.
        values: Column names to plot.
        plot_folder: Directory (``Path``) the PDFs are written to.
        type_: ``'numerical'`` for KDE plots or ``'categorical'`` for count
            plots; any other value draws nothing before the save.
    """
    for value in values:
        if type_ == 'numerical':
            plot_numerical_values(data, value)
        elif type_ == 'categorical':
            try:
                plot_categorical_values(data, value)
            except Exception as error:
                # Best effort: report the failing column and carry on, while
                # letting KeyboardInterrupt/SystemExit propagate.
                print(f'{value}: {error!r}')
        plt.savefig(plot_folder / f'{value}.pdf')
        plt.close()
54+
55+
56+
def plot_scatterplot(data, plot_folder):
    """Save two stacked scatter plots against input dimensionality.

    The top panel shows the number of samples and the bottom one the number
    of classes; the figure is written to ``scatterplot.pdf`` in
    ``plot_folder``.
    """
    _, (top, bottom) = plt.subplots(nrows=2)
    panels = ((top, 'number of samples'), (bottom, 'number of classes'))
    for axis, column in panels:
        sns.scatterplot(data=data, x='input dimensionality', y=column, ax=axis)
    plt.savefig(plot_folder / 'scatterplot.pdf')
    plt.close()
63+
64+
65+
def plot_lineplot(data, plot_folder):
    """Save three stacked line plots indexed by dataset position.

    Panels show, top to bottom, number of samples, input dimensionality and
    number of classes, each against the row position in ``data``. The figure
    is written to ``lineplot.pdf`` in ``plot_folder``.
    """
    columns = ('number of samples', 'input dimensionality', 'number of classes')
    _, axes = plt.subplots(nrows=3)
    positions = range(len(data))
    for axis, column in zip(axes, columns):
        sns.lineplot(data=data, x=positions, y=column, ax=axis)
    plt.savefig(plot_folder / 'lineplot.pdf')
    plt.close()
74+
75+
76+
def main():
    """Plot the meta-features of the classification datasets.

    Produces the scatter and line overviews first, then one PDF per
    numerical and per categorical meta-feature under ``plots/meta_features``.
    """
    numeric_columns = [
        'number of samples',
        'input dimensionality',
        'output dimensionality',
        'dataset dimensionality',
        'standard deviation',
        'coefficient of variation',
        'covariance avg',
        'linear corr coef',
        'skewness',
        'skewness 1',
        'skewness 2',
        'skewness 3',
        'kurtosis',
        'kurtosis 1',
        'kurtosis 2',
        'kurtosis 3',
        'normalized class entropy',
        'normalized attr entropy',
        'normalized attr entropy 1',
        'normalized attr entropy 2',
        'normalized attr entropy 3',
        'joint entropy',
        'joint entropy 1',
        'joint entropy 2',
        'joint entropy 3',
        'mutual information',
        'equivalent number of attr',
        'noise signal ratio',
    ]
    categorical_columns = [
        'is supervised',
        'has numeric features',
        'average number of words',
        'has text features',
        'semantic input types',
        'semantic output types',
    ]

    # One row per dataset, one column per meta-feature.
    frame = extract_feature('Classification')
    target = get_plot_folder('plots/meta_features')

    plot_scatterplot(frame, target)
    plot_lineplot(frame, target)

    for columns, kind in ((numeric_columns, 'numerical'),
                          (categorical_columns, 'categorical')):
        plot_values(frame, columns, target, kind)


if __name__ == '__main__':
    main()

metalearner_plots.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import json
2+
from pathlib import Path
3+
from typing import List
4+
5+
import matplotlib.pyplot as plt
6+
import pandas as pd
7+
import seaborn as sns
8+
9+
from utils import get_plot_folder
10+
11+
12+
def build_info(scores):
    """Flatten ``{dataset: {metric: value}}`` into one record per dataset.

    Each record starts with the dataset name under ``'dataset'`` and is then
    extended with that dataset's score entries.
    """
    records = []
    for dataset, dscores in scores.items():
        record = {'dataset': dataset}
        record.update(dscores)
        records.append(record)
    return records
14+
15+
16+
def extract_scores(strategy):
    """Load the per-dataset scores from every result file of a strategy.

    Args:
        strategy: Name of the strategy folder under ``results/``.

    Returns:
        One DataFrame per ``*.json`` file in ``results/<strategy>/results``,
        each with one row per dataset; an empty list if no file is found.
    """
    info_folder = Path('results') / strategy / 'results'
    frames = []
    for fn in info_folder.glob('*.json'):
        # Open read-only and close promptly; nothing is written back.
        with open(fn, 'r') as fd:
            info = json.load(fd)
        frames.append(pd.DataFrame(build_info(info)))
    return frames
24+
25+
26+
def get_globals(data):
    """Collect the last row of every frame into a single DataFrame."""
    last_rows = []
    for frame in data:
        last_rows.append(frame.iloc[-1])
    return pd.DataFrame(last_rows)
29+
30+
31+
def plot_boxplot(data: pd.DataFrame, metric: str, fig_path: Path):
    """Save a box plot of one metric column of ``data`` to ``fig_path``.

    The figure is identified by the metric name and closed after saving.
    """
    plt.figure(metric)
    sns.boxplot(y=metric, data=data)
    plt.savefig(fig_path)
    plt.close()
37+
38+
39+
def plot_results(strategies: List[str], metrics: List[str], plot_folder: Path):
    """Plot every metric of every strategy as box plots.

    For each strategy a sub-folder of ``plot_folder`` receives, per metric,
    one ``<metric>_<n>.pdf`` per result file (numbered from 1) and one
    ``global_<metric>.pdf`` built from the last row of each result file.

    Args:
        strategies: Names of the strategy folders to read.
        metrics: Score columns to plot.
        plot_folder: Root directory for the generated plots.
    """
    for strategy in strategies:
        data = extract_scores(strategy)
        # Keep the per-strategy folder in its own variable: rebinding
        # `plot_folder` would nest each strategy inside the previous one.
        strategy_folder = get_plot_folder(plot_folder / strategy)
        globl = get_globals(data)
        for metric in metrics:
            for i, df in enumerate(data, 1):
                plot_boxplot(df, metric, strategy_folder / f'{metric}_{i}.pdf')
            plot_boxplot(globl, metric, strategy_folder / f'global_{metric}.pdf')
52+
53+
54+
def main():
    """Save the meta-learner score plots under ``plots/meta_learners``."""
    enabled_strategies = ['xgb_metalearner']  # , 'nn_metalearner']
    score_columns = ['srcc_score', 'wrc_score', 'dcg_score', 'ndcg_score']
    root = get_plot_folder('plots/meta_learners')
    plot_results(enabled_strategies, score_columns, root)


if __name__ == '__main__':
    main()

results_plots.py

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
import seaborn as sns
2+
import pandas as pd
3+
import numpy as np
4+
import matplotlib.pyplot as plt
5+
6+
from utils import get_plot_folder
7+
from functools import reduce
8+
from pathlib import Path
9+
import json
10+
import re
11+
12+
13+
def build_info(file_name, info, i):
    """Build one flat record from a single run's result dictionary.

    Args:
        file_name: Dataset identifier stored under ``'dataset'``.
        info: Parsed result file; list-valued entries are left out.
        i: Iteration identifier stored under ``'i'``.

    Returns:
        A dict with the scalar entries of ``info`` plus ``'dataset'``,
        ``'i'`` and ``'max_idx'``. When ``info`` has no scalar ``'max_idx'``
        it is set to the position of ``info['best_fn']`` inside
        ``info['scores']``, or ``None`` if that cannot be determined.
    """
    data = {key: value for key, value in info.items() if not isinstance(value, list)}
    data['dataset'] = file_name
    data['i'] = i
    if 'max_idx' not in data:
        try:
            max_idx = info['scores'].index(data['best_fn'])
        except (KeyError, ValueError, AttributeError, TypeError):
            # Missing keys, a best score absent from the list, or a
            # non-list 'scores' entry: the position is simply unknown.
            max_idx = None
        data['max_idx'] = max_idx
    return data
25+
26+
def extract_scores(metalearner_path: Path):
    """Load a meta-learner's result files grouped by iteration.

    Files are expected to be named ``<prefix>_<dataset>_<iteration>.json``
    with numeric dataset and iteration parts; JSON files that do not follow
    this scheme are skipped.

    Args:
        metalearner_path: Folder holding the result files.

    Returns:
        One DataFrame per iteration, in order of first appearance, each with
        one row per dataset; an empty list if no file matches.
    """
    file_re = re.compile(r'\w+_(\d+)_(\d+)\.json')
    by_iteration = {}
    for fn in metalearner_path.glob('*.json'):
        m = file_re.match(fn.name)
        if m is None:
            # Not a result file of this naming scheme; nothing to extract.
            continue
        dataset_name = m.group(1)
        iteration = int(m.group(2))

        # Open read-only and close promptly; nothing is written back.
        with open(fn, 'r') as fd:
            info = json.load(fd)

        by_iteration.setdefault(iteration, []).append(
            build_info(dataset_name, info, iteration))

    return [pd.DataFrame(rows) for rows in by_iteration.values()]
42+
43+
44+
def build_average_dataframe(dfs, metalearner):
    """Average every value column per dataset across several frames.

    Args:
        dfs: Iterable of DataFrames that each have a ``'dataset'`` column;
            every column other than ``'dataset'`` and ``'i'`` is averaged.
        metalearner: Value stored in the ``'i'`` column of every output row.

    Returns:
        A DataFrame with one row per dataset (in order of first appearance)
        holding ``'dataset'``, ``'i'`` and the ``np.mean`` of each collected
        column.
    """
    skipped = ('dataset', 'i')
    collected = {}
    for df in dfs:
        for _, row in df.iterrows():
            dataset = row['dataset']
            for column in df.columns:
                if column in skipped:
                    continue
                # setdefault creates the missing dataset/column containers,
                # replacing nested try/except KeyError fallbacks.
                collected.setdefault(dataset, {}).setdefault(column, []).append(row[column])

    aggregate_datasets = [
        {
            'dataset': ds,
            'i': metalearner,
            **{prop: np.mean(list_v) for prop, list_v in values.items()},
        }
        for ds, values in collected.items()
    ]
    return pd.DataFrame(aggregate_datasets)
64+
65+
66+
def plot_boxplot(data, prop_name, folder):
    """Save a titled box plot of one column of ``data``.

    ``folder`` is passed directly to ``plt.savefig`` as the output path; the
    figure is closed after saving.
    """
    figure = plt.figure(prop_name)
    figure.suptitle(prop_name)
    sns.boxplot(y=prop_name, data=data)
    plt.savefig(folder)
    plt.close()
71+
72+
73+
def plot_multiple_boxplot(data, prop_name, folder):
    """Save one box per value of column ``'i'`` for a given column.

    Boxes are coloured by ``'i'`` and described in the legend, so the x tick
    labels and x axis label are blanked. ``folder`` is passed directly to
    ``plt.savefig`` as the output path.
    """
    figure = plt.figure(prop_name)
    figure.suptitle(prop_name)
    axes = sns.boxplot(data=data, x='i', y=prop_name, hue='i', dodge=False)
    plt.legend(title='Estrategias', loc='best')
    axes.set(xticklabels=[])
    axes.set(xlabel=None)
    plt.savefig(folder)
    plt.close()
81+
82+
83+
def plot_histogram(data, metalearners, folder):
    """Save histograms comparing the first strategy with each other one.

    One panel is drawn per strategy after the first, showing the
    ``'best_fn'`` distribution of that strategy next to the first one's.
    The figure is written to ``folder / 'histogram'``.

    Args:
        data: DataFrame with an ``'i'`` column naming the strategy and a
            ``'best_fn'`` column.
        metalearners: Strategy names; the first one is the baseline.
        folder: Directory (``Path``) the figure is saved in.
    """
    autogoal = metalearners[0]
    others = metalearners[1:]
    if not others:
        # Only the baseline was given: there is nothing to compare it with.
        return
    # squeeze=False keeps `axs` two-dimensional even for a single row, so
    # indexing also works when just one strategy is compared.
    _, axs = plt.subplots(nrows=len(others), squeeze=False)
    for ax, metalearner in zip(axs[:, 0], others):
        df = data[(data['i'] == autogoal) | (data['i'] == metalearner)]
        ax.set_xlabel('Accuracy Obtenido')
        ax.set_ylabel('Datasets')
        sns.histplot(data=df, x='best_fn', hue='i', bins=20, ax=ax)
        ax.legend(title='Estrategias', loc='best', labels=[autogoal, metalearner])
    plt.savefig(folder / 'histogram')
    plt.close()
94+
95+
96+
def plot_results(metalearners, metalearners_path: Path, plot_folder: Path):
    """Generate every result plot for a set of strategies.

    Per strategy, box plots of each column are saved for every iteration and
    for the per-dataset average (the last index). The averages of all
    strategies are then compared with grouped box plots, histograms and a
    performance curve.

    Args:
        metalearners: Strategy names; each is also a sub-folder name.
        metalearners_path: Folder holding one result sub-folder per strategy.
        plot_folder: Root directory for the generated plots.
    """
    ignored = ('i', 'dataset')
    avg_results = []
    for metalearner in metalearners:
        metalearner_folder = get_plot_folder(plot_folder / metalearner)
        data = extract_scores(metalearners_path / metalearner)
        avg = build_average_dataframe(data, metalearner)
        data.append(avg)
        avg_results.append(avg)
        for i, df in enumerate(data):
            for column in df.columns:
                if column in ignored:
                    continue
                plot_boxplot(df, column, metalearner_folder / f'{column}_{i}')

    # Every average frame is indexed from 0; renumber so the combined frame
    # has unique row labels for the plotting calls below.
    df = pd.concat(avg_results, ignore_index=True)
    # Negative best scores are clamped to zero before plotting.
    df.loc[df['best_fn'] < 0, 'best_fn'] = 0
    for column in df.columns:
        if column in ignored:
            continue
        plot_multiple_boxplot(df, column, plot_folder / column)

    plot_histogram(df, metalearners, plot_folder)

    dfs = build_performance_info(metalearners, metalearners_path)
    plotting_performance(dfs, plot_folder / 'performance')
120+
121+
122+
def fix_performance_info(performance: list):
    """Return the running maximum of a sequence of scores.

    Element ``k`` of the result is the largest score seen in
    ``performance[:k + 1]``, so the result has the same length as the input.

    Args:
        performance: Scores in the order they were obtained.

    Returns:
        A new list with the best-so-far score at each position; empty for an
        empty input.
    """
    best_so_far = []
    for score in performance:
        # The first score seeds the list exactly once; afterwards a score
        # only replaces the previous best when it is strictly greater.
        if not best_so_far or score > best_so_far[-1]:
            best_so_far.append(score)
        else:
            best_so_far.append(best_so_far[-1])
    return best_so_far
130+
131+
132+
def build_performance_info(metalearners, metalearners_path: Path):
    """Collect the best-so-far score curves of every strategy.

    For each ``*.json`` file of a strategy, the ``'scores'`` list is clamped
    at zero, turned into a best-so-far curve and appended together with the
    position of each point on that curve.

    Args:
        metalearners: Strategy names; each is also a sub-folder name.
        metalearners_path: Folder holding one result sub-folder per strategy.

    Returns:
        A dict mapping each strategy name to a DataFrame with an ``'i'``
        column (position on the curve) and a column named after the strategy
        (score at that position).
    """
    dataframes = {}
    for metalearner in metalearners:
        data_dict = {'i': [], metalearner: []}
        metalearner_path = metalearners_path / metalearner
        for fn in metalearner_path.glob('*.json'):
            # Open read-only and close promptly; nothing is written back.
            with open(fn, 'r') as fd:
                info = json.load(fd)

            # Non-positive scores are clamped to zero.
            performance = [p if p > 0 else 0 for p in info['scores']]
            if not performance:
                continue
            performance = fix_performance_info(performance)
            data_dict[metalearner].extend(performance)
            data_dict['i'].extend(range(len(performance)))
        dataframes[metalearner] = pd.DataFrame(data_dict)
    return dataframes
151+
152+
153+
def plotting_performance(data, folder):
    """Save one line plot overlaying the score curve of every strategy.

    Args:
        data: Mapping of strategy name to a DataFrame with an ``'i'`` column
            and a column named after the strategy.
        folder: Output path handed to ``plt.savefig``.
    """
    plt.figure()
    for metalearner, df in data.items():
        # Label each line where it is drawn so the legend does not depend on
        # the order of the artists (lines and error bands) on the axes.
        sns.lineplot(data=df, x='i', y=metalearner, label=metalearner)
    plt.legend(loc='best', title='Estrategias')
    plt.ylabel(None)
    plt.xlabel('Iteraciones')
    plt.xlim([0, 200])
    plt.savefig(folder)
    plt.close()
164+
165+
166+
def main():
    """Generate the result plots for the 'paper' experiment.

    Earlier runs pointed the same pipeline at other result folders
    ('l1 distance', 'l2 distance', 'xgb_metalearner_v2', 'new paperr') with
    their own strategy names; swap the paths and names below to regenerate
    those plots.
    """
    output_root = get_plot_folder('plots/results/paper')
    strategy_names = [
        'Autogoal',
        'Vecinos Cercanos Simple',
        'Vecinos Cercanos Ponderado',
        'XGBRanker',
    ]
    results_root = Path('results/paper/results')
    plot_results(strategy_names, results_root, output_root)


if __name__ == '__main__':
    main()

utils.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from pathlib import Path
2+
3+
4+
def get_plot_folder(folder_path: str) -> Path:
    """Return ``folder_path`` as a ``Path``, creating the directory if needed.

    Missing parent directories are created as well. Both strings and
    ``Path`` objects are accepted.

    Args:
        folder_path: Directory that should exist after the call.

    Returns:
        The directory as a ``Path``.

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    folder = Path(folder_path)
    # exist_ok avoids the check-then-create race of testing exists() first.
    folder.mkdir(parents=True, exist_ok=True)
    return folder

0 commit comments

Comments
 (0)