# ruff: noqa
%reload_ext autoreload
%autoreload 2
# Standard imports
import os
import json
# External imports
import hydra
import rootutils
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from wandb.sdk.wandb_run import Run
import wandb
Analysis
# Reinitialize hydra on every run
hydra.core.global_hydra.GlobalHydra.instance().clear()
h = hydra.initialize(config_path="../conf", job_name="eda", version_base=None)

# Setup root environment
root_path = rootutils.setup_root(".")
rootutils.set_root(
    path=root_path,
    project_root_env_var=True,
)

# Global paths
ROOT_DIR = root_path
ARTIFACT_DIR = os.path.join(ROOT_DIR, "artifacts")
FIGURE_DIR = os.path.join(ROOT_DIR, "report", "figures")
TABLE_DIR = os.path.join(ROOT_DIR, "report", "tables")

# Global settings
SAVE = False

os.makedirs(FIGURE_DIR, exist_ok=True)
os.makedirs(TABLE_DIR, exist_ok=True)
# Global Labeling
rename_dict = {
    "human": "Human",
    "gpt3.5": "GPT-3.5",
    "gpt4": "GPT-4",
    "context1": "Context 1",
    "context2": "Context 2",
    "context3": "Context 3",
    "zeroshot": "0-shot",
    "oneshot": "1-shot",
    "f1": "Macro F1",
    "acc": "Acc",
    "precision": "Precision",
    "recall": "Recall",
    "lpp": "Labels Per Page",
    "Kids_and_Teens": "Kids & Teens",
}
def rename(x):
    return rename_dict.get(x, x)


def get_labeler_name(name: str):
    return " + ".join([rename_dict.get(n, n) for n in name.split("-")])


def get_metric_name(name: str):
    if "/" in name:
        split, metric = name.split("/")
        return f"{rename_dict.get(split, split)} {rename_dict.get(metric, metric)}"
    else:
        return rename_dict.get(name, name)
# Load categories
with open(os.path.join("..", "data", "meta", "categories.json"), "r") as f:
    categories_and_desc = json.load(f)

categories, categories_desc = zip(*categories_and_desc.items())
idx2categories = dict(enumerate(categories))

# Style and colors
sns.set_style("whitegrid")
sns.set_palette("gist_stern")
# Initialise W&B
WANDB_PROJECT = "homepage2vec"
WANDB_ENTITY = "ml-project-2-mlp"

# Initialize W&B API
api = wandb.Api()

# Get all runs
runs = api.runs(f"{WANDB_ENTITY}/{WANDB_PROJECT}")
print(f"✅ Loaded {len(runs)} runs from W&B ({WANDB_ENTITY}/{WANDB_PROJECT})")
✅ Loaded 625 runs from W&B (ml-project-2-mlp/homepage2vec)
Helpers
First, we define a series of helper functions which we use to extract the relevant information (configuration, hyperparameters, performance metrics, etc.) from the W&B runs.
# Helpers
def extract_config(run: Run) -> dict:
    """
    Extracts the config from a run.
    """
    data_attr = ["name"]
    labeler_attr = ["name", "model", "fewshot", "features"]
    train_data = {f"train_data_{k}": v for k, v in run.config.get("train_data", {}).items() if k in data_attr}
    test_data = {f"test_data_{k}": v for k, v in run.config.get("test_data", {}).items() if k in data_attr}
    train_labeler = {f"train_labeler_{k}": v for k, v in run.config.get("train_labeler", {}).items() if k in labeler_attr}
    test_labeler = {f"test_labeler_{k}": v for k, v in run.config.get("test_labeler", {}).items() if k in labeler_attr}
    train_ratio = run.config.get("train_datamodule", {}).get("data_split", [None])[0]
    val_ratio, test_ratio = run.config.get("test_datamodule", {}).get("data_split", [None, None, None])[1:]

    config = {
        "id": run.id,
        "name": run.name,
        "finetune": run.config["finetune"],
        "train_ratio": train_ratio,
        "val_ratio": val_ratio,
        "test_ratio": test_ratio,
        **train_data,
        **test_data,
        **train_labeler,
        **test_labeler,
    }

    return config
def extract_hparams(run: Run) -> dict:
    """
    Extracts the hparams from a run.
    """
    hparams = {
        "lr": run.config.get("model", {}).get("optimizer", {}).get("lr", None),
        "weight_decay": run.config.get("model", {}).get("optimizer", {}).get("weight_decay", None),
        "scheduler_factor": run.config.get("model", {}).get("scheduler", {}).get("factor", None),
        "batch_size": run.config.get("train_datamodule", {}).get("batch_size", None),
    }
    return hparams
def extract_summary(run: Run, exclude: list[str] = ["test/cm", "test/report"]) -> dict:
    """
    Extracts the summary from a run.
    """
    summary = {k: v for k, v in run.summary.items() if not k.startswith("_") and k not in exclude}
    return summary
def runs_to_df(runs: list[Run]) -> pd.DataFrame:
    """
    Convert a list of W&B runs to a dataframe.
    """
    # Extract information from runs
    rows = []
    for run in runs[::-1]:
        config = extract_config(run)
        hparams = extract_hparams(run)
        summary = extract_summary(run)
        rows.append({**config, **hparams, **summary})

    # Add multi-index
    columns = list(config.keys()) + list(hparams.keys()) + list(summary.keys())
    config_tuples = [("config", k) for k in config.keys()]
    hparams_tuples = [("hparams", k) for k in hparams.keys()]
    summary_tuples = [("summary", k) for k in summary.keys()]

    # Create dataframe
    run_df = pd.DataFrame(rows, columns=columns)
    run_df.columns = pd.MultiIndex.from_tuples(
        config_tuples + hparams_tuples + summary_tuples
    )
    run_df.set_index(("config", "id"), inplace=True)

    return run_df
def best_runs(df_runs: pd.DataFrame, split: str = "val", metric: str = "f1") -> pd.DataFrame:
    """
    Get the best runs based on the validation metric for
    each unique combination of data and labeler, as specified
    in the run name.

    Args:
        df_runs (pd.DataFrame): Dataframe of runs.
        split (str): Split to select the metric from.
        metric (str): Metric to sort on.

    Returns:
        pd.DataFrame: Dataframe of best runs.
    """
    experiment_cols = [("config", "train_labeler_name"), ("config", "finetune")]
    unique_exps = df_runs[experiment_cols].drop_duplicates()
    best_runs = []
    for unique_exp in unique_exps.values:
        is_unique_exp = (df_runs[experiment_cols] == unique_exp).all(axis=1)
        best_exp_run = df_runs[is_unique_exp].sort_values(("summary", f"{split}/{metric}"), ascending=False).iloc[0]
        best_runs.append(best_exp_run)

    return pd.DataFrame(best_runs)
def get_test_cm(run: Run) -> pd.DataFrame:
    """
    Extracts the test confusion matrix from a run.
    """
    test_cm = run.summary.get("test/cm", None)
    if test_cm is None:
        return None
    test_cm = pd.DataFrame.from_dict(json.loads(test_cm))
    test_cm["category"].replace(idx2categories, inplace=True)
    test_cm.set_index("category", inplace=True)
    return test_cm
def get_test_report(run: Run) -> pd.DataFrame:
    """
    Extracts the test report from a run.
    """
    test_report = run.summary.get("test/report", None)
    if test_report is None:
        return None
    test_report = pd.DataFrame.from_dict(json.loads(test_report))
    mapper_ = {str(idx): category for idx, category in idx2categories.items()}
    test_report["category"] = test_report["category"].map(lambda x: mapper_.get(x, x))
    return test_report
def get_test_targets(run: Run) -> pd.DataFrame:
    """
    Extracts the test targets from a run.
    """
    test_targets = json.loads(run.summary.get("test/targets"))
    return pd.DataFrame(test_targets, columns=categories)


def get_test_preds(run: Run) -> pd.DataFrame:
    """
    Extracts the test predictions from a run.
    """
    test_preds = json.loads(run.summary.get("test/preds"))
    return pd.DataFrame(test_preds, columns=categories)


def get_test_probs(run: Run) -> pd.DataFrame:
    """
    Extracts the test probabilities from a run.
    """
    test_probs = json.loads(run.summary.get("test/probs"))
    return pd.DataFrame(test_probs, columns=categories)
def get_test_reports_df(runs: list[Run]) -> pd.DataFrame:
    """
    Extracts and concatenates the test reports from a list of runs.
    """
    test_reports_df = pd.DataFrame()
    for run in runs:
        run_config = extract_config(run)
        test_report = get_test_report(run)

        for k, v in run_config.items():
            test_report[k] = v

        # Concatenate
        test_reports_df = pd.concat([test_reports_df, test_report])

    return test_reports_df
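As a quick sanity check, the helpers above can also be composed on a single run before building the full dataframe. The following is only a minimal sketch, assuming the runs loaded earlier are non-empty and expose the expected config keys (e.g. finetune):

# Minimal usage sketch (assumption: `runs` is non-empty and has the expected config)
sample_run = runs[0]
print(extract_config(sample_run))   # run metadata, data/labeler names, split ratios
print(extract_hparams(sample_run))  # lr, weight_decay, scheduler_factor, batch_size
print(extract_summary(sample_run))  # logged metrics such as val/f1 and test/f1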
Results
In this section, we load all runs in the group that were fine-tuned on curlie-gpt3.5-10k and curlie-gpt4-10k, respectively. We aim to analyse and visualise the overall performance of these models in comparison to the original Homepage2Vec model. Finally, we analyse the hyperparameter grid.
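Before filtering, it can help to confirm that the target group actually exists among the loaded runs. This is a small optional sketch (not part of the original analysis), assuming every loaded run exposes a group attribute:

# Minimal sketch: count runs per W&B group to verify the target group exists
from collections import Counter

group_counts = Counter(run.group for run in runs)
print(group_counts.most_common())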
# Filter runs for Experiment 2
GROUP = "exp2-3"

runs = [run for run in runs if run.group == GROUP and run.state == "finished"]

print(f"✅ Loaded {len(runs)} runs from W&B ({WANDB_ENTITY}/{WANDB_PROJECT} - {GROUP})")
✅ Loaded 180 runs from W&B (ml-project-2-mlp/homepage2vec - exp2-3)
# Convert to dataframe
runs_df = runs_to_df(runs)

# Get best runs by validation macro F1
best_runs_df = best_runs(runs_df, split="val", metric="f1_best")

# Show best runs sorted by test macro F1
best_runs_df.sort_values(("summary", "test/f1"), ascending=False).hparams
| id | lr | weight_decay | scheduler_factor | batch_size |
|---|---|---|---|---|
| fnvzgxir | 0.000016 | 0.064037 | 0.376673 | 64 |
| syqivch6 | 0.001535 | 0.000252 | 0.460896 | 64 |
| 81xcxk0e | 0.000100 | 0.000000 | 0.100000 | 32 |
Finetuning Results Table
# Save copy of best runs dataframe
df = best_runs_df.copy()

index = [x.config["train_labeler_name"] if x.config["finetune"] else "Pretrained" for _, x in df.iterrows()]
index = [get_labeler_name(x) for x in index]
df = df.summary[["test/precision", "test/recall", "test/f1", "test/lpp"]]

df = df.set_index(pd.Index(index))

cols = {"test/precision": "Pr.", "test/recall": "Re.", "test/f1": "M.-F1", "test/lpp": "LPP"}
df = df[cols.keys()].rename(columns=cols)

df["Pr."] = df["Pr."] * 100
df["Re."] = df["Re."] * 100
df["M.-F1"] = df["M.-F1"] * 100
# Save the dataframe to a latex table
position = "!ht"
save_path = os.path.join(TABLE_DIR, "finetune-results.tex")
latex = df.to_latex(
    caption="TODO",
    label="tab:finetune-results",
    escape=True,
    position=position,
    multirow=True,
    float_format="%.2f",
    multicolumn=True,
    multicolumn_format="c",
)
latex = latex.replace("\\begin{table}" + f"[{position}]", "\\begin{table}" + f"[{position}]" + "\n\\centering")
latex = latex.replace("[t]", "[c]")

# Save table if specified
if SAVE:
    with open(save_path, "w") as f:
        f.write(latex)
    print(f"✅ Saved table to {save_path}")
else:
    print("❌ Not saving table. If you want to save it, set SAVE=True")
❌ Not saving table. If you want to save it, set SAVE=True
Finetuning Classwise F1
# Extract best runs
pretrained_runs = best_runs_df[best_runs_df[("config", "finetune")] == False]
finetuned_runs = best_runs_df[best_runs_df[("config", "finetune")] == True]

pretrained_run_ids = pretrained_runs.index.values
finetuned_run_ids = finetuned_runs.index.values

pretrained_runs = [run for run in runs if run.id in pretrained_run_ids]
finetuned_runs = [run for run in runs if run.id in finetuned_run_ids]

print(f"Got {len(pretrained_runs)} pretrained run(s) and {len(finetuned_runs)} finetuned run(s) for {GROUP}")
Got 1 pretrained run(s) and 2 finetuned run(s) for exp2-3
# Get best runs
pretrained_run = pretrained_runs[0]
gpt3_5_run = finetuned_runs[0]
gpt4_run = finetuned_runs[1]

pretrained_cms = get_test_cm(pretrained_run)
gpt3_5_cms = get_test_cm(gpt3_5_run)
gpt4_cms = get_test_cm(gpt4_run)
# Visualise the macro F1
pretrained_report = get_test_report(pretrained_run)
gpt3_5_report = get_test_report(gpt3_5_run)
gpt4_report = get_test_report(gpt4_run)

pretrained_report["model"] = "Pretrained"
gpt3_5_report["model"] = "GPT-3.5"
gpt4_report["model"] = "GPT-4"

test_reports = pd.concat([pretrained_report, gpt3_5_report, gpt4_report])

test_reports = test_reports[test_reports["category"].isin(categories)]
test_reports["category"] = test_reports["category"].map(lambda x: rename(x))
test_reports["f1-score"] = test_reports["f1-score"] * 100
fig, ax = plt.subplots(figsize=(16, 4))
fig.tight_layout(pad=3.0)

sns.barplot(
    data=test_reports,
    x="category",
    y="f1-score",
    hue="model",
    ax=ax,
)
ax.set_xlabel("")
ax.set_ylabel("F1 (%)", fontsize=14)

ax.set_xticks(ax.get_xticks())
ax.set_xticklabels(ax.get_xticklabels(), rotation=30, horizontalalignment="right", fontsize=14)
ax.get_legend().set_title("")

# Add values to bars
for p in ax.patches:
    height = p.get_height()
    if height == 0:
        continue
    ax.text(p.get_x() + p.get_width() / 2.,
            height + 1,
            f"{height:.0f}",
            ha="center", fontsize=12)

ax.set_ylim(0, 70)

# Save figure
path = os.path.join(FIGURE_DIR, "finetune-results.pdf")
fig.savefig(path, bbox_inches="tight")
print(f"✅ Saved figure to {path}")
✅ Saved figure to /Users/jonas-mika/epfl/coursework/projects/ml-project-2-mlp/report/figures/finetune-results.pdf
Hyperparameter Grid
# Create new dataframe with hyperparameters and test/f1
grid_df = runs_df["hparams"].copy()
grid_df["test/f1"] = runs_df["summary"]["test/f1"]
# Hyperparameters
fig, axs = plt.subplots(ncols=4, figsize=(20, 5))
params = ["lr", "weight_decay", "scheduler_factor", "batch_size"]

for ax, x in zip(axs, params):
    sns.histplot(
        data=grid_df,
        x=x,
        y="test/f1",
        ax=ax,
    )
    ax.set_xticks(ax.get_xticks())
    ax.set_xticklabels([rename_dict.get(t.get_text(), t.get_text()) for t in ax.get_xticklabels()])
    ax.set_xlabel(x.capitalize(), fontsize=14)
    ax.set_ylabel("")

axs[0].set_ylabel("Macro F1", fontsize=14)