EDA

# ruff: noqa
%reload_ext autoreload
%autoreload 2

# Standard library
import os
import json

# External libraries
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, cohen_kappa_score
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import rootutils
import hydra

# Local imports
import ml_project_2_mlp.utils as utils
# Reinitialize hydra on every run
hydra.core.global_hydra.GlobalHydra.instance().clear()
h = hydra.initialize(config_path="../conf", job_name="eda", version_base=None)

# Setup root environment
root_path = rootutils.setup_root(".")
rootutils.set_root(
    path=root_path,
    project_root_env_var=True,
)
# Global paths
ROOT_DIR = root_path
DATA_DIR = os.path.join(ROOT_DIR, "data")
ARTIFACT_DIR = os.path.join(ROOT_DIR, "artifacts")
FIGURE_DIR = os.path.join(ROOT_DIR, "report", "figures")
TABLE_DIR = os.path.join(ROOT_DIR, "report", "tables")

os.makedirs(FIGURE_DIR, exist_ok=True)
os.makedirs(TABLE_DIR, exist_ok=True)
# Costs per token based on https://openai.com/pricing
GPT4_COST_PER_INP_TOKEN = 0.00001
GPT4_COST_PER_OUT_TOKEN = 0.00003
GPT3_5_COST_PER_INP_TOKEN = 0.000001
GPT3_5_COST_PER_OUT_TOKEN = 0.000002
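
To make these constants concrete, a quick back-of-the-envelope example (the 350/100 token counts below are purely illustrative, not measured):

# Hypothetical request: 350 prompt tokens and 100 completion tokens with GPT-4
example_cost = 350 * GPT4_COST_PER_INP_TOKEN + 100 * GPT4_COST_PER_OUT_TOKEN
print(f"Estimated cost for this request: ${example_cost:.4f}")  # ≈ $0.0065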
rename_dict = {
    "human": "Human",
    "gpt3.5": "GPT-3.5",
    "gpt4": "GPT-4",
    "context1": "C1",
    "context2": "C2",
    "context3": "C3",
    "zeroshot": "0-shot",
    "oneshot": "1-shot",
    "f1": "Macro F1",
    "acc": "Acc",
    "precision": "Precision",
    "recall": "Recall",
    "lpp": "Labels Per Page",
    "model": "Model",
    "context": "Context",
    "shot": "Shot",
    "gpt-3.5-turbo-1106": "GPT-3.5",
    "gpt-4-1106-preview": "GPT-4",
    False: "0-shot",
    True: "1-shot",
    "Kids_and_Teens": "Kids & Teens",
}

def get_labeler_name(name: str):
    return " + ".join([rename_dict.get(n, n) for n in name.split("-")])

def get_metric_name(name: str):
    if "/" in name:
        split, metric = name.split("/")
        return f"{rename_dict.get(split, split)} {rename_dict.get(metric, metric)}"
    else:
        return rename_dict.get(name, name)
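
A quick illustration of the two helpers (the inputs below are just examples of the naming scheme used throughout this notebook):

print(get_labeler_name("gpt3.5-zeroshot-context1"))  # GPT-3.5 + 0-shot + C1
print(get_metric_name("val/f1"))                      # val Macro F1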
style = "whitegrid"
palette = "gist_stern"
sns.set_style(style)
sns.set_palette(palette)
def get_num_features(feat, data):
    return len([w[feat] for w in data.values() if w[feat] is not None and w[feat] != []])
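
A minimal sketch of what get_num_features counts, using a made-up two-website dict:

toy_data = {
    "w1": {"title": "Example", "keywords": []},
    "w2": {"title": None, "keywords": ["a", "b"]},
}
print(get_num_features("title", toy_data))     # 1 (only w1 has a non-empty title)
print(get_num_features("keywords", toy_data))  # 1 (only w2 has non-empty keywords)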

Websites


There are two corpora of websites in this dataset:

  • crowdsourced: 761 websites from the crowdsourced annotation set of the Homepage2Vec paper
  • curlie: A filtered version of the curlie dataset, containing ~1M websites

For each corpus, the repository contains a CSV file at the path data/raw/<corpus>.csv with two columns, wid and url: wid is a unique identifier for the website, and url is the URL of the website.
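
For instance, one of these raw corpora can be loaded directly with pandas (a minimal sketch, assuming the path layout described above):

crowdsourced_raw = pd.read_csv(os.path.join(DATA_DIR, "raw", "crowdsourced.csv"))
print(crowdsourced_raw.columns.tolist())  # expected: ['wid', 'url']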

# Initialise Config
crowdsourced_cfg = hydra.compose(config_name="eda", overrides=["data=crowdsourced"])
curlie_cfg = hydra.compose(config_name="eda", overrides=["data=curlie"])
# Load categories
path = os.path.join(DATA_DIR, "meta", "categories.json")
with open(path) as f:
    categories = json.load(f)

print(f"Classifying in {len(categories)} categories: {', '.join(categories)}")
Classifying in 14 categories: Arts, Business, Computers, Games, Health, Home, Kids_and_Teens, News, Recreation, Reference, Science, Shopping, Society, Sports

We will continue by exploring each of our datasets and store information about each individual dataset in a list of dictionaries.

webinfo = []

Crowdsourced Data

These are the annotations crowdsourced for 840 websites in the Homepage2Vec paper.

# Raw data
crowdsourced_data = hydra.utils.instantiate(crowdsourced_cfg.data)

raw_data = crowdsourced_data.get_raw_data()
processed_data = crowdsourced_data.get_processed_data()
embedded_data = crowdsourced_data.get_embeddings()

print(f"Total number of samples: {len(raw_data)}")
raw_data.head(5)
Total number of samples: 840
wid url
0 1161124 www.pointlesssites.com
1 1081241 www.connecticutplastics.com
2 1162420 99percentinvisible.org
3 1146040 www.medicaid.gov
4 1117243 www.graalonline.com
# Example of processed website
wid = list(processed_data.keys())[0]
data = processed_data[wid]

print(f"Collected data on {list(data.keys())}")

# Show some examples
print(f"\nTitle: {data['title']}")
print(f"Description: {data['description']}")
print(f"Keywords: {data['keywords']}")
print(f"Tags: {data['metatags']}")
print(f"Domain: {data['domain']}")
print(f"TLD: {data['tld']}")
Collected data on ['title', 'description', 'keywords', 'links', 'sentences', 'metatags', 'tld', 'domain']

Title: PointlessSites.com  Fun Things To Do When You're Bored
Description: Are you bored? Want something fun to do? Check out these funny websites, pointless facts and stupid pictures brought to you by Pointless Sites!
Keywords: ['Pointless', 'Sites', 'portal,', 'useless', 'pointlessness']
Tags: ['description', 'viewport', 'author', 'keywords', 'robots', 'copyright']
Domain: pointlesssites
TLD: com
# Setup info dict for original
original_info = dict()

# Save these into a dict
original_info["n"] = len(processed_data)
original_info["tld"] = get_num_features("tld", processed_data) / original_info["n"] * 100
original_info["domain"] = get_num_features("domain", processed_data) / original_info["n"] * 100
original_info["tags"] = get_num_features("metatags", processed_data) / original_info["n"] * 100
original_info["titles"] = get_num_features("title", processed_data) / original_info["n"] * 100
original_info["descriptions"] = get_num_features("description", processed_data) / original_info["n"] * 100
original_info["keywords"] = get_num_features("keywords", processed_data) / original_info["n"] * 100
original_info["links"] = get_num_features("links", processed_data) / original_info["n"] * 100
original_info["sentences"] = get_num_features("sentences", processed_data) / original_info["n"] * 100

for k, v in original_info.items():
    if k != "n":
        print(f"ℹ️ Percentage of sites with {k}: {v:.2f}%")
ℹ️ Percentage of sites with tld: 100.00%
ℹ️ Percentage of sites with domain: 100.00%
ℹ️ Percentage of sites with tags: 93.69%
ℹ️ Percentage of sites with titles: 98.42%
ℹ️ Percentage of sites with descriptions: 54.93%
ℹ️ Percentage of sites with keywords: 19.58%
ℹ️ Percentage of sites with links: 89.88%
ℹ️ Percentage of sites with sentences: 99.08%

Curlie

This is a random subset of 10,000 websites from the Curlie website index that we wish to re-annotate using the GPT labelers.

# Raw data
curlie_data = hydra.utils.instantiate(curlie_cfg.data)

raw_data = curlie_data.get_raw_data()
processed_data = curlie_data.get_processed_data()
embedded_data = curlie_data.get_embeddings()

print(f"Total number of samples: {len(raw_data)}")
raw_data.head(5)
Total number of samples: 10000
wid url
0 917678 www.winandsoft.fr
1 443072 gaude-ag.de
2 728091 www.housing.ucsb.edu
3 132596 www.daccad.nl
4 464355 www.rockhall.com
# Example of processed website
wid = list(processed_data.keys())[0]
data = processed_data[wid]

print(f"Collected data on {list(data.keys())}")

# Show some examples
print(f"\nTitle: {data['title']}")
print(f"Description: {data['description']}")
print(f"Keywords: {data['keywords']}")
print(f"Tags: {data['metatags']}")
print(f"Domain: {data['domain']}")
print(f"TLD: {data['tld']}")
Collected data on ['title', 'description', 'keywords', 'links', 'sentences', 'metatags', 'tld', 'domain']

Title: Logiciel bibliothèque, médiathèque, vidéothèque.
Description: Logiciel bibliothèque  logiciel permettant le catalogage de tous types de documents ou supports pour les besoins d'une bibliothèque. Le logiciel bibliothèque vous garantit la gestion documentaire la plus efficace (livres, cartes, manuscrits, incunables, etc.).
Keywords: ['logiciel', 'bibliotheque,logiciel', 'mediatheque,logiciel', 'gestion', 'bibliotheque,logiciel', 'gestion', 'mediatheque,logiciel,bibliotheque,mediatheque']
Tags: ['description', 'copyright', 'robots', 'keywords', 'author']
Domain: winandsoft
TLD: fr
curlie_info = dict()

# Save these into a dict
curlie_info["n"] = len(processed_data)
curlie_info["tld"] = get_num_features("tld", processed_data) / curlie_info["n"] * 100
curlie_info["domain"] = get_num_features("domain", processed_data) / curlie_info["n"] * 100
curlie_info["tags"] = get_num_features("metatags", processed_data) / curlie_info["n"] * 100
curlie_info["titles"] = get_num_features("title", processed_data) / curlie_info["n"] * 100
curlie_info["descriptions"] = get_num_features("description", processed_data) / curlie_info["n"] * 100
curlie_info["keywords"] = get_num_features("keywords", processed_data) / curlie_info["n"] * 100
curlie_info["links"] = get_num_features("links", processed_data) / curlie_info["n"] * 100
curlie_info["sentences"] = get_num_features("sentences", processed_data) / curlie_info["n"] * 100

for k, v in curlie_info.items():
    if k != "n":
        print(f"ℹ️ Percentage of sites with {k}: {v:.2f}%")
ℹ️ Percentage of sites with tld: 100.00%
ℹ️ Percentage of sites with domain: 100.00%
ℹ️ Percentage of sites with tags: 95.47%
ℹ️ Percentage of sites with titles: 98.28%
ℹ️ Percentage of sites with descriptions: 62.95%
ℹ️ Percentage of sites with keywords: 27.29%
ℹ️ Percentage of sites with links: 91.62%
ℹ️ Percentage of sites with sentences: 99.03%

LaTeX Table

# Put the data info into a dataframe
df = pd.DataFrame([original_info, curlie_info], index=["Crowdsourced", "Curlie-10k"]).round(2)

# Change all columns names to include (%)
df.columns = [f"{c} (%)" if c != "n" else c for c in df.columns]

df = df.T

# Save the dataframe to a latex table
position = "!ht"
latex = df.to_latex(
    caption="Percentage of websites with each feature accross our datasets.", 
    label="tab:feature-info",
    escape=True,
    float_format="%.2f",
    position=position
)

# Add \centering right after \begin{table}
latex = latex.replace("\\begin{table}" + f"[{position}]", "\\begin{table}" + f"[{position}]" + "\n\\centering")

# Save the latex table
save_path = os.path.join(root_path, "report", "tables", "feature-info.tex")
with open(save_path, "w") as f:
    f.write(latex)
print(f"✅ Saved table to {save_path}")
✅ Saved table to /Users/jonas-mika/epfl/coursework/projects/ml-project-2-mlp/report/tables/feature-info.tex

Labelers


There are multiple GPT labeler instances that can be used to label the data. The labelers are defined in the labelers module and are identified by a context, model and fewshot parameter. The context parameter defines which website features are included in the prompt. The model parameter defines the model that is used to generate the labels. The fewshot parameter defines whether an example website and label are injected into the system prompt (one-shot) or not (zero-shot).

Parameter  Variant   Description
context    context1  Uses the tld, domain and metatags as context
           context2  Uses the tld, domain, metatags, links and text as context
           context3  Uses the tld, domain, metatags, links, text and images as context
model      gpt3.5    Uses GPT-3.5 (gpt-3.5-turbo-1106)
           gpt4      Uses GPT-4 (gpt-4-1106-preview)
fewshot    oneshot   Injects an example website and label into the system prompt
           zeroshot  Does not inject any example website or label into the system prompt

We consider all contexts and few-shot settings for both the GPT-3.5 and GPT-4 models. Additionally, we have the human labeler, which simply loads the annotations for the original dataset from the Homepage2Vec paper.

# Initialise configuration for all labelers
gpt_labeler_names = [
    "gpt3.5-zeroshot-context1", 
    "gpt3.5-oneshot-context1",
    "gpt3.5-zeroshot-context2",
    "gpt3.5-oneshot-context2",
    "gpt3.5-zeroshot-context3",
    "gpt3.5-oneshot-context3",
    "gpt4-zeroshot-context1",
    "gpt4-oneshot-context1",
    "gpt4-zeroshot-context2",
    "gpt4-oneshot-context2",
    "gpt4-zeroshot-context3",
    "gpt4-oneshot-context3"
] 

gpt_labelers_cfg = {labeler: hydra.compose(config_name="eda", overrides=[f"labeler={labeler}"]) for labeler in gpt_labeler_names}
# Instantiate data
crowdsourced_data_cfg = hydra.compose(config_name="eda", overrides=["data=crowdsourced"])
crowdsourced_data = hydra.utils.instantiate(crowdsourced_data_cfg.data)
# Instantiate labelers
gpt_labelers = {labeler: hydra.utils.instantiate(cfg.labeler, data=crowdsourced_data) for labeler, cfg in gpt_labelers_cfg.items()}
# Instantiate labeler info dataframe
def get_info(labeler):
    return {
        "model": labeler.model,
        "context": labeler.name.split("-")[-1],
        "shot": labeler.fewshot,
    }

labeler_info = pd.DataFrame(
    [get_info(labeler) for labeler in gpt_labelers.values()],
    index=[get_labeler_name(name) for name in gpt_labelers.keys()])
labeler_info
model context shot
GPT-3.5 + 0-shot + C1 gpt-3.5-turbo-1106 context1 False
GPT-3.5 + 1-shot + C1 gpt-3.5-turbo-1106 context1 True
GPT-3.5 + 0-shot + C2 gpt-3.5-turbo-1106 context2 False
GPT-3.5 + 1-shot + C2 gpt-3.5-turbo-1106 context2 True
GPT-3.5 + 0-shot + C3 gpt-3.5-turbo-1106 context3 False
GPT-3.5 + 1-shot + C3 gpt-3.5-turbo-1106 context3 True
GPT-4 + 0-shot + C1 gpt-4-1106-preview context1 False
GPT-4 + 1-shot + C1 gpt-4-1106-preview context1 True
GPT-4 + 0-shot + C2 gpt-4-1106-preview context2 False
GPT-4 + 1-shot + C2 gpt-4-1106-preview context2 True
GPT-4 + 0-shot + C3 gpt-4-1106-preview context3 False
GPT-4 + 1-shot + C3 gpt-4-1106-preview context3 True

Let’s verify that the labelers are working as expected by checking the number of labeled webpages.

num_processed_websites = len(crowdsourced_data.get_processed_data())

print(f"ℹ️ Number of processed websites: {num_processed_websites}")
for name, labeler in gpt_labelers.items():
    num_labels = len(labeler.get_labels())
    print(f"ℹ️ Number of {name} labels: {num_labels}")
ℹ️ Number of processed websites: 761
ℹ️ Number of gpt3.5-zeroshot-context1 labels: 761
ℹ️ Number of gpt3.5-oneshot-context1 labels: 761
ℹ️ Number of gpt3.5-zeroshot-context2 labels: 761
ℹ️ Number of gpt3.5-oneshot-context2 labels: 761
ℹ️ Number of gpt3.5-zeroshot-context3 labels: 761
ℹ️ Number of gpt3.5-oneshot-context3 labels: 761
ℹ️ Number of gpt4-zeroshot-context1 labels: 761
ℹ️ Number of gpt4-oneshot-context1 labels: 761
ℹ️ Number of gpt4-zeroshot-context2 labels: 761
ℹ️ Number of gpt4-oneshot-context2 labels: 761
ℹ️ Number of gpt4-zeroshot-context3 labels: 761
ℹ️ Number of gpt4-oneshot-context3 labels: 761

Labeling statistics

Let’s investigate some statistics about the labelers. We will compute:

  • The average number of labels per website
  • The number of valid labels
  • The number of invalid labels
  • The average time taken to label a website
  • The average number of prompt and completion tokens used to label a website
  • The estimated cost of labeling the entire dataset
def get_statistics(labeler):
    statistics = {"valid": 0, "invalid": 0, "lpp": [], "durations": [], "prompt_tokens": [], "completion_tokens": []}
    for website in labeler.get_labels().values():
        if not website["is_valid"]:
            statistics["invalid"] += 1
            continue

        statistics["valid"] += 1
        statistics["lpp"].append(sum(website["labels"]))
        statistics["durations"].append(website["duration"])
        statistics["prompt_tokens"].append(website["prompt_tokens"])
        statistics["completion_tokens"].append(website["completion_tokens"])

    lpps = np.array(statistics["lpp"])
    durations = np.array(statistics["durations"])
    prompt_tokens = np.array(statistics["prompt_tokens"])
    completion_tokens = np.array(statistics["completion_tokens"])

    statistics["lpp"] = f"{lpps.mean():.2f} ± {lpps.std():.2f}"
    statistics["durations"] = f"{durations.mean():.2f} ± {durations.std():.2f}"
    statistics["prompt_tokens"] = f"{prompt_tokens.mean():.2f} ± {prompt_tokens.std():.2f}"
    statistics["completion_tokens"] = f"{completion_tokens.mean():.2f} ± {completion_tokens.std():.2f}"
    
    # Compute estimated cost
    model = labeler.model
    COST_PER_INP_TOKEN = GPT4_COST_PER_INP_TOKEN if "gpt-4" in model else GPT3_5_COST_PER_INP_TOKEN
    COST_PER_OUT_TOKEN = GPT4_COST_PER_OUT_TOKEN if "gpt-4" in model else GPT3_5_COST_PER_OUT_TOKEN

    statistics["estimated_cost"] = (prompt_tokens.sum() * COST_PER_INP_TOKEN + completion_tokens.sum() * COST_PER_OUT_TOKEN)
    statistics["cost_per_1k_page"] = 1000 * statistics["estimated_cost"] / statistics["valid"]

    return statistics
labeler_statistics = pd.DataFrame(
    [get_statistics(labeler) for labeler in gpt_labelers.values()], 
    index=[get_labeler_name(name) for name in gpt_labelers.keys()]) 
labeler_statistics
valid invalid lpp durations prompt_tokens completion_tokens estimated_cost cost_per_1k_page
GPT-3.5 + 0-shot + C1 761 0 0.39 ± 0.61 3.51 ± 5.87 148.80 ± 18.14 107.54 ± 2.75 0.276907 0.363873
GPT-3.5 + 1-shot + C1 761 0 0.91 ± 0.95 2.99 ± 4.61 292.80 ± 18.14 92.71 ± 3.31 0.363929 0.478225
GPT-3.5 + 0-shot + C2 761 0 1.39 ± 0.98 3.63 ± 6.79 206.30 ± 60.33 107.90 ± 1.48 0.321219 0.422101
GPT-3.5 + 1-shot + C2 761 0 1.68 ± 1.15 2.85 ± 0.42 448.30 ± 60.33 93.11 ± 4.06 0.482875 0.634527
GPT-3.5 + 0-shot + C3 761 0 1.57 ± 1.08 3.27 ± 0.69 350.13 ± 124.30 108.00 ± 0.06 0.430822 0.566126
GPT-3.5 + 1-shot + C3 761 0 1.85 ± 1.24 3.07 ± 4.31 615.13 ± 124.30 94.92 ± 6.18 0.612587 0.804976
GPT-4 + 0-shot + C1 761 0 1.50 ± 0.93 9.15 ± 3.44 148.80 ± 18.14 106.29 ± 8.45 3.558880 4.676583
GPT-4 + 1-shot + C1 760 1 1.83 ± 1.36 7.92 ± 2.73 292.60 ± 17.34 94.01 ± 10.96 4.367230 5.746355
GPT-4 + 0-shot + C2 761 0 2.16 ± 1.03 5.73 ± 1.98 206.30 ± 60.33 106.61 ± 7.14 4.003900 5.261367
GPT-4 + 1-shot + C2 761 0 2.49 ± 1.28 5.16 ± 2.35 448.30 ± 60.33 92.29 ± 2.52 5.518490 7.251629
GPT-4 + 0-shot + C3 761 0 2.30 ± 1.11 9.44 ± 3.09 350.13 ± 124.30 105.80 ± 8.67 5.079990 6.675414
GPT-4 + 1-shot + C3 761 0 2.80 ± 1.30 7.93 ± 2.70 615.13 ± 124.30 94.68 ± 6.23 6.842570 8.991551

Class Distribution

def get_label_dist(labelers):
    rows = []
    for name, class_dist in {name: labeler.get_class_dist(normalise=True) for name, labeler in labelers.items()}.items():
        for category, freq in class_dist.items():
            rows.append({"labeler": get_labeler_name(name), "category": category, "freq": 100 * freq})
    return pd.DataFrame(rows)

label_dist = get_label_dist(gpt_labelers)
fig, ax = plt.subplots(figsize=(15, 6))
fig.tight_layout(pad=3.0)
sns.barplot(
    label_dist,
    x="category",
    y="freq",
    hue="labeler",
    palette=palette,
    order=label_dist.groupby("category")["freq"].sum().sort_values(ascending=False).index,
    ax=ax
)
ax.get_legend().set_title("Labeler")
ax.set_xlabel("Category", fontsize=14)
ax.set_ylabel("Frequency (%)", fontsize=14)
ax.set_title("Original Label Distribution", fontsize=14)

print(f"✅ Plotted figure to {path}")
✅ Plotted figure to /Users/jonas-mika/epfl/coursework/projects/ml-project-2-mlp/data/meta/categories.json

Labeling Quality: Macro F1, Precision, Recall, etc.

The goal of all GPT labelers is to replicate the ground truth labels provided by the human annotators as closely as possible. As we only have human annotations for the crowdsourced dataset, we can only evaluate the labelers on this dataset.

# Initialise crowdsourced config and labeler
crowdsourced_data_cfg = hydra.compose(config_name="eda", overrides=["data=crowdsourced"])
human_labeler_cfg = hydra.compose(config_name="eda", overrides=["labeler=human", "data=crowdsourced"])

# Instantitate
crowdsourced_data = hydra.utils.instantiate(crowdsourced_data_cfg.data)
human_labeler = hydra.utils.instantiate(human_labeler_cfg.labeler, data=crowdsourced_data)
def match_labels(labeler1, labeler2, subset=None):
    # Restrict both labelers to the websites (wids) they have both labeled
    labels1 = labeler1.get_labels()
    labels2 = labeler2.get_labels()
    wid1 = set(labels1.keys())
    wid2 = set(labels2.keys())
    matched_wid = wid1 & wid2
    if subset:
        matched_wid = matched_wid & subset

    # Build the label matrices (n_matched_websites, n_categories) in the same wid order
    labels1 = np.array([labels1[wid]["labels"] for wid in matched_wid])
    labels2 = np.array([labels2[wid]["labels"] for wid in matched_wid])

    return labels1, labels2
def get_labeler_perf(labeler):
    labels1, labels2 = match_labels(human_labeler, labeler)

    acc = accuracy_score(labels1.flatten(), labels2.flatten())
    subset_acc = accuracy_score(labels1, labels2)
    macro_f1 = f1_score(labels1, labels2, average="macro")
    micro_f1 = f1_score(labels1, labels2, average="micro")
    weighted_f1 = f1_score(labels1, labels2, average="weighted")
    macro_precision = precision_score(labels1, labels2, average="macro", zero_division=0)
    micro_precision = precision_score(labels1, labels2, average="micro", zero_division=0)
    macro_recall = recall_score(labels1, labels2, average="macro", zero_division=0)
    micro_recall = recall_score(labels1, labels2, average="micro", zero_division=0)

    return {
        "acc": acc,
        "subset_acc": subset_acc,
        "macro_f1": macro_f1,
        "micro_f1": micro_f1,
        "weighted_f1": weighted_f1,
        "macro_precision": macro_precision,
        "micro_precision": micro_precision,
        "macro_recall": macro_recall,
        "micro_recall": micro_recall,
    }
labeler_perf = pd.DataFrame([get_labeler_perf(labeler) for labeler in gpt_labelers.values()], 
                            index=[get_labeler_name(name) for name in gpt_labelers.keys()])
labeler_perf
acc subset_acc macro_f1 micro_f1 weighted_f1 macro_precision micro_precision macro_recall micro_recall
GPT-3.5 + 0-shot + C1 0.827013 0.085414 0.159570 0.162653 0.155489 0.635233 0.606780 0.097283 0.093914
GPT-3.5 + 1-shot + C1 0.826826 0.091984 0.232630 0.289017 0.277751 0.565566 0.544267 0.161507 0.196747
GPT-3.5 + 0-shot + C2 0.836306 0.139290 0.375855 0.412003 0.411554 0.562262 0.576415 0.297584 0.320567
GPT-3.5 + 1-shot + C2 0.832551 0.127464 0.386863 0.440402 0.440510 0.526652 0.547582 0.320552 0.368311
GPT-3.5 + 0-shot + C3 0.828046 0.128778 0.372394 0.409794 0.410367 0.499444 0.530885 0.314836 0.333683
GPT-3.5 + 1-shot + C3 0.821476 0.115637 0.377024 0.426071 0.428640 0.464730 0.501420 0.328673 0.370409
GPT-4 + 0-shot + C1 0.820537 0.095926 0.355475 0.373115 0.374361 0.474167 0.497378 0.295117 0.298531
GPT-4 + 1-shot + C1 0.808804 0.069645 0.361039 0.381791 0.385413 0.452552 0.452844 0.320988 0.330010
GPT-4 + 0-shot + C2 0.828515 0.115637 0.453939 0.485497 0.490625 0.499528 0.524012 0.429976 0.452256
GPT-4 + 1-shot + C2 0.820255 0.086728 0.459306 0.496450 0.505203 0.484074 0.497628 0.461682 0.495278
GPT-4 + 0-shot + C3 0.820631 0.099869 0.440991 0.477441 0.484140 0.461169 0.498572 0.433199 0.458027
GPT-4 + 1-shot + C3 0.809461 0.077530 0.461246 0.496777 0.509009 0.457310 0.470865 0.492367 0.525708

Analysis

# Join with labeling statistics
labelers_df = pd.concat([labeler_info, labeler_statistics, labeler_perf], axis=1)
# Top-k performing labeler
top_k = 5
labelers_df.sort_values("macro_f1", ascending=False).head(top_k)
model context shot valid invalid lpp durations prompt_tokens completion_tokens estimated_cost cost_per_1k_page acc subset_acc macro_f1 micro_f1 weighted_f1 macro_precision micro_precision macro_recall micro_recall
GPT-4 + 1-shot + C3 gpt-4-1106-preview context3 True 761 0 2.80 ± 1.30 7.93 ± 2.70 615.13 ± 124.30 94.68 ± 6.23 6.842570 8.991551 0.809461 0.077530 0.461246 0.496777 0.509009 0.457310 0.470865 0.492367 0.525708
GPT-4 + 1-shot + C2 gpt-4-1106-preview context2 True 761 0 2.49 ± 1.28 5.16 ± 2.35 448.30 ± 60.33 92.29 ± 2.52 5.518490 7.251629 0.820255 0.086728 0.459306 0.496450 0.505203 0.484074 0.497628 0.461682 0.495278
GPT-4 + 0-shot + C2 gpt-4-1106-preview context2 False 761 0 2.16 ± 1.03 5.73 ± 1.98 206.30 ± 60.33 106.61 ± 7.14 4.003900 5.261367 0.828515 0.115637 0.453939 0.485497 0.490625 0.499528 0.524012 0.429976 0.452256
GPT-4 + 0-shot + C3 gpt-4-1106-preview context3 False 761 0 2.30 ± 1.11 9.44 ± 3.09 350.13 ± 124.30 105.80 ± 8.67 5.079990 6.675414 0.820631 0.099869 0.440991 0.477441 0.484140 0.461169 0.498572 0.433199 0.458027
GPT-3.5 + 1-shot + C2 gpt-3.5-turbo-1106 context2 True 761 0 1.68 ± 1.15 2.85 ± 0.42 448.30 ± 60.33 93.11 ± 4.06 0.482875 0.634527 0.832551 0.127464 0.386863 0.440402 0.440510 0.526652 0.547582 0.320552 0.368311
# Export table to latex
df = labelers_df.copy()

df["model"] = df["model"].apply(lambda x: rename_dict[x])
df["context"] = df["context"].apply(lambda x: rename_dict[x])
df["shot"] = df["shot"].apply(lambda x: rename_dict[x])
df["macro_f1"] = 100 * df["macro_f1"]

grouped = df.groupby(by=["model", "context", "shot"]).size()
df.index = pd.MultiIndex.from_tuples(grouped.index)

cols = {"lpp": "LPP", "cost_per_1k_page": "Cost ($)", "macro_f1": "M.-F1 (%)"}
df = df[cols.keys()].rename(columns=cols)

# Save the dataframe to a latex table
position = "!ht"
latex = df.to_latex(
    caption="\\textbf{Labeler Statistics.}", 
    label="tab:labeler-results",
    escape=True,
    float_format="%.2f",
    position=position,
    multirow=True,
    multicolumn=True,
    multicolumn_format="c",
)

# Add \centering right after \begin{table}
latex = latex.replace("\\begin{table}" + f"[{position}]", "\\begin{table}" + f"[{position}]" + "\n\\centering")
latex = latex.replace("[t]", "[c]")
print("❌ Saving disabled to prevent accidental overwrite - only turn on if numbers changed")
# save_path = os.path.join(TABLE_DIR, "labeler-results.tex")
# with open(save_path, "w") as f:
#     f.write(latex)
# print(f"✅ Saved table to {save_path}")
❌ Saving disabled to prevent accidental overwrite - only turn on if numbers changed
# Time-cost tradeoff
df = labelers_df.copy()
df["time"] = df["durations"].apply(lambda x: x.split(" ± ")[0]).astype(float)
df["cost"] = df["estimated_cost"].astype(float)

fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(
    data=df,
    x="cost",
    y="macro_f1",
    style="model",
    hue="context",
    palette=palette,
    s=200,
    alpha=0.8,
)
ax.set_ylim(0, 0.5)
ax.set_xlabel("Cost ($)")
ax.set_ylabel("Label Quality (M.-F1)")

# Rename legend labels
handles, labels = ax.get_legend_handles_labels()
labels = [f"{rename_dict.get(label, label)}" for label in labels]
ax.legend(handles=handles, labels=labels)

print(f"✅ Plotted figure to {path}")
✅ Plotted figure to /Users/jonas-mika/epfl/coursework/projects/ml-project-2-mlp/data/meta/categories.json

# Univariate performance
fig, axs = plt.subplots(ncols=3, figsize=(20, 5))
xs = ["model", "context", "shot"]

for ax, x in zip(axs, xs):
    sns.barplot(
        data=labelers_df,
        x=x,
        y="macro_f1",
        ax=ax
    )
    ax.set_xticks(ax.get_xticks())
    ax.set_xticklabels([rename_dict.get(t.get_text(), t.get_text()) for t in ax.get_xticklabels()])
    ax.set_xlabel(x.capitalize(), fontsize=14)
    ax.set_ylabel("")

axs[0].set_ylabel("Macro F1", fontsize=14)

print(f"✅ Plotted labeler quality as function of model, context and shot")
✅ Plotted labeler quality as function of model, context and shot

# Labeler parameters
params = ["model", "context", "shot"]
fig = utils.grid(labelers_df, params, metric="macro_f1", agg="mean", cmap="Blues", figsize=(8, 8), rename_dict=rename_dict) 

path = os.path.join(FIGURE_DIR, "labeler-grid.pdf")
fig.savefig(path, bbox_inches="tight")
print(f"✅ Saved figure to {path}")
✅ Saved figure to /Users/jonas-mika/epfl/coursework/projects/ml-project-2-mlp/report/figures/labeler-grid.pdf

Labeling Quality: Annotator agreement between GPT and Human

We have seen in the preprocessing notebook that the inter-annotator agreement between humans, measured per website via Cohen’s Kappa and averaged, was already relatively low (\(0.2\)). In this section, we therefore focus on measuring the agreement level between the GPT labelers and the human annotators.
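
As a reminder, for a single category Cohen’s Kappa compares the observed agreement \(p_o\) between two annotators with the agreement \(p_e\) expected by chance, \(\kappa = \frac{p_o - p_e}{1 - p_e}\), so \(\kappa = 1\) indicates perfect agreement while \(\kappa \approx 0\) indicates agreement no better than chance.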

gpt_human_agg = dict()
human_labels = np.array([v["labels"] for v in human_labeler.get_labels().values()])
for name, labeler in gpt_labelers.items():

    # Obtain 2d array with labels for each website (n_websites, n_categories)
    gpt_labels = np.array([v["labels"] for v in labeler.get_labels().values()])
    k = gpt_labels.shape[1]

    # Ensure matching between human and gpt labels
    human_labs_matched, gpt_labs_matched = match_labels(human_labeler, labeler)

    kappa_scores = []
    for j in range(k):

        # Obtain the labels for kth class by human and gpt
        human_kthcls_labels = human_labs_matched[:, j]
        gpt_kthcls_labels = gpt_labs_matched[:, j]

        # Compute the cohens kappa score
        kappa = cohen_kappa_score(human_kthcls_labels, gpt_kthcls_labels)

        # Append to list
        kappa_scores.append(kappa)


    # Save the results
    gpt_human_agg[name] = kappa_scores

Let’s now evaluate the results. First, we look at the overall picture using a dataframe:

agreement_df = pd.DataFrame(gpt_human_agg, index=categories).round(2)
agreement_df
gpt3.5-zeroshot-context1 gpt3.5-oneshot-context1 gpt3.5-zeroshot-context2 gpt3.5-oneshot-context2 gpt3.5-zeroshot-context3 gpt3.5-oneshot-context3 gpt4-zeroshot-context1 gpt4-oneshot-context1 gpt4-zeroshot-context2 gpt4-oneshot-context2 gpt4-zeroshot-context3 gpt4-oneshot-context3
Arts 0.16 0.29 0.47 0.47 0.46 0.42 0.33 0.30 0.48 0.49 0.47 0.51
Business 0.03 0.14 0.23 0.28 0.22 0.27 0.16 0.15 0.26 0.34 0.26 0.34
Computers 0.15 0.21 0.35 0.36 0.35 0.34 0.23 0.16 0.40 0.38 0.35 0.35
Games 0.21 0.30 0.38 0.40 0.42 0.40 0.25 0.28 0.38 0.39 0.34 0.36
Health 0.22 0.18 0.42 0.40 0.39 0.38 0.36 0.34 0.45 0.43 0.46 0.43
Home 0.03 0.01 0.11 0.09 0.06 0.08 0.13 0.16 0.15 0.16 0.13 0.21
Kids_and_Teens 0.10 0.18 0.15 0.17 0.08 0.15 0.27 0.21 0.25 0.34 0.23 0.31
News 0.14 0.13 0.29 0.27 0.24 0.21 0.25 0.23 0.29 0.30 0.25 0.32
Recreation 0.01 0.12 0.29 0.25 0.27 0.18 0.24 0.21 0.32 0.25 0.31 0.21
Reference 0.04 0.06 0.18 0.19 0.12 0.04 0.20 0.29 0.29 0.27 0.25 0.24
Science 0.12 0.09 0.25 0.31 0.21 0.31 0.28 0.31 0.42 0.33 0.37 0.35
Shopping 0.25 0.27 0.36 0.32 0.39 0.35 0.31 0.33 0.37 0.40 0.38 0.35
Society 0.05 0.09 0.19 0.27 0.25 0.29 0.23 0.22 0.35 0.37 0.34 0.36
Sports 0.16 0.13 0.38 0.29 0.42 0.36 0.36 0.31 0.45 0.41 0.46 0.44

Let’s now look at summary statistics of the agreement between the GPT labelers and the human annotators:

agreement_stats = agreement_df.describe().round(2)
agreement_stats
gpt3.5-zeroshot-context1 gpt3.5-oneshot-context1 gpt3.5-zeroshot-context2 gpt3.5-oneshot-context2 gpt3.5-zeroshot-context3 gpt3.5-oneshot-context3 gpt4-zeroshot-context1 gpt4-oneshot-context1 gpt4-zeroshot-context2 gpt4-oneshot-context2 gpt4-zeroshot-context3 gpt4-oneshot-context3
count 14.00 14.00 14.00 14.00 14.00 14.00 14.00 14.00 14.00 14.00 14.00 14.00
mean 0.12 0.16 0.29 0.29 0.28 0.27 0.26 0.25 0.35 0.35 0.33 0.34
std 0.08 0.09 0.11 0.10 0.13 0.12 0.07 0.07 0.09 0.08 0.10 0.08
min 0.01 0.01 0.11 0.09 0.06 0.04 0.13 0.15 0.15 0.16 0.13 0.21
25% 0.04 0.10 0.20 0.26 0.21 0.19 0.23 0.21 0.29 0.31 0.25 0.31
50% 0.13 0.14 0.29 0.29 0.26 0.30 0.25 0.26 0.36 0.36 0.34 0.35
75% 0.16 0.20 0.38 0.35 0.39 0.36 0.30 0.31 0.42 0.40 0.38 0.36
max 0.25 0.30 0.47 0.47 0.46 0.42 0.36 0.34 0.48 0.49 0.47 0.51

We can see that as the complexity of the labeler increases, the average agreement with the human annotators increases as well, while the standard deviation stays roughly the same. Interestingly, for some categories the GPT-4-based labelers reach an agreement of almost \(0.5\).

Curlie

In this section we explore the labels of the Curlie-10k subset that we relabel with the most promising GPT labelers, selected based on the label quality and labeling cost shown in the section above.

# Initialise data and labeler config
curlie_cfg = hydra.compose(config_name="eda", overrides=["data=curlie"])
curlie_labeler_names = ["gpt3.5-oneshot-context2", "gpt4-zeroshot-context2"]
curlie_labeler_cfg = {name: hydra.compose(config_name="eda", overrides=[f"labeler={name}"]) for name in curlie_labeler_names}
# Instantiate data
curlie_data = hydra.utils.instantiate(curlie_cfg.data)
curlie_labeler = {name: hydra.utils.instantiate(cfg.labeler, data=curlie_data) for name, cfg in curlie_labeler_cfg.items()}
print(f"✅ Initialised {len(curlie_labeler)} labeler(s).")
✅ Initialised 2 labeler(s).

We expect the LPP to be higher than in the original Curlie annotations, since the labelers were chosen to replicate the multi-label human annotations.

values = [get_statistics(labeler) for labeler in curlie_labeler.values()]
index = [get_labeler_name(name) for name in curlie_labeler.keys()]
labeler_statistics = pd.DataFrame(values, index=index)
labeler_statistics
valid invalid lpp durations prompt_tokens completion_tokens estimated_cost cost_per_1k_page
GPT-3.5 + 1-shot + C2 9190 0 1.60 ± 1.08 2.09 ± 1.04 470.16 ± 87.36 92.99 ± 3.85 6.029949 0.656142
GPT-4 + 0-shot + C2 9190 0 2.03 ± 0.97 7.80 ± 3.32 228.16 ± 87.36 106.16 ± 8.10 50.235890 5.466365

Let’s first look at the distribution of labels for each category.

# Plot label distribution
label_dist = get_label_dist(curlie_labeler)
original_curlie_dist = {
 'Business': 27.6,
 'Society': 13.9,
 'Arts': 9.3,
 'Home': 8.4,
 'Health': 7.4,
 'Recreation': 8.4,
 'Computers': 6.2,
 'Shopping': 7.4,
 'Reference': 4.3,
 'Science': 4.8,
 'Sports': 6.8,
 'Kids_and_Teens': 1.1,
 'News': 1.1,
 'Games': 1.7
}
for category, freq in original_curlie_dist.items():
    label_dist.loc[len(label_dist)] = ["Original Curlie", category, freq]

renamer2 = {"Original Curlie": "Original Curlie", "GPT-3.5 + 1-shot + C2": "GPT-3.5", "GPT-4 + 0-shot + C2": "GPT-4"}
label_dist["labeler"] = label_dist["labeler"].apply(lambda x: renamer2[x])

label_dist["labeler"] = pd.Categorical(label_dist['labeler'], ["Original Curlie", "GPT-3.5", "GPT-4"], ordered=True)
label_dist = label_dist.sort_values("labeler")

fig, ax = plt.subplots(figsize=(7, 3))
fig.tight_layout(pad=1.0)
sns.barplot(
    label_dist,
    y="freq",
    x="category",
    hue="labeler",
    order=label_dist.groupby("category")["freq"].sum().sort_values(ascending=False).index,
    palette={"Original Curlie": "#852D48", "GPT-3.5": "#49498D", "GPT-4": "#6D6DD3"},
    ax=ax
)
ax.get_legend().set_title("Labeler")
ax.set_xlabel("")
ax.set_ylabel("(%)")
ax.set_xticks(ax.get_xticks())
ax.set_xticklabels([rename_dict.get(x.get_text(), x.get_text()) for x  in ax.get_xticklabels()], rotation=40)

path = os.path.join(FIGURE_DIR, "curlie-10k-dist.pdf")
fig.savefig(path, bbox_inches="tight")
print(f"✅ Saved figure to {path}")
✅ Saved figure to /Users/jonas-mika/epfl/coursework/projects/ml-project-2-mlp/report/figures/curlie-10k-dist.pdf