Spaces:

ought
/

raft-leaderboard

Running

File size: 3,712 Bytes

import os
from datetime import datetime
from pathlib import Path
from re import sub

import pandas as pd
import requests
import streamlit as st
from datasets import get_dataset_config_names
from dotenv import load_dotenv

# Load local development settings (e.g. HF_HUB_TOKEN) from a .env file when present.
if Path(".env").is_file():
    load_dotenv(".env")

# Hugging Face Hub access token; None when the environment variable is not set.
auth_token = os.getenv("HF_HUB_TOKEN")
# Headers attached to every Hub API request in this file. Without a token the
# requests go out unauthenticated instead of the module crashing at import
# time on `"Bearer " + None`.
header = {"Authorization": "Bearer " + auth_token} if auth_token else {}

# Configuration names of the ought/raft dataset; used below as the task columns.
TASKS = get_dataset_config_names("ought/raft")
# Display titles for the tasks: split on underscores and capitalize each piece,
# e.g. banking_77 => Banking 77
FORMATTED_TASK_NAMES = [
    " ".join(map(str.capitalize, task_id.split("_"))) for task_id in TASKS
]


def extract_tags(dataset):
    """Collect the ``key:value`` tags of a dataset entry into a dict.

    Args:
        dataset: Mapping describing one dataset; its optional ``"tags"`` entry
            is an iterable of tag strings (``None`` is treated as empty).

    Returns:
        Dict mapping the text before the first ``:`` of each tag to the text
        after it. Tags without a ``:`` carry no key/value pair and are
        skipped. When a key occurs more than once, the last occurrence wins.
    """
    tags = {}
    for tag in dataset.get("tags") or []:
        # partition never raises, unlike unpacking the result of split(":", 1)
        # on a tag that has no colon.
        key, sep, value = tag.partition(":")
        if not sep:
            continue
        tags[key] = value
    return tags


def download_submissions():
    """Fetch the dataset listing from the Hub and keep the RAFT evaluation repos.

    Returns:
        List of dataset entries (dicts from the API response) whose tags
        include ``benchmark:ought/raft`` and ``type:evaluation``, sorted
        ascending by the integer after the last ``-`` in each entry's ``id``.

    Raises:
        requests.HTTPError: If the Hub API answers with an error status.
        ValueError: If a matching entry's ``id`` does not end in an integer.
    """
    # HTTPS so the bearer token in `header` is never sent in clear text.
    response = requests.get(
        "https://huggingface.co/api/datasets",
        headers=header,
        timeout=30,  # seconds; avoids hanging the app forever on a stalled connection
    )
    # Fail with a clear HTTP error rather than trying to parse an error payload.
    response.raise_for_status()

    submissions = []
    for dataset in response.json():
        tags = extract_tags(dataset)
        if tags.get("benchmark") == "ought/raft" and tags.get("type") == "evaluation":
            submissions.append(dataset)

    # NOTE(review): the numeric id suffix is presumably a timestamp, which
    # would make this oldest-first -- confirm against the submission tooling.
    submissions.sort(key=lambda entry: int(entry["id"].rsplit("-", 1)[-1]))
    return submissions


def format_submissions(submissions):
    """Build the leaderboard table from the most recent evaluation repos.

    For each of the last two entries in ``submissions`` the full dataset
    record is fetched from the Hub API, and its ``card_data`` is read for the
    submission name, the submission id and the per-task results.

    Args:
        submissions: List of dataset entries, each with an ``"id"`` key.

    Returns:
        DataFrame with the columns ``Rank``, ``Submission``, ``Date`` and
        ``Overall`` followed by one column per task (renamed to the titles in
        ``FORMATTED_TASK_NAMES``). Rows are sorted by ``Overall`` descending
        and ``Rank`` is the 1-based position in that order.

    Raises:
        requests.HTTPError: If the Hub API answers with an error status.
        KeyError: If a record lacks one of the card fields read here, or
            reports a task name that is not in ``TASKS``.
    """
    submission_data = {"Submission": [], "Date": [], **{t: [] for t in TASKS}}

    # TODO(lewtun): delete / filter all the junk repos from development
    # The following picks the latest submissions which adhere to the model card schema
    for submission in submissions[-2:]:
        repo_id = submission["id"]
        # HTTPS so the bearer token in `header` is never sent in clear text.
        response = requests.get(
            f"https://huggingface.co/api/datasets/{repo_id}?full=true",
            headers=header,
            timeout=30,  # seconds; avoids hanging the app on a stalled connection
        )
        response.raise_for_status()
        card_data = response.json()["card_data"]
        submission_data["Submission"].append(card_data["submission_dataset"])

        # The submission id ends in "-<integer>", which is parsed as the date.
        # NOTE(review): pd.to_datetime reads a bare integer as nanoseconds
        # since the epoch -- confirm that matches how the id is generated.
        timestamp = pd.to_datetime(int(card_data["submission_id"].split("-")[-1]))
        submission_data["Date"].append(timestamp.date())

        for task in card_data["results"]:
            task_data = task["task"]
            # Only the first metric listed for each task is displayed.
            submission_data[task_data["name"]].append(task_data["metrics"][0]["value"])

    df = pd.DataFrame(submission_data)
    df.insert(2, "Overall", df[TASKS].mean(axis=1))
    # Rank must reflect the order *after* sorting; keeping the pre-sort index
    # (as reset_index() without drop=True does) would show insertion order.
    df = df.sort_values("Overall", ascending=False).reset_index(drop=True)
    df.insert(0, "Rank", list(range(1, len(df) + 1)))
    return df.rename(columns=dict(zip(TASKS, FORMATTED_TASK_NAMES)))


###########
### APP ###
###########
# Page setup: wide layout, title, then the introductory blurb.
st.set_page_config(layout="wide")
st.title("RAFT Leaderboard")
INTRO_TEXT = """
RAFT: Real-world Annotated Few-shot Tasks

Large pre-trained language models have shown promise for few-shot learning, completing text-based tasks given only a few task-specific examples. Will models soon solve classification tasks that have so far been reserved for human research assistants? RAFT is a few-shot classification benchmark that tests language models
across multiple domains (lit review, tweets, customer interaction, etc.)
on economically valuable classification tasks (someone inherently cares about the task)
in a setting that mirrors deployment (50 examples per task, info retrieval allowed, hidden test set)

To submit to RAFT, follow the instruction posted on [this page](https://github.com/oughtinc/raft_submission).
"""
st.markdown(INTRO_TEXT)

# Fetch the evaluation repos and turn them into the leaderboard table.
submissions = download_submissions()
df = format_submissions(submissions)
# hack to remove index column from https://github.com/streamlit/streamlit/issues/641
# (a blank-valued "hack" column is made the index in place of the default one)
leaderboard = df.assign(hack="").set_index("hack")
st.table(leaderboard)