[WIP] Personal Leaderboard #45

Draft: wants to merge 1 commit into main
README.md (14 additions, 0 deletions)

@@ -38,3 +38,17 @@ Some models require API keys which you can set as environment variables, e.g.
## Results

Results are auto-saved to [mteb/arena-results](https://huggingface.co/datasets/mteb/arena-results).

## Personal Leaderboards

We have introduced personal leaderboards: each user can now track a model ranking computed from their own votes and preferences.

### User Authentication

To access your personal leaderboard, you need to log in using your Hugging Face account. The authentication is handled via OAuth.
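For reference, below is a minimal sketch (not part of this PR) of how Hugging Face OAuth typically looks in a Gradio app: `gr.LoginButton` renders the sign-in control, and Gradio injects a `gr.OAuthProfile` into event handlers once the user has authenticated. This requires running as a Space with `hf_oauth: true` in the README metadata; the `greet` function and layout are illustrative.

```python
import gradio as gr

def greet(profile: gr.OAuthProfile | None) -> str:
    # Gradio injects the OAuth profile automatically when the Space
    # enables `hf_oauth: true`; it is None for logged-out visitors.
    if profile is None:
        return "Please log in to view your personal leaderboard."
    return f"Logged in as {profile.username}"

with gr.Blocks() as demo:
    gr.LoginButton()  # renders "Sign in with Hugging Face"
    status = gr.Markdown()
    demo.load(greet, inputs=None, outputs=status)

demo.launch()
```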

### Instructions

1. Log in to your Hugging Face account.
2. Navigate to the "Personal Leaderboard" tab in the UI.
3. View your personal rankings based on your votes.
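The ranking itself is Elo-based (the diff below reads keys such as `bootstrap_elo_rating` from the results file). As a rough illustration of how pairwise votes become a personal ranking, here is a minimal sequential Elo sketch; the constants (`k=4`, base rating 1000) and the function name are assumptions, not this repo's actual implementation.

```python
from collections import defaultdict

def compute_elo(battles, k=4, base_rating=1000):
    """Sequential Elo over (model_a, model_b, winner) vote records.

    `winner` is "model_a", "model_b", or "tie".
    """
    ratings = defaultdict(lambda: base_rating)
    for model_a, model_b, winner in battles:
        ra, rb = ratings[model_a], ratings[model_b]
        # Expected score of A under the standard logistic Elo curve.
        expected_a = 1 / (1 + 10 ** ((rb - ra) / 400))
        score_a = {"model_a": 1.0, "model_b": 0.0, "tie": 0.5}[winner]
        ratings[model_a] += k * (score_a - expected_a)
        ratings[model_b] += k * ((1 - score_a) - (1 - expected_a))
    return dict(ratings)

# One user's votes -> that user's personal ranking.
votes = [("model-x", "model-y", "model_a"), ("model-y", "model-z", "tie")]
print(sorted(compute_elo(votes).items(), key=lambda kv: -kv[1]))
```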
app.py (9 additions, 1 deletion)

@@ -149,7 +149,15 @@ def load_elo_results(elo_results_dir):
with gr.Tab("🏆 Leaderboard", id=3):
build_leaderboard_tab(elo_results_file['sts'], leaderboard_table_file['sts'], task_type="STS")

with gr.Tab("🏅 Personal Leaderboard", id=13):
with gr.Tabs() as tabs_pl:
with gr.Tab("🔒 Login", id=14):
gr.Markdown("Please log in to view your personal leaderboard.")
gr.Button("Login with Hugging Face", variant="primary", elem_id="login_button")
with gr.Tab("🏆 Personal Leaderboard", id=15):
build_leaderboard_tab(elo_results_file['personal'], leaderboard_table_file['personal'], task_type="Personal")

gr.Markdown(acknowledgment_md, elem_id="ack_markdown")

block.queue(max_size=10)
block.launch(share=True)
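The Login tab above renders a plain `gr.Button` with no click handler, so login is inert in this draft. One possible wiring (not in this diff) uses Gradio's built-in `gr.LoginButton` and jumps to the inner leaderboard tab once an OAuth profile is present; returning `gr.Tabs(selected=...)` from a handler is standard Gradio 4 behavior, while the routing logic and tab ids reused from the diff are assumptions.

```python
import gradio as gr

def route(profile: gr.OAuthProfile | None):
    # Stay on the login tab (id=14) until the user has authenticated,
    # then switch to the personal leaderboard tab (id=15).
    return gr.Tabs(selected=15 if profile is not None else 14)

with gr.Blocks() as block:
    with gr.Tabs() as tabs_pl:
        with gr.Tab("🔒 Login", id=14):
            gr.LoginButton()
        with gr.Tab("🏆 Personal Leaderboard", id=15):
            gr.Markdown("...build the personal leaderboard here...")
    block.load(route, inputs=None, outputs=tabs_pl)
```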
leaderboard.py (93 additions, 0 deletions)

@@ -11,6 +11,7 @@
"Retrieval": "🔎",
"Clustering": "✨",
"STS": "☘️",
"Personal": "🏅",
}

def make_arena_leaderboard_md(elo_results):
@@ -137,6 +138,98 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=False

    leader_component_values[:] = [md, p1, p2, p3, p4]

    """
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles"
            )
            plot_1 = gr.Plot(p1, show_label=False)
        with gr.Column():
            gr.Markdown(
                "#### Figure 2: Battle Count for Each Combination of Models (without Ties)"
            )
            plot_2 = gr.Plot(p2, show_label=False)
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                "#### Figure 3: Bootstrap of Elo Estimates (1000 Rounds of Random Sampling)"
            )
            plot_3 = gr.Plot(p3, show_label=False)
        with gr.Column():
            gr.Markdown(
                "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
            )
            plot_4 = gr.Plot(p4, show_label=False)
    """
    # return [md_1, plot_1, plot_2, plot_3, plot_4]
    return [md_1]

def build_personal_leaderboard_tab(elo_results_file, leaderboard_table_file, user_id, show_plot=False, task_type="Personal"):
    if elo_results_file is None:  # Do live update
        md = "Loading ..."
        p1 = p2 = p3 = p4 = None
    else:
        with open(elo_results_file, "rb") as fin:
            elo_results = pickle.load(fin)

        # The results pickle maps each user id to that user's own Elo results.
        personal_elo_results = elo_results[user_id]
        personal_arena_df = personal_elo_results["leaderboard_table_df"]
        p1 = personal_elo_results["win_fraction_heatmap"]
        p2 = personal_elo_results["battle_count_heatmap"]
        p3 = personal_elo_results["bootstrap_elo_rating"]
        p4 = personal_elo_results["average_win_rate_bar"]

    md = f"""
# 🏅 Personal Leaderboard: {task_type} {TASK_TYPE_TO_EMOJI[task_type]}
"""
    # | [GitHub](https://github.com/embeddings-benchmark) |
    md_1 = gr.Markdown(md, elem_id="leaderboard_markdown")

    if leaderboard_table_file:
        model_table_df = load_leaderboard_table_csv(leaderboard_table_file)
        personal_table_vals = get_arena_table(personal_arena_df, model_table_df, task_type=task_type)
        md = make_arena_leaderboard_md(personal_elo_results)
        gr.Markdown(md, elem_id="leaderboard_markdown")
        gr.Dataframe(
            headers=[
                "Rank",
                "🤖 Model",
                "⭐ Personal Elo",
                "📊 95% CI",
                "🗳️ Votes",
                "🥇 MTEB Overall Avg",
                f"🥇 MTEB {task_type} Avg",
                "Organization",
                "License",
            ],
            datatype=[
                "str",
                "markdown",
                "number",
                "str",
                "number",
                "number",
                "number",
                "str",
                "str",
            ],
            value=personal_table_vals,
            elem_id="personal_leaderboard_dataframe",
            height=700,
            column_widths=[50, 150, 100, 100, 100, 100, 100, 150, 150],
            wrap=True,
        )
        if not show_plot:
            gr.Markdown(
                """## We are still collecting more votes on more models. The ranking will be updated very frequently. Please stay tuned!""",
                elem_id="leaderboard_markdown",
            )
    else:
        pass

    leader_component_values[:] = [md, p1, p2, p3, p4]

"""
with gr.Row():
with gr.Column():
Expand Down
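For context, `build_personal_leaderboard_tab` expects `elo_results_file` to be a pickle keyed by user id, where each entry carries the five fields read above. The writer side is not part of this diff; the sketch below shows one assumed shape for that file (`save_personal_elo_results`, the DataFrame columns, and the `None` plot placeholders are all illustrative, since the real pipeline stores plotly figures there).

```python
import pickle
import pandas as pd

def save_personal_elo_results(path: str, per_user_ratings: dict) -> None:
    """Write a pickle shaped like the one build_personal_leaderboard_tab() loads.

    per_user_ratings maps user_id -> {model_name: elo_rating}.
    """
    results = {}
    for user_id, ratings in per_user_ratings.items():
        # Rank this user's models by their personal Elo rating.
        df = pd.DataFrame(
            sorted(ratings.items(), key=lambda kv: -kv[1]),
            columns=["model", "rating"],  # assumed columns
        )
        results[user_id] = {
            "leaderboard_table_df": df,
            "win_fraction_heatmap": None,   # plotly figures in the real file
            "battle_count_heatmap": None,
            "bootstrap_elo_rating": None,
            "average_win_rate_bar": None,
        }
    with open(path, "wb") as fout:
        pickle.dump(results, fout)

save_personal_elo_results("personal_elo_results.pkl", {"alice": {"model-x": 1012.0}})
```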