diff --git a/README.md b/README.md
index 667c7e1..8ae2f13 100644
--- a/README.md
+++ b/README.md
@@ -38,3 +38,17 @@ Some models require API keys which you can set as environment variables, e.g.
 ## Results
 
 Results are auto-saved to [mteb/arena-results](https://huggingface.co/datasets/mteb/arena-results).
+
+## Personal Leaderboards
+
+Each user can now maintain a personal leaderboard that tracks their votes and ranks models according to their individual preferences.
+
+### User Authentication
+
+To access your personal leaderboard, log in with your Hugging Face account. Authentication is handled via OAuth.
+
+### Instructions
+
+1. Log in with your Hugging Face account.
+2. Open the "Personal Leaderboard" tab in the UI.
+3. View your personal rankings, computed from your votes.
\ No newline at end of file
diff --git a/app.py b/app.py
index c310a2f..f8f967d 100644
--- a/app.py
+++ b/app.py
@@ -149,7 +149,15 @@ def load_elo_results(elo_results_dir):
         with gr.Tab("🏆 Leaderboard", id=3):
             build_leaderboard_tab(elo_results_file['sts'], leaderboard_table_file['sts'], task_type="STS")
 
+        with gr.Tab("🏅 Personal Leaderboard", id=13):
+            with gr.Tabs() as tabs_pl:
+                with gr.Tab("🔒 Login", id=14):
+                    gr.Markdown("Please log in to view your personal leaderboard.")
+                    gr.Button("Login with Hugging Face", variant="primary", elem_id="login_button")
+                with gr.Tab("🏆 Personal Leaderboard", id=15):
+                    build_leaderboard_tab(elo_results_file['personal'], leaderboard_table_file['personal'], task_type="Personal")
+
     gr.Markdown(acknowledgment_md, elem_id="ack_markdown")
 
 block.queue(max_size=10)
-block.launch(share=True)
+block.launch(share=True)
\ No newline at end of file
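The Login tab above only renders a static `gr.Button`, so the OAuth round-trip itself still needs to be wired in. As a point of reference, here is a minimal sketch using Gradio's built-in Hugging Face OAuth support; it assumes the app is deployed as a Hugging Face Space with OAuth enabled, and `show_user` and `greeting` are illustrative names, not part of this PR:

```python
# Minimal sketch only: resolving the logged-in user via Gradio's built-in
# Hugging Face OAuth support. Assumes deployment as a Space with OAuth
# enabled; `show_user` and `greeting` are illustrative names.
import gradio as gr

def show_user(profile: gr.OAuthProfile | None) -> str:
    # Gradio injects the OAuth profile automatically for parameters annotated
    # with gr.OAuthProfile; it is None when the visitor is not logged in.
    if profile is None:
        return "Please log in to view your personal leaderboard."
    return f"Logged in as **{profile.username}**"

with gr.Blocks() as demo:
    gr.LoginButton()  # renders the standard "Sign in with Hugging Face" button
    greeting = gr.Markdown()
    demo.load(show_user, inputs=None, outputs=greeting)

demo.launch()
```

With this approach, the resolved `profile.username` could serve as the `user_id` that `build_personal_leaderboard_tab` below expects.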
diff --git a/leaderboard.py b/leaderboard.py
index 880a4ea..558df8d 100644
--- a/leaderboard.py
+++ b/leaderboard.py
@@ -11,6 +11,7 @@
     "Retrieval": "🔎",
     "Clustering": "✨",
     "STS": "☘️",
+    "Personal": "🏅",
 }
 
 def make_arena_leaderboard_md(elo_results):
@@ -137,6 +138,98 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file, show_plot=Fa
 
         leader_component_values[:] = [md, p1, p2, p3, p4]
 
+    """
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown(
+                "#### Figure 1: Fraction of Model A Wins for All Non-tied A vs. B Battles"
+            )
+            plot_1 = gr.Plot(p1, show_label=False)
+        with gr.Column():
+            gr.Markdown(
+                "#### Figure 2: Battle Count for Each Combination of Models (without Ties)"
+            )
+            plot_2 = gr.Plot(p2, show_label=False)
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown(
+                "#### Figure 3: Bootstrap of Elo Estimates (1000 Rounds of Random Sampling)"
+            )
+            plot_3 = gr.Plot(p3, show_label=False)
+        with gr.Column():
+            gr.Markdown(
+                "#### Figure 4: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)"
+            )
+            plot_4 = gr.Plot(p4, show_label=False)
+    """
+    # return [md_1, plot_1, plot_2, plot_3, plot_4]
+    return [md_1]
+
+def build_personal_leaderboard_tab(elo_results_file, leaderboard_table_file, user_id, show_plot=False, task_type="Personal"):
+    if elo_results_file is None:  # Do live update
+        md = "Loading ..."
+        p1 = p2 = p3 = p4 = None
+    else:
+        with open(elo_results_file, "rb") as fin:
+            elo_results = pickle.load(fin)
+
+        personal_elo_results = elo_results[user_id]
+        personal_arena_df = personal_elo_results["leaderboard_table_df"]
+        p1 = personal_elo_results["win_fraction_heatmap"]
+        p2 = personal_elo_results["battle_count_heatmap"]
+        p3 = personal_elo_results["bootstrap_elo_rating"]
+        p4 = personal_elo_results["average_win_rate_bar"]
+
+    md = f"""
+# 🏅 Personal Leaderboard: {task_type} {TASK_TYPE_TO_EMOJI[task_type]}
+"""
+    # | [GitHub](https://github.com/embeddings-benchmark) |
+    md_1 = gr.Markdown(md, elem_id="leaderboard_markdown")
+
+    if leaderboard_table_file:
+        model_table_df = load_leaderboard_table_csv(leaderboard_table_file)
+        personal_table_vals = get_arena_table(personal_arena_df, model_table_df, task_type=task_type)
+        md = make_arena_leaderboard_md(personal_elo_results)
+        gr.Markdown(md, elem_id="leaderboard_markdown")
+        gr.Dataframe(
+            headers=[
+                "Rank",
+                "🤖 Model",
+                "⭐ Personal Elo",
+                "📊 95% CI",
+                "🗳️ Votes",
+                "🥇 MTEB Overall Avg",
+                f"🥇 MTEB {task_type} Avg",
+                "Organization",
+                "License",
+            ],
+            datatype=[
+                "str",
+                "markdown",
+                "number",
+                "str",
+                "number",
+                "number",
+                "number",
+                "str",
+                "str",
+            ],
+            value=personal_table_vals,
+            elem_id="personal_leaderboard_dataframe",
+            height=700,
+            column_widths=[50, 150, 100, 100, 100, 100, 100, 150, 150],
+            wrap=True,
+        )
+        if not show_plot:
+            gr.Markdown(
+                """## We are still collecting more votes on more models. The ranking will be updated very frequently. Please stay tuned!""",
+                elem_id="leaderboard_markdown",
+            )
+    else:
+        pass
+
+    leader_component_values[:] = [md, p1, p2, p3, p4]
+
     """
     with gr.Row():
         with gr.Column():
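Note that `build_personal_leaderboard_tab` indexes the unpickled results by `user_id`, so whatever produces `elo_results_file['personal']` must serialize one entry per user. That pipeline is not part of this diff; the sketch below only illustrates the structure the tab expects to read, and every name other than the dictionary keys used above is an assumption:

```python
# Minimal sketch of the per-user pickle that build_personal_leaderboard_tab
# reads. Only the dictionary keys are taken from the diff; the function and
# file names here are illustrative, not part of this PR.
import pickle

def save_personal_elo_results(results_by_user: dict, path: str) -> None:
    """Serialize a mapping of user_id -> that user's precomputed results."""
    with open(path, "wb") as fout:
        pickle.dump(results_by_user, fout)

results_by_user = {
    "some-user-id": {
        "leaderboard_table_df": None,   # pandas DataFrame consumed by get_arena_table
        "win_fraction_heatmap": None,   # plot p1
        "battle_count_heatmap": None,   # plot p2
        "bootstrap_elo_rating": None,   # plot p3
        "average_win_rate_bar": None,   # plot p4
    },
}
save_personal_elo_results(results_by_user, "personal_elo_results.pkl")
```

Keeping each per-user entry in the same shape as the global results means `get_arena_table` and `make_arena_leaderboard_md` can be reused unchanged, which appears to be the intent of the diff.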