Expand Style Control to all category! Add UI support for style contro… · lm-sys/FastChat@ef16c16 · GitHub
[go: up one dir, main page]

Skip to content

Commit ef16c16

Browse files
Expand Style Control to all category! Add UI support for style control and deprecated models. (#3517)
1 parent 853168f commit ef16c16

File tree

2 files changed

+82
-24
lines changed

2 files changed

+82
-24
lines changed

fastchat/serve/monitor/monitor.py

Lines changed: 73 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
from fastchat.serve.monitor.monitor_md import (
2929
cat_name_to_baseline,
3030
key_to_category_name,
31+
cat_name_to_explanation,
32+
deprecated_model_name,
3133
arena_hard_title,
3234
make_default_md_1,
3335
make_default_md_2,
@@ -258,10 +260,14 @@ def create_ranking_str(ranking, ranking_difference):
258260
return f"{int(ranking)}"
259261

260262

261-
def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
263+
def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
262264
arena_df = arena_df.sort_values(
263265
by=["final_ranking", "rating"], ascending=[True, False]
264266
)
267+
268+
if hidden_models:
269+
arena_df = arena_df[~arena_df.index.isin(hidden_models)].copy()
270+
265271
arena_df["final_ranking"] = recompute_final_ranking(arena_df)
266272

267273
if arena_subset_df is not None:
@@ -317,9 +323,11 @@ def process_row(row):
317323
round(row["num_battles"]),
318324
model_info.get("Organization", "Unknown"),
319325
model_info.get("License", "Unknown"),
320-
"Unknown"
321-
if model_info.get("Knowledge cutoff date", "-") == "-"
322-
else model_info.get("Knowledge cutoff date", "Unknown"),
326+
(
327+
"Unknown"
328+
if model_info.get("Knowledge cutoff date", "-") == "-"
329+
else model_info.get("Knowledge cutoff date", "Unknown")
330+
),
323331
]
324332
)
325333

@@ -350,21 +358,25 @@ def update_leaderboard_df(arena_table_vals):
350358

351359
def highlight_max(s):
352360
return [
353-
"color: green; font-weight: bold"
354-
if "\u2191" in str(v)
355-
else "color: red; font-weight: bold"
356-
if "\u2193" in str(v)
357-
else ""
361+
(
362+
"color: green; font-weight: bold"
363+
if "\u2191" in str(v)
364+
else "color: red; font-weight: bold"
365+
if "\u2193" in str(v)
366+
else ""
367+
)
358368
for v in s
359369
]
360370

361371
def highlight_rank_max(s):
362372
return [
363-
"color: green; font-weight: bold"
364-
if v > 0
365-
else "color: red; font-weight: bold"
366-
if v < 0
367-
else ""
373+
(
374+
"color: green; font-weight: bold"
375+
if v > 0
376+
else "color: red; font-weight: bold"
377+
if v < 0
378+
else ""
379+
)
368380
for v in s
369381
]
370382

@@ -398,7 +410,13 @@ def build_arena_tab(
398410

399411
arena_df = arena_dfs["Overall"]
400412

401-
def update_leaderboard_and_plots(category):
413+
def update_leaderboard_and_plots(category, filters):
414+
if len(filters) > 0 and "Style Control" in filters:
415+
if f"{category} (Style Control)" in arena_dfs:
416+
category = f"{category} (Style Control)"
417+
else:
418+
gr.Warning("This category does not support style control.")
419+
402420
arena_subset_df = arena_dfs[category]
403421
arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 300]
404422
elo_subset_results = category_elo_results[category]
@@ -409,6 +427,11 @@ def update_leaderboard_and_plots(category):
409427
arena_df,
410428
model_table_df,
411429
arena_subset_df=arena_subset_df if category != "Overall" else None,
430+
hidden_models=(
431+
None
432+
if len(filters) > 0 and "Show Deprecate" in filters
433+
else deprecated_model_name
434+
),
412435
)
413436
if category != "Overall":
414437
arena_values = update_leaderboard_df(arena_values)
@@ -490,7 +513,9 @@ def update_leaderboard_and_plots(category):
490513
p4 = category_elo_results["Overall"]["average_win_rate_bar"]
491514

492515
# arena table
493-
arena_table_vals = get_arena_table(arena_df, model_table_df)
516+
arena_table_vals = get_arena_table(
517+
arena_df, model_table_df, hidden_models=deprecated_model_name
518+
)
494519

495520
md = make_arena_leaderboard_md(arena_df, last_updated_time, vision=vision)
496521
gr.Markdown(md, elem_id="leaderboard_markdown")
@@ -501,6 +526,10 @@ def update_leaderboard_and_plots(category):
501526
label="Category",
502527
value="Overall",
503528
)
529+
with gr.Column(scale=2):
530+
category_checkbox = gr.CheckboxGroup(
531+
["Style Control", "Show Deprecate"], label="Apply filter", info=""
532+
)
504533
default_category_details = make_category_arena_leaderboard_md(
505534
arena_df, arena_df, name="Overall"
506535
)
@@ -599,7 +628,21 @@ def update_leaderboard_and_plots(category):
599628
plot_2 = gr.Plot(p2, show_label=False)
600629
category_dropdown.change(
601630
update_leaderboard_and_plots,
602-
inputs=[category_dropdown],
631+
inputs=[category_dropdown, category_checkbox],
632+
outputs=[
633+
elo_display_df,
634+
plot_1,
635+
plot_2,
636+
plot_3,
637+
plot_4,
638+
more_stats_md,
639+
category_deets,
640+
],
641+
)
642+
643+
category_checkbox.change(
644+
update_leaderboard_and_plots,
645+
inputs=[category_dropdown, category_checkbox],
603646
outputs=[
604647
elo_display_df,
605648
plot_1,
@@ -659,13 +702,19 @@ def get_arena_category_table(results_df, categories, metric="ranking"):
659702

660703
def highlight_top_3(s):
661704
return [
662-
"background-color: rgba(255, 215, 0, 0.5); text-align: center; font-size: 110%"
663-
if v == 1 and v != 0
664-
else "background-color: rgba(192, 192, 192, 0.5); text-align: center; font-size: 110%"
665-
if v == 2 and v != 0
666-
else "background-color: rgba(255, 165, 0, 0.5); text-align: center; font-size: 110%"
667-
if v == 3 and v != 0
668-
else "text-align: center; font-size: 110%"
705+
(
706+
"background-color: rgba(255, 215, 0, 0.5); text-align: center; font-size: 110%"
707+
if v == 1 and v != 0
708+
else (
709+
"background-color: rgba(192, 192, 192, 0.5); text-align: center; font-size: 110%"
710+
if v == 2 and v != 0
711+
else (
712+
"background-color: rgba(255, 165, 0, 0.5); text-align: center; font-size: 110%"
713+
if v == 3 and v != 0
714+
else "text-align: center; font-size: 110%"
715+
)
716+
)
717+
)
669718
for v in s
670719
]
671720

fastchat/serve/monitor/monitor_md.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44

55
from fastchat.constants import SURVEY_LINK
66

7+
deprecated_model_name = [
8+
"gemini-1.5-pro-exp-0801",
9+
"gemini-1.5-pro-api-0409-preview",
10+
]
11+
712
key_to_category_name = {
813
"full": "Overall",
914
"full_style_control": "Overall w/ Style Control",
@@ -29,6 +34,8 @@
2934
"no_refusal": "Exclude Refusal",
3035
"overall_limit_5_user_vote": "overall_limit_5_user_vote",
3136
"full_old": "Overall (Deprecated)",
37+
"full_style_control": "Overall (Style Control)",
38+
"hard_6_style_control": "Hard Prompts (Overall) (Style Control)",
3239
}
3340
cat_name_to_explanation = {
3441
"Overall": "Overall Questions",
@@ -55,6 +62,8 @@
5562
"Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
5663
"overall_limit_5_user_vote": "overall_limit_5_user_vote",
5764
"Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
65+
"Overall (Style Control)": "Overall Leaderboard with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).",
66+
"Hard Prompts (Overall) (Style Control)": "Hard Prompts (Overall) Leaderboard with Style Control. See details in [blog post](https://lmsys.org/blog/2024-08-28-style-control/).",
5867
}
5968
cat_name_to_baseline = {
6069
"Hard Prompts (English)": "English",

0 commit comments

Comments
 (0)
0