Spaces:
Running
Running
update UI
Browse files- app.py +86 -33
- eval-results/.gitattributes +0 -55
- eval-results/README.md +0 -24
- eval-results/SeaExam_results.csv +0 -47
- eval-results/demo-leaderboard/gpt2-demo/results_2023-11-21T18-10-08.json +0 -15
- src/display/about.py +5 -4
- update_git.sh +0 -3
app.py
CHANGED
@@ -34,13 +34,16 @@ def restart_space():
|
|
34 |
API.restart_space(repo_id="SeaLLMs/SeaExam_leaderboard", token=TOKEN)
|
35 |
|
36 |
all_columns = ['R','type', 'Model','open?', 'avg_sea ⬇️', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'params(B)']
|
37 |
-
show_columns = ['R','type',
|
|
|
38 |
# Load the data from the csv file
|
39 |
csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20240425.csv'
|
40 |
df_m3exam, df_mmlu, df_avg = load_data(csv_path)
|
41 |
-
df_m3exam = df_m3exam.copy()[show_columns]
|
42 |
-
df_mmlu = df_mmlu.copy()[show_columns]
|
43 |
df_avg_init = df_avg.copy()[df_avg['type'] == '🔶 chat'][show_columns]
|
|
|
|
|
44 |
|
45 |
# data_types = ['number', 'str', 'markdown','str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
|
46 |
# map_columns = {'rank':'R','type':'type', 'Model':'Model','open?':'open?', 'avg_sea':'avg_sea ⬇️', 'en':'en', 'zh':'zh', 'id':'id', 'th':'th', 'vi':'vi', 'avg':'avg', 'params':'params(B)'}
|
@@ -143,10 +146,10 @@ with demo:
|
|
143 |
# + [AutoEvalColumn.dummy.name]
|
144 |
# ],
|
145 |
# headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
146 |
-
|
147 |
elem_id="leaderboard-table",
|
148 |
interactive=False,
|
149 |
-
datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
|
150 |
# datatype=[map_types[k] for k in shown_columns.value],
|
151 |
visible=True,
|
152 |
# column_widths=["20%", "6%", "8%", "6%", "8%", "8%", "6%", "6%", "6%", "6%", "6%"],
|
@@ -194,17 +197,35 @@ with demo:
|
|
194 |
)
|
195 |
with gr.TabItem("M3Exam", elem_id="llm-benchmark-M3Exam", id=1):
|
196 |
with gr.Row():
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
202 |
|
203 |
leaderboard_table = gr.components.Dataframe(
|
204 |
-
value=
|
205 |
interactive=False,
|
206 |
visible=True,
|
207 |
-
datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
|
|
|
208 |
)
|
209 |
|
210 |
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
@@ -216,31 +237,56 @@ with demo:
|
|
216 |
search_bar.submit(
|
217 |
update_table,
|
218 |
[
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
# filter_columns_type,
|
223 |
-
# filter_columns_precision,
|
224 |
-
# filter_columns_size,
|
225 |
-
# deleted_models_visibility,
|
226 |
search_bar,
|
227 |
],
|
228 |
leaderboard_table,
|
229 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
230 |
|
231 |
with gr.TabItem("MMLU", elem_id="llm-benchmark-MMLU", id=2):
|
232 |
with gr.Row():
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
|
239 |
leaderboard_table = gr.components.Dataframe(
|
240 |
-
value=
|
241 |
interactive=False,
|
242 |
visible=True,
|
243 |
-
datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
|
|
|
244 |
)
|
245 |
|
246 |
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
@@ -252,17 +298,24 @@ with demo:
|
|
252 |
search_bar.submit(
|
253 |
update_table,
|
254 |
[
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
# filter_columns_type,
|
259 |
-
# filter_columns_precision,
|
260 |
-
# filter_columns_size,
|
261 |
-
# deleted_models_visibility,
|
262 |
search_bar,
|
263 |
],
|
264 |
leaderboard_table,
|
265 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
266 |
|
267 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
|
268 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
|
34 |
API.restart_space(repo_id="SeaLLMs/SeaExam_leaderboard", token=TOKEN)
|
35 |
|
36 |
all_columns = ['R','type', 'Model','open?', 'avg_sea ⬇️', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'params(B)']
|
37 |
+
show_columns = ['R', 'Model','type','open?','params(B)', 'avg_sea ⬇️', 'en', 'zh', 'id', 'th', 'vi', 'avg', ]
|
38 |
+
TYPES = ['number', 'markdown', 'str', 'str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
|
39 |
# Load the data from the csv file
|
40 |
csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20240425.csv'
|
41 |
df_m3exam, df_mmlu, df_avg = load_data(csv_path)
|
42 |
+
# df_m3exam = df_m3exam.copy()[show_columns]
|
43 |
+
# df_mmlu = df_mmlu.copy()[show_columns]
|
44 |
df_avg_init = df_avg.copy()[df_avg['type'] == '🔶 chat'][show_columns]
|
45 |
+
df_m3exam_init = df_m3exam.copy()[df_m3exam['type'] == '🔶 chat'][show_columns]
|
46 |
+
df_mmlu_init = df_mmlu.copy()[df_mmlu['type'] == '🔶 chat'][show_columns]
|
47 |
|
48 |
# data_types = ['number', 'str', 'markdown','str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
|
49 |
# map_columns = {'rank':'R','type':'type', 'Model':'Model','open?':'open?', 'avg_sea':'avg_sea ⬇️', 'en':'en', 'zh':'zh', 'id':'id', 'th':'th', 'vi':'vi', 'avg':'avg', 'params':'params(B)'}
|
|
|
146 |
# + [AutoEvalColumn.dummy.name]
|
147 |
# ],
|
148 |
# headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
149 |
+
datatype=TYPES,
|
150 |
elem_id="leaderboard-table",
|
151 |
interactive=False,
|
152 |
+
# datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
|
153 |
# datatype=[map_types[k] for k in shown_columns.value],
|
154 |
visible=True,
|
155 |
# column_widths=["20%", "6%", "8%", "6%", "8%", "8%", "6%", "6%", "6%", "6%", "6%"],
|
|
|
197 |
)
|
198 |
with gr.TabItem("M3Exam", elem_id="llm-benchmark-M3Exam", id=1):
|
199 |
with gr.Row():
|
200 |
+
with gr.Column():
|
201 |
+
search_bar = gr.Textbox(
|
202 |
+
placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
|
203 |
+
show_label=False,
|
204 |
+
elem_id="search-bar",
|
205 |
+
)
|
206 |
+
with gr.Column():
|
207 |
+
type_query = gr.CheckboxGroup(
|
208 |
+
choices=["🟢 base", "🔶 chat"],
|
209 |
+
value=["🔶 chat" ],
|
210 |
+
label="model types to show",
|
211 |
+
elem_id="type-select",
|
212 |
+
interactive=True,
|
213 |
+
)
|
214 |
+
with gr.Column():
|
215 |
+
open_query = gr.CheckboxGroup(
|
216 |
+
choices=["open", "closed"],
|
217 |
+
value=["open", "closed"],
|
218 |
+
label="open-source or closed-source models?",
|
219 |
+
elem_id="open-select",
|
220 |
+
interactive=True,
|
221 |
+
)
|
222 |
|
223 |
leaderboard_table = gr.components.Dataframe(
|
224 |
+
value=df_m3exam_init,
|
225 |
interactive=False,
|
226 |
visible=True,
|
227 |
+
# datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
|
228 |
+
datatype=TYPES,
|
229 |
)
|
230 |
|
231 |
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
|
|
237 |
search_bar.submit(
|
238 |
update_table,
|
239 |
[
|
240 |
+
hidden_leaderboard_table_for_search,
|
241 |
+
type_query,
|
242 |
+
open_query,
|
|
|
|
|
|
|
|
|
243 |
search_bar,
|
244 |
],
|
245 |
leaderboard_table,
|
246 |
)
|
247 |
+
for selector in [type_query, open_query]:
|
248 |
+
selector.change(
|
249 |
+
update_table,
|
250 |
+
[
|
251 |
+
hidden_leaderboard_table_for_search,
|
252 |
+
type_query,
|
253 |
+
open_query,
|
254 |
+
search_bar,
|
255 |
+
],
|
256 |
+
leaderboard_table,
|
257 |
+
)
|
258 |
|
259 |
with gr.TabItem("MMLU", elem_id="llm-benchmark-MMLU", id=2):
|
260 |
with gr.Row():
|
261 |
+
with gr.Column():
|
262 |
+
search_bar = gr.Textbox(
|
263 |
+
placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
|
264 |
+
show_label=False,
|
265 |
+
elem_id="search-bar",
|
266 |
+
)
|
267 |
+
with gr.Column():
|
268 |
+
type_query = gr.CheckboxGroup(
|
269 |
+
choices=["🟢 base", "🔶 chat"],
|
270 |
+
value=["🔶 chat" ],
|
271 |
+
label="model types to show",
|
272 |
+
elem_id="type-select",
|
273 |
+
interactive=True,
|
274 |
+
)
|
275 |
+
with gr.Column():
|
276 |
+
open_query = gr.CheckboxGroup(
|
277 |
+
choices=["open", "closed"],
|
278 |
+
value=["open", "closed"],
|
279 |
+
label="open-source or closed-source models?",
|
280 |
+
elem_id="open-select",
|
281 |
+
interactive=True,
|
282 |
+
)
|
283 |
|
284 |
leaderboard_table = gr.components.Dataframe(
|
285 |
+
value=df_mmlu_init,
|
286 |
interactive=False,
|
287 |
visible=True,
|
288 |
+
# datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
|
289 |
+
datatype=TYPES,
|
290 |
)
|
291 |
|
292 |
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
|
|
298 |
search_bar.submit(
|
299 |
update_table,
|
300 |
[
|
301 |
+
hidden_leaderboard_table_for_search,
|
302 |
+
type_query,
|
303 |
+
open_query,
|
|
|
|
|
|
|
|
|
304 |
search_bar,
|
305 |
],
|
306 |
leaderboard_table,
|
307 |
)
|
308 |
+
for selector in [type_query, open_query]:
|
309 |
+
selector.change(
|
310 |
+
update_table,
|
311 |
+
[
|
312 |
+
hidden_leaderboard_table_for_search,
|
313 |
+
type_query,
|
314 |
+
open_query,
|
315 |
+
search_bar,
|
316 |
+
],
|
317 |
+
leaderboard_table,
|
318 |
+
)
|
319 |
|
320 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
|
321 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
eval-results/.gitattributes
DELETED
@@ -1,55 +0,0 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.lz4 filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
26 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
27 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
36 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
37 |
-
# Audio files - uncompressed
|
38 |
-
*.pcm filter=lfs diff=lfs merge=lfs -text
|
39 |
-
*.sam filter=lfs diff=lfs merge=lfs -text
|
40 |
-
*.raw filter=lfs diff=lfs merge=lfs -text
|
41 |
-
# Audio files - compressed
|
42 |
-
*.aac filter=lfs diff=lfs merge=lfs -text
|
43 |
-
*.flac filter=lfs diff=lfs merge=lfs -text
|
44 |
-
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
45 |
-
*.ogg filter=lfs diff=lfs merge=lfs -text
|
46 |
-
*.wav filter=lfs diff=lfs merge=lfs -text
|
47 |
-
# Image files - uncompressed
|
48 |
-
*.bmp filter=lfs diff=lfs merge=lfs -text
|
49 |
-
*.gif filter=lfs diff=lfs merge=lfs -text
|
50 |
-
*.png filter=lfs diff=lfs merge=lfs -text
|
51 |
-
*.tiff filter=lfs diff=lfs merge=lfs -text
|
52 |
-
# Image files - compressed
|
53 |
-
*.jpg filter=lfs diff=lfs merge=lfs -text
|
54 |
-
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
55 |
-
*.webp filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval-results/README.md
DELETED
@@ -1,24 +0,0 @@
|
|
1 |
-
---
|
2 |
-
license: apache-2.0
|
3 |
-
language:
|
4 |
-
- en
|
5 |
-
- zh
|
6 |
-
- vi
|
7 |
-
- id
|
8 |
-
- th
|
9 |
-
|
10 |
-
size_categories:
|
11 |
-
- n<1K
|
12 |
-
configs:
|
13 |
-
- config_name: results
|
14 |
-
data_files: SeaExam_results.csv
|
15 |
-
---
|
16 |
-
|
17 |
-
# About
|
18 |
-
|
19 |
-
This repo contains the original results for the space [SeaExam Leaderboard](https://huggingface.co/spaces/SeaLLMs/SeaExam_leaderboard).
|
20 |
-
|
21 |
-
To reproduce our results, use the script in [this repo](https://github.com/DAMO-NLP-SG/SeaExam/tree/main). The script will download the model and tokenizer, and evaluate the model on the benchmark data.
|
22 |
-
```python
|
23 |
-
python scripts/main.py --model $model_name_or_path
|
24 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval-results/SeaExam_results.csv
DELETED
@@ -1,47 +0,0 @@
|
|
1 |
-
,,,,M3Exam,,,,,,,MMLU,,,,,,,AVG,,,,,,
|
2 |
-
Model,type,open?,shot,en,zh,id,th,vi,avg,avg_sea,en,zh,id,th,vi,avg,avg_sea,en,zh,id,th,vi,avg,avg_sea
|
3 |
-
gpt-4-turbo-1106,chat,N,0,0.87683,0.78882,0.64873,0.68956,0.70774,0.74234,0.68201,0.79825,0.72912,0.74526,0.67088,0.71053,0.73081,0.70889,0.83754,0.75897,0.69700,0.68022,0.70913,0.73657,0.69545
|
4 |
-
Meta-Llama-3-70B,base,Y,3,0.84382,0.75621,0.61899,0.66181,0.68252,0.71267,0.65444,0.78772,0.70491,0.73509,0.65930,0.70526,0.71846,0.69988,0.81577,0.73056,0.67704,0.66055,0.69389,0.71556,0.67716
|
5 |
-
Meta-Llama-3-70B-Instruct,chat,Y,3,0.86321,0.69410,0.62975,0.64299,0.68424,0.70286,0.65233,0.79965,0.69088,0.72316,0.63228,0.68912,0.70702,0.68152,0.83143,0.69249,0.67645,0.63764,0.68668,0.70494,0.66692
|
6 |
-
Qwen1.5-72B,base,Y,3,0.83857,0.92547,0.58734,0.56820,0.64756,0.71343,0.60104,0.74491,0.69474,0.66456,0.56351,0.63088,0.65972,0.61965,0.79174,0.81010,0.62595,0.56586,0.63922,0.68657,0.61034
|
7 |
-
claude-3-sonnet-20240229,chat,N,0,0.78878,0.68323,0.58544,0.57150,0.62579,0.65095,0.59424,0.71333,0.60456,0.63684,0.54070,0.58421,0.61593,0.58725,0.75106,0.64390,0.61114,0.55610,0.60500,0.63344,0.59075
|
8 |
-
claude-3-haiku-20240307,chat,N,0,0.79036,0.65217,0.56266,0.57291,0.63095,0.64181,0.58884,0.71053,0.60526,0.61193,0.51474,0.56316,0.60112,0.56327,0.75044,0.62872,0.58729,0.54382,0.59705,0.62147,0.57606
|
9 |
-
dbrx-base,base,Y,3,0.80818,0.68944,0.53418,0.50659,0.60458,0.62859,0.54845,0.73123,0.64281,0.64456,0.47368,0.61754,0.62196,0.57860,0.76970,0.66612,0.58937,0.49013,0.61106,0.62528,0.56352
|
10 |
-
Mixtral-8x22B-v0.1,base,Y,3,0.83910,0.69565,0.56962,0.48730,0.60115,0.63856,0.55269,0.76877,0.62491,0.64667,0.45018,0.57649,0.61340,0.55778,0.80394,0.66028,0.60814,0.46874,0.58882,0.62598,0.55523
|
11 |
-
SeaLLM-7B-v2.5,chat,Y,3,0.75943,0.60248,0.50063,0.50659,0.61834,0.59749,0.54185,0.64877,0.53719,0.56772,0.48667,0.53018,0.55411,0.52819,0.70410,0.56984,0.53418,0.49663,0.57426,0.57580,0.53502
|
12 |
-
Qwen1.5-14B,base,Y,3,0.79665,0.86180,0.52722,0.47836,0.54900,0.64260,0.51819,0.67509,0.60211,0.55719,0.44491,0.52351,0.56056,0.50854,0.73587,0.73195,0.54220,0.46164,0.53625,0.60158,0.51336
|
13 |
-
gemini-1.0-pro,chat,N,0,0.56866,0.72516,0.43987,0.49247,0.60516,0.56626,0.51250,0.54912,0.59684,0.53368,0.43895,0.55298,0.53432,0.50854,0.55889,0.66100,0.48678,0.46571,0.57907,0.55029,0.51052
|
14 |
-
gemma-7b,base,Y,3,0.73061,0.52795,0.46456,0.46284,0.59656,0.55650,0.50799,0.63579,0.50772,0.55228,0.48842,0.49684,0.53621,0.51251,0.68320,0.51783,0.50842,0.47563,0.54670,0.54636,0.51025
|
15 |
-
gpt-3.5-turbo-0125,,N,3,0.75105,0.58851,0.50000,0.38852,0.53352,0.55232,0.47402,0.68211,0.54912,0.59088,0.38596,0.50246,0.54211,0.49310,0.71658,0.56882,0.54544,0.38724,0.51799,0.54721,0.48356
|
16 |
-
Mixtral-8x7B-v0.1,base,Y,3,0.77096,0.60559,0.47975,0.43509,0.52206,0.56269,0.47897,0.70351,0.54140,0.56632,0.39298,0.49404,0.53965,0.48444,0.73724,0.57350,0.52303,0.41404,0.50805,0.55117,0.48171
|
17 |
-
Llama-2-70b-hf,base,Y,3,0.74895,0.59938,0.49177,0.34478,0.55931,0.54884,0.46529,0.68526,0.55965,0.58982,0.32737,0.52035,0.53649,0.47918,0.71711,0.57951,0.54080,0.33607,0.53983,0.54267,0.47223
|
18 |
-
Meta-Llama-3-8B,base,Y,3,0.70021,0.54037,0.42722,0.45390,0.50888,0.52612,0.46333,0.63193,0.48561,0.51158,0.43579,0.49053,0.51109,0.47930,0.66607,0.51299,0.46940,0.44485,0.49970,0.51860,0.47132
|
19 |
-
Sailor-7B-Chat,chat,Y,3,0.65618,0.65062,0.47405,0.46425,0.51175,0.55137,0.48335,0.55579,0.47509,0.48526,0.41789,0.46105,0.47902,0.45474,0.60599,0.56285,0.47966,0.44107,0.48640,0.51519,0.46904
|
20 |
-
gpt-3.5-turbo-0125,chat,N,0,0.75577,0.60559,0.49304,0.39652,0.52894,0.55597,0.47283,0.67228,0.53018,0.56667,0.36070,0.46281,0.51853,0.46339,0.71402,0.56788,0.52985,0.37861,0.49587,0.53725,0.46811
|
21 |
-
Yi-34B,base,Y,3,0.81499,0.86025,0.54114,0.38147,0.50201,0.61997,0.47487,0.75860,0.68386,0.60105,0.31439,0.45018,0.56161,0.45520,0.78679,0.77205,0.57110,0.34793,0.47609,0.59079,0.46504
|
22 |
-
Meta-Llama-3-8B-Instruct,chat,Y,3,0.72537,0.53727,0.46646,0.37065,0.50946,0.52184,0.44885,0.64912,0.48246,0.50421,0.36702,0.47544,0.49565,0.44889,0.68724,0.50986,0.48533,0.36883,0.49245,0.50874,0.44887
|
23 |
-
SeaLLM-7B-v2,chat,Y,3,0.70178,0.51553,0.43165,0.40593,0.51519,0.51401,0.45092,0.61474,0.45930,0.49158,0.36246,0.44246,0.47411,0.43216,0.65826,0.48741,0.46161,0.38419,0.47882,0.49406,0.44154
|
24 |
-
Sailor-7B,base,Y,3,0.61111,0.63199,0.44304,0.40969,0.49914,0.51899,0.45062,0.52456,0.44737,0.45614,0.40070,0.43754,0.45326,0.43146,0.56784,0.53968,0.44959,0.40520,0.46834,0.48613,0.44104
|
25 |
-
Qwen1.5-7B-Chat,chat,Y,3,0.64570,0.62733,0.43038,0.39793,0.49226,0.51872,0.44019,0.58351,0.51579,0.42772,0.36316,0.44667,0.46737,0.41251,0.61461,0.57156,0.42905,0.38054,0.46947,0.49304,0.42635
|
26 |
-
Yi-9B,base,Y,3,0.77516,0.79193,0.49241,0.35748,0.45330,0.57405,0.43439,0.67684,0.59263,0.50772,0.29404,0.38140,0.49053,0.39439,0.72600,0.69228,0.50006,0.32576,0.41735,0.53229,0.41439
|
27 |
-
Qwen1.5-7B,base,Y,3,0.72117,0.81056,0.44114,0.36124,0.44986,0.55679,0.41741,0.61228,0.51509,0.45895,0.34105,0.41333,0.46814,0.40444,0.66673,0.66282,0.45004,0.35115,0.43159,0.51247,0.41093
|
28 |
-
Mistral-7B-v0.1,base,Y,3,0.67715,0.49689,0.42152,0.34572,0.40860,0.46998,0.39194,0.60877,0.45754,0.47053,0.31579,0.40351,0.45123,0.39661,0.64296,0.47722,0.44602,0.33075,0.40605,0.46060,0.39428
|
29 |
-
gemma-7b-it,chat,Y,3,0.62159,0.42702,0.37342,0.32079,0.46705,0.44197,0.38709,0.52421,0.42632,0.41719,0.34456,0.39298,0.42105,0.38491,0.57290,0.42667,0.39531,0.33268,0.43002,0.43151,0.38600
|
30 |
-
Mistral-7B-Instruct-v0.2,chat,Y,3,0.65671,0.49534,0.40443,0.30386,0.39885,0.45184,0.36905,0.58877,0.43404,0.44246,0.32596,0.38211,0.43467,0.38351,0.62274,0.46469,0.42344,0.31491,0.39048,0.44325,0.37628
|
31 |
-
Qwen1.5-4B,base,Y,3,0.66352,0.77174,0.35127,0.31891,0.38854,0.49879,0.35290,0.55018,0.46807,0.39298,0.31193,0.36947,0.41853,0.35813,0.60685,0.61990,0.37212,0.31542,0.37901,0.45866,0.35552
|
32 |
-
Yi-6B,base,Y,3,0.70440,0.80901,0.41076,0.29821,0.37020,0.51852,0.35972,0.62175,0.54316,0.43825,0.26140,0.33368,0.43965,0.34444,0.66308,0.67608,0.42450,0.27981,0.35194,0.47908,0.35208
|
33 |
-
Llama-2-13b-hf,base,Y,3,0.60535,0.36491,0.38418,0.28786,0.40860,0.41018,0.36021,0.53368,0.38877,0.42421,0.24175,0.36386,0.39046,0.34327,0.56952,0.37684,0.40419,0.26481,0.38623,0.40032,0.35174
|
34 |
-
Llama-2-13b-chat-hf,chat,Y,3,0.58910,0.38199,0.37152,0.28833,0.38968,0.40412,0.34985,0.53088,0.38281,0.40351,0.25789,0.34561,0.38414,0.33567,0.55999,0.38240,0.38751,0.27311,0.36765,0.39413,0.34276
|
35 |
-
Qwen1.5-MoE-A2.7B,base,Y,3,0.62788,0.78882,0.36582,0.25400,0.40172,0.48765,0.34051,0.56456,0.49123,0.40772,0.26070,0.31684,0.40821,0.32842,0.59622,0.64002,0.38677,0.25735,0.35928,0.44793,0.33447
|
36 |
-
gemma-2b-it,chat,Y,3,0.43868,0.37733,0.31646,0.28363,0.35702,0.35462,0.31904,0.37789,0.33614,0.33930,0.30526,0.32035,0.33579,0.32164,0.40829,0.35673,0.32788,0.29445,0.33869,0.34521,0.32034
|
37 |
-
Llama-2-7b-chat-hf,chat,Y,3,0.56604,0.32609,0.34114,0.26811,0.34040,0.36835,0.31655,0.48211,0.35509,0.35789,0.25684,0.33298,0.35698,0.31591,0.52407,0.34059,0.34952,0.26248,0.33669,0.36267,0.31623
|
38 |
-
bloomz-7b1,chat,Y,3,0.43082,0.37733,0.36139,0.25588,0.35645,0.35637,0.32457,0.36561,0.34386,0.32526,0.22386,0.31684,0.31509,0.28865,0.39822,0.36059,0.34333,0.23987,0.33664,0.33573,0.30661
|
39 |
-
gemma-2b,base,Y,3,0.41719,0.27484,0.30443,0.28645,0.31576,0.31974,0.30221,0.37860,0.30281,0.31474,0.30070,0.30667,0.32070,0.30737,0.39789,0.28883,0.30958,0.29358,0.31121,0.32022,0.30479
|
40 |
-
Llama-2-7b-hf,base,Y,3,0.49109,0.32298,0.30823,0.26341,0.31748,0.34064,0.29637,0.44982,0.33439,0.34421,0.25930,0.30877,0.33930,0.30409,0.47046,0.32868,0.32622,0.26135,0.31313,0.33997,0.30023
|
41 |
-
Qwen1.5-1.8B,base,Y,3,0.54612,0.71273,0.32595,0.24365,0.32378,0.43045,0.29779,0.46211,0.39018,0.32702,0.24456,0.32281,0.34933,0.29813,0.50411,0.55145,0.32648,0.24411,0.32329,0.38989,0.29796
|
42 |
-
Qwen1.5-0.5B,base,Y,3,0.44602,0.61025,0.29367,0.26011,0.29742,0.38149,0.28373,0.38737,0.32421,0.29649,0.28456,0.29965,0.31846,0.29357,0.41669,0.46723,0.29508,0.27234,0.29854,0.34998,0.28865
|
43 |
-
sea-lion-7b-instruct,chat,Y,3,0.26992,0.27329,0.28671,0.26435,0.26877,0.27261,0.27327,0.26947,0.26070,0.25684,0.26526,0.25474,0.26140,0.25895,0.26969,0.26700,0.27178,0.26480,0.26175,0.26700,0.26611
|
44 |
-
sea-lion-7b,base,Y,3,0.24476,0.22826,0.25443,0.26435,0.24126,0.24661,0.25335,0.24772,0.26175,0.24982,0.24491,0.26351,0.25354,0.25275,0.24624,0.24501,0.25213,0.25463,0.25238,0.25008,0.25305
|
45 |
-
phi-2,base,Y,3,0.58176,0.28571,0.29494,0.20978,0.26934,0.32831,0.25802,0.56842,0.29439,0.29333,0.14105,0.26842,0.31312,0.23427,0.57509,0.29005,0.29414,0.17542,0.26888,0.32072,0.24614
|
46 |
-
bloom-7b1,base,Y,3,0.22694,0.18323,0.25316,0.24036,0.24298,0.22933,0.24550,0.25088,0.23895,0.25158,0.23684,0.24456,0.24456,0.24433,0.23891,0.21109,0.25237,0.23860,0.24377,0.23695,0.24491
|
47 |
-
claude-3-opus-20240229,chat,N,0,,,0.70316,0.73330,0.74613,,0.72753,,,,,,,,,,,,,,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval-results/demo-leaderboard/gpt2-demo/results_2023-11-21T18-10-08.json
DELETED
@@ -1,15 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"config": {
|
3 |
-
"model_dtype": "torch.float16",
|
4 |
-
"model_name": "demo-leaderboard/gpt2-demo",
|
5 |
-
"model_sha": "ac3299b02780836378b9e1e68c6eead546e89f90"
|
6 |
-
},
|
7 |
-
"results": {
|
8 |
-
"task_name1": {
|
9 |
-
"metric_name": 0
|
10 |
-
},
|
11 |
-
"task_name2": {
|
12 |
-
"metric_name": 0.90
|
13 |
-
}
|
14 |
-
}
|
15 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/display/about.py
CHANGED
@@ -22,13 +22,14 @@ TITLE = """<h1 align="center" id="space-title">📃 SeaExam Leaderboard</h1>"""
|
|
22 |
SUB_TITLE = """<h2 align="center" id="space-title">What is the best LLM for Southeast Asian Languages❓</h1>"""
|
23 |
|
24 |
# What does your leaderboard evaluate?
|
|
|
|
|
|
|
|
|
25 |
INTRODUCTION_TEXT = """
|
26 |
-
This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. Refer to the "📝 About" tab for more information.
|
27 |
"""
|
28 |
|
29 |
-
# INTRODUCTION_TEXT = """
|
30 |
-
# This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. It assesses model performance using human-exam type benchmarks, reflecting the model's world knowledge (e.g., with language or social science subjects) and reasoning abilities (e.g., with mathematics or natural science subjects).
|
31 |
-
|
32 |
# For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "📝 About" tab.
|
33 |
|
34 |
# Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
|
|
|
22 |
SUB_TITLE = """<h2 align="center" id="space-title">What is the best LLM for Southeast Asian Languages❓</h1>"""
|
23 |
|
24 |
# What does your leaderboard evaluate?
|
25 |
+
# INTRODUCTION_TEXT = """
|
26 |
+
# This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. Refer to the "📝 About" tab for more information.
|
27 |
+
# """
|
28 |
+
|
29 |
INTRODUCTION_TEXT = """
|
30 |
+
This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. It assesses model performance using human-exam type benchmarks, reflecting the model's world knowledge (e.g., with language or social science subjects) and reasoning abilities (e.g., with mathematics or natural science subjects). Refer to the "📝 About" tab for more information.
|
31 |
"""
|
32 |
|
|
|
|
|
|
|
33 |
# For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "📝 About" tab.
|
34 |
|
35 |
# Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
|
update_git.sh
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
git add .
|
2 |
-
git commit -m "update scripts"
|
3 |
-
git push
|
|
|
|
|
|
|
|