KennethEnevoldsen committed
Commit b8ed799 (1 parent: 1fd4bba)

Removed debug code in refresh.py intended to only update one board (#20)


* Removed debug code in refresh.py intended to only update one board

I additionally removed types from the docstrings (some were incorrect) and added type annotations where I could infer them, to make the code easier to debug in the future.

Also added a short description to the README for ease of navigation.

* Added import for type annotations

* Ensure that the reset_index happens inplace

* Avoid adding duplicates in refresh.py
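For context, the removed debug code was a per-board guard inside the refresh loop (the old refresh.py contained `if board != "rar-b": continue`). A minimal sketch of why such a guard cannot stay in production code, using a hypothetical `BOARDS_CONFIG` stand-in rather than the real leaderboard config:

```python
# Hypothetical stand-in for the BOARDS_CONFIG that refresh.py iterates over.
BOARDS_CONFIG = {"en": {}, "fr": {}, "rar-b": {}}

boards_data = {}
for board, board_config in BOARDS_CONFIG.items():
    # Debug-only shortcut (now removed): silently skips every board except
    # "rar-b", so a daily refresh would update only that single board.
    if board != "rar-b":
        continue
    boards_data[board] = {"data_overall": None, "data_tasks": {}}

print(list(boards_data))  # ['rar-b'] -- every other board is left stale
```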

Files changed (2)
  1. README.md +20 -0
  2. refresh.py +334 -133
README.md CHANGED
@@ -12,3 +12,23 @@ tags:
  startup_duration_timeout: 1h
  fullWidth: true
  ---
+
+ ## The MTEB Leaderboard repository
+
+ This repository contains the code for pushing and updating the MTEB leaderboard daily.
+
+ | Relevant Links | Description |
+ |------------------------------------------|------------------------------|
+ | [mteb](https://github.com/embeddings-benchmark/mteb) | The implementation of the benchmark. Here you can, for example, find the code to run your model on the benchmark. |
+ | [leaderboard](https://huggingface.co/spaces/mteb/leaderboard) | The leaderboard itself, where you can view the results of models run on MTEB. |
+ | [results](https://github.com/embeddings-benchmark/results) | The results of MTEB are stored here. You can also publish results to the leaderboard by [adding](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md) them to your model card. |
+
+ ## Developer setup
+
+ To set up the repository:
+
+ ```
+ git clone {repo_url}
+ # potentially create a virtual environment using Python 3.9
+ pip install -r requirements.txt
+ ```
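The "reset_index happens inplace" bullet in the commit message corresponds to a change in `write_out_results`, visible in the refresh.py diff below: the chained `item.reset_index().to_json(...)` becomes `item.reset_index(inplace=True)` followed by `item.to_json(...)`. A short, self-contained pandas sketch of the difference, using a toy DataFrame rather than project data:

```python
import pandas as pd

item = pd.DataFrame({"Model": ["a", "b"]}, index=[5, 7])

# Chained call: reset_index() returns a *new* frame; `item` itself is untouched.
flattened = item.reset_index()
print(item.index.tolist())  # [5, 7] -- original index still in place

# In-place call (the form this commit switches to): `item` is modified, so the
# same object that is serialised right afterwards carries the flat 0..N index.
item.reset_index(inplace=True)
print(item.index.tolist())  # [0, 1]
item.to_json("default.jsonl", orient="records", lines=True)
```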
refresh.py CHANGED
@@ -1,17 +1,19 @@
1
- from functools import reduce
 
2
  import json
3
  import os
4
  import re
 
 
5
 
 
6
  from datasets import load_dataset
7
  from huggingface_hub import hf_hub_download
8
  from huggingface_hub.repocard import metadata_load
9
- import pandas as pd
10
  from tqdm.autonotebook import tqdm
11
 
 
12
  from utils.model_size import get_model_parameters_memory
13
- from envs import LEADERBOARD_CONFIG, MODEL_META, REPO_ID, RESULTS_REPO, API
14
-
15
 
16
  MODEL_CACHE = {}
17
  TASKS_CONFIG = LEADERBOARD_CONFIG["tasks"]
@@ -34,21 +36,44 @@ TASK_TO_METRIC["PairClassification"].append("cos_sim_ap")
34
  TASK_TO_METRIC["PairClassification"].append("cosine_ap")
35
 
36
 
37
- EXTERNAL_MODELS = {k for k,v in MODEL_META["model_meta"].items() if v.get("is_external", False)}
38
- EXTERNAL_MODEL_TO_LINK = {k: v["link"] for k,v in MODEL_META["model_meta"].items() if v.get("link", False)}
39
- EXTERNAL_MODEL_TO_DIM = {k: v["dim"] for k,v in MODEL_META["model_meta"].items() if v.get("dim", False)}
40
- EXTERNAL_MODEL_TO_SEQLEN = {k: v["seq_len"] for k,v in MODEL_META["model_meta"].items() if v.get("seq_len", False)}
41
- EXTERNAL_MODEL_TO_SIZE = {k: v["size"] for k,v in MODEL_META["model_meta"].items() if v.get("size", False)}
42
- PROPRIETARY_MODELS = {k for k,v in MODEL_META["model_meta"].items() if v.get("is_proprietary", False)}
43
- TASK_DESCRIPTIONS = {k: v["task_description"] for k,v in TASKS_CONFIG.items()}
44
  TASK_DESCRIPTIONS["Overall"] = "Overall performance across MTEB tasks."
45
- SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {k for k,v in MODEL_META["model_meta"].items() if v.get("is_sentence_transformers_compatible", False)}
 
 
 
 
46
  MODELS_TO_SKIP = MODEL_META["models_to_skip"]
47
  CROSS_ENCODERS = MODEL_META["cross_encoders"]
48
- BI_ENCODERS = [k for k, _ in MODEL_META["model_meta"].items() if k not in CROSS_ENCODERS + ["bm25"]]
49
- INSTRUCT_MODELS = {k for k,v in MODEL_META["model_meta"].items() if v.get("uses_instruct", False)}
50
- NOINSTRUCT_MODELS = {k for k,v in MODEL_META["model_meta"].items() if not v.get("uses_instruct", False)}
51
-
 
 
 
 
 
52
 
53
 
54
  TASK_TO_TASK_TYPE = {task_category: [] for task_category in TASKS}
@@ -64,12 +89,28 @@ MODEL_INFOS = {}
64
  # with open(model_infos_path) as f:
65
  # MODEL_INFOS = json.load(f)
66
 
67
- def add_rank(df):
68
- cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens"]]
69
  if len(cols_to_rank) == 1:
70
  df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
71
  else:
72
- df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
 
 
 
 
73
  df.sort_values("Average", ascending=False, inplace=True)
74
  df.insert(0, "Rank", list(range(1, len(df) + 1)))
75
  df = df.round(2)
@@ -78,23 +119,26 @@ def add_rank(df):
78
  return df
79
 
80
 
81
- def make_clickable_model(model_name, link=None):
82
  if link is None:
83
  link = "https://huggingface.co/" + model_name
84
  # Remove user from model name
85
- return (
86
- f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
87
- )
88
 
89
 
90
  def add_lang(examples):
91
- if not(examples["eval_language"]) or (examples["eval_language"] == "default"):
92
  examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
93
  else:
94
- examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"] + f' ({examples["eval_language"]})'
 
 
95
  return examples
96
 
97
- def norm(names): return set([name.split(" ")[0] for name in names])
 
 
 
98
 
99
  def add_task(examples):
100
  # Could be added to the dataset loading script instead
@@ -111,16 +155,22 @@ def add_task(examples):
111
  examples["mteb_task"] = "Unknown"
112
  return examples
113
 
114
- def filter_metric_external(x, task, metrics):
115
- # This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
116
- if x['mteb_dataset_name'] in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval']:
117
- return x["mteb_task"] == task and x['metric'] == 'ndcg_at_1'
 
118
  else:
119
- return x["mteb_task"] == task and x["metric"] in metrics
 
120
 
121
- def filter_metric_fetched(name, metric, expected_metrics):
122
- # This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
123
- return metric == 'ndcg_at_1' if name in ['LEMBNeedleRetrieval', 'LEMBPasskeyRetrieval'] else metric in expected_metrics
 
 
 
 
124
 
125
 
126
  def get_dim_seq_size(model):
@@ -139,12 +189,20 @@ def get_dim_seq_size(model):
139
  config_path = hf_hub_download(model.modelId, filename="config.json")
140
  config = json.load(open(config_path))
141
  if not dim:
142
- dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
143
- seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))
144
-
 
 
 
 
 
 
 
 
145
  if dim == "" or seq == "":
146
  raise Exception(f"Could not find dim or seq for model {model.modelId}")
147
-
148
  # Get model file size without downloading. Parameters in million parameters and memory in GB
149
  parameters, memory = get_model_parameters_memory(model)
150
  return dim, seq, parameters, memory
@@ -159,27 +217,54 @@ def get_external_model_results():
159
  for model in EXTERNAL_MODELS:
160
  if model not in EXTERNAL_MODEL_RESULTS:
161
  models_to_run.append(model)
162
- EXTERNAL_MODEL_RESULTS[model] = {k: {v[0]: []} for k, v in TASK_TO_METRIC.items()}
 
 
163
 
164
  ## only if we want to re-calculate all instead of using the cache... it's likely they haven't changed
165
  ## but if your model results have changed, delete it from the "EXTERNAL_MODEL_RESULTS.json" file
166
  else:
167
- EXTERNAL_MODEL_RESULTS = {model: {k: {v[0]: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
 
 
 
168
  models_to_run = EXTERNAL_MODELS
169
 
170
  pbar = tqdm(models_to_run, desc="Fetching external model results")
171
  for model in pbar:
172
  pbar.set_description(f"Fetching external model results for {model!r}")
173
- ds = load_dataset(RESULTS_REPO, model, trust_remote_code=True, download_mode='force_redownload', verification_mode="no_checks")
174
  ds = ds.map(add_lang)
175
  ds = ds.map(add_task)
176
- base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, f"https://huggingface.co/spaces/{REPO_ID}"))}
177
 
178
  for task, metrics in TASK_TO_METRIC.items():
179
- ds_dict = ds.filter(lambda x: filter_metric_external(x, task, metrics))["test"].to_dict()
180
- ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
181
  # metrics[0] is the main name for this metric; other names in the list are legacy for backward-compat
182
- EXTERNAL_MODEL_RESULTS[model][task][metrics[0]].append({**base_dict, **ds_dict})
 
 
183
 
184
  # Save & cache EXTERNAL_MODEL_RESULTS
185
  with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
@@ -188,7 +273,7 @@ def get_external_model_results():
188
  return EXTERNAL_MODEL_RESULTS
189
 
190
 
191
- def download_or_use_cache(modelId):
192
  global MODEL_CACHE
193
  if modelId in MODEL_CACHE:
194
  return MODEL_CACHE[modelId]
@@ -202,7 +287,15 @@ def download_or_use_cache(modelId):
202
  return meta
203
 
204
 
205
- def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=True, task_to_metric=TASK_TO_METRIC, rank=True):
206
  global MODEL_INFOS
207
 
208
  with open("EXTERNAL_MODEL_RESULTS.json", "r") as f:
@@ -211,46 +304,62 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
211
  api = API
212
  models = list(api.list_models(filter="mteb"))
213
  # Legacy names changes; Also fetch the old results & merge later
214
- if ('MLSUMClusteringP2P (fr)' in datasets):
215
- datasets.append('MLSUMClusteringP2P')
216
- if ('MLSUMClusteringS2S (fr)' in datasets):
217
- datasets.append('MLSUMClusteringS2S')
218
- if ('PawsXPairClassification (fr)' in datasets):
219
- datasets.append('PawsX (fr)')
220
  # Initialize list to models that we cannot fetch metadata from
221
  df_list = []
222
  for model in external_model_results:
223
  results_list = []
224
  for task in tasks:
225
  # Not all models have InstructionRetrieval, other new tasks
226
- if task not in external_model_results[model]: continue
 
227
  results_list += external_model_results[model][task][task_to_metric[task][0]]
228
-
229
  if len(datasets) > 0:
230
- res = {k: v for d in results_list for k, v in d.items() if (k == "Model") or any([x in k for x in datasets])}
231
  elif langs:
232
  # Would be cleaner to rely on an extra language column instead
233
  langs_format = [f"({lang})" for lang in langs]
234
- res = {k: v for d in results_list for k, v in d.items() if any([k.split(" ")[-1] in (k, x) for x in langs_format])}
235
  else:
236
  res = {k: v for d in results_list for k, v in d.items()}
237
  # Model & at least one result
238
  if len(res) > 1:
239
  if add_emb_dim:
240
- res["Model Size (Million Parameters)"] = EXTERNAL_MODEL_TO_SIZE.get(model, "")
241
- res["Memory Usage (GB, fp32)"] = round(res["Model Size (Million Parameters)"] * 1e6 * 4 / 1024**3, 2) if res["Model Size (Million Parameters)"] != "" else ""
242
  res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
243
  res["Max Tokens"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
244
  df_list.append(res)
245
 
246
  pbar = tqdm(models, desc="Fetching model metadata")
247
  for model in pbar:
248
- if model.modelId in MODELS_TO_SKIP: continue
 
249
  pbar.set_description(f"Fetching {model.modelId!r} metadata")
250
  meta = download_or_use_cache(model.modelId)
251
- MODEL_INFOS[model.modelId] = {
252
- "metadata": meta
253
- }
254
  if "model-index" not in meta:
255
  continue
256
  # meta['model-index'][0]["results"] is list of elements like:
@@ -269,13 +378,45 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
269
  # },
270
  # Use "get" instead of dict indexing to skip incompat metadata instead of erroring out
271
  if len(datasets) > 0:
272
- task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and any([x in sub_res.get("dataset", {}).get("name", "") for x in datasets])]
273
  elif langs:
274
- task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and (sub_res.get("dataset", {}).get("config", "default") in ("default", *langs))]
275
  else:
276
- task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks)]
 
 
 
 
277
  try:
278
- out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if filter_metric_fetched(res["dataset"]["name"].replace("MTEB ", ""), score["type"], task_to_metric.get(res["task"]["type"]))][0]} for res in task_results]
279
  except Exception as e:
280
  print("ERROR", model.modelId, e)
281
  continue
@@ -286,7 +427,9 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
286
  if add_emb_dim:
287
  # The except clause triggers on gated repos, we can use external metadata for those
288
  try:
289
- MODEL_INFOS[model.modelId]["dim_seq_size"] = list(get_dim_seq_size(model))
 
 
290
  except:
291
  name_without_org = model.modelId.split("/")[-1]
292
  # EXTERNAL_MODEL_TO_SIZE[name_without_org] refers to millions of parameters, so for memory usage
@@ -296,12 +439,29 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
296
  EXTERNAL_MODEL_TO_DIM.get(name_without_org, ""),
297
  EXTERNAL_MODEL_TO_SEQLEN.get(name_without_org, ""),
298
  EXTERNAL_MODEL_TO_SIZE.get(name_without_org, ""),
299
- round(EXTERNAL_MODEL_TO_SIZE[name_without_org] * 1e6 * 4 / 1024**3, 2) if name_without_org in EXTERNAL_MODEL_TO_SIZE else "",
300
  )
301
- out["Embedding Dimensions"], out["Max Tokens"], out["Model Size (Million Parameters)"], out["Memory Usage (GB, fp32)"] = tuple(MODEL_INFOS[model.modelId]["dim_seq_size"])
302
  df_list.append(out)
303
  model_siblings = model.siblings or []
304
- if model.library_name == "sentence-transformers" or "sentence-transformers" in model.tags or "modules.json" in {file.rfilename for file in model_siblings}:
 
 
 
 
305
  SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
306
 
307
  # # Save & cache MODEL_INFOS
@@ -314,28 +474,39 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
314
  df = df.groupby("Model", as_index=False).first()
315
  # Put 'Model' column first
316
  cols = sorted(list(df.columns))
317
- base_columns = ["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens"]
318
  if len(datasets) > 0:
319
  # Update legacy column names to be merged with newer ones
320
  # Update 'MLSUMClusteringP2P (fr)' with values from 'MLSUMClusteringP2P'
321
- if ('MLSUMClusteringP2P (fr)' in datasets) and ('MLSUMClusteringP2P' in cols):
322
- df['MLSUMClusteringP2P (fr)'] = df['MLSUMClusteringP2P (fr)'].fillna(df['MLSUMClusteringP2P'])
323
- datasets.remove('MLSUMClusteringP2P')
324
- if ('MLSUMClusteringS2S (fr)' in datasets) and ('MLSUMClusteringS2S' in cols):
325
- df['MLSUMClusteringS2S (fr)'] = df['MLSUMClusteringS2S (fr)'].fillna(df['MLSUMClusteringS2S'])
326
- datasets.remove('MLSUMClusteringS2S')
327
- if ('PawsXPairClassification (fr)' in datasets) and ('PawsX (fr)' in cols):
328
- # for the first bit no model has it, hence no column for it. We can remove this in a month or so
 
 
 
 
329
  if "PawsXPairClassification (fr)" not in cols:
330
- df['PawsXPairClassification (fr)'] = df['PawsX (fr)']
331
  else:
332
- df['PawsXPairClassification (fr)'] = df['PawsXPairClassification (fr)'].fillna(df['PawsX (fr)'])
 
 
333
  # make all the columns the same
334
- datasets.remove('PawsX (fr)')
335
- cols.remove('PawsX (fr)')
336
- df.drop(columns=['PawsX (fr)'], inplace=True)
337
- cols.append('PawsXPairClassification (fr)')
338
-
339
  # Filter invalid columns
340
  cols = [col for col in cols if col in base_columns + datasets]
341
  i = 0
@@ -345,7 +516,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
345
  i += 1
346
  df = df[cols]
347
  if rank:
348
- df = add_rank(df)
349
  if fillna:
350
  df.fillna("", inplace=True)
351
  return df
@@ -353,7 +524,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
353
 
354
  # Get dict with a task list for each task category
355
  # E.g. {"Classification": ["AmazonMassiveIntentClassification (en)", ...], "PairClassification": ["SprintDuplicateQuestions", ...]}
356
- def get_mteb_average(task_dict: dict):
357
  all_tasks = reduce(lambda x, y: x + y, task_dict.values())
358
  DATA_OVERALL = get_mteb_data(
359
  tasks=list(task_dict.keys()),
@@ -364,10 +535,20 @@ def get_mteb_average(task_dict: dict):
364
  )
365
  # Debugging:
366
  # DATA_OVERALL.to_csv("overall.csv")
367
- DATA_OVERALL.insert(1, f"Average ({len(all_tasks)} datasets)", DATA_OVERALL[all_tasks].mean(axis=1, skipna=False))
 
 
 
 
368
  for i, (task_category, task_category_list) in enumerate(task_dict.items()):
369
- DATA_OVERALL.insert(i+2, f"{task_category} Average ({len(task_category_list)} datasets)", DATA_OVERALL[task_category_list].mean(axis=1, skipna=False))
370
- DATA_OVERALL.sort_values(f"Average ({len(all_tasks)} datasets)", ascending=False, inplace=True)
  # Start ranking from 1
372
  DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))
373
 
@@ -375,15 +556,32 @@ def get_mteb_average(task_dict: dict):
375
 
376
  DATA_TASKS = {}
377
  for task_category, task_category_list in task_dict.items():
378
- DATA_TASKS[task_category] = add_rank(DATA_OVERALL[["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"] + task_category_list])
379
- DATA_TASKS[task_category] = DATA_TASKS[task_category][DATA_TASKS[task_category].iloc[:, 4:].ne("").any(axis=1)]
380
 
381
  # Fill NaN after averaging
382
  DATA_OVERALL.fillna("", inplace=True)
383
 
384
- data_overall_rows = ["Rank", "Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)", "Embedding Dimensions", "Max Tokens", f"Average ({len(all_tasks)} datasets)"]
385
  for task_category, task_category_list in task_dict.items():
386
- data_overall_rows.append(f"{task_category} Average ({len(task_category_list)} datasets)")
 
 
387
 
388
  DATA_OVERALL = DATA_OVERALL[data_overall_rows]
389
  DATA_OVERALL = DATA_OVERALL[DATA_OVERALL.iloc[:, 5:].ne("").any(axis=1)]
@@ -391,13 +589,10 @@ def get_mteb_average(task_dict: dict):
391
  return DATA_OVERALL, DATA_TASKS
392
 
393
 
394
- def refresh_leaderboard():
395
  """
396
  The main code to refresh and calculate results for the leaderboard. It does this by fetching the results from the
397
  external models and the models in the leaderboard, then calculating the average scores for each task category.
398
-
399
- Returns:
400
- dict: A dictionary containing the overall leaderboard and the task category leaderboards.
401
  """
402
 
403
  # get external model results and cache them
@@ -406,14 +601,14 @@ def refresh_leaderboard():
406
 
407
  boards_data = {}
408
  all_data_tasks = []
409
- pbar_tasks = tqdm(BOARDS_CONFIG.items(), desc="Fetching leaderboard results for ???", total=len(BOARDS_CONFIG), leave=True)
410
  for board, board_config in pbar_tasks:
411
- # To add only a single new board, you can uncomment the below to be faster
412
- if board != "rar-b": continue
413
- boards_data[board] = {
414
- "data_overall": None,
415
- "data_tasks": {}
416
- }
417
  pbar_tasks.set_description(f"Fetching leaderboard results for {board!r}")
418
  pbar_tasks.refresh()
419
  if board_config["has_overall"]:
@@ -423,30 +618,30 @@ def refresh_leaderboard():
423
  all_data_tasks.extend(data_tasks.values())
424
  else:
425
  for task_category, task_category_list in board_config["tasks"].items():
426
- data_task_category = get_mteb_data(tasks=[task_category], datasets=task_category_list)
427
- data_task_category.drop(columns=["Embedding Dimensions", "Max Tokens"], inplace=True)
 
 
 
 
428
  boards_data[board]["data_tasks"][task_category] = data_task_category
429
  all_data_tasks.append(data_task_category)
430
 
431
  return all_data_tasks, boards_data
432
 
433
 
434
-
435
- def write_out_results(item, item_name: str):
436
  """
437
  Due to their complex structure, let's recursively create subfolders until we reach the end
438
  of the item and then save the DFs as jsonl files
439
 
440
  Args:
441
- item (dict): The item to save
442
- item_name (str): The name of the item
443
-
444
- Returns:
445
- None
446
  """
447
  main_folder = item_name
448
 
449
- if isinstance(item, list):
450
  for i, v in enumerate(item):
451
  write_out_results(v, os.path.join(main_folder, str(i)))
452
 
@@ -463,8 +658,9 @@ def write_out_results(item, item_name: str):
463
  elif isinstance(item, pd.DataFrame):
464
  print(f"Saving {main_folder} to {main_folder}/default.jsonl")
465
  os.makedirs(main_folder, exist_ok=True)
466
-
467
- item.reset_index().to_json(f"{main_folder}/default.jsonl", orient="records", lines=True)
 
468
 
469
  elif isinstance(item, str):
470
  print(f"Saving {main_folder} to {main_folder}/default.txt")
@@ -483,38 +679,44 @@ def write_out_results(item, item_name: str):
483
  raise Exception(f"Unknown type {type(item)}")
484
 
485
 
486
- def load_results(data_path):
487
  """
488
  Do the reverse of `write_out_results` to reconstruct the item
489
 
490
  Args:
491
- data_path (str): The path to the data to load
492
 
493
  Returns:
494
- dict: The loaded data
495
  """
496
  if os.path.isdir(data_path):
497
  # if the folder just has numbers from 0 to N, load as a list
498
  all_files_in_dir = list(os.listdir(data_path))
499
  if set(all_files_in_dir) == set([str(i) for i in range(len(all_files_in_dir))]):
500
  ### the list case
501
- return [load_results(os.path.join(data_path, str(i))) for i in range(len(os.listdir(data_path)))]
 
 
 
502
  else:
503
  if len(all_files_in_dir) == 1:
504
  file_name = all_files_in_dir[0]
505
- if file_name == "default.jsonl":
506
  return load_results(os.path.join(data_path, file_name))
507
- else: ### the dict case
508
  return {file_name: load_results(os.path.join(data_path, file_name))}
509
  else:
510
- return {file_name: load_results(os.path.join(data_path, file_name)) for file_name in all_files_in_dir}
511
-
 
 
 
512
  elif data_path.endswith(".jsonl"):
513
  df = pd.read_json(data_path, orient="records", lines=True)
514
  if "index" in df.columns:
515
  df = df.set_index("index")
516
  return df
517
-
518
  else:
519
  with open(data_path, "r") as f:
520
  data = f.read()
@@ -524,17 +726,16 @@ def load_results(data_path):
524
  return data
525
 
526
 
527
-
528
  if __name__ == "__main__":
529
- print(f"Refreshing leaderboard statistics...")
530
  all_data_tasks, boards_data = refresh_leaderboard()
531
- print(f"Done calculating, saving...")
532
  # save them so that the leaderboard can use them. They're quite complex though
533
- # but we can't use pickle files because of git-lfs.
534
  write_out_results(all_data_tasks, "all_data_tasks")
535
  write_out_results(boards_data, "boards_data")
536
 
537
  # to load them use
538
  # all_data_tasks = load_results("all_data_tasks")
539
  # boards_data = load_results("boards_data")
540
- print("Done saving results!")
 
1
+ from __future__ import annotations
2
+
3
  import json
4
  import os
5
  import re
6
+ from functools import reduce
7
+ from typing import Any
8
 
9
+ import pandas as pd
10
  from datasets import load_dataset
11
  from huggingface_hub import hf_hub_download
12
  from huggingface_hub.repocard import metadata_load
 
13
  from tqdm.autonotebook import tqdm
14
 
15
+ from envs import API, LEADERBOARD_CONFIG, MODEL_META, REPO_ID, RESULTS_REPO
16
  from utils.model_size import get_model_parameters_memory
 
 
17
 
18
  MODEL_CACHE = {}
19
  TASKS_CONFIG = LEADERBOARD_CONFIG["tasks"]
 
36
  TASK_TO_METRIC["PairClassification"].append("cosine_ap")
37
 
38
 
39
+ EXTERNAL_MODELS = {
40
+ k for k, v in MODEL_META["model_meta"].items() if v.get("is_external", False)
41
+ }
42
+ EXTERNAL_MODEL_TO_LINK = {
43
+ k: v["link"] for k, v in MODEL_META["model_meta"].items() if v.get("link", False)
44
+ }
45
+ EXTERNAL_MODEL_TO_DIM = {
46
+ k: v["dim"] for k, v in MODEL_META["model_meta"].items() if v.get("dim", False)
47
+ }
48
+ EXTERNAL_MODEL_TO_SEQLEN = {
49
+ k: v["seq_len"]
50
+ for k, v in MODEL_META["model_meta"].items()
51
+ if v.get("seq_len", False)
52
+ }
53
+ EXTERNAL_MODEL_TO_SIZE = {
54
+ k: v["size"] for k, v in MODEL_META["model_meta"].items() if v.get("size", False)
55
+ }
56
+ PROPRIETARY_MODELS = {
57
+ k for k, v in MODEL_META["model_meta"].items() if v.get("is_proprietary", False)
58
+ }
59
+ TASK_DESCRIPTIONS = {k: v["task_description"] for k, v in TASKS_CONFIG.items()}
60
  TASK_DESCRIPTIONS["Overall"] = "Overall performance across MTEB tasks."
61
+ SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS = {
62
+ k
63
+ for k, v in MODEL_META["model_meta"].items()
64
+ if v.get("is_sentence_transformers_compatible", False)
65
+ }
66
  MODELS_TO_SKIP = MODEL_META["models_to_skip"]
67
  CROSS_ENCODERS = MODEL_META["cross_encoders"]
68
+ BI_ENCODERS = [
69
+ k for k, _ in MODEL_META["model_meta"].items() if k not in CROSS_ENCODERS + ["bm25"]
70
+ ]
71
+ INSTRUCT_MODELS = {
72
+ k for k, v in MODEL_META["model_meta"].items() if v.get("uses_instruct", False)
73
+ }
74
+ NOINSTRUCT_MODELS = {
75
+ k for k, v in MODEL_META["model_meta"].items() if not v.get("uses_instruct", False)
76
+ }
77
 
78
 
79
  TASK_TO_TASK_TYPE = {task_category: [] for task_category in TASKS}
 
89
  # with open(model_infos_path) as f:
90
  # MODEL_INFOS = json.load(f)
91
 
92
+
93
+ def add_rank(df: pd.DataFrame) -> pd.DataFrame:
94
+ cols_to_rank = [
95
+ col
96
+ for col in df.columns
97
+ if col
98
+ not in [
99
+ "Model",
100
+ "Model Size (Million Parameters)",
101
+ "Memory Usage (GB, fp32)",
102
+ "Embedding Dimensions",
103
+ "Max Tokens",
104
+ ]
105
+ ]
106
  if len(cols_to_rank) == 1:
107
  df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
108
  else:
109
+ df.insert(
110
+ len(df.columns) - len(cols_to_rank),
111
+ "Average",
112
+ df[cols_to_rank].mean(axis=1, skipna=False),
113
+ )
114
  df.sort_values("Average", ascending=False, inplace=True)
115
  df.insert(0, "Rank", list(range(1, len(df) + 1)))
116
  df = df.round(2)
 
119
  return df
120
 
121
 
122
+ def make_clickable_model(model_name: str, link: None | str = None) -> str:
123
  if link is None:
124
  link = "https://huggingface.co/" + model_name
125
  # Remove user from model name
126
+ return f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
 
 
127
 
128
 
129
  def add_lang(examples):
130
+ if not (examples["eval_language"]) or (examples["eval_language"] == "default"):
131
  examples["mteb_dataset_name_with_lang"] = examples["mteb_dataset_name"]
132
  else:
133
+ examples["mteb_dataset_name_with_lang"] = (
134
+ examples["mteb_dataset_name"] + f' ({examples["eval_language"]})'
135
+ )
136
  return examples
137
 
138
+
139
+ def norm(names: str) -> set:
140
+ return set([name.split(" ")[0] for name in names])
141
+
142
 
143
  def add_task(examples):
144
  # Could be added to the dataset loading script instead
 
155
  examples["mteb_task"] = "Unknown"
156
  return examples
157
 
158
+
159
+ def filter_metric_external(x, task, metrics) -> bool:
160
+ # This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
161
+ if x["mteb_dataset_name"] in ["LEMBNeedleRetrieval", "LEMBPasskeyRetrieval"]:
162
+ return bool(x["mteb_task"] == task and x["metric"] == "ndcg_at_1")
163
  else:
164
+ return bool(x["mteb_task"] == task and x["metric"] in metrics)
165
+
166
 
167
+ def filter_metric_fetched(name: str, metric: str, expected_metrics) -> bool:
168
+ # This is a hack for the passkey and needle retrieval test, which reports ndcg_at_1 (i.e. accuracy), rather than the ndcg_at_10 that is commonly used for retrieval tasks.
169
+ return bool(
170
+ metric == "ndcg_at_1"
171
+ if name in ["LEMBNeedleRetrieval", "LEMBPasskeyRetrieval"]
172
+ else metric in expected_metrics
173
+ )
174
 
175
 
176
  def get_dim_seq_size(model):
 
189
  config_path = hf_hub_download(model.modelId, filename="config.json")
190
  config = json.load(open(config_path))
191
  if not dim:
192
+ dim = config.get(
193
+ "hidden_dim", config.get("hidden_size", config.get("d_model", ""))
194
+ )
195
+ seq = config.get(
196
+ "n_positions",
197
+ config.get(
198
+ "max_position_embeddings",
199
+ config.get("n_ctx", config.get("seq_length", "")),
200
+ ),
201
+ )
202
+
203
  if dim == "" or seq == "":
204
  raise Exception(f"Could not find dim or seq for model {model.modelId}")
205
+
206
  # Get model file size without downloading. Parameters in million parameters and memory in GB
207
  parameters, memory = get_model_parameters_memory(model)
208
  return dim, seq, parameters, memory
 
217
  for model in EXTERNAL_MODELS:
218
  if model not in EXTERNAL_MODEL_RESULTS:
219
  models_to_run.append(model)
220
+ EXTERNAL_MODEL_RESULTS[model] = {
221
+ k: {v[0]: []} for k, v in TASK_TO_METRIC.items()
222
+ }
223
 
224
  ## only if we want to re-calculate all instead of using the cache... it's likely they haven't changed
225
  ## but if your model results have changed, delete it from the "EXTERNAL_MODEL_RESULTS.json" file
226
  else:
227
+ EXTERNAL_MODEL_RESULTS = {
228
+ model: {k: {v[0]: []} for k, v in TASK_TO_METRIC.items()}
229
+ for model in EXTERNAL_MODELS
230
+ }
231
  models_to_run = EXTERNAL_MODELS
232
 
233
  pbar = tqdm(models_to_run, desc="Fetching external model results")
234
  for model in pbar:
235
  pbar.set_description(f"Fetching external model results for {model!r}")
236
+ ds = load_dataset(
237
+ RESULTS_REPO,
238
+ model,
239
+ trust_remote_code=True,
240
+ download_mode="force_redownload",
241
+ verification_mode="no_checks",
242
+ )
243
  ds = ds.map(add_lang)
244
  ds = ds.map(add_task)
245
+ base_dict = {
246
+ "Model": make_clickable_model(
247
+ model,
248
+ link=EXTERNAL_MODEL_TO_LINK.get(
249
+ model, f"https://huggingface.co/spaces/{REPO_ID}"
250
+ ),
251
+ )
252
+ }
253
 
254
  for task, metrics in TASK_TO_METRIC.items():
255
+ ds_dict = ds.filter(lambda x: filter_metric_external(x, task, metrics))[
256
+ "test"
257
+ ].to_dict()
258
+ ds_dict = {
259
+ k: round(v, 2)
260
+ for k, v in zip(
261
+ ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"]
262
+ )
263
+ }
264
  # metrics[0] is the main name for this metric; other names in the list are legacy for backward-compat
265
+ EXTERNAL_MODEL_RESULTS[model][task][metrics[0]].append(
266
+ {**base_dict, **ds_dict}
267
+ )
268
 
269
  # Save & cache EXTERNAL_MODEL_RESULTS
270
  with open("EXTERNAL_MODEL_RESULTS.json", "w") as f:
 
273
  return EXTERNAL_MODEL_RESULTS
274
 
275
 
276
+ def download_or_use_cache(modelId: str):
277
  global MODEL_CACHE
278
  if modelId in MODEL_CACHE:
279
  return MODEL_CACHE[modelId]
 
287
  return meta
288
 
289
 
290
+ def get_mteb_data(
291
+ tasks: list = ["Clustering"],
292
+ langs: list = [],
293
+ datasets: list = [],
294
+ fillna: bool = True,
295
+ add_emb_dim: bool = True,
296
+ task_to_metric: dict = TASK_TO_METRIC,
297
+ rank: bool = True,
298
+ ) -> pd.DataFrame:
299
  global MODEL_INFOS
300
 
301
  with open("EXTERNAL_MODEL_RESULTS.json", "r") as f:
 
304
  api = API
305
  models = list(api.list_models(filter="mteb"))
306
  # Legacy names changes; Also fetch the old results & merge later
307
+ if "MLSUMClusteringP2P (fr)" in datasets:
308
+ datasets.append("MLSUMClusteringP2P")
309
+ if "MLSUMClusteringS2S (fr)" in datasets:
310
+ datasets.append("MLSUMClusteringS2S")
311
+ if "PawsXPairClassification (fr)" in datasets:
312
+ datasets.append("PawsX (fr)")
313
  # Initialize list to models that we cannot fetch metadata from
314
  df_list = []
315
  for model in external_model_results:
316
  results_list = []
317
  for task in tasks:
318
  # Not all models have InstructionRetrieval, other new tasks
319
+ if task not in external_model_results[model]:
320
+ continue
321
  results_list += external_model_results[model][task][task_to_metric[task][0]]
322
+
323
  if len(datasets) > 0:
324
+ res = {
325
+ k: v
326
+ for d in results_list
327
+ for k, v in d.items()
328
+ if (k == "Model") or any([x in k for x in datasets])
329
+ }
330
  elif langs:
331
  # Would be cleaner to rely on an extra language column instead
332
  langs_format = [f"({lang})" for lang in langs]
333
+ res = {
334
+ k: v
335
+ for d in results_list
336
+ for k, v in d.items()
337
+ if any([k.split(" ")[-1] in (k, x) for x in langs_format])
338
+ }
339
  else:
340
  res = {k: v for d in results_list for k, v in d.items()}
341
  # Model & at least one result
342
  if len(res) > 1:
343
  if add_emb_dim:
344
+ res["Model Size (Million Parameters)"] = EXTERNAL_MODEL_TO_SIZE.get(
345
+ model, ""
346
+ )
347
+ res["Memory Usage (GB, fp32)"] = (
348
+ round(res["Model Size (Million Parameters)"] * 1e6 * 4 / 1024**3, 2)
349
+ if res["Model Size (Million Parameters)"] != ""
350
+ else ""
351
+ )
352
  res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
353
  res["Max Tokens"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
354
  df_list.append(res)
355
 
356
  pbar = tqdm(models, desc="Fetching model metadata")
357
  for model in pbar:
358
+ if model.modelId in MODELS_TO_SKIP:
359
+ continue
360
  pbar.set_description(f"Fetching {model.modelId!r} metadata")
361
  meta = download_or_use_cache(model.modelId)
362
+ MODEL_INFOS[model.modelId] = {"metadata": meta}
 
 
363
  if "model-index" not in meta:
364
  continue
365
  # meta['model-index'][0]["results"] is list of elements like:
 
378
  # },
379
  # Use "get" instead of dict indexing to skip incompat metadata instead of erroring out
380
  if len(datasets) > 0:
381
+ task_results = [
382
+ sub_res
383
+ for sub_res in meta["model-index"][0]["results"]
384
+ if (sub_res.get("task", {}).get("type", "") in tasks)
385
+ and any(
386
+ [x in sub_res.get("dataset", {}).get("name", "") for x in datasets]
387
+ )
388
+ ]
389
  elif langs:
390
+ task_results = [
391
+ sub_res
392
+ for sub_res in meta["model-index"][0]["results"]
393
+ if (sub_res.get("task", {}).get("type", "") in tasks)
394
+ and (
395
+ sub_res.get("dataset", {}).get("config", "default")
396
+ in ("default", *langs)
397
+ )
398
+ ]
399
  else:
400
+ task_results = [
401
+ sub_res
402
+ for sub_res in meta["model-index"][0]["results"]
403
+ if (sub_res.get("task", {}).get("type", "") in tasks)
404
+ ]
405
  try:
406
+ out = [
407
+ {
408
+ res["dataset"]["name"].replace("MTEB ", ""): [
409
+ round(score["value"], 2)
410
+ for score in res["metrics"]
411
+ if filter_metric_fetched(
412
+ res["dataset"]["name"].replace("MTEB ", ""),
413
+ score["type"],
414
+ task_to_metric.get(res["task"]["type"]),
415
+ )
416
+ ][0]
417
+ }
418
+ for res in task_results
419
+ ]
420
  except Exception as e:
421
  print("ERROR", model.modelId, e)
422
  continue
 
427
  if add_emb_dim:
428
  # The except clause triggers on gated repos, we can use external metadata for those
429
  try:
430
+ MODEL_INFOS[model.modelId]["dim_seq_size"] = list(
431
+ get_dim_seq_size(model)
432
+ )
433
  except:
434
  name_without_org = model.modelId.split("/")[-1]
435
  # EXTERNAL_MODEL_TO_SIZE[name_without_org] refers to millions of parameters, so for memory usage
 
439
  EXTERNAL_MODEL_TO_DIM.get(name_without_org, ""),
440
  EXTERNAL_MODEL_TO_SEQLEN.get(name_without_org, ""),
441
  EXTERNAL_MODEL_TO_SIZE.get(name_without_org, ""),
442
+ round(
443
+ EXTERNAL_MODEL_TO_SIZE[name_without_org]
444
+ * 1e6
445
+ * 4
446
+ / 1024**3,
447
+ 2,
448
+ )
449
+ if name_without_org in EXTERNAL_MODEL_TO_SIZE
450
+ else "",
451
  )
452
+ (
453
+ out["Embedding Dimensions"],
454
+ out["Max Tokens"],
455
+ out["Model Size (Million Parameters)"],
456
+ out["Memory Usage (GB, fp32)"],
457
+ ) = tuple(MODEL_INFOS[model.modelId]["dim_seq_size"])
458
  df_list.append(out)
459
  model_siblings = model.siblings or []
460
+ if (
461
+ model.library_name == "sentence-transformers"
462
+ or "sentence-transformers" in model.tags
463
+ or "modules.json" in {file.rfilename for file in model_siblings}
464
+ ):
465
  SENTENCE_TRANSFORMERS_COMPATIBLE_MODELS.add(out["Model"])
466
 
467
  # # Save & cache MODEL_INFOS
 
474
  df = df.groupby("Model", as_index=False).first()
475
  # Put 'Model' column first
476
  cols = sorted(list(df.columns))
477
+ base_columns = [
478
+ "Model",
479
+ "Model Size (Million Parameters)",
480
+ "Memory Usage (GB, fp32)",
481
+ "Embedding Dimensions",
482
+ "Max Tokens",
483
+ ]
484
  if len(datasets) > 0:
485
  # Update legacy column names to be merged with newer ones
486
  # Update 'MLSUMClusteringP2P (fr)' with values from 'MLSUMClusteringP2P'
487
+ if ("MLSUMClusteringP2P (fr)" in datasets) and ("MLSUMClusteringP2P" in cols):
488
+ df["MLSUMClusteringP2P (fr)"] = df["MLSUMClusteringP2P (fr)"].fillna(
489
+ df["MLSUMClusteringP2P"]
490
+ )
491
+ datasets.remove("MLSUMClusteringP2P")
492
+ if ("MLSUMClusteringS2S (fr)" in datasets) and ("MLSUMClusteringS2S" in cols):
493
+ df["MLSUMClusteringS2S (fr)"] = df["MLSUMClusteringS2S (fr)"].fillna(
494
+ df["MLSUMClusteringS2S"]
495
+ )
496
+ datasets.remove("MLSUMClusteringS2S")
497
+ if ("PawsXPairClassification (fr)" in datasets) and ("PawsX (fr)" in cols):
498
+ # for the first bit no model has it, hence no column for it. We can remove this in a month or so
499
  if "PawsXPairClassification (fr)" not in cols:
500
+ df["PawsXPairClassification (fr)"] = df["PawsX (fr)"]
501
  else:
502
+ df["PawsXPairClassification (fr)"] = df[
503
+ "PawsXPairClassification (fr)"
504
+ ].fillna(df["PawsX (fr)"])
505
  # make all the columns the same
506
+ datasets.remove("PawsX (fr)")
507
+ cols.remove("PawsX (fr)")
508
+ df.drop(columns=["PawsX (fr)"], inplace=True)
509
+
 
510
  # Filter invalid columns
511
  cols = [col for col in cols if col in base_columns + datasets]
512
  i = 0
 
516
  i += 1
517
  df = df[cols]
518
  if rank:
519
+ df = add_rank(df)
520
  if fillna:
521
  df.fillna("", inplace=True)
522
  return df
 
524
 
525
  # Get dict with a task list for each task category
526
  # E.g. {"Classification": ["AmazonMassiveIntentClassification (en)", ...], "PairClassification": ["SprintDuplicateQuestions", ...]}
527
+ def get_mteb_average(task_dict: dict) -> tuple[Any, dict]:
528
  all_tasks = reduce(lambda x, y: x + y, task_dict.values())
529
  DATA_OVERALL = get_mteb_data(
530
  tasks=list(task_dict.keys()),
 
535
  )
536
  # Debugging:
537
  # DATA_OVERALL.to_csv("overall.csv")
538
+ DATA_OVERALL.insert(
539
+ 1,
540
+ f"Average ({len(all_tasks)} datasets)",
541
+ DATA_OVERALL[all_tasks].mean(axis=1, skipna=False),
542
+ )
543
  for i, (task_category, task_category_list) in enumerate(task_dict.items()):
544
+ DATA_OVERALL.insert(
545
+ i + 2,
546
+ f"{task_category} Average ({len(task_category_list)} datasets)",
547
+ DATA_OVERALL[task_category_list].mean(axis=1, skipna=False),
548
+ )
549
+ DATA_OVERALL.sort_values(
550
+ f"Average ({len(all_tasks)} datasets)", ascending=False, inplace=True
551
+ )
552
  # Start ranking from 1
553
  DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))
554
 
 
556
 
557
  DATA_TASKS = {}
558
  for task_category, task_category_list in task_dict.items():
559
+ DATA_TASKS[task_category] = add_rank(
560
+ DATA_OVERALL[
561
+ ["Model", "Model Size (Million Parameters)", "Memory Usage (GB, fp32)"]
562
+ + task_category_list
563
+ ]
564
+ )
565
+ DATA_TASKS[task_category] = DATA_TASKS[task_category][
566
+ DATA_TASKS[task_category].iloc[:, 4:].ne("").any(axis=1)
567
+ ]
568
 
569
  # Fill NaN after averaging
570
  DATA_OVERALL.fillna("", inplace=True)
571
 
572
+ data_overall_rows = [
573
+ "Rank",
574
+ "Model",
575
+ "Model Size (Million Parameters)",
576
+ "Memory Usage (GB, fp32)",
577
+ "Embedding Dimensions",
578
+ "Max Tokens",
579
+ f"Average ({len(all_tasks)} datasets)",
580
+ ]
581
  for task_category, task_category_list in task_dict.items():
582
+ data_overall_rows.append(
583
+ f"{task_category} Average ({len(task_category_list)} datasets)"
584
+ )
585
 
586
  DATA_OVERALL = DATA_OVERALL[data_overall_rows]
587
  DATA_OVERALL = DATA_OVERALL[DATA_OVERALL.iloc[:, 5:].ne("").any(axis=1)]
 
589
  return DATA_OVERALL, DATA_TASKS
590
 
591
 
592
+ def refresh_leaderboard() -> tuple[list, dict]:
593
  """
594
  The main code to refresh and calculate results for the leaderboard. It does this by fetching the results from the
595
  external models and the models in the leaderboard, then calculating the average scores for each task category.
 
 
 
596
  """
597
 
598
  # get external model results and cache them
 
601
 
602
  boards_data = {}
603
  all_data_tasks = []
604
+ pbar_tasks = tqdm(
605
+ BOARDS_CONFIG.items(),
606
+ desc="Fetching leaderboard results for ???",
607
+ total=len(BOARDS_CONFIG),
608
+ leave=True,
609
+ )
610
  for board, board_config in pbar_tasks:
611
+ boards_data[board] = {"data_overall": None, "data_tasks": {}}
612
  pbar_tasks.set_description(f"Fetching leaderboard results for {board!r}")
613
  pbar_tasks.refresh()
614
  if board_config["has_overall"]:
 
618
  all_data_tasks.extend(data_tasks.values())
619
  else:
620
  for task_category, task_category_list in board_config["tasks"].items():
621
+ data_task_category = get_mteb_data(
622
+ tasks=[task_category], datasets=task_category_list
623
+ )
624
+ data_task_category.drop(
625
+ columns=["Embedding Dimensions", "Max Tokens"], inplace=True
626
+ )
627
  boards_data[board]["data_tasks"][task_category] = data_task_category
628
  all_data_tasks.append(data_task_category)
629
 
630
  return all_data_tasks, boards_data
631
 
632
 
633
+ def write_out_results(item: dict, item_name: str) -> None:
 
634
  """
635
  Due to their complex structure, let's recursively create subfolders until we reach the end
636
  of the item and then save the DFs as jsonl files
637
 
638
  Args:
639
+ item: The item to save
640
+ item_name: The name of the item
 
 
 
641
  """
642
  main_folder = item_name
643
 
644
+ if isinstance(item, list):
645
  for i, v in enumerate(item):
646
  write_out_results(v, os.path.join(main_folder, str(i)))
647
 
 
658
  elif isinstance(item, pd.DataFrame):
659
  print(f"Saving {main_folder} to {main_folder}/default.jsonl")
660
  os.makedirs(main_folder, exist_ok=True)
661
+
662
+ item.reset_index(inplace=True)
663
+ item.to_json(f"{main_folder}/default.jsonl", orient="records", lines=True)
664
 
665
  elif isinstance(item, str):
666
  print(f"Saving {main_folder} to {main_folder}/default.txt")
 
679
  raise Exception(f"Unknown type {type(item)}")
680
 
681
 
682
+ def load_results(data_path: str) -> list | dict | pd.DataFrame | str | None:
683
  """
684
  Do the reverse of `write_out_results` to reconstruct the item
685
 
686
  Args:
687
+ data_path: The path to the data to load
688
 
689
  Returns:
690
+ The loaded data
691
  """
692
  if os.path.isdir(data_path):
693
  # if the folder just has numbers from 0 to N, load as a list
694
  all_files_in_dir = list(os.listdir(data_path))
695
  if set(all_files_in_dir) == set([str(i) for i in range(len(all_files_in_dir))]):
696
  ### the list case
697
+ return [
698
+ load_results(os.path.join(data_path, str(i)))
699
+ for i in range(len(os.listdir(data_path)))
700
+ ]
701
  else:
702
  if len(all_files_in_dir) == 1:
703
  file_name = all_files_in_dir[0]
704
+ if file_name == "default.jsonl":
705
  return load_results(os.path.join(data_path, file_name))
706
+ else: ### the dict case
707
  return {file_name: load_results(os.path.join(data_path, file_name))}
708
  else:
709
+ return {
710
+ file_name: load_results(os.path.join(data_path, file_name))
711
+ for file_name in all_files_in_dir
712
+ }
713
+
714
  elif data_path.endswith(".jsonl"):
715
  df = pd.read_json(data_path, orient="records", lines=True)
716
  if "index" in df.columns:
717
  df = df.set_index("index")
718
  return df
719
+
720
  else:
721
  with open(data_path, "r") as f:
722
  data = f.read()
 
726
  return data
727
 
728
 
 
729
  if __name__ == "__main__":
730
+ print("Refreshing leaderboard statistics...")
731
  all_data_tasks, boards_data = refresh_leaderboard()
732
+ print("Done calculating, saving...")
733
  # save them so that the leaderboard can use them. They're quite complex though
734
+ # but we can't use pickle files because of git-lfs.
735
  write_out_results(all_data_tasks, "all_data_tasks")
736
  write_out_results(boards_data, "boards_data")
737
 
738
  # to load them use
739
  # all_data_tasks = load_results("all_data_tasks")
740
  # boards_data = load_results("boards_data")
741
+ print("Done saving results!")
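As the trailing comments in refresh.py indicate, the artifacts written by this script are meant to be read back with `load_results`. A minimal usage sketch, assuming the repository root as the working directory, that the module imports cleanly (it loads the leaderboard config at import time), and that a refresh has already produced the `all_data_tasks` and `boards_data` folders:

```python
# Reload the saved leaderboard artifacts (paths as written by refresh.py).
from refresh import load_results

all_data_tasks = load_results("all_data_tasks")  # list of DataFrames
boards_data = load_results("boards_data")        # nested dict of DataFrames

print(len(all_data_tasks), sorted(boards_data))
```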