AIR-Bench

hanhainebula committed
Commit 257f64d
Parent: 30f9433

Update part of the code for v24.05

- add dependency air-benchmark>=0.0.4
- update benchmarks.py
- update about.py
- update the layout of the leaderboard

Files changed (4)
  1. app.py +270 -266
  2. requirements.txt +1 -0
  3. src/about.py +3 -3
  4. src/benchmarks.py +2 -62
app.py CHANGED
@@ -131,303 +131,307 @@ with demo:
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

  with gr.Tabs(elem_classes="tab-buttons") as tabs:
- with gr.TabItem("QA", elem_id="qa-benchmark-tab-table", id=0):
  with gr.Row():
- with gr.Column(min_width=320):
- # select domain
- with gr.Row():
- selected_domains = get_domain_dropdown(DOMAIN_COLS_QA, DOMAIN_COLS_QA)
- # select language
- with gr.Row():
- selected_langs = get_language_dropdown(LANG_COLS_QA, LANG_COLS_QA)
-
- with gr.Column():
- with gr.Row():
- selected_version = get_version_dropdown()
- # select the metric
- selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_QA)
- with gr.Row():
- show_anonymous = get_anonymous_checkbox()
- with gr.Row():
- show_revision_and_timestamp = get_revision_and_ts_checkbox()
- with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
- with gr.TabItem("Retrieval + Reranking", id=10):
- with gr.Row():
- # search retrieval models
- with gr.Column():
- search_bar = get_search_bar()
- # select reranking models
- with gr.Column():
- selected_rerankings = get_reranking_dropdown(reranking_models)
- leaderboard_table = get_leaderboard_table(leaderboard_df_qa, types_qa)
- # Dummy leaderboard for handling the case when the user uses backspace key
- hidden_leaderboard_table_for_search = get_leaderboard_table(original_df_qa, types_qa, visible=False)
-
- set_listeners(
- "qa",
- leaderboard_table,
- hidden_leaderboard_table_for_search,
- search_bar,
- selected_domains,
- selected_langs,
- selected_rerankings,
- show_anonymous,
- show_revision_and_timestamp,
- )
-
- # set metric listener
- selected_metric.change(
- update_metric_qa,
- [
- selected_metric,
  selected_domains,
  selected_langs,
  selected_rerankings,
- search_bar,
  show_anonymous,
  show_revision_and_timestamp,
- ],
- leaderboard_table,
- queue=True
- )
- with gr.TabItem("Retrieval Only", id=11):
- with gr.Row():
- with gr.Column(scale=1):
- search_bar_retriever = get_search_bar()
- with gr.Column(scale=1):
- selected_noreranker = get_noreranking_dropdown()
- lb_df_retriever = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
- lb_df_retriever = reset_rank(lb_df_retriever)
- lb_table_retriever = get_leaderboard_table(lb_df_retriever, types_qa)
- # Dummy leaderboard for handling the case when the user uses backspace key
- hidden_lb_df_retriever = original_df_qa[original_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
- hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
- hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, types_qa, visible=False)
-
- set_listeners(
- "qa",
- lb_table_retriever,
- hidden_lb_table_retriever,
- search_bar_retriever,
- selected_domains,
- selected_langs,
- selected_noreranker,
- show_anonymous,
- show_revision_and_timestamp,
- )
-
- # set metric listener
- selected_metric.change(
- update_metric_qa,
- [
- selected_metric,
  selected_domains,
  selected_langs,
  selected_noreranker,
- search_bar_retriever,
  show_anonymous,
  show_revision_and_timestamp,
- ],
- lb_table_retriever,
- queue=True
- )
- with gr.TabItem("Reranking Only", id=12):
- lb_df_reranker = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
- lb_df_reranker = reset_rank(lb_df_reranker)
- reranking_models_reranker = lb_df_reranker[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
- with gr.Row():
- with gr.Column(scale=1):
- selected_rerankings_reranker = get_reranking_dropdown(reranking_models_reranker)
- with gr.Column(scale=1):
- search_bar_reranker = gr.Textbox(show_label=False, visible=False)
- lb_table_reranker = get_leaderboard_table(lb_df_reranker, types_qa)
- hidden_lb_df_reranker = original_df_qa[original_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
- hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
- hidden_lb_table_reranker = get_leaderboard_table(
- hidden_lb_df_reranker, types_qa, visible=False
- )
-
- set_listeners(
- "qa",
- lb_table_reranker,
- hidden_lb_table_reranker,
- search_bar_reranker,
- selected_domains,
- selected_langs,
- selected_rerankings_reranker,
- show_anonymous,
- show_revision_and_timestamp,
- )
- # set metric listener
- selected_metric.change(
- update_metric_qa,
- [
- selected_metric,
  selected_domains,
  selected_langs,
  selected_rerankings_reranker,
- search_bar_reranker,
  show_anonymous,
  show_revision_and_timestamp,
- ],
- lb_table_reranker,
- queue=True
- )
- with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
- with gr.Row():
- with gr.Column(min_width=320):
- # select domain
- with gr.Row():
- selected_domains = get_domain_dropdown(DOMAIN_COLS_LONG_DOC, DOMAIN_COLS_LONG_DOC)
- # select language
- with gr.Row():
- selected_langs = get_language_dropdown(
- LANG_COLS_LONG_DOC, LANG_COLS_LONG_DOC
  )
- with gr.Column():
- with gr.Row():
- selected_version = get_version_dropdown()
- # select the metric
- with gr.Row():
- selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_LONG_DOC)
- with gr.Row():
- show_anonymous = get_anonymous_checkbox()
- with gr.Row():
- show_revision_and_timestamp = get_revision_and_ts_checkbox()
- with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
- with gr.TabItem("Retrieval + Reranking", id=20):
- with gr.Row():
- with gr.Column():
- search_bar = get_search_bar()
- # select reranking model
- with gr.Column():
- selected_rerankings = get_reranking_dropdown(reranking_models)
-
- lb_table = get_leaderboard_table(
- leaderboard_df_long_doc, types_long_doc
- )
-
- # Dummy leaderboard for handling the case when the user uses backspace key
- hidden_lb_table_for_search = get_leaderboard_table(
- original_df_long_doc, types_long_doc, visible=False
- )
-
- set_listeners(
- "long-doc",
- lb_table,
- hidden_lb_table_for_search,
- search_bar,
- selected_domains,
- selected_langs,
- selected_rerankings,
- show_anonymous,
- show_revision_and_timestamp,
- )
-
- # set metric listener
- selected_metric.change(
- update_metric_long_doc,
- [
- selected_metric,
  selected_domains,
  selected_langs,
  selected_rerankings,
- search_bar,
  show_anonymous,
- show_revision_and_timestamp
- ],
- lb_table,
- queue=True
- )
- with gr.TabItem("Retrieval Only", id=21):
- with gr.Row():
- with gr.Column(scale=1):
- search_bar_retriever = get_search_bar()
- with gr.Column(scale=1):
- selected_noreranker = get_noreranking_dropdown()
- lb_df_retriever_long_doc = leaderboard_df_long_doc[
- leaderboard_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
- ]
- lb_df_retriever_long_doc = reset_rank(lb_df_retriever_long_doc)
- hidden_lb_db_retriever_long_doc = original_df_long_doc[
- original_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
- ]
- hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
- lb_table_retriever_long_doc = get_leaderboard_table(
- lb_df_retriever_long_doc, types_long_doc)
- hidden_lb_table_retriever_long_doc = get_leaderboard_table(
- hidden_lb_db_retriever_long_doc, types_long_doc, visible=False
- )
-
- set_listeners(
- "long-doc",
- lb_table_retriever_long_doc,
- hidden_lb_table_retriever_long_doc,
- search_bar_retriever,
- selected_domains,
- selected_langs,
- selected_noreranker,
- show_anonymous,
- show_revision_and_timestamp,
- )
-
- selected_metric.change(
- update_metric_long_doc,
- [
- selected_metric,
  selected_domains,
  selected_langs,
  selected_noreranker,
- search_bar_retriever,
  show_anonymous,
  show_revision_and_timestamp,
- ],
- lb_table_retriever_long_doc,
- queue=True
- )
- with gr.TabItem("Reranking Only", id=22):
- lb_df_reranker_ldoc = leaderboard_df_long_doc[
- leaderboard_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
- ]
- lb_df_reranker_ldoc = reset_rank(lb_df_reranker_ldoc)
- reranking_models_reranker_ldoc = lb_df_reranker_ldoc[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
- with gr.Row():
- with gr.Column(scale=1):
- selected_rerankings_reranker_ldoc = get_reranking_dropdown(reranking_models_reranker_ldoc)
- with gr.Column(scale=1):
- search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
- lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, types_long_doc)
- hidden_lb_df_reranker_ldoc = original_df_long_doc[original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
- hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
- hidden_lb_table_reranker_ldoc = get_leaderboard_table(
- hidden_lb_df_reranker_ldoc, types_long_doc, visible=False
- )
-
- set_listeners(
- "long-doc",
- lb_table_reranker_ldoc,
- hidden_lb_table_reranker_ldoc,
- search_bar_reranker_ldoc,
- selected_domains,
- selected_langs,
- selected_rerankings_reranker_ldoc,
- show_anonymous,
- show_revision_and_timestamp,
- )
- selected_metric.change(
- update_metric_long_doc,
- [
- selected_metric,
  selected_domains,
  selected_langs,
  selected_rerankings_reranker_ldoc,
- search_bar_reranker_ldoc,
  show_anonymous,
  show_revision_and_timestamp,
- ],
- lb_table_reranker_ldoc,
- queue=True
- )

  with gr.TabItem("🚀Submit here!", elem_id="submit-tab-table", id=2):
  with gr.Column():
 
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

  with gr.Tabs(elem_classes="tab-buttons") as tabs:
+ with gr.TabItem("Results", elem_id="results-tab-table"):
  with gr.Row():
+ selected_version = get_version_dropdown(BENCHMARK_VERSION_LIST, LATEST_BENCHMARK_VERSION)
+
+ with gr.TabItem("QA", elem_id="qa-benchmark-tab-table", id=0):
+ with gr.Row():
+ with gr.Column(min_width=320):
+ # select domain
+ with gr.Row():
+ selected_domains = get_domain_dropdown(DOMAIN_COLS_QA, DOMAIN_COLS_QA)
+ # select language
+ with gr.Row():
+ selected_langs = get_language_dropdown(LANG_COLS_QA, LANG_COLS_QA)
+
+ with gr.Column():
+ with gr.Row():
+ selected_version = get_version_dropdown()
+ # select the metric
+ selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_QA)
+ with gr.Row():
+ show_anonymous = get_anonymous_checkbox()
+ with gr.Row():
+ show_revision_and_timestamp = get_revision_and_ts_checkbox()
+ with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
+ with gr.TabItem("Retrieval + Reranking", id=10):
+ with gr.Row():
+ # search retrieval models
+ with gr.Column():
+ search_bar = get_search_bar()
+ # select reranking models
+ with gr.Column():
+ selected_rerankings = get_reranking_dropdown(reranking_models)
+ leaderboard_table = get_leaderboard_table(leaderboard_df_qa, types_qa)
+ # Dummy leaderboard for handling the case when the user uses backspace key
+ hidden_leaderboard_table_for_search = get_leaderboard_table(original_df_qa, types_qa, visible=False)
+
+ set_listeners(
+ "qa",
+ leaderboard_table,
+ hidden_leaderboard_table_for_search,
+ search_bar,
  selected_domains,
  selected_langs,
  selected_rerankings,
  show_anonymous,
  show_revision_and_timestamp,
+ )
+
+ # set metric listener
+ selected_metric.change(
+ update_metric_qa,
+ [
+ selected_metric,
+ selected_domains,
+ selected_langs,
+ selected_rerankings,
+ search_bar,
+ show_anonymous,
+ show_revision_and_timestamp,
+ ],
+ leaderboard_table,
+ queue=True
+ )
+ with gr.TabItem("Retrieval Only", id=11):
+ with gr.Row():
+ with gr.Column(scale=1):
+ search_bar_retriever = get_search_bar()
+ with gr.Column(scale=1):
+ selected_noreranker = get_noreranking_dropdown()
+ lb_df_retriever = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+ lb_df_retriever = reset_rank(lb_df_retriever)
+ lb_table_retriever = get_leaderboard_table(lb_df_retriever, types_qa)
+ # Dummy leaderboard for handling the case when the user uses backspace key
+ hidden_lb_df_retriever = original_df_qa[original_df_qa[COL_NAME_RERANKING_MODEL] == "NoReranker"]
+ hidden_lb_df_retriever = reset_rank(hidden_lb_df_retriever)
+ hidden_lb_table_retriever = get_leaderboard_table(hidden_lb_df_retriever, types_qa, visible=False)
+
+ set_listeners(
+ "qa",
+ lb_table_retriever,
+ hidden_lb_table_retriever,
+ search_bar_retriever,
  selected_domains,
  selected_langs,
  selected_noreranker,
  show_anonymous,
  show_revision_and_timestamp,
+ )
+
+ # set metric listener
+ selected_metric.change(
+ update_metric_qa,
+ [
+ selected_metric,
+ selected_domains,
+ selected_langs,
+ selected_noreranker,
+ search_bar_retriever,
+ show_anonymous,
+ show_revision_and_timestamp,
+ ],
+ lb_table_retriever,
+ queue=True
+ )
+ with gr.TabItem("Reranking Only", id=12):
+ lb_df_reranker = leaderboard_df_qa[leaderboard_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+ lb_df_reranker = reset_rank(lb_df_reranker)
+ reranking_models_reranker = lb_df_reranker[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
+ with gr.Row():
+ with gr.Column(scale=1):
+ selected_rerankings_reranker = get_reranking_dropdown(reranking_models_reranker)
+ with gr.Column(scale=1):
+ search_bar_reranker = gr.Textbox(show_label=False, visible=False)
+ lb_table_reranker = get_leaderboard_table(lb_df_reranker, types_qa)
+ hidden_lb_df_reranker = original_df_qa[original_df_qa[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+ hidden_lb_df_reranker = reset_rank(hidden_lb_df_reranker)
+ hidden_lb_table_reranker = get_leaderboard_table(
+ hidden_lb_df_reranker, types_qa, visible=False
+ )
+
+ set_listeners(
+ "qa",
+ lb_table_reranker,
+ hidden_lb_table_reranker,
+ search_bar_reranker,
  selected_domains,
  selected_langs,
  selected_rerankings_reranker,
  show_anonymous,
  show_revision_and_timestamp,
  )
+ # set metric listener
+ selected_metric.change(
+ update_metric_qa,
+ [
+ selected_metric,
+ selected_domains,
+ selected_langs,
+ selected_rerankings_reranker,
+ search_bar_reranker,
+ show_anonymous,
+ show_revision_and_timestamp,
+ ],
+ lb_table_reranker,
+ queue=True
+ )
+ with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
+ with gr.Row():
+ with gr.Column(min_width=320):
+ # select domain
+ with gr.Row():
+ selected_domains = get_domain_dropdown(DOMAIN_COLS_LONG_DOC, DOMAIN_COLS_LONG_DOC)
+ # select language
+ with gr.Row():
+ selected_langs = get_language_dropdown(
+ LANG_COLS_LONG_DOC, LANG_COLS_LONG_DOC
+ )
+ with gr.Column():
+ with gr.Row():
+ selected_version = get_version_dropdown()
+ # select the metric
+ with gr.Row():
+ selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_LONG_DOC)
+ with gr.Row():
+ show_anonymous = get_anonymous_checkbox()
+ with gr.Row():
+ show_revision_and_timestamp = get_revision_and_ts_checkbox()
+ with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
+ with gr.TabItem("Retrieval + Reranking", id=20):
+ with gr.Row():
+ with gr.Column():
+ search_bar = get_search_bar()
+ # select reranking model
+ with gr.Column():
+ selected_rerankings = get_reranking_dropdown(reranking_models)
+
+ lb_table = get_leaderboard_table(
+ leaderboard_df_long_doc, types_long_doc
+ )
+
+ # Dummy leaderboard for handling the case when the user uses backspace key
+ hidden_lb_table_for_search = get_leaderboard_table(
+ original_df_long_doc, types_long_doc, visible=False
+ )
+
+ set_listeners(
+ "long-doc",
+ lb_table,
+ hidden_lb_table_for_search,
+ search_bar,
  selected_domains,
  selected_langs,
  selected_rerankings,
  show_anonymous,
+ show_revision_and_timestamp,
+ )
+
+ # set metric listener
+ selected_metric.change(
+ update_metric_long_doc,
+ [
+ selected_metric,
+ selected_domains,
+ selected_langs,
+ selected_rerankings,
+ search_bar,
+ show_anonymous,
+ show_revision_and_timestamp
+ ],
+ lb_table,
+ queue=True
+ )
+ with gr.TabItem("Retrieval Only", id=21):
+ with gr.Row():
+ with gr.Column(scale=1):
+ search_bar_retriever = get_search_bar()
+ with gr.Column(scale=1):
+ selected_noreranker = get_noreranking_dropdown()
+ lb_df_retriever_long_doc = leaderboard_df_long_doc[
+ leaderboard_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
+ ]
+ lb_df_retriever_long_doc = reset_rank(lb_df_retriever_long_doc)
+ hidden_lb_db_retriever_long_doc = original_df_long_doc[
+ original_df_long_doc[COL_NAME_RERANKING_MODEL] == "NoReranker"
+ ]
+ hidden_lb_db_retriever_long_doc = reset_rank(hidden_lb_db_retriever_long_doc)
+ lb_table_retriever_long_doc = get_leaderboard_table(
+ lb_df_retriever_long_doc, types_long_doc)
+ hidden_lb_table_retriever_long_doc = get_leaderboard_table(
+ hidden_lb_db_retriever_long_doc, types_long_doc, visible=False
+ )
+
+ set_listeners(
+ "long-doc",
+ lb_table_retriever_long_doc,
+ hidden_lb_table_retriever_long_doc,
+ search_bar_retriever,
  selected_domains,
  selected_langs,
  selected_noreranker,
  show_anonymous,
  show_revision_and_timestamp,
+ )
+
+ selected_metric.change(
+ update_metric_long_doc,
+ [
+ selected_metric,
+ selected_domains,
+ selected_langs,
+ selected_noreranker,
+ search_bar_retriever,
+ show_anonymous,
+ show_revision_and_timestamp,
+ ],
+ lb_table_retriever_long_doc,
+ queue=True
+ )
+ with gr.TabItem("Reranking Only", id=22):
+ lb_df_reranker_ldoc = leaderboard_df_long_doc[
+ leaderboard_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
+ ]
+ lb_df_reranker_ldoc = reset_rank(lb_df_reranker_ldoc)
+ reranking_models_reranker_ldoc = lb_df_reranker_ldoc[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
+ with gr.Row():
+ with gr.Column(scale=1):
+ selected_rerankings_reranker_ldoc = get_reranking_dropdown(reranking_models_reranker_ldoc)
+ with gr.Column(scale=1):
+ search_bar_reranker_ldoc = gr.Textbox(show_label=False, visible=False)
+ lb_table_reranker_ldoc = get_leaderboard_table(lb_df_reranker_ldoc, types_long_doc)
+ hidden_lb_df_reranker_ldoc = original_df_long_doc[original_df_long_doc[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
+ hidden_lb_df_reranker_ldoc = reset_rank(hidden_lb_df_reranker_ldoc)
+ hidden_lb_table_reranker_ldoc = get_leaderboard_table(
+ hidden_lb_df_reranker_ldoc, types_long_doc, visible=False
+ )
+
+ set_listeners(
+ "long-doc",
+ lb_table_reranker_ldoc,
+ hidden_lb_table_reranker_ldoc,
+ search_bar_reranker_ldoc,
  selected_domains,
  selected_langs,
  selected_rerankings_reranker_ldoc,
  show_anonymous,
  show_revision_and_timestamp,
+ )
+ selected_metric.change(
+ update_metric_long_doc,
+ [
+ selected_metric,
+ selected_domains,
+ selected_langs,
+ selected_rerankings_reranker_ldoc,
+ search_bar_reranker_ldoc,
+ show_anonymous,
+ show_revision_and_timestamp,
+ ],
+ lb_table_reranker_ldoc,
+ queue=True
+ )

  with gr.TabItem("🚀Submit here!", elem_id="submit-tab-table", id=2):
  with gr.Column():
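The net effect of the app.py change is that the existing "QA" and "Long Doc" tab groups are nested inside a new top-level "Results" tab, which also hosts a benchmark-version dropdown fed by `BENCHMARK_VERSION_LIST` and `LATEST_BENCHMARK_VERSION`. The following is a minimal, self-contained sketch of that nesting pattern only; it swaps the leaderboard's own helpers (`get_version_dropdown`, `get_leaderboard_table`, `set_listeners`, ...) for plain Gradio components, and the version strings are assumed placeholder values:

```python
import gradio as gr

# Assumed stand-ins for the leaderboard's own constants and helpers.
BENCHMARK_VERSION_LIST = ["AIR-Bench_24.04", "AIR-Bench_24.05"]
LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[-1]

with gr.Blocks() as demo:
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # New wrapper tab: the version selector lives here and is shared by
        # the QA and Long Doc leaderboards nested below it.
        with gr.TabItem("Results", elem_id="results-tab-table"):
            with gr.Row():
                selected_version = gr.Dropdown(
                    choices=BENCHMARK_VERSION_LIST,
                    value=LATEST_BENCHMARK_VERSION,
                    label="Benchmark version",
                )
            with gr.TabItem("QA", elem_id="qa-benchmark-tab-table", id=0):
                gr.Markdown("QA leaderboard tables and filters go here.")
            with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
                gr.Markdown("Long Doc leaderboard tables and filters go here.")
        with gr.TabItem("🚀Submit here!", elem_id="submit-tab-table", id=2):
            gr.Markdown("Submission form goes here.")

if __name__ == "__main__":
    demo.launch()
```

Defining the version dropdown once at the "Results" level presumably lets a single selection drive both the QA and Long Doc views, rather than each tab carrying an independent `get_version_dropdown()` instance.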
requirements.txt CHANGED
@@ -12,3 +12,4 @@ requests>=2.31.0
  tqdm>=4.65.0
  accelerate>=0.24.1
  socksio>=1.0.0
+ air-benchmark>=0.0.4
src/about.py CHANGED
@@ -1,6 +1,6 @@
  # Your leaderboard name
  TITLE = """<h1 align="center" id="space-title">AIR-Bench: Automated Heterogeneous Information Retrieval Benchmark
- (v0.0.3) </h1>"""
+ (v0.1.0.dev) </h1>"""

  # What does your leaderboard evaluate?
  INTRODUCTION_TEXT = """
@@ -17,14 +17,14 @@ BENCHMARKS_TEXT = f"""
  - A: Yes, we plan to release new datasets on a regular basis. However, the update frequency is to be decided.

  - Q: As you are using models to do the quality control when generating the data, is it biased to the models that are used?
- - A: Yes, the results are biased toward the chosen models. However, we believe that datasets labeled by humans are also biased toward human preferences. The key point to verify is whether the model's bias is consistent with the human one. We used our approach to generate test data from the well-established MS MARCO dataset, benchmarked different models on both the generated dataset and the human-labeled DEV dataset, and compared the resulting model rankings: the Spearman correlation between them is 0.8211 (p-value=5e-5). This indicates that the models' preference is well aligned with the human one. Please refer to [here](https://github.com/AIR-Bench/AIR-Bench/blob/main/docs/available_evaluation_results.md#consistency-with-ms-marco) for details
+ - A: Yes, the results are biased toward the chosen models. However, we believe that datasets labeled by humans are also biased toward human preferences. The key point to verify is whether the model's bias is consistent with the human one. We used our approach to generate test data from the well-established MS MARCO dataset, benchmarked different models on both the generated dataset and the human-labeled DEV dataset, and compared the resulting model rankings: the Spearman correlation between them is 0.8211 (p-value=5e-5). This indicates that the models' preference is well aligned with the human one. Please refer to [here](https://github.com/AIR-Bench/AIR-Bench/blob/main/docs/available_analysis_results.md#consistency-with-human-labeled-data) for details.

  """

  EVALUATION_QUEUE_TEXT = """
  ## Check out the submission steps at [our GitHub repo](https://github.com/AIR-Bench/AIR-Bench/blob/main/docs/submit_to_leaderboard.md)

- ## You can find the **STATUS of Your Submission** at the [Backend Space](https://huggingface.co/spaces/AIR-Bench/leaderboard_backend).
+ ## You can find the **STATUS of Your Submission** at the [Backend Space](https://huggingface.co/spaces/AIR-Bench/leaderboard_backend)

  - If the status is **✔️ Success**, then you can find your results at the [Leaderboard Space](https://huggingface.co/spaces/AIR-Bench/leaderboard) in no more than one hour.
  - If the status is **❌ Failed**, please check your submission steps and try again. If you have any questions, please feel free to open an issue [here](https://github.com/AIR-Bench/AIR-Bench/issues/new).
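For context on the FAQ answer above: the consistency check it describes ranks the same set of models on the generated data and on the human-labeled MS MARCO DEV data, then compares the two rankings with Spearman correlation. A rough illustration of that comparison, using made-up scores and scipy rather than the benchmark's own tooling, might look like this:

```python
from scipy.stats import spearmanr

# Hypothetical nDCG@10 scores for the same models on the two datasets.
# The numbers are illustrative only; the real analysis is linked in the FAQ.
generated_scores = {"model_a": 0.52, "model_b": 0.47, "model_c": 0.61, "model_d": 0.39}
msmarco_dev_scores = {"model_a": 0.50, "model_b": 0.44, "model_c": 0.58, "model_d": 0.41}

models = sorted(generated_scores)
x = [generated_scores[m] for m in models]
y = [msmarco_dev_scores[m] for m in models]

# spearmanr ranks the raw scores internally and returns (correlation, p-value).
corr, p_value = spearmanr(x, y)
print(f"Spearman correlation: {corr:.4f} (p-value={p_value:.2g})")
```

A correlation close to 1 means the model ordering induced by the generated data matches the ordering induced by the human-labeled data, which is the alignment claim the FAQ is making.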
src/benchmarks.py CHANGED
@@ -1,5 +1,6 @@
  from dataclasses import dataclass
  from enum import Enum
+ from air_benchmark.tasks.tasks import BenchmarkTable


  def get_safe_name(name: str):
@@ -11,67 +12,6 @@ def get_safe_name(name: str):
  if (character.isalnum() or character == '_'))


- dataset_dict = {
- "qa": {
- "wiki": {
- "en": ["wikipedia_20240101", ],
- "zh": ["wikipedia_20240101", ]
- },
- "web": {
- "en": ["mC4", ],
- "zh": ["mC4", ]
- },
- "news": {
- "en": ["CC-News", ],
- "zh": ["CC-News", ]
- },
- "healthcare": {
- "en": ["PubMedQA", ],
- "zh": ["Huatuo-26M", ]
- },
- "law": {
- "en": ["pile-of-law", ],
- # "zh": ["flk_npc_gov_cn", ]
- },
- "finance": {
- "en": ["Reuters-Financial", ],
- "zh": ["FinCorpus", ]
- },
- "arxiv": {
- "en": ["Arxiv", ]},
- "msmarco": {
- "en": ["MS MARCO", ]},
- },
- "long-doc": {
- "arxiv": {
- "en": ["gpt3", "llama2", "llm-survey", "gemini"],
- },
- "book": {
- "en": [
- "origin-of-species_darwin",
- "a-brief-history-of-time_stephen-hawking"
- ]
- },
- "healthcare": {
- "en": [
- "pubmed_100k-200k_1",
- "pubmed_100k-200k_2",
- "pubmed_100k-200k_3",
- "pubmed_40k-50k_5-merged",
- "pubmed_30k-40k_10-merged"
- ]
- },
- "law": {
- "en": [
- "lex_files_300k-400k",
- "lex_files_400k-500k",
- "lex_files_500k-600k",
- "lex_files_600k-700k"
- ]
- }
- }
- }
-
  METRIC_LIST = [
  "ndcg_at_1",
  "ndcg_at_3",
@@ -118,7 +58,7 @@ class Benchmark:

  qa_benchmark_dict = {}
  long_doc_benchmark_dict = {}
- for task, domain_dict in dataset_dict.items():
+ for task, domain_dict in BenchmarkTable['AIR-Bench_24.04'].items():
  for domain, lang_dict in domain_dict.items():
  for lang, dataset_list in lang_dict.items():
  if task == "qa":