Ruiyang1 committed on
Commit
3f05ff2
•
1 Parent(s): 1a86feb

Update space

Browse files
Files changed (4) hide show
  1. css_html.py +64 -0
  2. results.json +1106 -0
  3. text_content.py +45 -0
  4. utils.py +62 -0
css_html.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Stylesheet overrides for the leaderboard page, kept as one CSS string.
# NOTE(review): presumably handed to the UI framework as custom CSS -- confirm
# at the call site. Indentation inside the string is cosmetic; CSS ignores it.
custom_css = """
#changelog-text {
    font-size: 16px !important;
}
#changelog-text h2 {
    font-size: 18px !important;
}
.markdown-text {
    font-size: 16px !important;
}
#models-to-add-text {
    font-size: 18px !important;
}
#citation-button span {
    font-size: 16px !important;
}
#citation-button textarea {
    font-size: 16px !important;
}
#citation-button > label > button {
    margin: 6px;
    transform: scale(1.3);
}
#leaderboard-table {
    margin-top: 15px
}
#leaderboard-table-lite {
    margin-top: 15px
}
#search-bar-table-box > div:first-child {
    background: none;
    border: none;
}

#search-bar {
    padding: 0px;
}
/* Hides the final AutoEvalColumn */
#llm-benchmark-tab-table table td:last-child,
#llm-benchmark-tab-table table th:last-child {
    display: none;
}
/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
table td:first-child,
table th:first-child {
    max-width: 400px;
    overflow: auto;
    white-space: nowrap;
}
.tab-buttons button {
    font-size: 20px;
}
#scale-logo {
    border-style: none !important;
    box-shadow: none;
    display: block;
    margin-left: auto;
    margin-right: auto;
    max-width: 600px;
}
#scale-logo .download {
    display: none;
}
"""
results.json ADDED
@@ -0,0 +1,1106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "gpt-4o": {
3
+ "link": "",
4
+ "open-data": "NONE",
5
+ "pass@1": {
6
+ "cs_input": 70.1,
7
+ "cpp_input": 64.6,
8
+ "d_input": 71.6,
9
+ "go_input": 75.4,
10
+ "java_input": 69.8,
11
+ "js_input": 73.2,
12
+ "jl_input": 67.0,
13
+ "lua_input": 73.0,
14
+ "pl_input": 70.1,
15
+ "php_input": 74.8,
16
+ "py_input": 70.6,
17
+ "r_input": 74.4,
18
+ "rkt_input": 67.4,
19
+ "rb_input": 72.0,
20
+ "rs_input": 73.6,
21
+ "scala_input": 65.4,
22
+ "sh_input": 70.6,
23
+ "swift_input": 74.2,
24
+ "ts_input": 74.0,
25
+ "cs_output": 75.0,
26
+ "cpp_output": 74.8,
27
+ "d_output": 71.3,
28
+ "go_output": 77.0,
29
+ "java_output": 73.2,
30
+ "js_output": 77.6,
31
+ "jl_output": 73.6,
32
+ "lua_output": 74.8,
33
+ "pl_output": 74.0,
34
+ "php_output": 75.4,
35
+ "py_output": 75.4,
36
+ "r_output": 72.0,
37
+ "rkt_output": 70.8,
38
+ "rb_output": 74.0,
39
+ "rs_output": 74.4,
40
+ "scala_output": 71.8,
41
+ "sh_output": 71.6,
42
+ "swift_output": 76.0,
43
+ "ts_output": 76.4
44
+ },
45
+ "prompted": false,
46
+ "size": null
47
+ },
48
+ "gpt-4o-mini": {
49
+ "link": "",
50
+ "open-data": "NONE",
51
+ "pass@1": {
52
+ "cs_input": 58.8,
53
+ "cpp_input": 52.2,
54
+ "d_input": 60.6,
55
+ "go_input": 62.0,
56
+ "java_input": 57.2,
57
+ "js_input": 59.6,
58
+ "jl_input": 56.2,
59
+ "lua_input": 63.4,
60
+ "pl_input": 57.4,
61
+ "php_input": 61.0,
62
+ "py_input": 59.6,
63
+ "r_input": 60.4,
64
+ "rkt_input": 51.2,
65
+ "rb_input": 61.6,
66
+ "rs_input": 61.2,
67
+ "scala_input": 52.6,
68
+ "sh_input": 57.2,
69
+ "swift_input": 63.4,
70
+ "ts_input": 61.2,
71
+ "cs_output": 63.0,
72
+ "cpp_output": 63.0,
73
+ "d_output": 61.4,
74
+ "go_output": 63.4,
75
+ "java_output": 54.0,
76
+ "js_output": 61.8,
77
+ "jl_output": 57.8,
78
+ "lua_output": 60.0,
79
+ "pl_output": 57.4,
80
+ "php_output": 64.2,
81
+ "py_output": 61.6,
82
+ "r_output": 59.6,
83
+ "rkt_output": 56.6,
84
+ "rb_output": 61.2,
85
+ "rs_output": 61.8,
86
+ "scala_output": 61.2,
87
+ "sh_output": 56.2,
88
+ "swift_output": 63.0,
89
+ "ts_output": 61.2
90
+ },
91
+ "prompted": true,
92
+ "size": null
93
+ },
94
+ "gpt-3.5-turbo": {
95
+ "link": "",
96
+ "open-data": "NONE",
97
+ "pass@1": {
98
+ "cs_input": 52.2,
99
+ "cpp_input": 39.2,
100
+ "d_input": 50.2,
101
+ "go_input": 53.4,
102
+ "java_input": 55.4,
103
+ "js_input": 50.0,
104
+ "jl_input": 47.0,
105
+ "lua_input": 53.2,
106
+ "pl_input": 47.6,
107
+ "php_input": 52.2,
108
+ "py_input": 51.6,
109
+ "r_input": 48.6,
110
+ "rkt_input": 45.4,
111
+ "rb_input": 49.6,
112
+ "rs_input": 53.0,
113
+ "scala_input": 54.2,
114
+ "sh_input": 47.6,
115
+ "swift_input": 58.2,
116
+ "ts_input": 48.4,
117
+ "cs_output": 54.2,
118
+ "cpp_output": 43.2,
119
+ "d_output": 56.0,
120
+ "go_output": 53.2,
121
+ "java_output": 43.6,
122
+ "js_output": 56.2,
123
+ "jl_output": 54.2,
124
+ "lua_output": 54.6,
125
+ "pl_output": 51.8,
126
+ "php_output": 55.2,
127
+ "py_output": 57.2,
128
+ "r_output": 49.4,
129
+ "rkt_output": 48.0,
130
+ "rb_output": 56.4,
131
+ "rs_output": 54.6,
132
+ "scala_output": 56.4,
133
+ "sh_output": 51.0,
134
+ "swift_output": 57.8,
135
+ "ts_output": 53.6
136
+ },
137
+ "prompted": true,
138
+ "size": null
139
+ },
140
+ "DeepSeek-Coder-V2-0724": {
141
+ "link": "",
142
+ "open-data": "NONE",
143
+ "pass@1": {
144
+ "cs_input": 63.8,
145
+ "cpp_input": 57.0,
146
+ "d_input": 66.6,
147
+ "go_input": 64.0,
148
+ "java_input": 64.8,
149
+ "js_input": 67.0,
150
+ "jl_input": 58.4,
151
+ "lua_input": 62.0,
152
+ "pl_input": 61.4,
153
+ "php_input": 64.2,
154
+ "py_input": 64.0,
155
+ "r_input": 65.8,
156
+ "rkt_input": 58.0,
157
+ "rb_input": 63.2,
158
+ "rs_input": 63.6,
159
+ "scala_input": 58.2,
160
+ "sh_input": 62.4,
161
+ "swift_input": 62.6,
162
+ "ts_input": 66.6,
163
+ "cs_output": 66.6,
164
+ "cpp_output": 66.2,
165
+ "d_output": 63.4,
166
+ "go_output": 68.0,
167
+ "java_output": 67.6,
168
+ "js_output": 65.4,
169
+ "jl_output": 64.8,
170
+ "lua_output": 63.6,
171
+ "pl_output": 63.0,
172
+ "php_output": 67.4,
173
+ "py_output": 66.8,
174
+ "r_output": 63.0,
175
+ "rkt_output": 62.2,
176
+ "rb_output": 65.2,
177
+ "rs_output": 65.8,
178
+ "scala_output": 63.2,
179
+ "sh_output": 58.8,
180
+ "swift_output": 67.8,
181
+ "ts_output": 66.4
182
+ },
183
+ "prompted": true,
184
+ "size": 23
185
+ },
186
+ "Qwen2-72B-Instruct-GPTQ-Int4": {
187
+ "link": "",
188
+ "open-data": "NONE",
189
+ "pass@1": {
190
+ "cs_input": 52.0,
191
+ "cpp_input": 54.2,
192
+ "d_input": 49.6,
193
+ "go_input": 55.4,
194
+ "java_input": 50.0,
195
+ "js_input": 51.6,
196
+ "jl_input": 51.0,
197
+ "lua_input": 51.2,
198
+ "pl_input": 47.8,
199
+ "php_input": 55.2,
200
+ "py_input": 52.4,
201
+ "r_input": 53.2,
202
+ "rkt_input": 47.8,
203
+ "rb_input": 54.4,
204
+ "rs_input": 57.2,
205
+ "scala_input": 50.6,
206
+ "sh_input": 52.4,
207
+ "swift_input": 51.6,
208
+ "ts_input": 52.0,
209
+ "cs_output": 51.2,
210
+ "cpp_output": 50.2,
211
+ "d_output": 51.6,
212
+ "go_output": 53.6,
213
+ "java_output": 38.2,
214
+ "js_output": 52.0,
215
+ "jl_output": 51.0,
216
+ "lua_output": 49.0,
217
+ "pl_output": 45.8,
218
+ "php_output": 50.8,
219
+ "py_output": 51.2,
220
+ "r_output": 45.0,
221
+ "rkt_output": 46.8,
222
+ "rb_output": 50.8,
223
+ "rs_output": 51.0,
224
+ "scala_output": 51.0,
225
+ "sh_output": 45.6,
226
+ "swift_output": 50.4,
227
+ "ts_output": 53.2
228
+ },
229
+ "prompted": true,
230
+ "size": 72
231
+ },
232
+ "CodeLlama-34b-Python-hf": {
233
+ "link": "",
234
+ "open-data": "NONE",
235
+ "pass@1": {
236
+ "cs_input": 38.8,
237
+ "cpp_input": 40.0,
238
+ "d_input": 39.2,
239
+ "go_input": 39.0,
240
+ "java_input": 41.4,
241
+ "js_input": 45.8,
242
+ "jl_input": 44.8,
243
+ "lua_input": 45.0,
244
+ "pl_input": 43.2,
245
+ "php_input": 48.0,
246
+ "py_input": 46.8,
247
+ "r_input": 42.2,
248
+ "rkt_input": 38.8,
249
+ "rb_input": 44.0,
250
+ "rs_input": 44.2,
251
+ "scala_input": 43.0,
252
+ "sh_input": 44.6,
253
+ "swift_input": 45.0,
254
+ "ts_input": 44.0,
255
+ "cs_output": 41.4,
256
+ "cpp_output": 44.8,
257
+ "d_output": 45.6,
258
+ "go_output": 41.8,
259
+ "java_output": 41.4,
260
+ "js_output": 45.4,
261
+ "jl_output": 45.2,
262
+ "lua_output": 42.8,
263
+ "pl_output": 43.6,
264
+ "php_output": 43.8,
265
+ "py_output": 43.8,
266
+ "r_output": 42.4,
267
+ "rkt_output": 38.6,
268
+ "rb_output": 42.8,
269
+ "rs_output": 46.6,
270
+ "scala_output": 43.8,
271
+ "sh_output": 42.0,
272
+ "swift_output": 44.4,
273
+ "ts_output": 44.8
274
+ },
275
+ "prompted": true,
276
+ "size": 34
277
+ },
278
+ "CodeLlama-34b-Instruct-hf": {
279
+ "link": "",
280
+ "open-data": "NONE",
281
+ "pass@1": {
282
+ "cs_input": 44.6,
283
+ "cpp_input": 48.4,
284
+ "d_input": 43.8,
285
+ "go_input": 46.0,
286
+ "java_input": 44.4,
287
+ "js_input": 52.6,
288
+ "jl_input": 50.4,
289
+ "lua_input": 49.4,
290
+ "pl_input": 46.0,
291
+ "php_input": 52.0,
292
+ "py_input": 51.2,
293
+ "r_input": 48.4,
294
+ "rkt_input": 42.4,
295
+ "rb_input": 48.2,
296
+ "rs_input": 48.6,
297
+ "scala_input": 48.0,
298
+ "sh_input": 46.2,
299
+ "swift_input": 49.4,
300
+ "ts_input": 53.2,
301
+ "cs_output": 44.4,
302
+ "cpp_output": 46.2,
303
+ "d_output": 45.8,
304
+ "go_output": 46.8,
305
+ "java_output": 40.6,
306
+ "js_output": 47.4,
307
+ "jl_output": 45.6,
308
+ "lua_output": 42.8,
309
+ "pl_output": 44.0,
310
+ "php_output": 44.8,
311
+ "py_output": 44.0,
312
+ "r_output": 40.2,
313
+ "rkt_output": 38.2,
314
+ "rb_output": 44.2,
315
+ "rs_output": 46.4,
316
+ "scala_output": 43.8,
317
+ "sh_output": 40.6,
318
+ "swift_output": 45.2,
319
+ "ts_output": 45.0
320
+ },
321
+ "prompted": true,
322
+ "size": 34
323
+ },
324
+ "CodeLlama-34b-hf": {
325
+ "link": "",
326
+ "open-data": "NONE",
327
+ "pass@1": {
328
+ "cs_input": 40.4,
329
+ "cpp_input": 44.6,
330
+ "d_input": 45.6,
331
+ "go_input": 41.2,
332
+ "java_input": 39.0,
333
+ "js_input": 50.0,
334
+ "jl_input": 49.0,
335
+ "lua_input": 47.0,
336
+ "pl_input": 46.6,
337
+ "php_input": 48.8,
338
+ "py_input": 49.8,
339
+ "r_input": 47.6,
340
+ "rkt_input": 39.8,
341
+ "rb_input": 46.6,
342
+ "rs_input": 46.8,
343
+ "scala_input": 44.6,
344
+ "sh_input": 44.4,
345
+ "swift_input": 50.0,
346
+ "ts_input": 48.6,
347
+ "cs_output": 44.6,
348
+ "cpp_output": 47.8,
349
+ "d_output": 44.2,
350
+ "go_output": 45.2,
351
+ "java_output": 38.4,
352
+ "js_output": 47.0,
353
+ "jl_output": 45.8,
354
+ "lua_output": 42.8,
355
+ "pl_output": 43.8,
356
+ "php_output": 46.4,
357
+ "py_output": 46.4,
358
+ "r_output": 38.8,
359
+ "rkt_output": 38.4,
360
+ "rb_output": 45.4,
361
+ "rs_output": 47.2,
362
+ "scala_output": 47.4,
363
+ "sh_output": 43.8,
364
+ "swift_output": 47.6,
365
+ "ts_output": 47.4
366
+ },
367
+ "prompted": true,
368
+ "size": 34
369
+ },
370
+ "WizardCoder-33B-V1.1": {
371
+ "link": "",
372
+ "open-data": "NONE",
373
+ "pass@1": {
374
+ "cs_input": 44.8,
375
+ "cpp_input": 25.4,
376
+ "d_input": 46.4,
377
+ "go_input": 47.6,
378
+ "java_input": 48.4,
379
+ "js_input": 45.6,
380
+ "jl_input": 49.2,
381
+ "lua_input": 48.8,
382
+ "pl_input": 44.6,
383
+ "php_input": 50.0,
384
+ "py_input": 50.0,
385
+ "r_input": 45.0,
386
+ "rkt_input": 42.4,
387
+ "rb_input": 49.2,
388
+ "rs_input": 48.2,
389
+ "scala_input": 48.2,
390
+ "sh_input": 45.4,
391
+ "swift_input": 51.0,
392
+ "ts_input": 46.4,
393
+ "cs_output": 47.0,
394
+ "cpp_output": 46.8,
395
+ "d_output": 45.8,
396
+ "go_output": 44.2,
397
+ "java_output": 50.8,
398
+ "js_output": 50.0,
399
+ "jl_output": 47.0,
400
+ "lua_output": 46.0,
401
+ "pl_output": 45.2,
402
+ "php_output": 51.4,
403
+ "py_output": 49.6,
404
+ "r_output": 44.0,
405
+ "rkt_output": 42.4,
406
+ "rb_output": 48.2,
407
+ "rs_output": 47.8,
408
+ "scala_output": 45.0,
409
+ "sh_output": 44.4,
410
+ "swift_output": 48.0,
411
+ "ts_output": 49.8
412
+ },
413
+ "prompted": true,
414
+ "size": 33
415
+ },
416
+ "deepseek-coder-33b-instruct": {
417
+ "link": "",
418
+ "open-data": "NONE",
419
+ "pass@1": {
420
+ "cs_input": 46.0,
421
+ "cpp_input": 43.6,
422
+ "d_input": 49.8,
423
+ "go_input": 49.0,
424
+ "java_input": 46.8,
425
+ "js_input": 48.8,
426
+ "jl_input": 47.0,
427
+ "lua_input": 50.0,
428
+ "pl_input": 46.8,
429
+ "php_input": 52.0,
430
+ "py_input": 51.8,
431
+ "r_input": 48.2,
432
+ "rkt_input": 41.6,
433
+ "rb_input": 52.0,
434
+ "rs_input": 48.4,
435
+ "scala_input": 47.0,
436
+ "sh_input": 48.2,
437
+ "swift_input": 52.2,
438
+ "ts_input": 49.6,
439
+ "cs_output": 52.0,
440
+ "cpp_output": 51.4,
441
+ "d_output": 49.0,
442
+ "go_output": 48.8,
443
+ "java_output": 53.2,
444
+ "js_output": 55.0,
445
+ "jl_output": 50.4,
446
+ "lua_output": 50.4,
447
+ "pl_output": 50.0,
448
+ "php_output": 53.0,
449
+ "py_output": 52.2,
450
+ "r_output": 48.2,
451
+ "rkt_output": 46.6,
452
+ "rb_output": 52.8,
453
+ "rs_output": 50.6,
454
+ "scala_output": 48.0,
455
+ "sh_output": 49.4,
456
+ "swift_output": 52.8,
457
+ "ts_output": 53.6
458
+ },
459
+ "prompted": true,
460
+ "size": 33
461
+ },
462
+ "deepseek-coder-33b-base": {
463
+ "link": "",
464
+ "open-data": "NONE",
465
+ "pass@1": {
466
+ "cs_input": 41.2,
467
+ "cpp_input": 42.8,
468
+ "d_input": 43.2,
469
+ "go_input": 45.6,
470
+ "java_input": 43.8,
471
+ "js_input": 46.0,
472
+ "jl_input": 47.6,
473
+ "lua_input": 47.4,
474
+ "pl_input": 47.2,
475
+ "php_input": 48.6,
476
+ "py_input": 49.2,
477
+ "r_input": 50.6,
478
+ "rkt_input": 42.8,
479
+ "rb_input": 47.4,
480
+ "rs_input": 46.8,
481
+ "scala_input": 44.0,
482
+ "sh_input": 46.4,
483
+ "swift_input": 48.2,
484
+ "ts_input": 45.0,
485
+ "cs_output": 48.2,
486
+ "cpp_output": 50.0,
487
+ "d_output": 46.0,
488
+ "go_output": 48.6,
489
+ "java_output": 49.2,
490
+ "js_output": 51.4,
491
+ "jl_output": 46.8,
492
+ "lua_output": 48.0,
493
+ "pl_output": 48.4,
494
+ "php_output": 52.0,
495
+ "py_output": 49.8,
496
+ "r_output": 45.2,
497
+ "rkt_output": 46.4,
498
+ "rb_output": 49.0,
499
+ "rs_output": 46.2,
500
+ "scala_output": 47.6,
501
+ "sh_output": 46.0,
502
+ "swift_output": 49.2,
503
+ "ts_output": 51.2
504
+ },
505
+ "prompted": true,
506
+ "size": 33
507
+ },
508
+ "starcoder2-15b": {
509
+ "link": "",
510
+ "open-data": "NONE",
511
+ "pass@1": {
512
+ "cs_input": 41.4,
513
+ "cpp_input": 43.8,
514
+ "d_input": 51.6,
515
+ "go_input": 45.2,
516
+ "java_input": 42.6,
517
+ "js_input": 44.0,
518
+ "jl_input": 48.2,
519
+ "lua_input": 44.6,
520
+ "pl_input": 44.8,
521
+ "php_input": 49.8,
522
+ "py_input": 46.6,
523
+ "r_input": 45.8,
524
+ "rkt_input": 45.0,
525
+ "rb_input": 49.0,
526
+ "rs_input": 46.6,
527
+ "scala_input": 37.0,
528
+ "sh_input": 47.4,
529
+ "swift_input": 52.2,
530
+ "ts_input": 46.2,
531
+ "cs_output": 46.0,
532
+ "cpp_output": 47.4,
533
+ "d_output": 47.2,
534
+ "go_output": 49.0,
535
+ "java_output": 48.4,
536
+ "js_output": 50.0,
537
+ "jl_output": 49.2,
538
+ "lua_output": 44.8,
539
+ "pl_output": 49.4,
540
+ "php_output": 48.4,
541
+ "py_output": 48.4,
542
+ "r_output": 47.2,
543
+ "rkt_output": 45.0,
544
+ "rb_output": 51.0,
545
+ "rs_output": 48.8,
546
+ "scala_output": 45.2,
547
+ "sh_output": 45.8,
548
+ "swift_output": 49.6,
549
+ "ts_output": 48.6
550
+ },
551
+ "prompted": true,
552
+ "size": 15
553
+ },
554
+ "WizardCoder-15B-V1.0": {
555
+ "link": "",
556
+ "open-data": "NONE",
557
+ "pass@1": {
558
+ "cs_input": 29.2,
559
+ "cpp_input": 30.0,
560
+ "d_input": 30.6,
561
+ "go_input": 28.6,
562
+ "java_input": 29.6,
563
+ "js_input": 33.0,
564
+ "jl_input": 34.8,
565
+ "lua_input": 33.6,
566
+ "pl_input": 36.2,
567
+ "php_input": 36.8,
568
+ "py_input": 33.2,
569
+ "r_input": 33.4,
570
+ "rkt_input": 36.4,
571
+ "rb_input": 33.6,
572
+ "rs_input": 33.0,
573
+ "scala_input": 29.0,
574
+ "sh_input": 35.0,
575
+ "swift_input": 34.0,
576
+ "ts_input": 32.4,
577
+ "cs_output": 25.2,
578
+ "cpp_output": 30.0,
579
+ "d_output": 30.6,
580
+ "go_output": 33.2,
581
+ "java_output": 26.8,
582
+ "js_output": 33.6,
583
+ "jl_output": 30.2,
584
+ "lua_output": 30.2,
585
+ "pl_output": 31.0,
586
+ "php_output": 33.0,
587
+ "py_output": 34.0,
588
+ "r_output": 31.6,
589
+ "rkt_output": 29.6,
590
+ "rb_output": 32.8,
591
+ "rs_output": 31.2,
592
+ "scala_output": 31.2,
593
+ "sh_output": 29.8,
594
+ "swift_output": 34.2,
595
+ "ts_output": 34.0
596
+ },
597
+ "prompted": true,
598
+ "size": 15
599
+ },
600
+ "starcoder": {
601
+ "link": "",
602
+ "open-data": "NONE",
603
+ "pass@1": {
604
+ "cs_input": 28.2,
605
+ "cpp_input": 30.0,
606
+ "d_input": 33.0,
607
+ "go_input": 33.2,
608
+ "java_input": 33.4,
609
+ "js_input": 35.2,
610
+ "jl_input": 34.4,
611
+ "lua_input": 31.6,
612
+ "pl_input": 34.0,
613
+ "php_input": 36.4,
614
+ "py_input": 34.8,
615
+ "r_input": 33.4,
616
+ "rkt_input": 36.6,
617
+ "rb_input": 35.0,
618
+ "rs_input": 34.8,
619
+ "scala_input": 27.4,
620
+ "sh_input": 37.0,
621
+ "swift_input": 30.8,
622
+ "ts_input": 33.2,
623
+ "cs_output": 20.4,
624
+ "cpp_output": 31.6,
625
+ "d_output": 31.8,
626
+ "go_output": 31.0,
627
+ "java_output": 18.4,
628
+ "js_output": 33.4,
629
+ "jl_output": 32.2,
630
+ "lua_output": 31.8,
631
+ "pl_output": 29.8,
632
+ "php_output": 32.6,
633
+ "py_output": 32.6,
634
+ "r_output": 30.0,
635
+ "rkt_output": 29.2,
636
+ "rb_output": 33.4,
637
+ "rs_output": 32.6,
638
+ "scala_output": 30.0,
639
+ "sh_output": 30.2,
640
+ "swift_output": 33.0,
641
+ "ts_output": 33.0
642
+ },
643
+ "prompted": true,
644
+ "size": 15
645
+ },
646
+ "Phi-3-medium-4k-instruct": {
647
+ "link": "",
648
+ "open-data": "NONE",
649
+ "pass@1": {
650
+ "cs_input": 31.8,
651
+ "cpp_input": 26.0,
652
+ "d_input": 38.8,
653
+ "go_input": 36.4,
654
+ "java_input": 37.2,
655
+ "js_input": 42.4,
656
+ "jl_input": 36.2,
657
+ "lua_input": 37.2,
658
+ "pl_input": 35.6,
659
+ "php_input": 41.2,
660
+ "py_input": 43.4,
661
+ "r_input": 39.2,
662
+ "rkt_input": 24.4,
663
+ "rb_input": 36.0,
664
+ "rs_input": 36.8,
665
+ "scala_input": 38.0,
666
+ "sh_input": 33.6,
667
+ "swift_input": 41.2,
668
+ "ts_input": 42.8,
669
+ "cs_output": 34.2,
670
+ "cpp_output": 37.6,
671
+ "d_output": 39.0,
672
+ "go_output": 31.0,
673
+ "java_output": 34.2,
674
+ "js_output": 41.6,
675
+ "jl_output": 41.2,
676
+ "lua_output": 34.4,
677
+ "pl_output": 35.8,
678
+ "php_output": 37.8,
679
+ "py_output": 42.4,
680
+ "r_output": 36.6,
681
+ "rkt_output": 24.6,
682
+ "rb_output": 42.2,
683
+ "rs_output": 37.4,
684
+ "scala_output": 36.2,
685
+ "sh_output": 37.2,
686
+ "swift_output": 41.4,
687
+ "ts_output": 43.0
688
+ },
689
+ "prompted": true,
690
+ "size": 14
691
+ },
692
+ "Meta-Llama-3-8B-Instruct-hf": {
693
+ "link": "",
694
+ "open-data": "NONE",
695
+ "pass@1": {
696
+ "cs_input": 37.0,
697
+ "cpp_input": 36.4,
698
+ "d_input": 35.0,
699
+ "go_input": 38.6,
700
+ "java_input": 36.2,
701
+ "js_input": 38.4,
702
+ "jl_input": 39.6,
703
+ "lua_input": 40.0,
704
+ "pl_input": 36.2,
705
+ "php_input": 36.6,
706
+ "py_input": 38.4,
707
+ "r_input": 42.2,
708
+ "rkt_input": 24.2,
709
+ "rb_input": 35.8,
710
+ "rs_input": 37.6,
711
+ "scala_input": 38.0,
712
+ "sh_input": 31.6,
713
+ "swift_input": 42.2,
714
+ "ts_input": 38.2,
715
+ "cs_output": 32.0,
716
+ "cpp_output": 30.8,
717
+ "d_output": 31.2,
718
+ "go_output": 31.4,
719
+ "java_output": 25.0,
720
+ "js_output": 35.0,
721
+ "jl_output": 31.4,
722
+ "lua_output": 34.0,
723
+ "pl_output": 29.6,
724
+ "php_output": 27.0,
725
+ "py_output": 33.6,
726
+ "r_output": 27.2,
727
+ "rkt_output": 28.0,
728
+ "rb_output": 31.8,
729
+ "rs_output": 34.4,
730
+ "scala_output": 33.8,
731
+ "sh_output": 32.0,
732
+ "swift_output": 36.4,
733
+ "ts_output": 33.8
734
+ },
735
+ "prompted": true,
736
+ "size": 8
737
+ },
738
+ "CodeQwen1.5-7B-Chat": {
739
+ "link": "",
740
+ "open-data": "NONE",
741
+ "pass@1": {
742
+ "cs_input": 42.8,
743
+ "cpp_input": 42.0,
744
+ "d_input": 43.0,
745
+ "go_input": 46.4,
746
+ "java_input": 44.6,
747
+ "js_input": 43.8,
748
+ "jl_input": 42.2,
749
+ "lua_input": 42.8,
750
+ "pl_input": 41.6,
751
+ "php_input": 44.8,
752
+ "py_input": 43.0,
753
+ "r_input": 43.4,
754
+ "rkt_input": 38.2,
755
+ "rb_input": 43.6,
756
+ "rs_input": 42.0,
757
+ "scala_input": 39.4,
758
+ "sh_input": 46.6,
759
+ "swift_input": 45.8,
760
+ "ts_input": 43.6,
761
+ "cs_output": 37.8,
762
+ "cpp_output": 40.2,
763
+ "d_output": 40.2,
764
+ "go_output": 40.6,
765
+ "java_output": 35.4,
766
+ "js_output": 43.6,
767
+ "jl_output": 42.6,
768
+ "lua_output": 40.4,
769
+ "pl_output": 39.6,
770
+ "php_output": 43.0,
771
+ "py_output": 41.4,
772
+ "r_output": 38.2,
773
+ "rkt_output": 39.0,
774
+ "rb_output": 44.6,
775
+ "rs_output": 42.0,
776
+ "scala_output": 35.0,
777
+ "sh_output": 38.2,
778
+ "swift_output": 43.8,
779
+ "ts_output": 42.2
780
+ },
781
+ "prompted": true,
782
+ "size": 7
783
+ },
784
+ "CodeLlama-7b-Instruct-hf": {
785
+ "link": "",
786
+ "open-data": "NONE",
787
+ "pass@1": {
788
+ "cs_input": 38.6,
789
+ "cpp_input": 36.0,
790
+ "d_input": 38.4,
791
+ "go_input": 38.4,
792
+ "java_input": 38.2,
793
+ "js_input": 39.6,
794
+ "jl_input": 42.2,
795
+ "lua_input": 43.4,
796
+ "pl_input": 36.4,
797
+ "php_input": 40.4,
798
+ "py_input": 41.0,
799
+ "r_input": 41.0,
800
+ "rkt_input": 38.8,
801
+ "rb_input": 41.6,
802
+ "rs_input": 37.6,
803
+ "scala_input": 42.6,
804
+ "sh_input": 39.6,
805
+ "swift_input": 40.2,
806
+ "ts_input": 41.0,
807
+ "cs_output": 32.2,
808
+ "cpp_output": 35.6,
809
+ "d_output": 34.4,
810
+ "go_output": 35.0,
811
+ "java_output": 24.4,
812
+ "js_output": 38.2,
813
+ "jl_output": 35.2,
814
+ "lua_output": 32.2,
815
+ "pl_output": 34.2,
816
+ "php_output": 36.0,
817
+ "py_output": 35.4,
818
+ "r_output": 32.0,
819
+ "rkt_output": 29.6,
820
+ "rb_output": 37.0,
821
+ "rs_output": 37.4,
822
+ "scala_output": 33.0,
823
+ "sh_output": 33.0,
824
+ "swift_output": 34.6,
825
+ "ts_output": 38.8
826
+ },
827
+ "prompted": true,
828
+ "size": 7
829
+ },
830
+ "CodeLlama-7b-hf": {
831
+ "link": "",
832
+ "open-data": "NONE",
833
+ "pass@1": {
834
+ "cs_input": 36.4,
835
+ "cpp_input": 36.2,
836
+ "d_input": 36.8,
837
+ "go_input": 34.6,
838
+ "java_input": 36.4,
839
+ "js_input": 36.6,
840
+ "jl_input": 40.2,
841
+ "lua_input": 39.6,
842
+ "pl_input": 36.0,
843
+ "php_input": 39.4,
844
+ "py_input": 40.2,
845
+ "r_input": 40.0,
846
+ "rkt_input": 36.6,
847
+ "rb_input": 39.2,
848
+ "rs_input": 35.4,
849
+ "scala_input": 37.8,
850
+ "sh_input": 36.8,
851
+ "swift_input": 39.2,
852
+ "ts_input": 38.8,
853
+ "cs_output": 32.6,
854
+ "cpp_output": 34.4,
855
+ "d_output": 33.8,
856
+ "go_output": 33.4,
857
+ "java_output": 28.4,
858
+ "js_output": 38.0,
859
+ "jl_output": 35.2,
860
+ "lua_output": 34.4,
861
+ "pl_output": 35.2,
862
+ "php_output": 38.0,
863
+ "py_output": 34.4,
864
+ "r_output": 32.6,
865
+ "rkt_output": 30.8,
866
+ "rb_output": 34.8,
867
+ "rs_output": 36.8,
868
+ "scala_output": 33.4,
869
+ "sh_output": 31.0,
870
+ "swift_output": 35.0,
871
+ "ts_output": 38.2
872
+ },
873
+ "prompted": true,
874
+ "size": 7
875
+ },
876
+ "deepseek-coder-6.7b-instruct": {
877
+ "link": "",
878
+ "open-data": "NONE",
879
+ "pass@1": {
880
+ "cs_input": 35.0,
881
+ "cpp_input": 37.0,
882
+ "d_input": 35.6,
883
+ "go_input": 40.4,
884
+ "java_input": 35.0,
885
+ "js_input": 36.6,
886
+ "jl_input": 39.2,
887
+ "lua_input": 38.8,
888
+ "pl_input": 39.4,
889
+ "php_input": 42.2,
890
+ "py_input": 38.2,
891
+ "r_input": 42.0,
892
+ "rkt_input": 37.2,
893
+ "rb_input": 40.2,
894
+ "rs_input": 37.4,
895
+ "scala_input": 36.8,
896
+ "sh_input": 42.8,
897
+ "swift_input": 40.8,
898
+ "ts_input": 34.2,
899
+ "cs_output": 34.8,
900
+ "cpp_output": 41.8,
901
+ "d_output": 40.4,
902
+ "go_output": 39.4,
903
+ "java_output": 32.8,
904
+ "js_output": 47.6,
905
+ "jl_output": 42.6,
906
+ "lua_output": 38.8,
907
+ "pl_output": 42.0,
908
+ "php_output": 43.8,
909
+ "py_output": 43.6,
910
+ "r_output": 40.8,
911
+ "rkt_output": 39.2,
912
+ "rb_output": 43.2,
913
+ "rs_output": 41.8,
914
+ "scala_output": 40.6,
915
+ "sh_output": 37.8,
916
+ "swift_output": 43.2,
917
+ "ts_output": 44.0
918
+ },
919
+ "prompted": true,
920
+ "size": 6.7
921
+ },
922
+ "deepseek-coder-6.7b-base": {
923
+ "link": "",
924
+ "open-data": "NONE",
925
+ "pass@1": {
926
+ "cs_input": 38.8,
927
+ "cpp_input": 42.4,
928
+ "d_input": 41.2,
929
+ "go_input": 43.2,
930
+ "java_input": 40.4,
931
+ "js_input": 43.6,
932
+ "jl_input": 42.6,
933
+ "lua_input": 42.8,
934
+ "pl_input": 41.6,
935
+ "php_input": 46.4,
936
+ "py_input": 41.4,
937
+ "r_input": 46.2,
938
+ "rkt_input": 43.0,
939
+ "rb_input": 44.6,
940
+ "rs_input": 41.6,
941
+ "scala_input": 40.8,
942
+ "sh_input": 44.8,
943
+ "swift_input": 43.4,
944
+ "ts_input": 41.8,
945
+ "cs_output": 41.2,
946
+ "cpp_output": 46.2,
947
+ "d_output": 43.2,
948
+ "go_output": 42.8,
949
+ "java_output": 42.6,
950
+ "js_output": 44.8,
951
+ "jl_output": 46.0,
952
+ "lua_output": 41.0,
953
+ "pl_output": 40.4,
954
+ "php_output": 41.8,
955
+ "py_output": 44.8,
956
+ "r_output": 42.8,
957
+ "rkt_output": 43.0,
958
+ "rb_output": 42.6,
959
+ "rs_output": 42.0,
960
+ "scala_output": 43.2,
961
+ "sh_output": 40.6,
962
+ "swift_output": 47.6,
963
+ "ts_output": 45.4
964
+ },
965
+ "prompted": true,
966
+ "size": 6.7
967
+ },
968
+ "codegen-6B-multi": {
969
+ "link": "",
970
+ "open-data": "NONE",
971
+ "pass@1": {
972
+ "cs_input": 28.8,
973
+ "cpp_input": 25.4,
974
+ "d_input": 6.2,
975
+ "go_input": 25.6,
976
+ "java_input": 36.2,
977
+ "js_input": 25.2,
978
+ "jl_input": 17.4,
979
+ "lua_input": 24.4,
980
+ "pl_input": 38.4,
981
+ "php_input": 22.8,
982
+ "py_input": 22.6,
983
+ "r_input": 27.2,
984
+ "rkt_input": 16.2,
985
+ "rb_input": 6.4,
986
+ "rs_input": 18.8,
987
+ "scala_input": 31.0,
988
+ "sh_input": 48.6,
989
+ "swift_input": 32.4,
990
+ "ts_input": 25.2,
991
+ "cs_output": 21.4,
992
+ "cpp_output": 23.6,
993
+ "d_output": 25.0,
994
+ "go_output": 26.4,
995
+ "java_output": 21.6,
996
+ "js_output": 22.8,
997
+ "jl_output": 22.8,
998
+ "lua_output": 23.8,
999
+ "pl_output": 20.4,
1000
+ "php_output": 25.2,
1001
+ "py_output": 24.8,
1002
+ "r_output": 23.4,
1003
+ "rkt_output": 17.8,
1004
+ "rb_output": 24.0,
1005
+ "rs_output": 25.2,
1006
+ "scala_output": 22.0,
1007
+ "sh_output": 22.2,
1008
+ "swift_output": 25.0,
1009
+ "ts_output": 21.4
1010
+ },
1011
+ "prompted": true,
1012
+ "size": 6
1013
+ },
1014
+ "phi-1_5": {
1015
+ "link": "",
1016
+ "open-data": "NONE",
1017
+ "pass@1": {
1018
+ "cs_input": 29.2,
1019
+ "cpp_input": 16.0,
1020
+ "d_input": 13.2,
1021
+ "go_input": 25.8,
1022
+ "java_input": 26.8,
1023
+ "js_input": 9.8,
1024
+ "jl_input": 30.4,
1025
+ "lua_input": 26.6,
1026
+ "pl_input": 17.8,
1027
+ "php_input": 26.6,
1028
+ "py_input": 25.8,
1029
+ "r_input": 8.4,
1030
+ "rkt_input": 6.6,
1031
+ "rb_input": 1.4,
1032
+ "rs_input": 25.2,
1033
+ "scala_input": 30.4,
1034
+ "sh_input": 34.4,
1035
+ "swift_input": 26.6,
1036
+ "ts_input": 30.8,
1037
+ "cs_output": 16.0,
1038
+ "cpp_output": 26.0,
1039
+ "d_output": 24.8,
1040
+ "go_output": 22.6,
1041
+ "java_output": 15.8,
1042
+ "js_output": 23.0,
1043
+ "jl_output": 23.6,
1044
+ "lua_output": 21.2,
1045
+ "pl_output": 22.0,
1046
+ "php_output": 22.2,
1047
+ "py_output": 25.6,
1048
+ "r_output": 21.8,
1049
+ "rkt_output": 16.8,
1050
+ "rb_output": 19.6,
1051
+ "rs_output": 22.0,
1052
+ "scala_output": 21.6,
1053
+ "sh_output": 17.6,
1054
+ "swift_output": 25.6,
1055
+ "ts_output": 25.2
1056
+ },
1057
+ "prompted": true,
1058
+ "size": 1.3
1059
+ },
1060
+ "phi-1": {
1061
+ "link": "",
1062
+ "open-data": "NONE",
1063
+ "pass@1": {
1064
+ "cs_input": 0.2,
1065
+ "cpp_input": 7.0,
1066
+ "d_input": 9.6,
1067
+ "go_input": 3.6,
1068
+ "java_input": 2.8,
1069
+ "js_input": 17.0,
1070
+ "jl_input": 19.0,
1071
+ "lua_input": 17.4,
1072
+ "pl_input": 23.6,
1073
+ "php_input": 9.2,
1074
+ "py_input": 11.8,
1075
+ "r_input": 9.4,
1076
+ "rkt_input": 11.2,
1077
+ "rb_input": 6.8,
1078
+ "rs_input": 5.4,
1079
+ "scala_input": 1.8,
1080
+ "sh_input": 19.8,
1081
+ "swift_input": 14.0,
1082
+ "ts_input": 14.0,
1083
+ "cs_output": 5.8,
1084
+ "cpp_output": 9.0,
1085
+ "d_output": 13.2,
1086
+ "go_output": 14.8,
1087
+ "java_output": 4.6,
1088
+ "js_output": 20.8,
1089
+ "jl_output": 19.2,
1090
+ "lua_output": 15.8,
1091
+ "pl_output": 15.6,
1092
+ "php_output": 18.6,
1093
+ "py_output": 22.4,
1094
+ "r_output": 17.6,
1095
+ "rkt_output": 10.4,
1096
+ "rb_output": 18.0,
1097
+ "rs_output": 16.4,
1098
+ "scala_output": 11.0,
1099
+ "sh_output": 16.4,
1100
+ "swift_output": 19.2,
1101
+ "ts_output": 19.0
1102
+ },
1103
+ "prompted": true,
1104
+ "size": 1.3
1105
+ }
1106
+ }
text_content.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Markdown shown at the top of the leaderboard page.
HEAD_TEXT = """
Based on the CRUXEVAL-X benchmark, we evaluated the executing and reasoning ability of different LLMs in 19 different programming languages.

More details about how to evaluate the LLM are available in the [CRUXEVAL-X GitHub repository](https://github.com/CRUXEVAL-X/cruxeval-x). For a complete description of CRUXEVAL-X benchmark and related experimental analysis, please refer to the paper: [CRUXEval-X: A Benchmark for Multilingual Code Reasoning, Understanding and Execution](https://arxiv.org/abs/2408.13001). [![](https://img.shields.io/badge/arXiv-2408.13001-b31b1b.svg)](https://arxiv.org/abs/2408.13001)
**_Latest News_** 🔥
- [24/08/26] We release our CRUXEVAL-X benchmark, leaderboard and paper.
"""

# Markdown for the "About" section: what the benchmark is, how to run it, contact.
ABOUT_TEXT = """# What is CRUXEVAL-X benchmark?
CRUXEVAL-X is a multilingual code reasoning, understanding and execution benchmark that focuses on code reasoning ability in different languages.
Its goal is to evaluate LLM's code reasoning (given input, reasoning output; and given output, reasoning input) ability.
# How to evaluate?
To facilitate evaluation on the CRUXEVAL-X benchmark, we provide the evaluation data and easy-to-use evaluation scripts in our [CRUXEVAL-X GitHub repository](https://github.com/CRUXEVAL-X/cruxeval-x).
Additionally, factors involving execution-based evaluation are conducted in a virtual environment to ensure evaluation security.
# Contact
If you have any questions, feel free to reach out to us at [xuruiyang2022@iscas.ac.cn](mailto:xuruiyang2022@iscas.ac.cn).
"""

# Label for the citation snippet.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

# BibTeX entry for the CRUXEval-X paper (raw string so braces/backslashes stay literal).
CITATION_BUTTON_TEXT = r"""
@misc{xu2024cruxevalxbenchmarkmultilingualcode,
      title={CRUXEval-X: A Benchmark for Multilingual Code Reasoning, Understanding and Execution},
      author={Ruiyang Xu and Jialun Cao and Yaojie Lu and Hongyu Lin and Xianpei Han and Ben He and Shing-Chi Cheung and Le Sun},
      year={2024},
      eprint={2408.13001},
      archivePrefix={arXiv},
      primaryClass={cs.AI},
      url={https://arxiv.org/abs/2408.13001},
}
"""

# Credit line for the leaderboard this page is modelled on.
ACKNOWLEDGEMENT_TEXT = """
Inspired from the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
"""


# Usage notes; the back-quoted names are quoted as UI labels in the text below.
NOTES_TEXT = """
**Notes:**
- Evaluate using pass@1 as the evaluation metric.
- `Average` denotes the average results of 19 different languages in a specific task.
- you can choose different tasks in `⏬ Tasks`, `input reasoning` denotes given output, reasoning input, `output reasoning` denotes given input, reasoning output.
- `⏬ Languages` can choose languages you want to show in the leaderboard.
- For more explanation check the 📝 About section.
"""
utils.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
# Short language code -> display name for the 19 benchmark languages.
# The codes match the prefixes of the "<code>_input" / "<code>_output" score keys.
lang_map = {
    "java": "Java",
    "cpp": "C++",
    "go": "Go",
    "cs": "C#",
    "d": "D",
    "jl": "Julia",
    "js": "JavaScript",
    "php": "PHP",
    "pl": "Perl",
    "py": "Python",
    "r": "R",
    "lua": "Lua",
    "rb": "Ruby",
    "rkt": "Racket",
    "rs": "Rust",
    "scala": "Scala",
    "sh": "Shell",
    "swift": "Swift",
    "ts": "TypeScript",
}
24
+
25
@dataclass
class ColumnContent:
    """Plain record describing one leaderboard table column.

    The class only stores the values it is given; it performs no validation.
    """

    # Column header text, e.g. "Model" or "Java".
    name: str
    # Cell type label; the values used in this module are "markdown" and "number".
    type: str
    # NOTE(review): presumably whether the column is shown initially -- confirm in the UI code.
    displayed_by_default: bool
    # Optional flag; False unless the caller sets it.
    hidden: bool = False
31
+
32
+
33
def fields(raw_class):
    """Return the values stored directly on ``raw_class``, skipping dunder-style names.

    Walks ``raw_class.__dict__`` in its own order and keeps the value of every
    entry whose name neither starts with ``"__"`` nor ends with ``"__"``.
    Attributes inherited from base classes are not in ``__dict__`` and are
    therefore not returned.

    Args:
        raw_class: Any object exposing a ``__dict__`` mapping (typically a class).

    Returns:
        A list of attribute values (not names).
    """
    return [
        value
        for attr_name, value in raw_class.__dict__.items()
        if not attr_name.startswith("__") and not attr_name.endswith("__")
    ]
37
+
38
+
39
@dataclass(frozen=True)
class AutoEvalColumn:
    """Auto evals column: the leaderboard's column definitions.

    Holds one ``ColumnContent`` per column: model, size, average, then one
    numeric column per language (``l_0`` .. ``l_18``). None of the attributes
    is annotated, so ``@dataclass`` does not treat them as fields; they stay
    ordinary class attributes, stored in the order written here.
    """

    model = ColumnContent("Model", "markdown", True)
    size = ColumnContent("Size", "number", True)
    average = ColumnContent("Average", "number", True)
    # Per-language score columns.
    l_0 = ColumnContent("Java", "number", True)
    l_1 = ColumnContent("C++", "number", True)
    l_2 = ColumnContent("C#", "number", True)
    l_3 = ColumnContent("D", "number", True)
    l_4 = ColumnContent("Go", "number", True)
    l_5 = ColumnContent("Julia", "number", True)
    l_6 = ColumnContent("JavaScript", "number", True)
    l_7 = ColumnContent("Lua", "number", True)
    l_8 = ColumnContent("PHP", "number", True)
    l_9 = ColumnContent("Perl", "number", True)
    l_10 = ColumnContent("Python", "number", True)
    l_11 = ColumnContent("R", "number", True)
    l_12 = ColumnContent("Ruby", "number", True)
    l_13 = ColumnContent("Racket", "number", True)
    l_14 = ColumnContent("Rust", "number", True)
    l_15 = ColumnContent("Scala", "number", True)
    l_16 = ColumnContent("Shell", "number", True)
    l_17 = ColumnContent("Swift", "number", True)
    l_18 = ColumnContent("TypeScript", "number", True)