| eval-result:mmlu.qwen-2-5-72b.001 | eval-run:mmlu.qwen-2-5-72b.2024-09 | EvalRun |
| eval-result:human-eval.qwen-2-5-72b.001 | eval-run:human-eval.qwen-2-5-72b.2024-09 | EvalRun |
| eval-result:human-eval.qwen-2-5-coder-32b.001 | eval-run:human-eval.qwen-2-5-coder-32b.2024-11 | EvalRun |
| eval-result:livecodebench.qwen-2-5-coder-32b.001 | eval-run:livecodebench.qwen-2-5-coder-32b.2024-11 | EvalRun |
| eval-result:mbpp.qwen-2-5-coder-32b.001 | eval-run:mbpp.qwen-2-5-coder-32b.2024-11 | EvalRun |
| eval-result:swe-bench-verified.claude-haiku-4-5.001 | eval-run:swe-bench-verified.claude-haiku-4-5.2025-10 | EvalRun |
| eval-result:gpqa.claude-haiku-4-5.001 | eval-run:gpqa.claude-haiku-4-5.2025-10 | EvalRun |
| eval-result:human-eval.claude-sonnet-4-6.001 | eval-run:human-eval.claude-sonnet-4-6.2025-11 | EvalRun |
| eval-result:mmlu.claude-sonnet-4-6.001 | eval-run:mmlu.claude-sonnet-4-6.2025-11 | EvalRun |
| eval-result:bfcl.claude-sonnet-4-5.001 | eval-run:bfcl.claude-sonnet-4-5.2025-09 | EvalRun |
| eval-result:gpqa-diamond.claude-opus-4-5.001 | eval-run:gpqa-diamond.claude-opus-4-5.2025-09 | EvalRun |
| eval-result:os-world.claude-sonnet-4-5.001 | eval-run:os-world.claude-sonnet-4-5.2025-09 | EvalRun |
| eval-result:truthful-qa.claude-opus-4-5.001 | eval-run:truthful-qa.claude-opus-4-5.2025-09 | EvalRun |
| eval-result:human-eval-plus.claude-sonnet-4-5.001 | eval-run:human-eval-plus.claude-sonnet-4-5.2025-09 | EvalRun |
| eval-result:harmbench.claude-opus-4-5.001 | eval-run:harmbench.claude-opus-4-5.2025-09 | EvalRun |
| eval-result:arc-challenge.claude-sonnet-4-5.001 | eval-run:arc-challenge.claude-sonnet-4-5.2025-09 | EvalRun |
| eval-result:mmlu.deepseek-v3.001 | eval-run:mmlu.deepseek-v3.2024-12 | EvalRun |
| eval-result:human-eval.deepseek-v3.001 | eval-run:human-eval.deepseek-v3.2024-12 | EvalRun |
| eval-result:swe-bench.deepseek-v3.001 | eval-run:swe-bench.deepseek-v3.2024-12 | EvalRun |
| eval-result:mmlu.deepseek-r1.001 | eval-run:mmlu.deepseek-r1.2025-01 | EvalRun |
| eval-result:math.deepseek-r1.001 | eval-run:math.deepseek-r1.2025-01 | EvalRun |
| eval-result:gpqa.deepseek-r1.001 | eval-run:gpqa.deepseek-r1.2025-01 | EvalRun |
| eval-result:gpqa.gemini-2-5-pro.001 | eval-run:gpqa.gemini-2-5-pro.2025-06 | EvalRun |
| eval-result:livecodebench.gemini-2-5-pro.001 | eval-run:livecodebench.gemini-2-5-pro.2025-06 | EvalRun |
| eval-result:swe-bench-verified.gemini-2-5-flash.001 | eval-run:swe-bench-verified.gemini-2-5-flash.2025-06 | EvalRun |
| eval-result:gpqa-diamond.gemini-2-5-pro.001 | eval-run:gpqa-diamond.gemini-2-5-pro.2025-06 | EvalRun |
| eval-result:android-world.gemini-2-5-pro.001 | eval-run:android-world.gemini-2-5-pro.2025-06 | EvalRun |
| eval-result:mgsm.gemini-2-5-pro.001 | eval-run:mgsm.gemini-2-5-pro.2025-06 | EvalRun |
| eval-result:gpqa-diamond.gemini-3-1-pro.2026-02-19.accuracy | eval-run:gpqa-diamond.gemini-3-1-pro.2026-02-19 | EvalRun |
| eval-result:gpqa-diamond.gemini-3-pro.2025-11-18.accuracy | eval-run:gpqa-diamond.gemini-3-pro.2025-11-18 | EvalRun |
| eval-result:swe-bench-verified.llama-4-405b.001 | eval-run:swe-bench-verified.llama-4-405b.2024-07 | EvalRun |
| eval-result:human-eval.llama-4-405b.001 | eval-run:human-eval.llama-4-405b.2024-07 | EvalRun |
| eval-result:mmlu.llama-4-405b.001 | eval-run:mmlu.llama-4-405b.2024-07 | EvalRun |
| eval-result:swe-bench.llama-3-1-405b.001 | eval-run:swe-bench.llama-3-1-405b.2024-07 | EvalRun |
| eval-result:mmlu.llama-3-1-405b.001 | eval-run:mmlu.llama-3-1-405b.2024-07 | EvalRun |
| eval-result:human-eval.llama-3-1-405b.001 | eval-run:human-eval.llama-3-1-405b.2024-07 | EvalRun |
| eval-result:mmlu.llama-3-3-70b.001 | eval-run:mmlu.llama-3-3-70b.2024-12 | EvalRun |
| eval-result:human-eval.llama-3-3-70b.001 | eval-run:human-eval.llama-3-3-70b.2024-12 | EvalRun |
| eval-result:mmlu.mistral-large-2.001 | eval-run:mmlu.mistral-large-2.2024-07 | EvalRun |
| eval-result:human-eval.mistral-large-2.001 | eval-run:human-eval.mistral-large-2.2024-07 | EvalRun |
| eval-result:human-eval.codestral-25-01.001 | eval-run:human-eval.codestral-25-01.2025-01 | EvalRun |
| eval-result:multipl-e.codestral-25-01.001 | eval-run:multipl-e.codestral-25-01.2025-01 | EvalRun |
| eval-result:gpqa.gpt-5.001 | eval-run:gpqa.gpt-5.2025-08 | EvalRun |
| eval-result:human-eval.gpt-5.001 | eval-run:human-eval.gpt-5.2025-08 | EvalRun |
| eval-result:mmlu.o1.001 | eval-run:mmlu.o1.2024-12 | EvalRun |
| eval-result:math.o3.001 | eval-run:math.o3.2025-04 | EvalRun |
| eval-result:bfcl.gpt-5.001 | eval-run:bfcl.gpt-5.2025-08 | EvalRun |
| eval-result:gpqa-diamond.gpt-5.001 | eval-run:gpqa-diamond.gpt-5.2025-08 | EvalRun |
| eval-result:human-eval-plus.gpt-5.001 | eval-run:human-eval-plus.gpt-5.2025-08 | EvalRun |
| eval-result:gpqa-diamond.gpt-5-4.2026-03-17.accuracy | eval-run:gpqa-diamond.gpt-5-4.2026-03-17 | EvalRun |
| eval-result:gpqa-diamond.gpt-5-4-mini.2026-03-17.accuracy | eval-run:gpqa-diamond.gpt-5-4-mini.2026-03-17 | EvalRun |
| eval-result:mmlu.phi-3-medium.001 | eval-run:mmlu.phi-3-medium.2024-05 | EvalRun |
| eval-result:mmlu.gemma-2-27b.001 | eval-run:mmlu.gemma-2-27b.2024-06 | EvalRun |
| eval-result:gsm8k.gemma-2-27b.001 | eval-run:gsm8k.gemma-2-27b.2024-06 | EvalRun |
| eval-result:mmlu.command-r-plus.001 | eval-run:mmlu.command-r-plus.2024-08 | EvalRun |
| eval-result:swe-bench-verified.claude-opus-4-5.001 | eval-run:swe-bench-verified.claude-opus-4-5.2025-09 | EvalRun |
| eval-result:swe-bench-verified.claude-opus-4-7.001 | eval-run:swe-bench-verified.claude-opus-4-7.2026-01 | EvalRun |
| eval-result:gpqa.claude-sonnet-4-5.001 | eval-run:gpqa.claude-sonnet-4-5.2025-09 | EvalRun |
| eval-result:swe-bench-verified.gpt-5.headline | eval-run:swe-bench-verified.gpt-5.2025-08 | EvalRun |
| eval-result:livecodebench.gpt-5.001 | eval-run:livecodebench.gpt-5.2025-08 | EvalRun |
| eval-result:swe-bench-verified.o3.001 | eval-run:swe-bench-verified.o3.2025-04 | EvalRun |
| eval-result:swe-bench-verified.gemini-2-5-pro.001 | eval-run:swe-bench-verified.gemini-2-5-pro.2025-06 | EvalRun |
| eval-result:gsm8k.claude-sonnet-4-5.001 | eval-run:gsm8k.claude-sonnet-4-5.2025-09 | EvalRun |
| eval-result:hellaswag.claude-opus-4-5.001 | eval-run:hellaswag.claude-opus-4-5.2025-09 | EvalRun |
| eval-result:math.gpt-5.001 | eval-run:math.gpt-5.2025-08 | EvalRun |
| eval-result:evalplus.gpt-5.001 | eval-run:evalplus.gpt-5.2025-08 | EvalRun |
| eval-result:terminal-bench.claude-sonnet-4-5.001 | eval-run:terminal-bench.claude-sonnet-4-5.2025-09 | EvalRun |
| eval-result:gaia.claude-code.001 | eval-run:gaia.claude-code.2025 | EvalRun |
| eval-result:swe-bench.claude-code.001 | eval-run:swe-bench.claude-code@1.x.2025-04-29 | EvalRun |
| eval-result:swe-bench-verified.claude-sonnet-4-5.high-compute.001 | eval-run:swe-bench-verified.claude-sonnet-4-5.2025-09 | EvalRun |
| eval-result:swe-bench-verified.claude-sonnet-4-5.001 | eval-run:swe-bench-verified.claude-sonnet-4-5.2025-09 | EvalRun |
| eval-result:swe-bench-verified.gpt-5.headline.001 | eval-run:swe-bench-verified.gpt-5.2025-08 | EvalRun |
| eval-result:swe-bench-verified.gpt-5.001 | eval-run:swe-bench-verified.gpt-5.2025-08 | EvalRun |