| eval-run:gpqa.claude-haiku-4-5.2025-10 | eval-run:gpqa.claude-haiku-4-5.2025-10 | benchmarks |
| eval-run:gpqa.claude-sonnet-4-5.2025-09 | eval-run:gpqa.claude-sonnet-4-5.2025-09 | benchmarks |
| eval-run:gpqa.gemini-2-5-pro.2025-06 | eval-run:gpqa.gemini-2-5-pro.2025-06 | benchmarks |
| eval-run:gpqa.gpt-5.2025-08 | eval-run:gpqa.gpt-5.2025-08 | benchmarks |
| eval-run:mmlu.claude-sonnet-4-6.2025-11 | eval-run:mmlu.claude-sonnet-4-6.2025-11 | benchmarks |
| eval-run:mmlu.command-r-plus.2024-08 | eval-run:mmlu.command-r-plus.2024-08 | benchmarks |
| eval-run:mmlu.deepseek-r1.2025-01 | eval-run:mmlu.deepseek-r1.2025-01 | benchmarks |
| eval-run:mmlu.deepseek-v3.2024-12 | eval-run:mmlu.deepseek-v3.2024-12 | benchmarks |
| eval-run:mmlu.gemma-2-27b.2024-06 | eval-run:mmlu.gemma-2-27b.2024-06 | benchmarks |
| eval-run:mmlu.llama-3-1-405b.2024-07 | eval-run:mmlu.llama-3-1-405b.2024-07 | benchmarks |
| eval-run:mmlu.llama-3-3-70b.2024-12 | eval-run:mmlu.llama-3-3-70b.2024-12 | benchmarks |
| eval-run:mmlu.llama-4-405b.2024-07 | eval-run:mmlu.llama-4-405b.2024-07 | benchmarks |
| eval-run:mmlu.mistral-large-2.2024-07 | eval-run:mmlu.mistral-large-2.2024-07 | benchmarks |
| eval-run:mmlu.o1.2024-12 | eval-run:mmlu.o1.2024-12 | benchmarks |
| eval-run:mmlu.phi-3-medium.2024-05 | eval-run:mmlu.phi-3-medium.2024-05 | benchmarks |
| eval-run:mmlu.qwen-2-5-72b.2024-09 | eval-run:mmlu.qwen-2-5-72b.2024-09 | benchmarks |