| eval-run:evalplus.gpt-5.2025-08 | eval-run:evalplus.gpt-5.2025-08 | benchmarks |
| eval-run:gpqa.claude-haiku-4-5.2025-10 | eval-run:gpqa.claude-haiku-4-5.2025-10 | benchmarks |
| eval-run:gpqa.claude-sonnet-4-5.2025-09 | eval-run:gpqa.claude-sonnet-4-5.2025-09 | benchmarks |
| eval-run:gpqa.gemini-2-5-pro.2025-06 | eval-run:gpqa.gemini-2-5-pro.2025-06 | benchmarks |
| eval-run:gpqa.gpt-5.2025-08 | eval-run:gpqa.gpt-5.2025-08 | benchmarks |
| eval-run:gsm8k.claude-sonnet-4-5.2025-09 | eval-run:gsm8k.claude-sonnet-4-5.2025-09 | benchmarks |
| eval-run:hellaswag.claude-opus-4-5.2025-09 | eval-run:hellaswag.claude-opus-4-5.2025-09 | benchmarks |
| eval-run:human-eval.claude-sonnet-4-6.2025-11 | eval-run:human-eval.claude-sonnet-4-6.2025-11 | benchmarks |
| eval-run:human-eval.gpt-5.2025-08 | eval-run:human-eval.gpt-5.2025-08 | benchmarks |
| eval-run:human-eval.llama-4-405b.2024-07 | eval-run:human-eval.llama-4-405b.2024-07 | benchmarks |
| eval-run:livecodebench.gemini-2-5-pro.2025-06 | eval-run:livecodebench.gemini-2-5-pro.2025-06 | benchmarks |
| eval-run:livecodebench.gpt-5.2025-08 | eval-run:livecodebench.gpt-5.2025-08 | benchmarks |
| eval-run:math.gpt-5.2025-08 | eval-run:math.gpt-5.2025-08 | benchmarks |
| eval-run:math.o3.2025-04 | eval-run:math.o3.2025-04 | benchmarks |
| eval-run:mmlu.claude-sonnet-4-6.2025-11 | eval-run:mmlu.claude-sonnet-4-6.2025-11 | benchmarks |
| eval-run:mmlu.llama-4-405b.2024-07 | eval-run:mmlu.llama-4-405b.2024-07 | benchmarks |
| eval-run:mmlu.o1.2024-12 | eval-run:mmlu.o1.2024-12 | benchmarks |
| eval-run:swe-bench-verified.claude-haiku-4-5.2025-10 | eval-run:swe-bench-verified.claude-haiku-4-5.2025-10 | benchmarks |
| eval-run:swe-bench-verified.claude-opus-4-5.2025-09 | eval-run:swe-bench-verified.claude-opus-4-5.2025-09 | benchmarks |
| eval-run:swe-bench-verified.claude-opus-4-7.2026-01 | eval-run:swe-bench-verified.claude-opus-4-7.2026-01 | benchmarks |
| eval-run:swe-bench-verified.claude-sonnet-4-5.2025-09 | eval-run:swe-bench-verified.claude-sonnet-4-5.2025-09 | benchmarks |
| eval-run:swe-bench-verified.gemini-2-5-flash.2025-06 | eval-run:swe-bench-verified.gemini-2-5-flash.2025-06 | benchmarks |
| eval-run:swe-bench-verified.gemini-2-5-pro.2025-06 | eval-run:swe-bench-verified.gemini-2-5-pro.2025-06 | benchmarks |
| eval-run:swe-bench-verified.gpt-5.2025-08 | eval-run:swe-bench-verified.gpt-5.2025-08 | benchmarks |
| eval-run:swe-bench-verified.llama-4-405b.2024-07 | eval-run:swe-bench-verified.llama-4-405b.2024-07 | benchmarks |
| eval-run:swe-bench-verified.o3.2025-04 | eval-run:swe-bench-verified.o3.2025-04 | benchmarks |
| eval-run:swe-bench.claude-code@1.x.2025-04-29 | eval-run:swe-bench.claude-code@1.x.2025-04-29 | benchmarks |
| eval-run:swe-bench.deepseek-v3.2024-12 | eval-run:swe-bench.deepseek-v3.2024-12 | benchmarks |
| eval-run:swe-bench.llama-3-1-405b.2024-07 | eval-run:swe-bench.llama-3-1-405b.2024-07 | benchmarks |
| eval-run:terminal-bench.claude-sonnet-4-5.2025-09 | eval-run:terminal-bench.claude-sonnet-4-5.2025-09 | benchmarks |