| eval-run:bfcl.gpt-5.2025-08 | eval-run:bfcl.gpt-5.2025-08 | benchmarks |
| eval-run:evalplus.gpt-5.2025-08 | eval-run:evalplus.gpt-5.2025-08 | benchmarks |
| eval-run:gpqa-diamond.gpt-5.2025-08 | eval-run:gpqa-diamond.gpt-5.2025-08 | benchmarks |
| eval-run:gpqa.gpt-5.2025-08 | eval-run:gpqa.gpt-5.2025-08 | benchmarks |
| eval-run:human-eval-plus.gpt-5.2025-08 | eval-run:human-eval-plus.gpt-5.2025-08 | benchmarks |
| eval-run:human-eval.gpt-5.2025-08 | eval-run:human-eval.gpt-5.2025-08 | benchmarks |
| eval-run:livecodebench.gpt-5.2025-08 | eval-run:livecodebench.gpt-5.2025-08 | benchmarks |
| eval-run:math.gpt-5.2025-08 | eval-run:math.gpt-5.2025-08 | benchmarks |
| eval-run:swe-bench-verified.gpt-5.2025-08 | eval-run:swe-bench-verified.gpt-5.2025-08 | benchmarks |