| eval-result:evalplus.gpt-5.001 | eval-result:evalplus.gpt-5.001 | benchmarks |
| eval-result:human-eval.claude-sonnet-4-6.001 | eval-result:human-eval.claude-sonnet-4-6.001 | benchmarks |
| eval-result:human-eval.gpt-5.001 | eval-result:human-eval.gpt-5.001 | benchmarks |
| eval-result:human-eval.llama-4-405b.001 | eval-result:human-eval.llama-4-405b.001 | benchmarks |
| eval-result:livecodebench.gemini-2-5-pro.001 | eval-result:livecodebench.gemini-2-5-pro.001 | benchmarks |
| eval-result:livecodebench.gpt-5.001 | eval-result:livecodebench.gpt-5.001 | benchmarks |
| eval-result:swe-bench-verified.claude-haiku-4-5.001 | eval-result:swe-bench-verified.claude-haiku-4-5.001 | benchmarks |
| eval-result:swe-bench-verified.claude-opus-4-5.001 | eval-result:swe-bench-verified.claude-opus-4-5.001 | benchmarks |
| eval-result:swe-bench-verified.claude-opus-4-7.001 | eval-result:swe-bench-verified.claude-opus-4-7.001 | benchmarks |
| eval-result:swe-bench-verified.claude-sonnet-4-5.001 | eval-result:swe-bench-verified.claude-sonnet-4-5.001 | benchmarks |
| eval-result:swe-bench-verified.gemini-2-5-flash.001 | eval-result:swe-bench-verified.gemini-2-5-flash.001 | benchmarks |
| eval-result:swe-bench-verified.gemini-2-5-pro.001 | eval-result:swe-bench-verified.gemini-2-5-pro.001 | benchmarks |
| eval-result:swe-bench-verified.gpt-5.001 | eval-result:swe-bench-verified.gpt-5.001 | benchmarks |
| eval-result:swe-bench-verified.gpt-5.headline | eval-result:swe-bench-verified.gpt-5.headline | benchmarks |
| eval-result:swe-bench-verified.llama-4-405b.001 | eval-result:swe-bench-verified.llama-4-405b.001 | benchmarks |
| eval-result:swe-bench-verified.o3.001 | eval-result:swe-bench-verified.o3.001 | benchmarks |
| eval-result:swe-bench.claude-code.001 | eval-result:swe-bench.claude-code.001 | benchmarks |
| eval-result:terminal-bench.claude-sonnet-4-5.001 | eval-result:terminal-bench.claude-sonnet-4-5.001 | benchmarks |