| eval-result:mmlu.mistral-large-2.001 | eval-result:mmlu.mistral-large-2.001 | benchmarks |
| eval-result:mmlu.o1.001 | eval-result:mmlu.o1.001 | benchmarks |
| eval-result:mmlu.phi-3-medium.001 | eval-result:mmlu.phi-3-medium.001 | benchmarks |
| eval-result:mmlu.qwen-2-5-72b.001 | eval-result:mmlu.qwen-2-5-72b.001 | benchmarks |
| eval-result:multipl-e.codestral-25-01.001 | eval-result:multipl-e.codestral-25-01.001 | benchmarks |
| eval-result:os-world.claude-sonnet-4-5.001 | eval-result:os-world.claude-sonnet-4-5.001 | benchmarks |
| eval-result:swe-bench-verified.claude-haiku-4-5.001 | eval-result:swe-bench-verified.claude-haiku-4-5.001 | benchmarks |
| eval-result:swe-bench-verified.claude-opus-4-5.001 | eval-result:swe-bench-verified.claude-opus-4-5.001 | benchmarks |
| eval-result:swe-bench-verified.claude-opus-4-7.001 | eval-result:swe-bench-verified.claude-opus-4-7.001 | benchmarks |
| eval-result:swe-bench-verified.claude-sonnet-4-5.001 | eval-result:swe-bench-verified.claude-sonnet-4-5.001 | benchmarks |
| eval-result:swe-bench-verified.claude-sonnet-4-5.high-compute.001 | eval-result:swe-bench-verified.claude-sonnet-4-5.high-compute.001 | benchmarks |
| eval-result:swe-bench-verified.gemini-2-5-flash.001 | eval-result:swe-bench-verified.gemini-2-5-flash.001 | benchmarks |
| eval-result:swe-bench-verified.gemini-2-5-pro.001 | eval-result:swe-bench-verified.gemini-2-5-pro.001 | benchmarks |
| eval-result:swe-bench-verified.gpt-5.001 | eval-result:swe-bench-verified.gpt-5.001 | benchmarks |
| eval-result:swe-bench-verified.gpt-5.headline | eval-result:swe-bench-verified.gpt-5.headline | benchmarks |
| eval-result:swe-bench-verified.gpt-5.headline.001 | eval-result:swe-bench-verified.gpt-5.headline.001 | benchmarks |
| eval-result:swe-bench-verified.llama-4-405b.001 | eval-result:swe-bench-verified.llama-4-405b.001 | benchmarks |
| eval-result:swe-bench-verified.o3.001 | eval-result:swe-bench-verified.o3.001 | benchmarks |
| eval-result:swe-bench.claude-code.001 | eval-result:swe-bench.claude-code.001 | benchmarks |
| eval-result:swe-bench.deepseek-v3.001 | eval-result:swe-bench.deepseek-v3.001 | benchmarks |
| eval-result:swe-bench.llama-3-1-405b.001 | eval-result:swe-bench.llama-3-1-405b.001 | benchmarks |
| eval-result:terminal-bench.claude-sonnet-4-5.001 | eval-result:terminal-bench.claude-sonnet-4-5.001 | benchmarks |
| eval-result:truthful-qa.claude-opus-4-5.001 | eval-result:truthful-qa.claude-opus-4-5.001 | benchmarks |