| benchmark:os-world | skill-area:desktop-automation | SkillArea |
| benchmark:android-world | skill-area:android-native | SkillArea |
| benchmark:re-bench | skill-area:autonomous-research-engineering | SkillArea |
| benchmark:re-bench | skill-area:ml-fine-tuning | SkillArea |
| benchmark:appworld | skill-area:multi-app-orchestration | SkillArea |
| benchmark:appworld | skill-area:multi-turn-tool-use | SkillArea |
| benchmark:assistant-bench | skill-area:deep-web-research | SkillArea |
| benchmark:assistant-bench | skill-area:agentic-loops | SkillArea |
| benchmark:the-agent-company | skill-area:multi-app-orchestration | SkillArea |
| benchmark:the-agent-company | skill-area:bug-fixing-from-issues | SkillArea |
| benchmark:agentclinic | skill-area:medical-agent | SkillArea |
| benchmark:travelplanner | skill-area:travel-itinerary-planning | SkillArea |
| benchmark:browse-comp | skill-area:deep-web-research | SkillArea |
| benchmark:browse-comp | skill-area:browser-automation | SkillArea |
| benchmark:mind2web-2 | skill-area:web-action-grounding | SkillArea |
| benchmark:mind2web-2 | skill-area:browser-automation | SkillArea |
| benchmark:workarena | skill-area:web-action-grounding | SkillArea |
| benchmark:workarena | skill-area:browser-automation | SkillArea |
| benchmark:webvoyager | skill-area:browser-automation | SkillArea |
| benchmark:webvoyager | skill-area:web-action-grounding | SkillArea |
| benchmark:visualwebarena | skill-area:browser-automation | SkillArea |
| benchmark:visualwebarena | skill-area:vision-extraction | SkillArea |
| benchmark:swe-lancer | skill-area:autonomous-coding-engagement | SkillArea |
| benchmark:swe-lancer | skill-area:bug-fixing-from-issues | SkillArea |
| benchmark:aider-polyglot | skill-area:python-implementation | SkillArea |
| benchmark:aider-polyglot | skill-area:bug-fixing-from-issues | SkillArea |
| benchmark:fin-bench | skill-area:general-knowledge-reasoning | SkillArea |
| benchmark:m-mmlu | skill-area:general-knowledge-reasoning | SkillArea |
| benchmark:flores-200 | skill-area:general-knowledge-reasoning | SkillArea |
| benchmark:xnli | skill-area:general-knowledge-reasoning | SkillArea |
| benchmark:olympiad-bench | skill-area:mathematical-reasoning | SkillArea |
| benchmark:promptbench | skill-area:prompt-engineering | SkillArea |
| benchmark:bias-bench | skill-area:safety-redteaming | SkillArea |
| benchmark:lmsys-arena | skill-area:general-knowledge-reasoning | SkillArea |
| benchmark:gsm8k | skill-area:mathematical-reasoning | SkillArea |
| benchmark:gsm-symbolic | skill-area:mathematical-reasoning | SkillArea |
| benchmark:hle | skill-area:closed-book-frontier-reasoning | SkillArea |
| benchmark:hle | skill-area:general-knowledge-reasoning | SkillArea |
| benchmark:frontier-math | skill-area:mathematical-reasoning | SkillArea |
| benchmark:frontier-math | skill-area:closed-book-frontier-reasoning | SkillArea |
| benchmark:bbh | skill-area:general-knowledge-reasoning | SkillArea |
| benchmark:arc-agi-3 | skill-area:visual-pattern-induction | SkillArea |
| benchmark:arc-agi-3 | skill-area:agentic-loops | SkillArea |
| benchmark:mt-bench | skill-area:general-knowledge-reasoning | SkillArea |
| benchmark:legal-bench | skill-area:closed-book-frontier-reasoning | SkillArea |
| benchmark:medqa | skill-area:medical-agent | SkillArea |
| benchmark:harmbench | skill-area:safety-redteaming | SkillArea |
| benchmark:jailbreakbench | skill-area:safety-redteaming | SkillArea |
| benchmark:advbench | skill-area:safety-redteaming | SkillArea |
| benchmark:toolbench | skill-area:tool-use | SkillArea |
| benchmark:toolbench | skill-area:multi-turn-tool-use | SkillArea |
| benchmark:berkeley-function-calling | skill-area:tool-use | SkillArea |
| benchmark:gaia | skill-area:agentic-loops | SkillArea |
| benchmark:human-eval | skill-area:python-implementation | SkillArea |
| benchmark:mmlu | skill-area:general-knowledge-reasoning | SkillArea |
| benchmark:swe-bench-verified | skill-area:bug-fixing-from-issues | SkillArea |
| benchmark:swe-bench | skill-area:bug-fixing-from-issues | SkillArea |
| benchmark:tau-bench | skill-area:multi-turn-tool-use | SkillArea |
| benchmark:tau-bench | skill-area:agentic-loops | SkillArea |
| benchmark:terminal-bench | skill-area:cli-design | SkillArea |
| benchmark:webarena | skill-area:browser-automation | SkillArea |