II.
TestSet JSON
Structured · live · test-set:terminal-bench-v1
Terminal-Bench v1 JSON
Inspect the normalized record payload exactly as the atlas UI reads it.
{
"id": "test-set:terminal-bench-v1",
"_kind": "TestSet",
"_file": "benchmarks/test-sets/terminal-bench-v1.yaml",
"_cluster": "benchmarks",
"attributes": {
"displayName": "Terminal-Bench v1",
"benchmarkId": "benchmark:terminal-bench",
"caseCount": 80,
"releasedAt": "2024-10-01",
"composition": "The v1 release of Terminal-Bench from Stanford NLP / Princeton.\nEach task is a multi-step shell scenario evaluated end-to-end in a\nDocker sandbox; success requires the agent to reach a target file\nstate via real shell commands.\n",
"homepageUrl": "https://www.tbench.ai/",
"description": "Canonical Terminal-Bench v1 set referenced in the original paper\nand the public leaderboard.\n"
},
"outgoingEdges": [
{
"from": "test-set:terminal-bench-v1",
"to": "benchmark:terminal-bench",
"kind": "belongs_to_benchmark"
}
],
"incomingEdges": []
}