II.
TestSet JSON
Structured · live · test-set:gpqa-diamond
GPQA Diamond json
Inspect the normalized record payload exactly as the atlas UI reads it.
{
"id": "test-set:gpqa-diamond",
"_kind": "TestSet",
"_file": "benchmarks/test-sets/gpqa-diamond.yaml",
"_cluster": "benchmarks",
"attributes": {
"displayName": "GPQA Diamond",
"benchmarkId": "benchmark:mmlu",
"caseCount": 198,
"releasedAt": "2023-11-29",
"composition": "The \"diamond\" subset of GPQA — 198 graduate-level questions in\nbiology, chemistry, and physics, written and validated by domain\nexperts. Diamond is the hardest tier; experts (PhD students in the\nsame field) achieve ~65% accuracy.\n",
"homepageUrl": "https://github.com/idavidrein/gpqa",
"description": "GPQA Diamond is the hardest subset of the Graduate-level Google-Proof\nQ&A (GPQA) benchmark introduced by Rein et al., 2023.\n"
},
"outgoingEdges": [],
"incomingEdges": [
{
"from": "benchmark:gpqa",
"to": "test-set:gpqa-diamond",
"kind": "uses_test_set",
"attributes": {}
}
]
}