II. Benchmark JSON
Structured · live · benchmark:gpqa
GPQA JSON
Inspect the normalized record payload exactly as the atlas UI reads it.
{
  "id": "benchmark:gpqa",
  "_kind": "Benchmark",
  "_file": "benchmarks/benchmarks/benchmarks-knowledge.yaml",
  "_cluster": "benchmarks",
  "attributes": {
    "displayName": "GPQA",
    "homepageUrl": "https://github.com/idavidrein/gpqa",
    "kind": "model-only",
    "targetsKind": "ModelVersion",
    "description": "GPQA (Graduate-Level Google-Proof Q&A) by Rein et al. (2023) is a\n448-question multiple-choice benchmark in biology, chemistry, and\nphysics written and validated by domain-expert PhDs. Designed to be\n\"Google-proof\" — non-experts with web access score ~34%, in-domain\nPhDs score ~65%. The Diamond subset (198 questions) is the hardest\ntier and is the standard reported number in vendor announcements.\n"
  },
  "outgoingEdges": [
    {
      "from": "benchmark:gpqa",
      "to": "test-set:gpqa-diamond",
      "kind": "uses_test_set",
      "attributes": {}
    },
    {
      "from": "benchmark:gpqa",
      "to": "model:claude-opus-4-7@current",
      "kind": "targets",
      "attributes": {}
    },
    {
      "from": "benchmark:gpqa",
      "to": "model:claude-opus-4-6@current",
      "kind": "targets",
      "attributes": {}
    }
  ],
  "incomingEdges": [
    {
      "from": "eval-result:gpqa-diamond.claude-opus-4-5.001",
      "to": "benchmark:gpqa",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-result:gpqa.deepseek-r1.001",
      "to": "benchmark:gpqa",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-result:gpqa-diamond.gemini-2-5-pro.001",
      "to": "benchmark:gpqa",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-result:gpqa-diamond.gemini-3-1-pro.2026-02-19.accuracy",
      "to": "benchmark:gpqa",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-result:gpqa-diamond.gemini-3-pro.2025-11-18.accuracy",
      "to": "benchmark:gpqa",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-result:gpqa-diamond.gpt-5.001",
      "to": "benchmark:gpqa",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-result:gpqa-diamond.gpt-5-4.2026-03-17.accuracy",
      "to": "benchmark:gpqa",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-result:gpqa-diamond.gpt-5-4-mini.2026-03-17.accuracy",
      "to": "benchmark:gpqa",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-run:gpqa.claude-haiku-4-5.2025-10",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa-diamond.claude-opus-4-5.2025-09",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa.deepseek-r1.2025-01",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa.gemini-2-5-pro.2025-06",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa-diamond.gemini-2-5-pro.2025-06",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa-diamond.gemini-3-1-pro.2026-02-19",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa-diamond.gemini-3-pro.2025-11-18",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa.gpt-5.2025-08",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa-diamond.gpt-5.2025-08",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa-diamond.gpt-5-4.2026-03-17",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa-diamond.gpt-5-4-mini.2026-03-17",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa.claude-sonnet-4-5.2025-09",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    }
  ]
}
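
For orientation, below is a minimal TypeScript sketch of how a client might type and traverse a record of this shape. The names AtlasRecord, Edge, and groupIncomingByKind are illustrative, not part of any published atlas API; only the field names mirror the payload above.

// Minimal sketch of the record shape shown above. "AtlasRecord" and "Edge"
// are illustrative names, not an official API.
interface Edge {
  from: string;
  to: string;
  kind: string;
  // Optional: in this payload, eval-result edges carry an empty attributes
  // object while eval-run edges omit the key entirely.
  attributes?: Record<string, unknown>;
}

interface AtlasRecord {
  id: string;
  _kind: string;
  _file: string;
  _cluster: string;
  attributes: Record<string, unknown>;
  outgoingEdges: Edge[];
  incomingEdges: Edge[];
}

// Group a record's incoming edges by their "kind" field, e.g. to separate
// scored_against edges (eval results) from for_benchmark edges (eval runs).
function groupIncomingByKind(record: AtlasRecord): Map<string, Edge[]> {
  const groups = new Map<string, Edge[]>();
  for (const edge of record.incomingEdges) {
    const bucket = groups.get(edge.kind) ?? [];
    bucket.push(edge);
    groups.set(edge.kind, bucket);
  }
  return groups;
}

// Usage, assuming the JSON above has been loaded into a string `payloadText`:
//   const record: AtlasRecord = JSON.parse(payloadText);
//   const byKind = groupIncomingByKind(record);
//   byKind.get("scored_against")?.length; // 8 for this payload
//   byKind.get("for_benchmark")?.length;  // 12 for this payload

Run against the payload above, this yields two buckets: eight scored_against edges from eval-result nodes and twelve for_benchmark edges from eval-run nodes.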