II. Benchmark JSON
Structured · live · benchmark:gpqa
GPQA JSON
Inspect the normalized record payload exactly as the atlas UI reads it.
{
  "id": "benchmark:gpqa",
  "_kind": "Benchmark",
  "_file": "benchmarks/benchmarks/benchmarks-knowledge.yaml",
  "_cluster": "benchmarks",
  "attributes": {
    "displayName": "GPQA",
    "homepageUrl": "https://github.com/idavidrein/gpqa",
    "kind": "model-only",
    "targetsKind": "ModelVersion",
    "description": "GPQA (Graduate-Level Google-Proof Q&A) by Rein et al. (2023) is a\n448-question multiple-choice benchmark in biology, chemistry, and\nphysics written and validated by domain-expert PhDs. Designed to be\n\"Google-proof\" — non-experts with web access score ~34%, in-domain\nPhDs score ~65%. The Diamond subset (198 questions) is the hardest\ntier and is the standard reported number in vendor announcements.\n"
  },
  "outgoingEdges": [
    {
      "from": "benchmark:gpqa",
      "to": "test-set:gpqa-diamond",
      "kind": "uses_test_set",
      "attributes": {}
    },
    {
      "from": "benchmark:gpqa",
      "to": "model:claude-opus-4-7@current",
      "kind": "targets",
      "attributes": {}
    },
    {
      "from": "benchmark:gpqa",
      "to": "model:claude-opus-4-6@current",
      "kind": "targets",
      "attributes": {}
    }
  ],
  "incomingEdges": [
    {
      "from": "eval-result:gpqa-diamond.claude-opus-4-5.001",
      "to": "benchmark:gpqa",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-result:gpqa.deepseek-r1.001",
      "to": "benchmark:gpqa",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-result:gpqa-diamond.gemini-2-5-pro.001",
      "to": "benchmark:gpqa",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-result:gpqa-diamond.gemini-3-1-pro.2026-02-19.accuracy",
      "to": "benchmark:gpqa",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-result:gpqa-diamond.gemini-3-pro.2025-11-18.accuracy",
      "to": "benchmark:gpqa",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-result:gpqa-diamond.gpt-5.001",
      "to": "benchmark:gpqa",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-result:gpqa-diamond.gpt-5-4.2026-03-17.accuracy",
      "to": "benchmark:gpqa",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-result:gpqa-diamond.gpt-5-4-mini.2026-03-17.accuracy",
      "to": "benchmark:gpqa",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-run:gpqa.claude-haiku-4-5.2025-10",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa-diamond.claude-opus-4-5.2025-09",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa.deepseek-r1.2025-01",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa.gemini-2-5-pro.2025-06",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa-diamond.gemini-2-5-pro.2025-06",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa-diamond.gemini-3-1-pro.2026-02-19",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa-diamond.gemini-3-pro.2025-11-18",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa.gpt-5.2025-08",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa-diamond.gpt-5.2025-08",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa-diamond.gpt-5-4.2026-03-17",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa-diamond.gpt-5-4-mini.2026-03-17",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:gpqa.claude-sonnet-4-5.2025-09",
      "to": "benchmark:gpqa",
      "kind": "for_benchmark"
    }
  ]
}
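
For orientation, below is a minimal TypeScript sketch of how a client might type and traverse a record of this shape. The names AtlasRecord, Edge, and groupIncomingByKind are illustrative, not part of any published atlas API; only the field names mirror the payload above.

// Minimal sketch of the record shape shown above. "AtlasRecord" and "Edge"
// are illustrative names, not an official API.
interface Edge {
  from: string;
  to: string;
  kind: string;
  // Optional: in this payload, eval-result edges carry an empty attributes
  // object while eval-run edges omit the key entirely.
  attributes?: Record<string, unknown>;
}

interface AtlasRecord {
  id: string;
  _kind: string;
  _file: string;
  _cluster: string;
  attributes: Record<string, unknown>;
  outgoingEdges: Edge[];
  incomingEdges: Edge[];
}

// Group a record's incoming edges by their "kind" field, e.g. to separate
// scored_against edges (eval results) from for_benchmark edges (eval runs).
function groupIncomingByKind(record: AtlasRecord): Map<string, Edge[]> {
  const groups = new Map<string, Edge[]>();
  for (const edge of record.incomingEdges) {
    const bucket = groups.get(edge.kind) ?? [];
    bucket.push(edge);
    groups.set(edge.kind, bucket);
  }
  return groups;
}

// Usage, assuming the JSON above has been loaded into a string `payloadText`:
//   const record: AtlasRecord = JSON.parse(payloadText);
//   const byKind = groupIncomingByKind(record);
//   byKind.get("scored_against")?.length; // 8 for this payload
//   byKind.get("for_benchmark")?.length;  // 12 for this payload

Run against the payload above, this yields two buckets: eight scored_against edges from eval-result nodes and twelve for_benchmark edges from eval-run nodes.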