II. TestSet JSON
Structured · test-set:gpqa-diamond-2024
GPQA Diamond — 2024 release
Inspect the normalized record payload exactly as the atlas UI reads it.
{
  "id": "test-set:gpqa-diamond-2024",
  "_kind": "TestSet",
  "_file": "benchmarks/test-sets/test-sets.yaml",
  "_cluster": "benchmarks",
  "attributes": {
    "displayName": "GPQA Diamond — 2024 release",
    "benchmarkId": "benchmark:gpqa",
    "caseCount": 198,
    "releasedAt": "2023-11-29",
    "composition": "The \"diamond\" subset of GPQA — 198 graduate-level questions in\nbiology, chemistry, and physics written and validated by domain\nexperts. Diamond is the hardest tier; in-domain experts (PhD\nstudents) achieve ~65% accuracy, while non-expert humans with\nweb access score ~34%.\n",
    "homepageUrl": "https://github.com/idavidrein/gpqa",
    "description": "The frozen Diamond split is the standard reported number for\nvendor announcements (Anthropic, OpenAI, Google) when citing\n\"GPQA Diamond\" scores.\n"
  },
  "outgoingEdges": [],
  "incomingEdges": [
    {
      "from": "eval-run:gpqa-diamond.claude-opus-4-5.2025-09",
      "to": "test-set:gpqa-diamond-2024",
      "kind": "uses_test_set"
    },
    {
      "from": "eval-run:gpqa.deepseek-r1.2025-01",
      "to": "test-set:gpqa-diamond-2024",
      "kind": "uses_test_set"
    },
    {
      "from": "eval-run:gpqa-diamond.gemini-2-5-pro.2025-06",
      "to": "test-set:gpqa-diamond-2024",
      "kind": "uses_test_set"
    },
    {
      "from": "eval-run:gpqa-diamond.gemini-3-1-pro.2026-02-19",
      "to": "test-set:gpqa-diamond-2024",
      "kind": "uses_test_set"
    },
    {
      "from": "eval-run:gpqa-diamond.gemini-3-pro.2025-11-18",
      "to": "test-set:gpqa-diamond-2024",
      "kind": "uses_test_set"
    },
    {
      "from": "eval-run:gpqa-diamond.gpt-5.2025-08",
      "to": "test-set:gpqa-diamond-2024",
      "kind": "uses_test_set"
    },
    {
      "from": "eval-run:gpqa-diamond.gpt-5-4.2026-03-17",
      "to": "test-set:gpqa-diamond-2024",
      "kind": "uses_test_set"
    },
    {
      "from": "eval-run:gpqa-diamond.gpt-5-4-mini.2026-03-17",
      "to": "test-set:gpqa-diamond-2024",
      "kind": "uses_test_set"
    }
  ]
}
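For working with this record outside the atlas UI, the sketch below parses the payload and lists the eval runs that reference the test set through its incoming uses_test_set edges. This is a minimal illustration, not the atlas's own loader: the TestSetRecord interface mirrors only the fields visible in the JSON above, and the file name "gpqa-diamond-2024.json" is a hypothetical local export of this payload.

```ts
import { readFileSync } from "node:fs";

// Shape of a single graph edge as it appears in the payload above.
interface Edge {
  from: string;
  to: string;
  kind: string;
}

// Record shape inferred from the fields shown in this payload only;
// the real atlas schema may carry additional fields.
interface TestSetRecord {
  id: string;
  _kind: string;
  _file: string;
  _cluster: string;
  attributes: {
    displayName: string;
    benchmarkId: string;
    caseCount: number;
    releasedAt: string;
    composition: string;
    homepageUrl: string;
    description: string;
  };
  outgoingEdges: Edge[];
  incomingEdges: Edge[];
}

// "gpqa-diamond-2024.json" is a hypothetical export of the payload above.
const record: TestSetRecord = JSON.parse(
  readFileSync("gpqa-diamond-2024.json", "utf8"),
);

// Eval runs point at the test set, not the other way around, so they
// show up as incoming "uses_test_set" edges; the referrers are the
// "from" ids of those edges.
const evalRuns = record.incomingEdges
  .filter((edge) => edge.kind === "uses_test_set")
  .map((edge) => edge.from);

console.log(`${record.attributes.displayName}: ${evalRuns.length} eval runs`);
for (const run of evalRuns) console.log(`  ${run}`);
```

Note that the direction of the edges matters: because the run-to-test-set relationship is stored on the eval runs, a frozen test set like this one accumulates only incoming edges, which is why "outgoingEdges" is empty here.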