Agentic AI Atlas

II.

TestSet JSON

test-set:gaia-validation

Structured · live

GAIA validation split JSON

Inspect the normalized record payload exactly as the atlas UI reads it.

File · benchmarks/test-sets/gaia-validation.yaml

Cluster · benchmarks

Record JSON

{
  "id": "test-set:gaia-validation",
  "_kind": "TestSet",
  "_file": "benchmarks/test-sets/gaia-validation.yaml",
  "_cluster": "benchmarks",
  "attributes": {
    "displayName": "GAIA validation split",
    "benchmarkId": "benchmark:gaia",
    "splitName": "validation",
    "itemCount": 165,
    "description": "Validation split of the GAIA benchmark (Mialon et al., 2023). 165\nheld-out questions across three difficulty levels. Used as the\npublic-leaderboard split because the test split is hidden.\n",
    "sourceUrl": "https://huggingface.co/datasets/gaia-benchmark/GAIA"
  },
  "outgoingEdges": [
    {
      "from": "test-set:gaia-validation",
      "to": "benchmark:gaia",
      "kind": "split_of",
      "attributes": {}
    }
  ],
  "incomingEdges": [
    {
      "from": "eval-run:gaia.claude-code.2025",
      "to": "test-set:gaia-validation",
      "kind": "uses_test_set"
    }
  ]
}