Agentic AI Atlas

II.

Benchmark JSON

benchmark:math

Structured · live

MATH JSON

Inspect the normalized record payload exactly as the atlas UI reads it.

File · benchmarks/benchmarks/benchmarks-math.yaml

Cluster · benchmarks

Record JSON

{
  "id": "benchmark:math",
  "_kind": "Benchmark",
  "_file": "benchmarks/benchmarks/benchmarks-math.yaml",
  "_cluster": "benchmarks",
  "attributes": {
    "displayName": "MATH",
    "homepageUrl": "https://github.com/hendrycks/math",
    "kind": "math",
    "targetsKind": "ModelVersion",
    "description": "The MATH benchmark (Hendrycks et al., 2021) is 12,500 competition\nmathematics problems drawn from AMC, AIME, and similar contests,\nwith full step-by-step solutions.\n"
  },
  "outgoingEdges": [],
  "incomingEdges": [
    {
      "from": "eval-result:math.deepseek-r1.001",
      "to": "benchmark:math",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-result:math.o3.001",
      "to": "benchmark:math",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-result:math.gpt-5.001",
      "to": "benchmark:math",
      "kind": "scored_against",
      "attributes": {}
    },
    {
      "from": "eval-run:math.deepseek-r1.2025-01",
      "to": "benchmark:math",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:math.o3.2025-04",
      "to": "benchmark:math",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:math.gpt-5.2025-08",
      "to": "benchmark:math",
      "kind": "for_benchmark"
    },
    {
      "from": "test-set:math-test",
      "to": "benchmark:math",
      "kind": "belongs_to_benchmark"
    },
    {
      "from": "scope-boundary:math.scope",
      "to": "benchmark:math",
      "kind": "bounds_subject"
    }
  ]
}