Agentic AI Atlas

II.

TestSet JSON

test-set:bfcl-v3

Structured · live

Berkeley Function Calling Leaderboard v3 · JSON

Inspect the normalized record payload exactly as the atlas UI reads it.

File · benchmarks/test-sets/test-sets.yaml

Cluster · benchmarks

Record JSON

{
  "id": "test-set:bfcl-v3",
  "_kind": "TestSet",
  "_file": "benchmarks/test-sets/test-sets.yaml",
  "_cluster": "benchmarks",
  "attributes": {
    "displayName": "Berkeley Function Calling Leaderboard v3",
    "benchmarkId": "benchmark:berkeley-function-calling",
    "caseCount": 4951,
    "releasedAt": "2024-09-19",
    "composition": "BFCL v3 extends the leaderboard with multi-turn and multi-step\nfunction-calling categories alongside the v1 simple/parallel/\nmultiple categories and the v2 \"live\" user-contributed prompts.\nThe aggregate v3 test bank totals ~4,951 cases across all\ncategories.\n",
    "homepageUrl": "https://gorilla.cs.berkeley.edu/leaderboard.html",
    "description": "BFCL v3 is the canonical multi-turn extension of the Berkeley\nFunction Calling Leaderboard, released in September 2024. It is\nthe standard public reference for LLM function-calling accuracy.\n"
  },
  "outgoingEdges": [],
  "incomingEdges": [
    {
      "from": "eval-run:bfcl.claude-sonnet-4-5.2025-09",
      "to": "test-set:bfcl-v3",
      "kind": "uses_test_set"
    },
    {
      "from": "eval-run:bfcl.gpt-5.2025-08",
      "to": "test-set:bfcl-v3",
      "kind": "uses_test_set"
    }
  ]
}