{
  "id": "benchmark:bigcode-evalplus",
  "_kind": "Benchmark",
  "_file": "benchmarks/benchmarks/benchmarks-coding.yaml",
  "_cluster": "benchmarks",
  "attributes": {
    "displayName": "EvalPlus",
    "homepageUrl": "https://evalplus.github.io/",
    "kind": "code-functional-correctness",
    "targetsKind": "ModelVersion",
"description": "EvalPlus extends HumanEval and MBPP with 80x more high-quality\ntests per task to expose flaky correctness in LLM-generated code,\nyielding HumanEval+ and MBPP+ leaderboards.\n"
  },
  "outgoingEdges": [],
  "incomingEdges": [
    {
      "from": "eval-run:human-eval-plus.claude-sonnet-4-5.2025-09",
      "to": "benchmark:bigcode-evalplus",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:human-eval-plus.gpt-5.2025-08",
      "to": "benchmark:bigcode-evalplus",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:evalplus.gpt-5.2025-08",
      "to": "benchmark:bigcode-evalplus",
      "kind": "for_benchmark"
    },
    {
      "from": "test-set:bigcode-evalplus",
      "to": "benchmark:bigcode-evalplus",
      "kind": "belongs_to_benchmark"
    },
    {
      "from": "scope-boundary:bigcode-evalplus.scope",
      "to": "benchmark:bigcode-evalplus",
      "kind": "bounds_subject"
    }
  ]
}