{
  "id": "benchmark:bigcode-evalplus",
  "_kind": "Benchmark",
  "_file": "benchmarks/benchmarks/benchmarks-coding.yaml",
  "_cluster": "benchmarks",
  "attributes": {
    "displayName": "EvalPlus",
    "homepageUrl": "https://evalplus.github.io/",
    "kind": "code-functional-correctness",
    "targetsKind": "ModelVersion",
"description": "EvalPlus extends HumanEval and MBPP with 80x more high-quality\ntests per task to expose flaky correctness in LLM-generated code,\nyielding HumanEval+ and MBPP+ leaderboards.\n"
  },
  "outgoingEdges": [],
  "incomingEdges": [
    {
      "from": "eval-run:human-eval-plus.claude-sonnet-4-5.2025-09",
      "to": "benchmark:bigcode-evalplus",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:human-eval-plus.gpt-5.2025-08",
      "to": "benchmark:bigcode-evalplus",
      "kind": "for_benchmark"
    },
    {
      "from": "eval-run:evalplus.gpt-5.2025-08",
      "to": "benchmark:bigcode-evalplus",
      "kind": "for_benchmark"
    },
    {
      "from": "test-set:bigcode-evalplus",
      "to": "benchmark:bigcode-evalplus",
      "kind": "belongs_to_benchmark"
    },
    {
      "from": "scope-boundary:bigcode-evalplus.scope",
      "to": "benchmark:bigcode-evalplus",
      "kind": "bounds_subject"
    }
  ]
}