{
"id": "capability:vision-input",
"_kind": "Capability",
"_file": "capabilities/capabilities/vision-input.yaml",
"_cluster": "capabilities",
"attributes": {
"displayName": "Vision input",
"description": "Ability to accept image inputs (screenshots, photos, pasted clipboard\nimages) as part of a user/agent turn. Required by interaction primitives\nthat paste or attach visual content. Distinct from `text-streaming` or\nother text-only input modalities.\n",
"appliesToNodeKinds": [
"ModelVersion",
"AgentRuntimeImpl"
],
"category": "modality"
},
"outgoingEdges": [],
"incomingEdges": [
{
"from": "interaction-primitive:paste-image",
"to": "capability:vision-input",
"kind": "requires_capability"
},
{
"from": "agent-version:claude-code@1.x",
"to": "capability:vision-input",
"kind": "supports",
"attributes": {
"versionRange": ">=0.0.0",
"level": "full",
"notes": "Image input is backed by official Claude multimodal documentation, with lower confidence than the Claude Code-specific docs used elsewhere."
}
},
{
"from": "agent-version:copilot-cli@current",
"to": "capability:vision-input",
"kind": "supports",
"attributes": {
"versionRange": ">=0.0.0",
"level": "full",
"notes": "Public image-input evidence is strongest in the Copilot SDK rather than CLI docs."
}
},
{
"from": "agent-version:cursor@current",
"to": "capability:vision-input",
"kind": "supports",
"attributes": {
"versionRange": ">=0.0.0",
"level": "full",
"notes": "Cursor's agent overview documents image-capable reads and image generation."
}
},
{
"from": "agent-version:gemini-cli@current",
"to": "capability:vision-input",
"kind": "supports",
"attributes": {
"versionRange": ">=0.0.0",
"level": "full",
"notes": "Public Gemini CLI evidence describes multimodal CLI support."
}
},
{
"from": "agent-version:omp@current",
"to": "capability:vision-input",
"kind": "supports",
"attributes": {
"versionRange": ">=0.0.0",
"level": "full",
"notes": "Public README advertises image support; technical detail is thinner."
}
},
{
"from": "agent-version:openclaw@current",
"to": "capability:vision-input",
"kind": "supports",
"attributes": {
"versionRange": ">=0.0.0",
"level": "full",
"notes": "OpenClaw fallback metadata."
}
},
{
"from": "agent-version:opencode@1.x",
"to": "capability:vision-input",
"kind": "supports",
"attributes": {
"versionRange": ">=0.0.0",
"level": "full",
"notes": "Public OpenCode docs lack strong technical detail for image input."
}
},
{
"from": "agent-version:pi@current",
"to": "capability:vision-input",
"kind": "supports",
"attributes": {
"versionRange": ">=0.0.0",
"level": "full",
"notes": "Pi fallback harness."
}
},
{
"from": "model:claude-opus-4-7@current",
"to": "capability:vision-input",
"kind": "supports",
"attributes": {
"versionRange": ">=4.7.0 <5.0.0",
"level": "full",
"notes": "Opus 4.7 accepts image modality inputs per its modalities list.",
"evidenceSourceIds": [
"evidence:anthropic-models-doc"
]
}
},
{
"from": "model:gpt-4o@current",
"to": "capability:vision-input",
"kind": "supports",
"attributes": {
"versionRange": ">=0.0.0",
"level": "full",
"evidenceSourceIds": [
"evidence:openai-responses-api"
]
}
},
{
"from": "model:gpt-5@current",
"to": "capability:vision-input",
"kind": "supports",
"attributes": {
"versionRange": ">=5.0.0 <6.0.0",
"level": "full",
"notes": "GPT-5 accepts image modality inputs.",
"evidenceSourceIds": [
"evidence:openai-responses-api"
]
}
}
]
}