Call the Claude computer use API to let Claude control a desktop. Full Python example: screenshot, send to Claude, execute mouse/keyboard actions, loop.
Safety: Anthropic recommends running computer use inside a sandboxed Docker container or VM, not on your personal machine. Claude can click and type anywhere on screen.
pip install anthropic pyautogui Pillow
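How the loop works: your code sends Claude a screenshot inside a tool_result, with the computer_20250124 tool declared. Claude answers with a tool_use block: an action (click, type, screenshot, scroll) and coordinates. Your code executes that action and repeats.
import anthropic, base64, time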
from io import BytesIO
from PIL import ImageGrab
import pyautogui
# Shared API client. No key is passed here, so the SDK must obtain it from its
# own defaults (NOTE(review): normally the ANTHROPIC_API_KEY environment variable).
client = anthropic.Anthropic()
# Screen size as reported by pyautogui; sent to Claude as the display dimensions
# of the computer tool. NOTE(review): on scaled/HiDPI displays this may differ
# from the pixel size of ImageGrab screenshots — confirm on the target machine.
SCREEN_W, SCREEN_H = pyautogui.size()
def take_screenshot():
    """Grab the screen and return it as a base64-encoded PNG string.

    The image is encoded in memory; nothing is written to disk.
    """
    png_buffer = BytesIO()
    ImageGrab.grab().save(png_buffer, format="PNG")
    encoded = base64.standard_b64encode(png_buffer.getvalue())
    return encoded.decode("utf-8")
# Claude emits xdotool-style key names; translate the ones whose pyautogui
# spelling differs. Names not listed here are passed through unchanged.
_XDOTOOL_KEY_ALIASES = {
    "page_up": "pageup",
    "page_down": "pagedown",
    "super": "win",
}


def execute_action(action):
    """Perform one computer-tool action on the local desktop via pyautogui.

    Args:
        action: The ``input`` dict of a ``tool_use`` block. The action name is
            read from ``"type"`` or, failing that, ``"action"``. Depending on
            the action it also carries ``"coordinate"`` ([x, y]), ``"text"``,
            and for scrolling ``"scroll_direction"`` / ``"scroll_amount"``
            (the legacy ``"direction"`` / ``"amount"`` keys are still accepted).

    Raises:
        KeyError: If a field required by the action (e.g. ``"coordinate"``)
            is missing from ``action``.
    """
    kind = action.get("type") or action.get("action")
    if kind == "screenshot":
        # Nothing to do on the desktop: the caller captures a fresh
        # screenshot itself, and there is no UI change to wait for.
        return
    elif kind == "left_click":
        pyautogui.click(*action["coordinate"])
    elif kind == "right_click":
        pyautogui.rightClick(*action["coordinate"])
    elif kind == "middle_click":
        pyautogui.middleClick(*action["coordinate"])
    elif kind == "double_click":
        pyautogui.doubleClick(*action["coordinate"])
    elif kind == "triple_click":
        pyautogui.tripleClick(*action["coordinate"])
    elif kind == "mouse_move":
        pyautogui.moveTo(*action["coordinate"])
    elif kind == "type":
        pyautogui.write(action["text"], interval=0.02)
    elif kind == "key":
        # "ctrl+c" -> ("ctrl", "c"); map xdotool names to pyautogui names.
        keys = [part.strip() for part in action["text"].split("+")]
        pyautogui.hotkey(*[_XDOTOOL_KEY_ALIASES.get(k.lower(), k) for k in keys])
    elif kind == "scroll":
        x, y = action["coordinate"]
        # The computer_20250124 tool sends scroll_direction/scroll_amount;
        # fall back to the short names for backward compatibility.
        direction = action.get("scroll_direction", action.get("direction", "down"))
        amount = action.get("scroll_amount", action.get("amount", 3))
        if direction in ("left", "right"):
            # Positive hscroll clicks scroll right.
            pyautogui.hscroll(amount if direction == "right" else -amount, x=x, y=y)
        else:
            # Positive scroll clicks scroll up.
            pyautogui.scroll(amount if direction == "up" else -amount, x=x, y=y)
    else:
        # Best effort: report instead of silently pretending the action ran.
        print(f"Unsupported action ignored: {kind!r}")
    # Give the UI a moment to react before the next screenshot is taken.
    time.sleep(0.5)
def _screenshot_block():
    """Capture the screen and wrap it as a base64 PNG image content block."""
    return {
        "type": "image",
        "source": {"type": "base64", "media_type": "image/png", "data": take_screenshot()},
    }


def run(task, max_steps=20):
    """Drive the desktop with Claude until the task is done or steps run out.

    Each step sends the conversation to Claude, executes every ``tool_use``
    action it returns, and replies with one ``tool_result`` per action plus a
    fresh screenshot. The loop stops early when Claude answers without
    requesting any action; its text is then printed.

    Args:
        task: Natural-language instruction for Claude.
        max_steps: Maximum number of API round trips.
    """
    # The first turn is a plain user message (screenshot + task). A
    # tool_result is only valid as the answer to a preceding tool_use, so it
    # must not be used here.
    messages = [{
        "role": "user",
        "content": [_screenshot_block(), {"type": "text", "text": task}],
    }]
    tools = [{
        "type": "computer_20250124",
        "name": "computer",
        "display_width_px": SCREEN_W,
        "display_height_px": SCREEN_H,
        "display_number": 1,
    }]
    for step in range(max_steps):
        resp = client.beta.messages.create(
            # NOTE(review): verify this model id against the current models list.
            model="claude-sonnet-4-6-20251001",
            max_tokens=4096,
            tools=tools,
            messages=messages,
            # Beta flag that pairs with the computer_20250124 tool version.
            betas=["computer-use-2025-01-24"],
        )
        messages.append({"role": "assistant", "content": resp.content})
        tool_uses = [b for b in resp.content if b.type == "tool_use"]
        if not tool_uses:
            # No action requested: Claude is done (or gave up); show its reply.
            for b in resp.content:
                if hasattr(b, "text"):
                    print(b.text)
            break
        for tu in tool_uses:
            print(f"Step {step+1}: {tu.input}")
            execute_action(tu.input)
        # Every tool_use needs its own tool_result in the next user message.
        # Only the last one carries the screenshot, since a single capture
        # taken after all actions reflects the final screen state.
        shot = _screenshot_block()
        final = tool_uses[-1]
        messages.append({
            "role": "user",
            "content": [
                {
                    "type": "tool_result",
                    "tool_use_id": tu.id,
                    "content": [shot] if tu is final
                    else [{"type": "text", "text": "Action executed."}],
                }
                for tu in tool_uses
            ],
        })
if __name__ == "__main__":
    # Demo task executed when the file is run directly as a script.
    demo_task = "Open a text editor and type 'Hello from Claude!'"
    run(demo_task)
| Action | Required fields | Description |
|---|---|---|
| screenshot | — | Claude requests a fresh screenshot |
| left_click | coordinate: [x,y] | Single left click at pixel coords |
| right_click | coordinate: [x,y] | Right click |
| double_click | coordinate: [x,y] | Double click |
| type | text: "string" | Type text at current focus |
| key | text: "ctrl+c" | Press key combination |
| scroll | coordinate, scroll_direction, scroll_amount | Scroll up/down |
docker pull ghcr.io/anthropics/anthropic-quickstarts:computer-use-demo-latest
docker run -e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY -p 5900:5900 -p 8501:8501 ghcr.io/anthropics/anthropic-quickstarts:computer-use-demo-latest
Open http://localhost:8501 for the Streamlit demo or VNC to localhost:5900 to watch Claude work.
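Each 1280×800 screenshot costs ~1,600–2,000 input tokens. At Sonnet 4.6 pricing ($3 per million input tokens) that is ~$0.005 per screenshot. Because every request resends the whole conversation, earlier screenshots are billed again on each later step, so a long task costs more than the number of steps × $0.005. Estimate total task costs with the Claude API Cost Calculator.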