Analyze images with Claude in Python. Pass base64 images or URLs, extract text, describe scenes, and process PDFs with the Anthropic vision API.
Claude's vision capability lets you analyze images inline in the messages array — no separate endpoint required.
import anthropic
# Ask Claude to describe a remotely hosted image. The image is referenced by
# URL inside the user message, next to the text question.
client = anthropic.Anthropic()

image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/280px-PNG_transparency_demonstration_1.png"
image_block = {"type": "image", "source": {"type": "url", "url": image_url}}
question_block = {"type": "text", "text": "What do you see in this image? Be specific."}

response = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=1024,
    messages=[{"role": "user", "content": [image_block, question_block]}],
)

# Print the text of the first content block in the reply.
first_block = response.content[0]
print(first_block.text)
import anthropic
import base64
from pathlib import Path
# Module-level client used by analyze_local_image below. It is constructed with
# no arguments, so credentials must come from the SDK's defaults
# (presumably the ANTHROPIC_API_KEY environment variable; confirm against the SDK docs).
client = anthropic.Anthropic()
def analyze_local_image(image_path: str, prompt: str) -> str:
    """Send a local image file and a text prompt to Claude; return the reply text.

    The file is read in full, base64-encoded, and embedded inline in a single
    user message next to ``prompt``.

    Args:
        image_path: Path to the image file on disk.
        prompt: Question or instruction about the image.

    Returns:
        The ``text`` of the first content block in the response.

    Raises:
        OSError: If the file cannot be read (propagated from ``Path.read_bytes``).
    """
    path = Path(image_path)
    image_data = base64.standard_b64encode(path.read_bytes()).decode()

    # Detect media type from extension. Path.suffix includes the leading dot,
    # so every key must too (the original "jpg" key could never match).
    media_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    # Best-effort fallback: unrecognised extensions are sent as JPEG.
    media_type = media_types.get(path.suffix.lower(), "image/jpeg")

    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": image_data,
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ],
    )
    return response.content[0].text
# Usage: pass a local screenshot and a prompt asking for its visible text.
result = analyze_local_image(
    image_path="screenshot.png",
    prompt="Extract all text visible in this screenshot.",
)
print(result)
# Compare two remote images in one request. Each image block is immediately
# followed by its text caption, and the last caption also carries the question.
comparison_content = []
for img_url, caption in (
    ("https://example.com/before.jpg", "Before:"),
    ("https://example.com/after.jpg", "After: What changed between these two images?"),
):
    comparison_content.append({"type": "image", "source": {"type": "url", "url": img_url}})
    comparison_content.append({"type": "text", "text": caption})

response = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=1024,
    messages=[{"role": "user", "content": comparison_content}],
)
Use the Cost Calculator to estimate token costs for image workloads. See the Python quickstart for non-vision API basics.