Extract text from, summarize, and analyze PDFs with the Claude API in Python. Pass PDFs as base64-encoded documents or by URL, then ask questions about the content.
Claude can read, summarize, extract data from, and answer questions about PDFs — no pre-processing or chunking required for typical document sizes.
import anthropic
import base64
from pathlib import Path
# Shared API client reused by every example in this file.
# NOTE(review): no api_key is passed here; presumably the SDK reads it from
# the environment — confirm before running.
client = anthropic.Anthropic()
def analyze_pdf(pdf_path: str, question: str) -> str:
    """Send a local PDF plus a question to the model and return its reply.

    Args:
        pdf_path: Path of the PDF file to read from disk.
        question: The prompt to ask about the document.

    Returns:
        The text of the first content block in the response.
    """
    raw_bytes = Path(pdf_path).read_bytes()
    encoded_pdf = base64.b64encode(raw_bytes).decode("ascii")

    # The document block goes first, followed by the question about it.
    document_block = {
        "type": "document",
        "source": {
            "type": "base64",
            "media_type": "application/pdf",
            "data": encoded_pdf,
        },
    }
    question_block = {"type": "text", "text": question}

    reply = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=2048,
        messages=[
            {"role": "user", "content": [document_block, question_block]},
        ],
    )
    first_block = reply.content[0]
    return first_block.text
# Example: ask for a three-bullet summary of a local PDF and print the answer.
summary = analyze_pdf("report.pdf", "Summarize the key findings in 3 bullet points.")
print(summary)
# Example: reference a PDF by URL instead of sending base64-encoded bytes.
response = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=2048,
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "document",
                "source": {
                    "type": "url",
                    "url": "https://arxiv.org/pdf/2310.06825.pdf",
                },
            },
            {"type": "text", "text": "What is the main contribution of this paper?"},
        ],
    }],
)
# Print the answer so this example produces visible output like the others.
print(response.content[0].text)
import json
def _parse_json_object(text: str) -> dict:
    """Parse a JSON object from a model reply, tolerating surrounding text."""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Replies are sometimes wrapped in a Markdown code fence or preceded by
        # a short lead-in sentence; retry on the outermost {...} span only.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(text[start:end + 1])


def extract_invoice_data(pdf_path: str) -> dict:
    """Extract structured invoice fields from a local PDF.

    Args:
        pdf_path: Path of the invoice PDF to read from disk.

    Returns:
        The parsed JSON object from the model's reply. The prompt requests the
        keys ``invoice_number``, ``date``, ``vendor``, ``total_amount`` and
        ``line_items``; the reply is not validated against that shape.

    Raises:
        json.JSONDecodeError: If no parseable JSON can be found in the reply.
    """
    pdf_data = base64.standard_b64encode(Path(pdf_path).read_bytes()).decode()
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        temperature=0,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "document",
                    "source": {
                        "type": "base64",
                        "media_type": "application/pdf",
                        "data": pdf_data,
                    },
                },
                {
                    "type": "text",
                    "text": 'Extract invoice data as JSON: {"invoice_number": str, "date": str, "vendor": str, "total_amount": float, "line_items": [{"description": str, "amount": float}]}. Return only JSON.',
                },
            ],
        }],
    )
    return _parse_json_object(response.content[0].text)
# Example: extract fields from a local invoice and print two of them.
data = extract_invoice_data("invoice.pdf")
print(f"Invoice #{data['invoice_number']}: ${data['total_amount']}")
# Read and base64-encode the large PDF once at module level; ask_about_pdf
# reads this global, so every question sends the identical document payload.
pdf_data = base64.standard_b64encode(Path("large_report.pdf").read_bytes()).decode()
def ask_about_pdf(question: str) -> str:
    """Ask a question about the module-level PDF, marking it as cacheable.

    The document payload comes from the global ``pdf_data``, so repeated calls
    send the same document block and differ only in the question text.

    Args:
        question: The prompt to ask about the document.

    Returns:
        The text of the first content block in the response.
    """
    cached_document = {
        "type": "document",
        "source": {
            "type": "base64",
            "media_type": "application/pdf",
            "data": pdf_data,
        },
        # Ephemeral cache marker: cache the PDF for 5 min.
        "cache_control": {"type": "ephemeral"},
    }
    user_turn = {
        "role": "user",
        "content": [cached_document, {"type": "text", "text": question}],
    }
    reply = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        messages=[user_turn],
    )
    return reply.content[0].text
# First call: the PDF block is written to the cache (cache_write, 25% premium).
print(ask_about_pdf("What is the executive summary?"))
# Second call: the same PDF block is served from the cache (cache_read, 90%
# discount — large PDF = big savings), provided the cache entry has not expired.
print(ask_about_pdf("List all recommendations."))
Caching large PDFs with cache_control is especially cost-effective — see the prompt caching example. For vision-based image analysis, see the vision API example.