How to use Claude's prompt caching feature in Python. Cache large system prompts or documents and save up to 90% on repeated input tokens.
Prompt caching dramatically reduces costs for workloads that reuse large prompts — system prompts, documents, or conversation history.
| Token type | Sonnet 4.6 cost per 1M tokens | vs standard input |
|---|---|---|
| Standard input | $3.00 | baseline |
| cache_write | $3.75 | +25% (paid each time the cache is written) |
| cache_read | $0.30 | −90% (each subsequent call that hits the cache) |
import anthropic
# Shared API client: the ask() helper and the conversation example below
# both send their requests through this instance.
client = anthropic.Anthropic()
# Large document or system prompt used as the cached prefix.
# The threshold quoted for this demo is 2,048 tokens; a prompt below the
# model's minimum cacheable length is not cached, so the placeholder has to
# clear it (check the caching docs for the minimum of the model you use).
LARGE_CONTEXT = """
[Your large document, code base, or lengthy system prompt goes here.
The content must be at least 2,048 tokens to qualify for caching.
Typical use cases: legal documents, codebases, long instructions, RAG context.]
""" * 100  # one copy is ~50 tokens: 20 copies (~1,000 tokens) miss the threshold, 100 clear it
def ask(question: str) -> str:
    """Ask Claude a question with ``LARGE_CONTEXT`` as a cache-marked system prompt.

    Prints the cache read/write token counters reported in the response's
    ``usage`` object, then returns the text of the reply.

    Args:
        question: The user message to send.

    Returns:
        The concatenated text of every text block in the response, or an
        empty string when the response contains no text block.
    """
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=512,
        system=[
            {
                "type": "text",
                "text": LARGE_CONTEXT,
                "cache_control": {"type": "ephemeral"},  # mark for caching
            }
        ],
        messages=[{"role": "user", "content": question}],
    )

    usage = response.usage
    # A getattr default only covers a missing attribute; `or 0` also covers a
    # counter that is present but None, so the log line always shows a number.
    cache_read = getattr(usage, "cache_read_input_tokens", None) or 0
    cache_write = getattr(usage, "cache_creation_input_tokens", None) or 0
    print(f"cache_read={cache_read} cache_write={cache_write}")

    # Do not assume the first content block exists or is a text block:
    # collect the text of every text block instead of indexing content[0].
    return "".join(
        block.text
        for block in response.content
        if getattr(block, "type", None) == "text"
    )
# Ask two questions against the same cached context, in order.
for question in (
    # First call: cache_write (1.25× cost)
    "What are the main topics covered in this document?",
    # Second call: cache_read (0.1× cost — 90% off)
    "Summarize section 3 of the document.",
):
    print(ask(question))
# Conversation-history variant: the cache marker goes on the last message
# you want cached, and any newer turn comes after it.
messages = [
    {"role": "user", "content": "Tell me about the Anthropic API."},
    {
        "role": "assistant",
        # Last turn to be cached: its content is a list holding one text
        # block that carries the cache_control marker.
        "content": [
            {
                "type": "text",
                "text": "The Anthropic API provides access to Claude...",
                "cache_control": {"type": "ephemeral"},
            }
        ],
    },
    # New user turn (not cached yet)
    {"role": "user", "content": "What are the pricing tiers?"},
]

response = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=512,
    messages=messages,
)
For full pricing details, see "Prompt caching explained". Use the Cost Calculator to see cache savings on your actual session logs.