Fix Anthropic API rate limit and overload errors (HTTP 429 `rate_limit_error`, HTTP 529 `overloaded_error`) in Python. Working retry logic with exponential backoff, tokens-per-minute tracking, and concurrency controls.
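Anthropic enforces requests-per-minute (RPM) and tokens-per-minute (TPM) limits. When either limit is exceeded, the API returns HTTP 429; an HTTP 529 response means the API itself is temporarily overloaded rather than that you hit your own limit. Here's how to handle both robustly in Python.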
import anthropic
# With max_retries set, the SDK itself retries 429 responses, up to that many times.
client = anthropic.Anthropic(max_retries=5)
greeting = [{"role": "user", "content": "Hello"}]
response = client.messages.create(
    model="claude-haiku-4-5-20251001",
    max_tokens=512,
    messages=greeting,
)
# Print the text of the first content block in the reply.
first_block = response.content[0]
print(first_block.text)
import time
import random
import anthropic
from anthropic import RateLimitError, APIStatusError
# Set the SDK's own retry count to zero so that the retry loop defined below
# is the only thing retrying a failed request.
client = anthropic.Anthropic(max_retries=0) # handle manually
def call_with_backoff(messages, model="claude-haiku-4-5-20251001", max_tokens=512,
                      max_attempts=7):
    """Call ``client.messages.create`` and retry 429 / 529 failures with backoff.

    The base delay starts at 1 second and doubles after every failed attempt.
    Each wait adds up to 1 second of random jitter and is capped at 60 seconds.

    Args:
        messages: Message list passed through to ``client.messages.create``.
        model: Model ID passed through unchanged.
        max_tokens: Output-token cap passed through unchanged.
        max_attempts: Total number of tries, including the first one.

    Returns:
        The response from the first attempt that succeeds.

    Raises:
        ValueError: If ``max_attempts`` is less than 1.
        RateLimitError: If the final attempt is still rate limited.
        APIStatusError: Immediately for any status other than a rate limit or
            529, and for a 529 on the final attempt.
    """
    if max_attempts < 1:
        # Without this the loop never runs and the caller silently gets None.
        raise ValueError("max_attempts must be at least 1")

    delay = 1.0
    for attempt in range(max_attempts):
        is_last_attempt = attempt == max_attempts - 1
        try:
            return client.messages.create(
                model=model, max_tokens=max_tokens, messages=messages
            )
        except RateLimitError:
            if is_last_attempt:
                raise
            wait = min(delay + random.uniform(0, 1), 60)
            print(f"Rate limited. Waiting {wait:.1f}s (attempt {attempt + 1})")
        except APIStatusError as e:
            # Only 529 (overloaded) is retried here. On the final attempt
            # re-raise rather than sleeping and falling out of the loop, which
            # would hand the caller None instead of an error.
            if e.status_code != 529 or is_last_attempt:
                raise
            wait = min(delay + random.uniform(0, 1), 60)
        # Reached only after a retryable failure; success returns from the try.
        time.sleep(wait)
        delay = min(delay * 2, 60)
import time
import threading
from collections import deque
import anthropic
class TokenBudgetedClient:
    """Stays under a tokens-per-minute cap.

    Usage is tracked in a 60-second sliding window. Before a request is sent,
    its ``max_tokens`` is reserved under the lock, so concurrent callers cannot
    all pass the budget check at once and overshoot the cap together. When the
    request finishes, the reservation is released and the usage reported in
    the response is recorded in the window instead.
    """

    def __init__(self, tpm_limit=80_000):
        """Create the underlying client and an empty usage window.

        Args:
            tpm_limit: Budget for tokens recorded in any 60-second window.
        """
        self.client = anthropic.Anthropic(max_retries=3)
        self.tpm_limit = tpm_limit
        # (monotonic timestamp, tokens_used), oldest entry first.
        self.window = deque()
        self.lock = threading.Lock()
        # Tokens held for requests that are in flight and not yet recorded.
        self._reserved = 0

    def _tokens_in_last_minute(self):
        """Drop entries older than 60 seconds and return the remaining total.

        Mutates ``self.window``, so callers must hold ``self.lock``.
        """
        # Monotonic time keeps the window correct if the wall clock is adjusted.
        cutoff = time.monotonic() - 60
        while self.window and self.window[0][0] < cutoff:
            self.window.popleft()
        return sum(tokens for _, tokens in self.window)

    def create(self, messages, model="claude-haiku-4-5-20251001", max_tokens=512):
        """Send one request, waiting first until the token budget has room.

        Args:
            messages: Message list passed through to ``messages.create``.
            model: Model ID passed through unchanged.
            max_tokens: Output-token cap; also the amount reserved up front.

        Returns:
            The response object from ``self.client.messages.create``.

        Raises:
            ValueError: If ``max_tokens`` can never fit under ``tpm_limit``.
        """
        if max_tokens >= self.tpm_limit:
            # The admission test below could never pass, so the wait loop
            # would spin forever.
            raise ValueError("max_tokens must be smaller than tpm_limit")

        while True:
            with self.lock:
                committed = self._tokens_in_last_minute() + self._reserved
                # conservative: assume max_tokens output
                if committed + max_tokens < self.tpm_limit:
                    self._reserved += max_tokens
                    break
            time.sleep(2)

        spent = None
        try:
            response = self.client.messages.create(
                model=model, max_tokens=max_tokens, messages=messages
            )
            spent = response.usage.input_tokens + response.usage.output_tokens
            return response
        finally:
            # Always release the reservation; record usage only on success.
            with self.lock:
                self._reserved -= max_tokens
                if spent is not None:
                    self.window.append((time.monotonic(), spent))
# Usage: send a single request through the budgeted client.
summarize_request = [{"role": "user", "content": "Summarize this."}]
tbc = TokenBudgetedClient(tpm_limit=80_000)
resp = tbc.create(summarize_request)
import asyncio
import anthropic
async def process_batch(prompts, concurrency=5, model="claude-haiku-4-5-20251001",
                        max_tokens=256):
    """Send every prompt as its own request, at most ``concurrency`` at a time.

    Args:
        prompts: Iterable of prompt strings; each becomes one user message.
        concurrency: Maximum number of requests in flight at once (>= 1).
        model: Model ID used for every request.
        max_tokens: Output-token cap used for every request.

    Returns:
        A list in the same order as ``prompts``. Each item is either the
        response or the exception that request raised, because
        ``return_exceptions=True`` stops one failure from discarding the rest.

    Raises:
        ValueError: If ``concurrency`` is less than 1.
    """
    if concurrency < 1:
        # Semaphore(0) would make every call wait forever.
        raise ValueError("concurrency must be at least 1")

    client = anthropic.AsyncAnthropic(max_retries=4)
    sem = asyncio.Semaphore(concurrency)  # max concurrent calls

    async def call(prompt):
        async with sem:
            return await client.messages.create(
                model=model,
                max_tokens=max_tokens,
                messages=[{"role": "user", "content": prompt}],
            )

    try:
        return await asyncio.gather(
            *(call(p) for p in prompts), return_exceptions=True
        )
    finally:
        # Close the client even when the gather itself is cancelled.
        await client.close()
# Run: two prompts with the default concurrency and model.
prompts = [
    "Translate 'hello' to French.",
    "Translate 'hello' to German.",
]
batch_job = process_batch(prompts)
results = asyncio.run(batch_job)
| Plan | RPM | TPM (Haiku) | TPM (Sonnet) | TPM (Opus) |
|---|---|---|---|---|
| Free | 5 | 25K | 25K | 10K |
| Build | 50 | 100K | 80K | 20K |
| Scale | Custom | Custom | Custom | Custom |
Track spending and token usage in real time with the Claude API Cost Calculator. For Batch API workloads (50% discount, async), see the cost optimization guide.