Log every Claude API call, track token usage and latency, set cost alerts, and integrate with Datadog, Prometheus, or custom dashboards. Production-ready Python patterns.
Once your Claude integration goes to production, you need visibility into cost, latency, and error rate. The Anthropic SDK doesn't ship built-in telemetry — this guide shows how to add it.
import anthropic
import time
import logging
import json
# Configure the root logger: INFO and above, each line prefixed with a timestamp.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
# Dedicated logger named "claude"; tracked_create() writes its JSON records to it.
logger = logging.getLogger("claude")
# Shared API client, constructed with no explicit arguments.
# NOTE(review): presumably picks up credentials from the environment — confirm.
client = anthropic.Anthropic()
def tracked_create(messages, model="claude-haiku-4-5-20251001", max_tokens=512, **kwargs):
    """Drop-in replacement for client.messages.create() with structured logging.

    Forwards the call to ``client.messages.create()`` and always emits exactly
    one JSON log line (success or failure) containing the model, wall-clock
    latency, token usage, stop reason and error details.

    Args:
        messages: Conversation messages, forwarded as ``messages=``.
        model: Model identifier; forwarded and recorded in the log line.
        max_tokens: Output token cap, forwarded as ``max_tokens=``.
        **kwargs: Any further keyword arguments, forwarded unchanged.

    Returns:
        Whatever ``client.messages.create()`` returns, unmodified.

    Raises:
        Exception: Any exception from the underlying call is re-raised
            unchanged after it has been logged.
    """
    start = time.monotonic()
    error = None
    error_type = None
    response = None
    try:
        response = client.messages.create(
            model=model,
            max_tokens=max_tokens,
            messages=messages,
            **kwargs
        )
        return response
    except Exception as e:
        error = str(e)
        # The class name lets dashboards group failures without parsing messages.
        error_type = type(e).__name__
        raise
    finally:
        latency_ms = round((time.monotonic() - start) * 1000)
        # Read telemetry fields defensively. An exception raised inside
        # ``finally`` would replace the return value of a successful call, so
        # a response without ``usage``/``stop_reason`` (e.g. when
        # ``stream=True`` is passed through ``kwargs``) must log nulls
        # instead of raising AttributeError.
        usage = getattr(response, "usage", None)
        log_record = {
            "model": model,
            "latency_ms": latency_ms,
            "input_tokens": getattr(usage, "input_tokens", None),
            "output_tokens": getattr(usage, "output_tokens", None),
            "stop_reason": getattr(response, "stop_reason", None),
            "error": error,
            "error_type": error_type,
        }
        logger.info(json.dumps(log_record))
from dataclasses import dataclass, field
from threading import Lock
# Token prices in USD per million tokens
# (check claude-cost-calc.vercel.app for current rates).
# Keys are the exact model identifiers passed to the API.
PRICES = {
    # Claude Haiku 4.5 is listed at $1 / $5 per million tokens; 0.80 / 4.00
    # was the rate of the older Claude 3.5 Haiku.
    "claude-haiku-4-5-20251001": {"input": 1.00, "output": 5.00},
    "claude-sonnet-4-6": {"input": 3.00, "output": 15.00},
    # NOTE(review): rate not verified here — confirm against Anthropic's
    # pricing page before relying on Opus cost figures.
    "claude-opus-4-7": {"input": 15.00, "output": 75.00},
}
@dataclass
class CostAccumulator:
    """Running totals of token usage and estimated USD cost.

    All reads and writes of the totals go through ``_lock`` so the
    accumulator can be shared between threads.
    """

    # Guards the three totals below. Excluded from repr and from equality:
    # two accumulators holding the same totals should compare equal even
    # though each owns a distinct lock object.
    _lock: Lock = field(default_factory=Lock, repr=False, compare=False)
    total_input_tokens: int = 0
    total_output_tokens: int = 0
    total_cost_usd: float = 0.0

    def record(self, model: str, input_tokens: int, output_tokens: int) -> None:
        """Add one request's usage to the totals.

        Args:
            model: Model identifier used to look up per-million-token prices
                in ``PRICES``. Unknown models fall back to 3.0 (input) and
                15.0 (output) USD per million tokens.
            input_tokens: Prompt tokens consumed by the request.
            output_tokens: Completion tokens produced by the request.
        """
        price = PRICES.get(model, {"input": 3.0, "output": 15.0})
        # Prices are per million tokens, hence the division.
        cost = (input_tokens * price["input"] + output_tokens * price["output"]) / 1_000_000
        with self._lock:
            self.total_input_tokens += input_tokens
            self.total_output_tokens += output_tokens
            self.total_cost_usd += cost

    def report(self) -> dict:
        """Return a snapshot of the totals, with cost rounded to 6 decimals.

        The snapshot is taken under the lock so the three values always
        belong to the same set of recorded requests.
        """
        with self._lock:
            return {
                "input_tokens": self.total_input_tokens,
                "output_tokens": self.total_output_tokens,
                "total_cost_usd": round(self.total_cost_usd, 6),
            }
# Module-level accumulator; tracked_create_with_cost() records every call into it.
accumulator = CostAccumulator()
def tracked_create_with_cost(messages, model="claude-sonnet-4-6", max_tokens=512, **kwargs):
    """Send a request and add its token usage to the module-level accumulator.

    Arguments are forwarded to ``client.messages.create()``; the response is
    returned unchanged once its input/output token counts have been recorded.
    """
    reply = client.messages.create(
        messages=messages,
        model=model,
        max_tokens=max_tokens,
        **kwargs
    )
    accumulator.record(
        model,
        reply.usage.input_tokens,
        reply.usage.output_tokens,
    )
    return reply
# At end of job / request handler:
# print(accumulator.report())
# → {"input_tokens": 12430, "output_tokens": 3820, "total_cost_usd": 0.09459}
pip install prometheus-client
from prometheus_client import Counter, Histogram, start_http_server
import time, anthropic
# API client, constructed with no explicit arguments.
client = anthropic.Anthropic()
# Requests that returned a response, labelled by model and the response's stop_reason.
REQUEST_COUNT = Counter("claude_requests_total", "Total Claude API requests", ["model", "stop_reason"])
# Tokens consumed, labelled by model and direction ("input" or "output").
TOKEN_COUNTER = Counter("claude_tokens_total", "Total tokens consumed", ["model", "direction"])
# Request duration in seconds, labelled by model; instrumented_create() only
# observes it for calls that returned a response.
LATENCY = Histogram("claude_request_duration_seconds", "Request latency", ["model"],
                    buckets=[0.1, 0.5, 1, 2, 5, 10, 30, 60])
# Failed requests, labelled by model and error_type
# ("rate_limit", "status_<http code>" or "connection").
ERROR_COUNT = Counter("claude_errors_total", "API errors", ["model", "error_type"])
def instrumented_create(messages, model="claude-sonnet-4-6", max_tokens=512, **kwargs):
    """Call ``client.messages.create()`` and publish Prometheus metrics for it.

    On success the latency histogram, request counter and both token counters
    are updated and the response is returned. On a rate-limit, HTTP-status or
    connection error the error counter is incremented with a matching
    ``error_type`` label and the exception is re-raised unchanged.
    """
    began = time.monotonic()
    try:
        reply = client.messages.create(
            messages=messages,
            model=model,
            max_tokens=max_tokens,
            **kwargs
        )
    except (anthropic.RateLimitError, anthropic.APIStatusError, anthropic.APIConnectionError) as exc:
        # Most specific class first, mirroring the order of separate handlers.
        if isinstance(exc, anthropic.RateLimitError):
            kind = "rate_limit"
        elif isinstance(exc, anthropic.APIStatusError):
            kind = f"status_{exc.status_code}"
        else:
            kind = "connection"
        ERROR_COUNT.labels(model=model, error_type=kind).inc()
        raise

    elapsed = time.monotonic() - began
    LATENCY.labels(model=model).observe(elapsed)
    REQUEST_COUNT.labels(model=model, stop_reason=reply.stop_reason).inc()
    for direction in ("input", "output"):
        TOKEN_COUNTER.labels(model=model, direction=direction).inc(
            getattr(reply.usage, f"{direction}_tokens")
        )
    return reply
# Expose /metrics endpoint on port 8001.
# As written this runs when the module is imported; call it once from your
# application's startup code so the port is bound a single time.
start_http_server(8001) # add to app startup
pip install anthropic
import os

import anthropic

# Swap base_url → all requests proxy through Helicone
# Sign up at helicone.ai for a free API key
# The key is read from the HELICONE_API_KEY environment variable instead of
# being hard-coded, so it never ends up in version control; if the variable
# is missing, startup fails immediately with KeyError.
client = anthropic.Anthropic(
    base_url="https://anthropic.helicone.ai",
    default_headers={
        "Helicone-Auth": f"Bearer {os.environ['HELICONE_API_KEY']}",
    },
)
# Everything else is identical — no other code changes
message = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=256,
    messages=[{"role": "user", "content": "Hello"}],
)
print(message.content[0].text)
| Signal | Alert threshold | Likely cause |
|---|---|---|
| Error rate | >1% over 5 min | API overload (529) or bad prompt (400) |
| p99 latency | >30s for Haiku, >60s for Sonnet | Large max_tokens or peak load |
| Daily cost | >120% of 7-day average | Traffic spike or prompt regression |
| 429 rate | >5% of requests | Approaching rate limit — add backoff or upgrade tier |
For current per-model token prices to keep your cost calculations accurate, check the Claude API Cost Calculator. For rate limit patterns, see the rate limits guide.