How to unit test and integration test Python apps that call the Claude API: mock the Anthropic client with unittest.mock, write deterministic tests, and run both test suites in CI.
Testing LLM applications requires two complementary strategies: fast, deterministic unit tests with mocked API calls, and occasional integration tests against the real API. This guide shows both.
pip install anthropic pytest pytest-asyncio
import pytest
from unittest.mock import MagicMock, patch
import anthropic
# --- Application code under test ---
def classify_sentiment(client: anthropic.Anthropic, text: str) -> str:
    """Classify the sentiment of *text* from a one-word model reply.

    Args:
        client: Anthropic client (or a test double) exposing
            ``messages.create``.
        text: The user text to classify.

    Returns:
        The text of the first content block, lower-cased, with surrounding
        whitespace, quotes and sentence punctuation removed. The system
        prompt asks for 'positive', 'negative', or 'neutral'; the value is
        not validated against that set.
    """
    msg = client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=10,
        system="Reply with exactly one word: positive, negative, or neutral.",
        messages=[{"role": "user", "content": text}],
    )
    # A reply such as 'Positive.' or '"neutral"' would otherwise fail an
    # equality check against the bare label, so trim decorations as well as
    # whitespace before lower-casing.
    decorations = ".,;:!?\"'`*"
    return msg.content[0].text.strip().strip(decorations).strip().lower()
# --- Unit tests ---
def make_mock_response(text: str):
    """Build a stand-in Message whose ``content[0].text`` equals *text*.

    Only that one attribute path is configured; everything else on the
    returned object is an auto-created ``MagicMock`` attribute.
    """
    return MagicMock(content=[MagicMock(text=text)])
def test_positive_sentiment():
    """A 'positive' reply is returned and exactly one API call is made."""
    fake_client = MagicMock(spec=anthropic.Anthropic)
    create = fake_client.messages.create
    create.return_value = make_mock_response("positive")

    label = classify_sentiment(fake_client, "I love this product!")

    assert label == "positive"
    # One classification must cost exactly one request.
    create.assert_called_once()
def test_negative_sentiment():
    """A 'negative' reply from the mocked client is returned to the caller."""
    stub = MagicMock(spec=anthropic.Anthropic)
    stub.messages.create.return_value = make_mock_response("negative")

    verdict = classify_sentiment(stub, "Terrible experience.")

    assert verdict == "negative"
def test_model_parameter_passed():
    """The request pins the intended (cheap) model and the 10-token cap."""
    client = MagicMock(spec=anthropic.Anthropic)
    create = client.messages.create
    create.return_value = make_mock_response("neutral")

    classify_sentiment(client, "It is what it is.")

    # call_args is an (args, kwargs) pair; only the keyword arguments matter.
    _, sent = create.call_args
    assert sent["model"] == "claude-haiku-4-5-20251001"
    assert sent["max_tokens"] == 10
from unittest.mock import patch, MagicMock
# --- App code ---
def summarize_text(text: str) -> str:
    """Request a summary of *text* and return the first content block's text.

    A new ``anthropic.Anthropic`` client is constructed inside every call,
    so tests must patch the class rather than inject a double.
    """
    prompt = f"Summarize: {text}"
    reply = anthropic.Anthropic().messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=100,
        messages=[{"role": "user", "content": prompt}],
    )
    first_block = reply.content[0]
    return first_block.text
@patch("anthropic.Anthropic")
def test_summarize(mock_anthropic_class):
    """``summarize_text`` returns the block text served by the patched client."""
    fake_block = MagicMock()
    fake_block.text = "Short summary."
    # Calling the patched class hands back its ``return_value``; configure
    # that object as the client instance the function will construct.
    fake_client = mock_anthropic_class.return_value
    fake_client.messages.create.return_value = MagicMock(content=[fake_block])

    summary = summarize_text("A very long document...")

    assert summary == "Short summary."
    fake_client.messages.create.assert_called_once()
from contextlib import contextmanager
def stream_response(client: anthropic.Anthropic, prompt: str) -> str:
    """Stream a reply to *prompt* and return the concatenated text.

    Args:
        client: Anthropic client (or a test double) whose
            ``messages.stream(...)`` returns a context manager.
        prompt: The user message to send.

    Returns:
        Every chunk yielded by the stream's ``text_stream``, joined in
        arrival order; an empty string if nothing is yielded.
    """
    with client.messages.stream(
        model="claude-haiku-4-5-20251001",
        max_tokens=200,
        messages=[{"role": "user", "content": prompt}],
    ) as stream:
        # join() assembles the result in one pass instead of rebuilding the
        # string on every chunk.
        return "".join(stream.text_stream)
def test_streaming():
    """Chunks from the mocked stream are concatenated in order."""
    chunks = ["Hello", ", ", "world", "!"]
    client = MagicMock(spec=anthropic.Anthropic)

    # ``messages.stream(...)`` is entered with ``with``, so the mock must
    # return itself from ``__enter__`` and must not suppress exceptions
    # from ``__exit__``.
    stream_cm = MagicMock()
    stream_cm.text_stream = iter(chunks)
    stream_cm.__enter__.return_value = stream_cm
    stream_cm.__exit__.return_value = False
    client.messages.stream.return_value = stream_cm

    assert stream_response(client, "Say hello") == "Hello, world!"
import os
import pytest
import anthropic
# Gate: integration tests run only when RUN_INTEGRATION_TESTS is set (the CI
# workflow sets it to "true"; leave it unset locally). An empty value or an
# explicit opt-out such as "0", "false", "no", or "off" keeps them skipped,
# so RUN_INTEGRATION_TESTS=false never triggers billable API calls.
pytestmark = pytest.mark.skipif(
    os.environ.get("RUN_INTEGRATION_TESTS", "").strip().lower()
    in {"", "0", "false", "no", "off"},
    reason="Set RUN_INTEGRATION_TESTS=true to run against real API",
)
@pytest.fixture(scope="module")
def client():
    """Provide one real Anthropic client shared by all tests in the module.

    No key is passed explicitly; the client is expected to pick up
    ``ANTHROPIC_API_KEY`` from the environment.
    """
    real_client = anthropic.Anthropic()
    return real_client
def test_real_api_basic(client):
    """Smoke test: a live request comes back containing the requested 'OK'."""
    request = {
        "model": "claude-haiku-4-5-20251001",
        "max_tokens": 20,
        "messages": [{"role": "user", "content": "Reply with exactly: OK"}],
    }
    reply = client.messages.create(**request)
    # Substring check: the reply only has to contain "OK".
    assert "OK" in reply.content[0].text
def test_real_api_json_output(client):
    """A live JSON-only request returns text that parses to the requested object."""
    import json

    question = 'Return {"status": "ok", "value": 42}'
    reply = client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=50,
        system="Reply with valid JSON only. No markdown.",
        messages=[{"role": "user", "content": question}],
    )
    # json.loads raises if the reply is not valid JSON, which fails the test.
    payload = json.loads(reply.content[0].text)
    assert payload["status"] == "ok"
    assert payload["value"] == 42
# .github/workflows/test.yml
name: Tests
on: [push, pull_request]

jobs:
  # Mocked unit tests: run on every push and pull request; no API key needed.
  unit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with: { python-version: "3.12" }
      - run: pip install anthropic pytest
      - run: pytest tests/unit/ -v

  # Real-API tests: need the API key secret and the opt-in flag.
  integration:
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'  # only on merge to main
    env:
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      RUN_INTEGRATION_TESTS: "true"
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with: { python-version: "3.12" }
      - run: pip install anthropic pytest
      - run: pytest tests/integration/ -v
| Approach | Speed | Cost | Reliability | Best for |
|---|---|---|---|---|
| Mocked unit tests | <1s | $0 | Deterministic | Business logic, prompt construction |
| Integration tests (Haiku) | 3–8s | ~$0.0001/test | Real API behaviour | Prompt validation, output format |
| Recorded cassettes (VCR.py) | <1s | $0 | Recorded responses | Regression testing without API calls |
| Eval frameworks (promptfoo) | Minutes | $0.01–$1 | Statistical | Quality regression across model upgrades |
For async testing with asyncio, see the async Python guide. For cost estimates before running tests, see the Claude API Cost Calculator.