Build a RAG pipeline with the Claude API in Python. Embed documents, store in a vector database, retrieve relevant chunks, and pass them to Claude as context. Working code for 2026.
RAG (Retrieval-Augmented Generation) is the most widely deployed LLM architecture in production. You embed your documents, retrieve the most relevant chunks at query time, and inject them into Claude's context. This guide shows end-to-end Python code with Chroma (a zero-server prototype) and a production path using Pinecone.
pip install anthropic chromadb sentence-transformers
import anthropic
import chromadb
from sentence_transformers import SentenceTransformer
# --- 1. Embed and store documents ---
# Local embedding model: free, runs locally (no API key involved here).
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

chroma = chromadb.Client()
# get_or_create_collection (instead of create_collection) lets this script or
# notebook cell be re-run in the same process without failing because a
# collection named "docs" already exists.
collection = chroma.get_or_create_collection("docs")

documents = [
    "Claude is developed by Anthropic and released in 2023.",
    "The Claude API uses model strings like claude-sonnet-4-6.",
    "Claude supports tool use, vision, and extended thinking.",
    "Anthropic's mission is AI safety and beneficial AI.",
    "The Anthropic API key is set via ANTHROPIC_API_KEY environment variable.",
]

# Convert the encoder output to plain Python lists before handing it to Chroma.
embeddings = embedding_model.encode(documents).tolist()

# upsert (instead of add) keeps re-runs idempotent: a record whose id already
# exists is overwritten rather than duplicated or rejected.
collection.upsert(
    documents=documents,
    embeddings=embeddings,
    ids=[f"doc_{i}" for i in range(len(documents))],
)
# --- 2. Query: embed question, retrieve top chunks ---
def retrieve(query: str, k: int = 3) -> list[str]:
    """Return the stored documents most similar to ``query``.

    Args:
        query: Natural-language question to search with.
        k: Number of results requested from the collection.

    Returns:
        The document texts the collection returned for this query.
    """
    # The query is encoded as a one-item batch, so a one-item list of
    # vectors is what gets passed as ``query_embeddings``.
    query_vectors = embedding_model.encode([query]).tolist()
    hits = collection.query(query_embeddings=query_vectors, n_results=k)
    # Results are grouped per query vector; index 0 is our only query.
    matched_documents = hits["documents"][0]
    return matched_documents
# --- 3. Generate: inject chunks into Claude's context ---
# No key is passed explicitly; the SDK is expected to pick it up from the
# ANTHROPIC_API_KEY environment variable.
client = anthropic.Anthropic()


def rag_query(question: str) -> str:
    """Answer ``question`` with Claude, grounded in retrieved chunks.

    Retrieves the top 3 chunks for the question, numbers them, places them in
    the system prompt, and sends the question as the user message.

    Args:
        question: The user's natural-language question.

    Returns:
        The text of the first content block in Claude's response.
    """
    chunks = retrieve(question, k=3)
    # One numbered chunk per line: "[1] ...", "[2] ...", ...
    # The separator must be the escape sequence "\n"; a physical line break
    # inside a single-quoted string literal is a SyntaxError.
    context = "\n".join(f"[{i}] {chunk}" for i, chunk in enumerate(chunks, start=1))
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=512,
        system=(
            "You are a helpful assistant. Answer the user's question using ONLY "
            "the provided context. If the context does not contain the answer, say so.\n"
            f"Context:\n{context}"
        ),
        messages=[{"role": "user", "content": question}],
    )
    return response.content[0].text


print(rag_query("What model string should I use for the Claude API?"))
# pip install anthropic openai pinecone  (the Pinecone SDK was previously published as pinecone-client)
import os

import anthropic
import openai
from pinecone import Pinecone, ServerlessSpec
# Read the key from the environment rather than hard-coding it: the literal
# string "PINECONE_API_KEY" is only a placeholder and is not a usable
# credential. A missing variable fails fast with KeyError.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index_name = "claude-rag-demo"

# Create index once
existing_index_names = {i.name for i in pc.list_indexes()}
if index_name not in existing_index_names:
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors stored in this index; 1536 is
        # the default output size of text-embedding-3-small, used below.
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)
# Neither client is given a key explicitly; both SDKs are expected to read
# their credentials from the environment.
embed_client = openai.OpenAI()
claude = anthropic.Anthropic()
def embed(texts: list[str]) -> list[list[float]]:
    """Embed ``texts`` with OpenAI's ``text-embedding-3-small`` model.

    Args:
        texts: Strings to embed. May be empty.

    Returns:
        One embedding vector per input string, taken from the response in the
        order the API returned them. An empty input yields an empty list
        without making a request.
    """
    if not texts:
        # Nothing to embed: avoid sending a request with an empty input list.
        return []
    resp = embed_client.embeddings.create(model="text-embedding-3-small", input=texts)
    return [d.embedding for d in resp.data]
def upsert_documents(
    docs: list[str],
    *,
    id_prefix: str = "doc",
    batch_size: int = 100,
) -> None:
    """Embed ``docs`` and upsert them into the Pinecone index.

    Each record is stored as ``(id, vector, {"text": doc})`` where the id is
    ``f"{id_prefix}_{position}"`` and ``position`` is the document's index in
    ``docs``. With the default prefix the ids are ``doc_0``, ``doc_1``, ...

    Args:
        docs: Document texts to store. An empty list is a no-op.
        id_prefix: Prefix for generated ids. Because positions restart at 0 on
            every call, pass a distinct prefix per call (e.g. a file name) to
            avoid overwriting records written by an earlier call.
        batch_size: Maximum number of documents embedded and upserted per
            request, so large inputs are not sent as one oversized payload.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    # range() is empty for empty docs, so no embed/upsert request is made.
    for offset in range(0, len(docs), batch_size):
        batch = docs[offset:offset + batch_size]
        vectors = embed(batch)
        index.upsert(
            vectors=[
                (f"{id_prefix}_{offset + i}", vector, {"text": doc})
                for i, (vector, doc) in enumerate(zip(vectors, batch))
            ]
        )
def rag_query(question: str, k: int = 4) -> str:
    """Answer ``question`` with Claude using chunks retrieved from Pinecone.

    Embeds the question, queries the index for the ``k`` nearest records,
    reads each match's ``"text"`` metadata, and passes the numbered chunks to
    Claude in the system prompt.

    Args:
        question: The user's natural-language question.
        k: Number of matches to request from the index.

    Returns:
        The text of the first content block in Claude's response.
    """
    q_vec = embed([question])[0]
    results = index.query(vector=q_vec, top_k=k, include_metadata=True)
    chunks = [m.metadata["text"] for m in results.matches]
    # One numbered chunk per line. The separator must be the escape sequence
    # "\n"; a physical line break inside the literal is a SyntaxError.
    context = "\n".join(f"[{i}] {c}" for i, c in enumerate(chunks, start=1))
    resp = claude.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        system=f"Answer using only the provided context.\nContext:\n{context}",
        messages=[{"role": "user", "content": question}],
    )
    return resp.content[0].text
def chunk_text(text: str, chunk_size: int = 400, overlap: int = 50) -> list[str]:
    """Split text into overlapping chunks for better retrieval coverage.

    Sizes are measured in whitespace-separated words (``text.split()``), not
    model tokens. Consecutive chunks share ``overlap`` words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words repeated from the end of one chunk at the
            start of the next. Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order; an empty list if ``text`` contains no
        words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``. Without this check a step of zero or
            less would never advance and the loop would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This chunk already reaches the last word. Stepping again would
            # only emit a shorter chunk that repeats the tail of this one.
            break
    return chunks
# For PDF documents:
# pip install pypdf
from pypdf import PdfReader
def pdf_to_chunks(path: str) -> list[str]:
    """Extract the text of a PDF and split it into overlapping chunks.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The chunks produced by ``chunk_text`` (with its default settings) over
        the text of all pages joined by newlines.
    """
    reader = PdfReader(path)
    # extract_text() may yield nothing for a page, hence the `or ""` fallback.
    # The separator must be the escape sequence "\n"; a physical line break
    # inside the literal is a SyntaxError.
    full_text = "\n".join(page.extract_text() or "" for page in reader.pages)
    return chunk_text(full_text)
| Decision | Prototype choice | Production choice |
|---|---|---|
| Embedding model | sentence-transformers (free, local) | OpenAI text-embedding-3-small or Voyage voyage-3 |
| Vector DB | Chroma (in-memory/local) | Pinecone Serverless or pgvector (PostgreSQL) |
| Chunk size | 200-400 tokens | 300-500 tokens with 50-token overlap (note: the chunk_text helper above counts whitespace-separated words, not tokens) |
| Top-k retrieval | 3 | 4-6 with MMR reranking |
| Claude model | claude-haiku-4-5-20251001 (fast/cheap) | claude-sonnet-4-6 (quality/cost balance) |
For PDF document analysis without RAG overhead, see the Claude PDF Analysis guide. For the Files API approach (upload once, reuse), see the Claude Files API example. To estimate token costs for your RAG pipeline, use the Claude API Cost Calculator.