Working Python code to extract structured data from unstructured text using the Claude API in 2026. Extract entities, invoice fields, contract terms, and custom schemas to JSON.
Claude is highly accurate at extracting structured data from unstructured text — invoices, contracts, emails, research papers, support tickets. No training data required; just describe your schema in the prompt.
pip install anthropic pydantic
import anthropic
import json
# Module-level API client; the extraction functions below send their requests through it.
client = anthropic.Anthropic()
def extract(text: str, schema_description: str) -> dict:
    """Extract the fields named in *schema_description* from *text* as JSON.

    Args:
        text: Unstructured source text; sent unchanged as the user message.
        schema_description: Plain-English field list (for example
            ``"name (string), email (string)"``) that is interpolated into
            the system prompt.

    Returns:
        The parsed JSON reply. The prompt asks the model to use null for
        fields it cannot find.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON even after an
            enclosing Markdown code fence has been removed.
    """
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        temperature=0,  # extraction wants repeatable output, not variety
        system=(
            "You are a data extraction engine. "
            f"Extract the following fields and return ONLY valid JSON: {schema_description}. "
            "Use null for missing fields."
        ),
        messages=[{"role": "user", "content": text}],
    )
    raw = response.content[0].text.strip()
    if raw.startswith("```"):
        # The prompt asks for bare JSON, but a reply may still arrive wrapped in a
        # Markdown fence (```json ... ```). Drop the opening fence line and the
        # closing fence so json.loads sees only the payload.
        raw = raw.partition("\n")[2].strip().removesuffix("```").strip()
    return json.loads(raw)
# Example: extract contact info from an email signature
# The signature mixes emoji, separators and free-form lines; it is passed to
# extract() as-is, with no pre-cleaning.
email_sig = """
John Smith
Senior Engineer, Acme Corp
📧 john.smith@acme.com | 📞 +1 (555) 123-4567
linkedin.com/in/johnsmith
"""
# The schema is a plain-English field list; extract() interpolates it into its system prompt.
result = extract(
    email_sig,
    "name (string), title (string), company (string), email (string), phone (string), linkedin_url (string)"
)
print(result)
# Illustrative result, shown pretty-printed (print() emits the dict repr on one line):
# {
# "name": "John Smith",
# "title": "Senior Engineer",
# "company": "Acme Corp",
# "email": "john.smith@acme.com",
# "phone": "+1 (555) 123-4567",
# "linkedin_url": "linkedin.com/in/johnsmith"
# }
from pydantic import BaseModel, validator
from typing import Optional
import anthropic, json
# Rebinds `client` with the same construction as the first example
# (presumably repeated so this snippet can be run on its own).
client = anthropic.Anthropic()
# One billed row of an invoice; all four fields are required.
class LineItem(BaseModel):
    description: str  # free-text label of the billed item or service
    quantity: float  # float rather than int, so fractional quantities validate
    unit_price: float  # price per unit as a bare number (no currency symbol)
    total: float  # line total as extracted; not cross-checked against quantity * unit_price here
# Structured view of one invoice, used both to describe the expected JSON to the
# model and to validate the reply.
class Invoice(BaseModel):
    # Every Optional field defaults to None so that a reply which omits a key
    # still validates. Without an explicit default, Pydantic v2 treats an
    # Optional[...] field as required (it may be None, but must be present),
    # whereas Pydantic v1 implied a None default.
    invoice_number: Optional[str] = None
    invoice_date: Optional[str] = None  # ISO 8601
    vendor_name: Optional[str] = None
    vendor_address: Optional[str] = None
    customer_name: Optional[str] = None
    line_items: list[LineItem]  # required; each entry is validated as a LineItem
    subtotal: Optional[float] = None
    tax: Optional[float] = None  # tax amount, not the rate
    total_due: Optional[float] = None
    currency: Optional[str] = None
    due_date: Optional[str] = None  # NOTE(review): presumably ISO 8601 like invoice_date - confirm
def extract_invoice(text: str) -> Invoice:
    """Extract invoice fields from *text* and validate them as an ``Invoice``.

    The JSON schema generated from the ``Invoice`` model is embedded in the
    system prompt so the reply can be validated against the same model.

    Args:
        text: Raw invoice text; sent unchanged as the user message.

    Returns:
        An ``Invoice`` built from the parsed reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON even after an
            enclosing Markdown code fence has been removed.
        pydantic.ValidationError: If the parsed JSON does not satisfy the
            ``Invoice`` model.
    """
    # Pydantic v2 deprecates `schema_json()` in favour of `model_json_schema()`;
    # keep the old call as a fallback so a v1 install still works.
    if hasattr(Invoice, "model_json_schema"):
        schema_json = json.dumps(Invoice.model_json_schema(), indent=2)
    else:
        schema_json = Invoice.schema_json(indent=2)
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=2048,
        temperature=0,  # extraction wants repeatable output, not variety
        system=(
            "You are an invoice extraction engine. "
            f"Extract all invoice fields and return ONLY valid JSON matching this schema:\n{schema_json}. "
            "Use null for missing fields."
        ),
        messages=[{"role": "user", "content": text}],
    )
    raw = response.content[0].text.strip()
    if raw.startswith("```"):
        # The prompt asks for bare JSON, but a reply may still arrive wrapped in a
        # Markdown fence (```json ... ```). Drop the opening fence line and the
        # closing fence so json.loads sees only the payload.
        raw = raw.partition("\n")[2].strip().removesuffix("```").strip()
    data = json.loads(raw)
    return Invoice(**data)  # Pydantic validates types
# Example: parse a plain-text invoice into a validated Invoice model.
# Amounts appear as formatted strings ("$1,200.00") in the text, while the model
# fields are floats, so the reply is expected to carry bare numbers.
invoice_text = """
INVOICE #INV-2026-0042
Date: 2026-05-10 | Due: 2026-06-10
Vendor: TechSupplies Ltd, 123 Main St, Austin TX 78701
Bill To: Acme Corp, 456 Oak Ave, Denver CO 80201
Services:
Cloud storage setup 1 unit $500.00 $500.00
API integration dev 8 hours $150.00 $1,200.00
Support (3 months) 1 unit $300.00 $300.00
Subtotal: $2,000.00 | Tax (8.25%): $165.00 | Total Due: $2,165.00
"""
invoice = extract_invoice(invoice_text)
# Attribute access works because extract_invoice returns an Invoice, not a dict.
print(invoice.total_due) # 2165.0
print(invoice.line_items[0].description) # "Cloud storage setup"
def extract_entities(text: str) -> dict:
    """Extract named entities from *text*, grouped by category.

    Args:
        text: Source text; sent unchanged as the user message.

    Returns:
        The parsed JSON reply. The prompt requests the keys ``people``,
        ``organizations``, ``locations``, ``dates`` and ``monetary_values``,
        each holding a list.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON even after an
            enclosing Markdown code fence has been removed.
    """
    response = client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=512,
        temperature=0,  # extraction wants repeatable output, not variety
        system=(
            "Extract named entities and return ONLY valid JSON: "
            '{"people": [], "organizations": [], "locations": [], "dates": [], "monetary_values": []}'
        ),
        messages=[{"role": "user", "content": text}],
    )
    raw = response.content[0].text.strip()
    if raw.startswith("```"):
        # The prompt asks for bare JSON, but a reply may still arrive wrapped in a
        # Markdown fence (```json ... ```). Drop the opening fence line and the
        # closing fence so json.loads sees only the payload.
        raw = raw.partition("\n")[2].strip().removesuffix("```").strip()
    return json.loads(raw)
# Example: pull people, organizations, locations and amounts out of one news sentence.
# The two adjacent string literals are concatenated into a single sentence.
news = (
    "Apple CEO Tim Cook announced a $500M investment in Austin, Texas on May 10, 2026, "
    "partnering with Dell Technologies to expand AI infrastructure."
)
entities = extract_entities(news)
# Trailing comments show the values this example is expected to produce.
print(entities["people"]) # ["Tim Cook"]
print(entities["organizations"]) # ["Apple", "Dell Technologies"]
print(entities["locations"]) # ["Austin, Texas"]
print(entities["monetary_values"])# ["$500M"]
def extract_contract_terms(contract_text: str, max_chars: int = 8000) -> dict:
    """Extract key commercial and legal terms from a contract as JSON.

    Args:
        contract_text: Full contract text.
        max_chars: Number of leading characters of *contract_text* that are
            sent to the model; anything beyond it is dropped without warning,
            so terms that appear later in a long contract are not seen.
            Defaults to 8000, the limit that was previously hard-coded.

    Returns:
        The parsed JSON reply. The prompt requests the keys ``parties``,
        ``effective_date``, ``termination_date``, ``payment_terms``,
        ``notice_period_days``, ``governing_law``, ``non_compete_months``,
        ``liability_cap`` and ``auto_renewal``.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON even after an
            enclosing Markdown code fence has been removed.
    """
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=2048,
        temperature=0,  # extraction wants repeatable output, not variety
        system=(
            "You are a contract analysis engine. Extract key terms and return ONLY valid JSON: "
            '{"parties": [], "effective_date": null, "termination_date": null, '
            '"payment_terms": null, "notice_period_days": null, '
            '"governing_law": null, "non_compete_months": null, '
            '"liability_cap": null, "auto_renewal": null}'
        ),
        messages=[{"role": "user", "content": contract_text[:max_chars]}],  # trim very long contracts
    )
    raw = response.content[0].text.strip()
    if raw.startswith("```"):
        # The prompt asks for bare JSON, but a reply may still arrive wrapped in a
        # Markdown fence (```json ... ```). Drop the opening fence line and the
        # closing fence so json.loads sees only the payload.
        raw = raw.partition("\n")[2].strip().removesuffix("```").strip()
    return json.loads(raw)
# Batch submission: build one request per document and send them in a single
# Message Batches call.
# Placeholder corpus - replace with the real document texts. The list must hold
# only strings: a literal `...` element is an Ellipsis object and would end up
# as the message content of the last request.
documents = ["invoice text 1...", "invoice text 2..."]  # thousands of docs
requests = [
    {
        "custom_id": f"doc-{i}",  # index-based id, so a result can be matched to documents[i]
        "params": {
            "model": "claude-haiku-4-5-20251001",
            "max_tokens": 512,
            "temperature": 0,
            "system": "Extract vendor, amount, date. Return ONLY JSON: {vendor, amount, date}",
            "messages": [{"role": "user", "content": doc}],
        },
    }
    for i, doc in enumerate(documents)
]
batch = client.messages.batches.create(requests=requests)
print(f"Batch submitted: {batch.id}")  # poll within 24h for 50% cheaper results
| Document type | Avg input tokens | Output tokens | Cost / doc (Sonnet) | Cost / doc (Haiku) |
|---|---|---|---|---|
| Short email | 300 | 150 | $0.00054 | $0.000094 |
| Invoice (1 page) | 800 | 300 | $0.00180 | $0.000275 |
| Contract (10 pages) | 8,000 | 500 | $0.01575 | $0.00213 |
| Research paper | 15,000 | 600 | $0.02850 | $0.00390 |
Use the Claude API Cost Calculator to price your specific document volume. For classification (labeling documents rather than extracting fields), see the text classification guide. For summarizing long documents, see the summarization guide.