Convert natural language questions to SQL queries using Claude API. Schema injection, safety guardrails, multi-table joins, self-healing queries, and BigQuery support — all with Python code examples.
Text-to-SQL is the most common LLM use case for data teams. Claude converts natural-language questions into SQL queries when given schema context — no fine-tuning required. This guide covers everything from a minimal prototype to production-ready patterns with safety guardrails.
import anthropic
client = anthropic.Anthropic()
SCHEMA = """
CREATE TABLE orders (
id INTEGER PRIMARY KEY,
customer_id INTEGER,
total_amount DECIMAL(10,2),
status TEXT, -- 'pending', 'shipped', 'delivered', 'cancelled'
created_at TIMESTAMP
);
CREATE TABLE customers (
id INTEGER PRIMARY KEY,
name TEXT,
email TEXT,
country TEXT
);
"""
def text_to_sql(question: str) -> str:
response = client.messages.create(
model="claude-sonnet-4-6",
max_tokens=512,
system=f"""You are a SQL expert. Given a database schema and a question, return ONLY the SQL query with no explanation.
Database schema:
{SCHEMA}
Rules:
- Only generate SELECT statements.
- Never use DROP, DELETE, UPDATE, INSERT, or any DDL/DML.
- Use standard SQL (compatible with SQLite and PostgreSQL).""",
messages=[{"role": "user", "content": question}],
)
return response.content[0].text.strip()
# Example
sql = text_to_sql("How many orders were placed in the last 30 days by customers from Germany?")
print(sql)
# SELECT COUNT(*) FROM orders o
# JOIN customers c ON o.customer_id = c.id
# WHERE c.country = 'Germany'
# AND o.created_at >= datetime('now', '-30 days')
import json
def text_to_sql_structured(question: str, schema: str) -> dict:
"""Returns {sql, explanation, tables_used, confidence}."""
response = client.messages.create(
model="claude-sonnet-4-6",
max_tokens=1024,
system=f"""You are a SQL expert. Return a JSON object with these keys:
- sql: the SELECT query (string)
- explanation: one-sentence plain-English description of what the query does
- tables_used: list of table names referenced
- confidence: 'high' | 'medium' | 'low' (low if question is ambiguous)
Database schema:
{schema}
Only return valid JSON. Only generate SELECT statements.""",
messages=[{"role": "user", "content": question}],
)
return json.loads(response.content[0].text)
import re
BLOCKED_KEYWORDS = re.compile(
r"(DROP|DELETE|UPDATE|INSERT|TRUNCATE|ALTER|CREATE|REPLACE|MERGE|EXEC|EXECUTE)",
re.IGNORECASE
)
def is_safe_sql(sql: str) -> bool:
"""Return True only if the SQL is a pure SELECT statement."""
stripped = sql.strip().upper()
if not stripped.startswith("SELECT") and not stripped.startswith("WITH"):
return False
if BLOCKED_KEYWORDS.search(sql):
return False
return True
def safe_text_to_sql(question: str, schema: str) -> str:
sql = text_to_sql(question)
if not is_safe_sql(sql):
raise ValueError(f"Unsafe SQL generated: {sql}")
return sql
import sqlite3
def execute_with_healing(question: str, schema: str, db_path: str, max_retries: int = 2) -> list:
"""Generate SQL, execute it, auto-fix on error, return rows."""
conn = sqlite3.connect(db_path)
sql = text_to_sql(question)
history = [{"role": "user", "content": question}, {"role": "assistant", "content": sql}]
for attempt in range(max_retries + 1):
try:
cursor = conn.execute(sql)
return cursor.fetchall()
except sqlite3.Error as e:
if attempt == max_retries:
raise
error_msg = f"This SQL produced an error: {e}
SQL:
{sql}
Please return a corrected SQL query only."
history.append({"role": "user", "content": error_msg})
fix_response = client.messages.create(
model="claude-sonnet-4-6",
max_tokens=512,
system=f"You are a SQL expert. Database schema:
{schema}
Return corrected SQL only.",
messages=history,
)
sql = fix_response.content[0].text.strip()
history.append({"role": "assistant", "content": sql})
conn.close()
def text_to_bigquery_sql(question: str, schema: str) -> str:
response = client.messages.create(
model="claude-sonnet-4-6",
max_tokens=512,
system=f"""You are a BigQuery SQL expert. Return ONLY the SQL query.
Rules:
- Use BigQuery SQL syntax (backtick identifiers, SAFE_DIVIDE, DATE_TRUNC with DATE type, etc.)
- Use standard project.dataset.table notation when possible
- Only SELECT statements; no DDL or DML
Schema:
{schema}""",
messages=[{"role": "user", "content": question}],
)
return response.content[0].text.strip()
| Approach | Accuracy | Setup complexity | Schema size limit |
|---|---|---|---|
| Claude with schema injection (this guide) | High for most queries | Low — just a system prompt | ~200K tokens (hundreds of tables) |
| Fine-tuned model (Spider/BIRD) | Very high on benchmarks | Very high — training pipeline | Fixed at training time |
| Template-based (NLTK/spaCy) | Low — brittle on natural language | Medium | Scales poorly |
| OpenAI GPT-4o | Comparable to Claude | Same as Claude | 128K tokens |
For cost estimates on text-to-SQL pipelines, use the Claude API Cost Calculator. For extracting structured data beyond SQL generation, see the data extraction guide.