AI Cost Optimization
At scale, LLM costs explode fast. A RAG app with 10K daily users, each making one ~$0.01 query per day, costs ~$100/day ≈ $3,000/month. These techniques reduce that by 60–90%.
Where the Money Goes
Typical RAG request cost breakdown:
├── Embedding query ~$0.00002 (tiny)
├── Embedding for indexing one-time (amortized)
├── LLM input tokens (context) ~$0.003 (BIGGEST cost)
│ ├── System prompt ~200 tokens
│ ├── Retrieved chunks ~800 tokens
│ └── Chat history ~200 tokens
└── LLM output tokens ~$0.006 (~300 tokens)
Total per request: ~$0.009
1. Token Counting with tiktoken
Know exactly what you're sending before you send it.
import tiktoken
enc = tiktoken.encoding_for_model("gpt-4o")


def count_tokens(text: str, model: str = "gpt-4o") -> int:
    """Return the number of tokens `text` encodes to for `model`.

    Args:
        text: The string to tokenize.
        model: Model name used to select the tiktoken encoding.

    Returns:
        The length of the encoded token sequence.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it cannot map to an
        # encoding; fall back to gpt-4o's encoding so the count is an
        # estimate instead of a crash.
        encoding = tiktoken.get_encoding("o200k_base")
    # disallowed_special=(): text that happens to contain a special-token
    # string (e.g. "<|endoftext|>") is counted as plain text instead of
    # raising ValueError.
    return len(encoding.encode(text, disallowed_special=()))
def count_messages_tokens(messages: list[dict], model: str = "gpt-4o") -> int:
    """Estimate the prompt token count of a chat `messages` list.

    Adds a fixed per-message overhead and a fixed reply-priming overhead to
    the encoded length of each message's "content".

    Args:
        messages: Chat messages; only the "content" key of each is read.
        model: Model name used to select the tiktoken encoding.

    Returns:
        The estimated number of prompt tokens.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: estimate with gpt-4o's encoding rather than fail.
        encoding = tiktoken.get_encoding("o200k_base")

    total = 3  # every reply primed with <|start|>assistant<|message|>
    for message in messages:
        total += 4  # role + message overhead
        # "content" may be present but None (the chat API permits this, e.g.
        # on assistant messages that only carry tool calls); count it as empty.
        content = message.get("content") or ""
        # disallowed_special=(): special-token strings inside user text are
        # counted as ordinary text instead of raising ValueError.
        total += len(encoding.encode(content, disallowed_special=()))
    return total
# Budget-aware retrieval: only retrieve as many chunks as fit in your token budget
def budget_chunks(chunks: list, budget: int = 2000) -> list:
    """Return the longest prefix of `chunks` whose token total fits `budget`.

    Chunks are taken in the given order; selection stops at the first chunk
    that would push the running total past the budget.

    Args:
        chunks: Objects exposing a `page_content` string.
        budget: Maximum total number of tokens to keep.

    Returns:
        The kept chunks, in their original order.
    """
    selected = []
    used = 0
    for candidate in chunks:
        used += count_tokens(candidate.page_content)
        if used > budget:
            # This chunk does not fit; later chunks are not considered.
            return selected
        selected.append(candidate)
    return selected


# 2. Model Routing — Use Small Models When Possible
Route simple queries to cheap models, complex ones to expensive models.
from openai import OpenAI
import re
client = OpenAI()
COMPLEX_SIGNALS = [
    r'\b(analyze|compare|design|architect|tradeoff|explain.*detail)\b',
    r'\b(step.by.step|comprehensive|in.depth)\b',
]


def route_model(
    query: str,
    context_length: int,
    *,
    long_context_threshold: int = 3000,
) -> str:
    """Choose model based on query complexity and context size.

    Args:
        query: The user's question; matched case-insensitively against
            COMPLEX_SIGNALS.
        context_length: Size of the context in whatever unit the caller
            measures it. It is only compared against
            `long_context_threshold`, so both must use the same unit.
        long_context_threshold: Context sizes strictly above this value are
            routed to the large model.

    Returns:
        "gpt-4o" for complex queries or long contexts, otherwise
        "gpt-4o-mini".
    """
    is_complex = any(re.search(p, query, re.I) for p in COMPLEX_SIGNALS)
    is_long_context = context_length > long_context_threshold
    if is_complex or is_long_context:
        return "gpt-4o"  # $5/1M input
    return "gpt-4o-mini"  # $0.15/1M input — 33x cheaper
def generate(query: str, context: str) -> str:
    """Answer `query` from `context` using the model chosen by route_model.

    The context size handed to route_model is the number of
    whitespace-separated words in `context`.

    Args:
        query: The user's question, sent as the user message.
        context: Retrieved text, sent in the system message.

    Returns:
        The content of the first completion choice.
    """
    word_count = len(context.split())
    chat_messages = [
        {"role": "system", "content": f"Context:\n{context}"},
        {"role": "user", "content": query},
    ]
    completion = client.chat.completions.create(
        model=route_model(query, word_count),
        messages=chat_messages,
        max_tokens=512,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
# Result: ~80% of queries go to gpt-4o-mini, saving ~80% of LLM cost
3. Prompt Compression — LLMLingua
LLMLingua uses a small model to compress prompts by 2–10x while preserving semantics.
from llmlingua import PromptCompressor

# Small token-classification model that decides which tokens to keep.
compressor = PromptCompressor(
    model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
    use_llmlingua2=True,
    device_map="cpu",
)

long_context = "..." * 50  # your retrieved chunks

compressed = compressor.compress_prompt(
    context=[long_context],
    question="What is HNSW?",
    rate=0.5,  # keep ~50% of the original tokens (LLMLingua-2 documents `rate`)
)
# compressed["compressed_prompt"] — use this instead of the full context
print(f"Original: {compressed['origin_tokens']} tokens")
print(f"Compressed: {compressed['compressed_tokens']} tokens")
# "ratio" is returned already formatted as a string such as "2.0x"; applying
# a float format spec like :.1f to it would raise ValueError.
print(f"Ratio: {compressed['ratio']}")


# 4. Exact Caching — Cache Identical Requests
Hash the prompt and cache the response. Even a 20% cache hit rate cuts costs significantly.
import hashlib
import json
import redis
# Redis client for the response cache: local server, port 6379, database 0.
r = redis.Redis(host="localhost", port=6379, db=0)
# Expiry given to every cached answer, in seconds.
CACHE_TTL = 60 * 60 * 24 # 24 hours
def cache_key(model: str, messages: list) -> str:
    """Build a deterministic cache key for a (model, messages) request.

    The request is serialized as JSON with sorted keys, so dict key order
    inside `messages` does not change the result.

    Args:
        model: Model name included in the hashed payload.
        messages: JSON-serializable chat messages.

    Returns:
        "llm:" followed by the SHA-256 hex digest of the serialized request.
    """
    request = {"model": model, "messages": messages}
    canonical = json.dumps(request, sort_keys=True).encode()
    digest = hashlib.sha256(canonical).hexdigest()
    return "llm:" + digest
def cached_llm_call(model: str, messages: list) -> str:
    """Return the completion for (model, messages), reusing a cached answer.

    Looks the request up in Redis under cache_key(model, messages). On a miss
    it calls the chat completions API and stores the answer for CACHE_TTL
    seconds.

    Args:
        model: Model name passed to the API and included in the cache key.
        messages: Chat messages passed to the API and included in the cache key.

    Returns:
        The cached or freshly generated answer text.
    """
    key = cache_key(model, messages)
    cached = r.get(key)
    # Compare against None: an empty cached answer (b"") is falsy but is
    # still a hit and must not trigger another paid API call.
    if cached is not None:
        return cached.decode()
    resp = client.chat.completions.create(model=model, messages=messages)
    answer = resp.choices[0].message.content
    # The completion content can be None; Redis cannot store None, so only
    # cache real text.
    if answer is not None:
        r.setex(key, CACHE_TTL, answer)
    return answer


# 5. Semantic Caching — Cache Similar Questions
Embed the query and check if a semantically similar question was already answered.
import numpy as np
from dataclasses import dataclass
@dataclass
class CachedQuery:
    """One semantic-cache entry: a question, its answer, and the question's embedding."""

    # The question text as it was asked.
    query: str
    # The answer stored for that question.
    answer: str
    # Embedding vector of `query`.
    embedding: np.ndarray
class SemanticCache:
def __init__(self, threshold: float = 0.92):
self.cache: list[CachedQuery] = []
self.threshold = threshold
def _embed(self, text: str) -> np.ndarray:
resp = client.embeddings.create(model="text-embedding-3-small", input=text)
emb = np.array(resp.data[0].embedding)
return emb / np.linalg.norm(emb)
def get(self, query: str) -> str | None:
if not self.cache:
return None
q_emb = self._embed(query)
matrix = np.stack([c.embedding for c in self.cache])
sims = matrix @ q_emb
best_idx = np.argmax(sims)
if sims[best_idx] >= self.threshold:
return self.cache[best_idx].answer
return None
def set(self, query: str, answer: str):
self.cache.append(CachedQuery(
query=query,
answer=answer,
embedding=self._embed(query),
))
# Production: use GPTCache or a vector DB for persistent semantic cache
cache = SemanticCache(threshold=0.92)


def answer_with_cache(query: str) -> str:
    """Answer `query`, reusing a cached answer for a similar earlier query.

    On a miss the query goes through rag_pipeline and the result is stored
    in the semantic cache.

    Args:
        query: The user's question.

    Returns:
        The cached or freshly generated answer.
    """
    cached = cache.get(query)
    # Compare against None: an empty-string answer is falsy but is still a
    # hit; treating it as a miss would re-run the pipeline and append a
    # duplicate cache entry on every call.
    if cached is not None:
        return cached  # free! (no LLM call)
    answer = rag_pipeline(query)
    cache.set(query, answer)
    return answer


# GPTCache (production-grade semantic cache):
from gptcache import cache
from gptcache.adapter import openai as gptcache_openai
from gptcache.embedding import Onnx
from gptcache.manager import CacheBase, VectorBase, get_data_manager
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation

# A bare cache.init() only gives an exact-match cache. For a *semantic*
# cache, configure the embedding function, the SQLite + FAISS storage, and
# the similarity evaluation explicitly.
onnx = Onnx()
data_manager = get_data_manager(
    CacheBase("sqlite"),
    VectorBase("faiss", dimension=onnx.dimension),
)
cache.init(
    embedding_func=onnx.to_embeddings,
    data_manager=data_manager,
    similarity_evaluation=SearchDistanceEvaluation(),
)
# Drop-in replacement for openai
resp = gptcache_openai.ChatCompletion.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "What is RAG?"}],
)
# Second identical or similar call is served from cache — $0 cost
6. Streaming for Perceived Performance (Not Cost)
Streaming doesn't reduce cost but dramatically improves perceived latency — users see text flowing instead of waiting.
def stream_answer(query: str, context: str):
    """Yield the answer to `query` piece by piece as the model generates it.

    Args:
        query: The user's question, sent as the user message.
        context: Retrieved text, sent in the system message.

    Yields:
        Each non-empty text fragment from the streamed completion.
    """
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"Context:\n{context}"},
            {"role": "user", "content": query},
        ],
        stream=True,
        max_tokens=512,
    )
    for chunk in stream:
        # A streamed chunk can arrive with an empty `choices` list (for
        # example a usage-only chunk); indexing [0] on it would raise
        # IndexError and abort the stream.
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            yield piece
# FastAPI SSE endpoint
from fastapi.responses import StreamingResponse
def _sse_events(pieces):
    """Wrap each text piece in Server-Sent Events framing."""
    for piece in pieces:
        # An event is one or more "data: ..." lines ended by a blank line. A
        # piece containing newlines is sent as one data line per line so the
        # client reassembles the original text.
        lines = piece.split("\n")
        yield "".join(f"data: {line}\n" for line in lines) + "\n"


@app.get("/answer")
async def answer(q: str):
    """Stream the answer to `q` as Server-Sent Events.

    The response declares media type text/event-stream, so each fragment is
    framed as an SSE `data:` event rather than sent as raw text.
    """
    context = retrieve(q)
    return StreamingResponse(
        _sse_events(stream_answer(q, context)),
        media_type="text/event-stream",
    )


# 7. Anthropic Prompt Caching
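Anthropic's prompt caching stores a marked prompt prefix; when the same prefix is reused within 5 minutes, those tokens are read from the cache at a 90% discount (the initial cache write is billed slightly above the normal input rate).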
import anthropic

client = anthropic.Anthropic()

# Short static instructions; this block carries no cache marker of its own.
instructions_block = {
    "type": "text",
    "text": "You are a helpful assistant for a RAG system.",
}
# Mark the system prompt (static part) for caching
document_block = {
    "type": "text",
    "text": long_document_context,  # large static context
    "cache_control": {"type": "ephemeral"},  # cache this block
}

response = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=1024,
    system=[instructions_block, document_block],
    messages=[{"role": "user", "content": "What is HNSW?"}],
)
# usage.cache_read_input_tokens — tokens served from cache (90% cheaper)
# usage.cache_creation_input_tokens — tokens used to populate cache
print(response.usage)


# Cost Dashboard Query
-- PostgreSQL: daily LLM spend by model
-- One row per (day, model); newest day first and, within a day, the most
-- expensive model first.
SELECT
    date_trunc('day', created_at) AS day,
    model,
    SUM(input_tokens) AS total_input_tokens,
    SUM(output_tokens) AS total_output_tokens,
    SUM(cost_usd) AS total_cost_usd
FROM llm_requests
GROUP BY 1, 2
-- Sort by cost, not by ordinal 4 (output tokens): this is a spend report.
ORDER BY day DESC, total_cost_usd DESC;

-- Cost Reduction Checklist
| Optimization | Expected saving |
|---|---|
| Route 80% traffic to gpt-4o-mini | ~80% |
| Semantic cache with 0.92 threshold | 20–40% of remaining |
| Trim context to 1500 tokens max | 30% on input |
| Prompt caching (Anthropic) | 90% on cached tokens |
| LLMLingua compression (0.5 ratio) | 50% on context tokens |
| Exact cache for FAQ queries | 100% for matched queries |
Stack these and a $3,000/month bill becomes ~$200–400/month.