5 min read
Multimodal AI — Vision, Documents & Audio
What is Multimodal AI?
Traditional LLM: text in → text out
Multimodal LLM: text + image + audio + video + PDF → text (or image) out
Current capabilities (2025):
Vision: Analyze images, screenshots, charts, UI wireframes
Documents: Read PDFs, invoices, forms, scanned documents
Audio: Transcribe speech, analyze tone, translate
Video: Describe video frames, analyze recordings
Output: Text + (DALL-E/Stable Diffusion for image generation)

Vision: Image Understanding
python
import anthropic
import base64
from pathlib import Path

client = anthropic.Anthropic()

# 1. URL-based (easiest) — the API fetches the image from the URL itself,
# so nothing has to be downloaded or encoded locally.
chart_image = {
    "type": "image",
    "source": {"type": "url", "url": "https://example.com/chart.png"},
}
chart_question = {
    "type": "text",
    "text": "Analyze this chart. What are the key trends? What conclusions can you draw?",
}

response = client.messages.create(
    model="claude-opus-4-6",
    max_tokens=1024,
    messages=[{"role": "user", "content": [chart_image, chart_question]}],
)
# 2. Base64 (local files, private images)
def encode_image(image_path: str) -> tuple[str, str]:
    """Read an image file and return ``(base64_data, media_type)``.

    The media type is inferred from the file suffix (case-insensitive);
    unknown suffixes fall back to "image/jpeg", matching the original
    behavior.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        Tuple of (base64-encoded file contents as str, MIME media type).
    """
    path = Path(image_path)
    media_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",  # was missing; previously hit the fallback (same value, now explicit)
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    media_type = media_types.get(path.suffix.lower(), "image/jpeg")
    # Path.read_bytes() reuses the Path object built above and closes the
    # file automatically (the original re-opened via the raw string path).
    return base64.standard_b64encode(path.read_bytes()).decode("utf-8"), media_type
image_data, media_type = encode_image("invoice.png")

# Base64 images travel inline in the request body — suitable for local or
# private files that have no public URL.
invoice_image = {
    "type": "image",
    "source": {"type": "base64", "media_type": media_type, "data": image_data},
}
invoice_prompt = {
    "type": "text",
    "text": "Extract all line items from this invoice as JSON: {items: [{description, quantity, unit_price, total}], subtotal, tax, grand_total}",
}

response = client.messages.create(
    model="claude-opus-4-6",
    max_tokens=2048,
    messages=[{"role": "user", "content": [invoice_image, invoice_prompt]}],
)
# 3. OpenAI Vision
from openai import OpenAI

openai_client = OpenAI()

# Base64 images are passed to OpenAI as a data: URL; media_type and
# image_data come from encode_image(...) above.
response = openai_client.chat.completions.create(
    model="gpt-4o",
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {"url": f"data:{media_type};base64,{image_data}"}
            },
            {"type": "text", "text": "Describe any UI issues in this screenshot"}
        ],
    }],
)

Document Understanding (PDFs)
# Option 1: Native PDF support (Claude) — send the raw PDF as base64 in a
# "document" content block; no local text extraction needed.
with open("contract.pdf", "rb") as f:
    pdf_data = base64.standard_b64encode(f.read()).decode("utf-8")

response = client.messages.create(
    model="claude-opus-4-6",
    max_tokens=4096,
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "document",
                "source": {
                    "type": "base64",
                    "media_type": "application/pdf",
                    "data": pdf_data,
                },
            },
            {
                "type": "text",
                "text": "List all parties to this contract, the key obligations of each, and any penalty clauses."
            }
        ],
    }],
)
# Option 2: Document parsing libraries (for long/complex docs)
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

# Extract structured elements (titles, narrative text, tables, images)
# from the PDF.
elements = partition_pdf(
    filename="annual_report.pdf",
    strategy="hi_res",  # Use OCR for scanned PDFs
    extract_images_in_pdf=True,  # Also extract embedded images
    infer_table_structure=True,  # Detect tables
)
# Chunk by logical sections (maintains heading hierarchy)
chunks = chunk_by_title(elements, max_characters=1000)
# Option 3: LlamaParse (cloud service, very accurate)
from llama_parse import LlamaParse

parser = LlamaParse(
    api_key="llx-...",
    result_type="markdown",  # or "text"
    parsing_instruction="Extract all tables and financial figures accurately",
)
# NOTE(review): top-level `await` only works in an async context (e.g. a
# notebook/REPL); in a plain script, wrap this in an async function or use
# asyncio.run(...).
documents = await parser.aload_data("financial_report.pdf")
# Option 4: Docling (open source, runs locally — no data leaves the machine)
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
# `result` is reused just below to export the parsed document as markdown.
result = converter.convert("complex_report.pdf")
markdown_output = result.document.export_to_markdown()

Audio: Speech-to-Text
python
from openai import OpenAI
client = OpenAI()

# 1. Transcription (Whisper)
with open("meeting_recording.mp3", "rb") as fh:
    transcript = client.audio.transcriptions.create(
        model="whisper-1",
        file=fh,
        response_format="verbose_json",  # includes per-segment/word timestamps
        timestamp_granularities=["segment", "word"],
    )

# Walk the timestamped segments of the transcript.
for seg in transcript.segments:
    print(f"[{seg.start:.1f}s - {seg.end:.1f}s] {seg.text}")

# 2. Translation to English
with open("german_call.mp3", "rb") as fh:
    translation = client.audio.translations.create(model="whisper-1", file=fh)

print(translation.text)  # English translation of the German audio
# 3. Process audio in chunks (for long recordings)
from pydub import AudioSegment
import io


def transcribe_long_audio(file_path: str, chunk_minutes: int = 10) -> str:
    """Transcribe a long recording by splitting it into fixed-size chunks.

    The audio is cut into `chunk_minutes`-long windows, each window is
    exported to an in-memory mp3 and sent to Whisper, and the partial
    transcripts are joined with single spaces.
    """
    recording = AudioSegment.from_file(file_path)
    window_ms = chunk_minutes * 60 * 1000

    pieces: list[str] = []
    offset, index = 0, 0
    while offset < len(recording):
        window = recording[offset:offset + window_ms]
        payload = io.BytesIO()
        window.export(payload, format="mp3")
        payload.name = f"chunk_{index}.mp3"  # Required for file type detection
        payload.seek(0)
        part = client.audio.transcriptions.create(
            model="whisper-1",
            file=payload,
        )
        pieces.append(part.text)
        offset += window_ms
        index += 1

    return " ".join(pieces)
# 4. Text-to-speech
speech_response = client.audio.speech.create(
    model="tts-1-hd",  # higher-quality (slower) TTS variant
    voice="alloy",  # alloy, echo, fable, onyx, nova, shimmer
    input="Welcome to our AI assistant. How can I help you today?",
    response_format="mp3",
    speed=1.0,  # playback speed multiplier, 0.25 to 4.0
)
speech_response.stream_to_file("output.mp3")

Production Patterns
Screenshot → Bug Report
async def analyze_screenshot_for_bugs(screenshot_path: str) -> dict:
    """Automatically detect UI issues from a screenshot.

    Encodes the screenshot, asks Claude for a JSON-only issue report, and
    parses the reply into a dict with "issues", "overall_quality" and
    "summary" keys.

    Raises:
        json.JSONDecodeError: if the model reply is not valid JSON.
    """
    import json  # stdlib; the surrounding snippet never imports it at file level

    image_data, media_type = encode_image(screenshot_path)
    # NOTE(review): `client` was last rebound to OpenAI() in the audio section
    # above; this call needs the Anthropic client — confirm which one is in scope.
    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=2048,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image", "source": {"type": "base64", "media_type": media_type, "data": image_data}},
                {"type": "text", "text": """Analyze this UI screenshot for issues.
Return JSON: {
"issues": [{"severity": "critical/high/medium/low", "type": "layout/text/accessibility/data", "description": "...", "location": "..."}],
"overall_quality": "good/needs_improvement/broken",
"summary": "..."
}"""}
            ],
        }],
    )
    return json.loads(response.content[0].text)
# Usage in automated testing
async def visual_regression_test(page_url: str, baseline_path: str) -> bool:
    """Compare a live page screenshot against a stored baseline (sketch/stub).

    NOTE(review): `capture_screenshot` is not defined in this file —
    presumably a browser-automation helper; confirm before use. The body
    ends in `...` and is intentionally incomplete.
    """
    # Capture current screenshot
    current_screenshot = await capture_screenshot(page_url)
    # Ask AI to compare with baseline
    prompt = """Compare these two UI screenshots (baseline vs current).
Are there any visual regressions? Return JSON: {
"has_regression": bool,
"changes": ["description of change"],
"severity": "none/minor/major/breaking"
}"""
    # Pass both images to Claude for comparison
    ...
...

Invoice/Receipt Processing Pipeline
python
from pydantic import BaseModel
class LineItem(BaseModel):
    """A single line item extracted from an invoice image."""

    description: str
    quantity: float
    unit_price: float
    total: float
class InvoiceData(BaseModel):
    """Structured invoice fields the model is asked to extract as JSON."""

    invoice_number: str
    vendor_name: str
    date: str
    line_items: list[LineItem]
    subtotal: float
    tax: float
    total: float
async def process_invoice(image_path: str) -> InvoiceData:
    """Extract structured data from an invoice image.

    The Pydantic JSON schema is embedded in the prompt so the model returns
    JSON matching `InvoiceData`; the reply is then validated by Pydantic.

    Raises:
        json.JSONDecodeError: if the model reply is not valid JSON.
        pydantic.ValidationError: if the JSON does not match the schema.
    """
    import json  # stdlib; the surrounding snippet never imports it at file level

    image_data, media_type = encode_image(image_path)
    # Use structured output: ask for JSON matching the schema exactly.
    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=2048,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image", "source": {"type": "base64", "media_type": media_type, "data": image_data}},
                {"type": "text", "text": f"Extract invoice data as JSON matching this schema: {InvoiceData.model_json_schema()}"}
            ],
        }],
    )
    data = json.loads(response.content[0].text)
return InvoiceData(**data)

Meeting Transcription + Action Items
async def process_meeting_recording(audio_path: str) -> dict:
    """Transcribe a meeting recording and extract decisions/action items.

    Step 1 transcribes the audio via chunked Whisper calls; step 2 asks
    Claude to summarize the transcript as JSON (summary, decisions,
    action_items, open_questions).
    """
    import json  # stdlib; the surrounding snippet never imports it at file level

    # Step 1: Transcribe.
    # BUG FIX: transcribe_long_audio is a plain (sync) function, so it must
    # not be awaited — `await` on its str result raises TypeError.
    transcript = transcribe_long_audio(audio_path)

    # Step 2: Extract structure
    response = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": f"""Analyze this meeting transcript and extract:
1. Key decisions made
2. Action items (owner + deadline if mentioned)
3. Open questions requiring follow-up
4. Summary (3-5 sentences)
Transcript:
{transcript}
Return as JSON: {{
"summary": "...",
"decisions": ["..."],
"action_items": [{{"owner": "...", "task": "...", "deadline": "..."}}],
"open_questions": ["..."]
}}"""
        }],
    )
return json.loads(response.content[0].text)

Multimodal Models Comparison
| Model | Images | PDFs | Audio | Video | Context | Best For |
|---|---|---|---|---|---|---|
| Claude 3.5/4 | ✓ | ✓ | ✗ | ✗ | 200k | Documents, code, analysis |
| GPT-4o | ✓ | ✓* | ✓ (via API) | ✗ | 128k | General vision, audio |
| Gemini 1.5 Pro | ✓ | ✓ | ✓ | ✓ | 1M | Long video, multimodal |
| Gemini 2.0 Flash | ✓ | ✓ | ✓ | ✓ | 1M | Speed + multimodal |
| Whisper | ✗ | ✗ | ✓ | ✗ | – | Audio transcription only |
*GPT-4o with PDF: requires text extraction first (not native PDF)
[prev·next]