Python for AI — Interview Questions
Python Fundamentals (AI Context)
Q: What is the GIL and how does it affect AI/ML workloads?
The Global Interpreter Lock is a mutex in CPython that allows only one thread to execute Python bytecode at a time. For CPU-bound ML code (NumPy, PyTorch), the GIL doesn't matter — NumPy/PyTorch release the GIL during C/CUDA operations, so they run in true parallel. For pure Python code (data preprocessing loops), threading is limited — use multiprocessing instead. For I/O-bound work (LLM API calls), asyncio or threading works fine since threads yield the GIL while waiting for network.
python# GIL doesn't affect NumPy (NumPy releases GIL for C operations)
import numpy as np
from concurrent.futures import ThreadPoolExecutor
def compute(arr):
return np.sum(arr ** 2) # releases GIL during this
# These truly run in parallel:
with ThreadPoolExecutor(max_workers=4) as exe:
results = list(exe.map(compute, [np.random.randn(10_000) for _ in range(4)]))
# Pure Python loops — GIL is a bottleneck, use multiprocessing
from concurrent.futures import ProcessPoolExecutor
with ProcessPoolExecutor(max_workers=4) as exe:
results = list(exe.map(slow_python_fn, data))Q: Write a Python function to chunk a list of documents for RAG — fixed size with overlap.
pythondef chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
"""Split text into overlapping chunks of roughly `chunk_size` characters."""
words = text.split()
chunks = []
i = 0
while i < len(words):
chunk = " ".join(words[i : i + chunk_size])
chunks.append(chunk)
i += chunk_size - overlap # step back by overlap amount
return chunks
# Sentence-aware version (don't split mid-sentence)
import re
def chunk_sentences(text: str, max_chars: int = 500, overlap_sentences: int = 1) -> list[str]:
sentences = re.split(r'(?<=[.!?])\s+', text)
chunks = []
current = []
current_len = 0
for sent in sentences:
if current_len + len(sent) > max_chars and current:
chunks.append(" ".join(current))
current = current[-overlap_sentences:] if overlap_sentences else []
current_len = sum(len(s) for s in current)
current.append(sent)
current_len += len(sent)
if current:
chunks.append(" ".join(current))
return chunksQ: How do you implement a simple in-memory vector store in Python (for interviews)?
pythonimport numpy as np
from typing import NamedTuple
class Document(NamedTuple):
id: str
text: str
embedding: np.ndarray
metadata: dict
class SimpleVectorStore:
def __init__(self):
self.docs: list[Document] = []
def add(self, id: str, text: str, embedding: list[float], metadata: dict = {}):
emb = np.array(embedding, dtype=np.float32)
emb = emb / np.linalg.norm(emb) # normalize for cosine similarity
self.docs.append(Document(id, text, emb, metadata))
def search(self, query_embedding: list[float], top_k: int = 5) -> list[tuple[Document, float]]:
q = np.array(query_embedding, dtype=np.float32)
q = q / np.linalg.norm(q)
# Compute cosine similarity to all docs at once (vectorized)
embeddings = np.stack([d.embedding for d in self.docs]) # shape (N, dim)
scores = embeddings @ q # shape (N,) — cosine similarity
top_indices = np.argsort(scores)[::-1][:top_k]
return [(self.docs[i], float(scores[i])) for i in top_indices]
# Usage
store = SimpleVectorStore()
store.add("1", "Python is great", [0.1, 0.2, 0.3, ...])
results = store.search(query_embedding=[0.1, 0.2, 0.3, ...], top_k=3)Q: What is the difference between @staticmethod, @classmethod, and instance methods?
pythonclass EmbeddingModel:
DEFAULT_DIM = 1536
def __init__(self, model_name: str):
self.model_name = model_name
# Instance method — has access to self (instance)
def embed(self, text: str) -> list[float]:
return self._call_api(text)
# Class method — has access to cls (class, not instance)
# Used for alternative constructors
@classmethod
def from_env(cls) -> "EmbeddingModel":
return cls(model_name=os.environ["EMBED_MODEL"])
# Static method — no access to instance or class
# Pure utility function that happens to live here
@staticmethod
def normalize(embedding: list[float]) -> list[float]:
arr = np.array(embedding)
return (arr / np.linalg.norm(arr)).tolist()Q: How would you implement an LRU cache from scratch in Python?
pythonfrom collections import OrderedDict
class LRUCache:
def __init__(self, capacity: int):
self.capacity = capacity
self.cache = OrderedDict() # maintains insertion order
def get(self, key: str) -> str | None:
if key not in self.cache:
return None
self.cache.move_to_end(key) # mark as most recently used
return self.cache[key]
def put(self, key: str, value: str):
if key in self.cache:
self.cache.move_to_end(key)
self.cache[key] = value
if len(self.cache) > self.capacity:
self.cache.popitem(last=False) # evict least recently used (first item)
# Use case: cache LLM responses (same prompt = same response)
response_cache = LRUCache(capacity=1000)
def cached_llm(prompt: str) -> str:
cached = response_cache.get(prompt)
if cached:
return cached
result = call_llm(prompt)
response_cache.put(prompt, result)
return result
# Built-in alternative for pure functions
from functools import lru_cache
@lru_cache(maxsize=1000)
def get_embedding(text: str) -> tuple: # must be hashable → use tuple
return tuple(embed(text))Q: How do you handle rate limits when calling OpenAI/Anthropic APIs in Python?
pythonimport asyncio
import time
import random
from openai import AsyncOpenAI, RateLimitError
async def call_with_backoff(
client: AsyncOpenAI,
prompt: str,
max_retries: int = 5,
base_delay: float = 1.0,
) -> str:
for attempt in range(max_retries):
try:
response = await client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": prompt}]
)
return response.choices[0].message.content
except RateLimitError as e:
if attempt == max_retries - 1:
raise
# Exponential backoff with jitter
delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
print(f"Rate limited. Waiting {delay:.1f}s (attempt {attempt+1}/{max_retries})")
await asyncio.sleep(delay)
# Semaphore limits concurrent requests
sem = asyncio.Semaphore(10) # max 10 concurrent calls
async def process_all(prompts: list[str]) -> list[str]:
client = AsyncOpenAI()
async def safe_call(prompt: str) -> str:
async with sem:
return await call_with_backoff(client, prompt)
return await asyncio.gather(*[safe_call(p) for p in prompts], return_exceptions=True)Q: Explain generators and how they help with large-scale data processing in ML.
python# Memory problem: loading entire dataset at once
dataset = [load_document(path) for path in all_paths] # 10GB in RAM
# Generator solution: lazy evaluation, one item at a time
def load_documents(paths: list[str]):
for path in paths:
yield load_document(path) # loads one, frees previous from memory
# Pipeline of generators (each transforms on-the-fly)
def tokenize_stream(docs):
for doc in docs:
yield tokenize(doc)
def embed_stream(tokens, batch_size=32):
batch = []
for token_list in tokens:
batch.append(token_list)
if len(batch) == batch_size:
yield embed_batch(batch)
batch = []
if batch:
yield embed_batch(batch)
# The full pipeline — processes data WITHOUT loading all into memory
docs = load_documents(all_paths) # generator
tokens = tokenize_stream(docs) # generator
embeddings = embed_stream(tokens, 32) # generator
# Only materializes in memory during iteration
for batch_embeddings in embeddings:
store_in_vector_db(batch_embeddings)Q: What is __slots__ and when would you use it in an ML codebase?
python# Normal class — each instance has a __dict__ (hash map for attributes)
# Heavy when you have millions of instances
class Embedding:
def __init__(self, id: str, vector: list):
self.id = id
self.vector = vector
# sys.getsizeof(Embedding("a", [1,2,3])) ≈ 48 bytes + __dict__ overhead
# __slots__ — replaces __dict__ with fixed-size structure
class EmbeddingSlots:
__slots__ = ("id", "vector") # no __dict__
def __init__(self, id: str, vector: list):
self.id = id
self.vector = vector
# 30-50% less memory, 20-30% faster attribute access
# Tradeoff: can't add new attributes, no __dict__, no multiple inheritance easily
# When to use: when creating millions of lightweight objects
# e.g., storing 1M document metadata objects alongside a vector indexQ: How do you profile and optimize slow Python AI code?
python# 1. cProfile — function-level profiling
import cProfile
cProfile.run("my_function()")
# 2. line_profiler — line-by-line (pip install line_profiler)
from line_profiler import LineProfiler
lp = LineProfiler()
lp.add_function(my_function)
lp.run("my_function(data)")
lp.print_stats()
# 3. memory_profiler — memory usage per line (pip install memory_profiler)
from memory_profiler import profile
@profile
def process_data(df):
...
# 4. timeit — micro benchmarks
import timeit
timeit.timeit("sum(range(1000))", number=10_000)
# 5. Quick wins for AI code
# ❌ Loop over rows
for i, row in df.iterrows():
result = row["a"] + row["b"]
# ✅ Vectorize
result = df["a"] + df["b"]
# ❌ Python for loop over tensor
for i in range(len(tensor)):
result = tensor[i] * 2
# ✅ Vectorized PyTorch
result = tensor * 2
# ❌ One API call per item
results = [call_llm(p) for p in prompts]
# ✅ Async batch
results = await asyncio.gather(*[call_llm(p) for p in prompts])