Python for LLMs — OpenAI, Anthropic & HuggingFace SDKs

OpenAI SDK

pythonfrom openai import OpenAI, AsyncOpenAI

# Sync client
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
    # base_url="https://api.openai.com/v1",  # custom endpoint / proxy
    # timeout=30,
    # max_retries=3,
)

# Basic chat
def chat(prompt: str, system: str = "You are a helpful assistant.") -> str:
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system},
            {"role": "user",   "content": prompt},
        ],
        temperature=0.7,
        max_tokens=1000,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
    )
    return response.choices[0].message.content

# Multi-turn conversation
def multi_turn():
    messages = [{"role": "system", "content": "You are a helpful assistant."}]

    while True:
        user_input = input("You: ")
        if user_input == "quit": break

        messages.append({"role": "user", "content": user_input})

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages
        )
        assistant_msg = response.choices[0].message.content
        messages.append({"role": "assistant", "content": assistant_msg})
        print(f"AI: {assistant_msg}")

Streaming

python# Sync streaming
def stream_chat(prompt: str):
    stream = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )
    for chunk in stream:
        content = chunk.choices[0].delta.content
        if content:
            print(content, end="", flush=True)

# Async streaming (for FastAPI / web servers)
async def async_stream(prompt: str):
    async_client = AsyncOpenAI()
    async with async_client.chat.completions.stream(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}]
    ) as stream:
        async for chunk in stream:
            content = chunk.choices[0].delta.content
            if content:
                yield content

Function Calling / Tool Use

pythonimport json

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather in a city",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string", "description": "The city name"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["city"],
            },
        }
    }
]

def get_weather(city: str, unit: str = "celsius") -> dict:
    return {"city": city, "temp": 22, "unit": unit, "condition": "sunny"}

def chat_with_tools(user_message: str) -> str:
    messages = [{"role": "user", "content": user_message}]

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        tools=tools,
        tool_choice="auto",  # "none", "auto", or specific tool
    )

    msg = response.choices[0].message

    # If the model wants to call a tool
    if msg.tool_calls:
        messages.append(msg)  # add assistant message with tool_calls

        for tool_call in msg.tool_calls:
            fn_name = tool_call.function.name
            fn_args = json.loads(tool_call.function.arguments)

            # Call the actual function
            if fn_name == "get_weather":
                result = get_weather(**fn_args)

            messages.append({
                "role": "tool",
                "tool_call_id": tool_call.id,
                "content": json.dumps(result),
            })

        # Second call with tool results
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            tools=tools,
        )

    return response.choices[0].message.content

Structured Output (JSON Mode)

pythonfrom pydantic import BaseModel
from typing import List

class SentimentResult(BaseModel):
    sentiment: str           # positive, negative, neutral
    confidence: float
    key_phrases: List[str]

# Option 1: JSON mode (any valid JSON)
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": f"Analyze sentiment: {text}"}],
    response_format={"type": "json_object"},
)
result = json.loads(response.choices[0].message.content)

# Option 2: Structured output with Pydantic (gpt-4o-mini and above)
response = client.beta.chat.completions.parse(
    model="gpt-4o",
    messages=[{"role": "user", "content": f"Analyze: {text}"}],
    response_format=SentimentResult,
)
parsed = response.choices[0].message.parsed
print(parsed.sentiment, parsed.confidence)

Anthropic (Claude) SDK

pythonimport anthropic

client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

# Basic message
def ask_claude(prompt: str) -> str:
    message = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=1024,
        messages=[{"role": "user", "content": prompt}]
    )
    return message.content[0].text

# With system prompt
def ask_claude_with_system(prompt: str, system: str) -> str:
    message = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        system=system,  # system is a top-level param, not a message
        messages=[{"role": "user", "content": prompt}]
    )
    return message.content[0].text

# Streaming
def stream_claude(prompt: str):
    with client.messages.stream(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        messages=[{"role": "user", "content": prompt}]
    ) as stream:
        for text in stream.text_stream:
            print(text, end="", flush=True)

# Async
async def async_claude(prompt: str) -> str:
    async_client = anthropic.AsyncAnthropic()
    message = await async_client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        messages=[{"role": "user", "content": prompt}]
    )
    return message.content[0].text

# Tool use (Claude's version of function calling)
tools = [{
    "name": "search_docs",
    "description": "Search internal documentation",
    "input_schema": {
        "type": "object",
        "properties": {
            "query": {"type": "string"},
            "max_results": {"type": "integer", "default": 5}
        },
        "required": ["query"]
    }
}]

response = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=1024,
    tools=tools,
    messages=[{"role": "user", "content": "What does our docs say about auth?"}]
)

if response.stop_reason == "tool_use":
    for block in response.content:
        if block.type == "tool_use":
            result = search_docs(block.input["query"])
            # Continue conversation with tool result

pythonimport base64

def analyze_image(image_path: str, question: str) -> str:
    with open(image_path, "rb") as f:
        image_data = base64.standard_b64encode(f.read()).decode("utf-8")

    message = client.messages.create(
        model="claude-opus-4-6",
        max_tokens=1024,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": image_data,
                    }
                },
                {"type": "text", "text": question}
            ]
        }]
    )
    return message.content[0].text

# URL-based image
message = client.messages.create(
    model="claude-opus-4-6",
    max_tokens=1024,
    messages=[{
        "role": "user",
        "content": [
            {"type": "image", "source": {"type": "url", "url": "https://..."}},
            {"type": "text", "text": "What is in this image?"}
        ]
    }]
)

HuggingFace Transformers

pythonfrom transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch

# High-level pipeline API (easiest)
# Text generation
generator = pipeline("text-generation", model="gpt2")
result = generator("Once upon a time", max_new_tokens=50, num_return_sequences=3)
for r in result:
    print(r["generated_text"])

# Text classification
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
result = classifier("This movie is amazing!")
# [{"label": "POSITIVE", "score": 0.9998}]

# Named entity recognition
ner = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")
result = ner("Tarun Singh works at Google in San Francisco")
# [{"entity_group": "PER", "word": "Tarun Singh"}, ...]

# Zero-shot classification (no fine-tuning needed)
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
result = classifier(
    "I love playing cricket",
    candidate_labels=["sports", "politics", "technology", "food"]
)
print(result["labels"][0])  # "sports"

# Sentence embeddings
embedder = pipeline("feature-extraction", model="BAAI/bge-small-en-v1.5")
embeddings = embedder(["Hello world", "Hi there"], return_tensors=True)

Low-level API (for fine-tuning / custom models)

pythonmodel_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # fp16 saves memory
    device_map="auto",          # auto-assign to GPU/CPU
)

# Inference
inputs = tokenizer("The capital of France is", return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

HuggingFace Inference API (no local GPU needed)

pythonimport requests

API_URL = "https://api-inference.huggingface.co/models/gpt2"
headers = {"Authorization": f"Bearer {os.environ['HF_API_TOKEN']}"}

def query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()

output = query({"inputs": "The answer to life is", "parameters": {"max_new_tokens": 50}})

sentence-transformers (for embeddings in RAG)

pythonfrom sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("BAAI/bge-large-en-v1.5")

# Encode documents
docs = ["Machine learning is great", "Python is awesome", "I like coffee"]
doc_embeddings = model.encode(docs, normalize_embeddings=True, convert_to_tensor=True)

# Encode query
query = "What programming languages are good?"
query_embedding = model.encode(query, normalize_embeddings=True, convert_to_tensor=True)

# Semantic search
hits = util.semantic_search(query_embedding, doc_embeddings, top_k=2)
for hit in hits[0]:
    print(f"{docs[hit['corpus_id']]}: {hit['score']:.3f}")

# Batch encoding (much faster for large datasets)
batch_size = 64
all_embeddings = model.encode(large_doc_list, batch_size=batch_size, show_progress_bar=True)

Production Patterns

python# 1. Retry with exponential backoff
import time
from openai import RateLimitError, APITimeoutError

def robust_chat(prompt: str, max_retries: int = 3) -> str:
    for attempt in range(max_retries):
        try:
            return client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}]
            ).choices[0].message.content
        except RateLimitError:
            wait = 2 ** attempt + random.uniform(0, 1)
            time.sleep(wait)
        except APITimeoutError:
            if attempt == max_retries - 1: raise
    raise Exception("Max retries exceeded")

# 2. Async batch processing with semaphore
import asyncio

async def process_batch(prompts: list[str], concurrency: int = 5) -> list[str]:
    sem = asyncio.Semaphore(concurrency)
    async_client = AsyncOpenAI()

    async def process_one(prompt: str) -> str:
        async with sem:
            resp = await async_client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}]
            )
            return resp.choices[0].message.content

    return await asyncio.gather(*[process_one(p) for p in prompts])

# 3. Token counting before calling API (avoid expensive overflows)
import tiktoken

enc = tiktoken.encoding_for_model("gpt-4o")
def count_tokens(text: str) -> int:
    return len(enc.encode(text))

def safe_chat(messages: list[dict], max_context: int = 100_000) -> str:
    total_tokens = sum(count_tokens(m["content"]) for m in messages)
    if total_tokens > max_context:
        raise ValueError(f"Context too large: {total_tokens} tokens")
    return client.chat.completions.create(model="gpt-4o", messages=messages).choices[0].message.content

Python for LLMs — OpenAI, Anthropic & HuggingFace SDKs

OpenAI SDK

Streaming

Function Calling / Tool Use

Structured Output (JSON Mode)

Anthropic (Claude) SDK

Multi-modal (Images)

HuggingFace Transformers

Low-level API (for fine-tuning / custom models)

HuggingFace Inference API (no local GPU needed)

sentence-transformers (for embeddings in RAG)

Production Patterns