## Installation

```bash
# Install the Modal client and authenticate
pip install modal
modal setup  # Opens browser for authentication
```
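On a headless machine (CI, a remote server) where `modal setup` cannot open a browser, a token created in the Modal dashboard can be supplied directly. A minimal sketch; the token values below are placeholders:

```bash
# Non-interactive authentication, e.g. for CI (token values are placeholders)
modal token set --token-id ak-... --token-secret as-...
```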
## Hello World — Run a Function on GPU

```python
# hello_gpu.py — Run a simple function on a cloud GPU
import modal

app = modal.App("hello-gpu")

@app.function(gpu="T4")
def gpu_info():
    import subprocess

    # Query the GPU that Modal attached to this container
    result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
    return result.stdout

@app.local_entrypoint()
def main():
    # .remote() executes the function in the cloud and returns the result
    print(gpu_info.remote())
```
```bash
# Run it
modal run hello_gpu.py
```
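The `gpu` argument accepts other accelerator types, and a `"TYPE:COUNT"` string requests multiple GPUs per container. A brief sketch of the variations (app name and function bodies are illustrative):

```python
# Sketch: other GPU requests using the same decorator
import modal

app = modal.App("gpu-variants")

@app.function(gpu="A10G")      # a single A10G instead of a T4
def on_a10g():
    ...

@app.function(gpu="H100:2")    # "TYPE:COUNT" requests two H100s in one container
def on_two_h100s():
    ...
```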
## Custom Container Images

```python
# custom_image.py — Define a custom container with ML dependencies
import modal

image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "torch==2.1.0",
    "transformers==4.36.0",
    "accelerate",
    "bitsandbytes",
)

app = modal.App("ml-inference", image=image)

@app.function(gpu="A100", timeout=300)
def generate(prompt: str) -> str:
    from transformers import pipeline

    pipe = pipeline("text-generation", model="mistralai/Mistral-7B-v0.1", device=0)
    result = pipe(prompt, max_new_tokens=200)
    return result[0]["generated_text"]

@app.local_entrypoint()
def main():
    print(generate.remote("The meaning of life is"))
```
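Images can be built up from more than pip packages: `apt_install`, `env`, and similar methods chain in the same fluent style. A sketch, with the specific packages and variables chosen only for illustration:

```python
# Sketch of a richer image definition (packages and env vars are examples)
import modal

image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("git", "ffmpeg")           # system packages
    .pip_install("torch==2.1.0")            # Python packages
    .env({"HF_HOME": "/models/hf-cache"})   # environment variables baked into the image
)
```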
## Model Serving with Web Endpoints

```python
# serve_model.py — Deploy an inference API with automatic scaling
import modal

app = modal.App("llm-server")
image = modal.Image.debian_slim().pip_install("vllm")

@app.cls(gpu="A100", image=image, container_idle_timeout=300)
class LLMServer:
    @modal.enter()
    def load_model(self):
        from vllm import LLM

        self.llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2")

    @modal.web_endpoint(method="POST")
    def generate(self, request: dict):
        from vllm import SamplingParams

        params = SamplingParams(temperature=0.7, max_tokens=request.get("max_tokens", 200))
        outputs = self.llm.generate([request["prompt"]], params)
        return {"text": outputs[0].outputs[0].text}
```
```bash
# Deploy the endpoint
modal deploy serve_model.py
# Returns: https://your-username--llm-server-llmserver-generate.modal.run
```
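Once deployed, the endpoint accepts plain JSON over HTTPS. A usage sketch, assuming the placeholder URL printed above:

```bash
# Call the deployed endpoint (URL is the placeholder from the deploy output)
curl -X POST https://your-username--llm-server-llmserver-generate.modal.run \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Write a haiku about GPUs", "max_tokens": 64}'
```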
## Volumes for Persistent Storage

```python
# volume_cache.py — Cache model weights across function invocations
import modal

volume = modal.Volume.from_name("model-cache", create_if_missing=True)
app = modal.App("cached-model")
image = modal.Image.debian_slim().pip_install("huggingface_hub", "torch", "transformers")

@app.function(gpu="A100", volumes={"/models": volume}, image=image, timeout=600)
def run_inference(prompt: str) -> str:
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Weights are loaded from the mounted volume, not re-downloaded each call
    model_path = "/models/mistral-7b"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=100)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
```
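The function above reads weights from `/models/mistral-7b`, so the volume must be populated once before the first call. A sketch of a one-time seeding job; the file name, repo, and path are assumptions chosen to match the example above:

```python
# download_model.py — one-time job to seed the volume (sketch)
import modal

volume = modal.Volume.from_name("model-cache", create_if_missing=True)
app = modal.App("seed-model-cache")
image = modal.Image.debian_slim().pip_install("huggingface_hub")

@app.function(volumes={"/models": volume}, image=image, timeout=1800)
def download():
    from huggingface_hub import snapshot_download

    # Download the weights into the mounted volume path
    snapshot_download("mistralai/Mistral-7B-v0.1", local_dir="/models/mistral-7b")
    volume.commit()  # persist the writes so other functions see them
```

Running it once (e.g. `modal run download_model.py::download`) populates the cache; subsequent `run_inference` calls then load from the volume instead of the network.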
## Parallel Map (Batch Processing)

```python
# batch_process.py — Process many items in parallel across GPUs
import modal

app = modal.App("batch-embeddings")
image = modal.Image.debian_slim().pip_install("sentence-transformers")

@app.function(gpu="T4", image=image, concurrency_limit=10)
def embed(text: str) -> list[float]:
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")
    return model.encode(text).tolist()

@app.local_entrypoint()
def main():
    texts = [f"Document {i}: This is sample text." for i in range(100)]
    # Process all texts in parallel
    results = list(embed.map(texts))
    print(f"Generated {len(results)} embeddings of dim {len(results[0])}")
```
## Scheduled Jobs (Cron)

```python
# scheduled_job.py — Run a function on a schedule
import modal

app = modal.App("daily-training")

@app.function(schedule=modal.Cron("0 2 * * *"), gpu="A100", timeout=3600)
def nightly_finetune():
    """Runs every night at 2 AM UTC."""
    print("Starting nightly fine-tuning job...")
    # training logic here
```
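Cron expressions are one option; `modal.Period` expresses simple intervals. A sketch that could be appended to the same `scheduled_job.py` (the 6-hour interval is arbitrary):

```python
# Interval-based alternative to Cron, in the same app
@app.function(schedule=modal.Period(hours=6))
def periodic_refresh():
    print("Runs every six hours...")
```

Schedules only fire for deployed apps, so use `modal deploy scheduled_job.py` rather than `modal run`.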
## Secrets Management

```python
# secrets.py — Access API keys and secrets securely
import modal

app = modal.App("with-secrets")
# The default container image does not include the openai package, so install it
image = modal.Image.debian_slim().pip_install("openai")

@app.function(image=image, secrets=[modal.Secret.from_name("my-openai-secret")])
def call_openai():
    import os

    from openai import OpenAI

    # OPENAI_API_KEY is injected into the environment from the secret
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    return client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": "Hello"}],
    ).choices[0].message.content
```
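The secret itself is created once, either in the Modal dashboard or from the CLI. A sketch; the key value is a placeholder:

```bash
# Create the secret referenced above (the value is a placeholder)
modal secret create my-openai-secret OPENAI_API_KEY=sk-...
```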
## Key Concepts
- Scales to zero: No cost when idle; containers spin up on demand
- GPU selection: `"T4"`, `"A10G"`, `"A100"`, `"H100"` — pick by workload
- `@modal.enter()`: Runs once when a container starts — ideal for loading models
- Volumes: Persistent storage shared across function calls for model caching
- `.map()`: Fan out work across many containers in parallel
- Web endpoints: Auto-generated HTTPS URLs with optional authentication
- `modal deploy` for persistent endpoints, `modal run` for one-off execution