Raj is an ML engineer at a legal tech company. The company's lawyers need an LLM that understands legal terminology and can draft contract clauses, two things general-purpose models handle poorly. He'll fine-tune Llama 3.1 8B on their proprietary legal corpus, track everything in W&B, and deploy the model with vLLM for the internal team.
Step 1: Prepare the Training Data
Raj starts by formatting the company's legal Q&A pairs and contract examples into the chat format Llama expects.
# prepare_data.py — Convert legal documents into training format
import json
from datasets import Dataset
raw_examples = [
{
"instruction": "Draft a non-compete clause for a senior engineer in California.",
"response": "Given California's strong public policy against non-compete agreements (Business and Professions Code § 16600), a traditional non-compete clause would likely be unenforceable. Instead, consider a narrowly tailored non-solicitation provision..."
},
{
"instruction": "What is the statute of limitations for breach of contract in New York?",
"response": "Under New York CPLR § 213(2), the statute of limitations for breach of contract is six years from the date of the breach..."
},
]
def format_for_chat(example):
return {
"messages": [
{"role": "system", "content": "You are an expert legal assistant specializing in contract law."},
{"role": "user", "content": example["instruction"]},
{"role": "assistant", "content": example["response"]},
]
}
formatted = [format_for_chat(ex) for ex in raw_examples]
# Save as JSONL
with open("legal_train.jsonl", "w") as f:
for item in formatted:
f.write(json.dumps(item) + "\n")
# Load as Hugging Face Dataset
dataset = Dataset.from_list(formatted)
split = dataset.train_test_split(test_size=0.1, seed=42)
split.save_to_disk("./legal_dataset")
print(f"Train: {len(split['train'])}, Eval: {len(split['test'])}")
Step 2: Fine-Tune with LoRA and Track with W&B
Raj uses LoRA for parameter-efficient fine-tuning: the 8B base model is loaded in 4-bit and only the low-rank adapters are trained, roughly half a percent of the model's parameters. Every metric flows to W&B automatically.
# finetune.py — LoRA fine-tuning of Llama 3.1 with W&B experiment tracking
import os
os.environ["WANDB_PROJECT"] = "legal-llm"
import torch
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,
    TrainingArguments, Trainer,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from datasets import load_from_disk
import wandb
wandb.init(
project="legal-llm",
name="llama-3.1-8b-legal-lora-v1",
tags=["lora", "legal", "llama-3.1"],
)
model_name = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
# Load the base model in 4-bit so it fits comfortably on a single GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
# Configure LoRA
lora_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
r=16,
lora_alpha=32,
lora_dropout=0.05,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)
# Prepare the quantized model for training, then attach the LoRA adapters
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Load and tokenize dataset
dataset = load_from_disk("./legal_dataset")
def tokenize(example):
    # The chat template already adds <|begin_of_text|>, so don't add special tokens again
    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    tokens = tokenizer(text, truncation=True, max_length=2048, padding="max_length", add_special_tokens=False)
    # Train on the input tokens, but mask padding positions out of the loss
    tokens["labels"] = [t if m == 1 else -100 for t, m in zip(tokens["input_ids"], tokens["attention_mask"])]
    return tokens
tokenized = dataset.map(tokenize, remove_columns=["messages"])
# Training arguments
training_args = TrainingArguments(
output_dir="./legal-llm-checkpoints",
num_train_epochs=3,
per_device_train_batch_size=4,
gradient_accumulation_steps=4,
learning_rate=2e-4,
warmup_ratio=0.05,
weight_decay=0.01,
bf16=True,
logging_steps=10,
eval_strategy="steps",
eval_steps=50,
save_strategy="steps",
save_steps=100,
report_to="wandb",
run_name="llama-legal-lora-v1",
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized["train"],
eval_dataset=tokenized["test"],
)
trainer.train()
# Save the LoRA adapter
trainer.model.save_pretrained("./legal-llm-lora")
tokenizer.save_pretrained("./legal-llm-lora")
# Log the model as a W&B artifact
artifact = wandb.Artifact("legal-llm-lora", type="model")
artifact.add_dir("./legal-llm-lora")
wandb.log_artifact(artifact)
wandb.finish()
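Because the adapter is versioned as a W&B artifact, any later run can pull down an exact version instead of depending on whatever happens to be on local disk. A small sketch of that retrieval in a separate script (the artifact name and project match the run above; fetch_adapter.py itself is hypothetical):
# fetch_adapter.py -- pull a specific version of the LoRA adapter back from W&B
import wandb

run = wandb.init(project="legal-llm", job_type="download")
# ":latest" resolves to the newest version; pin an alias such as ":v0" to roll back
artifact = run.use_artifact("legal-llm-lora:latest", type="model")
adapter_dir = artifact.download()  # local directory containing the adapter files
print(f"Adapter downloaded to {adapter_dir}")
run.finish()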
Step 3: Evaluate and Compare in W&B
Before deploying, Raj compares the fine-tuned model against the base model on a set of legal questions, logging both answers side by side in a W&B table.
# evaluate.py — Compare base vs fine-tuned model on legal tasks
import torch
import wandb
from transformers import pipeline
wandb.init(project="legal-llm", name="evaluation", job_type="eval")
test_questions = [
"What constitutes a material breach of contract?",
"Explain the parol evidence rule.",
"What are liquidated damages?",
]
# Load both models in bf16; the LoRA adapter directory is applied on top of the base model
base_pipe = pipeline("text-generation", model="meta-llama/Llama-3.1-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")
ft_pipe = pipeline("text-generation", model="./legal-llm-lora", torch_dtype=torch.bfloat16, device_map="auto")
table = wandb.Table(columns=["question", "base_answer", "finetuned_answer"])
for q in test_questions:
    # Send each question as a chat message so the instruct models see the proper prompt format
    messages = [{"role": "user", "content": q}]
    base_out = base_pipe(messages, max_new_tokens=200)[0]["generated_text"][-1]["content"]
    ft_out = ft_pipe(messages, max_new_tokens=200)[0]["generated_text"][-1]["content"]
    table.add_data(q, base_out, ft_out)
wandb.log({"eval_comparison": table})
wandb.finish()
Step 4: Merge LoRA and Deploy with vLLM
Happy with the results, Raj merges the LoRA adapter into the base model and deploys it with vLLM for high-throughput serving. Merging produces a single standalone checkpoint, so vLLM can serve it like any other Hugging Face model with no adapter handling at inference time.
# merge_and_export.py — Merge LoRA weights into the base model for deployment
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
base_model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.1-8B-Instruct",
torch_dtype=torch.bfloat16,
device_map="auto",
)
model = PeftModel.from_pretrained(base_model, "./legal-llm-lora")
# Merge LoRA weights into base model
merged = model.merge_and_unload()
merged.save_pretrained("./legal-llm-merged")
tokenizer = AutoTokenizer.from_pretrained("./legal-llm-lora")
tokenizer.save_pretrained("./legal-llm-merged")
print("Merged model saved to ./legal-llm-merged")
# deploy.sh — Serve the merged model with vLLM
python -m vllm.entrypoints.openai.api_server \
--model ./legal-llm-merged \
--host 0.0.0.0 \
--port 8000 \
--max-model-len 4096 \
--dtype bfloat16 \
--gpu-memory-utilization 0.9
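Before pointing the team's tools at the endpoint, a quick health check confirms the server is up and reporting the expected model id (vLLM exposes the standard OpenAI-compatible /v1/models route; the URL matches deploy.sh):
# check_server.py -- list the model ids the vLLM server is exposing
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")
for model in client.models.list():
    print(model.id)  # should print ./legal-llm-merged, the path passed to --model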
# test_deployment.py — Test the deployed model using the OpenAI SDK
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")
response = client.chat.completions.create(
model="./legal-llm-merged",
messages=[
{"role": "system", "content": "You are an expert legal assistant."},
{"role": "user", "content": "Draft an indemnification clause for a software vendor agreement."},
],
max_tokens=500,
temperature=0.3,
)
print(response.choices[0].message.content)
What Raj Achieved
After two weeks of iteration tracked entirely in W&B:
- 3 LoRA experiments compared side by side — different learning rates, ranks, and target modules
- Legal accuracy improved 40% over the base Llama model on their internal benchmark
- vLLM serves 150 req/s on a single A100, handling the entire legal team's workload
- Model artifacts versioned in W&B — easy to roll back if quality degrades
- Total training cost: ~$15 in GPU time (4-bit LoRA is remarkably efficient)