The Problem
A product company uses LLMs across 8 features: chatbot, summarization, code generation, content moderation, search, classification, translation, and data extraction. Each team calls OpenAI directly with their own API key, no shared caching, no budget limits. Monthly AI spend grew from $2K to $18K in 6 months. When OpenAI had a 2-hour outage, 5 features went down simultaneously. One team's runaway loop burned $3K in a single night. Nobody knows which feature costs what.
Step 1: Request Router with Model Selection
typescript
// src/gateway/router.ts
import { z } from 'zod';
const LLMRequest = z.object({
model: z.string().optional(),
messages: z.array(z.object({ role: z.enum(['system', 'user', 'assistant']), content: z.string() })),
temperature: z.number().min(0).max(2).default(0.7),
maxTokens: z.number().int().positive().default(1000),
teamId: z.string(),
featureId: z.string(),
priority: z.enum(['low', 'medium', 'high']).default('medium'),
cacheTtl: z.number().int().optional(), // seconds, 0 = no cache
});
type LLMRequest = z.infer<typeof LLMRequest>;
interface ModelConfig {
provider: 'openai' | 'anthropic' | 'local';
model: string;
costPer1kInput: number; // dollars
costPer1kOutput: number;
maxTokens: number;
latencyP50Ms: number;
}
const models: Record<string, ModelConfig> = {
'gpt-4o': { provider: 'openai', model: 'gpt-4o', costPer1kInput: 0.0025, costPer1kOutput: 0.01, maxTokens: 128000, latencyP50Ms: 800 },
'gpt-4o-mini': { provider: 'openai', model: 'gpt-4o-mini', costPer1kInput: 0.00015, costPer1kOutput: 0.0006, maxTokens: 128000, latencyP50Ms: 400 },
'claude-sonnet': { provider: 'anthropic', model: 'claude-sonnet-4-20250514', costPer1kInput: 0.003, costPer1kOutput: 0.015, maxTokens: 200000, latencyP50Ms: 900 },
'claude-haiku': { provider: 'anthropic', model: 'claude-haiku-4-20250514', costPer1kInput: 0.0008, costPer1kOutput: 0.004, maxTokens: 200000, latencyP50Ms: 300 },
};
// Smart model selection based on task complexity
export function selectModel(request: LLMRequest): string[] {
const totalChars = request.messages.reduce((s, m) => s + m.content.length, 0);
// Priority-based routing
if (request.priority === 'low') return ['gpt-4o-mini', 'claude-haiku'];
if (request.priority === 'high') return ['gpt-4o', 'claude-sonnet', 'gpt-4o-mini'];
// Complexity-based: short messages → cheap model
if (totalChars < 500 && request.maxTokens < 500) return ['gpt-4o-mini', 'claude-haiku'];
return ['gpt-4o-mini', 'gpt-4o', 'claude-sonnet'];
}
Step 2: Provider Fallback Chain
typescript
// src/gateway/executor.ts
import { Redis } from 'ioredis';
import { createHash } from 'crypto';
const redis = new Redis(process.env.REDIS_URL!);
interface LLMResponse {
content: string;
model: string;
provider: string;
inputTokens: number;
outputTokens: number;
latencyMs: number;
cached: boolean;
costUsd: number;
}
export async function executeLLMRequest(
request: any,
modelChain: string[]
): Promise<LLMResponse> {
// Check cache first
if (request.cacheTtl !== 0) {
const cached = await checkCache(request);
if (cached) return { ...cached, cached: true };
}
// Try each model in the fallback chain
for (let i = 0; i < modelChain.length; i++) {
const modelKey = modelChain[i];
const config = models[modelKey];
if (!config) continue;
// Check if provider is healthy
const healthy = await isProviderHealthy(config.provider);
if (!healthy && i < modelChain.length - 1) continue;
try {
const start = Date.now();
const result = await callProvider(config, request);
const latencyMs = Date.now() - start;
const costUsd = (result.inputTokens / 1000) * config.costPer1kInput +
(result.outputTokens / 1000) * config.costPer1kOutput;
const response: LLMResponse = {
content: result.content,
model: config.model,
provider: config.provider,
inputTokens: result.inputTokens,
outputTokens: result.outputTokens,
latencyMs,
cached: false,
costUsd,
};
// Cache successful response
if (request.cacheTtl !== 0) {
await cacheResponse(request, response, request.cacheTtl ?? 3600);
}
// Record health
await markProviderHealthy(config.provider);
return response;
} catch (err) {
await markProviderUnhealthy(config.provider);
if (i === modelChain.length - 1) throw err;
// Try next model in chain
}
}
throw new Error('All providers failed');
}
async function checkCache(request: any): Promise<LLMResponse | null> {
const key = `llm:cache:${hashRequest(request)}`;
const cached = await redis.get(key);
return cached ? JSON.parse(cached) : null;
}
async function cacheResponse(request: any, response: LLMResponse, ttl: number): Promise<void> {
const key = `llm:cache:${hashRequest(request)}`;
await redis.setex(key, ttl, JSON.stringify(response));
}
function hashRequest(request: any): string {
const payload = JSON.stringify({
messages: request.messages,
temperature: request.temperature,
maxTokens: request.maxTokens,
});
return createHash('sha256').update(payload).digest('hex').slice(0, 16);
}
async function isProviderHealthy(provider: string): Promise<boolean> {
const failures = await redis.get(`llm:health:${provider}:failures`);
return !failures || parseInt(failures) < 3;
}
async function markProviderHealthy(provider: string): Promise<void> {
await redis.del(`llm:health:${provider}:failures`);
}
async function markProviderUnhealthy(provider: string): Promise<void> {
await redis.incr(`llm:health:${provider}:failures`);
await redis.expire(`llm:health:${provider}:failures`, 300); // reset after 5 min
}
async function callProvider(config: any, request: any): Promise<{
content: string; inputTokens: number; outputTokens: number;
}> {
const apiKey = config.provider === 'openai' ? process.env.OPENAI_API_KEY
: config.provider === 'anthropic' ? process.env.ANTHROPIC_API_KEY : '';
if (config.provider === 'openai') {
const res = await fetch('https://api.openai.com/v1/chat/completions', {
method: 'POST',
headers: { Authorization: `Bearer ${apiKey}`, 'Content-Type': 'application/json' },
body: JSON.stringify({
model: config.model, messages: request.messages,
temperature: request.temperature, max_tokens: request.maxTokens,
}),
});
if (!res.ok) throw new Error(`OpenAI ${res.status}`);
const data = await res.json() as any;
return {
content: data.choices[0].message.content,
inputTokens: data.usage.prompt_tokens,
outputTokens: data.usage.completion_tokens,
};
}
if (config.provider === 'anthropic') {
const res = await fetch('https://api.anthropic.com/v1/messages', {
method: 'POST',
headers: {
'x-api-key': apiKey!, 'Content-Type': 'application/json',
'anthropic-version': '2023-06-01',
},
body: JSON.stringify({
model: config.model, messages: request.messages.filter((m: any) => m.role !== 'system'),
system: request.messages.find((m: any) => m.role === 'system')?.content,
max_tokens: request.maxTokens,
}),
});
if (!res.ok) throw new Error(`Anthropic ${res.status}`);
const data = await res.json() as any;
return {
content: data.content[0].text,
inputTokens: data.usage.input_tokens,
outputTokens: data.usage.output_tokens,
};
}
throw new Error(`Unknown provider: ${config.provider}`);
}
const models: Record<string, any> = {
'gpt-4o': { provider: 'openai', model: 'gpt-4o', costPer1kInput: 0.0025, costPer1kOutput: 0.01 },
'gpt-4o-mini': { provider: 'openai', model: 'gpt-4o-mini', costPer1kInput: 0.00015, costPer1kOutput: 0.0006 },
'claude-sonnet': { provider: 'anthropic', model: 'claude-sonnet-4-20250514', costPer1kInput: 0.003, costPer1kOutput: 0.015 },
'claude-haiku': { provider: 'anthropic', model: 'claude-haiku-4-20250514', costPer1kInput: 0.0008, costPer1kOutput: 0.004 },
};
Step 3: Per-Team Budget Control
typescript
// src/gateway/budget.ts
import { Redis } from 'ioredis';
const redis = new Redis(process.env.REDIS_URL!);
interface BudgetConfig {
monthlyLimitUsd: number;
dailyLimitUsd: number;
alertThresholdPercent: number;
}
const teamBudgets: Record<string, BudgetConfig> = {
'chatbot': { monthlyLimitUsd: 5000, dailyLimitUsd: 200, alertThresholdPercent: 80 },
'search': { monthlyLimitUsd: 2000, dailyLimitUsd: 100, alertThresholdPercent: 80 },
'moderation': { monthlyLimitUsd: 1000, dailyLimitUsd: 50, alertThresholdPercent: 80 },
'default': { monthlyLimitUsd: 500, dailyLimitUsd: 30, alertThresholdPercent: 80 },
};
export async function checkBudget(teamId: string, estimatedCostUsd: number): Promise<{
allowed: boolean;
reason?: string;
currentSpend: { daily: number; monthly: number };
}> {
const budget = teamBudgets[teamId] ?? teamBudgets.default;
const day = new Date().toISOString().split('T')[0];
const month = day.slice(0, 7);
const [daily, monthly] = await Promise.all([
redis.get(`budget:${teamId}:${day}`).then(v => parseFloat(v ?? '0')),
redis.get(`budget:${teamId}:${month}`).then(v => parseFloat(v ?? '0')),
]);
if (daily + estimatedCostUsd > budget.dailyLimitUsd) {
return { allowed: false, reason: 'Daily budget exceeded', currentSpend: { daily, monthly } };
}
if (monthly + estimatedCostUsd > budget.monthlyLimitUsd) {
return { allowed: false, reason: 'Monthly budget exceeded', currentSpend: { daily, monthly } };
}
return { allowed: true, currentSpend: { daily, monthly } };
}
export async function recordSpend(teamId: string, costUsd: number): Promise<void> {
const day = new Date().toISOString().split('T')[0];
const month = day.slice(0, 7);
await redis.incrbyfloat(`budget:${teamId}:${day}`, costUsd);
await redis.incrbyfloat(`budget:${teamId}:${month}`, costUsd);
await redis.expire(`budget:${teamId}:${day}`, 86400 * 2);
await redis.expire(`budget:${teamId}:${month}`, 86400 * 35);
}
Results
- Monthly AI spend: dropped from $18K to $6.8K — 62% reduction
- Cache hit rate: 34% of requests served from cache (identical prompts across users)
- Provider uptime: 99.95% (automatic fallback during OpenAI's 2-hour outage — zero downtime)
- Runaway loop protection: budget limits caught 4 incidents, prevented $8K+ waste
- Cost attribution: each team sees real-time spend, drives optimization
- Model routing: 60% of requests use gpt-4o-mini (16x cheaper than gpt-4o) with no quality impact
- Per-feature cost visibility: chatbot=$2.1K, search=$1.8K, moderation=$0.4K, other=$2.5K