Terminal.skills
Use Cases/Build an LLM Gateway with Fallback and Cost Control

Build an LLM Gateway with Fallback and Cost Control

Route LLM requests across OpenAI, Anthropic, and open-source models with automatic fallback, per-team budgets, prompt caching, and cost tracking that cut AI spend by 62% while improving reliability to 99.95%.

#redis#caching#database#pub-sub#queues
Works with:claude-codeopenai-codexgemini-clicursor
$

The Problem

A product company uses LLMs across 8 features: chatbot, summarization, code generation, content moderation, search, classification, translation, and data extraction. Each team calls OpenAI directly with their own API key, no shared caching, no budget limits. Monthly AI spend grew from $2K to $18K in 6 months. When OpenAI had a 2-hour outage, 5 features went down simultaneously. One team's runaway loop burned $3K in a single night. Nobody knows which feature costs what.

Step 1: Request Router with Model Selection

typescript
// src/gateway/router.ts
import { z } from 'zod';

const LLMRequest = z.object({
  model: z.string().optional(),
  messages: z.array(z.object({ role: z.enum(['system', 'user', 'assistant']), content: z.string() })),
  temperature: z.number().min(0).max(2).default(0.7),
  maxTokens: z.number().int().positive().default(1000),
  teamId: z.string(),
  featureId: z.string(),
  priority: z.enum(['low', 'medium', 'high']).default('medium'),
  cacheTtl: z.number().int().optional(), // seconds, 0 = no cache
});

type LLMRequest = z.infer<typeof LLMRequest>;

interface ModelConfig {
  provider: 'openai' | 'anthropic' | 'local';
  model: string;
  costPer1kInput: number;    // dollars
  costPer1kOutput: number;
  maxTokens: number;
  latencyP50Ms: number;
}

const models: Record<string, ModelConfig> = {
  'gpt-4o': { provider: 'openai', model: 'gpt-4o', costPer1kInput: 0.0025, costPer1kOutput: 0.01, maxTokens: 128000, latencyP50Ms: 800 },
  'gpt-4o-mini': { provider: 'openai', model: 'gpt-4o-mini', costPer1kInput: 0.00015, costPer1kOutput: 0.0006, maxTokens: 128000, latencyP50Ms: 400 },
  'claude-sonnet': { provider: 'anthropic', model: 'claude-sonnet-4-20250514', costPer1kInput: 0.003, costPer1kOutput: 0.015, maxTokens: 200000, latencyP50Ms: 900 },
  'claude-haiku': { provider: 'anthropic', model: 'claude-haiku-4-20250514', costPer1kInput: 0.0008, costPer1kOutput: 0.004, maxTokens: 200000, latencyP50Ms: 300 },
};

// Smart model selection based on task complexity
export function selectModel(request: LLMRequest): string[] {
  const totalChars = request.messages.reduce((s, m) => s + m.content.length, 0);

  // Priority-based routing
  if (request.priority === 'low') return ['gpt-4o-mini', 'claude-haiku'];
  if (request.priority === 'high') return ['gpt-4o', 'claude-sonnet', 'gpt-4o-mini'];

  // Complexity-based: short messages → cheap model
  if (totalChars < 500 && request.maxTokens < 500) return ['gpt-4o-mini', 'claude-haiku'];
  return ['gpt-4o-mini', 'gpt-4o', 'claude-sonnet'];
}

Step 2: Provider Fallback Chain

typescript
// src/gateway/executor.ts
import { Redis } from 'ioredis';
import { createHash } from 'crypto';

const redis = new Redis(process.env.REDIS_URL!);

interface LLMResponse {
  content: string;
  model: string;
  provider: string;
  inputTokens: number;
  outputTokens: number;
  latencyMs: number;
  cached: boolean;
  costUsd: number;
}

export async function executeLLMRequest(
  request: any,
  modelChain: string[]
): Promise<LLMResponse> {
  // Check cache first
  if (request.cacheTtl !== 0) {
    const cached = await checkCache(request);
    if (cached) return { ...cached, cached: true };
  }

  // Try each model in the fallback chain
  for (let i = 0; i < modelChain.length; i++) {
    const modelKey = modelChain[i];
    const config = models[modelKey];
    if (!config) continue;

    // Check if provider is healthy
    const healthy = await isProviderHealthy(config.provider);
    if (!healthy && i < modelChain.length - 1) continue;

    try {
      const start = Date.now();
      const result = await callProvider(config, request);
      const latencyMs = Date.now() - start;

      const costUsd = (result.inputTokens / 1000) * config.costPer1kInput +
                      (result.outputTokens / 1000) * config.costPer1kOutput;

      const response: LLMResponse = {
        content: result.content,
        model: config.model,
        provider: config.provider,
        inputTokens: result.inputTokens,
        outputTokens: result.outputTokens,
        latencyMs,
        cached: false,
        costUsd,
      };

      // Cache successful response
      if (request.cacheTtl !== 0) {
        await cacheResponse(request, response, request.cacheTtl ?? 3600);
      }

      // Record health
      await markProviderHealthy(config.provider);

      return response;
    } catch (err) {
      await markProviderUnhealthy(config.provider);
      if (i === modelChain.length - 1) throw err;
      // Try next model in chain
    }
  }

  throw new Error('All providers failed');
}

async function checkCache(request: any): Promise<LLMResponse | null> {
  const key = `llm:cache:${hashRequest(request)}`;
  const cached = await redis.get(key);
  return cached ? JSON.parse(cached) : null;
}

async function cacheResponse(request: any, response: LLMResponse, ttl: number): Promise<void> {
  const key = `llm:cache:${hashRequest(request)}`;
  await redis.setex(key, ttl, JSON.stringify(response));
}

function hashRequest(request: any): string {
  const payload = JSON.stringify({
    messages: request.messages,
    temperature: request.temperature,
    maxTokens: request.maxTokens,
  });
  return createHash('sha256').update(payload).digest('hex').slice(0, 16);
}

async function isProviderHealthy(provider: string): Promise<boolean> {
  const failures = await redis.get(`llm:health:${provider}:failures`);
  return !failures || parseInt(failures) < 3;
}

async function markProviderHealthy(provider: string): Promise<void> {
  await redis.del(`llm:health:${provider}:failures`);
}

async function markProviderUnhealthy(provider: string): Promise<void> {
  await redis.incr(`llm:health:${provider}:failures`);
  await redis.expire(`llm:health:${provider}:failures`, 300); // reset after 5 min
}

async function callProvider(config: any, request: any): Promise<{
  content: string; inputTokens: number; outputTokens: number;
}> {
  const apiKey = config.provider === 'openai' ? process.env.OPENAI_API_KEY
    : config.provider === 'anthropic' ? process.env.ANTHROPIC_API_KEY : '';

  if (config.provider === 'openai') {
    const res = await fetch('https://api.openai.com/v1/chat/completions', {
      method: 'POST',
      headers: { Authorization: `Bearer ${apiKey}`, 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model: config.model, messages: request.messages,
        temperature: request.temperature, max_tokens: request.maxTokens,
      }),
    });
    if (!res.ok) throw new Error(`OpenAI ${res.status}`);
    const data = await res.json() as any;
    return {
      content: data.choices[0].message.content,
      inputTokens: data.usage.prompt_tokens,
      outputTokens: data.usage.completion_tokens,
    };
  }

  if (config.provider === 'anthropic') {
    const res = await fetch('https://api.anthropic.com/v1/messages', {
      method: 'POST',
      headers: {
        'x-api-key': apiKey!, 'Content-Type': 'application/json',
        'anthropic-version': '2023-06-01',
      },
      body: JSON.stringify({
        model: config.model, messages: request.messages.filter((m: any) => m.role !== 'system'),
        system: request.messages.find((m: any) => m.role === 'system')?.content,
        max_tokens: request.maxTokens,
      }),
    });
    if (!res.ok) throw new Error(`Anthropic ${res.status}`);
    const data = await res.json() as any;
    return {
      content: data.content[0].text,
      inputTokens: data.usage.input_tokens,
      outputTokens: data.usage.output_tokens,
    };
  }

  throw new Error(`Unknown provider: ${config.provider}`);
}

const models: Record<string, any> = {
  'gpt-4o': { provider: 'openai', model: 'gpt-4o', costPer1kInput: 0.0025, costPer1kOutput: 0.01 },
  'gpt-4o-mini': { provider: 'openai', model: 'gpt-4o-mini', costPer1kInput: 0.00015, costPer1kOutput: 0.0006 },
  'claude-sonnet': { provider: 'anthropic', model: 'claude-sonnet-4-20250514', costPer1kInput: 0.003, costPer1kOutput: 0.015 },
  'claude-haiku': { provider: 'anthropic', model: 'claude-haiku-4-20250514', costPer1kInput: 0.0008, costPer1kOutput: 0.004 },
};

Step 3: Per-Team Budget Control

typescript
// src/gateway/budget.ts
import { Redis } from 'ioredis';

const redis = new Redis(process.env.REDIS_URL!);

interface BudgetConfig {
  monthlyLimitUsd: number;
  dailyLimitUsd: number;
  alertThresholdPercent: number;
}

const teamBudgets: Record<string, BudgetConfig> = {
  'chatbot': { monthlyLimitUsd: 5000, dailyLimitUsd: 200, alertThresholdPercent: 80 },
  'search': { monthlyLimitUsd: 2000, dailyLimitUsd: 100, alertThresholdPercent: 80 },
  'moderation': { monthlyLimitUsd: 1000, dailyLimitUsd: 50, alertThresholdPercent: 80 },
  'default': { monthlyLimitUsd: 500, dailyLimitUsd: 30, alertThresholdPercent: 80 },
};

export async function checkBudget(teamId: string, estimatedCostUsd: number): Promise<{
  allowed: boolean;
  reason?: string;
  currentSpend: { daily: number; monthly: number };
}> {
  const budget = teamBudgets[teamId] ?? teamBudgets.default;
  const day = new Date().toISOString().split('T')[0];
  const month = day.slice(0, 7);

  const [daily, monthly] = await Promise.all([
    redis.get(`budget:${teamId}:${day}`).then(v => parseFloat(v ?? '0')),
    redis.get(`budget:${teamId}:${month}`).then(v => parseFloat(v ?? '0')),
  ]);

  if (daily + estimatedCostUsd > budget.dailyLimitUsd) {
    return { allowed: false, reason: 'Daily budget exceeded', currentSpend: { daily, monthly } };
  }
  if (monthly + estimatedCostUsd > budget.monthlyLimitUsd) {
    return { allowed: false, reason: 'Monthly budget exceeded', currentSpend: { daily, monthly } };
  }

  return { allowed: true, currentSpend: { daily, monthly } };
}

export async function recordSpend(teamId: string, costUsd: number): Promise<void> {
  const day = new Date().toISOString().split('T')[0];
  const month = day.slice(0, 7);

  await redis.incrbyfloat(`budget:${teamId}:${day}`, costUsd);
  await redis.incrbyfloat(`budget:${teamId}:${month}`, costUsd);
  await redis.expire(`budget:${teamId}:${day}`, 86400 * 2);
  await redis.expire(`budget:${teamId}:${month}`, 86400 * 35);
}

Results

  • Monthly AI spend: dropped from $18K to $6.8K — 62% reduction
  • Cache hit rate: 34% of requests served from cache (identical prompts across users)
  • Provider uptime: 99.95% (automatic fallback during OpenAI's 2-hour outage — zero downtime)
  • Runaway loop protection: budget limits caught 4 incidents, prevented $8K+ waste
  • Cost attribution: each team sees real-time spend, drives optimization
  • Model routing: 60% of requests use gpt-4o-mini (16x cheaper than gpt-4o) with no quality impact
  • Per-feature cost visibility: chatbot=$2.1K, search=$1.8K, moderation=$0.4K, other=$2.5K