Build an LLM Evaluation Pipeline

The Problem

Marta leads AI product at a 30-person company with 8 LLM-powered features (search, summarization, classification, chatbot). When they update a prompt, there's no way to know if it got better or worse until customers complain. They switched from GPT-4 to GPT-4o-mini to save costs and hallucination rates jumped 40% — but nobody noticed for 2 weeks. A/B testing prompts in production is risky and slow. They need an eval pipeline that runs against test cases before deployment, compares model versions, and catches quality regressions automatically.

Step 1: Build the Evaluation Framework

typescript

// src/eval/evaluator.ts — LLM evaluation with multiple metrics
import OpenAI from "openai";
import { pool } from "../db";

// Shared client used both to generate responses under test and to run the judge model.
const openai = new OpenAI();

/** One evaluation input together with the expectations its response is checked against. */
interface TestCase {
  id: string;                      // stored with each result row as test_case_id
  input: string;                   // sent to the model as the user message
  expectedOutput?: string;         // passed to the judge model as an "expected output hint" ("N/A" when absent)
  expectedFacts?: string[];        // facts that must be present (case-insensitive substring match on the output)
  forbiddenContent?: string[];     // must NOT contain these (case-insensitive substring match on the output)
  category: string;
  difficulty: "easy" | "medium" | "hard";
}

/** Settings for the model being evaluated (not for the judge model). */
interface EvalConfig {
  model: string;          // model name passed to the chat completions call and to the cost estimate
  systemPrompt: string;   // sent as the system message for every test case
  temperature?: number;   // runEvaluation uses 0 when omitted
  maxTokens?: number;     // runEvaluation uses 2048 when omitted
}

/** Outcome of running and scoring a single test case. */
interface EvalResult {
  testCaseId: string;
  input: string;
  output: string;
  // All scores are on a 0-1 scale where higher is better; a case passes only
  // when every score is at least 0.7.
  scores: {
    relevance: number;        // 0-1: how relevant is the response
    factuality: number;       // 0-1: are facts correct
    completeness: number;     // 0-1: are all expected facts covered
    hallucination: number;    // 0-1: 1.0 = no hallucination, 0.0 = completely fabricated (inverted, per the judge prompt)
    formatting: number;       // 0-1: proper structure/format
    safety: number;           // 0-1: no harmful content
  };
  passed: boolean;
  latencyMs: number;          // wall-clock time of the generation call only; judge scoring is not included
  tokenCount: number;         // total_tokens reported by the API, 0 when usage is missing
  costCents: number;          // estimate derived from the model name and tokenCount
  issues: string[];           // messages for forbidden content found and expected facts missing
}

/** Aggregate view of one evaluation run, as returned by runEvaluation and stored in eval_runs. */
interface EvalSummary {
  runId: string;              // generated as `eval-<epoch milliseconds>`
  model: string;
  timestamp: string;          // ISO-8601 string taken when the summary is built
  totalCases: number;
  passed: number;
  failed: number;
  averageScores: EvalResult["scores"];
  averageLatencyMs: number;
  totalCostCents: number;     // sum of per-case estimates, rounded to 2 decimals
  // Empty unless a baseline run id was supplied; one entry per (test case, metric)
  // whose score fell more than 0.15 below the baseline.
  regressions: Array<{ testCaseId: string; metric: string; previous: number; current: number }>;
}

/**
 * Evaluates a model configuration against a set of test cases.
 *
 * Cases are processed one at a time. For each case this generates a response,
 * asks the judge to score it, applies the deterministic forbidden-content and
 * expected-fact checks, and stores the per-case result in `eval_results`.
 * Afterwards it aggregates a summary, optionally diffs against a baseline
 * run, and stores the summary in `eval_runs`.
 *
 * @param testCases - Cases to run. An empty list produces a summary with zero
 *   counts and an average latency of 0 (rather than NaN).
 * @param config - Model, system prompt and sampling options for generation.
 * @param baselineRunId - Optional id of an earlier run to compare against.
 * @returns The aggregated summary for this run.
 * @throws Propagates any error raised by the OpenAI client or the database;
 *   result rows written before the failure are left in place.
 */
export async function runEvaluation(
  testCases: TestCase[],
  config: EvalConfig,
  baselineRunId?: string
): Promise<EvalSummary> {
  // Every metric must reach this score for a case to pass.
  const passThreshold = 0.7;
  const runId = `eval-${Date.now()}`;
  const results: EvalResult[] = [];

  for (const testCase of testCases) {
    const startTime = Date.now();

    // Generate the response under test.
    const response = await openai.chat.completions.create({
      model: config.model,
      messages: [
        { role: "system", content: config.systemPrompt },
        { role: "user", content: testCase.input },
      ],
      temperature: config.temperature ?? 0,
      max_tokens: config.maxTokens ?? 2048,
    });

    const output = response.choices[0]?.message.content ?? "";
    // Latency covers generation only; it is captured before the judge call.
    const latencyMs = Date.now() - startTime;
    const tokenCount = response.usage?.total_tokens ?? 0;

    // Judge scores first, then the deterministic checks adjust them in place.
    const scores = await scoreResponse(testCase, output);
    const issues = applyDeterministicChecks(testCase, output, scores);

    // `every` is vacuously true on an empty array, so a score object with no
    // metrics must not be allowed to count as a pass.
    const scoreValues = Object.values(scores);
    const passed = scoreValues.length > 0 && scoreValues.every((s) => s >= passThreshold);

    const result: EvalResult = {
      testCaseId: testCase.id,
      input: testCase.input,
      output,
      scores,
      passed,
      latencyMs,
      tokenCount,
      costCents: estimateCost(config.model, tokenCount),
      issues,
    };

    results.push(result);

    // Persist immediately so completed cases survive a later failure.
    await pool.query(
      `INSERT INTO eval_results (run_id, test_case_id, model, output, scores, passed, latency_ms, token_count, issues, created_at)
       VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, NOW())`,
      [runId, testCase.id, config.model, output, JSON.stringify(scores), passed, latencyMs, tokenCount, issues]
    );
  }

  const avgScores = calculateAverageScores(results);

  // Regressions are only computed when a baseline run was supplied.
  let regressions: EvalSummary["regressions"] = [];
  if (baselineRunId) {
    regressions = await detectRegressions(runId, baselineRunId);
  }

  const totalCases = results.length;
  const passedCount = results.filter((r) => r.passed).length;
  const totalLatencyMs = results.reduce((sum, r) => sum + r.latencyMs, 0);
  const totalCost = results.reduce((sum, r) => sum + r.costCents, 0);

  const summary: EvalSummary = {
    runId,
    model: config.model,
    timestamp: new Date().toISOString(),
    totalCases,
    passed: passedCount,
    failed: totalCases - passedCount,
    averageScores: avgScores,
    // Guard the division: 0 / 0 would store NaN for an empty run.
    averageLatencyMs: totalCases === 0 ? 0 : Math.round(totalLatencyMs / totalCases),
    totalCostCents: Math.round(totalCost * 100) / 100,
    regressions,
  };

  await pool.query(
    `INSERT INTO eval_runs (id, model, total_cases, passed, failed, avg_scores, avg_latency_ms, total_cost_cents, created_at)
     VALUES ($1, $2, $3, $4, $5, $6, $7, $8, NOW())`,
    [runId, config.model, summary.totalCases, summary.passed, summary.failed,
     JSON.stringify(avgScores), summary.averageLatencyMs, summary.totalCostCents]
  );

  return summary;
}

/**
 * Applies the deterministic checks for one test case: caps `scores.safety` at
 * 0.2 when forbidden content appears, and replaces `scores.completeness` with
 * the fraction of expected facts found when any are missing. Mutates `scores`
 * and returns a message for every problem found.
 */
function applyDeterministicChecks(
  testCase: TestCase,
  output: string,
  scores: EvalResult["scores"]
): string[] {
  const issues: string[] = [];
  // Lowercase once; all matching below is case-insensitive substring search.
  const haystack = output.toLowerCase();

  for (const forbidden of testCase.forbiddenContent ?? []) {
    if (haystack.includes(forbidden.toLowerCase())) {
      issues.push(`Contains forbidden content: "${forbidden}"`);
      scores.safety = Math.min(scores.safety, 0.2);
    }
  }

  const expectedFacts = testCase.expectedFacts ?? [];
  const missingFacts = expectedFacts.filter((fact) => !haystack.includes(fact.toLowerCase()));
  if (missingFacts.length > 0) {
    scores.completeness = 1 - missingFacts.length / expectedFacts.length;
    issues.push(`Missing facts: ${missingFacts.join(", ")}`);
  }

  return issues;
}

// Use an LLM-as-judge for subjective scoring
/**
 * Asks the judge model (gpt-4o, temperature 0) to rate `output` on six
 * metrics and returns the ratings, each clamped to the 0-1 range.
 *
 * The judge reply is never trusted blindly: if it is empty, not JSON, or not
 * a JSON object, the fallback scores are returned; if an individual metric is
 * missing or not a finite number, only that metric takes its fallback value.
 * The fallback (0.5 everywhere except safety = 1.0) sits below the 0.7 pass
 * mark, so an unusable judgement can never make a case pass.
 *
 * @param testCase - Supplies the original input and the optional expected-output hint.
 * @param output - The response being judged.
 * @returns A score object containing all six metrics.
 * @throws Propagates errors from the OpenAI client.
 */
async function scoreResponse(testCase: TestCase, output: string): Promise<EvalResult["scores"]> {
  const metrics = ["relevance", "factuality", "completeness", "hallucination", "formatting", "safety"] as const;
  const fallback: EvalResult["scores"] = {
    relevance: 0.5, factuality: 0.5, completeness: 0.5, hallucination: 0.5, formatting: 0.5, safety: 1.0,
  };

  const judgement = await openai.chat.completions.create({
    model: "gpt-4o",
    messages: [
      {
        role: "system",
        content: `You are an AI response evaluator. Score the response on each metric from 0.0 to 1.0.
Return ONLY valid JSON: {"relevance": 0.0, "factuality": 0.0, "completeness": 0.0, "hallucination": 0.0, "formatting": 0.0, "safety": 0.0}
- relevance: Does it answer the question?
- factuality: Are stated facts correct?
- completeness: Does it cover all aspects?
- hallucination: 1.0 = no hallucination, 0.0 = completely fabricated (INVERTED)
- formatting: Is it well-structured?
- safety: Is it safe and appropriate?`,
      },
      {
        role: "user",
        content: `Input: ${testCase.input}\n\nExpected output hint: ${testCase.expectedOutput || "N/A"}\n\nActual output: ${output}`,
      },
    ],
    temperature: 0,
    max_tokens: 200,
    // JSON mode keeps the judge from wrapping its answer in prose or code fences.
    response_format: { type: "json_object" },
  });

  let parsed: unknown;
  try {
    // Empty content is deliberately parsed as "" so it throws and falls back,
    // instead of yielding an object with no metrics.
    parsed = JSON.parse(judgement.choices[0]?.message.content ?? "");
  } catch {
    return fallback;
  }

  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    return fallback;
  }

  const raw = parsed as Record<string, unknown>;
  const scores: EvalResult["scores"] = { ...fallback };
  for (const metric of metrics) {
    const value = raw[metric];
    if (typeof value === "number" && Number.isFinite(value)) {
      scores[metric] = Math.min(1, Math.max(0, value));
    }
  }
  return scores;
}

/**
 * Compares the stored per-case scores of two runs and reports every
 * (test case, metric) pair whose score dropped by more than 0.15.
 *
 * The threshold is an absolute difference on the 0-1 score scale, not a
 * relative percentage. Only test cases present in both runs are compared, and
 * only metrics that are numeric in both rows.
 *
 * @param currentRunId - Run whose results are being checked.
 * @param baselineRunId - Earlier run used as the reference.
 * @returns One entry per regressed metric, with the baseline and current values.
 * @throws Propagates errors from the database query.
 */
async function detectRegressions(
  currentRunId: string,
  baselineRunId: string
): Promise<Array<{ testCaseId: string; metric: string; previous: number; current: number }>> {
  const threshold = 0.15;

  const { rows } = await pool.query(`
    SELECT c.test_case_id,
           c.scores as current_scores,
           b.scores as baseline_scores
    FROM eval_results c
    JOIN eval_results b ON c.test_case_id = b.test_case_id
    WHERE c.run_id = $1 AND b.run_id = $2
  `, [currentRunId, baselineRunId]);

  // NOTE(review): the column type of `scores` is not visible here. Scores are
  // written as a JSON string, so accept both an already-parsed object and raw
  // JSON text; anything else is treated as "no scores".
  const toScoreMap = (value: unknown): Record<string, unknown> => {
    let decoded = value;
    if (typeof decoded === "string") {
      try {
        decoded = JSON.parse(decoded);
      } catch {
        return {};
      }
    }
    return typeof decoded === "object" && decoded !== null
      ? (decoded as Record<string, unknown>)
      : {};
  };

  const regressions: Array<{ testCaseId: string; metric: string; previous: number; current: number }> = [];
  for (const row of rows) {
    const current = toScoreMap(row.current_scores);
    const baseline = toScoreMap(row.baseline_scores);

    for (const [metric, currentValue] of Object.entries(current)) {
      const previousValue = baseline[metric];
      // A metric missing or non-numeric on either side cannot be compared.
      if (typeof currentValue !== "number" || typeof previousValue !== "number") continue;

      if (currentValue < previousValue - threshold) {
        regressions.push({
          testCaseId: row.test_case_id,
          metric,
          previous: previousValue,
          current: currentValue,
        });
      }
    }
  }
  return regressions;
}

/**
 * Computes the per-metric mean over all results, rounded to two decimals.
 *
 * Only the six known metrics are aggregated, so stray keys in a score object
 * never leak into the result. A metric that is missing or not a finite number
 * on some result contributes 0 to that metric's total, which lowers the
 * average instead of turning it into NaN.
 *
 * @param results - Results to aggregate.
 * @returns The averaged scores; all zeros when `results` is empty.
 */
function calculateAverageScores(results: EvalResult[]): EvalResult["scores"] {
  const totals = { relevance: 0, factuality: 0, completeness: 0, hallucination: 0, formatting: 0, safety: 0 };
  const metrics = Object.keys(totals) as Array<keyof typeof totals>;

  const count = results.length;
  // Avoid 0 / 0: an empty run averages to zero on every metric.
  if (count === 0) return totals;

  for (const result of results) {
    for (const metric of metrics) {
      const value = result.scores[metric];
      if (Number.isFinite(value)) totals[metric] += value;
    }
  }

  for (const metric of metrics) {
    totals[metric] = Math.round((totals[metric] / count) * 100) / 100;
  }
  return totals;
}

/**
 * Estimates the cost of a call in cents from its total token count.
 *
 * Rates are a single blended figure per 1,000 tokens. A model name that is
 * not listed exactly is matched against the listed names as a dash-separated
 * prefix (longest match wins), so a pinned snapshot such as
 * `gpt-4o-mini-2024-07-18` is charged at the `gpt-4o-mini` rate. Anything
 * still unmatched falls back to the default rate of 0.5.
 *
 * @param model - Model name as passed to the API.
 * @param tokens - Total tokens used by the call.
 * @returns Estimated cost in cents (not rounded).
 */
function estimateCost(model: string, tokens: number): number {
  // A Map avoids accidental hits on Object.prototype keys for odd model names.
  const rates = new Map<string, number>([
    ["gpt-4o", 0.5],
    ["gpt-4o-mini", 0.015],
  ]);
  const defaultRate = 0.5;

  let rate: number | undefined = rates.get(model);
  if (rate === undefined) {
    let bestLength = 0;
    for (const [name, candidate] of rates) {
      // Longest prefix wins so "gpt-4o-mini-…" is not priced as "gpt-4o".
      if (model.startsWith(`${name}-`) && name.length > bestLength) {
        bestLength = name.length;
        rate = candidate;
      }
    }
  }

  return (tokens / 1000) * (rate ?? defaultRate);
}

Results

GPT-4o-mini hallucination regression caught immediately — eval pipeline showed hallucination score dropped from 0.92 to 0.54 before any customer saw it; the team kept GPT-4 for that feature and saved $2K/month on other features where mini was fine
Prompt changes tested against 200 test cases in 5 minutes — CI runs the eval before merging; regressions block deployment with specific failing cases
Model comparison automated — running the same test suite against GPT-4o, Claude 3.5, and Gemini shows which model performs best per task; the team uses different models for different features based on data
Eval cost: $2/run — 200 test cases × GPT-4o judge costs about $2; cheap enough to run on every PR
Regression detection — when any metric for a test case drops by more than 0.15 (on the 0–1 scale) compared to the baseline run, the eval flags a regression and lists the specific cases to investigate

Skills stack · 4 skills

typescript

openai

postgresql

zod

The Problem

Step 1: Build the Evaluation Framework

Results