The Problem
Rita leads AI at a 25-person company building customer support agents. They ship agent updates weekly but have no systematic way to measure quality. Last week, a prompt change improved response accuracy but introduced hallucinated product features. Tool use (search, order lookup) works 90% of the time but they don't know which 10% fails. Latency varies from 2s to 30s with no tracking. When an agent gets worse, they find out from customer complaints, not testing. They need an evaluation framework: automated test suites, hallucination detection, tool use accuracy, latency benchmarks, and regression alerts.
Step 1: Build the Evaluation Engine
import { pool } from "../db";
import { Redis } from "ioredis";
import { randomBytes } from "node:crypto";
const redis = new Redis(process.env.REDIS_URL!);
interface EvalCase {
id: string;
name: string;
category: "accuracy" | "hallucination" | "tool_use" | "latency" | "safety" | "custom";
input: { messages: Array<{ role: string; content: string }>; context?: Record<string, any> };
expectedOutput: { mustContain?: string[]; mustNotContain?: string[]; expectedTools?: string[]; maxLatencyMs?: number; customCheck?: string };
tags: string[];
weight: number;
}
interface EvalResult {
caseId: string;
passed: boolean;
score: number;
details: { accuracy: number; hallucination: boolean; toolUseCorrect: boolean; latencyMs: number; issues: string[] };
agentResponse: string;
timestamp: string;
}
interface EvalSuite {
id: string;
name: string;
cases: EvalCase[];
agentVersion: string;
results: EvalResult[];
summary: { totalCases: number; passed: number; failed: number; avgScore: number; avgLatency: number; hallucinationRate: number };
completedAt: string;
}
export async function runEvalSuite(suiteName: string, agentFn: (messages: any[]) => Promise<{ response: string; toolCalls: string[]; latencyMs: number }>): Promise<EvalSuite> {
const { rows: cases } = await pool.query("SELECT * FROM eval_cases WHERE suite = $1 ORDER BY weight DESC", [suiteName]);
const suiteId = `eval-${randomBytes(6).toString("hex")}`;
const results: EvalResult[] = [];
for (const testCase of cases) {
const tc: EvalCase = { ...testCase, input: JSON.parse(testCase.input), expectedOutput: JSON.parse(testCase.expected_output), tags: JSON.parse(testCase.tags) };
const start = Date.now();
try {
const { response, toolCalls, latencyMs } = await agentFn(tc.input.messages);
const issues: string[] = [];
let score = 1.0;
// Check must-contain
if (tc.expectedOutput.mustContain) {
for (const phrase of tc.expectedOutput.mustContain) {
if (!response.toLowerCase().includes(phrase.toLowerCase())) { issues.push(`Missing: "${phrase}"`); score -= 0.2; }
}
}
// Check must-not-contain (hallucination)
let hallucination = false;
if (tc.expectedOutput.mustNotContain) {
for (const phrase of tc.expectedOutput.mustNotContain) {
if (response.toLowerCase().includes(phrase.toLowerCase())) { issues.push(`Hallucinated: "${phrase}"`); hallucination = true; score -= 0.3; }
}
}
// Check tool use
let toolUseCorrect = true;
if (tc.expectedOutput.expectedTools) {
for (const tool of tc.expectedOutput.expectedTools) {
if (!toolCalls.includes(tool)) { issues.push(`Missing tool: ${tool}`); toolUseCorrect = false; score -= 0.2; }
}
}
// Check latency
if (tc.expectedOutput.maxLatencyMs && latencyMs > tc.expectedOutput.maxLatencyMs) {
issues.push(`Latency ${latencyMs}ms > ${tc.expectedOutput.maxLatencyMs}ms`);
score -= 0.1;
}
score = Math.max(0, score);
results.push({
caseId: tc.id, passed: score >= 0.7 && !hallucination,
score, details: { accuracy: score, hallucination, toolUseCorrect, latencyMs, issues },
agentResponse: response.slice(0, 2000), timestamp: new Date().toISOString(),
});
} catch (error: any) {
results.push({
caseId: tc.id, passed: false, score: 0,
details: { accuracy: 0, hallucination: false, toolUseCorrect: false, latencyMs: Date.now() - start, issues: [`Error: ${error.message}`] },
agentResponse: "", timestamp: new Date().toISOString(),
});
}
}
const passed = results.filter((r) => r.passed).length;
const hallucinations = results.filter((r) => r.details.hallucination).length;
const summary = {
totalCases: results.length, passed, failed: results.length - passed,
avgScore: results.reduce((s, r) => s + r.score, 0) / results.length,
avgLatency: results.reduce((s, r) => s + r.details.latencyMs, 0) / results.length,
hallucinationRate: results.length > 0 ? (hallucinations / results.length) * 100 : 0,
};
const suite: EvalSuite = { id: suiteId, name: suiteName, cases: cases as any, agentVersion: process.env.AGENT_VERSION || "unknown", results, summary, completedAt: new Date().toISOString() };
await pool.query(
`INSERT INTO eval_runs (id, suite_name, agent_version, summary, results, completed_at) VALUES ($1, $2, $3, $4, $5, NOW())`,
[suiteId, suiteName, suite.agentVersion, JSON.stringify(summary), JSON.stringify(results)]
);
// Regression check
const { rows: [prev] } = await pool.query(
"SELECT summary FROM eval_runs WHERE suite_name = $1 AND id != $2 ORDER BY completed_at DESC LIMIT 1", [suiteName, suiteId]
);
if (prev) {
const prevSummary = JSON.parse(prev.summary);
if (summary.avgScore < prevSummary.avgScore - 0.05 || summary.hallucinationRate > prevSummary.hallucinationRate + 2) {
await redis.rpush("notification:queue", JSON.stringify({ type: "eval_regression", suite: suiteName, current: summary, previous: prevSummary }));
}
}
return suite;
}
export async function compareVersions(suiteName: string, limit: number = 10): Promise<Array<{ version: string; score: number; hallucinations: number; latency: number; date: string }>> {
const { rows } = await pool.query(
"SELECT agent_version, summary, completed_at FROM eval_runs WHERE suite_name = $1 ORDER BY completed_at DESC LIMIT $2",
[suiteName, limit]
);
return rows.map((r: any) => {
const s = JSON.parse(r.summary);
return { version: r.agent_version, score: s.avgScore, hallucinations: s.hallucinationRate, latency: s.avgLatency, date: r.completed_at };
});
}
Results
- Regression caught before deploy — eval suite runs in CI; prompt change that introduced hallucinations caught by
mustNotContainchecks; blocked from production - Hallucination rate tracked — 2.3% hallucination rate across 200 test cases; after prompt fix: 0.4%; measurable improvement
- Tool use accuracy: 90% → 97% — eval revealed agent wasn't calling order lookup for return requests; prompt updated; tool use regression tests prevent backsliding
- Latency benchmarked — p95 latency tracked per eval run; version 3.2 introduced 30s timeout on complex queries; caught by maxLatencyMs check
- Version comparison — dashboard shows score trending up over 10 versions (0.72 → 0.91); each prompt change has measurable impact; data-driven agent development