The Problem
Jake leads backend at a 20-person SaaS. Errors are handled inconsistently: some endpoints return {error: 'Something went wrong'}, others return stack traces, and some crash with 500 and no body. Users see undefined in the UI when an API field is missing. Sentry gets 500 duplicate errors because the same bug fires on every request. There's no difference between "database is down" (retry later) and "invalid input" (don't retry). They need a structured error system: typed errors, consistent API responses, automatic retry classification, user-friendly messages, error deduplication, and graceful fallbacks.
Step 1: Build the Error Boundary Engine
// src/errors/boundary.ts — Structured error handling with retry classification and reporting
import { Redis } from "ioredis";
import { createHash } from "node:crypto";
// Shared Redis client for this module; reportError uses it for the per-fingerprint
// counters ("error:seen:*") and the "error:reports" list.
// NOTE(review): the `!` only silences the type checker. If REDIS_URL is unset,
// `undefined` is passed to the constructor at runtime — confirm that is acceptable.
const redis = new Redis(process.env.REDIS_URL!);
// Structured error types
/**
 * Structured application error.
 *
 * Carries everything needed to turn a failure into an HTTP response
 * (`statusCode`, `code`), to decide whether a retry makes sense
 * (`isRetryable`), to tell expected failures from bugs (`isOperational`),
 * and to group repeated occurrences (`fingerprint`).
 */
export class AppError extends Error {
  /** Machine-readable identifier such as "NOT_FOUND". */
  public readonly code: string;
  /** HTTP status associated with this error; defaults to 500. */
  public readonly statusCode: number;
  /** Whether repeating the failed operation may succeed; defaults to false. */
  public readonly isRetryable: boolean;
  /** true = expected failure, false = bug; defaults to true. */
  public readonly isOperational: boolean;
  /** Arbitrary structured details attached by the thrower; defaults to `{}`. */
  public readonly context: Record<string, any>;
  /**
   * 16 hex characters of SHA-256 over `code`, `message` and the stack frame
   * where the failure originated. Equal fingerprints mean "same error, same place".
   */
  public readonly fingerprint: string;

  /**
   * @param params.message Human-readable description.
   * @param params.code Machine-readable identifier.
   * @param params.statusCode HTTP status; any falsy value falls back to 500.
   * @param params.isRetryable Defaults to false.
   * @param params.isOperational Defaults to true.
   * @param params.context Extra structured details; defaults to `{}`.
   * @param params.cause Underlying error being wrapped, if any.
   */
  constructor(params: {
    message: string;
    code: string;
    statusCode?: number;
    isRetryable?: boolean;
    isOperational?: boolean;
    context?: Record<string, any>;
    cause?: Error;
  }) {
    super(params.message);
    this.name = "AppError";
    this.code = params.code;
    // Kept as `||`: a falsy status such as 0 also falls back to 500.
    this.statusCode = params.statusCode || 500;
    this.isRetryable = params.isRetryable ?? false;
    this.isOperational = params.isOperational ?? true;
    this.context = params.context || {};
    this.cause = params.cause;

    // The stack captured by super() always starts with the "at new AppError"
    // frame, which is identical for every instance. Re-capture it without the
    // constructor frame(s) so the first frame is the code that created the error.
    // captureStackTrace is V8-specific, hence the runtime lookup and guard.
    const captureStackTrace: unknown = Reflect.get(Error, "captureStackTrace");
    if (typeof captureStackTrace === "function") {
      captureStackTrace.call(Error, this, new.target);
    }

    // When wrapping another error, the place where that error was raised
    // identifies the failure better than the place where it was wrapped.
    const originFrame =
      AppError.firstFrame(params.cause?.stack) || AppError.firstFrame(this.stack);

    this.fingerprint = createHash("sha256")
      .update(`${this.code}:${this.message}:${originFrame}`)
      .digest("hex")
      .slice(0, 16);
  }

  /**
   * Returns the first "at …" frame of a stack string, falling back to the
   * second line of the stack, or "" when no usable stack is available.
   */
  private static firstFrame(stack: unknown): string {
    if (typeof stack !== "string") return "";
    const lines = stack.split("\n");
    const frame = lines.find((line) => line.trimStart().startsWith("at "));
    return (frame ?? lines[1] ?? "").trim();
  }
}
// Pre-defined error factories
/**
 * Catalogue of ready-made AppError constructors, one per failure category.
 * Each factory fixes the error code, HTTP status and retry/operational flags
 * so call sites only supply the variable parts.
 */
export const Errors = {
  /** 404 — `resource` (optionally with a quoted `id`) does not exist. */
  notFound(resource: string, id?: string): AppError {
    return new AppError({
      message: `${resource}${id ? ` '${id}'` : ''} not found`,
      code: "NOT_FOUND",
      statusCode: 404,
      isOperational: true,
    });
  },

  /** 400 — input rejected; `details` maps field names to messages. */
  validation(details: Record<string, string>): AppError {
    return new AppError({
      message: "Validation failed",
      code: "VALIDATION_ERROR",
      statusCode: 400,
      isOperational: true,
      context: { details },
    });
  },

  /** 401 — caller is not authenticated; `reason` overrides the default message. */
  unauthorized(reason?: string): AppError {
    return new AppError({
      message: reason || "Authentication required",
      code: "UNAUTHORIZED",
      statusCode: 401,
      isOperational: true,
    });
  },

  /** 403 — caller lacks permission, optionally naming the denied `action`. */
  forbidden(action?: string): AppError {
    return new AppError({
      message: `Permission denied${action ? `: ${action}` : ''}`,
      code: "FORBIDDEN",
      statusCode: 403,
      isOperational: true,
    });
  },

  /** 409 — request conflicts with current state. */
  conflict(message: string): AppError {
    return new AppError({
      message,
      code: "CONFLICT",
      statusCode: 409,
      isOperational: true,
    });
  },

  /** 429 — retryable; `retryAfter` is stored in the error context. */
  rateLimit(retryAfter: number): AppError {
    return new AppError({
      message: "Too many requests",
      code: "RATE_LIMITED",
      statusCode: 429,
      isRetryable: true,
      isOperational: true,
      context: { retryAfter },
    });
  },

  /** 503 — retryable; the named `service` is temporarily unavailable. */
  serviceUnavailable(service: string): AppError {
    return new AppError({
      message: `${service} is temporarily unavailable`,
      code: "SERVICE_UNAVAILABLE",
      statusCode: 503,
      isRetryable: true,
      isOperational: true,
    });
  },

  /** 500 — retryable but non-operational database failure during `operation`. */
  database(operation: string, cause?: Error): AppError {
    return new AppError({
      message: `Database error during ${operation}`,
      code: "DATABASE_ERROR",
      statusCode: 500,
      isRetryable: true,
      isOperational: false,
      cause,
    });
  },

  /**
   * 502 — an upstream `service` answered with `statusCode`. Retryable only
   * when the upstream status is 5xx or higher. Does not set `isOperational`,
   * so the AppError default applies.
   */
  external(service: string, statusCode: number, cause?: Error): AppError {
    return new AppError({
      message: `${service} returned ${statusCode}`,
      code: "EXTERNAL_ERROR",
      statusCode: 502,
      isRetryable: statusCode >= 500,
      context: { service, upstreamStatus: statusCode },
      cause,
    });
  },

  /** 500 — non-retryable, non-operational internal failure. */
  internal(message: string, cause?: Error): AppError {
    return new AppError({
      message,
      code: "INTERNAL_ERROR",
      statusCode: 500,
      isRetryable: false,
      isOperational: false,
      cause,
    });
  },
};
// Error boundary middleware for Hono
/**
 * Creates a middleware that converts anything thrown downstream into a
 * consistent JSON error response:
 *
 *   { error: { code, message, retryable, details?, stack? } }
 *
 * - Non-operational errors have their message replaced by a generic one.
 * - `details` is included only for VALIDATION_ERROR with `context.details`.
 * - A `Retry-After` header is set for retryable errors carrying `context.retryAfter`.
 * - `stack` is included only when NODE_ENV is exactly "development".
 *
 * Reporting is best-effort: if the reporter itself fails, the failure is
 * logged and the client still receives the structured response.
 *
 * NOTE(review): depending on the Hono version, exceptions thrown by handlers
 * may be routed to `app.onError` before this try/catch sees them — confirm
 * this middleware is actually reached in the deployed setup.
 *
 * @returns An async `(c, next)` middleware function.
 */
export function errorBoundary() {
  return async (c: any, next: any) => {
    try {
      await next();
    } catch (error: any) {
      const appError = normalizeError(error);

      // Report to error tracking (deduplicated). A reporting outage must not
      // replace the response the client is waiting for with a second failure.
      try {
        await reportError(appError, {
          method: c.req.method,
          path: c.req.path,
          userId: c.get("userId"),
        });
      } catch (reportingFailure) {
        console.error("errorBoundary: failed to report error", reportingFailure);
      }

      // Build response; internal details of bugs are never shown to clients.
      const response: any = {
        error: {
          code: appError.code,
          message: appError.isOperational ? appError.message : "An unexpected error occurred",
          retryable: appError.isRetryable,
        },
      };

      if (appError.code === "VALIDATION_ERROR" && appError.context.details) {
        response.error.details = appError.context.details;
      }

      if (appError.isRetryable && appError.context.retryAfter) {
        c.header("Retry-After", String(appError.context.retryAfter));
      }

      // Never expose stack traces in production
      if (process.env.NODE_ENV === "development") {
        response.error.stack = appError.stack;
      }

      return c.json(response, appError.statusCode);
    }
  };
}
// Normalize any error into AppError
/**
 * Converts any thrown value into an AppError.
 *
 * Recognized inputs:
 * - AppError instances are returned unchanged.
 * - `null`/`undefined` and primitive throws (`throw "text"`, `throw 42`)
 *   become INTERNAL_ERROR instead of crashing on property access.
 * - PostgreSQL codes 23505 (unique violation) and 23503 (foreign-key violation).
 * - `ECONNREFUSED`, treated as a database connection failure.
 * - Zod validation errors (by `name === "ZodError"` with an `issues` array).
 * - `AbortError` / `TimeoutError` from aborted or timed-out fetch calls.
 * Everything else becomes INTERNAL_ERROR with the original as `cause`.
 *
 * @param error The value caught by a `catch` clause; may be anything.
 * @returns An AppError describing the failure.
 */
function normalizeError(error: any): AppError {
  if (error instanceof AppError) return error;

  // `throw null` / `throw undefined`: nothing to inspect.
  if (error == null) return Errors.internal("Unknown error");

  // Primitive throws carry no properties; keep their text as the message.
  if (typeof error !== "object" && typeof error !== "function") {
    return Errors.internal(String(error) || "Unknown error");
  }

  // PostgreSQL errors
  if (error.code === "23505") return Errors.conflict("Resource already exists");
  if (error.code === "23503") return Errors.validation({ reference: "Referenced resource not found" });
  if (error.code === "ECONNREFUSED") return Errors.database("connection", error);

  // Zod validation errors: flatten issues into a { "path.to.field": message } map.
  if (error.name === "ZodError" && Array.isArray(error.issues)) {
    const details: Record<string, string> = {};
    for (const issue of error.issues) {
      details[issue.path.join(".")] = issue.message;
    }
    return Errors.validation(details);
  }

  // Fetch errors (external service calls). AbortSignal.timeout() rejects with
  // a "TimeoutError", manual AbortController aborts with an "AbortError".
  if (error.name === "AbortError" || error.name === "TimeoutError") {
    return Errors.serviceUnavailable("External service (timeout)");
  }

  // Unknown errors
  return Errors.internal(error.message || "Unknown error", error);
}
// Deduplicated error reporting
/**
 * Queues an error report on the "error:reports" Redis list, deduplicated by
 * `error.fingerprint` within a fixed one-hour window.
 *
 * - 1st occurrence in the window: full report (message, stack, merged context).
 * - 10th and 100th occurrence: a short escalation notice.
 * - Every other occurrence: only the counter is incremented.
 *
 * The window starts when the counter key is created and is NOT extended by
 * later occurrences, so a continuously firing bug is reported again once the
 * hour has passed and "in the last hour" stays accurate.
 *
 * @param error The normalized error to report.
 * @param context Request- or call-site details merged over `error.context`.
 * @throws Whatever the Redis client rejects with; callers decide how to handle it.
 */
async function reportError(error: AppError, context: Record<string, any>): Promise<void> {
  const dedupeWindowSeconds = 3600; // 1 hour dedup window
  const dedupeKey = `error:seen:${error.fingerprint}`;
  const recentCount = await redis.incr(dedupeKey);

  if (recentCount === 1) {
    // INCR just created the key: start the window now. Setting the TTL on
    // every occurrence would keep pushing expiry forward indefinitely.
    // NOTE: if the process dies between INCR and EXPIRE the key has no TTL;
    // this is the known trade-off of the plain INCR/EXPIRE pattern.
    await redis.expire(dedupeKey, dedupeWindowSeconds);

    // First occurrence — send full report
    await redis.rpush("error:reports", JSON.stringify({
      fingerprint: error.fingerprint,
      code: error.code,
      message: error.message,
      statusCode: error.statusCode,
      isOperational: error.isOperational,
      stack: error.stack,
      context: { ...error.context, ...context },
      timestamp: new Date().toISOString(),
    }));
    return;
  }

  if (recentCount === 10 || recentCount === 100) {
    // Milestone alerts for repeat errors
    await redis.rpush("error:reports", JSON.stringify({
      fingerprint: error.fingerprint,
      message: `Error ${error.code} occurred ${recentCount} times in the last hour`,
      escalation: true,
      timestamp: new Date().toISOString(),
    }));
  }
}
// Retry wrapper with exponential backoff
/**
 * Runs `fn`, retrying with exponential backoff while it fails with a
 * retryable error.
 *
 * Failures are normalized to AppError. A non-retryable error, or a failure on
 * the final attempt, is rethrown (as the normalized AppError). Before each
 * retry `onRetry` is called with the error and the 1-based retry number, then
 * the function waits `baseDelayMs * 2^attempt` milliseconds.
 *
 * @param fn Operation to execute.
 * @param options.maxRetries Retries after the first attempt; defaults to 3.
 * @param options.baseDelayMs Delay before the first retry; defaults to 1000.
 * @param options.onRetry Optional hook invoked before each wait.
 * @returns The value `fn` resolves with.
 * @throws AppError when `fn` keeps failing or fails non-retryably.
 */
export async function withRetry<T>(
  fn: () => Promise<T>,
  options?: { maxRetries?: number; baseDelayMs?: number; onRetry?: (error: Error, attempt: number) => void }
): Promise<T> {
  const maxRetries = options?.maxRetries ?? 3;
  const baseDelay = options?.baseDelayMs ?? 1000;

  let attempt = 0;
  while (attempt <= maxRetries) {
    try {
      return await fn();
    } catch (error: any) {
      const failure = error instanceof AppError ? error : normalizeError(error);
      const outOfAttempts = attempt === maxRetries;
      if (outOfAttempts || !failure.isRetryable) {
        throw failure;
      }

      options?.onRetry?.(failure, attempt + 1);

      // Back off: baseDelay, 2x, 4x, ...
      const waitMs = baseDelay * Math.pow(2, attempt);
      await new Promise((resolve) => setTimeout(resolve, waitMs));
    }
    attempt += 1;
  }

  // Only reachable when the loop body never ran.
  throw Errors.internal("Retry exhausted"); // should never reach here
}
// Graceful fallback wrapper
/**
 * Runs `fn` and, if it fails, returns a fallback instead of propagating the error.
 *
 * The failure is reported (deduplicated) unless `options.reportError` is
 * `false`. Reporting is best-effort: if normalizing or reporting the error
 * itself fails, that failure is logged and the fallback is still returned —
 * otherwise an outage of the reporting backend would defeat the fallback.
 *
 * @param fn Primary operation.
 * @param fallback A value, or an async function producing one. Any function
 *   passed here is invoked, so `T` itself should not be a function type.
 * @param options.reportError Set to `false` to skip reporting; defaults to reporting.
 * @returns The result of `fn`, or the fallback value when `fn` fails.
 * @throws Only if the fallback function itself throws.
 */
export async function withFallback<T>(
  fn: () => Promise<T>,
  fallback: T | (() => Promise<T>),
  options?: { reportError?: boolean }
): Promise<T> {
  try {
    return await fn();
  } catch (error: any) {
    if (options?.reportError !== false) {
      try {
        const appError = normalizeError(error);
        await reportError(appError, { fallbackUsed: true });
      } catch (reportingFailure) {
        // Never let a reporting problem turn a recoverable failure into a hard one.
        console.error("withFallback: failed to report error", reportingFailure);
      }
    }
    return typeof fallback === "function" ? await (fallback as () => Promise<T>)() : fallback;
  }
}
Results
- Consistent API responses — every error returns `{error: {code, message, retryable}}`; frontend renders user-friendly messages; no more `undefined` or stack traces
- Retry classification — 503 and database errors tagged as retryable; clients implement automatic retry; 400 validation errors not retried; saves server resources
- Error deduplication — same bug fires 500 times; Sentry gets 1 report + "occurred 100 times" alert; engineers see one issue, not 500 copies
- Graceful fallbacks — recommendation service down → return cached popular items instead of error page; users never know a service failed
- Zero stack trace leaks — production responses hide internal details; development mode shows full stack; security audit passed