The Problem
Ivan leads ops at a 25-person company with dev, staging, and production environments. Someone changed a database config directly in production, bypassing CI/CD — it works, but staging never received the change. A firewall rule was added to production 3 months ago; nobody remembers why, and it's not in any config file. Environment variables differ between staging and prod — 12 of 80 vars don't match, causing intermittent bugs. They need drift detection: snapshot configs per environment, compare snapshots, detect unauthorized changes, alert on drift, and generate remediation plans.
Step 1: Build the Drift Detector
import { pool } from "../db";
import { Redis } from "ioredis";
import { createHash, randomBytes } from "node:crypto";
// Module-wide Redis client; detectDrift uses it to enqueue drift notifications.
// The connection string comes from REDIS_URL. The `!` only silences the compiler,
// so an unset variable is NOT detected at this point.
const redis = new Redis(process.env.REDIS_URL!);
/**
 * One captured configuration category for one environment, in the shape
 * returned by captureSnapshot.
 */
interface ConfigSnapshot {
  /** Snapshot identifier; captureSnapshot generates `snap-` plus 12 random hex characters. */
  id: string;
  /** Environment the config was captured from. */
  environment: string;
  /** Config category this snapshot covers (the top-level key of the captured configs). */
  category: string;
  /** The captured key/value pairs for this category. */
  config: { [key: string]: any };
  /** Leading 16 hex characters of the SHA-256 digest of the JSON-serialized config. */
  checksum: string;
  /** ISO-8601 timestamp of when the snapshot object was built. */
  capturedAt: string;
  /** Who (or what) triggered the capture. */
  capturedBy: string;
}
/** Outcome of comparing the latest snapshots of two environments (see detectDrift). */
interface DriftResult {
  /** First environment of the comparison; `value1` in each drift belongs to it. */
  environment1: string;
  /** Second environment of the comparison; `value2` in each drift belongs to it. */
  environment2: string;
  /** One entry per key whose value differs between the two environments. */
  drifts: {
    /** Config key that differs. */
    key: string;
    /** Category the key belongs to. */
    category: string;
    /** Value in environment1; `undefined` when the key is absent there. */
    value1: any;
    /** Value in environment2; `undefined` when the key is absent there. */
    value2: any;
    /** Severity derived from the key name. */
    severity: "critical" | "warning" | "info";
    /** Human-readable remediation hint. */
    recommendation: string;
  }[];
  /** Number of keys examined across all categories. */
  totalKeys: number;
  /** Number of examined keys whose values are equal in both environments. */
  matchingKeys: number;
  /** Share of examined keys that differ, as a whole-number percentage (0 when no keys were examined). */
  driftPercentage: number;
}
/**
 * Key-name heuristics used to grade a drift. Rules are evaluated in array
 * order and the first pattern that matches anywhere in the key wins, so the
 * more severe rules are listed first. Matching is case-insensitive.
 */
const SEVERITY_RULES: { pattern: RegExp; severity: "critical" | "warning" | "info" }[] = [
  // Credentials and other secret material.
  { severity: "critical", pattern: /password|secret|key|token/i },
  // Connection endpoints.
  { severity: "critical", pattern: /database|redis|host|port/i },
  // Tuning knobs.
  { severity: "warning", pattern: /timeout|retry|limit|max/i },
  // Diagnostics.
  { severity: "info", pattern: /log|debug|verbose/i },
];
/**
 * Stores one snapshot row per config category for the given environment.
 *
 * Categories are written one after another, each with its own INSERT, in the
 * iteration order of `configs`. If an INSERT rejects, the error propagates and
 * rows already written for earlier categories stay in place.
 *
 * @param environment Environment the configs were read from.
 * @param configs Map of category name to that category's key/value config.
 * @param capturedBy Identifier of whoever triggered the capture.
 * @returns The snapshots that were written, in insertion order. `capturedAt`
 *   is taken from the local clock after the INSERT, not read back from the
 *   database's NOW().
 */
export async function captureSnapshot(environment: string, configs: Record<string, Record<string, any>>, capturedBy: string): Promise<ConfigSnapshot[]> {
  const insertSql =
    "INSERT INTO config_snapshots (id, environment, category, config, checksum, captured_by, captured_at)\n" +
    "VALUES ($1, $2, $3, $4, $5, $6, NOW())";
  const written: ConfigSnapshot[] = [];

  for (const entry of Object.entries(configs)) {
    const category = entry[0];
    const config = entry[1];

    const id = "snap-" + randomBytes(6).toString("hex");
    // The same serialized text is hashed and stored, so the checksum describes exactly what is persisted.
    const serialized = JSON.stringify(config);
    const digest = createHash("sha256").update(serialized).digest("hex");
    const checksum = digest.slice(0, 16);

    await pool.query(insertSql, [id, environment, category, serialized, checksum, capturedBy]);

    const capturedAt = new Date().toISOString();
    written.push({ id, environment, category, config, checksum, capturedAt, capturedBy });
  }

  return written;
}
// Compare two environments
/**
 * Compares the latest snapshot of every category in `env1` against `env2`.
 *
 * Every key that appears in either environment is examined. Values are
 * compared structurally: nested objects that differ only in key order are
 * treated as equal, while array order remains significant. A key missing on
 * one side counts as a drift.
 *
 * Side effects: when at least one drift is graded "critical", a
 * `config_drift` message is pushed onto the `notification:queue` Redis list;
 * afterwards a summary row is inserted into `drift_reports`.
 *
 * @param env1 First environment (its values are reported as `value1`).
 * @param env2 Second environment (its values are reported as `value2`).
 * @param categories Optional category filter forwarded to the snapshot lookup.
 * @returns The per-key drifts plus totals and the drift percentage.
 * @throws Rejects if a snapshot lookup, the Redis push, or the report insert fails.
 */
export async function detectDrift(env1: string, env2: string, categories?: string[]): Promise<DriftResult> {
  // The two lookups do not depend on each other, so run them concurrently.
  const [snap1, snap2] = await Promise.all([
    getLatestSnapshots(env1, categories),
    getLatestSnapshots(env2, categories),
  ]);

  const drifts: DriftResult["drifts"] = [];
  // Plain counters: each (category, key) pair is visited exactly once below, so
  // no string-keyed set is needed. A "category.key" set would merge distinct
  // pairs whenever a name contains a dot.
  let totalKeys = 0;
  let matching = 0;

  // Compare all categories
  const allCategories = new Set([...Object.keys(snap1), ...Object.keys(snap2)]);
  for (const category of allCategories) {
    // A category missing from one environment is compared against an empty config.
    const config1 = snap1[category] ?? {};
    const config2 = snap2[category] ?? {};
    const keys = new Set([...Object.keys(config1), ...Object.keys(config2)]);
    for (const key of keys) {
      totalKeys++;
      const val1 = config1[key];
      const val2 = config2[key];
      if (canonicalJson(val1) === canonicalJson(val2)) {
        matching++;
        continue;
      }
      const severity = getSeverity(key);
      const recommendation = generateRecommendation(key, val1, val2, env1, env2);
      drifts.push({ key, category, value1: val1, value2: val2, severity, recommendation });
    }
  }

  const result: DriftResult = {
    environment1: env1,
    environment2: env2,
    drifts,
    totalKeys,
    matchingKeys: matching,
    driftPercentage: totalKeys > 0 ? Math.round(((totalKeys - matching) / totalKeys) * 100) : 0,
  };

  // Alert on critical drifts
  const criticalDrifts = drifts.filter((d) => d.severity === "critical");
  if (criticalDrifts.length > 0) {
    await redis.rpush("notification:queue", JSON.stringify({
      type: "config_drift", environments: [env1, env2],
      critical: criticalDrifts.length, total: drifts.length,
    }));
  }

  // Store result
  await pool.query(
    "INSERT INTO drift_reports (env1, env2, total_keys, matching_keys, drift_count, critical_count, created_at)\n" +
      "VALUES ($1, $2, $3, $4, $5, $6, NOW())",
    [env1, env2, result.totalKeys, result.matchingKeys, drifts.length, criticalDrifts.length]
  );

  return result;
}

/**
 * Serializes a value to JSON with the keys of every plain object sorted, so
 * two structurally equal values always yield the same text. Returns
 * `undefined` for `undefined` input, mirroring JSON.stringify.
 */
function canonicalJson(value: unknown): string | undefined {
  return JSON.stringify(value, (_key, v) => {
    if (v !== null && typeof v === "object" && !Array.isArray(v)) {
      const sorted = Object.entries(v).sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));
      return Object.fromEntries(sorted);
    }
    return v;
  });
}
// Detect unauthorized changes (compare the newest snapshot to the one before it)
/**
 * Reports every key whose value differs between the two most recent snapshots
 * of each category in `environment`.
 *
 * Added, modified AND removed keys are reported. For a removed key
 * `currentValue` is `undefined`; for an added key `previousValue` is
 * `undefined`. Categories with fewer than two snapshots are skipped because
 * there is nothing to compare against.
 *
 * @param environment Environment whose snapshot history is inspected.
 * @returns One entry per changed key. `changedAt` is the capture time of the
 *   newer snapshot.
 * @throws Rejects if a query fails or a stored config is not valid JSON.
 */
export async function detectUnauthorizedChanges(environment: string): Promise<Array<{ key: string; category: string; previousValue: any; currentValue: any; changedAt: string }>> {
  type Change = { key: string; category: string; previousValue: any; currentValue: any; changedAt: string };

  // The config column may come back as JSON text or, if it is a json/jsonb
  // column, as an object the driver has already parsed. Accept both.
  const toConfig = (raw: unknown): Record<string, any> =>
    (typeof raw === "string" ? JSON.parse(raw) : raw) ?? {};

  const changes: Change[] = [];
  const categories = await getSnapshotCategories(environment);

  for (const category of categories) {
    const { rows } = await pool.query(
      "SELECT config, checksum, captured_at FROM config_snapshots\n" +
        "WHERE environment = $1 AND category = $2\n" +
        "ORDER BY captured_at DESC LIMIT 2",
      [environment, category]
    );
    if (rows.length < 2) continue;

    const current = toConfig(rows[0].config);
    const previous = toConfig(rows[1].config);

    // The declared contract is a string; timestamp columns may be returned as Date objects.
    const capturedAt = rows[0].captured_at;
    const changedAt: string = capturedAt instanceof Date ? capturedAt.toISOString() : capturedAt;

    // Current keys first (keeps the existing report order), then keys that
    // only exist in the previous snapshot, so deletions are caught as well.
    const keys = new Set([...Object.keys(current), ...Object.keys(previous)]);
    for (const key of keys) {
      if (JSON.stringify(current[key]) !== JSON.stringify(previous[key])) {
        changes.push({ key, category, previousValue: previous[key], currentValue: current[key], changedAt });
      }
    }
  }

  return changes;
}
/**
 * Grades a config key by name: the severity of the first SEVERITY_RULES entry
 * whose pattern matches the key, or "info" when no rule matches.
 *
 * @param key Config key to grade.
 */
function getSeverity(key: string): "critical" | "warning" | "info" {
  const firstMatch = SEVERITY_RULES.find(({ pattern }) => pattern.test(key));
  return firstMatch ? firstMatch.severity : "info";
}
/**
 * Builds the remediation hint for one drifting key.
 *
 * - Key absent (`undefined`) in one environment: suggest adding it there.
 *   When both are absent, the first environment is named.
 * - Otherwise: suggest syncing, showing each side's JSON form cut to 50 characters.
 *
 * @param key Config key that differs.
 * @param val1 Value in `env1`, or `undefined` if the key is absent there.
 * @param val2 Value in `env2`, or `undefined` if the key is absent there.
 * @param env1 Name of the first environment.
 * @param env2 Name of the second environment.
 */
function generateRecommendation(key: string, val1: any, val2: any, env1: string, env2: string): string {
  const preview = (value: any): string => JSON.stringify(value).slice(0, 50);

  const presentInBoth = val1 !== undefined && val2 !== undefined;
  if (presentInBoth) {
    return `Sync '${key}' — ${env1} has '${preview(val1)}', ${env2} has '${preview(val2)}'`;
  }

  return val1 === undefined
    ? `Add '${key}' to ${env1} (exists in ${env2})`
    : `Add '${key}' to ${env2} (exists in ${env1})`;
}
/**
 * Loads the most recent snapshot of each category for an environment.
 *
 * `DISTINCT ON (category)` combined with `ORDER BY category, captured_at DESC`
 * keeps only the newest row per category.
 *
 * @param environment Environment to read snapshots for.
 * @param categories Optional category filter; when omitted or empty, every
 *   category of the environment is returned.
 * @returns Map of category name to that category's config object.
 * @throws Rejects if the query fails or a stored config is not valid JSON.
 */
async function getLatestSnapshots(environment: string, categories?: string[]): Promise<Record<string, Record<string, any>>> {
  const params: unknown[] = [environment];
  let sql = "SELECT DISTINCT ON (category) category, config FROM config_snapshots WHERE environment = $1";
  if (categories?.length) {
    sql += " AND category = ANY($2)";
    params.push(categories);
  }
  sql += " ORDER BY category, captured_at DESC";

  const { rows } = await pool.query(sql, params);

  return Object.fromEntries(
    rows.map((row: any) => [
      row.category,
      // The column may come back as JSON text or, for a json/jsonb column, as an
      // object the driver already parsed; calling JSON.parse on the latter throws.
      typeof row.config === "string" ? JSON.parse(row.config) : row.config,
    ]),
  );
}
/**
 * Lists the distinct category names that have at least one stored snapshot
 * for the given environment.
 *
 * @param environment Environment to look up.
 * @returns Category names in whatever order the database returns them.
 */
async function getSnapshotCategories(environment: string): Promise<string[]> {
  const result = await pool.query(
    "SELECT DISTINCT category FROM config_snapshots WHERE environment = $1",
    [environment],
  );
  const names: string[] = [];
  for (const row of result.rows) {
    names.push(row.category);
  }
  return names;
}
Results
- 12 env var differences found — drift report shows exactly which vars differ between staging and prod; 8 were bugs, 4 intentional; all documented
- Unauthorized changes detected — production DB config changed outside CI/CD → drift alert fires → ops investigates → hotfix properly tracked
- Environment parity — weekly drift scan ensures dev/staging/prod match; drift percentage: 15% → 2%; "works in staging but not prod" bugs eliminated
- Severity-based alerting — password/secret drifts = critical alert; log level drifts = info; team focuses on what matters
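- Remediation plans — each drift includes a specific fix, e.g. "Add 'REDIS_URL' to staging (exists in prod)" or "Sync 'DB_TIMEOUT' — prod has '"30s"', staging has '"5s"'" (values are shown in their JSON form, truncated to 50 characters)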