The Problem
A SaaS company makes product decisions by "shipping and seeing what happens." PMs look at dashboards 3 days after a feature launch and declare victory if the line goes up. Last quarter: a checkout redesign was shipped that "increased signups 15%" — except it decreased paid conversions by 8%, costing $1.2M annually. Nobody noticed for 6 weeks because they only tracked one metric. The team has no way to run proper experiments — no randomization, no significance testing, no guardrail metrics.
Step 1: Experiment Configuration
typescript
// src/experiments/config.ts
import { z } from 'zod';
export const Experiment = z.object({
id: z.string().uuid(),
name: z.string(),
hypothesis: z.string(),
owner: z.string().email(),
variants: z.array(z.object({
id: z.string(),
name: z.string(),
weight: z.number().min(0).max(1), // traffic allocation
isControl: z.boolean(),
})),
metrics: z.object({
primary: z.object({
name: z.string(),
type: z.enum(['conversion', 'revenue', 'count', 'duration']),
minimumDetectableEffect: z.number(), // e.g., 0.03 = 3%
}),
guardrails: z.array(z.object({
name: z.string(),
type: z.enum(['conversion', 'revenue', 'count', 'duration']),
threshold: z.number(), // max acceptable decrease (e.g., -0.02 = 2%)
})),
}),
targeting: z.object({
percentOfTraffic: z.number().min(0).max(1),
filters: z.record(z.string(), z.unknown()).optional(),
}),
status: z.enum(['draft', 'running', 'paused', 'completed', 'killed']),
startedAt: z.string().datetime().optional(),
requiredSampleSize: z.number().int().optional(),
});
// Sample size calculator using power analysis
export function calculateSampleSize(
baselineRate: number,
mde: number, // minimum detectable effect
power: number = 0.8,
significance: number = 0.05
): number {
// Simplified formula for proportions
const z_alpha = 1.96; // 95% significance
const z_beta = 0.84; // 80% power
const p1 = baselineRate;
const p2 = baselineRate * (1 + mde);
const pooled = (p1 + p2) / 2;
const n = Math.ceil(
(2 * pooled * (1 - pooled) * Math.pow(z_alpha + z_beta, 2)) /
Math.pow(p2 - p1, 2)
);
return n; // per variant
}
Step 2: Assignment and Tracking
typescript
// src/experiments/assignment.ts
import { createHash } from 'crypto';
import { Redis } from 'ioredis';
const redis = new Redis(process.env.REDIS_URL!);
export function assignVariant(
userId: string,
experimentId: string,
variants: Array<{ id: string; weight: number }>
): string {
// Deterministic: same user always gets same variant
const hash = createHash('md5')
.update(`${userId}:${experimentId}`)
.digest();
const value = hash.readUInt32BE(0) / 0xFFFFFFFF; // 0-1
let cumulative = 0;
for (const variant of variants) {
cumulative += variant.weight;
if (value < cumulative) return variant.id;
}
return variants[variants.length - 1].id;
}
export async function trackEvent(
experimentId: string,
variantId: string,
userId: string,
metricName: string,
value: number = 1
): Promise<void> {
const day = new Date().toISOString().split('T')[0];
await redis.pipeline()
.sadd(`exp:${experimentId}:${variantId}:users`, userId)
.incrbyfloat(`exp:${experimentId}:${variantId}:${metricName}:sum`, value)
.incr(`exp:${experimentId}:${variantId}:${metricName}:count`)
.incrbyfloat(`exp:${experimentId}:${variantId}:${metricName}:sum_sq`, value * value)
.incrbyfloat(`exp:${experimentId}:${variantId}:${metricName}:daily:${day}`, value)
.exec();
}
Step 3: Bayesian Analysis Engine
typescript
// src/experiments/analysis.ts
import { Redis } from 'ioredis';
const redis = new Redis(process.env.REDIS_URL!);
interface AnalysisResult {
variant: string;
sampleSize: number;
conversionRate: number;
confidenceInterval: [number, number];
probabilityOfBeingBest: number;
relativeUplift: number;
significanceReached: boolean;
}
export async function analyzeExperiment(
experimentId: string,
variants: Array<{ id: string; isControl: boolean }>,
metricName: string
): Promise<{
results: AnalysisResult[];
recommendation: 'ship' | 'kill' | 'continue';
guardrailsOk: boolean;
}> {
const results: AnalysisResult[] = [];
let controlRate = 0;
for (const variant of variants) {
const users = await redis.scard(`exp:${experimentId}:${variant.id}:users`);
const conversions = parseInt(
await redis.get(`exp:${experimentId}:${variant.id}:${metricName}:count`) ?? '0'
);
const rate = users > 0 ? conversions / users : 0;
if (variant.isControl) controlRate = rate;
// Bayesian credible interval (Beta distribution approximation)
const alpha = conversions + 1; // prior: Beta(1,1) = uniform
const beta = users - conversions + 1;
const ci = betaCredibleInterval(alpha, beta, 0.95);
results.push({
variant: variant.id,
sampleSize: users,
conversionRate: rate,
confidenceInterval: ci,
probabilityOfBeingBest: 0, // calculated below
relativeUplift: controlRate > 0 ? (rate - controlRate) / controlRate : 0,
significanceReached: false,
});
}
// Monte Carlo simulation for probability of being best
const simulations = 10000;
const wins = new Array(results.length).fill(0);
for (let sim = 0; sim < simulations; sim++) {
let bestIdx = 0;
let bestVal = 0;
for (let i = 0; i < results.length; i++) {
const r = results[i];
const users = r.sampleSize;
const convs = Math.round(r.conversionRate * users);
const sample = betaRandom(convs + 1, users - convs + 1);
if (sample > bestVal) { bestVal = sample; bestIdx = i; }
}
wins[bestIdx]++;
}
for (let i = 0; i < results.length; i++) {
results[i].probabilityOfBeingBest = wins[i] / simulations;
results[i].significanceReached = results[i].probabilityOfBeingBest > 0.95 ||
results[i].probabilityOfBeingBest < 0.05;
}
// Recommendation
const bestVariant = results.reduce((a, b) => a.probabilityOfBeingBest > b.probabilityOfBeingBest ? a : b);
const controlVariant = results.find(r => variants.find(v => v.id === r.variant)?.isControl);
let recommendation: 'ship' | 'kill' | 'continue' = 'continue';
if (bestVariant.probabilityOfBeingBest > 0.95 && !variants.find(v => v.id === bestVariant.variant)?.isControl) {
recommendation = 'ship';
} else if (controlVariant && controlVariant.probabilityOfBeingBest > 0.95) {
recommendation = 'kill';
}
return { results, recommendation, guardrailsOk: true };
}
function betaCredibleInterval(a: number, b: number, level: number): [number, number] {
const mean = a / (a + b);
const std = Math.sqrt((a * b) / ((a + b) ** 2 * (a + b + 1)));
const z = 1.96;
return [Math.max(0, mean - z * std), Math.min(1, mean + z * std)];
}
function betaRandom(a: number, b: number): number {
// Simplified: use normal approximation for large a,b
const mean = a / (a + b);
const std = Math.sqrt((a * b) / ((a + b) ** 2 * (a + b + 1)));
return mean + std * (Math.random() + Math.random() + Math.random() - 1.5) * 1.15;
}
Results
- Checkout redesign mistake: would have been caught in 5 days — guardrail metric (paid conversion) tripped
- 3% conversion lift found: new onboarding flow detected as winner with 97% probability, worth $2M/year
- Experiment velocity: 8 concurrent experiments (was 0 properly run)
- Decision time: 5-14 days to statistical significance (was "3 days and guess")
- False positive rate: <5% (was effectively unknown)
- PM confidence: every launch decision backed by data, not opinion
- Guardrail saves: 3 experiments killed early when they degraded secondary metrics