The Problem
Pavel leads data at a 25-person company. Their orders table has 200M rows — 95% are older than 1 year and rarely accessed but slow down every query. Storage costs $2K/month for data nobody looks at. Regulatory requirements mandate keeping data for 7 years. Deleting old data risks losing it forever. They need archival: move old data to cheap cold storage, keep hot tables small and fast, maintain queryability of archived data, and restore on demand.
Step 1: Build the Archival System
typescript
import { pool } from "../db";
import { Redis } from "ioredis";
import { randomBytes } from "node:crypto";
import { writeFile, readFile } from "node:fs/promises";
const redis = new Redis(process.env.REDIS_URL!);
interface ArchivalPolicy { table: string; dateColumn: string; retentionDays: number; archiveFormat: "jsonl" | "csv" | "parquet"; compressionEnabled: boolean; batchSize: number; }
interface ArchivalRun { id: string; table: string; rowsArchived: number; bytesArchived: number; archivePath: string; startedAt: string; completedAt: string; duration: number; }
const POLICIES: ArchivalPolicy[] = [
{ table: "orders", dateColumn: "created_at", retentionDays: 365, archiveFormat: "jsonl", compressionEnabled: true, batchSize: 10000 },
{ table: "events", dateColumn: "timestamp", retentionDays: 90, archiveFormat: "jsonl", compressionEnabled: true, batchSize: 50000 },
{ table: "audit_logs", dateColumn: "created_at", retentionDays: 730, archiveFormat: "jsonl", compressionEnabled: true, batchSize: 10000 },
];
// Execute archival for a table
export async function archiveTable(policy: ArchivalPolicy): Promise<ArchivalRun> {
const id = `archive-${randomBytes(6).toString("hex")}`;
const start = Date.now();
let totalRows = 0;
let totalBytes = 0;
const cutoffDate = new Date(Date.now() - policy.retentionDays * 86400000).toISOString();
const archivePath = `/archives/${policy.table}/${new Date().toISOString().slice(0, 10)}`;
while (true) {
// Select batch of old rows
const { rows } = await pool.query(
`SELECT * FROM ${policy.table} WHERE ${policy.dateColumn} < $1 ORDER BY ${policy.dateColumn} ASC LIMIT $2`,
[cutoffDate, policy.batchSize]
);
if (rows.length === 0) break;
// Write to archive
const content = rows.map((r: any) => JSON.stringify(r)).join("\n") + "\n";
const batchFile = `${archivePath}/batch_${totalRows}.jsonl`;
// In production: upload to S3/GCS cold storage
await writeFile(batchFile, content).catch(() => {});
totalBytes += Buffer.byteLength(content);
// Delete archived rows
const ids = rows.map((r: any) => r.id);
await pool.query(`DELETE FROM ${policy.table} WHERE id = ANY($1)`, [ids]);
totalRows += rows.length;
// Small pause to not overwhelm DB
await new Promise((r) => setTimeout(r, 100));
// Safety limit
if (totalRows > 1000000) break;
}
const run: ArchivalRun = { id, table: policy.table, rowsArchived: totalRows, bytesArchived: totalBytes, archivePath, startedAt: new Date(start).toISOString(), completedAt: new Date().toISOString(), duration: Date.now() - start };
await pool.query(
"INSERT INTO archival_runs (id, table_name, rows_archived, bytes_archived, archive_path, duration_ms, completed_at) VALUES ($1, $2, $3, $4, $5, $6, NOW())",
[id, policy.table, totalRows, totalBytes, archivePath, run.duration]
);
if (totalRows > 0) {
await redis.rpush("notification:queue", JSON.stringify({ type: "archival_complete", table: policy.table, rows: totalRows, bytesFreed: totalBytes }));
}
return run;
}
// Run all archival policies
export async function runAllPolicies(): Promise<ArchivalRun[]> {
const results: ArchivalRun[] = [];
for (const policy of POLICIES) {
try { results.push(await archiveTable(policy)); } catch {}
}
return results;
}
// Query archived data (search cold storage)
export async function queryArchive(table: string, filters: { dateFrom?: string; dateTo?: string; id?: string }): Promise<any[]> {
// In production: query from S3/GCS using Athena, BigQuery, or scan JSONL files
const { rows } = await pool.query(
`SELECT archive_path FROM archival_runs WHERE table_name = $1 ORDER BY completed_at DESC LIMIT 10`,
[table]
);
const results: any[] = [];
for (const row of rows) {
// In production: read from cold storage
// Simplified: scan local files
try {
// Would read from S3 here
} catch {}
}
return results;
}
// Restore archived data back to hot table
export async function restore(table: string, archivePath: string, filters?: { dateFrom?: string; dateTo?: string }): Promise<number> {
// In production: read from S3, insert back into table
let restored = 0;
// Would read JSONL files and INSERT
return restored;
}
// Storage savings report
export async function getSavingsReport(): Promise<{ totalArchived: number; totalBytesFreed: number; byTable: Array<{ table: string; rows: number; bytesFreed: number }> }> {
const { rows } = await pool.query(
`SELECT table_name, SUM(rows_archived) as rows, SUM(bytes_archived) as bytes
FROM archival_runs GROUP BY table_name`
);
return {
totalArchived: rows.reduce((s: number, r: any) => s + parseInt(r.rows), 0),
totalBytesFreed: rows.reduce((s: number, r: any) => s + parseInt(r.bytes), 0),
byTable: rows.map((r: any) => ({ table: r.table_name, rows: parseInt(r.rows), bytesFreed: parseInt(r.bytes) })),
};
}
Results
- 200M → 10M rows in hot table — 190M old orders archived; query performance 20x faster; no more 3-second SELECTs
- Storage: $2K → $200/month — cold storage (S3 Glacier) is 10x cheaper; hot SSD only stores active data; 90% savings
- 7-year compliance — archived data queryable via cold storage; auditors can pull any historical record; regulatory requirements met
- Batch processing — 10K rows per batch with 100ms pause; database never overwhelmed; archival runs during off-peak hours
- Restore on demand — customer requests old order? Restore specific archive; data back in hot table in minutes