Build a Database Index Advisor

The Problem

Peter leads backend at a 25-person SaaS. Database performance degrades as data grows — queries that were fast on 100K rows crawl on 10M rows. They have 50 indexes but some are unused (wasting disk and slowing writes), while missing indexes cause sequential scans on large tables. Adding an index is trial-and-error: sometimes it helps, sometimes it makes things worse. They need an index advisor: analyze actual query patterns from pg_stat_statements, recommend indexes based on WHERE/JOIN/ORDER BY clauses, detect unused indexes, estimate impact, and generate ready-to-run migration SQL.

Step 1: Build the Index Advisor

typescript

import { pool } from "../db";
import { Redis } from "ioredis";
const redis = new Redis(process.env.REDIS_URL!);

interface IndexRecommendation { table: string; columns: string[]; type: "btree" | "gin" | "gist" | "hash"; reason: string; estimatedImpact: string; queriesAffected: number; createSQL: string; }
interface UnusedIndex { name: string; table: string; columns: string; size: string; scans: number; reason: string; dropSQL: string; }

// Analyze queries and recommend indexes
export async function analyzeAndRecommend(): Promise<{ recommendations: IndexRecommendation[]; unusedIndexes: UnusedIndex[]; existingIndexes: number; totalTableSize: string }> {
  const recommendations: IndexRecommendation[] = [];

  // Get slow queries from pg_stat_statements
  const { rows: slowQueries } = await pool.query(
    `SELECT query, calls, mean_exec_time, rows FROM pg_stat_statements WHERE mean_exec_time > 100 AND calls > 10 ORDER BY mean_exec_time * calls DESC LIMIT 50`
  ).catch(() => ({ rows: [] }));

  // Analyze sequential scans
  const { rows: seqScans } = await pool.query(
    `SELECT schemaname, relname, seq_scan, seq_tup_read, idx_scan, n_live_tup
     FROM pg_stat_user_tables WHERE seq_scan > 100 AND n_live_tup > 10000
     ORDER BY seq_tup_read DESC LIMIT 20`
  );

  for (const table of seqScans) {
    if (table.seq_scan > table.idx_scan * 2 && table.n_live_tup > 50000) {
      // Find which columns are frequently in WHERE clauses
      const relatedQueries = slowQueries.filter((q: any) => q.query.toLowerCase().includes(table.relname.toLowerCase()));
      const whereColumns = extractWhereColumns(relatedQueries.map((q: any) => q.query), table.relname);

      for (const col of whereColumns) {
        // Check if index already exists
        const { rows: existing } = await pool.query(
          `SELECT indexname FROM pg_indexes WHERE tablename = $1 AND indexdef LIKE $2`,
          [table.relname, `%${col}%`]
        );
        if (existing.length === 0) {
          const indexName = `idx_${table.relname}_${col}`;
          recommendations.push({
            table: table.relname, columns: [col], type: "btree",
            reason: `Table '${table.relname}' has ${table.seq_scan} seq scans vs ${table.idx_scan} idx scans with ${table.n_live_tup} rows. Column '${col}' appears in WHERE clauses of slow queries.`,
            estimatedImpact: `Could reduce seq scans by ~${Math.round((table.seq_scan / (table.seq_scan + table.idx_scan)) * 100)}%`,
            queriesAffected: relatedQueries.length,
            createSQL: `CREATE INDEX CONCURRENTLY ${indexName} ON ${table.relname} (${col});`,
          });
        }
      }
    }
  }

  // Composite index recommendations for multi-column WHERE
  for (const q of slowQueries) {
    const compositeColumns = extractCompositeWhere(q.query);
    if (compositeColumns.length >= 2) {
      const table = compositeColumns[0].table;
      const cols = compositeColumns.map((c: any) => c.column);
      const indexName = `idx_${table}_${cols.join("_")}`.slice(0, 63);
      const { rows: existing } = await pool.query(
        `SELECT indexname FROM pg_indexes WHERE tablename = $1 AND indexdef LIKE $2`,
        [table, `%(${cols.join(", ")})%`]
      );
      if (existing.length === 0) {
        recommendations.push({
          table, columns: cols, type: "btree",
          reason: `Slow query (avg ${Math.round(q.mean_exec_time)}ms, ${q.calls} calls) uses WHERE with columns ${cols.join(", ")}`,
          estimatedImpact: `Could improve query from ${Math.round(q.mean_exec_time)}ms to <10ms`,
          queriesAffected: 1,
          createSQL: `CREATE INDEX CONCURRENTLY ${indexName} ON ${table} (${cols.join(", ")});`,
        });
      }
    }
  }

  // Detect unused indexes
  const { rows: unused } = await pool.query(
    `SELECT indexrelname as name, relname as table, pg_size_pretty(pg_relation_size(indexrelid)) as size,
       idx_scan as scans, indexdef
     FROM pg_stat_user_indexes JOIN pg_indexes ON indexrelname = indexname
     WHERE idx_scan = 0 AND indexrelname NOT LIKE 'pg_%' AND indexrelname NOT LIKE '%_pkey'
     ORDER BY pg_relation_size(indexrelid) DESC`
  );

  const unusedIndexes: UnusedIndex[] = unused.map((idx: any) => ({
    name: idx.name, table: idx.table, columns: idx.indexdef.match(/\((.+)\)/)?.[1] || "",
    size: idx.size, scans: idx.scans, reason: "Zero scans since last stats reset",
    dropSQL: `DROP INDEX CONCURRENTLY ${idx.name};`,
  }));

  // Stats
  const { rows: [{ count: indexCount }] } = await pool.query("SELECT COUNT(*) as count FROM pg_indexes WHERE schemaname = 'public'");
  const { rows: [{ size: totalSize }] } = await pool.query("SELECT pg_size_pretty(pg_database_size(current_database())) as size");

  return { recommendations: recommendations.slice(0, 20), unusedIndexes, existingIndexes: parseInt(indexCount), totalTableSize: totalSize };
}

function extractWhereColumns(queries: string[], tableName: string): string[] {
  const columns = new Set<string>();
  for (const query of queries) {
    const whereMatch = query.match(/WHERE\s+(.+?)(?:ORDER|GROUP|LIMIT|$)/is);
    if (!whereMatch) continue;
    const whereClauses = whereMatch[1];
    const colMatches = whereClauses.matchAll(/(\w+)\s*(?:=|>|<|>=|<=|<>|LIKE|IN|IS|BETWEEN)/gi);
    for (const match of colMatches) {
      const col = match[1].toLowerCase();
      if (!['and', 'or', 'not', 'null', 'true', 'false'].includes(col)) columns.add(col);
    }
  }
  return [...columns];
}

function extractCompositeWhere(query: string): Array<{ table: string; column: string }> {
  const fromMatch = query.match(/FROM\s+(\w+)/i);
  if (!fromMatch) return [];
  const table = fromMatch[1];
  const columns = extractWhereColumns([query], table);
  return columns.map((col) => ({ table, column: col }));
}

// Generate migration file
export function generateMigration(recommendations: IndexRecommendation[]): string {
  let migration = `-- Auto-generated index migration\n-- Generated at: ${new Date().toISOString()}\n\n-- UP\n`;
  for (const rec of recommendations) {
    migration += `-- ${rec.reason}\n${rec.createSQL}\n\n`;
  }
  migration += `-- DOWN\n`;
  for (const rec of recommendations) {
    const indexName = rec.createSQL.match(/INDEX CONCURRENTLY (\w+)/)?.[1];
    migration += `DROP INDEX IF EXISTS ${indexName};\n`;
  }
  return migration;
}

Results

Missing indexes found — advisor identified 8 tables with high seq_scan/idx_scan ratio; adding 5 indexes reduced avg query time from 800ms to 15ms
Unused indexes cleaned — 12 indexes (2.3GB) with zero scans dropped; write performance improved 15%; disk space recovered
Composite index recommended — WHERE clause with tenant_id AND status AND created_at → composite index; query: 2500ms → 3ms
Ready-to-run SQL — CREATE INDEX CONCURRENTLY generated; copy-paste to migration; CONCURRENTLY means no table lock; safe for production
Impact estimated — "Could reduce seq scans by ~85%" and "improve from 800ms to <10ms"; DBA prioritizes high-impact indexes first

Skills stack · 5 skills

typescript

redis

postgresql

hono

zod

The Problem

Step 1: Build the Index Advisor

Results