The Problem
Chen leads marketing at a 20-person company. Their site has 500 pages but they don't know which have SEO issues. Some pages have duplicate titles, others have no meta description, and many images lack alt text. They pay $200/month for Ahrefs but only use the site audit feature. Google Search Console shows crawl errors but doesn't explain how to fix them. Performance varies wildly — some pages take 8 seconds to load. They need a self-hosted audit tool: crawl the site, check every SEO factor, score each page, detect broken links, validate structured data, and prioritize fixes by impact.
Step 1: Build the Audit Engine
import { pool } from "../db";
import { Redis } from "ioredis";
import { randomBytes } from "node:crypto";
const redis = new Redis(process.env.REDIS_URL!);
interface SEOAudit { id: string; url: string; score: number; issues: SEOIssue[]; meta: PageMeta; performance: PagePerformance; structuredData: any[]; links: LinkCheck[]; crawledAt: string; }
interface SEOIssue { type: string; severity: "critical" | "warning" | "info"; message: string; recommendation: string; impact: string; }
interface PageMeta { title: string; titleLength: number; description: string; descriptionLength: number; h1: string[]; h2Count: number; canonical: string | null; robots: string; ogTitle: string | null; ogImage: string | null; }
interface PagePerformance { loadTimeMs: number; sizeKb: number; imageCount: number; imagesWithoutAlt: number; cssFiles: number; jsFiles: number; }
interface LinkCheck { url: string; status: number; type: "internal" | "external"; anchor: string; }
// Audit a single page
export async function auditPage(url: string): Promise<SEOAudit> {
const id = `audit-${randomBytes(6).toString("hex")}`;
const start = Date.now();
// Fetch page
const response = await fetch(url, { signal: AbortSignal.timeout(15000), headers: { "User-Agent": "SEOAuditBot/1.0" } });
const html = await response.text();
const loadTimeMs = Date.now() - start;
// Parse meta tags
const meta = extractMeta(html, url);
const performance = analyzePerformance(html, loadTimeMs);
const links = extractLinks(html, url);
const structuredData = extractStructuredData(html);
// Generate issues
const issues: SEOIssue[] = [];
// Title checks
if (!meta.title) issues.push({ type: "missing_title", severity: "critical", message: "Page has no title tag", recommendation: "Add a unique, descriptive <title> tag (50-60 characters)", impact: "Title is the #1 ranking factor for on-page SEO" });
else if (meta.titleLength < 30) issues.push({ type: "short_title", severity: "warning", message: `Title too short (${meta.titleLength} chars)`, recommendation: "Expand title to 50-60 characters with target keywords", impact: "Longer titles capture more search queries" });
else if (meta.titleLength > 60) issues.push({ type: "long_title", severity: "warning", message: `Title too long (${meta.titleLength} chars) — will be truncated in search results`, recommendation: "Shorten to 60 characters", impact: "Truncated titles reduce click-through rate" });
// Description checks
if (!meta.description) issues.push({ type: "missing_description", severity: "critical", message: "No meta description", recommendation: "Add a compelling meta description (120-160 characters)", impact: "Google may generate a snippet that doesn't match user intent" });
else if (meta.descriptionLength < 70) issues.push({ type: "short_description", severity: "warning", message: `Description too short (${meta.descriptionLength} chars)`, recommendation: "Expand to 120-160 characters", impact: "Short descriptions miss opportunity to attract clicks" });
else if (meta.descriptionLength > 160) issues.push({ type: "long_description", severity: "info", message: `Description may be truncated (${meta.descriptionLength} chars)`, recommendation: "Keep under 160 characters", impact: "Minor — Google may truncate" });
// H1 checks
if (meta.h1.length === 0) issues.push({ type: "missing_h1", severity: "critical", message: "No H1 tag found", recommendation: "Add one H1 tag with the page's primary keyword", impact: "H1 signals the main topic to search engines" });
else if (meta.h1.length > 1) issues.push({ type: "multiple_h1", severity: "warning", message: `${meta.h1.length} H1 tags found`, recommendation: "Use only one H1 per page", impact: "Multiple H1s dilute topical focus" });
// Canonical
if (!meta.canonical) issues.push({ type: "missing_canonical", severity: "warning", message: "No canonical URL set", recommendation: "Add <link rel=\"canonical\"> to prevent duplicate content", impact: "Prevents crawl budget waste on duplicate URLs" });
// Open Graph
if (!meta.ogTitle) issues.push({ type: "missing_og", severity: "info", message: "No Open Graph tags", recommendation: "Add og:title, og:description, og:image for social sharing", impact: "Social shares without OG tags look unprofessional" });
// Performance
if (loadTimeMs > 3000) issues.push({ type: "slow_page", severity: "critical", message: `Page loads in ${(loadTimeMs / 1000).toFixed(1)}s`, recommendation: "Optimize images, minimize JS/CSS, enable caching", impact: "Pages loading >3s lose 53% of mobile visitors" });
if (performance.imagesWithoutAlt > 0) issues.push({ type: "missing_alt", severity: "warning", message: `${performance.imagesWithoutAlt} images without alt text`, recommendation: "Add descriptive alt text to all images", impact: "Missing alt text hurts accessibility and image search rankings" });
// Broken links
const brokenLinks = links.filter((l) => l.status >= 400);
if (brokenLinks.length > 0) issues.push({ type: "broken_links", severity: "critical", message: `${brokenLinks.length} broken links found`, recommendation: "Fix or remove broken links", impact: "Broken links waste crawl budget and hurt user experience" });
// Calculate score
const criticalCount = issues.filter((i) => i.severity === "critical").length;
const warningCount = issues.filter((i) => i.severity === "warning").length;
const score = Math.max(0, 100 - criticalCount * 20 - warningCount * 5);
const audit: SEOAudit = { id, url, score, issues, meta, performance, structuredData, links: links.slice(0, 50), crawledAt: new Date().toISOString() };
await pool.query(
`INSERT INTO seo_audits (id, url, score, issues_count, critical_count, crawled_at) VALUES ($1, $2, $3, $4, $5, NOW())`,
[id, url, score, issues.length, criticalCount]
);
return audit;
}
function extractMeta(html: string, url: string): PageMeta {
const title = html.match(/<title>([^<]*)<\/title>/i)?.[1]?.trim() || "";
const description = html.match(/<meta[^>]+name=["']description["'][^>]+content=["']([^"']*)/i)?.[1] || "";
const h1s = [...html.matchAll(/<h1[^>]*>([^<]*)<\/h1>/gi)].map((m) => m[1].trim());
const h2Count = (html.match(/<h2/gi) || []).length;
const canonical = html.match(/<link[^>]+rel=["']canonical["'][^>]+href=["']([^"']*)/i)?.[1] || null;
const robots = html.match(/<meta[^>]+name=["']robots["'][^>]+content=["']([^"']*)/i)?.[1] || "";
const ogTitle = html.match(/<meta[^>]+property=["']og:title["'][^>]+content=["']([^"']*)/i)?.[1] || null;
const ogImage = html.match(/<meta[^>]+property=["']og:image["'][^>]+content=["']([^"']*)/i)?.[1] || null;
return { title, titleLength: title.length, description, descriptionLength: description.length, h1: h1s, h2Count, canonical, robots, ogTitle, ogImage };
}
function analyzePerformance(html: string, loadTimeMs: number): PagePerformance {
const images = html.match(/<img[^>]*>/gi) || [];
const imagesWithoutAlt = images.filter((img) => !img.match(/alt=["'][^"']+/i)).length;
return { loadTimeMs, sizeKb: Math.round(Buffer.byteLength(html) / 1024), imageCount: images.length, imagesWithoutAlt, cssFiles: (html.match(/<link[^>]+stylesheet/gi) || []).length, jsFiles: (html.match(/<script[^>]+src/gi) || []).length };
}
function extractLinks(html: string, baseUrl: string): LinkCheck[] {
const links: LinkCheck[] = [];
const matches = html.matchAll(/<a[^>]+href=["']([^"'#]+)["'][^>]*>([^<]*)/gi);
for (const match of matches) {
const href = match[1];
const anchor = match[2].trim();
const isInternal = href.startsWith("/") || href.startsWith(baseUrl);
links.push({ url: href, status: 200, type: isInternal ? "internal" : "external", anchor });
}
return links.slice(0, 100);
}
function extractStructuredData(html: string): any[] {
const scripts = html.match(/<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi) || [];
return scripts.map((s) => { try { return JSON.parse(s.replace(/<[^>]+>/g, "")); } catch { return null; } }).filter(Boolean);
}
// Crawl entire site
export async function crawlSite(startUrl: string, maxPages: number = 100): Promise<{ audits: SEOAudit[]; summary: { avgScore: number; criticalPages: number; totalIssues: number } }> {
const visited = new Set<string>();
const queue = [startUrl];
const audits: SEOAudit[] = [];
while (queue.length > 0 && visited.size < maxPages) {
const url = queue.shift()!;
if (visited.has(url)) continue;
visited.add(url);
try {
const audit = await auditPage(url);
audits.push(audit);
// Add internal links to queue
for (const link of audit.links) {
if (link.type === "internal" && !visited.has(link.url)) {
const fullUrl = link.url.startsWith("http") ? link.url : new URL(link.url, startUrl).href;
queue.push(fullUrl);
}
}
} catch {}
}
const avgScore = audits.reduce((s, a) => s + a.score, 0) / audits.length;
return {
audits,
summary: { avgScore: Math.round(avgScore), criticalPages: audits.filter((a) => a.score < 50).length, totalIssues: audits.reduce((s, a) => s + a.issues.length, 0) },
};
}
Results
- 500 pages audited in 15 minutes — automated crawl finds every issue; no manual page-by-page checking; covers entire site
- Prioritized fixes — critical: missing title (3 pages), broken links (12); warning: short descriptions (45); team fixes high-impact issues first
- Score per page — homepage: 85/100; blog posts: 60/100 (missing descriptions); product pages: 90/100; clear where to focus
- $200/month Ahrefs saved — self-hosted tool covers site audit needs; runs on cron weekly; results stored for trend tracking
- Structured data validated — pages with schema.org JSON-LD checked; missing fields identified; rich snippet eligibility confirmed