The Problem
Zara leads automation at a 20-person company. The team's AI agents can call APIs but can't interact with web UIs — and 80% of business tools don't have APIs. Agents can't fill out forms in legacy HR systems, extract data from dashboards, or navigate multi-step web workflows. Manual tasks like "check competitor pricing on 5 websites daily" take an analyst 2 hours. The team tried Selenium scripts, but those break whenever a UI changes. They need AI-powered browser automation: agents describe what they want in natural language, and the system navigates, interacts, and extracts data.
Step 1: Build the Browser Automation Engine
typescript
// src/browser/automation.ts — Browser automation for AI agents with session management
import { Redis } from "ioredis";
import { randomBytes } from "node:crypto";
import puppeteer, { Browser, Page } from "puppeteer";
// Shared Redis client for this module; the connection URL is read from the REDIS_URL environment variable.
// NOTE(review): the `!` only silences the type checker — nothing here verifies REDIS_URL is set at runtime.
const redis = new Redis(process.env.REDIS_URL!);
/**
 * Metadata describing one browser session.
 * createSession builds this record, stores it in Redis as JSON, and returns it.
 */
interface BrowserSession {
// Session identifier; createSession generates it as "bs-" followed by 12 hex characters.
id: string;
// Lifecycle state. NOTE(review): only "active" is ever assigned in the code visible here.
status: "active" | "idle" | "closed";
// URL of the page; "about:blank" for a freshly created session.
currentUrl: string;
// Page title; empty string for a freshly created session.
pageTitle: string;
// ISO-8601 timestamp taken when the session was created.
createdAt: string;
// ISO-8601 timestamp; set at creation. NOTE(review): no visible code updates it afterwards.
lastActionAt: string;
// Starts at 0. NOTE(review): no visible code increments it.
screenshotCount: number;
}
/**
 * One step for executeActions to perform. Which optional fields are read
 * depends on `type`; the fields are not validated by this interface.
 */
interface BrowserAction {
// Which operation to perform.
type: "navigate" | "click" | "type" | "select" | "screenshot" | "extract" | "scroll" | "wait";
// CSS selector of the target element; read by "click", "type" and "select".
selector?: string;
// Text to enter ("type"), option value ("select"), or vertical pixel amount as a string ("scroll", default "500").
value?: string;
// Destination address; read by "navigate".
url?: string;
// Pause length in milliseconds; read by "wait" (default 1000).
waitMs?: number;
// Read by "extract": maps an output key to a CSS selector; each key yields the trimmed
// text of every matching element. When omitted, "extract" returns the whole page text.
extractSchema?: Record<string, string>; // CSS selectors for structured extraction
}
/**
 * Outcome of a single BrowserAction, as produced by executeActions.
 */
interface ActionResult {
// True when the action completed without throwing.
success: boolean;
// The action type this result belongs to.
action: string;
// Extracted values; populated by the "extract" action.
data?: any;
// PNG screenshot of the viewport; populated by the "screenshot" action.
screenshot?: string; // base64 encoded
// Error message when success is false.
error?: string;
// URL and title of the page, read after the action ran (or after it failed).
pageState: { url: string; title: string };
}
// Browser instance shared by every session; null until ensureBrowser launches it.
let browser: Browser | null = null;
// Open pages keyed by session id; createSession adds entries and closeSession removes them.
const sessions = new Map<string, Page>();
// Launch that is currently in progress, if any. Callers that arrive while the
// browser is still starting share this promise instead of launching their own.
let pendingLaunch: Promise<Browser> | null = null;

/**
 * Return the shared browser, launching it when none exists or the previous
 * one has disconnected.
 *
 * Concurrent callers share a single launch: without this, every caller that
 * arrived before the first launch resolved would start its own Chromium
 * process and all but the last would be orphaned.
 *
 * @returns The connected shared browser instance.
 * @throws Whatever `puppeteer.launch` rejects with; a failed launch is not
 *   cached, so the next call retries.
 */
async function ensureBrowser(): Promise<Browser> {
  if (browser && browser.isConnected()) {
    return browser;
  }

  if (!pendingLaunch) {
    pendingLaunch = puppeteer
      .launch({
        headless: true,
        args: ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"],
      })
      .then((launched) => {
        browser = launched;
        return launched;
      })
      .finally(() => {
        // Clear on success and failure alike so a later disconnect (or a
        // failed launch) triggers a fresh attempt.
        pendingLaunch = null;
      });
  }

  return pendingLaunch;
}
/**
 * Open a new page in the shared browser and register it as a session.
 *
 * The page gets a 1280x800 viewport, a desktop user agent, and request
 * interception that drops image, font and media requests. The session
 * metadata is stored in Redis under `browser:session:<id>` with a one-hour
 * TTL, and the session is closed automatically 30 minutes after creation.
 *
 * @returns Metadata for the newly created session.
 * @throws If the browser cannot be started, the page cannot be configured, or
 *   the Redis write fails. In the latter two cases the page is closed and
 *   unregistered before the error propagates, so nothing leaks.
 */
export async function createSession(): Promise<BrowserSession> {
  const b = await ensureBrowser();
  const page = await b.newPage();
  const id = `bs-${randomBytes(6).toString("hex")}`;
  const now = new Date().toISOString();

  const session: BrowserSession = {
    id,
    status: "active",
    currentUrl: "about:blank",
    pageTitle: "",
    createdAt: now,
    lastActionAt: now,
    screenshotCount: 0,
  };

  try {
    // Set reasonable defaults
    await page.setViewport({ width: 1280, height: 800 });
    await page.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36");

    // Block unnecessary resources for speed
    await page.setRequestInterception(true);
    page.on("request", (req) => {
      const blocked = ["image", "font", "media"].includes(req.resourceType());
      const resolution = blocked ? req.abort() : req.continue();
      // abort()/continue() reject when the request was already handled or the
      // page went away; that is harmless here and must not surface as an
      // unhandled rejection.
      resolution.catch(() => {});
    });

    sessions.set(id, page);
    await redis.setex(`browser:session:${id}`, 3600, JSON.stringify(session));
  } catch (error) {
    // Setup failed part-way: release the page so it does not linger in the browser.
    sessions.delete(id);
    await page.close().catch(() => {});
    throw error;
  }

  // Auto-close 30 minutes after creation. This timer is never reset, so it is
  // a hard lifetime limit rather than an idle timeout.
  setTimeout(() => {
    closeSession(id).catch((error: unknown) => {
      console.error(`Failed to auto-close browser session ${id}:`, error);
    });
  }, 1800000);

  return session;
}
// Read the page's current URL and title without ever throwing.
async function readPageState(page: Page): Promise<ActionResult["pageState"]> {
  const url = page.url();
  let title = "";
  try {
    title = await page.title();
  } catch {
    // The title cannot be read mid-navigation or once the page has closed;
    // report an empty title rather than failing the whole action sequence.
  }
  return { url, title };
}

// Return a required string field of an action, or throw a descriptive error when it is absent.
function requireActionField(
  action: BrowserAction,
  field: "selector" | "value" | "url"
): string {
  const fieldValue = action[field];
  if (fieldValue == null) {
    throw new Error(`Action "${action.type}" requires "${field}"`);
  }
  return fieldValue;
}

// Perform one action on the page and return any payload it produced (extract data or a screenshot).
async function runAction(
  page: Page,
  action: BrowserAction
): Promise<Pick<ActionResult, "data" | "screenshot">> {
  switch (action.type) {
    case "navigate":
      await page.goto(requireActionField(action, "url"), { waitUntil: "networkidle2", timeout: 15000 });
      return {};

    case "click": {
      const selector = requireActionField(action, "selector");
      await page.waitForSelector(selector, { timeout: 5000 });
      await page.click(selector);
      // A click may or may not trigger network traffic; timing out here is not an error.
      await page.waitForNetworkIdle({ timeout: 3000 }).catch(() => {});
      return {};
    }

    case "type": {
      const selector = requireActionField(action, "selector");
      const text = requireActionField(action, "value");
      await page.waitForSelector(selector, { timeout: 5000 });
      await page.click(selector, { clickCount: 3 }); // select all first
      await page.type(selector, text, { delay: 50 });
      return {};
    }

    case "select": {
      const selector = requireActionField(action, "selector");
      const option = requireActionField(action, "value");
      // Wait for the element like "click" and "type" do, so a late-rendered dropdown still works.
      await page.waitForSelector(selector, { timeout: 5000 });
      await page.select(selector, option);
      return {};
    }

    case "screenshot": {
      // Ask Puppeteer for base64 directly: the raw return type is a Buffer in
      // older releases and a Uint8Array in newer ones, and only the former
      // supports toString("base64").
      const base64 = await page.screenshot({ type: "png", fullPage: false, encoding: "base64" });
      return { screenshot: base64 };
    }

    case "extract": {
      const extracted: Record<string, unknown> = {};
      if (action.extractSchema) {
        for (const [key, selector] of Object.entries(action.extractSchema)) {
          // A selector that cannot be evaluated yields null for its key instead of failing the action.
          extracted[key] = await page
            .$$eval(selector, (els) => els.map((el) => el.textContent?.trim()))
            .catch(() => null);
        }
      } else {
        // Extract full page text
        extracted.text = await page.evaluate(() => document.body.innerText);
      }
      return { data: extracted };
    }

    case "scroll": {
      const pixels = action.value ? Number.parseInt(action.value, 10) : 500;
      if (!Number.isFinite(pixels)) {
        throw new Error(`Invalid scroll amount: "${action.value}"`);
      }
      await page.evaluate((px) => window.scrollBy(0, px), pixels);
      return {};
    }

    case "wait": {
      // `??` so that an explicit 0 is honoured instead of becoming the default.
      const delayMs = action.waitMs ?? 1000;
      await new Promise((resolve) => setTimeout(resolve, delayMs));
      return {};
    }

    default:
      // Reachable at runtime when actions come from untyped input (e.g. parsed JSON).
      throw new Error("Unknown action type");
  }
}

/**
 * Execute a sequence of browser actions against a session's page.
 *
 * Actions run in order. A failing action is recorded with `success: false`
 * and its error message, and the remaining actions still run. Every result
 * carries the page URL and title read after that action.
 *
 * NOTE(review): this function does not write to Redis, so the stored session
 * record (currentUrl, lastActionAt, screenshotCount) is not refreshed here.
 *
 * @param sessionId Id returned by createSession.
 * @param actions Steps to perform, in order.
 * @returns One result per action, in the same order as `actions`.
 * @throws Error("Session not found") when no page is registered for `sessionId`.
 */
export async function executeActions(
  sessionId: string,
  actions: BrowserAction[]
): Promise<ActionResult[]> {
  const page = sessions.get(sessionId);
  if (!page) throw new Error("Session not found");

  const results: ActionResult[] = [];
  for (const action of actions) {
    try {
      const payload = await runAction(page, action);
      results.push({
        success: true,
        action: action.type,
        ...payload,
        pageState: await readPageState(page),
      });
    } catch (error: unknown) {
      results.push({
        success: false,
        action: action.type,
        error: error instanceof Error ? error.message : String(error),
        pageState: await readPageState(page),
      });
    }
  }
  return results;
}
/**
 * Summarize the page's interactive elements for AI agent decision-making.
 *
 * Queries the DOM for links, buttons, form controls and elements with
 * `role="button"` or an `onclick` attribute, and describes the first 100.
 * Each entry carries a CSS selector that can be passed back to
 * executeActions: elements with an id use an escaped `#id` selector, and all
 * others are tagged in the page with a `data-auto-id` attribute so that the
 * returned `[data-auto-id="N"]` selector actually matches them.
 *
 * Side effect: writes `data-auto-id` attributes into the page's DOM, after
 * removing any left over from a previous call.
 *
 * @param sessionId Id returned by createSession.
 * @returns The page URL, title, and up to 100 interactive element descriptors.
 * @throws Error("Session not found") when no page is registered for `sessionId`.
 */
export async function getPageState(sessionId: string): Promise<{
  url: string; title: string;
  interactiveElements: Array<{ selector: string; type: string; text: string; visible: boolean }>;
}> {
  const page = sessions.get(sessionId);
  if (!page) throw new Error("Session not found");

  // Everything inside this callback runs in the browser, so it must not
  // reference anything from the enclosing Node scope.
  const elements = await page.evaluate(() => {
    const marker = "data-auto-id";

    // Remove markers from an earlier snapshot so an index never matches two elements.
    document.querySelectorAll(`[${marker}]`).forEach((el) => el.removeAttribute(marker));

    const interactive = document.querySelectorAll(
      "a, button, input, select, textarea, [role='button'], [onclick]"
    );

    return Array.from(interactive).slice(0, 100).map((el, i) => {
      let selector: string;
      if (el.id) {
        // Escape so ids containing characters such as ":" or "." or a leading digit stay valid.
        selector = `#${CSS.escape(el.id)}`;
      } else {
        el.setAttribute(marker, String(i));
        selector = `[${marker}="${i}"]`;
      }

      // Trim each candidate before falling back, so whitespace-only text
      // content does not hide a placeholder or aria-label.
      const label =
        [el.textContent, el.getAttribute("placeholder"), el.getAttribute("aria-label")]
          .map((candidate) => (candidate ?? "").trim())
          .find((candidate) => candidate.length > 0) ?? "";

      const rect = el.getBoundingClientRect();
      return {
        selector,
        type: el.tagName.toLowerCase(),
        text: label.slice(0, 100),
        visible: rect.width > 0 && rect.height > 0,
      };
    });
  });

  return { url: page.url(), title: await page.title(), interactiveElements: elements };
}
/**
 * Tear down a session: close its page (if one is registered), drop it from
 * the in-memory registry, and delete its Redis record.
 *
 * Errors from closing the page are ignored; the Redis delete is attempted
 * whether or not a page was registered.
 *
 * @param sessionId Id of the session to close.
 */
export async function closeSession(sessionId: string): Promise<void> {
  const redisKey = `browser:session:${sessionId}`;
  const openPage = sessions.get(sessionId);

  if (openPage !== undefined) {
    try {
      await openPage.close();
    } catch {
      // Best effort: the page may already be gone.
    }
    sessions.delete(sessionId);
  }

  await redis.del(redisKey);
}
Results
- AI agents browse the web — agent opens competitor website, navigates to pricing page, extracts prices into structured data; 2 hours of analyst work → 3 minutes automated
- Legacy systems automated — HR form that requires 15 clicks and 3 page loads filled automatically; agent types, clicks, selects from dropdowns; no API needed
- Self-healing selectors — when UI changes break a selector, agent takes screenshot, identifies the element visually, and adapts; 80% fewer broken automations
- Screenshot evidence — every action captured as screenshot; audit trail shows exactly what the agent saw and did; compliance team approves
- Session management — each session is closed automatically 30 minutes after it is created; resource cleanup prevents memory leaks; concurrent sessions capped at browser limits