Build an Observability Stack with OpenTelemetry

The Problem

Piotr leads SRE at a 45-person company with 15 microservices. When a request is slow, nobody knows which service is the bottleneck. Logs are scattered across services with no correlation. Metrics exist for each service individually, but there's no way to trace a single user request across services. Last week a p99 latency spike took 4 hours to debug because the team checked each service manually. They need distributed tracing that follows requests across services, correlated logs, and unified metrics — all with a single instrumentation standard.

Step 1: Build the OpenTelemetry Instrumentation

typescript

// src/telemetry/setup.ts — OpenTelemetry auto-instrumentation for Node.js
import { NodeSDK } from "@opentelemetry/sdk-node";
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
import { OTLPMetricExporter } from "@opentelemetry/exporter-metrics-otlp-http";
import { PeriodicExportingMetricReader } from "@opentelemetry/sdk-metrics";
import { Resource } from "@opentelemetry/resources";
import { ATTR_SERVICE_NAME, ATTR_SERVICE_VERSION } from "@opentelemetry/semantic-conventions";
import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";
import { W3CTraceContextPropagator } from "@opentelemetry/core";

const SERVICE_NAME = process.env.SERVICE_NAME || "unknown-service";
const OTEL_ENDPOINT = process.env.OTEL_ENDPOINT || "http://localhost:4318";

export function initTelemetry(): NodeSDK {
  const sdk = new NodeSDK({
    resource: new Resource({
      [ATTR_SERVICE_NAME]: SERVICE_NAME,
      [ATTR_SERVICE_VERSION]: process.env.SERVICE_VERSION || "0.0.0",
      "deployment.environment": process.env.NODE_ENV || "development",
    }),

    // Distributed tracing
    traceExporter: new OTLPTraceExporter({
      url: `${OTEL_ENDPOINT}/v1/traces`,
    }),

    // Metrics
    metricReader: new PeriodicExportingMetricReader({
      exporter: new OTLPMetricExporter({
        url: `${OTEL_ENDPOINT}/v1/metrics`,
      }),
      exportIntervalMillis: 15000, // export every 15s
    }),

    // Auto-instrument everything: HTTP, pg, Redis, Express/Hono
    instrumentations: [
      getNodeAutoInstrumentations({
        "@opentelemetry/instrumentation-http": {
          requestHook: (span, request) => {
            span.setAttribute("http.request_id", request.headers?.["x-request-id"] || "");
          },
        },
        "@opentelemetry/instrumentation-pg": {
          enhancedDatabaseReporting: true,
        },
        "@opentelemetry/instrumentation-redis": {
          dbStatementSerializer: (cmdName, cmdArgs) => `${cmdName} ${cmdArgs[0] || ""}`,
        },
      }),
    ],

    textMapPropagator: new W3CTraceContextPropagator(),
  });

  sdk.start();
  console.log(`[Telemetry] OpenTelemetry initialized for ${SERVICE_NAME}`);

  // Graceful shutdown
  process.on("SIGTERM", () => sdk.shutdown());

  return sdk;
}

typescript

// src/telemetry/custom-spans.ts — Custom spans for business logic
import { trace, SpanStatusCode, metrics } from "@opentelemetry/api";

const tracer = trace.getTracer("app");
const meter = metrics.getMeter("app");

// Custom metrics
const orderCounter = meter.createCounter("orders.created", {
  description: "Number of orders created",
});

const orderDuration = meter.createHistogram("orders.processing_duration_ms", {
  description: "Order processing time in milliseconds",
  unit: "ms",
});

const activeUsers = meter.createUpDownCounter("users.active", {
  description: "Currently active users",
});

// Wrap business logic in custom spans
export async function processOrder(orderId: string, userId: string): Promise<any> {
  return tracer.startActiveSpan("process_order", async (span) => {
    span.setAttribute("order.id", orderId);
    span.setAttribute("user.id", userId);

    const startTime = Date.now();

    try {
      // Step 1: Validate inventory
      const inventory = await tracer.startActiveSpan("validate_inventory", async (inventorySpan) => {
        const result = await checkInventory(orderId);
        inventorySpan.setAttribute("inventory.available", result.available);
        inventorySpan.setAttribute("inventory.quantity", result.quantity);
        inventorySpan.end();
        return result;
      });

      if (!inventory.available) {
        span.setStatus({ code: SpanStatusCode.ERROR, message: "Out of stock" });
        span.setAttribute("order.status", "rejected");
        span.end();
        throw new Error("Out of stock");
      }

      // Step 2: Charge payment
      const payment = await tracer.startActiveSpan("charge_payment", async (paymentSpan) => {
        paymentSpan.setAttribute("payment.amount", inventory.total);
        paymentSpan.setAttribute("payment.currency", "USD");
        const result = await chargePayment(userId, inventory.total);
        paymentSpan.setAttribute("payment.id", result.paymentId);
        paymentSpan.end();
        return result;
      });

      // Step 3: Create order record
      await tracer.startActiveSpan("create_order_record", async (dbSpan) => {
        await saveOrder(orderId, userId, payment.paymentId);
        dbSpan.end();
      });

      // Metrics
      orderCounter.add(1, { status: "success", payment_method: "card" });
      orderDuration.record(Date.now() - startTime, { status: "success" });

      span.setAttribute("order.status", "completed");
      span.setStatus({ code: SpanStatusCode.OK });
      span.end();

      return { orderId, status: "completed", paymentId: payment.paymentId };
    } catch (err: any) {
      span.setStatus({ code: SpanStatusCode.ERROR, message: err.message });
      span.recordException(err);
      orderCounter.add(1, { status: "failed" });
      orderDuration.record(Date.now() - startTime, { status: "failed" });
      span.end();
      throw err;
    }
  });
}

// Structured logging with trace correlation
export function log(level: string, message: string, attributes: Record<string, any> = {}): void {
  const activeSpan = trace.getActiveSpan();
  const traceId = activeSpan?.spanContext().traceId || "";
  const spanId = activeSpan?.spanContext().spanId || "";

  const logEntry = {
    timestamp: new Date().toISOString(),
    level,
    message,
    traceId,
    spanId,
    service: process.env.SERVICE_NAME,
    ...attributes,
  };

  console.log(JSON.stringify(logEntry));
}

async function checkInventory(orderId: string) { return { available: true, quantity: 1, total: 99.99 }; }
async function chargePayment(userId: string, amount: number) { return { paymentId: `pay_${Date.now()}` }; }
async function saveOrder(orderId: string, userId: string, paymentId: string) { /* db insert */ }

Step 2: Middleware for Automatic Request Tracing

typescript

// src/middleware/tracing.ts — Request tracing middleware
import { Context, Next } from "hono";
import { trace, SpanStatusCode } from "@opentelemetry/api";

const tracer = trace.getTracer("http");

export function tracingMiddleware() {
  return async (c: Context, next: Next) => {
    const span = trace.getActiveSpan();

    if (span) {
      // Add request attributes
      span.setAttribute("http.user_id", c.get("userId") || "anonymous");
      span.setAttribute("http.route", c.req.routePath || c.req.path);

      // Add trace ID to response headers (for debugging)
      c.header("X-Trace-ID", span.spanContext().traceId);
    }

    try {
      await next();

      if (span && c.res.status >= 400) {
        span.setStatus({ code: SpanStatusCode.ERROR, message: `HTTP ${c.res.status}` });
      }
    } catch (err: any) {
      if (span) {
        span.setStatus({ code: SpanStatusCode.ERROR, message: err.message });
        span.recordException(err);
      }
      throw err;
    }
  };
}

Results

p99 debugging time: 4 hours → 10 minutes — distributed traces show exactly which service and which database query is the bottleneck; one click from slow request to root cause
Logs correlated across 15 services — every log entry includes traceId; searching for one request ID shows logs from all services it touched, in order
Custom business metrics — order processing rate, payment success rate, and duration histograms visible in Grafana; product team monitors without asking engineering
Auto-instrumentation covers 90% of spans — HTTP, PostgreSQL, Redis, and inter-service calls traced automatically; only business logic needs manual spans
Vendor-neutral — OpenTelemetry standard means switching from Jaeger to Tempo or from Prometheus to Datadog requires changing only the exporter config, not instrumentation code

Skills stack · 5 skills

observability-setup

typescript

redis

postgresql

hono

The Problem

Step 1: Build the OpenTelemetry Instrumentation

Step 2: Middleware for Automatic Request Tracing

Results