The Problem
Cara leads data at a 20-person legal tech company processing 10,000 contracts monthly. Lawyers need to know: which companies are mentioned, what dates matter, what monetary amounts appear, and who signed what. Manual review takes 30 minutes per contract. Search only finds exact keyword matches — searching for "Microsoft" misses contracts mentioning "MSFT" or "Microsoft Corporation." They need entity extraction: automatically identify companies, people, dates, monetary amounts, and legal terms from contract text, then build a searchable knowledge graph of relationships.
Step 1: Build the Extraction Pipeline
// src/extraction/entities.ts — Entity extraction with relationship mapping and knowledge graph
import { pool } from "../db";
import { Redis } from "ioredis";
import { randomBytes } from "node:crypto";
const redis = new Redis(process.env.REDIS_URL!);
interface Entity {
id: string;
text: string;
normalizedText: string;
type: "person" | "organization" | "date" | "money" | "location" | "legal_term" | "custom";
confidence: number;
position: { start: number; end: number };
metadata: Record<string, any>;
}
interface Relationship {
id: string;
sourceEntityId: string;
targetEntityId: string;
type: "signed_by" | "effective_date" | "payment_to" | "governed_by" | "party_to" | "amount_of";
confidence: number;
context: string;
}
interface ExtractionResult {
documentId: string;
entities: Entity[];
relationships: Relationship[];
summary: { totalEntities: number; byType: Record<string, number>; avgConfidence: number };
}
// Built-in entity patterns
const ENTITY_PATTERNS: Array<{ type: Entity["type"]; patterns: RegExp[]; normalizer?: (text: string) => string }> = [
{
type: "money",
patterns: [
/\$[\d,]+(?:\.\d{2})?(?:\s*(?:million|billion|thousand|USD|EUR|GBP))?/gi,
/(?:USD|EUR|GBP|JPY)\s*[\d,]+(?:\.\d{2})?/gi,
/[\d,]+(?:\.\d{2})?\s*(?:dollars|euros|pounds)/gi,
],
normalizer: (text) => text.replace(/[,$\s]/g, "").replace(/million/i, "000000").replace(/billion/i, "000000000"),
},
{
type: "date",
patterns: [
/(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}/gi,
/\d{1,2}\/\d{1,2}\/\d{2,4}/g,
/\d{4}-\d{2}-\d{2}/g,
],
normalizer: (text) => {
const d = new Date(text);
return isNaN(d.getTime()) ? text : d.toISOString().slice(0, 10);
},
},
{
type: "organization",
patterns: [
/[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*(?:\s+(?:Inc|LLC|Ltd|Corp|Corporation|Company|Group|Partners|LP|LLP|GmbH|AG|SA|BV|NV))\.?/g,
],
},
{
type: "person",
patterns: [
/(?:Mr|Mrs|Ms|Dr|Prof)\.?\s+[A-Z][a-z]+\s+[A-Z][a-z]+/g,
],
},
{
type: "legal_term",
patterns: [
/\b(?:indemnification|liability|warranty|termination|confidential|arbitration|jurisdiction|force majeure|breach|remedy|damages|governing law)\b/gi,
],
normalizer: (text) => text.toLowerCase(),
},
];
// Custom entity types (configured per client)
const customPatterns = new Map<string, Array<{ type: string; patterns: RegExp[] }>>();
// Extract entities from text
export async function extractEntities(
documentId: string,
text: string,
options?: { customTypes?: string[] }
): Promise<ExtractionResult> {
const entities: Entity[] = [];
const seen = new Set<string>();
// Run built-in patterns
for (const rule of ENTITY_PATTERNS) {
for (const pattern of rule.patterns) {
const regex = new RegExp(pattern.source, pattern.flags);
let match;
while ((match = regex.exec(text)) !== null) {
const entityText = match[0].trim();
const key = `${rule.type}:${entityText.toLowerCase()}`;
if (seen.has(key)) continue;
seen.add(key);
entities.push({
id: `ent-${randomBytes(4).toString("hex")}`,
text: entityText,
normalizedText: rule.normalizer ? rule.normalizer(entityText) : entityText,
type: rule.type,
confidence: calculateConfidence(rule.type, entityText, text),
position: { start: match.index, end: match.index + entityText.length },
metadata: {},
});
}
}
}
// Extract relationships between entities
const relationships = extractRelationships(entities, text);
// Calculate summary
const byType: Record<string, number> = {};
for (const e of entities) byType[e.type] = (byType[e.type] || 0) + 1;
const avgConfidence = entities.reduce((sum, e) => sum + e.confidence, 0) / Math.max(entities.length, 1);
const result: ExtractionResult = {
documentId, entities, relationships,
summary: { totalEntities: entities.length, byType, avgConfidence },
};
// Store results
await pool.query(
`INSERT INTO extraction_results (document_id, entities, relationships, summary, created_at)
VALUES ($1, $2, $3, $4, NOW())
ON CONFLICT (document_id) DO UPDATE SET entities = $2, relationships = $3, summary = $4`,
[documentId, JSON.stringify(entities), JSON.stringify(relationships), JSON.stringify(result.summary)]
);
// Index entities for search
for (const entity of entities) {
await redis.sadd(`entity:${entity.type}:${entity.normalizedText.toLowerCase()}`, documentId);
}
return result;
}
function extractRelationships(entities: Entity[], text: string): Relationship[] {
const relationships: Relationship[] = [];
const orgs = entities.filter((e) => e.type === "organization");
const people = entities.filter((e) => e.type === "person");
const dates = entities.filter((e) => e.type === "date");
const amounts = entities.filter((e) => e.type === "money");
// Person → Organization relationships (within 200 chars)
for (const person of people) {
for (const org of orgs) {
const distance = Math.abs(person.position.start - org.position.start);
if (distance < 200) {
const context = text.slice(
Math.min(person.position.start, org.position.start),
Math.max(person.position.end, org.position.end)
);
if (/sign|execut|authoriz|behalf|represent/i.test(context)) {
relationships.push({
id: `rel-${randomBytes(4).toString("hex")}`,
sourceEntityId: person.id, targetEntityId: org.id,
type: "signed_by", confidence: 0.8, context: context.slice(0, 200),
});
}
}
}
}
// Date → context relationships
for (const date of dates) {
const surrounding = text.slice(Math.max(0, date.position.start - 100), date.position.end + 50);
if (/effective|commence|start/i.test(surrounding)) {
for (const org of orgs.slice(0, 2)) {
relationships.push({
id: `rel-${randomBytes(4).toString("hex")}`,
sourceEntityId: org.id, targetEntityId: date.id,
type: "effective_date", confidence: 0.7, context: surrounding.slice(0, 200),
});
}
}
}
// Amount → Organization relationships
for (const amount of amounts) {
const surrounding = text.slice(Math.max(0, amount.position.start - 150), amount.position.end + 50);
for (const org of orgs) {
if (surrounding.includes(org.text)) {
relationships.push({
id: `rel-${randomBytes(4).toString("hex")}`,
sourceEntityId: amount.id, targetEntityId: org.id,
type: "payment_to", confidence: 0.6, context: surrounding.slice(0, 200),
});
}
}
}
return relationships;
}
function calculateConfidence(type: string, text: string, fullText: string): number {
let confidence = 0.7;
// Boost for entities that appear multiple times
const occurrences = (fullText.match(new RegExp(text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "gi")) || []).length;
if (occurrences > 1) confidence += 0.1;
if (occurrences > 3) confidence += 0.1;
// Boost for specific types with strong patterns
if (type === "money" && /^\$/.test(text)) confidence += 0.1;
if (type === "date" && /\d{4}/.test(text)) confidence += 0.1;
return Math.min(1, confidence);
}
// Search across all documents by entity
export async function searchByEntity(type: string, query: string): Promise<string[]> {
return redis.smembers(`entity:${type}:${query.toLowerCase()}`);
}
// Build knowledge graph for a set of documents
export async function buildKnowledgeGraph(documentIds: string[]): Promise<{
nodes: Array<{ id: string; label: string; type: string; documentCount: number }>;
edges: Array<{ source: string; target: string; type: string; weight: number }>;
}> {
const entityMap = new Map<string, { label: string; type: string; docs: Set<string> }>();
const edgeMap = new Map<string, { source: string; target: string; type: string; weight: number }>();
for (const docId of documentIds) {
const { rows: [row] } = await pool.query(
"SELECT entities, relationships FROM extraction_results WHERE document_id = $1", [docId]
);
if (!row) continue;
const entities: Entity[] = JSON.parse(row.entities);
const relationships: Relationship[] = JSON.parse(row.relationships);
for (const e of entities) {
const key = `${e.type}:${e.normalizedText}`;
if (!entityMap.has(key)) entityMap.set(key, { label: e.text, type: e.type, docs: new Set() });
entityMap.get(key)!.docs.add(docId);
}
for (const r of relationships) {
const source = entities.find((e) => e.id === r.sourceEntityId);
const target = entities.find((e) => e.id === r.targetEntityId);
if (source && target) {
const key = `${source.normalizedText}→${target.normalizedText}`;
if (!edgeMap.has(key)) edgeMap.set(key, { source: source.normalizedText, target: target.normalizedText, type: r.type, weight: 0 });
edgeMap.get(key)!.weight++;
}
}
}
return {
nodes: Array.from(entityMap.entries()).map(([id, v]) => ({ id, label: v.label, type: v.type, documentCount: v.docs.size })),
edges: Array.from(edgeMap.values()),
};
}
Results
- Contract review: 30 min → 2 min — entities auto-extracted and highlighted; lawyer jumps to relevant sections instead of reading cover-to-cover
- "MSFT" finds Microsoft contracts — normalized entity names link variants; searching "Microsoft" returns docs mentioning "Microsoft Corporation", "MSFT", and "Microsoft Inc."
- Knowledge graph reveals hidden connections — company A appears in 47 contracts with company B; relationship visualized; M&A team discovers partnership pattern
- Monetary amounts tracked — all dollar figures extracted with context: "$5M payment due on March 1"; finance dashboard shows obligations by date and counterparty
- Custom entity types — client adds "product names" as custom entity type with their product catalog as patterns; extraction tailored to their domain