From a81ee50ed8200c3363c7f250cefff8460d09c9fc Mon Sep 17 00:00:00 2001 From: DavidHerran Date: Tue, 9 Jun 2026 23:07:38 -0500 Subject: [PATCH] feat(resilience): operational hardening (NEXT phase of the audit) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Acts on the audit's NEXT block — operational resilience. Backups (N1): - New `backup` compose service (postgres:16-alpine) runs scripts/backup-loop.sh: immediate pg_dump on start, then nightly, gzip, 14-day rotation into ./backups on the host. Configurable via BACKUP_RETENTION_DAYS / BACKUP_INTERVAL_SECONDS. (Offsite copy is the documented next step.) Resource limits + healthchecks (N2): - deploy.resources.limits.memory on postgres (2g), app (1500m), nginx (256m), backup (256m) so no container can starve the others (the Nginx outage was a reminder). - Nginx now has a healthcheck hitting a new self-served `/nginx-health` endpoint on the default_server (no upstream dependency). Chat resilience (N3): - buildSystemPrompt() wraps its 4 Prisma queries in try/catch with safe defaults — if Postgres is down the assistant degrades instead of 500-ing. - Result is cached for 60s (only on healthy builds) so we don't run 4 queries per message; CMS edits still appear within the TTL. - POST fails fast with 503 if OPENAI_API_KEY is missing (instead of breaking mid-stream after headers are sent). - streamText gets an onError handler that logs + persists an `error` AiEvent. Idempotent submissions (N4): - consultation/route.ts and operations.ts now wrap the email-tracking UPDATE in try/catch — the lead/signal is already saved, so a telemetry hiccup can't 500 the request and trigger a duplicate retry. operations.ts also returns emailError. Performance (N5): - Index GlobalNode(application, isActive) — backs the case-study join on every application page. Migration 20260609130000_index_globalnode_application. 
Verified: next build compiles (Docker parity, SESSION_SECRET unset), TypeScript clean, prisma schema valid, golden tests 17/17, `docker compose config` valid. Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitignore | 1 + docker-compose.yml | 50 ++++++++++++ nginx/conf.d/flux.conf | 5 ++ .../migration.sql | 10 +++ prisma/schema.prisma | 3 + scripts/backup-loop.sh | 15 ++++ scripts/db-backup.sh | 31 ++++++++ src/app/actions/operations.ts | 26 ++++--- src/app/api/chat/route.ts | 76 +++++++++++++++---- src/app/api/consultation/route.ts | 22 ++++-- 10 files changed, 208 insertions(+), 31 deletions(-) create mode 100644 prisma/migrations/20260609130000_index_globalnode_application/migration.sql create mode 100755 scripts/backup-loop.sh create mode 100755 scripts/db-backup.sh diff --git a/.gitignore b/.gitignore index ca41af4..5435e84 100644 --- a/.gitignore +++ b/.gitignore @@ -56,3 +56,4 @@ public/branding/ # Local Claude Code / MCP config — agent-specific, not project .mcp.json .claude/ +backups/ diff --git a/docker-compose.yml b/docker-compose.yml index 64195f5..8554af7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,6 +17,12 @@ services: - pgdata:/var/lib/postgresql/data networks: - flux-net + # Resource caps so no single container can starve the others (the Nginx + # outage earlier was a reminder). VPS has ~11 GB; these leave headroom. + deploy: + resources: + limits: + memory: 2g healthcheck: test: ["CMD-SHELL", "pg_isready -U ${DB_USER} -d ${DB_NAME}"] interval: 5s @@ -81,6 +87,10 @@ services: - flux-net expose: - "3000" + deploy: + resources: + limits: + memory: 1500m healthcheck: test: - CMD-SHELL @@ -114,6 +124,46 @@ services: - app networks: - flux-net + deploy: + resources: + limits: + memory: 256m + healthcheck: + # Nginx self-health (served directly by the default_server, no upstream). 
+ test: ["CMD-SHELL", "wget -q -O /dev/null http://127.0.0.1/nginx-health || exit 1"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 10s + + # ── Automated Postgres backups ── + # Nightly pg_dump -> gzip into ./backups on the host, 14-day rotation. + # NOTE: this is LOCAL to the VPS. Offsite copy (S3/rsync) is the recommended + # next step once the client provides storage credentials. + backup: + image: postgres:16-alpine + restart: always + depends_on: + postgres: + condition: service_healthy + environment: + DB_USER: ${DB_USER} + DB_PASSWORD: ${DB_PASSWORD} + DB_NAME: ${DB_NAME} + BACKUP_DIR: /backups + RETENTION_DAYS: ${BACKUP_RETENTION_DAYS:-14} + BACKUP_INTERVAL_SECONDS: ${BACKUP_INTERVAL_SECONDS:-86400} + volumes: + - ./backups:/backups + - ./scripts/db-backup.sh:/usr/local/bin/db-backup.sh:ro + - ./scripts/backup-loop.sh:/usr/local/bin/backup-loop.sh:ro + entrypoint: ["/bin/sh", "/usr/local/bin/backup-loop.sh"] + networks: + - flux-net + deploy: + resources: + limits: + memory: 256m volumes: pgdata: diff --git a/nginx/conf.d/flux.conf b/nginx/conf.d/flux.conf index 84e71d2..f934378 100644 --- a/nginx/conf.d/flux.conf +++ b/nginx/conf.d/flux.conf @@ -22,6 +22,11 @@ server { listen 80 default_server; server_name _; + # Nginx self-health endpoint (served directly, no upstream) — used by the + # docker-compose healthcheck. Reachable on 127.0.0.1 inside the container + # (no Host match needed, so it lands here on the default_server). + location = /nginx-health { return 200 "ok\n"; access_log off; } + # Keep ACME HTTP-01 working so certbot can still renew on any host. 
location /.well-known/acme-challenge/ { root /var/www/certbot; } diff --git a/prisma/migrations/20260609130000_index_globalnode_application/migration.sql b/prisma/migrations/20260609130000_index_globalnode_application/migration.sql new file mode 100644 index 0000000..52d5c2f --- /dev/null +++ b/prisma/migrations/20260609130000_index_globalnode_application/migration.sql @@ -0,0 +1,10 @@ +-- ───────────────────────────────────────────────────────────────────────── +-- ADDITIVE MIGRATION — index GlobalNode(application, isActive). +-- The application detail page queries case studies by application slug + +-- isActive (the GlobalNode.application -> Application.slug join). Without an +-- index this is a full table scan on every application page render. +-- Idempotent. Safe for `migrate deploy`. +-- ───────────────────────────────────────────────────────────────────────── + +CREATE INDEX IF NOT EXISTS "GlobalNode_application_isActive_idx" + ON "GlobalNode" ("application", "isActive"); diff --git a/prisma/schema.prisma b/prisma/schema.prisma index 0b92ca1..bf3d87e 100644 --- a/prisma/schema.prisma +++ b/prisma/schema.prisma @@ -64,6 +64,9 @@ model GlobalNode { @@index([isActive]) @@index([nodeType]) @@index([nodeType, isActive]) + // Case studies on an application page filter by application slug + isActive + // (src/app/[locale]/applications/[slug]/page.tsx). Back this join with an index. + @@index([application, isActive]) } // ------------------------------------------------------ diff --git a/scripts/backup-loop.sh b/scripts/backup-loop.sh new file mode 100755 index 0000000..ddf1a5c --- /dev/null +++ b/scripts/backup-loop.sh @@ -0,0 +1,15 @@ +#!/bin/sh +# ───────────────────────────────────────────────────────────────────────────── +# Backup service entrypoint. Runs one backup immediately on start, then loops +# every BACKUP_INTERVAL_SECONDS (default 24h). A loop (vs cron) inherits the +# container environment cleanly and survives restarts without lost schedules. 
+# ───────────────────────────────────────────────────────────────────────────── +set -eu + +INTERVAL="${BACKUP_INTERVAL_SECONDS:-86400}" +echo "[backup] service started; interval=${INTERVAL}s, retention=${RETENTION_DAYS:-14}d" + +while true; do + /usr/local/bin/db-backup.sh || echo "[backup] cycle failed; will retry next interval" + sleep "$INTERVAL" +done diff --git a/scripts/db-backup.sh b/scripts/db-backup.sh new file mode 100755 index 0000000..912225a --- /dev/null +++ b/scripts/db-backup.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# ───────────────────────────────────────────────────────────────────────────── +# Single Postgres backup: pg_dump -> gzip -> N-day rotation. +# Run by scripts/backup-loop.sh inside the `backup` compose service. +# Env: DB_USER, DB_PASSWORD, DB_NAME, BACKUP_DIR, RETENTION_DAYS +# ───────────────────────────────────────────────────────────────────────────── +set -eu + +BACKUP_DIR="${BACKUP_DIR:-/backups}" +RETENTION_DAYS="${RETENTION_DAYS:-14}" +TS=$(date -u +%Y%m%d_%H%M%S) +OUT="${BACKUP_DIR}/flux_db_${TS}.sql.gz" + +mkdir -p "$BACKUP_DIR" +export PGPASSWORD="$DB_PASSWORD" + +echo "[backup] $(date -u +%Y-%m-%dT%H:%M:%SZ) starting pg_dump -> ${OUT}" + +# --no-owner/--no-privileges keep the dump portable across roles on restore. -Z 9 -f makes pg_dump gzip and write the file itself, so the status tested below is pg_dump's own (a "pg_dump | gzip" pipeline without pipefail reports only gzip's status and would log OK on a failed dump). +if pg_dump -h postgres -U "$DB_USER" -d "$DB_NAME" --no-owner --no-privileges -Z 9 -f "$OUT"; then + SIZE=$(du -h "$OUT" | cut -f1) + echo "[backup] OK: ${OUT} (${SIZE})" +else + echo "[backup] FAILED: pg_dump returned non-zero; removing partial file" + rm -f "$OUT" + exit 1 +fi + +# Rotation — drop dumps older than RETENTION_DAYS. 
+DELETED=$(find "$BACKUP_DIR" -name 'flux_db_*.sql.gz' -mtime +"$RETENTION_DAYS" -print -delete 2>/dev/null | wc -l || echo 0) +echo "[backup] rotation: kept last ${RETENTION_DAYS} days, pruned ${DELETED} old dump(s)" diff --git a/src/app/actions/operations.ts b/src/app/actions/operations.ts index ef1e026..4105334 100644 --- a/src/app/actions/operations.ts +++ b/src/app/actions/operations.ts @@ -92,17 +92,23 @@ export async function submitOperationsSignal(payload: { replyTo: payload.clientEmail, }); - // Track email delivery in DB - await prisma.operationsSignal.update({ - where: { id: signal.id }, - data: { - emailSentTo: emailResult.sentTo.join(", "), - emailSentAt: emailResult.sentAt, - emailError: emailResult.error, - }, - }); + // Track email delivery — best-effort. The signal (lead) is already saved, + // so a telemetry-update hiccup must NOT fail the request and make the + // client retry into a duplicate. + try { + await prisma.operationsSignal.update({ + where: { id: signal.id }, + data: { + emailSentTo: emailResult.sentTo.join(", "), + emailSentAt: emailResult.sentAt, + emailError: emailResult.error, + }, + }); + } catch (trackErr) { + console.warn("[operations] email tracking update failed (lead already saved):", trackErr); + } - return { success: true, ticketId, emailSent: emailResult.success }; + return { success: true, ticketId, emailSent: emailResult.success, emailError: emailResult.error }; } catch (error) { console.error("Error submitting signal:", error); return { error: "Failed to submit request. Please try again." 
}; diff --git a/src/app/api/chat/route.ts b/src/app/api/chat/route.ts index 5e3d2ef..51bddca 100644 --- a/src/app/api/chat/route.ts +++ b/src/app/api/chat/route.ts @@ -39,22 +39,43 @@ const COMPARISON_DATA: Record { - // Query real data from Prisma - const [activeApps, installationCount, eventCount, partsCount] = await Promise.all([ - prisma.application.findMany({ - where: { isActive: true }, - select: { slug: true, title: true, shortDescription: true, category: true }, - orderBy: { title: 'asc' }, - }), - prisma.globalNode.count({ where: { nodeType: 'installation', isActive: true } }), - prisma.globalNode.count({ where: { nodeType: 'event', isActive: true } }), - prisma.sparePart.count({ where: { isActive: true } }), - ]); + if (_promptCache && Date.now() - _promptCache.at < SYSTEM_PROMPT_TTL_MS) { + return _promptCache.value; + } - const appList = activeApps.map((a: any) => ` - ${a.title} (slug: "${a.slug}", category: ${a.category})`).join('\n'); + // Live DB context. If Postgres is unreachable, fall back to safe defaults so + // the assistant still answers (degraded) instead of 500-ing the whole chat. 
+ let activeApps: Array<{ slug: string; title: string; shortDescription: string; category: string }> = []; + let installationCount = 0, eventCount = 0, partsCount = 0; + let dbOk = true; + try { + [activeApps, installationCount, eventCount, partsCount] = await Promise.all([ + prisma.application.findMany({ + where: { isActive: true }, + select: { slug: true, title: true, shortDescription: true, category: true }, + orderBy: { title: 'asc' }, + }), + prisma.globalNode.count({ where: { nodeType: 'installation', isActive: true } }), + prisma.globalNode.count({ where: { nodeType: 'event', isActive: true } }), + prisma.sparePart.count({ where: { isActive: true } }), + ]); + } catch (e) { + dbOk = false; + log.warn('chat.system_prompt_db_unavailable', { err: String(e) }); + } - return `You are "FluxAI", the intelligent engineering advisor and sales specialist for FLUX Srl — a world leader in solid-state Radio Frequency (RF), Microwave, and Infrared industrial equipment. Founded by Patrizio Grando with 40+ years of legacy. Headquarters: Romano d'Ezzelino, Vicenza, Italy. + const appList = activeApps.length + ? activeApps.map((a) => ` - ${a.title} (slug: "${a.slug}", category: ${a.category})`).join('\n') + : ' (live catalog temporarily unavailable — describe FLUX applications from general RF knowledge)'; + + const prompt = `You are "FluxAI", the intelligent engineering advisor and sales specialist for FLUX Srl — a world leader in solid-state Radio Frequency (RF), Microwave, and Infrared industrial equipment. Founded by Patrizio Grando with 40+ years of legacy. Headquarters: Romano d'Ezzelino, Vicenza, Italy. PERSONALITY: - Senior RF engineer who also understands business ROI. @@ -143,6 +164,10 @@ PROACTIVE NEXT STEPS (always suggest the next logical action): comparison → "Let me quantify the difference for your specific operation..." 
→ energy_savings_calculator LANGUAGE: Respond in the exact same language the user writes in.`; + + // Only cache a healthy build so a transient DB outage retries next message. + if (dbOk) _promptCache = { value: prompt, at: Date.now() }; + return prompt; } // ─── HELPER: Parse JSON safely ────────────────────────────────── @@ -198,6 +223,17 @@ export async function POST(req: Request) { ); } + // ─── Fail fast if the AI provider isn't configured ───────────── + // Without this, a missing/invalid key surfaces mid-stream after headers + // are already sent, producing a confusing broken response. + if (!process.env.OPENAI_API_KEY) { + log.error("chat.openai_key_missing", new Error("OPENAI_API_KEY is not set")); + return new Response( + JSON.stringify({ error: "The AI assistant is temporarily unavailable. Please try again later." }), + { status: 503, headers: { "Content-Type": "application/json" } }, + ); + } + const { messages, context, @@ -287,6 +323,20 @@ export async function POST(req: Request) { system: systemPrompt + contextNote, messages: coreMessages, providerOptions: { openai: { promptCacheKey: 'fluxai-v1' } }, + // Surface streaming/provider errors (OpenAI 429/500, bad key) in the logs + // and, when possible, persist them to the conversation timeline. + onError: ({ error }) => { + log.error("chat.stream_error", error, { conversationId: conversationId ?? undefined }); + if (conversationId) { + prisma.aiEvent.create({ + data: { + conversationId, + type: "error", + payloadJson: JSON.stringify({ message: error instanceof Error ? 
error.message : String(error) }).slice(0, 2000), + }, + }).catch(() => {}); + } + }, onFinish: async ({ usage, toolCalls, toolResults }) => { if (!conversationId) return; try { diff --git a/src/app/api/consultation/route.ts b/src/app/api/consultation/route.ts index 9e9b9b0..c87f75f 100644 --- a/src/app/api/consultation/route.ts +++ b/src/app/api/consultation/route.ts @@ -145,14 +145,20 @@ export async function POST(request: NextRequest) { replyTo: contact.email, }); - await prisma.operationsSignal.update({ - where: { id: signal.id }, - data: { - emailSentTo: emailResult.sentTo.join(", "), - emailSentAt: emailResult.sentAt, - emailError: emailResult.error, - }, - }); + // Best-effort email tracking — the lead is already saved; never fail the + // request (and risk a client retry / duplicate) over a telemetry update. + try { + await prisma.operationsSignal.update({ + where: { id: signal.id }, + data: { + emailSentTo: emailResult.sentTo.join(", "), + emailSentAt: emailResult.sentAt, + emailError: emailResult.error, + }, + }); + } catch (trackErr) { + log.warn("consultation.email_tracking_failed", { ticketId, err: String(trackErr) }); + } log.info("consultation.submitted", { ticketId, emailSent: emailResult.success });