From a81ee50ed8200c3363c7f250cefff8460d09c9fc Mon Sep 17 00:00:00 2001
From: DavidHerran <davidherran@dreamhousestudios.co>
Date: Tue, 9 Jun 2026 23:07:38 -0500
Subject: [PATCH] feat(resilience): operational hardening (NEXT phase of the
 audit)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Acts on the audit's NEXT block — operational resilience.

Backups (N1):
- New `backup` compose service (postgres:16-alpine) runs scripts/backup-loop.sh:
  immediate pg_dump on start, then one every BACKUP_INTERVAL_SECONDS (default
  24h, counted from container start), gzip, 14-day rotation into ./backups on
  the host. Configurable via BACKUP_RETENTION_DAYS / BACKUP_INTERVAL_SECONDS.
  (Offsite copy is the documented next step.)

Resource limits + healthchecks (N2):
- deploy.resources.limits.memory on postgres (2g), app (1500m), nginx (256m),
  backup (256m) so no container can starve the others (the Nginx outage was a
  reminder).
- Nginx now has a healthcheck hitting a new self-served `/nginx-health`
  endpoint on the default_server (no upstream dependency).

Chat resilience (N3):
- buildSystemPrompt() wraps its 4 Prisma queries in try/catch with safe
  defaults — if Postgres is down the assistant degrades instead of 500-ing.
- Result is cached for 60s (only on healthy builds) so we don't run 4 queries
  per message; CMS edits still appear within the TTL.
- POST fails fast with 503 if OPENAI_API_KEY is missing (instead of breaking
  mid-stream after headers are sent).
- streamText gets an onError handler that logs + persists an `error` AiEvent.

Idempotent submissions (N4):
- consultation/route.ts and operations.ts now wrap the email-tracking UPDATE
  in try/catch — the lead/signal is already saved, so a telemetry hiccup can't
  500 the request and trigger a duplicate retry. operations.ts also returns
  emailError.

Performance (N5):
- Index GlobalNode(application, isActive) — backs the case-study join on every
  application page. Migration 20260609130000_index_globalnode_application.

Verified: next build compiles (Docker parity, SESSION_SECRET unset),
TypeScript clean, prisma schema valid, golden tests 17/17,
`docker compose config` valid.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .gitignore                                    |  1 +
 docker-compose.yml                            | 50 ++++++++++++
 nginx/conf.d/flux.conf                        |  5 ++
 .../migration.sql                             | 10 +++
 prisma/schema.prisma                          |  3 +
 scripts/backup-loop.sh                        | 15 ++++
 scripts/db-backup.sh                          | 31 ++++++++
 src/app/actions/operations.ts                 | 26 ++++---
 src/app/api/chat/route.ts                     | 76 +++++++++++++++----
 src/app/api/consultation/route.ts             | 22 ++++--
 10 files changed, 208 insertions(+), 31 deletions(-)
 create mode 100644 prisma/migrations/20260609130000_index_globalnode_application/migration.sql
 create mode 100755 scripts/backup-loop.sh
 create mode 100755 scripts/db-backup.sh

diff --git a/.gitignore b/.gitignore
index ca41af4..5435e84 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,3 +56,4 @@ public/branding/
 # Local Claude Code / MCP config — agent-specific, not project
 .mcp.json
 .claude/
+backups/
diff --git a/docker-compose.yml b/docker-compose.yml
index 64195f5..8554af7 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -17,6 +17,12 @@ services:
       - pgdata:/var/lib/postgresql/data
     networks:
       - flux-net
+    # Resource caps so no single container can starve the others (the Nginx
+    # outage earlier was a reminder). VPS has ~11 GB; these leave headroom.
+    deploy:
+      resources:
+        limits:
+          memory: 2g
     healthcheck:
       test: ["CMD-SHELL", "pg_isready -U ${DB_USER} -d ${DB_NAME}"]
       interval: 5s
@@ -81,6 +87,10 @@ services:
       - flux-net
     expose:
       - "3000"
+    deploy:
+      resources:
+        limits:
+          memory: 1500m
     healthcheck:
       test:
         - CMD-SHELL
@@ -114,6 +124,46 @@ services:
       - app
     networks:
       - flux-net
+    deploy:
+      resources:
+        limits:
+          memory: 256m
+    healthcheck:
+      # Nginx self-health (served directly by the default_server, no upstream).
+      test: ["CMD-SHELL", "wget -q -O /dev/null http://127.0.0.1/nginx-health || exit 1"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+      start_period: 10s
+
+  # ── Automated Postgres backups ──
+  # pg_dump -> gzip into ./backups on the host (on start, then every 24h by default), 14-day rotation.
+  # NOTE: this is LOCAL to the VPS. Offsite copy (S3/rsync) is the recommended
+  # next step once the client provides storage credentials.
+  backup:
+    image: postgres:16-alpine
+    restart: always
+    depends_on:
+      postgres:
+        condition: service_healthy
+    environment:
+      DB_USER: ${DB_USER}
+      DB_PASSWORD: ${DB_PASSWORD}
+      DB_NAME: ${DB_NAME}
+      BACKUP_DIR: /backups
+      RETENTION_DAYS: ${BACKUP_RETENTION_DAYS:-14}
+      BACKUP_INTERVAL_SECONDS: ${BACKUP_INTERVAL_SECONDS:-86400}
+    volumes:
+      - ./backups:/backups
+      - ./scripts/db-backup.sh:/usr/local/bin/db-backup.sh:ro
+      - ./scripts/backup-loop.sh:/usr/local/bin/backup-loop.sh:ro
+    entrypoint: ["/bin/sh", "/usr/local/bin/backup-loop.sh"]
+    networks:
+      - flux-net
+    deploy:
+      resources:
+        limits:
+          memory: 256m
 
 volumes:
   pgdata:
diff --git a/nginx/conf.d/flux.conf b/nginx/conf.d/flux.conf
index 84e71d2..f934378 100644
--- a/nginx/conf.d/flux.conf
+++ b/nginx/conf.d/flux.conf
@@ -22,6 +22,11 @@ server {
     listen 80 default_server;
     server_name _;
 
+    # Nginx self-health endpoint (served directly, no upstream) — used by the
+    # docker-compose healthcheck. Reachable on 127.0.0.1 inside the container
+    # (no Host match needed, so it lands here on the default_server).
+    location = /nginx-health { return 200 "ok\n"; access_log off; }
+
     # Keep ACME HTTP-01 working so certbot can still renew on any host.
     location /.well-known/acme-challenge/ { root /var/www/certbot; }
 
diff --git a/prisma/migrations/20260609130000_index_globalnode_application/migration.sql b/prisma/migrations/20260609130000_index_globalnode_application/migration.sql
new file mode 100644
index 0000000..52d5c2f
--- /dev/null
+++ b/prisma/migrations/20260609130000_index_globalnode_application/migration.sql
@@ -0,0 +1,10 @@
+-- ─────────────────────────────────────────────────────────────────────────
+-- ADDITIVE MIGRATION — index GlobalNode(application, isActive).
+-- The application detail page queries case studies by application slug +
+-- isActive (the GlobalNode.application -> Application.slug join). Without an
+-- index this is a full table scan on every application page render.
+-- Idempotent. Safe for `migrate deploy`.
+-- ─────────────────────────────────────────────────────────────────────────
+
+CREATE INDEX IF NOT EXISTS "GlobalNode_application_isActive_idx"
+  ON "GlobalNode" ("application", "isActive");
diff --git a/prisma/schema.prisma b/prisma/schema.prisma
index 0b92ca1..bf3d87e 100644
--- a/prisma/schema.prisma
+++ b/prisma/schema.prisma
@@ -64,6 +64,9 @@ model GlobalNode {
   @@index([isActive])
   @@index([nodeType])
   @@index([nodeType, isActive])
+  // Case studies on an application page filter by application slug + isActive
+  // (src/app/[locale]/applications/[slug]/page.tsx). Back this join with an index.
+  @@index([application, isActive])
 }
 
 // ------------------------------------------------------
diff --git a/scripts/backup-loop.sh b/scripts/backup-loop.sh
new file mode 100755
index 0000000..ddf1a5c
--- /dev/null
+++ b/scripts/backup-loop.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+# ─────────────────────────────────────────────────────────────────────────────
+# Backup service entrypoint. Runs one backup immediately on start, then loops
+# every BACKUP_INTERVAL_SECONDS (default 24h). A loop (vs cron) inherits the
+# container environment cleanly and survives restarts without lost schedules.
+# ─────────────────────────────────────────────────────────────────────────────
+set -eu
+trap 'exit 0' TERM INT  # entrypoint shell: without a trap, the TERM sent by `docker stop` is not acted on
+
+INTERVAL="${BACKUP_INTERVAL_SECONDS:-86400}"
+echo "[backup] service started; interval=${INTERVAL}s, retention=${RETENTION_DAYS:-14}d"
+while true; do
+  /usr/local/bin/db-backup.sh || echo "[backup] cycle failed; will retry next interval"
+  sleep "$INTERVAL" & wait $!  # background + wait so the trap can fire mid-sleep
+done
diff --git a/scripts/db-backup.sh b/scripts/db-backup.sh
new file mode 100755
index 0000000..912225a
--- /dev/null
+++ b/scripts/db-backup.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+# ─────────────────────────────────────────────────────────────────────────────
+# Single Postgres backup: pg_dump (gzip-compressed) -> N-day rotation.
+# Run by scripts/backup-loop.sh inside the `backup` compose service.
+# Env: DB_USER, DB_PASSWORD, DB_NAME, BACKUP_DIR, RETENTION_DAYS
+# ─────────────────────────────────────────────────────────────────────────────
+set -eu
+
+BACKUP_DIR="${BACKUP_DIR:-/backups}"
+RETENTION_DAYS="${RETENTION_DAYS:-14}"
+TS=$(date -u +%Y%m%d_%H%M%S)
+OUT="${BACKUP_DIR}/flux_db_${TS}.sql.gz"
+
+mkdir -p "$BACKUP_DIR"
+export PGPASSWORD="$DB_PASSWORD"
+echo "[backup] $(date -u +%Y-%m-%dT%H:%M:%SZ) starting pg_dump -> ${OUT}"
+
+# pg_dump writes the gzip file itself (-Z 9 -f): in `pg_dump | gzip` the `if`
+# only saw gzip's exit status. --no-owner/--no-privileges keep restores portable.
+if pg_dump -h postgres -U "$DB_USER" -d "$DB_NAME" --no-owner --no-privileges -Z 9 -f "$OUT"; then
+  SIZE=$(du -h "$OUT" | cut -f1)
+  echo "[backup] OK: ${OUT} (${SIZE})"
+else
+  echo "[backup] FAILED: pg_dump returned non-zero; removing partial file"
+  rm -f "$OUT"
+  exit 1
+fi
+
+# Rotation — drop dumps older than RETENTION_DAYS.
+DELETED=$(find "$BACKUP_DIR" -name 'flux_db_*.sql.gz' -mtime +"$RETENTION_DAYS" -print -delete 2>/dev/null | wc -l || echo 0)
+echo "[backup] rotation: kept last ${RETENTION_DAYS} days, pruned ${DELETED} old dump(s)"
diff --git a/src/app/actions/operations.ts b/src/app/actions/operations.ts
index ef1e026..4105334 100644
--- a/src/app/actions/operations.ts
+++ b/src/app/actions/operations.ts
@@ -92,17 +92,23 @@ export async function submitOperationsSignal(payload: {
       replyTo: payload.clientEmail,
     });
 
-    // Track email delivery in DB
-    await prisma.operationsSignal.update({
-      where: { id: signal.id },
-      data: {
-        emailSentTo: emailResult.sentTo.join(", "),
-        emailSentAt: emailResult.sentAt,
-        emailError: emailResult.error,
-      },
-    });
+    // Track email delivery — best-effort. The signal (lead) is already saved,
+    // so a telemetry-update hiccup must NOT fail the request and make the
+    // client retry into a duplicate.
+    try {
+      await prisma.operationsSignal.update({
+        where: { id: signal.id },
+        data: {
+          emailSentTo: emailResult.sentTo.join(", "),
+          emailSentAt: emailResult.sentAt,
+          emailError: emailResult.error,
+        },
+      });
+    } catch (trackErr) {
+      console.warn("[operations] email tracking update failed (lead already saved):", trackErr);
+    }
 
-    return { success: true, ticketId, emailSent: emailResult.success };
+    return { success: true, ticketId, emailSent: emailResult.success, emailError: emailResult.error };
   } catch (error) {
     console.error("Error submitting signal:", error);
     return { error: "Failed to submit request. Please try again." };
diff --git a/src/app/api/chat/route.ts b/src/app/api/chat/route.ts
index 5e3d2ef..51bddca 100644
--- a/src/app/api/chat/route.ts
+++ b/src/app/api/chat/route.ts
@@ -39,22 +39,43 @@ const COMPARISON_DATA: Record<string, { rf: number; traditional: number; unit: s
 // ─── DYNAMIC SYSTEM PROMPT BUILDER ──────────────────────────────
 // Injects real-time database context so the AI knows what exists
 
+// Cache the built prompt briefly so we don't run 4 DB queries on every single
+// chat message. CMS changes appear within the TTL. Only healthy builds are
+// cached, so a transient DB outage retries on the next message.
+let _promptCache: { value: string; at: number } | null = null;
+const SYSTEM_PROMPT_TTL_MS = 60_000;
+
 async function buildSystemPrompt(): Promise<string> {
-  // Query real data from Prisma
-  const [activeApps, installationCount, eventCount, partsCount] = await Promise.all([
-    prisma.application.findMany({
-      where: { isActive: true },
-      select: { slug: true, title: true, shortDescription: true, category: true },
-      orderBy: { title: 'asc' },
-    }),
-    prisma.globalNode.count({ where: { nodeType: 'installation', isActive: true } }),
-    prisma.globalNode.count({ where: { nodeType: 'event', isActive: true } }),
-    prisma.sparePart.count({ where: { isActive: true } }),
-  ]);
+  if (_promptCache && Date.now() - _promptCache.at < SYSTEM_PROMPT_TTL_MS) {
+    return _promptCache.value;
+  }
 
-  const appList = activeApps.map((a: any) => `  - ${a.title} (slug: "${a.slug}", category: ${a.category})`).join('\n');
+  // Live DB context. If Postgres is unreachable, fall back to safe defaults so
+  // the assistant still answers (degraded) instead of 500-ing the whole chat.
+  let activeApps: Array<{ slug: string; title: string; shortDescription: string; category: string }> = [];
+  let installationCount = 0, eventCount = 0, partsCount = 0;
+  let dbOk = true;
+  try {
+    [activeApps, installationCount, eventCount, partsCount] = await Promise.all([
+      prisma.application.findMany({
+        where: { isActive: true },
+        select: { slug: true, title: true, shortDescription: true, category: true },
+        orderBy: { title: 'asc' },
+      }),
+      prisma.globalNode.count({ where: { nodeType: 'installation', isActive: true } }),
+      prisma.globalNode.count({ where: { nodeType: 'event', isActive: true } }),
+      prisma.sparePart.count({ where: { isActive: true } }),
+    ]);
+  } catch (e) {
+    dbOk = false;
+    log.warn('chat.system_prompt_db_unavailable', { err: String(e) });
+  }
 
-  return `You are "FluxAI", the intelligent engineering advisor and sales specialist for FLUX Srl — a world leader in solid-state Radio Frequency (RF), Microwave, and Infrared industrial equipment. Founded by Patrizio Grando with 40+ years of legacy. Headquarters: Romano d'Ezzelino, Vicenza, Italy.
+  const appList = activeApps.length
+    ? activeApps.map((a) => `  - ${a.title} (slug: "${a.slug}", category: ${a.category})`).join('\n')
+    : '  (live catalog temporarily unavailable — describe FLUX applications from general RF knowledge)';
+
+  const prompt = `You are "FluxAI", the intelligent engineering advisor and sales specialist for FLUX Srl — a world leader in solid-state Radio Frequency (RF), Microwave, and Infrared industrial equipment. Founded by Patrizio Grando with 40+ years of legacy. Headquarters: Romano d'Ezzelino, Vicenza, Italy.
 
 PERSONALITY:
 - Senior RF engineer who also understands business ROI.
@@ -143,6 +164,10 @@ PROACTIVE NEXT STEPS (always suggest the next logical action):
   comparison → "Let me quantify the difference for your specific operation..." → energy_savings_calculator
 
 LANGUAGE: Respond in the exact same language the user writes in.`;
+
+  // Only cache a healthy build so a transient DB outage retries next message.
+  if (dbOk) _promptCache = { value: prompt, at: Date.now() };
+  return prompt;
 }
 
 // ─── HELPER: Parse JSON safely ──────────────────────────────────
@@ -198,6 +223,17 @@ export async function POST(req: Request) {
     );
   }
 
+  // ─── Fail fast if the AI provider isn't configured ─────────────
+  // Without this, a missing/invalid key surfaces mid-stream after headers
+  // are already sent, producing a confusing broken response.
+  if (!process.env.OPENAI_API_KEY) {
+    log.error("chat.openai_key_missing", new Error("OPENAI_API_KEY is not set"));
+    return new Response(
+      JSON.stringify({ error: "The AI assistant is temporarily unavailable. Please try again later." }),
+      { status: 503, headers: { "Content-Type": "application/json" } },
+    );
+  }
+
   const {
     messages,
     context,
@@ -287,6 +323,20 @@ export async function POST(req: Request) {
     system: systemPrompt + contextNote,
     messages: coreMessages,
     providerOptions: { openai: { promptCacheKey: 'fluxai-v1' } },
+    // Log streaming/provider errors (OpenAI 429/500, bad key) and best-effort persist them to the
+    // conversation timeline. The message is capped BEFORE JSON.stringify so payloadJson stays valid JSON.
+    onError: ({ error }) => {
+      log.error("chat.stream_error", error, { conversationId: conversationId ?? undefined });
+      if (conversationId) {
+        prisma.aiEvent.create({
+          data: {
+            conversationId,
+            type: "error",
+            payloadJson: JSON.stringify({ message: (error instanceof Error ? error.message : String(error)).slice(0, 2000) }),
+          },
+        }).catch((persistErr) => log.warn("chat.stream_error_persist_failed", { err: String(persistErr) }));
+      }
+    },
     onFinish: async ({ usage, toolCalls, toolResults }) => {
       if (!conversationId) return;
       try {
diff --git a/src/app/api/consultation/route.ts b/src/app/api/consultation/route.ts
index 9e9b9b0..c87f75f 100644
--- a/src/app/api/consultation/route.ts
+++ b/src/app/api/consultation/route.ts
@@ -145,14 +145,20 @@ export async function POST(request: NextRequest) {
       replyTo: contact.email,
     });
 
-    await prisma.operationsSignal.update({
-      where: { id: signal.id },
-      data: {
-        emailSentTo: emailResult.sentTo.join(", "),
-        emailSentAt: emailResult.sentAt,
-        emailError: emailResult.error,
-      },
-    });
+    // Best-effort email tracking — the lead is already saved; never fail the
+    // request (and risk a client retry / duplicate) over a telemetry update.
+    try {
+      await prisma.operationsSignal.update({
+        where: { id: signal.id },
+        data: {
+          emailSentTo: emailResult.sentTo.join(", "),
+          emailSentAt: emailResult.sentAt,
+          emailError: emailResult.error,
+        },
+      });
+    } catch (trackErr) {
+      log.warn("consultation.email_tracking_failed", { ticketId, err: String(trackErr) });
+    }
 
     log.info("consultation.submitted", { ticketId, emailSent: emailResult.success });