Files
davidherran e0399ccf3b
Deploy to VPS / deploy (push) Has been cancelled
feat(i18n): preserve English technical terms in AI translation
Fixes terms like "Radio Frequency" and "solid-state" being mistranslated
into each locale. English is the master language, so brand/technical terms
must stay in English everywhere.

- New src/lib/translationGlossary.ts: a curated PROTECTED_TERMS list plus
  deterministic mask/unmask helpers. Before translation each term is
  replaced with a stable __FLUXTERM_n__ placeholder; after translation the
  placeholders are restored to their canonical English form. Preservation
  is therefore guaranteed, not left to the model's discretion.
- aiTranslator.ts now masks every field before sending, restores every
  field of every locale afterwards, and reinforces the rule in the prompt
  (explicit glossary + "keep tokens byte-for-byte"). A tolerant cleanup
  regex recovers placeholders even if the model adds stray spaces, so a
  mangled token never leaks to the public site.
- Whole-word, case-insensitive matching ("RF" in "surf" is not touched);
  longest terms masked first to avoid overlaps; casing normalised to the
  canonical brand form on restore.
- 4 new golden tests (17 total) cover round-trip, simulated translation,
  whole-word safety, and mangled-token recovery.

To extend: add terms to PROTECTED_TERMS — no other change needed.

Verified: build compiles, TypeScript clean, npm run test:ai 17/17 pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-05 17:15:34 -05:00

225 lines
9.0 KiB
JavaScript

// tests/ai/golden.test.mjs
// -----------------------------------------------------------------------------
// Golden tests for FluxAI hardening + analytics. Uses Node's built-in test
// runner (no new deps). Run with: `node --test tests/ai/golden.test.mjs`.
//
// These don't hit OpenAI — they verify the deterministic pieces of the stack:
// - escapeHtml strips XSS payloads
// - CSRF token issue/verify roundtrip works and rejects tampering
// - File-type detector recognises magic bytes and rejects HTML/JS pretending
//   to be an image
// - Industry detector picks the right label from common B2B phrasings
// - Zod consultation schema accepts well-formed payloads, rejects bad ones
//   (TODO: no Zod test is visible in this file — add one or drop this bullet)
// - Translation glossary masks protected terms and restores them verbatim
// -----------------------------------------------------------------------------
import { test } from "node:test";
import assert from "node:assert/strict";
import { pathToFileURL } from "node:url";
import { resolve } from "node:path";
// Fall back to a fixed secret when the environment does not provide one, so
// the HMAC-based CSRF helpers further down always have a key to sign with.
if (process.env.SESSION_SECRET == null) {
  process.env.SESSION_SECRET = "test-secret-please-replace-with-32-chars-or-more";
}
// Why the helpers below are re-implemented instead of imported: ideally the
// tests would run against the TypeScript sources (to avoid coupling to the
// build output), but Node can't load .ts directly and tsx isn't installed by
// default. So this loader-less .mjs copies the small, pure predicates inline.
// 1. escapeHtml — pulled inline because the source is tiny + pure.
// Lookup table: every character the escaper rewrites, mapped to its entity.
const HTML_ESCAPES = {
  "&": "&amp;",
  "<": "&lt;",
  ">": "&gt;",
  '"': "&quot;",
  "'": "&#39;",
  "/": "&#x2F;",
  "`": "&#x60;",
  "=": "&#x3D;",
};

/**
 * Converts a value to a string with HTML-significant characters replaced by
 * entities.
 *
 * @param {*} v - Value to escape; it is stringified with `String()`.
 * @returns {string} The escaped text, or "" when `v` is null or undefined.
 */
function escapeHtml(v) {
  const isMissing = v === null || v === undefined;
  if (isMissing) {
    return "";
  }
  // Characters without a table entry pass through unchanged.
  const toEntity = (ch) => HTML_ESCAPES[ch] ?? ch;
  return String(v).replace(/[&<>"'`=/]/g, toEntity);
}
// A raw <script> tag must not survive; its angle brackets become entities.
test("escapeHtml: kills <script> injections", () => {
  const escaped = escapeHtml("<script>alert(1)</script>");
  assert.equal(escaped.includes("<script>"), false);
  assert.equal(escaped.includes("&lt;script&gt;"), true);
});

// A double quote that could close an attribute value must be neutralised.
test("escapeHtml: escapes attribute-breakout payloads", () => {
  const escaped = escapeHtml('x" onmouseover="alert(1)');
  assert.equal(escaped.includes('"'), false);
  assert.equal(escaped.includes("&quot;"), true);
});

// Missing values collapse to the empty string rather than "null"/"undefined".
test("escapeHtml: handles null/undefined", () => {
  for (const missing of [null, undefined]) {
    assert.equal(escapeHtml(missing), "");
  }
});
// 2. File-type magic-byte sniffer — synthetic buffers.
/**
 * Checks whether `buf` contains the byte sequence `bytes` at `offset`.
 *
 * @param {ArrayLike<number>} buf - Bytes to inspect.
 * @param {number[]} bytes - Expected byte values.
 * @param {number} [offset=0] - Index in `buf` where the sequence should begin.
 * @returns {boolean} True only when every expected byte matches; false when
 *   `buf` is too short to hold the sequence at that offset.
 */
function startsWith(buf, bytes, offset = 0) {
  if (buf.length < offset + bytes.length) return false;
  for (let i = 0; i < bytes.length; i++) {
    if (buf[offset + i] !== bytes[i]) return false;
  }
  return true;
}

/**
 * Identifies a file's type from its leading magic bytes.
 *
 * @param {Buffer} buf - The first bytes of the file (at least 12 are needed).
 * @returns {("jpeg"|"png"|"gif"|"webp"|"mp4"|"mov"|null)} The detected type,
 *   or null when the buffer is missing, shorter than 12 bytes, or unrecognised.
 */
function detectFileType(buf) {
  if (!buf || buf.length < 12) return null;

  if (startsWith(buf, [0xff, 0xd8, 0xff])) return "jpeg";
  if (startsWith(buf, [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a])) return "png";

  // "GIF8" followed by "9a" or "7a".
  const isGifHeader = startsWith(buf, [0x47, 0x49, 0x46, 0x38]);
  if (isGifHeader && (buf[4] === 0x39 || buf[4] === 0x37) && buf[5] === 0x61) return "gif";

  // "RIFF" at offset 0 and "WEBP" at offset 8.
  if (startsWith(buf, [0x52, 0x49, 0x46, 0x46]) && startsWith(buf, [0x57, 0x45, 0x42, 0x50], 8)) return "webp";

  // "ftyp" at offset 4; the four bytes after it are read as the brand.
  if (startsWith(buf, [0x66, 0x74, 0x79, 0x70], 4)) {
    const brand = buf.subarray(8, 12).toString("ascii");
    if (["isom", "iso2", "mp41", "mp42", "avc1", "M4V ", "M4A ", "dash", "MSNV"].includes(brand)) return "mp4";
    // The brand is always four characters, so QuickTime's "qt" must be
    // compared with BOTH padding spaces. A three-character "qt " literal can
    // never equal a 4-byte slice. The spaces are spelled as \x20 so they
    // cannot be collapsed by tooling.
    if (brand === "qt\x20\x20") return "mov";
  }
  return null;
}
// PNG: the 8-byte signature, zero-padded to 14 bytes.
test("detectFileType: recognises PNG", () => {
  const signature = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]);
  const sample = Buffer.concat([signature, Buffer.alloc(6)]);
  assert.equal(detectFileType(sample), "png");
});

// JPEG: FF D8 FF E0, zero-padded to 13 bytes.
test("detectFileType: recognises JPEG", () => {
  const sample = Buffer.concat([Buffer.from([0xff, 0xd8, 0xff, 0xe0]), Buffer.alloc(9)]);
  assert.equal(detectFileType(sample), "jpeg");
});

// Markup carries no known magic bytes, so the detector must return null.
test("detectFileType: rejects HTML pretending to be PNG", () => {
  const markup = "<html><body><script>alert(1)</script></body></html>";
  assert.equal(detectFileType(Buffer.from(markup)), null);
});

// MP4: 4-byte box size, then "ftyp", then the "isom" brand, then padding.
test("detectFileType: recognises MP4 ftyp box", () => {
  const sample = Buffer.concat([
    Buffer.from([0, 0, 0, 0x20]),
    Buffer.from("ftypisom", "ascii"),
    Buffer.alloc(4),
  ]);
  assert.equal(detectFileType(sample), "mp4");
});
// 3. Industry detector
/**
 * Guesses an industry label from free text using keyword patterns.
 *
 * The text is lower-cased and checked against each rule in order; the first
 * rule whose pattern matches anywhere in the text wins.
 *
 * @param {string} text - Free-form description to classify.
 * @returns {("textile"|"food"|"rubber"|"pharma"|"wood"|"other"|null)} The
 *   label of the first matching rule, or null when no rule matches.
 */
function detectIndustryFromText(text) {
  const haystack = text.toLowerCase();
  // Rule order is significant. NOTE: "kiln" appears in both the wood and the
  // ceramic rules; because wood is checked first, a text mentioning a kiln
  // resolves to "wood" unless an earlier rule matches.
  const rules = [
    [/text|fabric|dye|stenter|finishing|yarn/, "textile"],
    [/food|defrost|bak|pasteuriz|tempering|cook/, "food"],
    [/rubber|latex|vulcaniz|foam|tyre|tire/, "rubber"],
    [/pharma|cannabis|drug|api\b|lab/, "pharma"],
    [/wood|timber|lumber|kiln/, "wood"],
    [/ceramic|kiln|clay/, "other"],
  ];
  for (const [pattern, label] of rules) {
    if (pattern.test(haystack)) {
      return label;
    }
  }
  return null;
}
// Table of [test name, input text, expected label]; one test is registered
// per row, in order.
const INDUSTRY_CASES = [
  ["industry detector: textile process picks textile", "We dry fabric after dyeing in a stenter", "textile"],
  ["industry detector: food defrosting picks food", "We defrost meat blocks for processing", "food"],
  ["industry detector: returns null when no industry is mentioned", "Tell me a joke about engineers", null],
];
for (const [title, phrase, expected] of INDUSTRY_CASES) {
  test(title, () => {
    assert.equal(detectIndustryFromText(phrase), expected);
  });
}
// 4. CSRF token — re-implements the verifier so tests don't need a TS loader.
import { createHmac, randomBytes, timingSafeEqual } from "node:crypto";
// Maximum accepted token age: one hour, in milliseconds.
const CSRF_TTL_MS = 60 * 60 * 1000;

/**
 * Signs `payload` with HMAC-SHA256, keyed by the SESSION_SECRET environment
 * variable (read on every call).
 *
 * @param {string} payload - Text to sign.
 * @returns {string} The digest, base64url-encoded.
 */
function hmac(payload) {
  const key = Buffer.from(process.env.SESSION_SECRET, "utf8");
  const signer = createHmac("sha256", key);
  signer.update(payload);
  return signer.digest("base64url");
}

/**
 * Creates a token of the form `<nonce>.<issuedAtMs>.<signature>`.
 *
 * @returns {string} A token whose signature covers `<nonce>.<issuedAtMs>`.
 */
function issueCsrfToken() {
  const nonce = randomBytes(16).toString("base64url");
  const body = [nonce, Date.now()].join(".");
  return [body, hmac(body)].join(".");
}

/**
 * Validates a token produced by `issueCsrfToken`.
 *
 * @param {*} token - Candidate token; falsy values are rejected outright.
 * @returns {boolean} True when the token has three non-empty dot-separated
 *   parts, a finite timestamp no older than CSRF_TTL_MS, and a signature that
 *   matches a freshly computed one.
 */
function verifyCsrfToken(token) {
  if (!token) return false;

  const segments = String(token).split(".");
  if (segments.length !== 3) return false;
  if (segments.some((segment) => !segment)) return false;

  const [nonce, stamp, mac] = segments;
  const issuedAt = Number(stamp);
  if (!Number.isFinite(issuedAt)) return false;

  const age = Date.now() - issuedAt;
  if (age > CSRF_TTL_MS) return false;

  const given = Buffer.from(mac);
  const wanted = Buffer.from(hmac(`${nonce}.${stamp}`));
  // timingSafeEqual throws on unequal lengths, so compare lengths first.
  return given.length === wanted.length && timingSafeEqual(given, wanted);
}
// A token straight from the issuer must verify.
test("CSRF: fresh token verifies", () => {
  assert.equal(verifyCsrfToken(issueCsrfToken()), true);
});

// Changing the final signature character must invalidate the token.
test("CSRF: tampered token fails", () => {
  const token = issueCsrfToken();
  const swappedLast = token.endsWith("A") ? "B" : "A";
  const forged = `${token.slice(0, -1)}${swappedLast}`;
  assert.equal(verifyCsrfToken(forged), false);
});

// Malformed, empty and null inputs are all rejected.
test("CSRF: garbage rejected", () => {
  for (const junk of ["not-a-token", "", null]) {
    assert.equal(verifyCsrfToken(junk), false);
  }
});
// 5. Translation glossary — protected technical terms must survive masking.
// Re-implements the mask/unmask predicates so the test needs no TS loader;
// keep PROTECTED_TERMS in sync with src/lib/translationGlossary.ts.
// Terms that must stay in English. A term's index in this array is the number
// embedded in its placeholder, so the order must not change.
const PROTECTED_TERMS = [
  "FLUX",
  "Inside Flux",
  "Radio Frequency",
  "solid-state",
  "solid state",
  "RF",
  "MHz",
  "GHz",
  "kHz",
  "kWh",
  "kW",
  "MW",
];

/** Builds the placeholder token for the term at index `i`. */
const ph = (i) => `__FLUXTERM_${i}__`;

/** Escapes regex metacharacters so `s` can be embedded in a pattern literally. */
const escapeRe = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

// Terms paired with their original index, longest first, so a multi-word term
// is masked before any shorter term it contains.
const ORDERED = PROTECTED_TERMS
  .map((term, idx) => ({ term, idx }))
  .sort((left, right) => right.term.length - left.term.length);

/**
 * Replaces every whole-word, case-insensitive occurrence of a protected term
 * with that term's placeholder.
 *
 * @param {string} text - Text to mask.
 * @returns {string} The text with protected terms swapped for placeholders.
 */
function maskTerms(text) {
  return ORDERED.reduce((masked, { term, idx }) => {
    const wholeWord = new RegExp(`\\b${escapeRe(term)}\\b`, "gi");
    return masked.replace(wholeWord, ph(idx));
  }, text);
}

/**
 * Restores placeholders to their canonical terms.
 *
 * Exact placeholders are replaced first; a tolerant pattern then catches
 * tokens with stray whitespace or a missing underscore. A token whose number
 * has no matching term is replaced with the empty string.
 *
 * @param {string} text - Text that may contain placeholders.
 * @returns {string} The text with placeholders replaced by protected terms.
 */
function unmaskTerms(text) {
  let restored = text;
  PROTECTED_TERMS.forEach((term, i) => {
    // A function replacer inserts the term literally.
    restored = restored.replaceAll(ph(i), () => term);
  });
  return restored.replace(
    /_{1,2}\s*FLUXTERM\s*_?\s*(\d+)\s*_{1,2}/gi,
    (_match, digits) => PROTECTED_TERMS[Number(digits)] ?? "",
  );
}
// Round trip: the term disappears while masked and comes back unchanged.
test("glossary: Radio Frequency is masked then restored verbatim", () => {
  const sentence = "Radio Frequency drying is efficient.";
  const masked = maskTerms(sentence);
  assert.equal(masked.includes("Radio Frequency"), false);
  assert.equal(masked.includes("__FLUXTERM_"), true);
  assert.equal(unmaskTerms(masked), sentence);
});

// Mask the English text, "translate" the surrounding words to Italian while
// leaving the tokens alone, then unmask.
test("glossary: simulated translation preserves the English term", () => {
  const masked = maskTerms("Our Radio Frequency systems use solid-state technology.");
  // Stand-in for the model: swap each English fragment once, in order.
  const swaps = [
    ["Our", "I nostri"],
    ["systems use", "sistemi usano"],
    ["technology.", "tecnologia."],
  ];
  const fakeItalian = swaps.reduce((draft, [english, italian]) => draft.replace(english, italian), masked);
  const restored = unmaskTerms(fakeItalian);
  assert.equal(restored.includes("Radio Frequency"), true);
  assert.equal(restored.includes("solid-state"), true);
  assert.equal(restored.includes("__FLUXTERM_"), false);
});

// "surf" ends in the letters "rf" but must stay untouched.
test("glossary: RF whole-word only, not inside other words", () => {
  const sentence = "surf the RF spectrum";
  const masked = maskTerms(sentence);
  assert.equal(masked.includes("surf"), true);
  assert.equal(unmaskTerms(masked), sentence);
});

// Simulates a model that inserted stray spaces around the token.
test("glossary: tolerant cleanup recovers a mangled placeholder", () => {
  const mangled = "La __ FLUXTERM_2 __ è efficiente.";
  const recovered = unmaskTerms(mangled);
  assert.equal(recovered.includes("Radio Frequency"), true);
  assert.equal(recovered.includes("FLUXTERM"), false);
});
// Log where this test module was loaded from. `import.meta.url` is already an
// absolute file: URL, so it is printed as-is. Passing it through
// path.resolve() would treat the URL string as a relative path and prefix the
// working directory, yielding a bogus "file:///<cwd>/file:/..." value.
console.log("Golden tests file resolved at:", import.meta.url);