feat(i18n): preserve English technical terms in AI translation

Fixes terms like "Radio Frequency" and "solid-state" being mistranslated into each locale. English is the master language, so brand/technical terms must stay in English everywhere.

- New src/lib/translationGlossary.ts: a curated PROTECTED_TERMS list plus deterministic mask/unmask helpers. Before translation each term is replaced with a stable __FLUXTERM_n__ placeholder; after translation the placeholders are restored to their canonical English form. Preservation is therefore guaranteed, not left to the model's discretion.
- aiTranslator.ts now masks every field before sending, restores every field of every locale afterwards, and reinforces the rule in the prompt (explicit glossary + "keep tokens byte-for-byte"). A tolerant cleanup regex recovers placeholders even if the model adds stray spaces, so a mangled token never leaks to the public site.
- Whole-word, case-insensitive matching ("RF" in "surf" is not touched); longest terms masked first to avoid overlaps; casing normalised to the canonical brand form on restore.
- 4 new golden tests (17 total) cover round-trip, simulated translation, whole-word safety, and mangled-token recovery.

To extend: add terms to PROTECTED_TERMS — no other change needed.

Verified: build compiles, TypeScript clean, npm run test:ai 17/17 pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-05 17:15:34 -05:00
parent bf8b2aa631
commit e0399ccf3b
3 changed files with 191 additions and 12 deletions
@@ -1,17 +1,34 @@
 import { generateText } from 'ai';
 import { openai } from '@ai-sdk/openai';
+import {
+  maskProtectedTerms,
+  unmaskProtectedTerms,
+  glossaryForPrompt,
+} from '@/lib/translationGlossary';

 /**
 * Motor de traducción impulsado por Vercel AI SDK y OpenAI.
- * Usa generateText para evitar bugs de compatibilidad con Zod.
+ *
+ * English is the master language. Protected technical/brand terms (e.g.
+ * "Radio Frequency", "solid-state", "FLUX") are MASKED with placeholders
+ * before translation and RESTORED to their English form afterwards, so they
+ * are preserved deterministically across every locale — not left to the
+ * model's discretion. See src/lib/translationGlossary.ts.
+ *
 * @param content Objeto con los textos a traducir. Ej: { title: "...", content: "..." }
- * @returns Objeto con los idiomas y sus traducciones
+ * @returns Objeto con los idiomas y sus traducciones, o null on failure.
 */
 export async function translateContentForCMS(content: Record<string, string>) {
  try {
+    // 1. Mask protected terms in every field before sending to the model.
+    const maskedContent: Record<string, string> = {};
+    for (const [key, value] of Object.entries(content)) {
+      maskedContent[key] = maskProtectedTerms(value ?? '');
+    }
+
    const { text } = await generateText({
      model: openai('gpt-4o'),
-      system: `You are an elite technical translator for FLUX, a premium brand of Radio Frequency (RF) industrial machinery.
+      system: `You are an elite technical translator for FLUX, a premium brand of solid-state Radio Frequency (RF) industrial machinery.

      Your task is to translate the user's JSON content into 4 specific locales:
      1. 'it': Standard Professional Italian.
@@ -22,7 +39,9 @@ export async function translateContentForCMS(content: Record<string, string>) {
      CRITICAL RULES:
      - NEVER translate Markdown syntax (#, **, *, >, |---|).
      - NEVER translate URLs, file paths (like /cases/img.jpg), or code blocks.
-      - NEVER translate technical acronyms like "RF", "kW", "MHz", "FLUX".
+      - NEVER translate technical acronyms or units like "RF", "kW", "MHz", "FLUX".
+      - English is the master language. Keep this protected glossary in ENGLISH, untranslated, in every locale: ${glossaryForPrompt()}.
+      - CRITICAL: The text contains placeholder tokens of the form __FLUXTERM_0__, __FLUXTERM_1__, etc. These stand in for protected English terms. Keep every such token EXACTLY as-is, byte for byte. Do not translate, space, reorder the underscores, or alter them in any way. Position them naturally in the translated sentence.
      - Keep the exact same JSON key names as the input.

      OUTPUT FORMAT:
@@ -35,7 +54,7 @@ export async function translateContentForCMS(content: Record<string, string>) {
        "de": { "key1": "translated text..." }
      }`,

-      prompt: JSON.stringify(content),
+      prompt: JSON.stringify(maskedContent),
    });

    // Limpiamos el texto por si GPT-4o decide ponerle "```json" alrededor
@@ -44,6 +63,18 @@ export async function translateContentForCMS(content: Record<string, string>) {
    // Convertimos la respuesta de la IA en un objeto real de Javascript
    const parsedObject = JSON.parse(cleanedText);

+    // 2. Restore protected terms in every translated field of every locale.
+    for (const locale of Object.keys(parsedObject)) {
+      const fields = parsedObject[locale];
+      if (fields && typeof fields === 'object') {
+        for (const key of Object.keys(fields)) {
+          if (typeof fields[key] === 'string') {
+            fields[key] = unmaskProtectedTerms(fields[key]);
+          }
+        }
+      }
+    }
+
    return parsedObject;

  } catch (error) {
@@ -0,0 +1,91 @@
+// src/lib/translationGlossary.ts
+// -----------------------------------------------------------------------------
+// Protected technical terminology for the AI translation engine.
+//
+// English is the master language of the site. Some technical / brand terms
+// must stay in English across ALL locales — translating "Radio Frequency"
+// literally into Venetian or Italian reads wrong. Instead of hoping the LLM
+// obeys a "do not translate" instruction, we MASK these terms with stable
+// placeholders before translation and RESTORE them afterwards. That makes
+// preservation deterministic, not best-effort.
+//
+// To add a term: add it to PROTECTED_TERMS. List order does not affect
+// matching — terms are sorted longest-first before masking.
+// Multi-word terms and hyphenated terms are fine.
+// -----------------------------------------------------------------------------
+
+/**
+ * Terms that must stay in English in every locale.
+ *
+ * A term's position in this array is the `n` in its `__FLUXTERM_n__`
+ * placeholder. Masking and unmasking always happen within one translation
+ * call, so inserting or reordering terms is safe.
+ */
+export const PROTECTED_TERMS: string[] = [
+  // Brand
+  "FLUX",
+  "Inside Flux",
+  // Core technology — the terms that read wrong when translated literally
+  "Radio Frequency",
+  "solid-state",
+  "solid state",
+  "RF",
+  // Units (also covered by the prompt rule, masked here for certainty)
+  "MHz",
+  "GHz",
+  "kHz",
+  "kWh",
+  "kW",
+  "MW",
+];
+
+// Terms whose casing carries meaning. SI prefixes are case-significant
+// ("mW" is milliwatt, "MW" is megawatt), so these are matched exactly as
+// written and never case-normalised. Any protected term NOT listed here is
+// matched case-insensitively and restored in its canonical glossary casing.
+const CASE_SENSITIVE_TERMS: ReadonlySet<string> = new Set([
+  "MHz",
+  "GHz",
+  "kHz",
+  "kWh",
+  "kW",
+  "MW",
+]);
+
+// Stable, ASCII-safe placeholder. Format chosen so a tolerant cleanup regex
+// can still recover the term if the model alters the token slightly.
+const placeholder = (i: number): string => `__FLUXTERM_${i}__`;
+
+/** Escape every regex metacharacter in `s` so it matches literally. */
+function escapeRegExp(s: string): string {
+  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+}
+
+// One precompiled matcher per term, longest term first so that e.g.
+// "Radio Frequency" and "kWh" are masked before "RF" and "kW".
+//
+// Boundaries use explicit lookarounds instead of \b:
+//  - case-insensitive terms must not touch a letter, digit or underscore on
+//    either side ("RF" inside "surf" is left alone);
+//  - case-sensitive unit symbols may directly follow a digit, so "10kW" and
+//    "27.12MHz" are masked as well as "10 kW". \b cannot do this because a
+//    digit and a letter are both word characters.
+// The underscore exclusion also keeps "FLUX" from matching inside an already
+// inserted "__FLUXTERM_n__" placeholder.
+const TERM_MATCHERS = PROTECTED_TERMS
+  .map((term, idx) => ({ term, idx }))
+  .sort((a, b) => b.term.length - a.term.length)
+  .map(({ term, idx }) => {
+    const exactCase = CASE_SENSITIVE_TERMS.has(term);
+    const left = exactCase ? "(?<![\\p{L}_])" : "(?<![\\p{L}\\p{N}_])";
+    const right = "(?![\\p{L}\\p{N}_])";
+    return {
+      idx,
+      re: new RegExp(`${left}${escapeRegExp(term)}${right}`, exactCase ? "gu" : "giu"),
+    };
+  });
+
+// URL- or path-like spans that must pass through byte-for-byte: an absolute
+// URL ("scheme://..."), a "www." host, or a path starting with "/", "./" or
+// "../". The span must begin at the start of the text or right after
+// whitespace, an opening bracket, "<" or a quote, so "RF/microwave" is NOT
+// treated as a path. Masking inside such spans would rewrite their casing on
+// restore (e.g. "/cases/rf-dryer.jpg" -> "/cases/RF-dryer.jpg").
+const LITERAL_SPAN =
+  /(?<![^\s(\[<"'])(?:[a-z][a-z0-9+.-]*:\/\/|www\.|\.{0,2}\/)[^\s()<>\[\]"']*/gi;
+
+// Placeholder variants the model may produce instead of the exact token:
+//  1. underscores on both sides, optionally backslash-escaped and/or spaced:
+//     "__ FLUXTERM_2 __", "\_\_FLUXTERM\_2\_\_", "__fluxterm_2__";
+//  2. some or all underscores dropped, no stray spaces: "FLUXTERM_2",
+//     "__FLUXTERM_2". Trailing whitespace is deliberately not consumed here so
+//     a following Markdown "_emphasis_" is not swallowed.
+const MANGLED_PLACEHOLDER =
+  /(?:\\?_\s*){1,2}FLUXTERM(?:\s|\\?_)*(\d+)(?:\s*\\?_){1,2}|(?:\\?_){0,2}FLUXTERM(?:\s|\\?_)*(\d+)(?:\\?_){0,2}/gi;
+
+/** Mask every protected term inside one plain-text segment (no URL/path spans). */
+function maskSegment(segment: string): string {
+  let out = segment;
+  for (const { idx, re } of TERM_MATCHERS) {
+    out = out.replace(re, placeholder(idx));
+  }
+  return out;
+}
+
+/**
+ * Replace every protected term with its `__FLUXTERM_n__` placeholder.
+ *
+ * - Brand/technology terms match case-insensitively and come back in their
+ *   canonical glossary casing after `unmaskProtectedTerms`.
+ * - Unit symbols match case-sensitively, also when glued to a number, so
+ *   mask -> unmask is an exact round trip for them.
+ * - URL- and path-like spans are copied through untouched.
+ *
+ * @param text Source text (English). Empty input is returned unchanged.
+ * @returns The text with protected terms replaced by placeholders.
+ */
+export function maskProtectedTerms(text: string): string {
+  if (!text) return text;
+  let out = "";
+  let cursor = 0;
+  for (const span of text.matchAll(LITERAL_SPAN)) {
+    const start = span.index ?? 0;
+    // Mask the prose before the span, then append the span verbatim.
+    out += maskSegment(text.slice(cursor, start)) + span[0];
+    cursor = start + span[0].length;
+  }
+  return out + maskSegment(text.slice(cursor));
+}
+
+/**
+ * Restore placeholders to their canonical English term.
+ *
+ * Exact tokens are restored first; a tolerant pass then recovers tokens the
+ * model altered (stray spaces, escaped or dropped underscores, changed
+ * casing). A placeholder whose index is outside PROTECTED_TERMS is replaced
+ * with an empty string so the raw token never reaches the public site.
+ *
+ * @param text Translated text possibly containing placeholders.
+ * @returns The text with placeholders replaced by protected terms.
+ */
+export function unmaskProtectedTerms(text: string): string {
+  if (!text) return text;
+  let out = text;
+  // Exact restore first.
+  for (const [i, term] of PROTECTED_TERMS.entries()) {
+    out = out.split(placeholder(i)).join(term);
+  }
+  // Tolerant cleanup for anything the model altered.
+  return out.replace(
+    MANGLED_PLACEHOLDER,
+    (_m: string, spaced: string | undefined, bare: string | undefined) =>
+      PROTECTED_TERMS[Number(spaced ?? bare)] ?? "",
+  );
+}
+
+/** Comma-separated list for embedding in the translation prompt as reinforcement. */
+export function glossaryForPrompt(): string {
+  return PROTECTED_TERMS.join(", ");
+}
@@ -164,4 +164,61 @@ test("CSRF: garbage rejected", () => {
  assert.equal(verifyCsrfToken(null), false);
 });

+// 5. Translation glossary — protected technical terms must survive masking.
+// Re-implements the mask/unmask predicates so the test needs no TS loader;
+// keep PROTECTED_TERMS in sync with src/lib/translationGlossary.ts.
+// NOTE: because the helpers below are local copies, these tests exercise the
+// copies, not the functions exported by src/lib/translationGlossary.ts. A
+// change made to only one side will not be detected here.
+const PROTECTED_TERMS = [
+  "FLUX", "Inside Flux", "Radio Frequency", "solid-state", "solid state",
+  "RF", "MHz", "GHz", "kHz", "kWh", "kW", "MW",
+];
+// Placeholder for the term at index i of PROTECTED_TERMS.
+const ph = (i) => `__FLUXTERM_${i}__`;
+// Escapes regex metacharacters so a term is matched literally.
+const escapeRe = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+// Terms paired with their original index, sorted by descending length so
+// longer terms are masked before shorter ones.
+const ORDERED = PROTECTED_TERMS.map((term, idx) => ({ term, idx })).sort((a, b) => b.term.length - a.term.length);
+// Replaces each term (whole-word via \b, case-insensitive) with its placeholder.
+function maskTerms(text) {
+  let out = text;
+  for (const { term, idx } of ORDERED) out = out.replace(new RegExp(`\\b${escapeRe(term)}\\b`, "gi"), ph(idx));
+  return out;
+}
+// Restores exact placeholders first, then placeholders with stray spaces or
+// changed casing. An index with no matching term is replaced with "".
+function unmaskTerms(text) {
+  let out = text;
+  for (let i = 0; i < PROTECTED_TERMS.length; i++) out = out.split(ph(i)).join(PROTECTED_TERMS[i]);
+  out = out.replace(/_{1,2}\s*FLUXTERM\s*_?\s*(\d+)\s*_{1,2}/gi, (_m, n) => PROTECTED_TERMS[Number(n)] ?? "");
+  return out;
+}
+
+// Round trip: the term disappears from the masked text and returns unchanged.
+test("glossary: Radio Frequency is masked then restored verbatim", () => {
+  const masked = maskTerms("Radio Frequency drying is efficient.");
+  assert.ok(!masked.includes("Radio Frequency"));
+  assert.ok(masked.includes("__FLUXTERM_"));
+  assert.equal(unmaskTerms(masked), "Radio Frequency drying is efficient.");
+});
+
+// Simulates a model that translates the prose but leaves the tokens intact.
+test("glossary: simulated translation preserves the English term", () => {
+  // Mask EN -> 'translate' the surrounding words to IT, keep token -> unmask.
+  const masked = maskTerms("Our Radio Frequency systems use solid-state technology.");
+  // Pretend the model translated everything except the tokens:
+  const fakeItalian = masked
+    .replace("Our", "I nostri")
+    .replace("systems use", "sistemi usano")
+    .replace("technology.", "tecnologia.");
+  const restored = unmaskTerms(fakeItalian);
+  assert.ok(restored.includes("Radio Frequency"));
+  assert.ok(restored.includes("solid-state"));
+  assert.ok(!restored.includes("__FLUXTERM_"));
+});
+
+// "rf" embedded in a longer word must not be treated as the term "RF".
+test("glossary: RF whole-word only, not inside other words", () => {
+  const masked = maskTerms("surf the RF spectrum");
+  // "RF" masked, "surf" untouched
+  assert.ok(masked.includes("surf"));
+  assert.equal(unmaskTerms(masked), "surf the RF spectrum");
+});
+
+// Index 2 of PROTECTED_TERMS is "Radio Frequency".
+test("glossary: tolerant cleanup recovers a mangled placeholder", () => {
+  // Model inserted stray spaces around the token.
+  const recovered = unmaskTerms("La __ FLUXTERM_2 __ è efficiente.");
+  assert.ok(recovered.includes("Radio Frequency"));
+  assert.ok(!recovered.includes("FLUXTERM"));
+});
+
 console.log("Golden tests file resolved at:", pathToFileURL(resolve(import.meta.url)).href);