From e0399ccf3b433a879d60e74ef2f1045381f2c829 Mon Sep 17 00:00:00 2001 From: DavidHerran Date: Fri, 5 Jun 2026 17:15:34 -0500 Subject: [PATCH] feat(i18n): preserve English technical terms in AI translation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes terms like "Radio Frequency" and "solid-state" being mistranslated into each locale. English is the master language, so brand/technical terms must stay in English everywhere. - New src/lib/translationGlossary.ts: a curated PROTECTED_TERMS list plus deterministic mask/unmask helpers. Before translation each term is replaced with a stable __FLUXTERM_n__ placeholder; after translation the placeholders are restored to their canonical English form. Preservation is therefore guaranteed, not left to the model's discretion. - aiTranslator.ts now masks every field before sending, restores every field of every locale afterwards, and reinforces the rule in the prompt (explicit glossary + "keep tokens byte-for-byte"). A tolerant cleanup regex recovers placeholders even if the model adds stray spaces, so a mangled token never leaks to the public site. - Whole-word, case-insensitive matching ("RF" in "surf" is not touched); longest terms masked first to avoid overlaps; casing normalised to the canonical brand form on restore. - 4 new golden tests (17 total) cover round-trip, simulated translation, whole-word safety, and mangled-token recovery. To extend: add terms to PROTECTED_TERMS in src/lib/translationGlossary.ts and mirror them in the copy of the list kept in tests/ai/golden.test.mjs — no other change needed. Verified: build compiles, TypeScript clean, npm run test:ai 17/17 pass. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/lib/aiTranslator.ts | 55 +++++++++++++++----- src/lib/translationGlossary.ts | 91 ++++++++++++++++++++++++++++++++++ tests/ai/golden.test.mjs | 57 +++++++++++++++++++++ 3 files changed, 191 insertions(+), 12 deletions(-) create mode 100644 src/lib/translationGlossary.ts diff --git a/src/lib/aiTranslator.ts b/src/lib/aiTranslator.ts index 27f127d..ed6512c 100644 --- a/src/lib/aiTranslator.ts +++ b/src/lib/aiTranslator.ts @@ -1,18 +1,35 @@ import { generateText } from 'ai'; import { openai } from '@ai-sdk/openai'; +import { + maskProtectedTerms, + unmaskProtectedTerms, + glossaryForPrompt, +} from '@/lib/translationGlossary'; /** * Motor de traducción impulsado por Vercel AI SDK y OpenAI. - * Usa generateText para evitar bugs de compatibilidad con Zod. + * + * English is the master language. Protected technical/brand terms (e.g. + * "Radio Frequency", "solid-state", "FLUX") are MASKED with placeholders + * before translation and RESTORED to their English form afterwards, so they + * are preserved deterministically across every locale — not left to the + * model's discretion. See src/lib/translationGlossary.ts. + * * @param content Objeto con los textos a traducir. Ej: { title: "...", content: "..." } - * @returns Objeto con los idiomas y sus traducciones + * @returns Objeto con los idiomas y sus traducciones, o null on failure. */ export async function translateContentForCMS(content: Record) { try { + // 1. Mask protected terms in every field before sending to the model. + const maskedContent: Record = {}; + for (const [key, value] of Object.entries(content)) { + maskedContent[key] = maskProtectedTerms(value ?? ''); + } + const { text } = await generateText({ - model: openai('gpt-4o'), - system: `You are an elite technical translator for FLUX, a premium brand of Radio Frequency (RF) industrial machinery. 
- + model: openai('gpt-4o'), + system: `You are an elite technical translator for FLUX, a premium brand of solid-state Radio Frequency (RF) industrial machinery. + Your task is to translate the user's JSON content into 4 specific locales: 1. 'it': Standard Professional Italian. 2. 'vec': Venetian dialect (from Bassano del Grappa). Maintain a proud, industrial, and authentic tone. @@ -22,9 +39,11 @@ export async function translateContentForCMS(content: Record) { CRITICAL RULES: - NEVER translate Markdown syntax (#, **, *, >, |---|). - NEVER translate URLs, file paths (like /cases/img.jpg), or code blocks. - - NEVER translate technical acronyms like "RF", "kW", "MHz", "FLUX". + - NEVER translate technical acronyms or units like "RF", "kW", "MHz", "FLUX". + - English is the master language. Keep this protected glossary in ENGLISH, untranslated, in every locale: ${glossaryForPrompt()}. + - CRITICAL: The text contains placeholder tokens of the form __FLUXTERM_0__, __FLUXTERM_1__, etc. These stand in for protected English terms. Keep every such token EXACTLY as-is, byte for byte. Do not translate, space, reorder the underscores, or alter them in any way. Position them naturally in the translated sentence. - Keep the exact same JSON key names as the input. - + OUTPUT FORMAT: You MUST return ONLY a raw, valid JSON object. Do not wrap it in \`\`\`json blocks. No pleasantries. The output must strictly follow this structure: @@ -34,20 +53,32 @@ export async function translateContentForCMS(content: Record) { "es": { "key1": "translated text..." }, "de": { "key1": "translated text..." } }`, - - prompt: JSON.stringify(content), + + prompt: JSON.stringify(maskedContent), }); // Limpiamos el texto por si GPT-4o decide ponerle "```json" alrededor const cleanedText = text.replace(/```json/g, '').replace(/```/g, '').trim(); - + // Convertimos la respuesta de la IA en un objeto real de Javascript const parsedObject = JSON.parse(cleanedText); - + + // 2. 
Restore protected terms in every translated field of every locale. + for (const locale of Object.keys(parsedObject)) { + const fields = parsedObject[locale]; + if (fields && typeof fields === 'object') { + for (const key of Object.keys(fields)) { + if (typeof fields[key] === 'string') { + fields[key] = unmaskProtectedTerms(fields[key]); + } + } + } + } + return parsedObject; } catch (error) { console.error("Error in AI Translation:", error); return null; } -} \ No newline at end of file +} diff --git a/src/lib/translationGlossary.ts b/src/lib/translationGlossary.ts new file mode 100644 index 0000000..12e5d1c --- /dev/null +++ b/src/lib/translationGlossary.ts @@ -0,0 +1,91 @@ +// src/lib/translationGlossary.ts +// ----------------------------------------------------------------------------- +// Protected technical terminology for the AI translation engine. +// +// English is the master language of the site. Some technical / brand terms +// must stay in English across ALL locales — translating "Radio Frequency" +// literally into Venetian or Italian reads wrong. Instead of hoping the LLM +// obeys a "do not translate" instruction, we MASK these terms with stable +// placeholders before translation and RESTORE them afterwards. That makes +// preservation deterministic, not best-effort. +// +// To add a term: drop it into PROTECTED_TERMS (longest, most specific first). +// Multi-word terms and hyphenated terms are fine. +// ----------------------------------------------------------------------------- + +export const PROTECTED_TERMS: string[] = [ + // Brand + "FLUX", + "Inside Flux", + // Core technology — the terms that read wrong when translated literally + "Radio Frequency", + "solid-state", + "solid state", + "RF", + // Units (also covered by the prompt rule, masked here for certainty) + "MHz", + "GHz", + "kHz", + "kWh", + "kW", + "MW", +]; + +// Stable, ASCII-safe placeholder. LLMs reliably preserve identifier-looking +// tokens like this. 
Format chosen so a tolerant cleanup regex can still +// recover the term even if the model inserts stray spaces/underscores. +const placeholder = (i: number): string => `__FLUXTERM_${i}__`; + +function escapeRegExp(s: string): string { + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); +} + +// Index terms by length desc so longer terms mask first (e.g. "Radio +// Frequency" before "RF"), preventing partial overlaps. +const ORDERED = PROTECTED_TERMS + .map((term, idx) => ({ term, idx })) + .sort((a, b) => b.term.length - a.term.length); + +/** + * Replace every protected term (case-insensitive, whole-word) with its + * placeholder. Returns the masked text. Use the SAME glossary order to + * unmask. Casing is normalised to the canonical glossary form on restore — + * desirable for brand consistency. + */ +export function maskProtectedTerms(text: string): string { + if (!text) return text; + let out = text; + for (const { term, idx } of ORDERED) { + // \b works at alphanumeric boundaries; for hyphenated terms the literal + // hyphen is matched inside the term, boundaries sit on the outer edges. + const re = new RegExp(`\\b${escapeRegExp(term)}\\b`, "gi"); + out = out.replace(re, placeholder(idx)); + } + return out; +} + +/** + * Restore placeholders to their canonical English term. Tolerant of minor + * corruption the model may introduce (stray spaces/underscores around the + * token), so a mangled placeholder never leaks to the public site. + */ +export function unmaskProtectedTerms(text: string): string { + if (!text) return text; + let out = text; + // Exact restore first. + for (let i = 0; i < PROTECTED_TERMS.length; i++) { + out = out.split(placeholder(i)).join(PROTECTED_TERMS[i]); + } + // Tolerant cleanup for any placeholder the model slightly altered, e.g. + // "__ FLUXTERM_2 __" or "__fluxterm_2__". + out = out.replace(/_{1,2}\s*FLUXTERM\s*_?\s*(\d+)\s*_{1,2}/gi, (_m, n) => { + const idx = Number(n); + return PROTECTED_TERMS[idx] ?? 
""; + }); + return out; +} + +/** Comma-separated list for embedding in the translation prompt as reinforcement. */ +export function glossaryForPrompt(): string { + return PROTECTED_TERMS.join(", "); +} diff --git a/tests/ai/golden.test.mjs b/tests/ai/golden.test.mjs index dbde8dc..77840a5 100644 --- a/tests/ai/golden.test.mjs +++ b/tests/ai/golden.test.mjs @@ -164,4 +164,61 @@ test("CSRF: garbage rejected", () => { assert.equal(verifyCsrfToken(null), false); }); +// 5. Translation glossary — protected technical terms must survive masking. +// Re-implements the mask/unmask predicates so the test needs no TS loader; +// keep PROTECTED_TERMS in sync with src/lib/translationGlossary.ts. +const PROTECTED_TERMS = [ + "FLUX", "Inside Flux", "Radio Frequency", "solid-state", "solid state", + "RF", "MHz", "GHz", "kHz", "kWh", "kW", "MW", +]; +const ph = (i) => `__FLUXTERM_${i}__`; +const escapeRe = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); +const ORDERED = PROTECTED_TERMS.map((term, idx) => ({ term, idx })).sort((a, b) => b.term.length - a.term.length); +function maskTerms(text) { + let out = text; + for (const { term, idx } of ORDERED) out = out.replace(new RegExp(`\\b${escapeRe(term)}\\b`, "gi"), ph(idx)); + return out; +} +function unmaskTerms(text) { + let out = text; + for (let i = 0; i < PROTECTED_TERMS.length; i++) out = out.split(ph(i)).join(PROTECTED_TERMS[i]); + out = out.replace(/_{1,2}\s*FLUXTERM\s*_?\s*(\d+)\s*_{1,2}/gi, (_m, n) => PROTECTED_TERMS[Number(n)] ?? ""); + return out; +} + +test("glossary: Radio Frequency is masked then restored verbatim", () => { + const masked = maskTerms("Radio Frequency drying is efficient."); + assert.ok(!masked.includes("Radio Frequency")); + assert.ok(masked.includes("__FLUXTERM_")); + assert.equal(unmaskTerms(masked), "Radio Frequency drying is efficient."); +}); + +test("glossary: simulated translation preserves the English term", () => { + // Mask EN -> 'translate' the surrounding words to IT, keep token -> unmask. 
+ const masked = maskTerms("Our Radio Frequency systems use solid-state technology."); + // Pretend the model translated everything except the tokens: + const fakeItalian = masked + .replace("Our", "I nostri") + .replace("systems use", "sistemi usano") + .replace("technology.", "tecnologia."); + const restored = unmaskTerms(fakeItalian); + assert.ok(restored.includes("Radio Frequency")); + assert.ok(restored.includes("solid-state")); + assert.ok(!restored.includes("__FLUXTERM_")); +}); + +test("glossary: RF whole-word only, not inside other words", () => { + const masked = maskTerms("surf the RF spectrum"); + // "RF" masked, "surf" untouched + assert.ok(masked.includes("surf")); + assert.equal(unmaskTerms(masked), "surf the RF spectrum"); +}); + +test("glossary: tolerant cleanup recovers a mangled placeholder", () => { + // Model inserted stray spaces around the token. + const recovered = unmaskTerms("La __ FLUXTERM_2 __ è efficiente."); + assert.ok(recovered.includes("Radio Frequency")); + assert.ok(!recovered.includes("FLUXTERM")); +}); + console.log("Golden tests file resolved at:", pathToFileURL(resolve(import.meta.url)).href);