feat(i18n): preserve English technical terms in AI translation
Deploy to VPS / deploy (push) Has been cancelled
Deploy to VPS / deploy (push) Has been cancelled
Fixes terms like "Radio Frequency" and "solid-state" being mistranslated
into each locale. English is the master language, so brand/technical terms
must stay in English everywhere.
- New src/lib/translationGlossary.ts: a curated PROTECTED_TERMS list plus
deterministic mask/unmask helpers. Before translation each term is
replaced with a stable __FLUXTERM_n__ placeholder; after translation the
placeholders are restored to their canonical English form. Preservation
is therefore guaranteed, not left to the model's discretion.
- aiTranslator.ts now masks every field before sending, restores every
field of every locale afterwards, and reinforces the rule in the prompt
(explicit glossary + "keep tokens byte-for-byte"). A tolerant cleanup
regex recovers placeholders even if the model adds stray spaces, so a
mangled token never leaks to the public site.
- Whole-word, case-insensitive matching ("RF" in "surf" is not touched);
longest terms masked first to avoid overlaps; casing normalised to the
canonical brand form on restore.
- 4 new golden tests (17 total) cover round-trip, simulated translation,
whole-word safety, and mangled-token recovery.
To extend: add terms to PROTECTED_TERMS — no other change needed.
Verified: build compiles, TypeScript clean, npm run test:ai 17/17 pass.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -164,4 +164,61 @@ test("CSRF: garbage rejected", () => {
|
||||
assert.equal(verifyCsrfToken(null), false);
|
||||
});
|
||||
|
||||
// 5. Translation glossary — protected technical terms must survive masking.
// The mask/unmask helpers are re-implemented here in plain JS so the test
// needs no TS loader; keep PROTECTED_TERMS in sync with
// src/lib/translationGlossary.ts. Placeholder numbers are indexes into this
// array, so the ORDER of the entries matters as well as their content.
const PROTECTED_TERMS = [
  "FLUX", "Inside Flux", "Radio Frequency", "solid-state", "solid state",
  "RF", "MHz", "GHz", "kHz", "kWh", "kW", "MW",
];

/**
 * Builds the placeholder token that stands in for PROTECTED_TERMS[i].
 * @param {number} i - Index into PROTECTED_TERMS.
 * @returns {string} Token of the form `__FLUXTERM_<i>__`.
 */
const ph = (i) => `__FLUXTERM_${i}__`;

/**
 * Escapes regex metacharacters so a term can be embedded literally in a RegExp.
 * @param {string} s - Raw term.
 * @returns {string} The term with every metacharacter backslash-escaped.
 */
const escapeRe = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

// Terms sorted longest-first (each keeps its original index) so a longer term
// such as "Inside Flux" is masked before the shorter "FLUX" it contains.
const ORDERED = PROTECTED_TERMS
  .map((term, idx) => ({ term, idx }))
  .sort((a, b) => b.term.length - a.term.length);

/**
 * Replaces every whole-word, case-insensitive occurrence of a protected term
 * with its placeholder token.
 * @param {string} text - Source text.
 * @returns {string} Text with protected terms swapped for placeholders.
 */
function maskTerms(text) {
  let out = text;
  for (const { term, idx } of ORDERED) {
    out = out.replace(new RegExp(`\\b${escapeRe(term)}\\b`, "gi"), ph(idx));
  }
  return out;
}

/**
 * Restores placeholder tokens to the canonical spelling held in
 * PROTECTED_TERMS, including tokens that came back slightly mangled.
 * @param {string} text - Text containing placeholder tokens.
 * @returns {string} Text with every recognised token replaced by its term.
 */
function unmaskTerms(text) {
  let out = text;
  // Exact tokens first. split/join is used instead of a string replacement so
  // a term containing "$" can never be read as a replacement pattern.
  for (let i = 0; i < PROTECTED_TERMS.length; i++) {
    out = out.split(ph(i)).join(PROTECTED_TERMS[i]);
  }
  // Tolerant pass: accepts one or two underscores on each side, stray
  // whitespace and any letter case. A token whose number has no matching
  // term is replaced with the empty string rather than left in the output.
  out = out.replace(
    /_{1,2}\s*FLUXTERM\s*_?\s*(\d+)\s*_{1,2}/gi,
    (_m, n) => PROTECTED_TERMS[Number(n)] ?? "",
  );
  return out;
}
|
||||
|
||||
// Round trip: masking must hide the term behind a placeholder, and unmasking
// must bring back the original sentence unchanged.
test("glossary: Radio Frequency is masked then restored verbatim", () => {
  const masked = maskTerms("Radio Frequency drying is efficient.");
  assert.ok(!masked.includes("Radio Frequency"));
  assert.ok(masked.includes("__FLUXTERM_"));
  assert.equal(unmaskTerms(masked), "Radio Frequency drying is efficient.");
});
|
||||
|
||||
// Simulates the full pipeline: mask the English text, "translate" only the
// surrounding words to Italian while leaving the tokens alone, then unmask.
test("glossary: simulated translation preserves the English term", () => {
  const masked = maskTerms("Our Radio Frequency systems use solid-state technology.");
  // Pretend the model translated everything except the tokens:
  const fakeItalian = masked
    .replace("Our", "I nostri")
    .replace("systems use", "sistemi usano")
    .replace("technology.", "tecnologia.");
  const restored = unmaskTerms(fakeItalian);
  assert.ok(restored.includes("Radio Frequency"));
  assert.ok(restored.includes("solid-state"));
  assert.ok(!restored.includes("__FLUXTERM_"));
});
|
||||
|
||||
// "RF" must be matched only as a whole word: the standalone "RF" is masked
// while the "rf" inside "surf" is left alone.
test("glossary: RF whole-word only, not inside other words", () => {
  const masked = maskTerms("surf the RF spectrum");
  // "surf" is untouched.
  assert.ok(masked.includes("surf"));
  // The standalone "RF" really was replaced by a placeholder; without these
  // two checks the round-trip below would also pass if nothing were masked.
  assert.ok(masked.includes("__FLUXTERM_"));
  assert.ok(!/\bRF\b/.test(masked));
  assert.equal(unmaskTerms(masked), "surf the RF spectrum");
});
|
||||
|
||||
// The model may insert stray spaces around a token; unmaskTerms must still
// recognise it and restore the term.
test("glossary: tolerant cleanup recovers a mangled placeholder", () => {
  // Look the index up instead of hard-coding it, so adding or reordering
  // entries in PROTECTED_TERMS does not silently retarget this test.
  const idx = PROTECTED_TERMS.indexOf("Radio Frequency");
  assert.ok(idx >= 0);
  const recovered = unmaskTerms(`La __ FLUXTERM_${idx} __ è efficiente.`);
  // Exact match: the term is restored and no fragment of the token remains.
  assert.equal(recovered, "La Radio Frequency è efficiente.");
});
|
||||
|
||||
// import.meta.url is already this module's absolute file:// URL. Passing it
// through a filesystem path resolver would treat the URL string as a relative
// path and prefix the working directory, so it is logged as-is.
console.log("Golden tests file resolved at:", import.meta.url);
|
||||
|
||||
Reference in New Issue
Block a user