lgtm

2026-05-12 22:00:56 +01:00 · 2026-05-12 22:00:56 +01:00 · 11711c57e6
commit 11711c57e6
parent 8708bf000d
38 changed files with 5361 additions and 265 deletions
--- a/video/render.sh
+++ b/video/render.sh
@ -2,10 +2,10 @@
 #
 # End-to-end re-render of the dashboard demo videos.
 #
-# All per-storyboard knobs (aspect, fps, bitrate, prompt text, voice persona,
-# poster timestamp, brand strings…) live on the Storyboard objects in
-# src/storyboard.ts. To add a vertical cut or change the voice, edit that
-# file — this script only handles target/auth/transport concerns.
+# All per-storyboard knobs (aspect, fps, bitrate, prompt text, localized
+# narration, voice persona, poster timestamp, brand strings…) live in
+# src/storyboard.ts. A single visual storyboard can expand into multiple
+# language variants there; this script renders every emitted slug.
 #
 # Two targets:
 #   local (default) — assumes the docker-compose stack on host.docker.internal,
--- a/video/src/browser.ts
+++ b/video/src/browser.ts
@ -50,6 +50,9 @@ export async function launchRecordingBrowser(
    deviceScaleFactor: storyboard.video.captureScale,
    recordVideo: { dir: opts.recordDir, size: viewport },
  });
+  await context.addInitScript((appLanguage) => {
+    if (appLanguage) localStorage.setItem('language', appLanguage);
+  }, storyboard.content.appLanguage ?? 'en');
  await suppressDevServerNoise(context);
  return { browser, context };
 }
--- a/video/src/dom.ts
+++ b/video/src/dom.ts
@ -307,12 +307,19 @@ export async function showOutro(
      document.getElementById('__demo-caption')?.classList.remove('visible');
      const el = document.createElement('div');
      el.id = '__demo-outro';
-      el.innerHTML = `
-        <div id="__demo-outro-card">
-          <div id="__demo-outro-brand">${brand}</div>
-          <div id="__demo-outro-tagline">${tagline}</div>
-          <div id="__demo-outro-url">${url}</div>
-        </div>`;
+      const card = document.createElement('div');
+      card.id = '__demo-outro-card';
+      const brandEl = document.createElement('div');
+      brandEl.id = '__demo-outro-brand';
+      brandEl.textContent = brand;
+      const taglineEl = document.createElement('div');
+      taglineEl.id = '__demo-outro-tagline';
+      taglineEl.textContent = tagline;
+      const urlEl = document.createElement('div');
+      urlEl.id = '__demo-outro-url';
+      urlEl.textContent = url;
+      card.append(brandEl, taglineEl, urlEl);
+      el.appendChild(card);
      document.body.appendChild(el);
      requestAnimationFrame(() => {
        requestAnimationFrame(() => el.classList.add('visible'));
--- a/video/src/preflight.ts
+++ b/video/src/preflight.ts
@ -46,6 +46,7 @@ function emitScript(storyboard: Storyboard): string {
    voice: {
      instruct: storyboard.voice.instruct,
      language: storyboard.voice.language,
+      referenceText: storyboard.voice.referenceText,
      temperature: storyboard.voice.temperature ?? 0.6,
      topP: storyboard.voice.topP ?? 0.9,
      seed: storyboard.voice.seed ?? 42,
@ -68,6 +69,7 @@ function main(): void {
  const index = {
    storyboards: storyboards.map((sb) => ({
      name: sb.name,
+      locale: sb.locale ?? sb.content.appLanguage,
      aspect: sb.video.aspect,
      outputFps: sb.video.outputFps,
      minDurationS: sb.video.minDurationS,
--- a/video/src/runner.ts
+++ b/video/src/runner.ts
@ -25,6 +25,7 @@ export interface RunnerResult {
 const MAP_ZOOM_WHEEL_DELTA = -120;
 const FALLBACK_MS_PER_WORD = 750;
 const FALLBACK_TAIL_BUFFER_MS = 800;
+const CJK_CHARS_PER_FALLBACK_WORD = 2;

 interface SynthCue {
  cueIndex: number;
@ -266,10 +267,15 @@ function loadSynthIndex(storyboard: Storyboard): SynthCue[] {
  return storyboard.cues.map((cue, cueIndex) => ({
    cueIndex,
    text: cue.text,
-    durationMs:
-      cue.text.split(/\s+/).filter(Boolean).length * FALLBACK_MS_PER_WORD +
-      FALLBACK_TAIL_BUFFER_MS,
+    durationMs: estimateFallbackDurationMs(cue.text),
  }));
 }

+/**
+ * Estimates, in milliseconds, how long a cue takes to narrate from its text
+ * alone. loadSynthIndex uses this as the per-cue fallback duration.
+ *
+ * Han characters and whitespace-separated words are budgeted separately and
+ * then added together. Han text is written without spaces, so it is counted
+ * per character (CJK_CHARS_PER_FALLBACK_WORD characters per unit); the rest
+ * of the text is counted per word. Adding the two, rather than taking the
+ * larger one, keeps mixed-script cues (for example a Latin brand name
+ * followed by a Chinese tagline) from being under-timed.
+ *
+ * @param text Narration text of a single cue.
+ * @returns The unit count (never less than one) times FALLBACK_MS_PER_WORD,
+ *   plus FALLBACK_TAIL_BUFFER_MS.
+ */
+function estimateFallbackDurationMs(text: string): number {
+  const cjkCount = text.match(/\p{Script=Han}/gu)?.length ?? 0;
+  // Blank out Han characters and CJK / full-width punctuation before counting
+  // words, so neither a run of ideographs nor a stray "。" adds a word.
+  const wordCount = text
+    .replace(/[\p{Script=Han}\u3000-\u303F\uFF00-\uFFEF]/gu, ' ')
+    .split(/\s+/)
+    .filter(Boolean).length;
+  const units = Math.max(wordCount + Math.ceil(cjkCount / CJK_CHARS_PER_FALLBACK_WORD), 1);
+  return units * FALLBACK_MS_PER_WORD + FALLBACK_TAIL_BUFFER_MS;
+}
+
 export type { Page };
--- a/video/src/script.ts
+++ b/video/src/script.ts
@ -124,6 +124,8 @@ export interface VoiceConfig {
  instruct: string;
  /** Qwen3-TTS language string, e.g. "English". */
  language: string;
+  /** Reference utterance used when minting a generated voice for this language. */
+  referenceText?: string;
  /** Sampling temperature (default 0.6). */
  temperature?: number;
  /** Top-p nucleus sampling (default 0.9). */
@ -147,6 +149,8 @@ export interface BrandConfig {
 export interface ContentConfig {
  /** Prompt text typed into the AI box during the cold open. */
  promptText: string;
+  /** Frontend i18n language code to set before loading the dashboard. */
+  appLanguage?: string;
  /** Cold-open zoom multiplier on the AI card. */
  aiZoomScale: number;
  initialMapView: { lat: number; lon: number; zoom: number };
@ -177,6 +181,8 @@ export interface TravelTimeFilter {
 */
 export interface Storyboard {
  name: string;
+  /** Optional language/variant code, used for manifests and logging. */
+  locale?: string;
  video: VideoConfig;
  voice: VoiceConfig;
  content: ContentConfig;
--- a/video/src/storyboard.ts
+++ b/video/src/storyboard.ts
@ -6,8 +6,9 @@ import { el, type Storyboard } from './script.js';
 * Each entry is a fully self-contained Storyboard: video knobs (aspect,
 * bitrate, fps), voice persona (Qwen3-TTS instruct + language + sampling),
 * stubbed AI response, brand strings, AND the cue list. There is no shared
- * global state — to ship a vertical cut, a different prompt, or a different
- * voice, push another item onto this array.
+ * global state. The exported array can contain generated variants, so a
+ * shared visual storyboard can render once per language without repeating
+ * its activity sequence.
 *
 * `name` doubles as the on-disk slug. The pipeline writes per-storyboard
 * artefacts to `output/<name>/` and publishes `<name>.mp4` / `<name>.jpg`
@ -23,15 +24,6 @@ import { el, type Storyboard } from './script.js';
 * before the next cue's gap).
 */

-const PROMPT_TEXT = 'Flats <£300k, 35 min to commute Manchester close to an outstanding school in a quite low crime area';
-
-const BRAND = {
-  name: 'Perfect Postcode',
-  tagline: 'Your best chance to find your next perfect home.',
-  url: 'https://perfect-postcode.co.uk',
-};
-
-
 const AI_ZOOM_SCALE = 2.4;

 const TT_CARD_SELECTOR = '[data-filter-name="tt_0"]';
@ -39,103 +31,238 @@ const TT_SLIDER_MAX = 120;
 const TT_DRAG_FROM_MIN = 35;
 const TT_DRAG_TO_MIN = 20;

-const BRITISH_MALE_NARRATOR =
-  'Calm and cheerful young British male narrator from the North of England with a ' +
-  'strong Manchester accent.';
+type RecordingLocale = 'en' | 'de' | 'zh' | 'hi'; // Keys of RECORDING_LOCALIZATIONS; one storyboard is generated per code.

-const DEFAULT_CUES: Storyboard['cues'] = [
-  {
-    text: 'Start by describing the type of place you\'re looking for',
-    gapBeforeMs: 0,
-    tail: [
-      {
-        kind: 'type',
-        selector: '[data-tutorial="ai-filters"] textarea',
-        text: PROMPT_TEXT,
-        durationMs: 3000,
-      },
-      { kind: 'submitForm', formSelector: '[data-tutorial="ai-filters"] form', durationMs: 1700 },
-    ],
-  },
-  {
-    text: 'The dashboard will show you the likeliest places that will meet your expectations',
-    gapBeforeMs: 400,
-    during: [{ kind: 'zoomReset', durationMs: 1400 }],
-    tail: [{ kind: 'wait', durationMs: 500 }],
-  },
+interface RecordingLocalization { // Locale-specific copy for one language variant of the recording storyboard.
+  name: string; // Becomes the storyboard's `name` (e.g. 'recording-de').
+  appLanguage: string; // Passed through as content.appLanguage.
+  ttsLanguage: string; // Passed through as voice.language.
+  voiceInstruct: string; // Passed through as voice.instruct.
+  voiceReferenceText: string; // Passed through as voice.referenceText.
+  promptText: string; // Typed into the AI prompt textarea and stored as content.promptText.
+  travelTimeLabel: string; // Label of the stubbed travel-time filter.
+  exportButtonTitle: string; // Title attribute used to locate the export button.
+  brand: { // Outro card strings; also stored as content.brand.
+    name: string;
+    tagline: string;
+    url: string;
+  };
+  cues: { // Narration text for the cues that precede the outro, in playback order.
+    describe: string;
+    dashboard: string;
+    filters: string;
+    details: string;
+    shortlist: string;
+  };
+}

-  {
-    text: `Adjust the filters to narrow down to the best candidates`,
-    gapBeforeMs: 500,
-    during: [
-      {
-        kind: 'dragSlider',
-        thumbSelector: `${TT_CARD_SELECTOR} [role="slider"] >> nth=1`,
-        trackSelector: `${TT_CARD_SELECTOR} [data-orientation="horizontal"] >> nth=0`,
-        toFraction: TT_DRAG_TO_MIN / TT_SLIDER_MAX,
-        durationMs: 1000,
-      },
-    ],
-    tail: [{ kind: 'wait', durationMs: 400 }],
-  },
+const BRAND_URL = 'https://perfect-postcode.co.uk'; // Shared by the brand block of every locale.

-  {
-    text: 'And now it\'s time to dig into the details. Looks good to me!',
-    gapBeforeMs: 500,
-    during: [
-      { kind: 'cursorScale', scale: 1.4, durationMs: 200 },
-      {
-        kind: 'mapZoom',
-        target: { kind: 'point', x: 1140, y: 605 },
-        steps: 18,
-        durationMs: 1500,
-      },
-    ],
-    tail: [
-      // Wait for the post-zoom /api/postcodes response and a redraw
-      // before the click — otherwise the click can fire on a stale
-      // frame and miss the polygon.
-      { kind: 'wait', durationMs: 500 },
-      {
-        kind: 'click',
-        target: { kind: 'point', x: 1140, y: 605 },
-        durationMs: 700,
-      },
-      { kind: 'cursorScale', scale: 1, durationMs: 280 },
-      // Linger so the climax cue lands on the right-pane reveal.
-      { kind: 'wait', durationMs: 1500 },
-    ],
+const RECORDING_LOCALIZATIONS: Record<RecordingLocale, RecordingLocalization> = { // Per-locale strings read by createCues and createRecordingStoryboard.
+  en: {
+    name: 'recording',
+    appLanguage: 'en',
+    ttsLanguage: 'English',
+    voiceInstruct:
+      'Calm and cheerful young British male narrator from the North of England with a ' +
+      'strong Manchester accent.',
+    voiceReferenceText:
+      "Welcome to the demonstration. This is the narrator voice you'll hear throughout the video.",
+    promptText:
+      'Flats <£300k, 35 min to commute Manchester close to an outstanding school in a quite low crime area',
+    travelTimeLabel: 'Manchester city centre',
+    exportButtonTitle: 'Export to Excel',
+    brand: {
+      name: 'Perfect Postcode',
+      tagline: 'Your best chance to find your next perfect home.',
+      url: BRAND_URL,
+    },
+    cues: {
+      describe: "Start by describing the type of place you're looking for",
+      dashboard: 'The dashboard will show you the likeliest places that will meet your expectations',
+      filters: 'Adjust the filters to narrow down to the best candidates',
+      details: "And now it's time to dig into the details. Looks good to me!",
+      shortlist:
+        'Now you can take your shortlist and start looking for your next home in your perfect postcode.',
+    },
   },
+  de: {
+    name: 'recording-de',
+    appLanguage: 'de',
+    ttsLanguage: 'German',
+    voiceInstruct:
+      'Calm and cheerful German male narrator with clear standard German pronunciation ' +
+      'and a friendly, practical delivery.',
+    voiceReferenceText:
+      'Willkommen zur Demonstration. Diese Sprecherstimme hören Sie im gesamten Video.',
+    promptText:
+      'Wohnungen unter £300k, 35 Min. Pendelzeit nach Manchester, nahe einer herausragenden Schule in einer sehr kriminalitätsarmen Gegend',
+    travelTimeLabel: 'Stadtzentrum Manchester',
+    exportButtonTitle: 'Als Excel exportieren',
+    brand: {
+      name: 'Perfect Postcode',
+      tagline: 'Ihre beste Chance, Ihr nächstes perfektes Zuhause zu finden.',
+      url: BRAND_URL,
+    },
+    cues: {
+      describe: 'Beschreiben Sie zuerst, wonach Sie suchen.',
+      dashboard: 'Das Dashboard zeigt die Orte, die Ihre Erwartungen am ehesten erfüllen.',
+      filters: 'Passen Sie die Filter an, um die besten Kandidaten einzugrenzen.',
+      details: 'Jetzt geht es in die Details. Sieht gut aus!',
+      shortlist:
+        'Jetzt können Sie Ihre Auswahl nehmen und Ihr nächstes Zuhause in Ihrem perfekten Postcode suchen.',
+    },
+  },
+  zh: {
+    name: 'recording-zh',
+    appLanguage: 'zh',
+    ttsLanguage: 'Chinese',
+    voiceInstruct:
+      'Calm and cheerful Mandarin Chinese male narrator with clear standard Mandarin ' +
+      'pronunciation and a friendly, practical delivery.',
+    voiceReferenceText: '欢迎观看演示。整段视频都会使用这位旁白的声音。',
+    promptText: '30万英镑以内的公寓，35分钟通勤到曼彻斯特，靠近优秀学校，犯罪率很低的区域',
+    travelTimeLabel: '曼彻斯特市中心',
+    exportButtonTitle: '导出为 Excel',
+    brand: {
+      name: 'Perfect Postcode',
+      tagline: '帮你更有把握找到下一个理想家。',
+      url: BRAND_URL,
+    },
+    cues: {
+      describe: '先描述你想找什么样的地方',
+      dashboard: '仪表板会显示最符合你期望的地点',
+      filters: '调整筛选条件，缩小到最合适的候选区域',
+      details: '现在深入查看细节。看起来不错！',
+      shortlist: '现在你可以带着候选清单，开始寻找理想邮编里的下一个家。',
+    },
+  },
+  hi: { // NOTE(review): Hindi UI (appLanguage 'hi') paired with English narration, prompt and tagline — presumably deliberate; confirm.
+    name: 'recording-hi',
+    appLanguage: 'hi',
+    ttsLanguage: 'English',
+    voiceInstruct:
+      'Calm and cheerful Indian male narrator speaking English with a strong Indian accent ' +
+      'and a friendly, practical delivery.',
+    voiceReferenceText:
+      "Welcome to the demonstration. This is the narrator voice you'll hear throughout the video.",
+    promptText:
+      'Flats <£300k, 35 min to commute Manchester close to an outstanding school in a quite low crime area',
+    travelTimeLabel: 'Manchester city centre',
+    exportButtonTitle: 'Excel में निर्यात करें',
+    brand: {
+      name: 'Perfect Postcode',
+      tagline: 'Your best chance to find your next perfect home.',
+      url: BRAND_URL,
+    },
+    cues: {
+      describe: "Start by describing the type of place you're looking for",
+      dashboard: 'The dashboard will show you the likeliest places that will meet your expectations',
+      filters: 'Adjust the filters to narrow down to the best candidates',
+      details: "And now it's time to dig into the details. Looks good to me!",
+      shortlist:
+        'Now you can take your shortlist and start looking for your next home in your perfect postcode.',
+    },
+  },
+};

-  {
-    text: 'Now you can take your shortlist and start looking for your next home in your perfect postcode.',
-    gapBeforeMs: 500,
-    during: [
-      { kind: 'zoomReset', durationMs: 900 },
-      {
-        kind: 'click',
-        target: el('button[title="Export to Excel"]'),
-        durationMs: 800,
-      },
-    ],
-    tail: [{ kind: 'wait', durationMs: 800 }],
-  },
+function createCues(locale: RecordingLocale): Storyboard['cues'] { // Builds the cue list for one locale; narration text and labels come from RECORDING_LOCALIZATIONS.
+  const copy = RECORDING_LOCALIZATIONS[locale];

-  {
-    text: `${BRAND.name}. ${BRAND.tagline}`,
-    gapBeforeMs: 600,
-    during: [
-      {
-        kind: 'showOutro',
-        brand: BRAND.name,
-        tagline: BRAND.tagline,
-        url: BRAND.url,
-        durationMs: 0,
-      },
-    ],
-    tail: [{ kind: 'wait', durationMs: 1500 }],
-  },
-];
+  return [
+    {
+      text: copy.cues.describe,
+      gapBeforeMs: 0,
+      tail: [
+        {
+          kind: 'type',
+          selector: '[data-tutorial="ai-filters"] textarea',
+          text: copy.promptText, // localized prompt typed into the AI filters textarea
+          durationMs: 3000,
+        },
+        { kind: 'submitForm', formSelector: '[data-tutorial="ai-filters"] form', durationMs: 1700 },
+      ],
+    },
+    {
+      text: copy.cues.dashboard,
+      gapBeforeMs: 400,
+      during: [{ kind: 'zoomReset', durationMs: 1400 }],
+      tail: [{ kind: 'wait', durationMs: 500 }],
+    },
+
+    {
+      text: copy.cues.filters,
+      gapBeforeMs: 500,
+      during: [
+        {
+          kind: 'dragSlider',
+          thumbSelector: `${TT_CARD_SELECTOR} [role="slider"] >> nth=1`,
+          trackSelector: `${TT_CARD_SELECTOR} [data-orientation="horizontal"] >> nth=0`,
+          toFraction: TT_DRAG_TO_MIN / TT_SLIDER_MAX,
+          durationMs: 1000,
+        },
+      ],
+      tail: [{ kind: 'wait', durationMs: 400 }],
+    },
+
+    {
+      text: copy.cues.details,
+      gapBeforeMs: 500,
+      during: [
+        { kind: 'cursorScale', scale: 1.4, durationMs: 200 },
+        {
+          kind: 'mapZoom',
+          target: { kind: 'point', x: 1140, y: 605 },
+          steps: 18,
+          durationMs: 1500,
+        },
+      ],
+      tail: [
+        // Wait for the post-zoom /api/postcodes response and a redraw
+        // before the click — otherwise the click can fire on a stale
+        // frame and miss the polygon.
+        { kind: 'wait', durationMs: 500 },
+        {
+          kind: 'click',
+          target: { kind: 'point', x: 1140, y: 605 },
+          durationMs: 700,
+        },
+        { kind: 'cursorScale', scale: 1, durationMs: 280 },
+        // Linger so the climax cue lands on the right-pane reveal.
+        { kind: 'wait', durationMs: 1500 },
+      ],
+    },
+
+    {
+      text: copy.cues.shortlist,
+      gapBeforeMs: 500,
+      during: [
+        { kind: 'zoomReset', durationMs: 900 },
+        {
+          kind: 'click',
+          target: el(`button[title="${copy.exportButtonTitle}"]`), // assumes the frontend renders exactly this title for copy.appLanguage — TODO confirm per locale
+          durationMs: 800,
+        },
+      ],
+      tail: [{ kind: 'wait', durationMs: 800 }],
+    },
+
+    {
+      text: `${copy.brand.name}. ${copy.brand.tagline}`, // outro narration: brand name followed by the tagline
+      gapBeforeMs: 600,
+      during: [
+        {
+          kind: 'showOutro',
+          brand: copy.brand.name,
+          tagline: copy.brand.tagline,
+          url: copy.brand.url,
+          durationMs: 0,
+        },
+      ],
+      tail: [{ kind: 'wait', durationMs: 1500 }],
+    },
+  ];
+}

 const DEFAULT_PRE: Storyboard['pre'] = [
  { kind: 'clearVignette', durationMs: 0 },
@ -149,9 +276,12 @@ const DEFAULT_PRE: Storyboard['pre'] = [
  { kind: 'wait', durationMs: 140 },
 ];

-export const storyboards: Storyboard[] = [
-  {
-    name: 'recording',
+function createRecordingStoryboard(locale: RecordingLocale): Storyboard {
+  const copy = RECORDING_LOCALIZATIONS[locale];
+
+  return {
+    name: copy.name,
+    locale,
    video: {
      aspect: '16x9',
      captureScale: 1,
@ -168,23 +298,25 @@ export const storyboards: Storyboard[] = [
      posterTimeS: 16,
    },
    voice: {
-      instruct: BRITISH_MALE_NARRATOR,
-      language: 'English',
+      instruct: copy.voiceInstruct,
+      language: copy.ttsLanguage,
+      referenceText: copy.voiceReferenceText,
      temperature: 0.6,
      topP: 0.9,
      seed: 42,
    },
    content: {
-      promptText: PROMPT_TEXT,
+      promptText: copy.promptText,
+      appLanguage: copy.appLanguage,
      aiZoomScale: AI_ZOOM_SCALE,
      initialMapView: { lat: 53.4795, lon: -2.2451, zoom: 11.5 },
      // Filters returned by the AI stub. Keys MUST match real feature names
      // from /api/features (verified against the running server's schema).
      stubbedFilters: {
-        'Property type': ['Flats/Maisonettes', 'Terraced'],
-        'Estimated current price': [175000, 450000],
+        'Property type': ['Flats/Maisonettes'],
+        'Estimated current price': [0, 300000],
        'Serious crime per 1k residents (avg/yr)': [0, 55],
-        'Noise (dB)': [50, 68],
+        'Outstanding primary schools within 2km': [1, 10],
      },
      // Travel-time filters returned by the AI stub. Slug matches the real
      // /api/travel-destinations?mode=transit response.
@ -192,7 +324,7 @@ export const storyboards: Storyboard[] = [
        {
          mode: 'transit',
          slug: 'manchester',
-          label: 'Manchester city centre',
+          label: copy.travelTimeLabel,
          max: TT_DRAG_FROM_MIN,
        },
      ],
@ -200,12 +332,16 @@ export const storyboards: Storyboard[] = [
      travelTimeSliderMax: TT_SLIDER_MAX,
      travelTimeDragFromMin: TT_DRAG_FROM_MIN,
      travelTimeDragToMin: TT_DRAG_TO_MIN,
-      brand: BRAND,
+      brand: copy.brand,
    },
    pre: DEFAULT_PRE,
-    cues: DEFAULT_CUES,
-  },
-];
+    cues: createCues(locale),
+  };
+}
+
+// Locales to render, in output order; each one yields its own storyboard.
+const RECORDING_LOCALES: readonly RecordingLocale[] = ['en', 'de', 'zh', 'hi'];
+
+export const storyboards: Storyboard[] = RECORDING_LOCALES.map((recordingLocale) =>
+  createRecordingStoryboard(recordingLocale)
+);

 export function getStoryboard(name: string): Storyboard {
  const sb = storyboards.find((s) => s.name === name);
--- a/video/tts/synth.py
+++ b/video/tts/synth.py
@ -116,6 +116,10 @@ def cached_index_matches(
    cues: list[dict],
    instruct: str,
    language: str,
+    reference_text: str,
+    design_model: str,
+    clone_model: str,
+    reference_audio: str,
    seed: int,
    temperature: float,
    top_p: float,
@ -123,7 +127,8 @@ def cached_index_matches(
    """Return True iff index_path's cue list lines up with `cues` 1:1.

    Compared fields: ``cueIndex``, ``text``, ``gapBeforeMs`` plus the synth
-    settings (``instruct``, ``language``, ``seed``, ``temperature``, ``top_p``).
+    settings (``instruct``, ``language``, reference text, models, ``seed``,
+    ``temperature``, ``top_p``).
    All cue WAV files must also exist on disk. Mismatched length, reordered
    cues, or a missing WAV invalidate the cache.
    """
@ -135,6 +140,12 @@ def cached_index_matches(
        return False
    if cached.get("instruct") != instruct or cached.get("language") != language:
        return False
+    if cached.get("referenceText") != reference_text:
+        return False
+    if cached.get("designModel") != design_model or cached.get("cloneModel") != clone_model:
+        return False
+    if cached.get("referenceAudio", "") != reference_audio:
+        return False
    if int(cached.get("seed", -1)) != seed:
        return False
    if float(cached.get("temperature", -1)) != temperature:
@ -170,6 +181,7 @@ def _resolve_reference(
    audio_dir: Path,
    instruct: str,
    language: str,
+    reference_text: str,
    seed: int,
    temperature: float,
    top_p: float,
@ -178,8 +190,8 @@ def _resolve_reference(

    If --reference-audio is supplied, validate and use it directly. Otherwise
    mint one via VoiceDesign (cached on disk; cache invalidates when the
-    persona/sampling/seed changes). The design model is unloaded before
-    returning so the clone model can claim the GPU.
+    persona/language/reference/sampling/seed changes). The design model is
+    unloaded before returning so the clone model can claim the GPU.
    """
    if args.reference_audio is not None:
        if not args.reference_audio.exists():
@ -201,7 +213,7 @@ def _resolve_reference(
        "seed": seed,
        "temperature": temperature,
        "topP": top_p,
-        "text": REFERENCE_TEXT,
+        "text": reference_text,
    }
    if (
        ref_wav_path.exists()
@ -209,16 +221,16 @@ def _resolve_reference(
        and _safe_load_json(ref_meta_path) == ref_meta
    ):
        print(f"[synth] reusing cached voice reference {ref_wav_path.name}", flush=True)
-        return ref_wav_path, REFERENCE_TEXT
+        return ref_wav_path, reference_text

    print(
-        f"[synth] minting voice reference via VoiceDesign: «{REFERENCE_TEXT}»",
+        f"[synth] minting voice reference via VoiceDesign: «{reference_text}»",
        flush=True,
    )
    design_model = load_model(args.design_model, args.device)
    seed_everything(seed)
    ref_wavs, ref_sr = design_model.generate_voice_design(
-        text=[REFERENCE_TEXT],
+        text=[reference_text],
        language=language,
        instruct=instruct,
        do_sample=True,
@ -237,7 +249,7 @@ def _resolve_reference(
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

-    return ref_wav_path, REFERENCE_TEXT
+    return ref_wav_path, reference_text


 def main() -> int:
@ -266,21 +278,30 @@ def main() -> int:
        return 1
    instruct = voice["instruct"]
    language = voice["language"]
+    reference_text = str(voice.get("referenceText") or REFERENCE_TEXT)
    temperature = float(voice.get("temperature", 0.6))
    top_p = float(voice.get("topP", 0.9))
    seed = int(voice.get("seed", 42))
+    reference_audio_cache_key = (
+        str(args.reference_audio.resolve()) if args.reference_audio is not None else ""
+    )

    audio_dir.mkdir(parents=True, exist_ok=True)

    # Skip generation when the existing audio matches the script — same cue
    # texts and same gapBeforeMs values in the same order, AND same synth
-    # settings (instruct/seed/temperature/top_p). Saves ~30s of GPU time when
-    # iterating on activity timing without changing narration or persona.
+    # settings (instruct/language/reference/model/seed/temperature/top_p).
+    # Saves ~30s of GPU time when iterating on activity timing without
+    # changing narration or persona.
    if cached_index_matches(
        audio_dir / "index.json",
        cues,
        instruct,
        language,
+        reference_text,
+        args.design_model,
+        args.clone_model,
+        reference_audio_cache_key,
        seed,
        temperature,
        top_p,
@ -308,7 +329,7 @@ def main() -> int:
    # own voice. The reference WAV is cached so subsequent runs only load
    # the clone model (saves ~20s + 3.4 GB of disk download).
    ref_wav_path, ref_text = _resolve_reference(
-        args, audio_dir, instruct, language, seed, temperature, top_p
+        args, audio_dir, instruct, language, reference_text, seed, temperature, top_p
    )

    print(
@ -367,6 +388,7 @@ def main() -> int:
        "language": language,
        "designModel": args.design_model,
        "cloneModel": args.clone_model,
+        "referenceAudio": reference_audio_cache_key,
        "referenceText": ref_text,
        "seed": seed,
        "temperature": temperature,