LGTM

2026-05-14 08:09:19 +01:00 · 2026-05-14 08:09:19 +01:00 · a4103b0896
commit a4103b0896
parent a8165249a4
64 changed files with 5376 additions and 3832 deletions
--- a/video/render.sh
+++ b/video/render.sh
@ -231,6 +231,19 @@ poster_time_for() {
  ' "$1"
 }

+# Resolve the FINAL published video dimensions for a storyboard. The
+# recording happens at the CSS viewport, but the encode pass upscales to
+# `captureScale x viewport` via lanczos so the published mp4 is true
+# 1080x1920 on mobile rather than a soft 540x960.
+#
+# Arguments: $1 - storyboard name, matched against the `name` field of
+#                 the entries in output/storyboards.json (path is
+#                 relative to the current working directory).
+# Outputs:   "WxH" on stdout with no trailing newline; on failure a
+#            one-line reason on stderr and nothing on stdout.
+# Returns:   0 on success; non-zero (1 from the script) when the index
+#            cannot be read or parsed, the storyboard is not listed, or
+#            its publishedSize is not a pair of positive numbers.
+published_size_for() {
+  node -e '
+    // Report why the lookup failed instead of exiting silently.
+    const fail = (msg) => {
+      console.error(`published_size_for: ${msg}`);
+      process.exit(1);
+    };
+    const name = process.argv[1];
+    let idx;
+    try {
+      idx = JSON.parse(require("fs").readFileSync("output/storyboards.json","utf8"));
+    } catch (err) {
+      fail(`cannot load output/storyboards.json: ${err.message}`);
+    }
+    const list = idx && Array.isArray(idx.storyboards) ? idx.storyboards : [];
+    const sb = list.find(s => s && s.name === name);
+    if (!sb) { fail(`no storyboard named "${name}" in output/storyboards.json`); }
+    // Reject partial sizes so "undefinedxundefined" never reaches the caller.
+    const size = sb.publishedSize;
+    const valid = v => typeof v === "number" && Number.isFinite(v) && v > 0;
+    if (!size || !valid(size.width) || !valid(size.height)) {
+      fail(`storyboard "${name}" has no usable publishedSize`);
+    }
+    process.stdout.write(`${size.width}x${size.height}`);
+  ' "$1"
+}
+
 # -- per-storyboard wipe of leaking artefacts --------------------------------
 # output/<sb>/audio/ is preserved; tts/synth.py decides whether the cached
 # WAVs still match the script and skips generation when they do. In resume
@ -273,13 +286,36 @@ if [ "$DO_AUDIO" = "1" ]; then
  say "Synchronising tts/ Python deps"
  uv sync --project tts ${uv_sync_extras[@]+"${uv_sync_extras[@]}"} || fail "uv sync failed in video/tts"

+  # Voice consistency: every ad in this set declares the same AD_VOICE
+  # (instruct/seed/temperature/topP/referenceText). Even with seed-locked
+  # VoiceDesign, independent invocations across processes can produce
+  # mildly different reference waveforms — different enough that a
+  # listener notices the timbre shift across ads. To avoid that, we
+  # mint the reference WAV ONCE (from the first storyboard) and reuse
+  # it across the rest of the storyboards by copying _reference.wav +
+  # _reference.meta.json into their audio dirs before their synth runs.
+  # synth.py's _resolve_reference() reuses a matching cached reference
+  # as long as the meta block (instruct/language/seed/etc.) matches —
+  # which it always does, because every ad shares AD_VOICE.
+  # Paths of the reference pair adopted from the first storyboard whose
+  # synth run leaves one behind; both stay empty until that happens.
+  shared_ref_wav=""
+  shared_ref_meta=""
  for sb in "${STORYBOARDS[@]}"; do
-    say "Synthesising narration for [$sb] — one batched call"
+    # Once a reference is locked, seed this storyboard's audio dir with it
+    # before synth runs so synth can pick up the cached pair.
+    if [ -n "$shared_ref_wav" ] && [ -f "$shared_ref_wav" ] && [ -f "$shared_ref_meta" ]; then
+      mkdir -p "output/$sb/audio"
+      cp -f "$shared_ref_wav"  "output/$sb/audio/_reference.wav"
+      cp -f "$shared_ref_meta" "output/$sb/audio/_reference.meta.json"
+    fi
+    say "Synthesising narration for [$sb]"
    uv run --project tts python tts/synth.py --storyboard "$sb" \
      || fail "tts/synth.py failed for $sb"
    if [ ! -s "output/$sb/audio/index.json" ]; then
      fail "synth did not produce output/$sb/audio/index.json"
    fi
+    # Lock only when BOTH halves of the reference exist. Locking on the WAV
+    # alone would record a metadata path that fails the seeding guard above
+    # on every later iteration, so nothing would ever be copied while the
+    # -z test below kept a later, complete pair from taking over.
+    if [ -z "$shared_ref_wav" ] \
+      && [ -f "output/$sb/audio/_reference.wav" ] \
+      && [ -f "output/$sb/audio/_reference.meta.json" ]; then
+      shared_ref_wav="output/$sb/audio/_reference.wav"
+      shared_ref_meta="output/$sb/audio/_reference.meta.json"
+      say "Locked voice reference to $shared_ref_wav — reusing for the rest of the set"
+    fi
  done
 fi

@ -305,7 +341,16 @@ fi
 for sb in "${STORYBOARDS[@]}"; do
  if [ "$DO_ENCODE" = "1" ]; then
    say "[$sb] Encoding to MP4"
+    # Lanczos upscale the recording to its published dimensions
+    # (captureScale × viewport). For captureScale=1 the filter is a
+    # no-op and ffmpeg copies the size through; for captureScale=2
+    # mobile cuts go 540x960 → 1080x1920 sharply because Chromium
+    # already rasterised internally at DPR=2.
+    pub_size="$(published_size_for "$sb")"
+    pub_w="${pub_size%x*}"
+    pub_h="${pub_size#*x}"
    ffmpeg -y -loglevel warning -i "output/$sb/recording.webm" \
+      -vf "scale=${pub_w}:${pub_h}:flags=lanczos" \
      -c:v libx264 -pix_fmt yuv420p -crf 14 -preset fast \
      -movflags +faststart \
      "output/$sb/recording.mp4"