This commit is contained in:
Andras Schmelczer 2026-05-14 08:09:19 +01:00
parent a8165249a4
commit a4103b0896
64 changed files with 5376 additions and 3832 deletions

View file

@ -231,6 +231,19 @@ poster_time_for() {
' "$1"
}
# Resolve the FINAL published video dimensions for a storyboard. The
# recording happens at the CSS viewport, but the encode pass upscales to
# `captureScale x viewport` via lanczos so the published mp4 is true
# 1080x1920 on mobile rather than a soft 540x960.
#
# Arguments: $1 - storyboard name, matched against `name` in
#                 output/storyboards.json (path is relative to the cwd)
# Outputs:   "WxH" on stdout, without a trailing newline; a one-line
#            diagnostic on stderr when the size cannot be resolved
# Returns:   0 on success; non-zero when the index cannot be read/parsed,
#            the storyboard is unknown, or its publishedSize is missing
#            or does not carry positive numeric width/height
published_size_for() {
  node -e '
    const name = process.argv[1];
    const idx = JSON.parse(require("fs").readFileSync("output/storyboards.json", "utf8"));
    const sb = idx.storyboards.find(s => s.name === name);
    if (!sb || !sb.publishedSize) {
      console.error(`published_size_for: no publishedSize for storyboard "${name}" in output/storyboards.json`);
      process.exit(1);
    }
    const { width, height } = sb.publishedSize;
    // Reject undefined/null/non-numeric values instead of printing
    // something like "undefinedxundefined" with a success exit code.
    const usable = v => Number.isFinite(Number(v)) && Number(v) > 0;
    if (!usable(width) || !usable(height)) {
      console.error(`published_size_for: invalid publishedSize for storyboard "${name}": ${JSON.stringify(sb.publishedSize)}`);
      process.exit(1);
    }
    process.stdout.write(`${width}x${height}`);
  ' "$1"
}
# -- per-storyboard wipe of leaking artefacts --------------------------------
# output/<sb>/audio/ is preserved; tts/synth.py decides whether the cached
# WAVs still match the script and skips generation when they do. In resume
@ -273,13 +286,36 @@ if [ "$DO_AUDIO" = "1" ]; then
say "Synchronising tts/ Python deps"
uv sync --project tts ${uv_sync_extras[@]+"${uv_sync_extras[@]}"} || fail "uv sync failed in video/tts"
# Voice consistency: every ad in this set declares the same AD_VOICE
# (instruct/seed/temperature/topP/referenceText). Even with seed-locked
# VoiceDesign, independent invocations across processes can produce
# mildly different reference waveforms — different enough that a
# listener notices the timbre shift across ads. To avoid that, we
# mint the reference WAV ONCE (from the first storyboard that yields
# one) and reuse it across the rest of the storyboards by copying
# _reference.wav + _reference.meta.json into their audio dirs before
# their synth runs.
# synth.py's _resolve_reference() reuses a matching cached reference
# as long as the meta block (instruct/language/seed/etc.) matches —
# which it always does, because every ad shares AD_VOICE.
shared_ref_wav=""
shared_ref_meta=""
for sb in "${STORYBOARDS[@]}"; do
  if [ -n "$shared_ref_wav" ] && [ -f "$shared_ref_wav" ] && [ -f "$shared_ref_meta" ]; then
    # Seed this storyboard with the locked reference before synth runs.
    # A failed copy must be fatal: otherwise synth would quietly mint its
    # own reference and the voice lock would be lost without any warning.
    mkdir -p "output/$sb/audio" \
      || fail "could not create output/$sb/audio"
    cp -f "$shared_ref_wav" "output/$sb/audio/_reference.wav" \
      || fail "could not copy shared voice reference WAV into output/$sb/audio"
    cp -f "$shared_ref_meta" "output/$sb/audio/_reference.meta.json" \
      || fail "could not copy shared voice reference meta into output/$sb/audio"
  fi
  say "Synthesising narration for [$sb]"
  uv run --project tts python tts/synth.py --storyboard "$sb" \
    || fail "tts/synth.py failed for $sb"
  if [ ! -s "output/$sb/audio/index.json" ]; then
    fail "synth did not produce output/$sb/audio/index.json"
  fi
  # Lock onto the first storyboard that produced BOTH reference files.
  # The reuse branch above needs the WAV and its meta together, so locking
  # on the WAV alone would announce a lock that is never actually applied.
  if [ -z "$shared_ref_wav" ] \
    && [ -f "output/$sb/audio/_reference.wav" ] \
    && [ -f "output/$sb/audio/_reference.meta.json" ]; then
    shared_ref_wav="output/$sb/audio/_reference.wav"
    shared_ref_meta="output/$sb/audio/_reference.meta.json"
    say "Locked voice reference to $shared_ref_wav — reusing for the rest of the set"
  fi
done
fi
@ -305,7 +341,16 @@ fi
for sb in "${STORYBOARDS[@]}"; do
if [ "$DO_ENCODE" = "1" ]; then
say "[$sb] Encoding to MP4"
# Lanczos upscale the recording to its published dimensions
# (captureScale × viewport). For captureScale=1 the filter is a
# no-op and ffmpeg copies the size through; for captureScale=2
# mobile cuts go 540x960 → 1080x1920 sharply because Chromium
# already rasterised internally at DPR=2.
#
# Resolve and validate the size before invoking ffmpeg: an unresolved
# or malformed value would otherwise be passed on as an empty or
# garbage scale filter (e.g. "scale=::flags=lanczos").
pub_size="$(published_size_for "$sb")" \
  || fail "could not resolve published size for $sb"
pub_w="${pub_size%x*}"
pub_h="${pub_size#*x}"
# When pub_size contains no "x", the suffix strip leaves it untouched,
# so pub_w equal to pub_size means the value is not in WxH form.
if [ -z "$pub_w" ] || [ -z "$pub_h" ] || [ "$pub_w" = "$pub_size" ]; then
  fail "malformed published size '$pub_size' for $sb (expected WxH)"
fi
ffmpeg -y -loglevel warning -i "output/$sb/recording.webm" \
  -vf "scale=${pub_w}:${pub_h}:flags=lanczos" \
  -c:v libx264 -pix_fmt yuv420p -crf 14 -preset fast \
  -movflags +faststart \
  "output/$sb/recording.mp4"