This commit is contained in:
Andras Schmelczer 2026-05-11 21:38:26 +01:00
parent 9248e26af2
commit f2a2651b8a
95 changed files with 3993 additions and 1471 deletions

View file

@ -9,8 +9,6 @@
"bootstrap-admin": "tsc && node dist/pb-admin.js",
"setup-auth": "tsc && node dist/auth.js",
"record": "tsc && node dist/record.js",
"record:vertical": "tsc && ASPECT=9x16 node dist/record.js",
"encode": "ffmpeg -y -i output/recording.webm -c:v libx264 -pix_fmt yuv420p -crf 14 -preset fast -movflags +faststart output/recording.mp4",
"verify-output": "tsc && node dist/verify.js",
"render": "./render.sh"
},

View file

@ -1,6 +1,11 @@
#!/usr/bin/env bash
#
# End-to-end re-render of the dashboard demo video.
# End-to-end re-render of the dashboard demo videos.
#
# All per-storyboard knobs (aspect, fps, bitrate, prompt text, voice persona,
# poster timestamp, brand strings…) live on the Storyboard objects in
# src/storyboard.ts. To add a vertical cut or change the voice, edit that
# file — this script only handles target/auth/transport concerns.
#
# Two targets:
# local (default) — assumes the docker-compose stack on host.docker.internal,
@ -17,7 +22,6 @@
# ./render.sh --no-audio # skip Qwen3-TTS narration; silent MP4
# FORCE_AUTH=1 ./render.sh # same as --fresh-auth
# APP_URL=http://localhost:3001 ./render.sh # override frontend URL
# TTS_SPEAKER=aiden ./render.sh # override CustomVoice speaker
#
# Cred env vars (read for both targets, but prod has no fallback defaults):
# LOGIN_EMAIL, LOGIN_PASSWORD — the dashboard account to record as
@ -48,7 +52,7 @@ case "$TARGET" in
*) echo "Unknown --target: $TARGET (expected: local, prod)" >&2; exit 2 ;;
esac
# -- config (override via env) -------------------------------------------------
# -- environment (target-specific URLs and credentials) ----------------------
if [ "$TARGET" = "prod" ]; then
# Prod serves frontend, /api/*, and /pb/* off the same domain.
export APP_URL="${APP_URL:-https://perfect-postcode.co.uk}"
@ -81,23 +85,6 @@ AUTH_TTL_HOURS="${AUTH_TTL_HOURS:-24}" # re-auth if cache older than this
# the built bundle, so updating this path is what makes the new clip appear
# on the homepage. Override if the dashboard ever moves.
PUBLISH_DIR="${PUBLISH_DIR:-../frontend/public/video}"
# When in the output timeline to grab the poster frame.
# Right-pane inspection (~16s output) is the clearest paused-state preview:
# Manchester map, filters applied, right pane populated, larger narration
# caption visible.
POSTER_TIME_S="${POSTER_TIME_S:-16}"
# Recorder/encoder knobs read by src/config.ts. config.ts treats these as
# required, so they live here (the only entry point) rather than as defaults
# scattered across TS modules. Override per-run via env.
export ASPECT="${ASPECT:-16x9}"
export CAPTURE_SCALE="${CAPTURE_SCALE:-1}"
export WEBM_BITRATE="${WEBM_BITRATE:-$(awk -v s="$CAPTURE_SCALE" 'BEGIN{print (s+0>1)?"18M":"8M"}')}"
export PROMPT_TEXT="${PROMPT_TEXT:-Flats or terraces <£450k, 35 min to Manchester, low crime}"
export AI_ZOOM_SCALE="${AI_ZOOM_SCALE:-2.4}"
export MAX_DURATION_S="${MAX_DURATION_S:-60}"
export MIN_DURATION_S="${MIN_DURATION_S:-10}"
export OUTPUT_FPS="${OUTPUT_FPS:-50}"
FRESH_AUTH="${FORCE_AUTH:-0}"
DO_ENCODE=1
@ -109,7 +96,7 @@ for arg in "${@:-}"; do
--no-encode) DO_ENCODE=0 ;;
--no-audio) DO_AUDIO=0 ;;
-h|--help)
sed -n '3,30p' "$0"
sed -n '3,32p' "$0"
exit 0 ;;
*) echo "Unknown arg: $arg" >&2; exit 2 ;;
esac
@ -207,22 +194,57 @@ else
say "Reusing existing $AUTH_STATE_FILE"
fi
# -- preflight + synth (Qwen3-TTS) -------------------------------------------
# Synth runs BEFORE recording: one batched generate_custom_voice call across
# all cues so the voice stays consistent. The recorder reads
# output/audio/index.json for measured per-cue durations and sizes each
# cue's wall-clock to fit; --no-audio skips synth and the recorder falls
# back to a worst-case estimate.
# -- preflight ---------------------------------------------------------------
# preflight emits per-storyboard narration scripts AND output/storyboards.json
# (the index this script loops over below). Run it BEFORE wiping per-storyboard
# files so we know what slugs to target.
mkdir -p output
# Wipe last run's leaking artifacts so the rename step picks up *this* run.
rm -f output/recording.webm output/recording.mp4 output/page@*.webm output/page@*.webm.untrimmed
rm -f output/narration-script.json output/narration.json
# output/audio/ is preserved; tts/synth.py decides whether the cached WAVs
# still match the script and skips generation when they do.
say "Preflight: emitting narration script"
say "Preflight: emitting narration scripts and storyboard index"
node dist/preflight.js
if [ ! -s output/storyboards.json ]; then
fail "preflight did not produce output/storyboards.json"
fi
# Pull the storyboard slugs out of the index. Use Node so we don't grow a jq
# dependency just for one read.
mapfile -t STORYBOARDS < <(node -e '
const idx = JSON.parse(require("fs").readFileSync("output/storyboards.json","utf8"));
for (const s of idx.storyboards) console.log(s.name);
')
if [ "${#STORYBOARDS[@]}" -eq 0 ]; then
fail "storyboards.json contains no storyboards"
fi
say "Storyboards to render: ${STORYBOARDS[*]}"
#######################################
# Print the poster timestamp (in seconds) for one storyboard slug.
#
# The index is re-read on every call; that is one short Node start-up per
# storyboard, which is negligible next to the encode it precedes.
#
# Arguments: $1 - storyboard slug (the "name" field of an entry in
#                 output/storyboards.json)
# Outputs:   the entry's posterTimeS on stdout, without a trailing newline
# Returns:   non-zero, with the reason on stderr, when the index cannot be
#            read or parsed, the slug is not in the index, or posterTimeS is
#            not a non-negative finite number
#######################################
poster_time_for() {
  node -e '
    const slug = process.argv[1];
    const idx = JSON.parse(require("fs").readFileSync("output/storyboards.json","utf8"));
    const sb = idx.storyboards.find(s => s.name === slug);
    if (!sb) {
      console.error(`poster_time_for: no storyboard named "${slug}" in output/storyboards.json`);
      process.exit(1);
    }
    // Refuse to hand ffmpeg a literal "undefined"/"null" as its -ss value.
    const t = sb.posterTimeS;
    if (typeof t !== "number" || !Number.isFinite(t) || t < 0) {
      console.error(`poster_time_for: storyboard "${slug}" has an invalid posterTimeS: ${JSON.stringify(t)}`);
      process.exit(1);
    }
    process.stdout.write(String(t));
  ' "$1"
}
# -- per-storyboard wipe of leaking artefacts --------------------------------
# output/<sb>/audio/ is preserved; tts/synth.py decides whether the cached
# WAVs still match the script and skips generation when they do.
for sb in "${STORYBOARDS[@]}"; do
rm -f "output/$sb/recording.webm" "output/$sb/recording.mp4" \
"output/$sb/page@"*.webm "output/$sb/page@"*.webm.untrimmed \
"output/$sb/recording.raw.webm" "output/$sb/recording.raw.webm.untrimmed" \
"output/$sb/recording.narrated.mp4" "output/$sb/poster.jpg" \
"output/$sb/narration.json"
done
# -- synth (Qwen3-TTS) -------------------------------------------------------
# Synth runs BEFORE recording: one batched generate_voice_clone call per
# storyboard so the voice stays consistent within each video. The recorder
# reads output/<sb>/audio/index.json for measured per-cue durations and
# sizes each cue's wall-clock to fit; --no-audio skips synth and the recorder
# falls back to a worst-case estimate.
if [ "$DO_AUDIO" = "1" ]; then
if ! command -v uv >/dev/null 2>&1; then
fail "uv not on PATH (required for Qwen3-TTS synth). Install uv or rerun with --no-audio."
@ -236,95 +258,103 @@ if [ "$DO_AUDIO" = "1" ]; then
if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L >/dev/null 2>&1; then
uv_sync_extras+=(--extra gpu)
fi
say "Synthesising narration with Qwen3-TTS (speaker=${TTS_SPEAKER:-ryan}) — one batched call"
say "Synchronising tts/ Python deps"
uv sync --project tts ${uv_sync_extras[@]+"${uv_sync_extras[@]}"} || fail "uv sync failed in video/tts"
uv run --project tts python tts/synth.py || fail "tts/synth.py failed"
if [ ! -s output/audio/index.json ]; then
fail "synth did not produce output/audio/index.json"
fi
for sb in "${STORYBOARDS[@]}"; do
say "Synthesising narration for [$sb] — one batched call"
uv run --project tts python tts/synth.py --storyboard "$sb" \
|| fail "tts/synth.py failed for $sb"
if [ ! -s "output/$sb/audio/index.json" ]; then
fail "synth did not produce output/$sb/audio/index.json"
fi
done
fi
# -- record -------------------------------------------------------------------
say "Recording"
# -- record ------------------------------------------------------------------
# record.ts iterates over storyboards in-process and writes per-storyboard
# recording.webm + narration.json. One Node invocation handles all of them
# so we don't spin up Playwright + GPU/WebGL + auth more than necessary.
say "Recording all storyboards"
APP_URL="$APP_URL" node dist/record.js
if [ ! -s output/recording.webm ]; then
fail "recording.webm missing or empty"
fi
node dist/verify.js output/recording.webm
# -- encode -------------------------------------------------------------------
if [ "$DO_ENCODE" = "1" ]; then
if ! command -v ffmpeg >/dev/null 2>&1; then
fail "ffmpeg not on PATH; rerun with --no-encode if you only need the WebM"
for sb in "${STORYBOARDS[@]}"; do
if [ ! -s "output/$sb/recording.webm" ]; then
fail "[$sb] recording.webm missing or empty"
fi
say "Encoding to MP4"
ffmpeg -y -loglevel warning -i output/recording.webm \
-c:v libx264 -pix_fmt yuv420p -crf 14 -preset fast \
-movflags +faststart \
output/recording.mp4
node dist/verify.js "$sb" "output/$sb/recording.webm"
done
# Poster: a single high-quality JPEG extracted from a representative
# moment in the output timeline. Used as the homepage <video poster=...>,
# which is what the visitor sees before pressing play.
# - -ss AFTER -i = output-side seek, frame-accurate (input-side seek
# would land on the nearest keyframe, drifting back up to ~2s).
# - -update 1 tells ffmpeg the output is a single image, not a sequence.
# - -q:v 2 = high JPEG quality (~95%); poster file is ~120KB at 1080p.
say "Extracting poster frame at ${POSTER_TIME_S}s"
ffmpeg -y -loglevel warning -i output/recording.mp4 -ss "$POSTER_TIME_S" \
-frames:v 1 -update 1 -q:v 2 \
output/poster.jpg
node dist/verify.js output/recording.mp4 output/poster.jpg
# -- encode + mux + publish (per storyboard) ---------------------------------
if [ "$DO_ENCODE" = "1" ] && ! command -v ffmpeg >/dev/null 2>&1; then
fail "ffmpeg not on PATH; rerun with --no-encode if you only need the WebM"
fi
# -- mux narration ------------------------------------------------------------
# Synth already produced per-cue WAVs (in output/audio/); the recorder logged
# each cue's videoTime against the trimmed timeline. Drop the WAVs onto the
# mp4 with one ffmpeg adelay+amix and replace the silent recording in place.
if [ "$DO_ENCODE" = "1" ] && [ "$DO_AUDIO" = "1" ]; then
if [ ! -s output/narration.json ]; then
fail "narration.json missing — recorder did not log cues"
for sb in "${STORYBOARDS[@]}"; do
if [ "$DO_ENCODE" = "1" ]; then
say "[$sb] Encoding to MP4"
ffmpeg -y -loglevel warning -i "output/$sb/recording.webm" \
-c:v libx264 -pix_fmt yuv420p -crf 14 -preset fast \
-movflags +faststart \
"output/$sb/recording.mp4"
# Poster: a single high-quality JPEG extracted from a representative
# moment in the output timeline. Used as the homepage <video poster=...>.
# - -ss AFTER -i = output-side seek, frame-accurate (input-side seek
# would land on the nearest keyframe, drifting back up to ~2s).
# - -update 1 tells ffmpeg the output is a single image, not a sequence.
# - -q:v 2 = high JPEG quality (~95%); poster file is ~120KB at 1080p.
poster_t="$(poster_time_for "$sb")"
say "[$sb] Extracting poster frame at ${poster_t}s"
ffmpeg -y -loglevel warning -i "output/$sb/recording.mp4" -ss "$poster_t" \
-frames:v 1 -update 1 -q:v 2 \
"output/$sb/poster.jpg"
node dist/verify.js "$sb" "output/$sb/recording.mp4" "output/$sb/poster.jpg"
fi
say "Muxing narration into output/recording.mp4"
uv run --project tts python tts/mux.py --replace \
|| fail "tts/mux.py failed"
node dist/verify.js output/recording.mp4
fi
# -- publish to homepage ------------------------------------------------------
# Only publish when we did the encode (otherwise we'd be copying a stale
# mp4 next to a fresh webm). --no-encode skips this whole block.
if [ "$DO_ENCODE" = "1" ]; then
if [ ! -d "$PUBLISH_DIR" ]; then
say "Creating $PUBLISH_DIR"
mkdir -p "$PUBLISH_DIR"
if [ "$DO_ENCODE" = "1" ] && [ "$DO_AUDIO" = "1" ]; then
if [ ! -s "output/$sb/narration.json" ]; then
fail "[$sb] narration.json missing — recorder did not log cues"
fi
say "[$sb] Muxing narration into output/$sb/recording.mp4"
uv run --project tts python tts/mux.py --storyboard "$sb" --replace \
|| fail "tts/mux.py failed for $sb"
node dist/verify.js "$sb" "output/$sb/recording.mp4"
fi
say "Publishing to $PUBLISH_DIR"
cp output/recording.mp4 "$PUBLISH_DIR/recording.mp4"
cp output/poster.jpg "$PUBLISH_DIR/poster.jpg"
node dist/verify.js "$PUBLISH_DIR/recording.mp4" "$PUBLISH_DIR/poster.jpg"
fi
# -- report -------------------------------------------------------------------
# Only publish when we did the encode (otherwise we'd be copying a stale
# mp4 next to a fresh webm). --no-encode skips publish.
if [ "$DO_ENCODE" = "1" ]; then
if [ ! -d "$PUBLISH_DIR" ]; then
say "Creating $PUBLISH_DIR"
mkdir -p "$PUBLISH_DIR"
fi
say "[$sb] Publishing to $PUBLISH_DIR/$sb.{mp4,jpg}"
cp "output/$sb/recording.mp4" "$PUBLISH_DIR/$sb.mp4"
cp "output/$sb/poster.jpg" "$PUBLISH_DIR/$sb.jpg"
node dist/verify.js "$sb" "$PUBLISH_DIR/$sb.mp4" "$PUBLISH_DIR/$sb.jpg"
fi
done
# -- report ------------------------------------------------------------------
say "Done"
if command -v ffprobe >/dev/null 2>&1; then
for f in output/recording.webm output/recording.mp4 output/poster.jpg \
"$PUBLISH_DIR/recording.mp4" "$PUBLISH_DIR/poster.jpg"; do
[ -f "$f" ] || continue
size=$(stat -c '%s' "$f" 2>/dev/null || stat -f '%z' "$f")
case "$f" in
*.mp4|*.webm)
dur=$(ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "$f")
printf ' %s %ss %s bytes\n' "$f" "$(printf '%.2f' "$dur")" "$size"
;;
*)
printf ' %s %s bytes\n' "$f" "$size"
;;
esac
for sb in "${STORYBOARDS[@]}"; do
for f in "output/$sb/recording.webm" "output/$sb/recording.mp4" \
"output/$sb/poster.jpg" \
"$PUBLISH_DIR/$sb.mp4" "$PUBLISH_DIR/$sb.jpg"; do
[ -f "$f" ] || continue
size=$(stat -c '%s' "$f" 2>/dev/null || stat -f '%z' "$f")
case "$f" in
*.mp4|*.webm)
dur=$(ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "$f")
printf ' %s %ss %s bytes\n' "$f" "$(printf '%.2f' "$dur")" "$size"
;;
*)
printf ' %s %s bytes\n' "$f" "$size"
;;
esac
done
done
else
# No ffprobe: fall back to a plain size listing. Outputs live under
# output/<slug>/ and are published as $PUBLISH_DIR/<slug>.{mp4,jpg}, so list
# them per storyboard. Missing files (e.g. after --no-encode) are expected,
# hence the silenced stderr and the `|| true`.
for sb in "${STORYBOARDS[@]}"; do
  ls -la "output/$sb/recording."* "output/$sb/poster.jpg" \
    "$PUBLISH_DIR/$sb.mp4" "$PUBLISH_DIR/$sb.jpg" 2>/dev/null || true
done
fi

View file

@ -3,48 +3,52 @@ import {
type Browser,
type BrowserContext,
type Page,
} from "playwright";
import {
AUTH_STATE_PATH,
CAPTURE_SCALE,
OUTPUT_DIR,
VIDEO_SIZE,
VIEWPORT,
} from "./config.js";
} from 'playwright';
import { AUTH_STATE_PATH } from './config.js';
import { viewportFor, type Storyboard } from './script.js';
export interface RecordingBrowser {
browser: Browser;
context: BrowserContext;
}
export async function launchRecordingBrowser(): Promise<RecordingBrowser> {
export interface LaunchOptions {
/** Directory the playwright recorder writes the raw .webm into. */
recordDir: string;
}
export async function launchRecordingBrowser(
storyboard: Storyboard,
opts: LaunchOptions
): Promise<RecordingBrowser> {
const browser = await chromium.launch({
headless: true,
args: [
"--disable-blink-features=AutomationControlled",
"--enable-gpu",
"--use-gl=angle",
"--use-angle=gl-egl",
"--ignore-gpu-blocklist",
"--enable-webgl",
"--enable-webgl2",
"--enable-gpu-rasterization",
"--enable-zero-copy",
"--disable-software-rasterizer",
"--disable-frame-rate-limit",
"--disable-gpu-vsync",
"--disable-features=CalculateNativeWinOcclusion,IntensiveWakeUpThrottling",
"--disable-renderer-backgrounding",
"--disable-background-timer-throttling",
"--disable-backgrounding-occluded-windows",
'--disable-blink-features=AutomationControlled',
'--enable-gpu',
'--use-gl=angle',
'--use-angle=gl-egl',
'--ignore-gpu-blocklist',
'--enable-webgl',
'--enable-webgl2',
'--enable-gpu-rasterization',
'--enable-zero-copy',
'--disable-software-rasterizer',
'--disable-frame-rate-limit',
'--disable-gpu-vsync',
'--disable-features=CalculateNativeWinOcclusion,IntensiveWakeUpThrottling',
'--disable-renderer-backgrounding',
'--disable-background-timer-throttling',
'--disable-backgrounding-occluded-windows',
],
});
const viewport = viewportFor(storyboard.video);
const context = await browser.newContext({
storageState: AUTH_STATE_PATH,
viewport: VIEWPORT,
deviceScaleFactor: CAPTURE_SCALE,
recordVideo: { dir: OUTPUT_DIR, size: VIDEO_SIZE },
viewport,
deviceScaleFactor: storyboard.video.captureScale,
recordVideo: { dir: opts.recordDir, size: viewport },
});
await suppressDevServerNoise(context);
return { browser, context };
@ -52,11 +56,11 @@ export async function launchRecordingBrowser(): Promise<RecordingBrowser> {
export async function assertHardwareWebGL(page: Page): Promise<void> {
const info = await page.evaluate(() => {
const canvas = document.createElement("canvas");
const gl = canvas.getContext("webgl2");
if (!gl) return { webgl: false, vendor: "", renderer: "" };
const canvas = document.createElement('canvas');
const gl = canvas.getContext('webgl2');
if (!gl) return { webgl: false, vendor: '', renderer: '' };
const ext = gl.getExtension("WEBGL_debug_renderer_info");
const ext = gl.getExtension('WEBGL_debug_renderer_info');
const vendor = String(
ext
? gl.getParameter(ext.UNMASKED_VENDOR_WEBGL)
@ -71,15 +75,15 @@ export async function assertHardwareWebGL(page: Page): Promise<void> {
});
console.log(
`[gpu] WebGL renderer: ${info.webgl ? `${info.vendor} / ${info.renderer}` : "none"}`,
`[gpu] WebGL renderer: ${info.webgl ? `${info.vendor} / ${info.renderer}` : 'none'}`,
);
if (
process.env.ALLOW_SOFTWARE_GL !== "1" &&
process.env.ALLOW_SOFTWARE_GL !== '1' &&
(!info.webgl ||
/SwiftShader|llvmpipe|software/i.test(`${info.vendor} ${info.renderer}`))
) {
throw new Error(
"Recording browser did not get hardware WebGL. Set ALLOW_SOFTWARE_GL=1 to bypass this guard.",
'Recording browser did not get hardware WebGL. Set ALLOW_SOFTWARE_GL=1 to bypass this guard.',
);
}
}
@ -89,45 +93,45 @@ async function suppressDevServerNoise(context: BrowserContext) {
const RealWS = window.WebSocket;
window.WebSocket = new Proxy(RealWS, {
construct(target, args) {
const url = String(args[0] ?? "");
const proto = (args[1] as string | string[] | undefined) ?? "";
const protoStr = Array.isArray(proto) ? proto.join(",") : proto;
const url = String(args[0] ?? '');
const proto = (args[1] as string | string[] | undefined) ?? '';
const protoStr = Array.isArray(proto) ? proto.join(',') : proto;
if (
protoStr.includes("vite-hmr") ||
protoStr.includes("webpack") ||
url.includes("/ws") ||
url.includes("sockjs-node")
protoStr.includes('vite-hmr') ||
protoStr.includes('webpack') ||
url.includes('/ws') ||
url.includes('sockjs-node')
) {
const fake = new EventTarget() as WebSocket;
Object.defineProperties(fake, {
readyState: { value: RealWS.CLOSED },
url: { value: url },
protocol: { value: "" },
extensions: { value: "" },
protocol: { value: '' },
extensions: { value: '' },
bufferedAmount: { value: 0 },
binaryType: { value: "blob", writable: true },
binaryType: { value: 'blob', writable: true },
});
fake.send = () => {};
fake.close = () => fake.dispatchEvent(new Event("close"));
queueMicrotask(() => fake.dispatchEvent(new Event("close")));
fake.close = () => fake.dispatchEvent(new Event('close'));
queueMicrotask(() => fake.dispatchEvent(new Event('close')));
return fake;
}
return Reflect.construct(target, args);
},
});
Object.defineProperty(window.location, "reload", {
Object.defineProperty(window.location, 'reload', {
value: () => {},
configurable: true,
});
window.addEventListener("error", (e) => e.stopImmediatePropagation(), true);
window.addEventListener('error', (e) => e.stopImmediatePropagation(), true);
window.addEventListener(
"unhandledrejection",
'unhandledrejection',
(e) => e.stopImmediatePropagation(),
true,
);
const styleEl = document.createElement("style");
const styleEl = document.createElement('style');
styleEl.textContent = `
vite-error-overlay,
wds-overlay,
@ -148,12 +152,12 @@ async function suppressDevServerNoise(context: BrowserContext) {
const killOverlay = (node: Element) => {
const tag = node.tagName?.toLowerCase();
const id = (node as HTMLElement).id?.toLowerCase() ?? "";
const id = (node as HTMLElement).id?.toLowerCase() ?? '';
if (
tag === "vite-error-overlay" ||
tag === "wds-overlay" ||
id.includes("webpack-dev-server-client") ||
id.includes("webpack-error")
tag === 'vite-error-overlay' ||
tag === 'wds-overlay' ||
id.includes('webpack-dev-server-client') ||
id.includes('webpack-error')
) {
(node as HTMLElement).remove();
}
@ -168,7 +172,7 @@ async function suppressDevServerNoise(context: BrowserContext) {
if (document.body)
obs.observe(document.body, { childList: true, subtree: true });
else {
document.addEventListener("DOMContentLoaded", () =>
document.addEventListener('DOMContentLoaded', () =>
obs.observe(document.body, { childList: true, subtree: true }),
);
}

View file

@ -6,101 +6,19 @@ function requiredEnv(name: string): string {
return value;
}
function requiredNumberEnv(name: string): number {
const value = Number(requiredEnv(name));
if (!Number.isFinite(value)) {
throw new Error(`${name} must be a finite number`);
}
return value;
}
// Environment-only knobs. Per-storyboard tuning (aspect, fps, bitrate,
// voice, prompts, brand…) lives on the Storyboard object itself — see
// src/storyboard.ts.
export const APP_URL = requiredEnv("APP_URL");
export const DASHBOARD_PATH = "/dashboard";
export const APP_URL = requiredEnv('APP_URL');
export const DASHBOARD_PATH = '/dashboard';
// Per-target storage state. render.sh sets AUTH_STATE_FILE to auth.local.json
// or auth.prod.json so a stale local token can't be reused against prod.
export const AUTH_STATE_PATH = process.env.AUTH_STATE_FILE ?? "auth.json";
export const OUTPUT_DIR = "output";
const aspect = requiredEnv("ASPECT");
if (aspect !== "16x9" && aspect !== "9x16") {
throw new Error("ASPECT must be '16x9' or '9x16'");
}
export const VIEWPORT =
aspect === "9x16"
? { width: 1080, height: 1920 }
: { width: 1920, height: 1080 };
export const CAPTURE_SCALE = Math.max(1, requiredNumberEnv("CAPTURE_SCALE"));
export const VIDEO_SIZE = {
width: VIEWPORT.width,
height: VIEWPORT.height,
};
export const WEBM_BITRATE = requiredEnv("WEBM_BITRATE");
// Cold-open prompt. Punchy version of the user's intent, short enough to type
// on camera without making the opening scene drag.
export const PROMPT_TEXT = requiredEnv("PROMPT_TEXT");
// Filters returned by the AI stub. Keys MUST match real feature names from
// /api/features (verified against the running server's schema).
export const STUBBED_FILTERS: Record<string, [number, number] | string[]> = {
"Property type": ["Flats/Maisonettes", "Terraced"],
"Estimated current price": [175000, 450000],
"Serious crime per 1k residents (avg/yr)": [0, 55],
"Noise (dB)": [50, 68],
};
// Travel-time filters returned by the AI stub. Slug matches the real
// /api/travel-destinations?mode=transit response.
export const STUBBED_TRAVEL_TIME_FILTERS: {
mode: "transit" | "car" | "bicycle" | "walking";
slug: string;
label: string;
min?: number;
max?: number;
}[] = [
{
mode: "transit",
slug: "manchester",
label: "Manchester city centre",
max: 35,
},
];
// The travel-time card we'll drag manually after AI applies. The Filters
// component renders each travel-time entry with `data-filter-name="tt_${i}"`,
// and our stub only sets one entry, so it's tt_0.
export const TT_CARD_SELECTOR = '[data-filter-name="tt_0"]';
export const TT_SLIDER_MAX = 120;
export const TT_DRAG_FROM_MIN = 35; // matches AI stub max above
export const TT_DRAG_TO_MIN = 20;
// Cold-open zoom: how aggressively to magnify the AI box.
// 2.4 fills most of the viewport with the prompt card without blowing up text.
export const AI_ZOOM_SCALE = requiredNumberEnv("AI_ZOOM_SCALE");
// Initial map view used while we navigate. The AI scene zooms in on the
// sidebar so this only matters once we zoom out.
export const INITIAL_MAP_VIEW = {
lat: 53.4795,
lon: -2.2451,
zoom: 11.5,
};
// Verification guard only. The renderer does not use this as an editing cap:
// if the storyboard needs more than 15 seconds to avoid jumps, keep the frames.
export const MAX_DURATION_S = requiredNumberEnv("MAX_DURATION_S");
export const MIN_DURATION_S = requiredNumberEnv("MIN_DURATION_S");
// Target fps of the FINAL output.
export const OUTPUT_FPS = requiredNumberEnv("OUTPUT_FPS");
export const AUTH_STATE_PATH = process.env.AUTH_STATE_FILE ?? 'auth.json';
export const OUTPUT_DIR = 'output';
// Frames of head-room kept in front of sceneStart when trimming. Shared by
// the video trim and the narration manifest so cue offsets line up with the
// trimmed timeline.
// trimmed timeline. Not tuned per storyboard — same lead-in for any cut.
export const LEAD_IN_S = 0.12;
// Brand strings for the outro card.
export const BRAND_NAME = "Perfect Postcode";
export const BRAND_TAGLINE = "Find where you actually want to live.";
export const BRAND_URL = "https://perfect-postcode.co.uk";

View file

@ -1,32 +1,83 @@
import { existsSync, mkdirSync, writeFileSync } from 'node:fs';
import { join } from 'node:path';
import { OUTPUT_DIR } from './config.js';
import { storyboard } from './storyboard.js';
import type { Storyboard } from './script.js';
import { storyboards } from './storyboard.js';
/**
* Emit the narration script for the synth step.
* Emit per-storyboard narration scripts for the synth step.
*
* Synth (tts/synth.py) runs BEFORE recording, so it needs the full ordered
* narration list text + per-cue gaps without depending on Playwright,
* the dashboard, or auth. Walk the storyboard cues, write a flat manifest,
* exit.
* narration list text + per-cue gaps + voice config without depending
* on Playwright, the dashboard, or auth. Walk each storyboard's cues, write
* a flat manifest under `output/<name>/narration-script.json`, then write
* an index manifest at `output/storyboards.json` so render.sh knows which
* storyboard slugs to loop over.
*
* The cue index in this manifest is the source of truth: the runner later
* The cue index in each manifest is the source of truth: the runner later
* matches storyboard cues to measured durations by index.
*/
function main(): void {
if (!existsSync(OUTPUT_DIR)) mkdirSync(OUTPUT_DIR, { recursive: true });
// Sanitise cue text for the synth input only. Em/en-dashes and ellipses make
// Qwen3-TTS produce dramatic pauses, sighs, or audible breaths; the captions
// keep rendering the original (unicode-rich) storyboard text.
//
// Rewrites are applied in order: dashes (with surrounding whitespace) become
// ", ", a unicode ellipsis or a run of three or more dots becomes a single
// ".", runs of whitespace collapse to one space, and the result is trimmed.
function normalizeForTts(text: string): string {
  const rewrites: Array<[RegExp, string]> = [
    [/\s*[—–]\s*/g, ', '],
    [/…/g, '.'],
    [/\.{3,}/g, '.'],
    [/\s{2,}/g, ' '],
  ];
  let spoken = text;
  for (const [pattern, replacement] of rewrites) {
    spoken = spoken.replace(pattern, replacement);
  }
  return spoken.trim();
}
function emitScript(storyboard: Storyboard): string {
const dir = join(OUTPUT_DIR, storyboard.name);
if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
const items = storyboard.cues.map((cue, cueIndex) => ({
cueIndex,
text: cue.text.trim(),
text: normalizeForTts(cue.text),
gapBeforeMs: cue.gapBeforeMs,
}));
const manifest = { items };
const path = join(OUTPUT_DIR, 'narration-script.json');
// The voice block is consumed by tts/synth.py — see _resolve_reference and
// the cache check there for which fields invalidate cached audio.
const manifest = {
storyboard: storyboard.name,
voice: {
instruct: storyboard.voice.instruct,
language: storyboard.voice.language,
temperature: storyboard.voice.temperature ?? 0.6,
topP: storyboard.voice.topP ?? 0.9,
seed: storyboard.voice.seed ?? 42,
},
items,
};
const path = join(dir, 'narration-script.json');
writeFileSync(path, JSON.stringify(manifest, null, 2));
console.log(`Wrote ${items.length} narration cues to ${path}`);
console.log(`[preflight] [${storyboard.name}] wrote ${items.length} cues → ${path}`);
return path;
}
/**
 * Entry point: emit one narration script per storyboard, then write the
 * index file (`storyboards.json` under OUTPUT_DIR) that lists each
 * storyboard's name and video settings.
 */
function main(): void {
  if (!existsSync(OUTPUT_DIR)) mkdirSync(OUTPUT_DIR, { recursive: true });

  for (const sb of storyboards) {
    emitScript(sb);
  }

  // One entry per storyboard, carrying the fields a shell loop needs to
  // address per-storyboard outputs without parsing the TS source.
  const entries = storyboards.map(({ name, video }) => ({
    name,
    aspect: video.aspect,
    outputFps: video.outputFps,
    minDurationS: video.minDurationS,
    maxDurationS: video.maxDurationS,
    posterTimeS: video.posterTimeS,
  }));

  const indexPath = join(OUTPUT_DIR, 'storyboards.json');
  const payload = JSON.stringify({ storyboards: entries }, null, 2);
  writeFileSync(indexPath, payload);
  console.log(`[preflight] wrote storyboard index → ${indexPath}`);
}
main();

View file

@ -1,11 +1,15 @@
import { chromium } from 'playwright';
import { APP_URL, AUTH_STATE_PATH, DASHBOARD_PATH, VIEWPORT } from './config.js';
import { APP_URL, AUTH_STATE_PATH, DASHBOARD_PATH } from './config.js';
import { viewportFor } from './script.js';
import { storyboards } from './storyboard.js';
async function main() {
// probe is a debug utility — pin it to the first storyboard's viewport.
const viewport = viewportFor(storyboards[0].video);
const browser = await chromium.launch({ headless: true });
const context = await browser.newContext({
storageState: AUTH_STATE_PATH,
viewport: VIEWPORT,
viewport,
});
const page = await context.newPage();
page.on('request', (r) => {

View file

@ -4,18 +4,20 @@ import { AUTH_STATE_PATH, LEAD_IN_S, OUTPUT_DIR } from './config.js';
import { assertHardwareWebGL, launchRecordingBrowser } from './browser.js';
import { narrationLog } from './narration.js';
import { installDemoRoutes } from './routes.js';
import { storyboard } from './storyboard.js';
import type { Storyboard } from './script.js';
import { storyboards } from './storyboard.js';
import { prepareTimeline, runTimeline } from './timeline.js';
import { trimRecording } from './video.js';
async function main() {
if (!existsSync(AUTH_STATE_PATH)) {
console.error(`No ${AUTH_STATE_PATH} found. Run "npm run setup-auth" first.`);
process.exit(1);
}
if (!existsSync(OUTPUT_DIR)) mkdirSync(OUTPUT_DIR, { recursive: true });
async function recordOne(storyboard: Storyboard): Promise<void> {
const dir = join(OUTPUT_DIR, storyboard.name);
if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
const { browser, context } = await launchRecordingBrowser();
console.log(`\n=== [${storyboard.name}] recording ===`);
const { browser, context } = await launchRecordingBrowser(storyboard, {
recordDir: dir,
});
const page = await context.newPage();
await assertHardwareWebGL(page);
const recordedVideo = page.video();
@ -37,22 +39,21 @@ async function main() {
if (u.includes('ai-filters')) console.log(`[req] ${r.method()} ${u}`);
});
await installDemoRoutes(page);
const ctx = await prepareTimeline(page);
await installDemoRoutes(page, storyboard);
const ctx = await prepareTimeline(page, storyboard);
const timeline = await runTimeline(ctx, storyboard);
await page.close();
const rawPath = join(OUTPUT_DIR, 'recording.raw.webm');
const rawPath = join(dir, 'recording.raw.webm');
if (recordedVideo) await recordedVideo.saveAs(rawPath);
await context.close();
await browser.close();
if (!recordedVideo || !statSync(rawPath).size) {
console.error('no recorded webm found');
process.exit(1);
throw new Error(`[${storyboard.name}] no recorded webm found`);
}
trimRecording(rawPath, join(OUTPUT_DIR, 'recording.webm'), {
trimRecording(rawPath, join(dir, 'recording.webm'), storyboard, {
recordStartMs,
...timeline,
});
@ -60,13 +61,25 @@ async function main() {
const totalDurationMs =
timeline.sceneEndMs - timeline.sceneStartMs + LEAD_IN_S * 1000;
const cues = narrationLog.flush(
join(OUTPUT_DIR, 'narration.json'),
join(dir, 'narration.json'),
totalDurationMs
);
console.log(
`Wrote ${cues.length} narration cues to ${join(OUTPUT_DIR, 'narration.json')}`
`[${storyboard.name}] wrote ${cues.length} narration cues → ${join(dir, 'narration.json')}`
);
console.log('Run "npm run encode" to produce output/recording.mp4');
}
async function main(): Promise<void> {
if (!existsSync(AUTH_STATE_PATH)) {
console.error(`No ${AUTH_STATE_PATH} found. Run "npm run setup-auth" first.`);
process.exit(1);
}
if (!existsSync(OUTPUT_DIR)) mkdirSync(OUTPUT_DIR, { recursive: true });
for (const sb of storyboards) {
await recordOne(sb);
}
console.log(`\n=== recorded ${storyboards.length} storyboard(s) ===`);
}
main().catch((err) => {

View file

@ -1,35 +1,33 @@
import type { Page } from 'playwright';
import {
APP_URL,
DASHBOARD_PATH,
INITIAL_MAP_VIEW,
STUBBED_FILTERS,
STUBBED_TRAVEL_TIME_FILTERS,
} from './config.js';
import { APP_URL, DASHBOARD_PATH } from './config.js';
import type { Storyboard } from './script.js';
export async function installDemoRoutes(page: Page) {
await Promise.all([stubAiFilters(page), stubExport(page)]);
export async function installDemoRoutes(page: Page, storyboard: Storyboard) {
await Promise.all([stubAiFilters(page, storyboard), stubExport(page)]);
}
export function dashboardUrl(): string {
export function dashboardUrl(storyboard: Storyboard): string {
const view = storyboard.content.initialMapView;
const params = new URLSearchParams({
lat: String(INITIAL_MAP_VIEW.lat),
lon: String(INITIAL_MAP_VIEW.lon),
zoom: String(INITIAL_MAP_VIEW.zoom),
lat: String(view.lat),
lon: String(view.lon),
zoom: String(view.zoom),
});
addInitialTravelTimeParams(params);
for (const tt of storyboard.content.stubbedTravelTimeFilters) {
params.append('tt', `${tt.mode}:${tt.slug}:${tt.label}:${tt.min ?? 0}:${tt.max ?? 120}`);
}
return `${APP_URL}${DASHBOARD_PATH}?${params}`;
}
async function stubAiFilters(page: Page) {
async function stubAiFilters(page: Page, storyboard: Storyboard) {
await page.route('**/api/ai-filters', async (route) => {
await new Promise((r) => setTimeout(r, 120));
await route.fulfill({
status: 200,
contentType: 'application/json',
body: JSON.stringify({
filters: STUBBED_FILTERS,
travel_time_filters: STUBBED_TRAVEL_TIME_FILTERS,
filters: storyboard.content.stubbedFilters,
travel_time_filters: storyboard.content.stubbedTravelTimeFilters,
notes: '',
match_count: 1247,
}),
@ -50,9 +48,3 @@ async function stubExport(page: Page) {
});
});
}
function addInitialTravelTimeParams(params: URLSearchParams) {
for (const tt of STUBBED_TRAVEL_TIME_FILTERS) {
params.append('tt', `${tt.mode}:${tt.slug}:${tt.label}:${tt.min ?? 0}:${tt.max ?? 120}`);
}
}

View file

@ -243,7 +243,7 @@ async function resolveTarget(
* against.
*/
function loadSynthIndex(storyboard: Storyboard): SynthCue[] {
const path = join(OUTPUT_DIR, 'audio', 'index.json');
const path = join(OUTPUT_DIR, storyboard.name, 'audio', 'index.json');
if (existsSync(path)) {
const raw = JSON.parse(readFileSync(path, 'utf-8')) as {
items: SynthCue[];

View file

@ -97,13 +97,97 @@ export interface Cue {
tail?: Activity[];
}
/**
 * Recorder + encoder knobs. Set per storyboard so vertical/horizontal cuts
 * can coexist without env-var juggling.
 *
 * Consumers visible in this change: `viewportFor` (aspect → pixel size),
 * the trim pass (outputFps, webmBitrate, maxDurationS) and the verify step
 * (viewport size, duration window, outputFps).
 */
export interface VideoConfig {
  /** "16x9" → 1920x1080, "9x16" → 1080x1920 (see `viewportFor`). */
  aspect: '16x9' | '9x16';
  /** Browser deviceScaleFactor. >1 supersamples for sharper text. */
  captureScale: number;
  /** WebM bitrate passed to libvpx via ffmpeg `-b:v`, e.g. "8M" or "18M". */
  webmBitrate: string;
  /** Final fps after the trim/resample pass; the verify step checks the
   * encoded file against this value (±0.1 fps). */
  outputFps: number;
  /** Lower bound (seconds) of the duration window enforced by verify.ts. */
  minDurationS: number;
  /** Upper bound (seconds) of the duration window enforced by verify.ts.
   * The trim pass only logs when a take exceeds it; it does not cut. */
  maxDurationS: number;
  /** Timestamp (seconds, in the trimmed mp4) used to extract the homepage
   * poster JPEG. Pick a frame that previews well on a paused player. */
  posterTimeS: number;
}
/**
 * Qwen3-TTS voice + language settings, sent to synth.py via the narration
 * script. Per storyboard so we can ship a British male narrator on one cut
 * and a different persona on another.
 *
 * NOTE(review): the "default" values quoted on the optional fields are not
 * applied by this type; presumably the consumer (tts/synth.py) supplies them
 * when a field is omitted. Confirm against that script.
 */
export interface VoiceConfig {
  /** VoiceDesign persona prompt (accent, register, anti-filler directives). */
  instruct: string;
  /** Qwen3-TTS language string, e.g. "English". */
  language: string;
  /** Sampling temperature (default 0.6). */
  temperature?: number;
  /** Top-p nucleus sampling (default 0.9). */
  topP?: number;
  /** Reproducibility seed (default 42). */
  seed?: number;
}
/** Brand strings rendered by the outro card (the `showOutro` activity takes
 * a brand name, a tagline and a URL). */
export interface BrandConfig {
  /** Product name shown on the outro card. */
  name: string;
  /** One-line tagline shown with the name. */
  tagline: string;
  /** Site URL shown on the outro card. */
  url: string;
}
/**
 * Story-specific content: the AI prompt typed on camera, the stubbed AI
 * response, the initial map view, and the travel-time slider tuning. The
 * storyboard cues reference these via the active Storyboard rather than
 * through globals so multiple storyboards can declare different prompts /
 * filters / drag targets without colliding.
 */
export interface ContentConfig {
  /** Prompt text typed into the AI box during the cold open. */
  promptText: string;
  /** Cold-open zoom multiplier on the AI card. */
  aiZoomScale: number;
  /** Map centre + zoom serialised into the dashboard URL as the `lat`,
   * `lon` and `zoom` query parameters. */
  initialMapView: { lat: number; lon: number; zoom: number };
  /** `filters` payload returned by the stubbed `/api/ai-filters` route.
   * Values are either a [min, max] numeric range or a list of options. */
  stubbedFilters: Record<string, [number, number] | string[]>;
  /** `travel_time_filters` payload returned by the stubbed AI route; the
   * same entries are also appended to the dashboard URL as `tt` params. */
  stubbedTravelTimeFilters: TravelTimeFilter[];
  /** CSS selector of the travel-time filter card whose slider is dragged. */
  travelTimeCardSelector: string;
  /** Upper end of the travel-time slider scale, in minutes; drag targets
   * are expressed as a fraction of this value. */
  travelTimeSliderMax: number;
  /** Slider position (minutes) before the drag. Presumably expected to
   * match the `max` of the stubbed travel-time filter; verify per story. */
  travelTimeDragFromMin: number;
  /** Slider position (minutes) the drag ends on. */
  travelTimeDragToMin: number;
  /** Strings for the outro card. */
  brand: BrandConfig;
}
/**
 * One travel-time constraint as returned by the stubbed AI response.
 * `dashboardUrl` serialises each entry into a `tt` query parameter of the
 * form `mode:slug:label:min:max`, substituting 0 for a missing `min` and
 * 120 for a missing `max`.
 */
export interface TravelTimeFilter {
  /** Transport mode used for the travel-time lookup. */
  mode: 'transit' | 'car' | 'bicycle' | 'walking';
  /** Destination identifier, e.g. "manchester". */
  slug: string;
  /** Human-readable destination name, e.g. "Manchester city centre". */
  label: string;
  /** Lower bound in minutes; treated as 0 in the URL when omitted. */
  min?: number;
  /** Upper bound in minutes; treated as 120 in the URL when omitted. */
  max?: number;
}
/**
 * Top-level storyboard. `pre` runs once before the first cue's gapBefore;
 * `post` runs once after the last cue's tail finishes. The cue list is what
 * gets handed to the synth step.
 *
 * `name` doubles as the on-disk slug: outputs go to `output/<name>/` and
 * publish as `<name>.mp4` + `<name>.jpg`. Keep names URL/path-safe.
 */
export interface Storyboard {
  /** Unique slug; also the lookup key used by `getStoryboard`. */
  name: string;
  /** Recorder + encoder settings for this cut. */
  video: VideoConfig;
  /** TTS persona and sampling settings for this cut. */
  voice: VoiceConfig;
  /** Prompt text, stubbed AI response, map view and brand strings. */
  content: ContentConfig;
  /** Silent setup activities run before the first cue. */
  pre?: Activity[];
  /** Narrated cues, in playback order. */
  cues: Cue[];
  /** Activities run after the last cue's tail. */
  post?: Activity[];
}
/**
 * Resolve the capture viewport, in pixels, for a video configuration.
 * Portrait ("9x16") yields 1080x1920; anything else yields 1920x1080.
 */
export function viewportFor(video: VideoConfig): { width: number; height: number } {
  if (video.aspect === '9x16') {
    return { width: 1080, height: 1920 };
  }
  return { width: 1920, height: 1080 };
}

View file

@ -1,31 +1,33 @@
import {
AI_ZOOM_SCALE,
BRAND_NAME,
BRAND_TAGLINE,
BRAND_URL,
PROMPT_TEXT,
TT_CARD_SELECTOR,
TT_DRAG_TO_MIN,
TT_SLIDER_MAX,
} from './config.js';
import { el, type Storyboard } from './script.js';
/**
* The demo video, top to bottom.
* The list of demo videos to render, in order.
*
* Audio is generated first (one batched Qwen call), so each cue's actual
* duration is known before recording. The runner sizes each cue's wall-time
* to the measured audio length, padding short `during` blocks with a
* trailing wait. Inter-cue spacing is controlled here via `gapBeforeMs`
* (silence in audio) plus optional `tail` activities (visual movement after
* the caption hides, before the next cue's gap).
* Each entry is a fully self-contained Storyboard: video knobs (aspect,
* bitrate, fps), voice persona (Qwen3-TTS instruct + language + sampling),
* stubbed AI response, brand strings, AND the cue list. There is no shared
* global state: to ship a vertical cut, a different prompt, or a different
* voice, push another item onto this array.
*
* `name` doubles as the on-disk slug. The pipeline writes per-storyboard
* artefacts to `output/<name>/` and publishes `<name>.mp4` / `<name>.jpg`
* to the homepage. The default storyboard is named `recording` so the
* existing homepage `/video/recording.mp4` keeps working unchanged.
*
* Audio is generated first (one batched Qwen call per storyboard, using
* its own voice config), so each cue's actual duration is known before
* recording. The runner sizes each cue's wall-time to the measured audio
* length, padding short `during` blocks with a trailing wait. Inter-cue
* spacing is controlled here via `gapBeforeMs` (silence in audio) plus
* optional `tail` activities (visual movement after the caption hides,
* before the next cue's gap).
*
* Sum of `during` declared durations MUST be ≤ the measured cue duration. If
* synth comes back tighter than the activities can fit, the runner throws
* with a pointer to the offending cue — bump that cue's text, lengthen its
* gapBefore, or trim a during step.
*
* Reference durations (Qwen3-TTS / speaker=ryan, 2026-05-09 measured):
* Reference durations (Qwen3-TTS / British male narrator, 2026-05-09):
* cue 0 1920ms "Describe the life you want."
* cue 1 2720ms "Every matching neighbourhood, side by side."
* cue 2 2160ms "Tighten the commute to 20 minutes."
@ -34,137 +36,238 @@ import { el, type Storyboard } from './script.js';
* cue 5 1760ms "Take the shortlist into Excel."
* cue 6 4400ms "Perfect Postcode. Find where you actually want to live."
*/
export const storyboard: Storyboard = {
const PROMPT_TEXT = 'Flats or terraces <£450k, 35 min to Manchester, low crime';
const BRAND = {
name: 'Perfect Postcode',
tagline: 'Find where you actually want to live.',
url: 'https://perfect-postcode.co.uk',
};
// Cold-open zoom: how aggressively to magnify the AI box.
// 2.4 fills most of the viewport with the prompt card without blowing up text.
const AI_ZOOM_SCALE = 2.4;
// The travel-time card we'll drag manually after AI applies. The Filters
// component renders each travel-time entry with `data-filter-name="tt_${i}"`,
// and our stub only sets one entry, so it's tt_0.
const TT_CARD_SELECTOR = '[data-filter-name="tt_0"]';
const TT_SLIDER_MAX = 120;
const TT_DRAG_FROM_MIN = 35; // matches AI stub max below
const TT_DRAG_TO_MIN = 20;
// Calm British male narrator: the VoiceDesign persona prompt used as
// `voice.instruct` for the default storyboard. The wording must describe a
// British voice to agree with the constant's name and the reference
// durations measured above.
// NOTE(review): the previous text requested a Chinese narrator with a strong
// Chinese accent, contradicting the name. If audio caches are keyed on this
// string, confirm the exact wording against the former tts/synth.py default
// so existing caches do not invalidate unnecessarily.
const BRITISH_MALE_NARRATOR =
  'Calm, professional middle-aged British male narrator with a ' +
  'neutral British accent. Even, measured pace; warm but ' +
  'understated; product-demo register. Do not laugh, sigh, gasp, or add ' +
  'filler sounds; no audible breaths between sentences.';
const DEFAULT_CUES: Storyboard['cues'] = [
// -- Scene 1: AI prompt ----------------------------------------------
// Cue 0 is short (1920ms) — caption shows alone, then typing + submit
// happen silently in the tail. The natural beat is: viewer hears the
// brief, then watches the prompt being typed.
{
text: 'Describe the life you want.',
gapBeforeMs: 0,
tail: [
{ kind: 'wait', durationMs: 140 },
{
kind: 'type',
selector: '[data-tutorial="ai-filters"] textarea',
text: PROMPT_TEXT,
durationMs: 3000,
},
{ kind: 'wait', durationMs: 140 },
{ kind: 'submitForm', formSelector: '[data-tutorial="ai-filters"] form', durationMs: 1700 },
{ kind: 'wait', durationMs: 700 },
],
},
// -- Scene 2: zoom out reveal ---------------------------------------
{
text: 'Every matching neighbourhood, side by side.',
gapBeforeMs: 400,
during: [{ kind: 'zoomReset', durationMs: 1400 }],
tail: [{ kind: 'wait', durationMs: 1200 }],
},
// -- Scene 3: travel-time slider ------------------------------------
{
text: `Tighten the commute to ${TT_DRAG_TO_MIN} minutes.`,
gapBeforeMs: 500,
during: [
{
kind: 'dragSlider',
thumbSelector: `${TT_CARD_SELECTOR} [role="slider"] >> nth=1`,
trackSelector: `${TT_CARD_SELECTOR} [data-orientation="horizontal"] >> nth=0`,
toFraction: TT_DRAG_TO_MIN / TT_SLIDER_MAX,
durationMs: 1400,
},
],
tail: [{ kind: 'wait', durationMs: 1200 }],
},
// -- Scene 4a: deep zoom into a hexagon -----------------------------
// The mapZoom barely fits (1500ms vs cue 1840ms); cursor prep happens
// earlier in this cue's during, the click + payoff dwell are in tail.
{
text: 'Drill into a single block.',
gapBeforeMs: 500,
during: [
{ kind: 'cursorScale', scale: 1.4, durationMs: 200 },
{
kind: 'mapZoom',
target: { kind: 'point', x: 1140, y: 605 },
steps: 18,
durationMs: 1500,
},
],
tail: [
// Wait for the post-zoom /api/postcodes response and a redraw
// before the click — otherwise the click can fire on a stale
// frame and miss the polygon.
{ kind: 'wait', durationMs: 1200 },
{
kind: 'click',
target: { kind: 'point', x: 1140, y: 605 },
durationMs: 700,
},
{ kind: 'cursorScale', scale: 1, durationMs: 280 },
// Linger so the climax cue lands on the right-pane reveal.
{ kind: 'wait', durationMs: 1500 },
],
},
// -- Scene 4b: right-pane payoff -----------------------------------
// 4480ms cue, no during — the camera holds on the populated right pane
// for the whole climax line. Tail dwells before the export beat.
{
text: 'Stats, listings, Street View, price history — all in one pane.',
gapBeforeMs: 0,
tail: [{ kind: 'wait', durationMs: 1200 }],
},
// -- Scene 5: export ------------------------------------------------
// 1760ms cue. zoomReset + click together fit (1700ms); 60ms padding.
{
text: 'Take the shortlist into Excel.',
gapBeforeMs: 500,
during: [
{ kind: 'zoomReset', durationMs: 900 },
{
kind: 'click',
target: el('button[title="Export to Excel"]'),
durationMs: 800,
},
],
tail: [{ kind: 'wait', durationMs: 800 }],
},
// -- Scene 6: outro -------------------------------------------------
{
text: `${BRAND.name}. ${BRAND.tagline}`,
gapBeforeMs: 600,
during: [
{
kind: 'showOutro',
brand: BRAND.name,
tagline: BRAND.tagline,
url: BRAND.url,
durationMs: 0,
},
],
tail: [{ kind: 'wait', durationMs: 1500 }],
},
];
const DEFAULT_PRE: Storyboard['pre'] = [
// Camera push-in to the AI box happens before the first caption — silent
// setup keeps the cold open from feeling rushed.
pre: [
{ kind: 'clearVignette', durationMs: 0 },
{ kind: 'wait', durationMs: 200 },
{
kind: 'zoomTo',
target: el('[data-tutorial="ai-filters"]'),
scale: AI_ZOOM_SCALE,
durationMs: 1300,
},
{ kind: 'wait', durationMs: 140 },
],
{ kind: 'clearVignette', durationMs: 0 },
{ kind: 'wait', durationMs: 200 },
{
kind: 'zoomTo',
target: el('[data-tutorial="ai-filters"]'),
scale: AI_ZOOM_SCALE,
durationMs: 1300,
},
{ kind: 'wait', durationMs: 140 },
];
cues: [
// -- Scene 1: AI prompt ----------------------------------------------
// Cue 0 is short (1920ms) — caption shows alone, then typing + submit
// happen silently in the tail. The natural beat is: viewer hears the
// brief, then watches the prompt being typed.
{
text: 'Describe the life you want.',
gapBeforeMs: 0,
tail: [
{ kind: 'wait', durationMs: 140 },
export const storyboards: Storyboard[] = [
{
name: 'recording',
video: {
aspect: '16x9',
captureScale: 1,
// 8M is enough for 1920x1080 at captureScale=1; bump to 18M when
// captureScale > 1 (supersampled) — see render.sh history if reviving
// higher-quality cuts.
webmBitrate: '8M',
outputFps: 50,
minDurationS: 10,
maxDurationS: 60,
// Right-pane inspection (~16s into the trimmed timeline) is the
// clearest paused-state preview: Manchester map, filters applied,
// right pane populated, larger narration caption visible.
posterTimeS: 16,
},
voice: {
instruct: BRITISH_MALE_NARRATOR,
language: 'English',
// Sampling pinned for cue-to-cue consistency. Lower temp/top_p make
// the decoder less likely to sample non-speech tokens (laughter,
// random noise) at the cost of slightly flatter intonation. Seed
// makes runs reproducible.
temperature: 0.6,
topP: 0.9,
seed: 42,
},
content: {
promptText: PROMPT_TEXT,
aiZoomScale: AI_ZOOM_SCALE,
// Initial map view used while we navigate. The AI scene zooms in on
// the sidebar so this only matters once we zoom out.
initialMapView: { lat: 53.4795, lon: -2.2451, zoom: 11.5 },
// Filters returned by the AI stub. Keys MUST match real feature names
// from /api/features (verified against the running server's schema).
stubbedFilters: {
'Property type': ['Flats/Maisonettes', 'Terraced'],
'Estimated current price': [175000, 450000],
'Serious crime per 1k residents (avg/yr)': [0, 55],
'Noise (dB)': [50, 68],
},
// Travel-time filters returned by the AI stub. Slug matches the real
// /api/travel-destinations?mode=transit response.
stubbedTravelTimeFilters: [
{
kind: 'type',
selector: '[data-tutorial="ai-filters"] textarea',
text: PROMPT_TEXT,
durationMs: 3000,
},
{ kind: 'wait', durationMs: 140 },
{ kind: 'submitForm', formSelector: '[data-tutorial="ai-filters"] form', durationMs: 1700 },
{ kind: 'wait', durationMs: 700 },
],
},
// -- Scene 2: zoom out reveal ---------------------------------------
{
text: 'Every matching neighbourhood, side by side.',
gapBeforeMs: 400,
during: [{ kind: 'zoomReset', durationMs: 1400 }],
tail: [{ kind: 'wait', durationMs: 1200 }],
},
// -- Scene 3: travel-time slider ------------------------------------
{
text: `Tighten the commute to ${TT_DRAG_TO_MIN} minutes.`,
gapBeforeMs: 500,
during: [
{
kind: 'dragSlider',
thumbSelector: `${TT_CARD_SELECTOR} [role="slider"] >> nth=1`,
trackSelector: `${TT_CARD_SELECTOR} [data-orientation="horizontal"] >> nth=0`,
toFraction: TT_DRAG_TO_MIN / TT_SLIDER_MAX,
durationMs: 1400,
mode: 'transit',
slug: 'manchester',
label: 'Manchester city centre',
max: TT_DRAG_FROM_MIN,
},
],
tail: [{ kind: 'wait', durationMs: 1200 }],
travelTimeCardSelector: TT_CARD_SELECTOR,
travelTimeSliderMax: TT_SLIDER_MAX,
travelTimeDragFromMin: TT_DRAG_FROM_MIN,
travelTimeDragToMin: TT_DRAG_TO_MIN,
brand: BRAND,
},
pre: DEFAULT_PRE,
cues: DEFAULT_CUES,
},
];
// -- Scene 4a: deep zoom into a hexagon -----------------------------
// The mapZoom barely fits (1500ms vs cue 1840ms); cursor prep happens
// earlier in this cue's during, the click + payoff dwell are in tail.
{
text: 'Drill into a single block.',
gapBeforeMs: 500,
during: [
{ kind: 'cursorScale', scale: 1.4, durationMs: 200 },
{
kind: 'mapZoom',
target: { kind: 'point', x: 1140, y: 605 },
steps: 18,
durationMs: 1500,
},
],
tail: [
// Wait for the post-zoom /api/postcodes response and a redraw
// before the click — otherwise the click can fire on a stale
// frame and miss the polygon.
{ kind: 'wait', durationMs: 1200 },
{
kind: 'click',
target: { kind: 'point', x: 1140, y: 605 },
durationMs: 700,
},
{ kind: 'cursorScale', scale: 1, durationMs: 280 },
// Linger so the climax cue lands on the right-pane reveal.
{ kind: 'wait', durationMs: 1500 },
],
},
// -- Scene 4b: right-pane payoff -----------------------------------
// 4480ms cue, no during — the camera holds on the populated right pane
// for the whole climax line. Tail dwells before the export beat.
{
text: 'Stats, listings, Street View, price history — all in one pane.',
gapBeforeMs: 0,
tail: [{ kind: 'wait', durationMs: 1200 }],
},
// -- Scene 5: export ------------------------------------------------
// 1760ms cue. zoomReset + click together fit (1700ms); 60ms padding.
{
text: 'Take the shortlist into Excel.',
gapBeforeMs: 500,
during: [
{ kind: 'zoomReset', durationMs: 900 },
{
kind: 'click',
target: el('button[title="Export to Excel"]'),
durationMs: 800,
},
],
tail: [{ kind: 'wait', durationMs: 800 }],
},
// -- Scene 6: outro -------------------------------------------------
{
text: `${BRAND_NAME}. ${BRAND_TAGLINE}`,
gapBeforeMs: 600,
during: [
{
kind: 'showOutro',
brand: BRAND_NAME,
tagline: BRAND_TAGLINE,
url: BRAND_URL,
durationMs: 0,
},
],
tail: [{ kind: 'wait', durationMs: 1500 }],
},
],
};
/**
 * Look up a storyboard by its `name` slug.
 *
 * @param name Slug to search for in the `storyboards` list.
 * @returns The first storyboard whose `name` matches exactly.
 * @throws Error listing every known slug when no storyboard matches.
 */
export function getStoryboard(name: string): Storyboard {
  for (const candidate of storyboards) {
    if (candidate.name === name) {
      return candidate;
    }
  }
  const known = storyboards.map((s) => s.name).join(', ');
  throw new Error(`Unknown storyboard "${name}". Known: ${known}`);
}

View file

@ -13,10 +13,13 @@ export type TimelineResult = RunnerResult;
* recording chrome (cursor, zoom wrapper, caption layer). Also opens the
* AI prompt textarea so the storyboard can begin typing immediately.
*/
export async function prepareTimeline(page: Page): Promise<ScriptCtx> {
export async function prepareTimeline(
page: Page,
storyboard: Storyboard
): Promise<ScriptCtx> {
const dashboard = new DashboardRecorder(page);
const initialMapVersion = dashboard.getMapDataVersion();
await page.goto(dashboardUrl(), { waitUntil: 'domcontentloaded' });
await page.goto(dashboardUrl(storyboard), { waitUntil: 'domcontentloaded' });
await page.waitForLoadState('load', { timeout: 15000 }).catch(() => {});
await page
.locator('[data-tutorial="ai-filters"]')

View file

@ -1,6 +1,8 @@
import { execFileSync } from 'node:child_process';
import { existsSync, statSync } from 'node:fs';
import { MAX_DURATION_S, MIN_DURATION_S, OUTPUT_FPS, OUTPUT_DIR, VIDEO_SIZE } from './config.js';
import { OUTPUT_DIR } from './config.js';
import { viewportFor, type Storyboard } from './script.js';
import { getStoryboard } from './storyboard.js';
interface Probe {
streams?: {
@ -48,7 +50,7 @@ function probe(path: string): Probe {
return JSON.parse(raw) as Probe;
}
function verifyVideo(path: string) {
function verifyVideo(path: string, storyboard: Storyboard) {
if (!existsSync(path)) fail(`${path} is missing`);
if (statSync(path).size === 0) fail(`${path} is empty`);
@ -56,18 +58,23 @@ function verifyVideo(path: string) {
const stream = data.streams?.[0];
if (!stream) fail(`${path} has no video stream`);
const expectedSize = viewportFor(storyboard.video);
const { minDurationS, maxDurationS, outputFps } = storyboard.video;
const duration = Number(data.format?.duration ?? 0);
const fps = parseRate(stream.avg_frame_rate || stream.r_frame_rate);
if (stream.width !== VIDEO_SIZE.width || stream.height !== VIDEO_SIZE.height) {
fail(`${path} is ${stream.width}x${stream.height}, expected ${VIDEO_SIZE.width}x${VIDEO_SIZE.height}`);
}
if (duration < MIN_DURATION_S || duration > MAX_DURATION_S) {
if (stream.width !== expectedSize.width || stream.height !== expectedSize.height) {
fail(
`${path} duration is ${duration.toFixed(2)}s, expected ${MIN_DURATION_S}-${MAX_DURATION_S}s`
`${path} is ${stream.width}x${stream.height}, expected ${expectedSize.width}x${expectedSize.height}`
);
}
if (Math.abs(fps - OUTPUT_FPS) > 0.1) {
fail(`${path} is ${fps.toFixed(2)}fps, expected ${OUTPUT_FPS}fps`);
if (duration < minDurationS || duration > maxDurationS) {
fail(
`${path} duration is ${duration.toFixed(2)}s, expected ${minDurationS}-${maxDurationS}s`
);
}
if (Math.abs(fps - outputFps) > 0.1) {
fail(`${path} is ${fps.toFixed(2)}fps, expected ${outputFps}fps`);
}
console.log(
@ -81,8 +88,20 @@ function verifyImage(path: string) {
console.log(`[verify] ${path}: ${statSync(path).size} bytes`);
}
const videoPath = process.argv[2] ?? `${OUTPUT_DIR}/recording.mp4`;
const posterPath = process.argv[3] ?? (process.argv[2] ? undefined : `${OUTPUT_DIR}/poster.jpg`);
// Usage:
// node dist/verify.js <storyboard> [videoPath] [posterPath]
// Defaults: videoPath=output/<storyboard>/recording.mp4,
// posterPath=output/<storyboard>/poster.jpg.
// If videoPath is given but posterPath is not, the poster check is skipped.
const storyboardName = process.argv[2];
if (!storyboardName) {
fail('verify: missing <storyboard> argument (e.g. `node dist/verify.js recording`)');
}
const storyboard = getStoryboard(storyboardName);
verifyVideo(videoPath);
const videoPath = process.argv[3] ?? `${OUTPUT_DIR}/${storyboard.name}/recording.mp4`;
const posterPath =
process.argv[4] ?? (process.argv[3] ? undefined : `${OUTPUT_DIR}/${storyboard.name}/poster.jpg`);
verifyVideo(videoPath, storyboard);
if (posterPath) verifyImage(posterPath);

View file

@ -1,10 +1,12 @@
import { execSync } from 'node:child_process';
import { renameSync, statSync } from 'node:fs';
import { LEAD_IN_S, MAX_DURATION_S, OUTPUT_FPS, VIDEO_SIZE, WEBM_BITRATE } from './config.js';
import { LEAD_IN_S } from './config.js';
import { viewportFor, type Storyboard } from './script.js';
export function trimRecording(
rawPath: string,
trimmedPath: string,
storyboard: Storyboard,
times: { recordStartMs: number; sceneStartMs: number; sceneEndMs: number }
) {
const sceneSpan = (times.sceneEndMs - times.sceneStartMs) / 1000;
@ -16,22 +18,26 @@ export function trimRecording(
const wallDuration = trimEnd - trimStart;
const finalDuration = wallDuration;
if (finalDuration > MAX_DURATION_S) {
const { outputFps, webmBitrate, maxDurationS } = storyboard.video;
const viewport = viewportFor(storyboard.video);
if (finalDuration > maxDurationS) {
console.log(
`Scene output duration is ${finalDuration.toFixed(2)}s (guard ${MAX_DURATION_S.toFixed(2)}s); keeping the full take.`
`[${storyboard.name}] Scene output duration is ${finalDuration.toFixed(2)}s ` +
`(guard ${maxDurationS.toFixed(2)}s); keeping the full take.`
);
}
const filter =
`trim=start=${trimStart.toFixed(3)}:duration=${wallDuration.toFixed(3)},` +
`setpts=PTS-STARTPTS,fps=${OUTPUT_FPS},` +
`setpts=PTS-STARTPTS,fps=${outputFps},` +
`trim=duration=${finalDuration.toFixed(3)},setpts=PTS-STARTPTS`;
// Keep trimming inside the filter graph: it is frame-accurate for WebM
// without the keyframe leakage of input seeking.
execSync(
`ffmpeg -y -i "${rawPath}" -vf "${filter}" ` +
`-fps_mode cfr -r ${OUTPUT_FPS} -c:v libvpx -b:v ${WEBM_BITRATE} -deadline good -cpu-used 5 ` +
`-fps_mode cfr -r ${outputFps} -c:v libvpx -b:v ${webmBitrate} -deadline good -cpu-used 5 ` +
`"${trimmedPath}"`,
{ stdio: 'inherit' }
);
@ -44,6 +50,6 @@ export function trimRecording(
}
console.log(
`Wrote ${trimmedPath} (${finalDuration.toFixed(2)}s, scene=${sceneSpan.toFixed(2)}s, capture=${VIDEO_SIZE.width}x${VIDEO_SIZE.height})`
`[${storyboard.name}] Wrote ${trimmedPath} (${finalDuration.toFixed(2)}s, scene=${sceneSpan.toFixed(2)}s, capture=${viewport.width}x${viewport.height})`
);
}

View file

@ -1,19 +1,19 @@
"""Mux per-cue WAVs into recording.mp4 at their narration offsets.
"""Mux per-cue WAVs into one storyboard's recording.mp4 at narration offsets.
Reads two manifests:
Reads two manifests inside ``output/<storyboard>/``:
* ``output/audio/index.json`` (synth output) per-cue WAV filename + measured
* ``audio/index.json`` (synth output) per-cue WAV filename + measured
duration. Generated BEFORE recording in one batched Qwen3-TTS call.
* ``output/narration.json`` (recorder output) per-cue ``videoTimeMs`` against
* ``narration.json`` (recorder output) per-cue ``videoTimeMs`` against
the trimmed video. Generated DURING recording.
Joins them by ``cueIndex`` (index in the cue list, 1:1 between manifests),
runs ffmpeg with one ``adelay`` per cue plus a single ``amix``, copies the
video stream, and writes ``output/recording.narrated.mp4``.
video stream, and writes ``output/<storyboard>/recording.narrated.mp4``.
Run from the ``video/`` directory after recording:
uv run --project tts python tts/mux.py
uv run --project tts python tts/mux.py --storyboard recording
"""
from __future__ import annotations
@ -28,23 +28,21 @@ from pathlib import Path
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--audio-dir", type=Path, default=Path("output/audio"))
parser.add_argument(
"--narration",
type=Path,
default=Path("output/narration.json"),
help="Per-cue videoTimeMs manifest written by the recorder.",
"--storyboard",
required=True,
help="Storyboard slug (matches Storyboard.name in src/storyboard.ts).",
)
parser.add_argument("--video", type=Path, default=Path("output/recording.mp4"))
parser.add_argument(
"--out",
"--output-dir",
type=Path,
default=Path("output/recording.narrated.mp4"),
default=Path("output"),
help="Root output directory; per-storyboard files live in <root>/<storyboard>/.",
)
parser.add_argument(
"--replace",
action="store_true",
help="After muxing, atomically replace --video with --out.",
help="After muxing, atomically replace the storyboard's recording.mp4.",
)
return parser.parse_args()
@ -56,7 +54,13 @@ def main() -> int:
print("[mux] ffmpeg not on PATH", file=sys.stderr)
return 1
audio_index_path = args.audio_dir / "index.json"
storyboard_dir = args.output_dir / args.storyboard
audio_dir = storyboard_dir / "audio"
narration_path = storyboard_dir / "narration.json"
video_path = storyboard_dir / "recording.mp4"
out_path = storyboard_dir / "recording.narrated.mp4"
audio_index_path = audio_dir / "index.json"
if not audio_index_path.exists():
print(
f"[mux] {audio_index_path} not found; run tts/synth.py first",
@ -64,25 +68,25 @@ def main() -> int:
)
return 1
if not args.narration.exists():
if not narration_path.exists():
print(
f"[mux] {args.narration} not found; the recorder must run before mux",
f"[mux] {narration_path} not found; the recorder must run before mux",
file=sys.stderr,
)
return 1
if not args.video.exists():
print(f"[mux] video not found: {args.video}", file=sys.stderr)
if not video_path.exists():
print(f"[mux] video not found: {video_path}", file=sys.stderr)
return 1
audio_index = json.loads(audio_index_path.read_text())
audio_items = [it for it in audio_index.get("items", []) if it.get("wav")]
if not audio_items:
print("[mux] synth produced no cues; copying video unchanged", file=sys.stderr)
shutil.copyfile(args.video, args.out)
shutil.copyfile(video_path, out_path)
return 0
narration = json.loads(args.narration.read_text())
narration = json.loads(narration_path.read_text())
nar_cues = list(narration.get("cues", []))
if len(nar_cues) != len(audio_items):
print(
@ -130,9 +134,9 @@ def main() -> int:
+ "\n - ".join(overlaps)
)
cmd: list[str] = ["ffmpeg", "-y", "-loglevel", "warning", "-i", str(args.video)]
cmd: list[str] = ["ffmpeg", "-y", "-loglevel", "warning", "-i", str(video_path)]
for it in items:
cmd += ["-i", str(args.audio_dir / it["wav"])]
cmd += ["-i", str(audio_dir / it["wav"])]
filter_parts: list[str] = []
mix_inputs: list[str] = []
@ -168,18 +172,21 @@ def main() -> int:
"-shortest",
"-movflags",
"+faststart",
str(args.out),
str(out_path),
]
print(f"[mux] muxing {len(items)} narration cues into {args.out}", flush=True)
print(
f"[mux] [{args.storyboard}] muxing {len(items)} narration cues into {out_path}",
flush=True,
)
result = subprocess.run(cmd)
if result.returncode != 0:
print(f"[mux] ffmpeg exited {result.returncode}", file=sys.stderr)
return result.returncode
if args.replace:
args.out.replace(args.video)
print(f"[mux] replaced {args.video} with narrated copy", flush=True)
out_path.replace(video_path)
print(f"[mux] replaced {video_path} with narrated copy", flush=True)
return 0

View file

@ -1,15 +1,28 @@
"""Synthesize the full narration in ONE batched Qwen3-TTS call.
"""Synthesize one storyboard's narration in ONE batched Qwen3-TTS call.
Reads ``output/narration-script.json`` (emitted by ``dist/preflight.js``) and
runs ``Qwen3TTSModel.generate_custom_voice`` with all cue texts as a single
batched list that way every cue shares the same model state, which keeps
prosody and timbre consistent across cues. Per-cue WAVs and an index manifest
go to ``output/audio/`` for the recording step (which reads measured cue
durations) and the mux step (which drops each WAV at its videoTime).
Reads ``output/<storyboard>/narration-script.json`` (emitted by
``dist/preflight.js``) and runs ``Qwen3TTSModel.generate_voice_design`` with
all cue texts as a single batched list — that way every cue shares the same
model state, which keeps prosody and timbre consistent across cues. Per-cue
WAVs and an index manifest go to ``output/<storyboard>/audio/`` for the
recording step (which reads measured cue durations) and the mux step (which
drops each WAV at its videoTime).
Voice persona, language, and sampling come from the storyboard via the
``voice`` block of the narration script. CLI flags can still override them
for ad-hoc experimentation; storyboards remain the source of truth for
production runs.
We use the VoiceDesign sibling of CustomVoice because it accepts a free-form
voice persona (British accent, narrator register, "no laughter") via the
``instruct`` parameter. CustomVoice's preset speakers are all American or
non-English, and its ``instruct`` is documented for emotion only — it
ignored accent directives and bled non-speech tokens (laughter, sighs)
between cues.
Run from the ``video/`` directory:
uv run --project tts python tts/synth.py
uv run --project tts python tts/synth.py --storyboard recording
"""
from __future__ import annotations
@ -17,55 +30,78 @@ from __future__ import annotations
import argparse
import json
import os
import random
import sys
from pathlib import Path
import numpy as np
import soundfile as sf
import torch
from qwen_tts import Qwen3TTSModel
DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
DEFAULT_SPEAKER = "ryan"
DEFAULT_LANGUAGE = "English"
# Two checkpoints: the design model mints the reference clip in the desired
# persona; the clone model conditions every cue on that reference's x-vector.
# Neither CustomVoice nor VoiceDesign support generate_voice_clone — only the
# Base checkpoint does.
DEFAULT_DESIGN_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign"
DEFAULT_CLONE_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
# Fixed reference utterance used to anchor the speaker timbre. The reference
# is generated once per (model, instruct, sampling, seed) tuple and reused
# for every cue, so all narration shares the same x-vector. Two short
# sentences exercise enough phonemes for a stable embedding without bloating
# generation time.
REFERENCE_TEXT = (
"Welcome to the demonstration. This is the narrator voice you'll hear throughout the video."
)
def _safe_load_json(path: Path) -> object | None:
try:
return json.loads(path.read_text())
except (FileNotFoundError, json.JSONDecodeError):
return None
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--script",
"--storyboard",
required=True,
help="Storyboard slug (matches Storyboard.name in src/storyboard.ts).",
)
parser.add_argument(
"--output-dir",
type=Path,
default=Path("output/narration-script.json"),
help="Narration script emitted by dist/preflight.js.",
default=Path("output"),
help="Root output directory; per-storyboard files live in <root>/<storyboard>/.",
)
parser.add_argument(
"--out-dir",
"--design-model",
default=os.environ.get("TTS_DESIGN_MODEL", DEFAULT_DESIGN_MODEL),
help="Checkpoint used to mint the voice reference (VoiceDesign by default).",
)
parser.add_argument(
"--clone-model",
default=os.environ.get("TTS_CLONE_MODEL", DEFAULT_CLONE_MODEL),
help="Checkpoint used to clone the cue audio from the reference (Base by default).",
)
parser.add_argument(
"--reference-audio",
type=Path,
default=Path("output/audio"),
help="Directory to write WAV files and index.json into.",
default=(Path(os.environ["TTS_REFERENCE_AUDIO"]) if os.environ.get("TTS_REFERENCE_AUDIO") else None),
help="Path to an existing reference WAV. If set, skip VoiceDesign and clone from this.",
)
parser.add_argument(
"--model",
default=os.environ.get("TTS_MODEL", DEFAULT_MODEL),
)
parser.add_argument(
"--speaker",
default=os.environ.get("TTS_SPEAKER", DEFAULT_SPEAKER),
help="CustomVoice preset speaker name (use --list-speakers to enumerate).",
)
parser.add_argument(
"--language",
default=os.environ.get("TTS_LANGUAGE", DEFAULT_LANGUAGE),
"--reference-text",
default=os.environ.get("TTS_REFERENCE_TEXT"),
help="Transcript of --reference-audio. Required if --reference-audio is set.",
)
parser.add_argument(
"--device",
default=os.environ.get("TTS_DEVICE", "cuda:0"),
)
parser.add_argument(
"--list-speakers",
action="store_true",
help="Load the model, print available speaker names, and exit.",
)
return parser.parse_args()
@ -78,15 +114,18 @@ def load_model(model_id: str, device: str) -> Qwen3TTSModel:
def cached_index_matches(
index_path: Path,
cues: list[dict],
speaker: str,
instruct: str,
language: str,
seed: int,
temperature: float,
top_p: float,
) -> bool:
"""Return True iff index_path's cue list lines up with `cues` 1:1.
Compared fields: ``cueIndex``, ``text``, ``gapBeforeMs`` plus the synth
settings (``speaker``, ``language``). All cue WAV files must also exist
on disk. Mismatched length, reordered cues, or a missing WAV invalidate
the cache.
settings (``instruct``, ``language``, ``seed``, ``temperature``, ``top_p``).
All cue WAV files must also exist on disk. Mismatched length, reordered
cues, or a missing WAV invalidate the cache.
"""
if not index_path.exists():
return False
@ -94,7 +133,13 @@ def cached_index_matches(
cached = json.loads(index_path.read_text())
except json.JSONDecodeError:
return False
if cached.get("speaker") != speaker or cached.get("language") != language:
if cached.get("instruct") != instruct or cached.get("language") != language:
return False
if int(cached.get("seed", -1)) != seed:
return False
if float(cached.get("temperature", -1)) != temperature:
return False
if float(cached.get("topP", -1)) != top_p:
return False
cached_items = cached.get("items", [])
if len(cached_items) != len(cues):
@ -112,52 +157,179 @@ def cached_index_matches(
return True
def seed_everything(seed: int) -> None:
    """Seed every random number generator this script relies on.

    Applies ``seed`` to Python's ``random`` module, NumPy's global RNG and
    PyTorch's RNG, in that order, and additionally to all CUDA devices when
    CUDA is available.
    """
    for reseed in (random.seed, np.random.seed, torch.manual_seed):
        reseed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
def _resolve_reference(
args: argparse.Namespace,
audio_dir: Path,
instruct: str,
language: str,
seed: int,
temperature: float,
top_p: float,
) -> tuple[Path, str]:
"""Return (ref_wav_path, ref_text) for the clone step.
If --reference-audio is supplied, validate and use it directly. Otherwise
mint one via VoiceDesign (cached on disk; cache invalidates when the
persona/sampling/seed changes). The design model is unloaded before
returning so the clone model can claim the GPU.
"""
if args.reference_audio is not None:
if not args.reference_audio.exists():
raise SystemExit(f"[synth] --reference-audio does not exist: {args.reference_audio}")
if not args.reference_text:
raise SystemExit("[synth] --reference-text is required when --reference-audio is set")
print(
f"[synth] using user-supplied reference {args.reference_audio} «{args.reference_text}»",
flush=True,
)
return args.reference_audio, args.reference_text
ref_wav_path = audio_dir / "_reference.wav"
ref_meta_path = audio_dir / "_reference.meta.json"
ref_meta = {
"model": args.design_model,
"instruct": instruct,
"language": language,
"seed": seed,
"temperature": temperature,
"topP": top_p,
"text": REFERENCE_TEXT,
}
if (
ref_wav_path.exists()
and ref_meta_path.exists()
and _safe_load_json(ref_meta_path) == ref_meta
):
print(f"[synth] reusing cached voice reference {ref_wav_path.name}", flush=True)
return ref_wav_path, REFERENCE_TEXT
print(
f"[synth] minting voice reference via VoiceDesign: «{REFERENCE_TEXT}»",
flush=True,
)
design_model = load_model(args.design_model, args.device)
seed_everything(seed)
ref_wavs, ref_sr = design_model.generate_voice_design(
text=[REFERENCE_TEXT],
language=language,
instruct=instruct,
do_sample=True,
temperature=temperature,
top_p=top_p,
)
ref_audio = ref_wavs[0]
if hasattr(ref_audio, "cpu"):
ref_audio = ref_audio.cpu().float().numpy()
sf.write(str(ref_wav_path), ref_audio, ref_sr)
ref_meta_path.write_text(json.dumps(ref_meta, indent=2))
# Free the design model before loading the clone model — both are 1.7B,
# we don't want them resident at the same time.
del design_model
if torch.cuda.is_available():
torch.cuda.empty_cache()
return ref_wav_path, REFERENCE_TEXT
def main() -> int:
args = parse_args()
if args.list_speakers:
model = load_model(args.model, args.device)
speakers = model.get_supported_speakers()
print(json.dumps(speakers, indent=2, ensure_ascii=False))
return 0
storyboard_dir = args.output_dir / args.storyboard
script_path = storyboard_dir / "narration-script.json"
audio_dir = storyboard_dir / "audio"
if not args.script.exists():
print(f"[synth] script not found: {args.script}", file=sys.stderr)
if not script_path.exists():
print(f"[synth] script not found: {script_path}", file=sys.stderr)
return 1
script = json.loads(args.script.read_text())
script = json.loads(script_path.read_text())
cues = [c for c in script.get("items", []) if c.get("text", "").strip()]
if not cues:
print("[synth] script has no cues; nothing to generate.", file=sys.stderr)
return 1
args.out_dir.mkdir(parents=True, exist_ok=True)
voice = script.get("voice")
if not voice:
print(
f"[synth] {script_path} has no `voice` block — re-run preflight.",
file=sys.stderr,
)
return 1
instruct = voice["instruct"]
language = voice["language"]
temperature = float(voice.get("temperature", 0.6))
top_p = float(voice.get("topP", 0.9))
seed = int(voice.get("seed", 42))
audio_dir.mkdir(parents=True, exist_ok=True)
# Skip generation when the existing audio matches the script — same cue
# texts and same gapBeforeMs values in the same order. Saves ~30s of GPU
# time when iterating on activity timing without changing narration.
if cached_index_matches(args.out_dir / "index.json", cues, args.speaker, args.language):
# texts and same gapBeforeMs values in the same order, AND same synth
# settings (instruct/seed/temperature/top_p). Saves ~30s of GPU time when
# iterating on activity timing without changing narration or persona.
if cached_index_matches(
audio_dir / "index.json",
cues,
instruct,
language,
seed,
temperature,
top_p,
):
print(
f"[synth] cached audio in {args.out_dir} matches the current script — skipping generation",
f"[synth] [{args.storyboard}] cached audio matches the current script — skipping generation",
flush=True,
)
return 0
model = load_model(args.model, args.device)
texts = [c["text"].strip() for c in cues]
print(f"[synth] generating {len(texts)} cues in one batched call", flush=True)
print(f"[synth] [{args.storyboard}] persona: {instruct}", flush=True)
print(
f"[synth] [{args.storyboard}] sampling: temperature={temperature} top_p={top_p} seed={seed} language={language}",
flush=True,
)
# Two-stage generation:
# 1. VoiceDesign mints a single reference clip in the target persona
# (or the user supplies one via --reference-audio).
# 2. Base + generate_voice_clone(x_vector_only_mode=True) conditions
# every cue on the reference's speaker embedding.
# Without (2), batched generation drifts timbre across cues — a persona
# prompt anchors style but not identity, so each batch item picks its
# own voice. The reference WAV is cached so subsequent runs only load
# the clone model (saves ~20s + 3.4 GB of disk download).
ref_wav_path, ref_text = _resolve_reference(
args, audio_dir, instruct, language, seed, temperature, top_p
)
print(
f"[synth] cloning {len(texts)} cues from reference (x_vector_only) — one batched call",
flush=True,
)
for i, t in enumerate(texts):
print(f"[synth] {i:2d}: {t}", flush=True)
# ONE batched call. generate_custom_voice handles text=List[str] natively
# and broadcasts the speaker/language across all items, so the entire
# narration is decoded in one model pass — same RNG state, same batch,
# consistent voice from cue to cue.
wavs, sr = model.generate_custom_voice(
clone_model = load_model(args.clone_model, args.device)
seed_everything(seed)
wavs, sr = clone_model.generate_voice_clone(
text=texts,
language=args.language,
speaker=args.speaker,
language=language,
ref_audio=str(ref_wav_path),
ref_text=ref_text,
x_vector_only_mode=True,
non_streaming_mode=True,
do_sample=True,
temperature=temperature,
top_p=top_p,
)
if len(wavs) != len(texts):
print(
@ -171,7 +343,7 @@ def main() -> int:
if hasattr(audio, "cpu"):
audio = audio.cpu().float().numpy()
wav_name = f"cue_{cue['cueIndex']:03d}.wav"
wav_path = args.out_dir / wav_name
wav_path = audio_dir / wav_name
sf.write(str(wav_path), audio, sr)
duration_ms = int(round(len(audio) * 1000 / sr))
items.append(
@ -190,15 +362,21 @@ def main() -> int:
)
out_index = {
"speaker": args.speaker,
"language": args.language,
"model": args.model,
"storyboard": args.storyboard,
"instruct": instruct,
"language": language,
"designModel": args.design_model,
"cloneModel": args.clone_model,
"referenceText": ref_text,
"seed": seed,
"temperature": temperature,
"topP": top_p,
"items": items,
}
(args.out_dir / "index.json").write_text(json.dumps(out_index, indent=2))
(audio_dir / "index.json").write_text(json.dumps(out_index, indent=2))
total_ms = sum(it["gapBeforeMs"] + it["durationMs"] for it in items)
print(
f"[synth] {len(items)} cues, {total_ms}ms of audio (incl. gaps) -> {args.out_dir}",
f"[synth] [{args.storyboard}] {len(items)} cues, {total_ms}ms of audio (incl. gaps) -> {audio_dir}",
flush=True,
)
return 0