More FE changes

This commit is contained in:
Andras Schmelczer 2026-05-09 09:43:41 +01:00
parent f114ada255
commit a48eb945e0
48 changed files with 4127 additions and 1751 deletions

View file

@ -10,15 +10,17 @@
# ./render.sh # full pipeline (uses cached auth.json if fresh)
# ./render.sh --fresh-auth # force re-auth even if auth.json exists
# ./render.sh --no-encode # stop at WebM, skip MP4 encode
# ./render.sh --no-audio # skip Qwen3-TTS narration; publish silent MP4
# FORCE_AUTH=1 ./render.sh # same as --fresh-auth
# APP_URL=http://localhost:3001 ./render.sh # override frontend URL
# TTS_SPEAKER=aiden ./render.sh # override CustomVoice speaker
set -euo pipefail
# -- config (override via env) -------------------------------------------------
APP_URL="${APP_URL:-http://host.docker.internal:3001}"
PB_URL="${PB_URL:-http://host.docker.internal:8090}"
API_URL="${API_URL:-http://host.docker.internal:8001}"
export APP_URL="${APP_URL:-http://host.docker.internal:3001}"
export PB_URL="${PB_URL:-http://host.docker.internal:8090}"
export API_URL="${API_URL:-http://host.docker.internal:8001}"
PB_ADMIN_EMAIL="${PB_ADMIN_EMAIL:-admin@propertymap.local}"
PB_ADMIN_PASSWORD="${PB_ADMIN_PASSWORD:-propertymap-dev-2024}"
PB_EMAIL="${PB_EMAIL:-demo-video@local.test}"
@ -34,14 +36,28 @@ PUBLISH_DIR="${PUBLISH_DIR:-../frontend/public/video}"
# caption visible.
POSTER_TIME_S="${POSTER_TIME_S:-16}"
# Recorder/encoder knobs read by src/config.ts. config.ts treats these as
# required, so they live here (the only entry point) rather than as defaults
# scattered across TS modules. Override per-run via env.
export ASPECT="${ASPECT:-16x9}"
export CAPTURE_SCALE="${CAPTURE_SCALE:-1}"
export WEBM_BITRATE="${WEBM_BITRATE:-$(awk -v s="$CAPTURE_SCALE" 'BEGIN{print (s+0>1)?"18M":"8M"}')}"
export PROMPT_TEXT="${PROMPT_TEXT:-Flats or terraces <£450k, 35 min to Manchester, low crime}"
export AI_ZOOM_SCALE="${AI_ZOOM_SCALE:-2.4}"
export MAX_DURATION_S="${MAX_DURATION_S:-45}"
export MIN_DURATION_S="${MIN_DURATION_S:-10}"
export OUTPUT_FPS="${OUTPUT_FPS:-50}"
FRESH_AUTH="${FORCE_AUTH:-0}"
DO_ENCODE=1
DO_AUDIO=1
for arg in "$@"; do
case "$arg" in
--fresh-auth) FRESH_AUTH=1 ;;
--no-encode) DO_ENCODE=0 ;;
--no-audio) DO_AUDIO=0 ;;
-h|--help)
sed -n '3,18p' "$0"
sed -n '3,20p' "$0"
exit 0 ;;
*) echo "Unknown arg: $arg" >&2; exit 2 ;;
esac
@ -124,12 +140,36 @@ else
say "Reusing existing auth.json"
fi
# -- record -------------------------------------------------------------------
say "Recording"
# -- preflight + synth (Qwen3-TTS) -------------------------------------------
# Synth runs BEFORE recording: one batched generate_custom_voice call across
# all cues so the voice stays consistent. The recorder reads
# output/audio/index.json for measured per-cue durations and sizes each
# cue's wall-clock to fit; --no-audio skips synth and the recorder falls
# back to a worst-case estimate.
mkdir -p output
# Wipe last run's leaking artifacts so the rename step picks up *this* run.
rm -f output/recording.webm output/recording.mp4 output/page@*.webm output/page@*.webm.untrimmed
rm -f output/narration-script.json output/narration.json
# output/audio/ is preserved; tts/synth.py decides whether the cached WAVs
# still match the script and skips generation when they do.
say "Preflight: emitting narration script"
node dist/preflight.js
if [ "$DO_AUDIO" = "1" ]; then
if ! command -v uv >/dev/null 2>&1; then
fail "uv not on PATH (required for Qwen3-TTS synth). Install uv or rerun with --no-audio."
fi
say "Synthesising narration with Qwen3-TTS (speaker=${TTS_SPEAKER:-ryan}) — one batched call"
uv sync --project tts || fail "uv sync failed in video/tts"
uv run --project tts python tts/synth.py || fail "tts/synth.py failed"
if [ ! -s output/audio/index.json ]; then
fail "synth did not produce output/audio/index.json"
fi
fi
# -- record -------------------------------------------------------------------
say "Recording"
APP_URL="$APP_URL" node dist/record.js
if [ ! -s output/recording.webm ]; then
@ -163,6 +203,20 @@ if [ "$DO_ENCODE" = "1" ]; then
node dist/verify.js output/recording.mp4 output/poster.jpg
fi
# -- mux narration ------------------------------------------------------------
# Synth already produced per-cue WAVs (in output/audio/); the recorder logged
# each cue's videoTime against the trimmed timeline. Drop the WAVs onto the
# mp4 with one ffmpeg adelay+amix and replace the silent recording in place.
if [ "$DO_ENCODE" = "1" ] && [ "$DO_AUDIO" = "1" ]; then
if [ ! -s output/narration.json ]; then
fail "narration.json missing — recorder did not log cues"
fi
say "Muxing narration into output/recording.mp4"
uv run --project tts python tts/mux.py --replace \
|| fail "tts/mux.py failed"
node dist/verify.js output/recording.mp4
fi
# -- publish to homepage ------------------------------------------------------
# Only publish when we did the encode (otherwise we'd be copying a stale
# mp4 next to a fresh webm). --no-encode skips this whole block.

View file

@ -1,5 +1,16 @@
import { chromium, type Browser, type BrowserContext, type Page } from 'playwright';
import { AUTH_STATE_PATH, CAPTURE_SCALE, OUTPUT_DIR, VIDEO_SIZE, VIEWPORT } from './config.js';
import {
chromium,
type Browser,
type BrowserContext,
type Page,
} from "playwright";
import {
AUTH_STATE_PATH,
CAPTURE_SCALE,
OUTPUT_DIR,
VIDEO_SIZE,
VIEWPORT,
} from "./config.js";
export interface RecordingBrowser {
browser: Browser;
@ -10,22 +21,22 @@ export async function launchRecordingBrowser(): Promise<RecordingBrowser> {
const browser = await chromium.launch({
headless: true,
args: [
'--disable-blink-features=AutomationControlled',
'--enable-gpu',
'--use-gl=angle',
'--use-angle=gl-egl',
'--ignore-gpu-blocklist',
'--enable-webgl',
'--enable-webgl2',
'--enable-gpu-rasterization',
'--enable-zero-copy',
'--disable-software-rasterizer',
'--disable-frame-rate-limit',
'--disable-gpu-vsync',
'--disable-features=CalculateNativeWinOcclusion,IntensiveWakeUpThrottling',
'--disable-renderer-backgrounding',
'--disable-background-timer-throttling',
'--disable-backgrounding-occluded-windows',
"--disable-blink-features=AutomationControlled",
"--enable-gpu",
"--use-gl=angle",
"--use-angle=gl-egl",
"--ignore-gpu-blocklist",
"--enable-webgl",
"--enable-webgl2",
"--enable-gpu-rasterization",
"--enable-zero-copy",
"--disable-software-rasterizer",
"--disable-frame-rate-limit",
"--disable-gpu-vsync",
"--disable-features=CalculateNativeWinOcclusion,IntensiveWakeUpThrottling",
"--disable-renderer-backgrounding",
"--disable-background-timer-throttling",
"--disable-backgrounding-occluded-windows",
],
});
@ -41,27 +52,34 @@ export async function launchRecordingBrowser(): Promise<RecordingBrowser> {
export async function assertHardwareWebGL(page: Page): Promise<void> {
const info = await page.evaluate(() => {
const canvas = document.createElement('canvas');
const gl = canvas.getContext('webgl2') ?? canvas.getContext('webgl');
if (!gl) return { webgl: false, vendor: '', renderer: '' };
const canvas = document.createElement("canvas");
const gl = canvas.getContext("webgl2");
if (!gl) return { webgl: false, vendor: "", renderer: "" };
const ext = gl.getExtension('WEBGL_debug_renderer_info');
const ext = gl.getExtension("WEBGL_debug_renderer_info");
const vendor = String(
ext ? gl.getParameter(ext.UNMASKED_VENDOR_WEBGL) : gl.getParameter(gl.VENDOR)
ext
? gl.getParameter(ext.UNMASKED_VENDOR_WEBGL)
: gl.getParameter(gl.VENDOR),
);
const renderer = String(
ext ? gl.getParameter(ext.UNMASKED_RENDERER_WEBGL) : gl.getParameter(gl.RENDERER)
ext
? gl.getParameter(ext.UNMASKED_RENDERER_WEBGL)
: gl.getParameter(gl.RENDERER),
);
return { webgl: true, vendor, renderer };
});
console.log(`[gpu] WebGL renderer: ${info.webgl ? `${info.vendor} / ${info.renderer}` : 'none'}`);
console.log(
`[gpu] WebGL renderer: ${info.webgl ? `${info.vendor} / ${info.renderer}` : "none"}`,
);
if (
process.env.ALLOW_SOFTWARE_GL !== '1' &&
(!info.webgl || /SwiftShader|llvmpipe|software/i.test(`${info.vendor} ${info.renderer}`))
process.env.ALLOW_SOFTWARE_GL !== "1" &&
(!info.webgl ||
/SwiftShader|llvmpipe|software/i.test(`${info.vendor} ${info.renderer}`))
) {
throw new Error(
'Recording browser did not get hardware WebGL. Set ALLOW_SOFTWARE_GL=1 to bypass this guard.'
"Recording browser did not get hardware WebGL. Set ALLOW_SOFTWARE_GL=1 to bypass this guard.",
);
}
}
@ -71,41 +89,45 @@ async function suppressDevServerNoise(context: BrowserContext) {
const RealWS = window.WebSocket;
window.WebSocket = new Proxy(RealWS, {
construct(target, args) {
const url = String(args[0] ?? '');
const proto = (args[1] as string | string[] | undefined) ?? '';
const protoStr = Array.isArray(proto) ? proto.join(',') : proto;
const url = String(args[0] ?? "");
const proto = (args[1] as string | string[] | undefined) ?? "";
const protoStr = Array.isArray(proto) ? proto.join(",") : proto;
if (
protoStr.includes('vite-hmr') ||
protoStr.includes('webpack') ||
url.includes('/ws') ||
url.includes('sockjs-node')
protoStr.includes("vite-hmr") ||
protoStr.includes("webpack") ||
url.includes("/ws") ||
url.includes("sockjs-node")
) {
const fake = new EventTarget() as WebSocket;
Object.defineProperties(fake, {
readyState: { value: RealWS.CLOSED },
url: { value: url },
protocol: { value: '' },
extensions: { value: '' },
protocol: { value: "" },
extensions: { value: "" },
bufferedAmount: { value: 0 },
binaryType: { value: 'blob', writable: true },
binaryType: { value: "blob", writable: true },
});
fake.send = () => {};
fake.close = () => fake.dispatchEvent(new Event('close'));
queueMicrotask(() => fake.dispatchEvent(new Event('close')));
fake.close = () => fake.dispatchEvent(new Event("close"));
queueMicrotask(() => fake.dispatchEvent(new Event("close")));
return fake;
}
return Reflect.construct(target, args);
},
});
Object.defineProperty(window.location, 'reload', {
Object.defineProperty(window.location, "reload", {
value: () => {},
configurable: true,
});
window.addEventListener('error', (e) => e.stopImmediatePropagation(), true);
window.addEventListener('unhandledrejection', (e) => e.stopImmediatePropagation(), true);
window.addEventListener("error", (e) => e.stopImmediatePropagation(), true);
window.addEventListener(
"unhandledrejection",
(e) => e.stopImmediatePropagation(),
true,
);
const styleEl = document.createElement('style');
const styleEl = document.createElement("style");
styleEl.textContent = `
vite-error-overlay,
wds-overlay,
@ -126,12 +148,12 @@ async function suppressDevServerNoise(context: BrowserContext) {
const killOverlay = (node: Element) => {
const tag = node.tagName?.toLowerCase();
const id = (node as HTMLElement).id?.toLowerCase() ?? '';
const id = (node as HTMLElement).id?.toLowerCase() ?? "";
if (
tag === 'vite-error-overlay' ||
tag === 'wds-overlay' ||
id.includes('webpack-dev-server-client') ||
id.includes('webpack-error')
tag === "vite-error-overlay" ||
tag === "wds-overlay" ||
id.includes("webpack-dev-server-client") ||
id.includes("webpack-error")
) {
(node as HTMLElement).remove();
}
@ -143,10 +165,11 @@ async function suppressDevServerNoise(context: BrowserContext) {
});
}
});
if (document.body) obs.observe(document.body, { childList: true, subtree: true });
if (document.body)
obs.observe(document.body, { childList: true, subtree: true });
else {
document.addEventListener('DOMContentLoaded', () =>
obs.observe(document.body, { childList: true, subtree: true })
document.addEventListener("DOMContentLoaded", () =>
obs.observe(document.body, { childList: true, subtree: true }),
);
}
});

View file

@ -1,46 +1,66 @@
export const APP_URL = process.env.APP_URL ?? 'http://host.docker.internal:3001';
export const DASHBOARD_PATH = '/dashboard';
/**
 * Read a mandatory environment variable.
 *
 * @param name - Name of the variable to look up in `process.env`.
 * @returns The variable's value.
 * @throws Error when the variable is unset or set to the empty string.
 */
function requiredEnv(name: string): string {
  const raw = process.env[name];
  // Truthiness check: both `undefined` and "" count as "not provided".
  if (raw) return raw;
  throw new Error(`${name} is required`);
}
export const AUTH_STATE_PATH = 'auth.json';
export const OUTPUT_DIR = 'output';
/**
 * Read a mandatory environment variable and parse it as a finite number.
 *
 * @param name - Name of the variable to look up.
 * @returns The parsed numeric value.
 * @throws Error when the variable is missing (via `requiredEnv`), is blank,
 *   or does not parse to a finite number.
 */
function requiredNumberEnv(name: string): number {
  const raw = requiredEnv(name).trim();
  // Number("") and Number("   ") are both 0, so a whitespace-only value would
  // otherwise be accepted silently as zero. Treat blank input as invalid.
  const value = raw === "" ? Number.NaN : Number(raw);
  if (!Number.isFinite(value)) {
    throw new Error(`${name} must be a finite number`);
  }
  return value;
}
const aspect = process.env.ASPECT ?? '16x9';
export const APP_URL = requiredEnv("APP_URL");
export const DASHBOARD_PATH = "/dashboard";
export const AUTH_STATE_PATH = "auth.json";
export const OUTPUT_DIR = "output";
const aspect = requiredEnv("ASPECT");
if (aspect !== "16x9" && aspect !== "9x16") {
throw new Error("ASPECT must be '16x9' or '9x16'");
}
export const VIEWPORT =
aspect === '9x16' ? { width: 1080, height: 1920 } : { width: 1920, height: 1080 };
export const CAPTURE_SCALE = Math.max(1, Number(process.env.CAPTURE_SCALE ?? 1));
aspect === "9x16"
? { width: 1080, height: 1920 }
: { width: 1920, height: 1080 };
export const CAPTURE_SCALE = Math.max(1, requiredNumberEnv("CAPTURE_SCALE"));
export const VIDEO_SIZE = {
width: VIEWPORT.width,
height: VIEWPORT.height,
};
export const WEBM_BITRATE = process.env.WEBM_BITRATE ?? (CAPTURE_SCALE > 1 ? '18M' : '8M');
export const WEBM_BITRATE = requiredEnv("WEBM_BITRATE");
// Cold-open prompt. Punchy version of the user's intent, short enough to type
// on camera without making the opening scene drag.
export const PROMPT_TEXT =
process.env.PROMPT_TEXT ?? 'Flats or terraces <£450k, 35 min to Manchester, low crime';
export const PROMPT_TEXT = requiredEnv("PROMPT_TEXT");
// Filters returned by the AI stub. Keys MUST match real feature names from
// /api/features (verified against the running server's schema).
export const STUBBED_FILTERS: Record<string, [number, number] | string[]> = {
'Property type': ['Flats/Maisonettes', 'Terraced'],
'Estimated current price': [175000, 450000],
'Serious crime per 1k residents (avg/yr)': [0, 55],
'Noise (dB)': [50, 68],
"Property type": ["Flats/Maisonettes", "Terraced"],
"Estimated current price": [175000, 450000],
"Serious crime per 1k residents (avg/yr)": [0, 55],
"Noise (dB)": [50, 68],
};
// Travel-time filters returned by the AI stub. Slug matches the real
// /api/travel-destinations?mode=transit response.
export const STUBBED_TRAVEL_TIME_FILTERS: {
mode: 'transit' | 'car' | 'bicycle' | 'walking';
mode: "transit" | "car" | "bicycle" | "walking";
slug: string;
label: string;
min?: number;
max?: number;
}[] = [
{
mode: 'transit',
slug: 'manchester',
label: 'Manchester city centre',
mode: "transit",
slug: "manchester",
label: "Manchester city centre",
max: 35,
},
];
@ -55,7 +75,7 @@ export const TT_DRAG_TO_MIN = 20;
// Cold-open zoom: how aggressively to magnify the AI box.
// 2.4 fills most of the viewport with the prompt card without blowing up text.
export const AI_ZOOM_SCALE = Number(process.env.AI_ZOOM_SCALE ?? 2.4);
export const AI_ZOOM_SCALE = requiredNumberEnv("AI_ZOOM_SCALE");
// Initial map view used while we navigate. The AI scene zooms in on the
// sidebar so this only matters once we zoom out.
@ -67,13 +87,18 @@ export const INITIAL_MAP_VIEW = {
// Verification guard only. The renderer does not use this as an editing cap:
// if the storyboard needs more than 15 seconds to avoid jumps, keep the frames.
export const MAX_DURATION_S = Number(process.env.MAX_DURATION_S ?? 45);
export const MIN_DURATION_S = Number(process.env.MIN_DURATION_S ?? 10);
export const MAX_DURATION_S = requiredNumberEnv("MAX_DURATION_S");
export const MIN_DURATION_S = requiredNumberEnv("MIN_DURATION_S");
// Target fps of the FINAL output.
export const OUTPUT_FPS = Number(process.env.OUTPUT_FPS ?? 50);
export const OUTPUT_FPS = requiredNumberEnv("OUTPUT_FPS");
// Frames of head-room kept in front of sceneStart when trimming. Shared by
// the video trim and the narration manifest so cue offsets line up with the
// trimmed timeline.
export const LEAD_IN_S = 0.12;
// Brand strings for the outro card.
export const BRAND_NAME = 'Perfect Postcode';
export const BRAND_TAGLINE = 'Find where you actually want to live.';
export const BRAND_URL = 'https://perfect-postcode.co.uk';
export const BRAND_NAME = "Perfect Postcode";
export const BRAND_TAGLINE = "Find where you actually want to live.";
export const BRAND_URL = "https://perfect-postcode.co.uk";

View file

@ -20,8 +20,10 @@ export async function installCursor(page: Page): Promise<void> {
pointer-events: none;
z-index: 2147483646;
transform: translate(-2px, -2px);
transform-origin: 2px 2px;
transition: transform 60ms linear, scale 120ms ease-out;
will-change: transform;
will-change: transform, scale;
scale: 1;
}
#__demo-cursor svg {
filter: drop-shadow(0 2px 4px rgba(0,0,0,0.35));
@ -225,6 +227,30 @@ export async function showCaption(page: Page, text: string): Promise<void> {
}, text);
}
/**
 * Resize the injected demo cursor by animating its CSS `scale` property.
 *
 * The cursor element is positioned with `transform: translate(...)`; `scale`
 * is a separate CSS property, so changing it leaves the translate untouched.
 * The transition is rewritten inline on every call so each caller picks its
 * own duration (negative durations are clamped to 0).
 *
 * Does nothing when `#__demo-cursor` is not present in the page.
 *
 * @param page - Page that hosts the injected cursor element.
 * @param scale - Target CSS scale factor.
 * @param durationMs - Length of the scale transition in milliseconds.
 */
export async function setCursorScale(
  page: Page,
  scale: number,
  durationMs: number
): Promise<void> {
  const payload = { scale, durationMs };
  await page.evaluate((args) => {
    const cursorEl = document.getElementById('__demo-cursor');
    if (cursorEl === null) return;
    const scaleMs = Math.max(0, args.durationMs);
    cursorEl.style.transition = `transform 60ms linear, scale ${scaleMs}ms cubic-bezier(0.22, 1, 0.36, 1)`;
    cursorEl.style.scale = String(args.scale);
  }, payload);
}
export async function hideCaption(page: Page): Promise<void> {
await page.evaluate(() => {
document.getElementById('__demo-caption')?.classList.remove('visible');

View file

@ -72,18 +72,31 @@ export async function smoothMove(
/**
* "Fake" type: progressively set the textarea value, dispatching
* React-compatible input events. This stays Node-driven so typing cadence is
* stable even when the map is busy rendering.
* React-compatible input events.
*
* Cadence is generated as a per-char weight ratio (so spaces and punctuation
* read as natural pauses), then **rescaled** so that the sum of delays equals
* `totalDurationMs` exactly. The runner depends on this: it budgets a
* specific number of ms for the type step, and any divergence would cascade
* into narration drift.
*/
export async function fakeType(
page: Page,
selector: string,
text: string,
delayMs: number
totalDurationMs: number
): Promise<void> {
const steps = text.length;
if (steps === 0) {
if (totalDurationMs > 0) await sleep(totalDurationMs);
return;
}
const weights = computeTypingWeights(text);
const weightSum = weights.reduce((a, b) => a + b, 0);
const msPerWeight = totalDurationMs / weightSum;
for (let i = 1; i <= steps; i++) {
const end = Math.ceil((text.length * i) / steps);
await page.evaluate(
({ selector, value }) => {
const ta = document.querySelector(selector) as HTMLTextAreaElement | null;
@ -97,28 +110,25 @@ export async function fakeType(
setValue.call(ta, value);
ta.dispatchEvent(new Event('input', { bubbles: true }));
},
{ selector, value: text.slice(0, end) }
{ selector, value: text.slice(0, i) }
);
if (delayMs > 0 && i < steps) {
await new Promise((resolve) =>
setTimeout(resolve, humanTypingDelay(text[i - 1], text[i], i, delayMs))
);
if (i < steps) {
const ms = Math.max(0, Math.round(weights[i - 1] * msPerWeight));
if (ms > 0) await sleep(ms);
}
}
}
function humanTypingDelay(
char: string,
nextChar: string | undefined,
index: number,
baseDelayMs: number
): number {
function computeTypingWeights(text: string): number[] {
const cadence = [0.82, 1.08, 0.94, 1.22, 0.88, 1.14, 0.98, 1.28];
let delay = baseDelayMs * cadence[index % cadence.length];
if (char === ' ') delay += baseDelayMs * 0.9;
if (/[,.!?;:]/.test(char)) delay += baseDelayMs * 1.8;
if (nextChar === ' ' && index % 4 === 0) delay += baseDelayMs * 0.55;
return Math.round(delay);
return Array.from(text, (char, index) => {
let weight = cadence[index % cadence.length];
if (char === ' ') weight += 0.9;
if (/[,.!?;:]/.test(char)) weight += 1.8;
const next = text[index + 1];
if (next === ' ' && index % 4 === 0) weight += 0.55;
return weight;
});
}
/**

37
video/src/narration.ts Normal file
View file

@ -0,0 +1,37 @@
import { writeFileSync } from 'node:fs';
/** One narration entry placed on the video timeline. */
export interface NarrationCue {
  /** Narration text of the cue. */
  text: string;
  /** Offset of the cue on the video timeline, in milliseconds. */
  videoTimeMs: number;
  /** Duration associated with the cue, in milliseconds. */
  durationMs: number;
}

/**
 * In-memory collector for narration cues, plus a JSON manifest writer.
 *
 * Callers supply an explicit `videoTimeMs` for every cue rather than having
 * the log stamp cues against a wall-clock origin, so the manifest follows
 * whatever timeline the caller is tracking.
 */
class NarrationLog {
  private entries: NarrationCue[] = [];

  /** Drop every cue collected so far. */
  reset(): void {
    this.entries = [];
  }

  /** Record a cue; cues with a negative `videoTimeMs` are ignored. */
  add(cue: NarrationCue): void {
    if (!(cue.videoTimeMs < 0)) {
      this.entries.push(cue);
    }
  }

  /**
   * Write `{ totalDurationMs, cues }` to `path` as pretty-printed JSON, with
   * cues ordered by ascending `videoTimeMs`.
   *
   * The collected cues are left in place, so the log can be flushed again.
   *
   * @returns The sorted copy of the cues that was written.
   */
  flush(path: string, totalDurationMs: number): NarrationCue[] {
    const ordered = this.entries.slice();
    ordered.sort((left, right) => left.videoTimeMs - right.videoTimeMs);
    const json = JSON.stringify({ totalDurationMs, cues: ordered }, null, 2);
    writeFileSync(path, json);
    return ordered;
  }
}

/** Shared log instance used by the recording pipeline. */
export const narrationLog = new NarrationLog();

32
video/src/preflight.ts Normal file
View file

@ -0,0 +1,32 @@
import { existsSync, mkdirSync, writeFileSync } from 'node:fs';
import { join } from 'node:path';
import { OUTPUT_DIR } from './config.js';
import { storyboard } from './storyboard.js';
/**
 * Emit the narration script consumed by the synth step.
 *
 * Synth (tts/synth.py) runs BEFORE recording, so it needs the full ordered
 * narration list — text plus per-cue gaps — without depending on Playwright,
 * the dashboard, or auth. This walks the storyboard cues, writes a flat
 * manifest to `<OUTPUT_DIR>/narration-script.json`, and returns.
 *
 * The cue index in this manifest is the source of truth: the runner later
 * matches storyboard cues to measured durations by index.
 */
function main(): void {
  const outputDirMissing = !existsSync(OUTPUT_DIR);
  if (outputDirMissing) {
    mkdirSync(OUTPUT_DIR, { recursive: true });
  }

  // One manifest entry per cue, in storyboard order; text is trimmed.
  const items = storyboard.cues.map(({ text, gapBeforeMs }, cueIndex) => ({
    cueIndex,
    text: text.trim(),
    gapBeforeMs,
  }));

  const scriptPath = join(OUTPUT_DIR, 'narration-script.json');
  const payload = JSON.stringify({ items }, null, 2);
  writeFileSync(scriptPath, payload);
  console.log(`Wrote ${items.length} narration cues to ${scriptPath}`);
}

main();

View file

@ -1,8 +1,10 @@
import { existsSync, mkdirSync, statSync } from 'node:fs';
import { join } from 'node:path';
import { AUTH_STATE_PATH, OUTPUT_DIR } from './config.js';
import { AUTH_STATE_PATH, LEAD_IN_S, OUTPUT_DIR } from './config.js';
import { assertHardwareWebGL, launchRecordingBrowser } from './browser.js';
import { narrationLog } from './narration.js';
import { installDemoRoutes } from './routes.js';
import { storyboard } from './storyboard.js';
import { prepareTimeline, runTimeline } from './timeline.js';
import { trimRecording } from './video.js';
@ -37,7 +39,7 @@ async function main() {
await installDemoRoutes(page);
const ctx = await prepareTimeline(page);
const timeline = await runTimeline(ctx);
const timeline = await runTimeline(ctx, storyboard);
await page.close();
const rawPath = join(OUTPUT_DIR, 'recording.raw.webm');
@ -54,6 +56,16 @@ async function main() {
recordStartMs,
...timeline,
});
const totalDurationMs =
timeline.sceneEndMs - timeline.sceneStartMs + LEAD_IN_S * 1000;
const cues = narrationLog.flush(
join(OUTPUT_DIR, 'narration.json'),
totalDurationMs
);
console.log(
`Wrote ${cues.length} narration cues to ${join(OUTPUT_DIR, 'narration.json')}`
);
console.log('Run "npm run encode" to produce output/recording.mp4');
}

275
video/src/runner.ts Normal file
View file

@ -0,0 +1,275 @@
import { existsSync, readFileSync } from 'node:fs';
import { join } from 'node:path';
import type { Page } from 'playwright';
import { LEAD_IN_S, OUTPUT_DIR } from './config.js';
import {
clearVignette,
hideCaption,
setCursorScale,
showCaption,
showOutro,
zoomReset,
zoomTo,
} from './dom.js';
import { fakeType, sleep, smoothDragSliderThumb, smoothMove } from './motion.js';
import { narrationLog } from './narration.js';
import type { Activity, Cue, ScriptCtx, Storyboard, Target } from './script.js';
/** Start and end timestamps of the recorded scene, as returned by `runStoryboard`. */
export interface RunnerResult {
/** `Date.now()` captured just before the first activity is run. */
sceneStartMs: number;
/**
 * `sceneStartMs` plus the runner's accumulated activity cursor. This is the
 * summed (padded) step and cue durations, not a fresh clock reading.
 */
sceneEndMs: number;
}
/** Vertical wheel delta sent once per `mapZoom` step via `page.mouse.wheel(0, delta)`. */
const MAP_ZOOM_WHEEL_DELTA = -120;
/** Fallback estimate of narration length per whitespace-separated word, in ms (used when no synth manifest exists). */
const FALLBACK_MS_PER_WORD = 750;
/** Extra milliseconds added to every fallback cue estimate. */
const FALLBACK_TAIL_BUFFER_MS = 800;
/**
 * Per-cue entry describing how long a cue's narration lasts. Read from
 * `output/audio/index.json`, or synthesised from a word-count estimate when
 * that file is absent.
 */
interface SynthCue {
/** Position of the cue in `storyboard.cues`; used to match manifest entries to cues. */
cueIndex: number;
/** Narration text of the cue. */
text: string;
/** Length of the cue's narration in milliseconds. */
durationMs: number;
}
/**
 * Drive the recording from a cue-anchored storyboard.
 *
 * Per-cue durations come from `loadSynthIndex`: the synth manifest at
 * `output/audio/index.json` when it exists, otherwise a word-count estimate
 * (the `--no-audio` path). Execution order is:
 *
 *   1. `storyboard.pre` activities,
 *   2. every cue in order (see `runCue`),
 *   3. `storyboard.post` activities.
 *
 * A running cursor (milliseconds since scene start) is advanced by what each
 * step or cue reports, i.e. `max(declared, actual)` for steps, so an overrun
 * stretches the timeline instead of silently shifting later cues. The cursor
 * value at each cue start, plus the lead-in, is what gets logged as the
 * cue's `videoTimeMs`.
 *
 * @param ctx - Shared page/dashboard/cursor state for the run.
 * @param storyboard - Cues plus optional `pre` / `post` activity lists.
 * @returns Scene start (a `Date.now()` reading) and scene end (start plus the
 *   final cursor value).
 */
export async function runStoryboard(
  ctx: ScriptCtx,
  storyboard: Storyboard
): Promise<RunnerResult> {
  narrationLog.reset();
  const synth = loadSynthIndex(storyboard);

  const sceneStartMs = Date.now();
  const leadInMs = LEAD_IN_S * 1000;
  const cursor = { ms: 0 };

  const preSteps = storyboard.pre ?? [];
  for (const step of preSteps) {
    const spentMs = await runStep(ctx, step);
    cursor.ms += spentMs;
  }

  for (const [index, cue] of storyboard.cues.entries()) {
    await runCue(ctx, cue, synth[index], cursor, leadInMs);
  }

  const postSteps = storyboard.post ?? [];
  for (const step of postSteps) {
    const spentMs = await runStep(ctx, step);
    cursor.ms += spentMs;
  }

  const sceneEndMs = sceneStartMs + cursor.ms;
  return { sceneStartMs, sceneEndMs };
}
/**
 * Play one narration cue and advance the shared timeline cursor.
 *
 * Sequence: optional silent gap, log the cue at `cursor.ms + leadInMs`, show
 * the caption, run the `during` activities, pad until the cue's audio length
 * has elapsed, hide the caption, then run the `tail` activities.
 *
 * NOTE(review): time spent inside `showCaption` / `hideCaption` is not added
 * to the cursor. Confirm that round-trip is negligible for mux alignment.
 *
 * @param ctx - Shared page/dashboard/cursor state.
 * @param cue - Storyboard cue to play.
 * @param synth - Duration entry for this cue.
 * @param cursor - Mutable timeline position in ms; updated in place.
 * @param leadInMs - Offset added to the cursor when logging the cue time.
 * @throws Error when the declared `during` budgets exceed the audio length
 *   by more than 50 ms.
 */
async function runCue(
  ctx: ScriptCtx,
  cue: Cue,
  synth: SynthCue,
  cursor: { ms: number },
  leadInMs: number
): Promise<void> {
  const gapMs = cue.gapBeforeMs;
  if (gapMs > 0) {
    await sleep(gapMs);
    cursor.ms += gapMs;
  }

  const audioMs = synth.durationMs;
  narrationLog.add({
    text: cue.text,
    videoTimeMs: cursor.ms + leadInMs,
    durationMs: audioMs,
  });
  await showCaption(ctx.page, cue.text);

  const duringSteps = cue.during ?? [];
  let budgetedMs = 0;
  for (const activity of duringSteps) {
    budgetedMs += activity.durationMs;
  }
  if (budgetedMs > audioMs + 50) {
    throw new Error(
      `Cue ${synth.cueIndex} "${cue.text.slice(0, 40)}…" has ${budgetedMs}ms of ` +
        `during activities but the measured audio is only ${audioMs}ms. ` +
        `Trim a during step, lengthen the cue text, or move work into tail.`
    );
  }

  // The during block is timed as a whole: single steps may overrun their
  // budgets, but only the total wall-clock matters at the cue boundary.
  const blockStartedAt = Date.now();
  for (const activity of duringSteps) {
    await runStep(ctx, activity);
  }
  const blockMs = Date.now() - blockStartedAt;

  const needsPadding = blockMs < audioMs;
  if (needsPadding) {
    // Keep the caption up until the audio would have finished.
    await sleep(audioMs - blockMs);
    cursor.ms += audioMs;
  } else {
    cursor.ms += blockMs;
  }

  await hideCaption(ctx.page);

  const tailSteps = cue.tail ?? [];
  for (const activity of tailSteps) {
    const spentMs = await runStep(ctx, activity);
    cursor.ms += spentMs;
  }
}
/**
 * Execute one activity and report how much timeline it consumed.
 *
 * A step that finishes early is padded with a sleep up to its declared
 * `durationMs`, and that budget is returned. A step that runs long is not
 * cut short: its measured time is returned, and an overrun of more than
 * 50 ms is logged.
 *
 * @returns `step.durationMs` when the step was padded, otherwise the
 *   measured elapsed milliseconds.
 */
async function runStep(ctx: ScriptCtx, step: Activity): Promise<number> {
  const began = Date.now();
  await runActivity(ctx, step);
  const elapsedMs = Date.now() - began;
  const budgetMs = step.durationMs;

  if (elapsedMs < budgetMs) {
    await sleep(budgetMs - elapsedMs);
    return budgetMs;
  }

  const overBudget = elapsedMs > budgetMs + 50;
  if (overBudget) {
    console.log(
      `[runner] step ${step.kind} ran ${elapsedMs}ms over a ${budgetMs}ms budget (drift +${elapsedMs - budgetMs}ms)`
    );
  }
  return elapsedMs;
}
/**
 * Perform the side effect of a single activity.
 *
 * This only performs the action; padding a short step up to its `durationMs`
 * budget is done by `runStep`. Activities that move the pointer update
 * `ctx.cursor` so the next movement starts from the right place.
 *
 * @param ctx - Shared page/dashboard/cursor state; `ctx.cursor` may be reassigned.
 * @param step - Activity to perform.
 * @throws Error when a target cannot be resolved, or when `step.kind` is not
 *   one of the kinds handled here.
 */
async function runActivity(ctx: ScriptCtx, step: Activity): Promise<void> {
  switch (step.kind) {
    case 'wait':
      // Pure pause: nothing to do here, the caller pads to the budget.
      return;
    case 'clearVignette':
      await clearVignette(ctx.page);
      return;
    case 'zoomTo': {
      const focus = await resolveTarget(ctx, step.target);
      await zoomTo(ctx.page, {
        scale: step.scale,
        focusX: focus.x,
        focusY: focus.y,
        durationMs: step.durationMs,
      });
      return;
    }
    case 'zoomReset':
      await zoomReset(ctx.page, step.durationMs);
      return;
    case 'cursorScale':
      await setCursorScale(ctx.page, step.scale, step.durationMs);
      return;
    case 'moveCursor': {
      const to = await resolveTarget(ctx, step.target);
      await smoothMove(ctx.page, ctx.cursor, to, { durationMs: step.durationMs });
      ctx.cursor = to;
      return;
    }
    case 'click': {
      const to = await resolveTarget(ctx, step.target);
      // Spend ~70% of the budget (at least 120 ms) travelling to the target;
      // the remainder is left as settle time after the click.
      const moveMs = Math.max(120, Math.round(step.durationMs * 0.7));
      await smoothMove(ctx.page, ctx.cursor, to, { durationMs: moveMs });
      ctx.cursor = to;
      await ctx.page.mouse.click(to.x, to.y);
      return;
    }
    case 'type':
      await fakeType(ctx.page, step.selector, step.text, step.durationMs);
      return;
    case 'mapZoom': {
      const point = await resolveTarget(ctx, step.target);
      await ctx.page.mouse.move(point.x, point.y);
      // Spread the wheel notches evenly across the budget.
      const perStepMs = Math.floor(step.durationMs / Math.max(1, step.steps));
      for (let i = 0; i < step.steps; i++) {
        await ctx.page.mouse.wheel(0, MAP_ZOOM_WHEEL_DELTA);
        if (perStepMs > 0) await sleep(perStepMs);
      }
      return;
    }
    case 'dragSlider':
      ctx.cursor = await smoothDragSliderThumb(
        ctx.page,
        step.thumbSelector,
        step.trackSelector,
        ctx.cursor,
        step.toFraction,
        step.durationMs
      );
      return;
    case 'submitForm':
      await ctx.page.evaluate((selector) => {
        document.querySelector<HTMLFormElement>(selector)?.requestSubmit();
      }, step.formSelector);
      return;
    case 'showOutro':
      await showOutro(ctx.page, step.brand, step.tagline, step.url);
      return;
    default: {
      // Previously an unrecognised kind fell out of the switch and was padded
      // like a `wait`, silently dropping the action from the video. Fail
      // loudly instead so a new Activity kind cannot be forgotten here.
      const unhandled: { kind: string } = step;
      throw new Error(`Unhandled activity kind: ${unhandled.kind}`);
    }
  }
}
/**
 * Turn a declarative `Target` into concrete coordinates.
 *
 * - `point`: the given coordinates are returned as-is.
 * - `hexagon`: the first entry returned by
 *   `ctx.dashboard.visibleHexagonTargets(1)` is used.
 * - otherwise (element): the centre of the selector's bounding box.
 *
 * @throws Error when no hexagon is visible or the selector has no bounding box.
 */
async function resolveTarget(
  ctx: ScriptCtx,
  target: Target
): Promise<{ x: number; y: number }> {
  switch (target.kind) {
    case 'point':
      return { x: target.x, y: target.y };
    case 'hexagon': {
      const candidates = await ctx.dashboard.visibleHexagonTargets(1);
      if (candidates.length === 0) {
        throw new Error('No visible hexagon to target');
      }
      const first = candidates[0];
      return { x: first.x, y: first.y };
    }
    default: {
      const rect = await ctx.page.locator(target.selector).boundingBox();
      if (!rect) {
        throw new Error(`No bounding box for selector: ${target.selector}`);
      }
      const centreX = rect.x + rect.width / 2;
      const centreY = rect.y + rect.height / 2;
      return { x: centreX, y: centreY };
    }
  }
}
/**
 * Load synth's measured cue durations, one entry per storyboard cue.
 *
 * When `<OUTPUT_DIR>/audio/index.json` exists it is parsed and validated:
 * it must contain an `items` array, every storyboard cue index must have an
 * entry, and each used entry must carry a finite, non-negative `durationMs`.
 * Unvalidated values would otherwise flow straight into sleep and cursor
 * arithmetic as `undefined`/`NaN`.
 *
 * When the manifest is missing, falls back to a worst-case estimate. That
 * path is only used for `--no-audio` runs, where the visual flow needs to
 * play even without speech to time against.
 *
 * @param storyboard - Storyboard whose cues need durations.
 * @returns Entries aligned with `storyboard.cues` by position.
 * @throws Error when the manifest is malformed, lacks a cue, or holds an
 *   invalid duration.
 */
function loadSynthIndex(storyboard: Storyboard): SynthCue[] {
  const path = join(OUTPUT_DIR, 'audio', 'index.json');

  if (!existsSync(path)) {
    console.log(
      `[runner] no ${path} found — using worst-case fallback durations (${FALLBACK_MS_PER_WORD}ms/word + ${FALLBACK_TAIL_BUFFER_MS}ms buffer). Audio will be missing.`
    );
    return storyboard.cues.map((cue, cueIndex) => ({
      cueIndex,
      text: cue.text,
      durationMs:
        cue.text.split(/\s+/).filter(Boolean).length * FALLBACK_MS_PER_WORD +
        FALLBACK_TAIL_BUFFER_MS,
    }));
  }

  // JSON.parse returns arbitrary data; narrow it before trusting the shape.
  const parsed: unknown = JSON.parse(readFileSync(path, 'utf-8'));
  const items =
    typeof parsed === 'object' && parsed !== null
      ? (parsed as { items?: unknown }).items
      : undefined;
  if (!Array.isArray(items)) {
    throw new Error(
      `Synth manifest ${path} is malformed: expected an "items" array. ` +
        `Re-run preflight + synth so the audio matches the storyboard.`
    );
  }

  const byIndex = new Map<number, SynthCue>();
  for (const item of items as unknown[]) {
    if (typeof item !== 'object' || item === null) continue;
    const candidate = item as SynthCue;
    if (typeof candidate.cueIndex === 'number') {
      byIndex.set(candidate.cueIndex, candidate);
    }
  }

  return storyboard.cues.map((cue, i) => {
    const m = byIndex.get(i);
    if (!m) {
      throw new Error(
        `Synth manifest is missing cue ${i} ("${cue.text.slice(0, 40)}…"). ` +
          `Re-run preflight + synth so the audio matches the storyboard.`
      );
    }
    if (
      typeof m.durationMs !== 'number' ||
      !Number.isFinite(m.durationMs) ||
      m.durationMs < 0
    ) {
      throw new Error(
        `Synth manifest has an invalid durationMs for cue ${i} ("${cue.text.slice(0, 40)}…"). ` +
          `Re-run preflight + synth so the audio matches the storyboard.`
      );
    }
    return m;
  });
}
export type { Page };

109
video/src/script.ts Normal file
View file

@ -0,0 +1,109 @@
import type { Page } from 'playwright';
import type { DashboardRecorder } from './dashboard.js';
/**
* Public scripting API for the demo video.
*
* The storyboard is a `Storyboard` — an ordered list of narration cues, each
* carrying the activities that play alongside it. Audio is generated FIRST
* (one batched Qwen call so the voice stays consistent across cues); the
* runner then reads the measured per-cue durations and slots `during`
* activities inside each cue's audio window.
*
* Why cue-anchored: the audio drives pacing. Re-running synth produces a new
* set of measured durations and the storyboard self-aligns — you don't have
* to retune activity numbers. Author intent stays declarative ("zoom + type
* happen during this cue, dwell 4s after, then next cue starts").
*/
/**
* Shared handles for scripted steps: the browser page, the dashboard
* recorder built around it, and the cursor position.
*/
export interface ScriptCtx {
/** Playwright page being driven. */
page: Page;
/** Recorder for the dashboard on `page`. */
dashboard: DashboardRecorder;
/** Cursor position as x/y — presumably viewport pixels; confirm against the runner. */
cursor: { x: number; y: number };
}
/** A location on screen: absolute pixel coordinates or the centre of something. */
export type Target =
  /** Fixed pixel coordinates. */
  | { kind: 'point'; x: number; y: number }
  /** Centre of the element matched by `selector`. */
  | { kind: 'element'; selector: string }
  /**
   * Resolved at runtime to the centre of a visible hexagon/postcode polygon,
   * picked from the dashboard's most recent map response. Robust to any zoom
   * level — use this when the click MUST land on a polygon and a fixed pixel
   * coordinate would risk landing on a road or river at deep zoom.
   */
  | { kind: 'hexagon' };

/** Target at fixed pixel coordinates. */
export function at(x: number, y: number): Target {
  return { kind: 'point', x, y };
}

/** Target at the centre of the element matched by `selector`. */
export function el(selector: string): Target {
  return { kind: 'element', selector };
}

/** Target resolved at runtime to a visible hexagon/postcode polygon. */
export function hex(): Target {
  return { kind: 'hexagon' };
}
/**
* Activities are the runner's atomic operations. Each one has a fixed
* `durationMs` budget; the runner pads short overruns and warns on long ones.
*
* `kind` is the discriminant and every variant carries `durationMs`. A budget
* of 0 is allowed by the type; the storyboard uses it for `clearVignette` and
* `showOutro`.
*/
export type Activity =
/** Pure pause. Useful for spacing. */
| { kind: 'wait'; durationMs: number }
/** Smoothly zoom the dashboard wrapper so `target` lands at viewport centre. */
| { kind: 'zoomTo'; target: Target; scale: number; durationMs: number }
/** Animate the wrapper back to identity. */
| { kind: 'zoomReset'; durationMs: number }
/** Slide the cursor from its current position to `target`. */
| { kind: 'moveCursor'; target: Target; durationMs: number }
/** Move + click + ripple. `durationMs` is the whole gesture, including settle. */
| { kind: 'click'; target: Target; durationMs: number }
/** Type into a textarea/input over exactly `durationMs`. */
| { kind: 'type'; selector: string; text: string; durationMs: number }
/** Grow or shrink the visible cursor (CSS scale). */
| { kind: 'cursorScale'; scale: number; durationMs: number }
/**
* Wheel-zoom the underlying map at `target`. `steps` controls intensity
* (each step is one ~120px wheel notch).
*/
| { kind: 'mapZoom'; target: Target; steps: number; durationMs: number }
/** Drag the right thumb of a Radix slider to a fraction in [0,1]. */
| {
kind: 'dragSlider';
thumbSelector: string;
trackSelector: string;
toFraction: number;
durationMs: number;
}
/** Submit a form found by selector and wait `durationMs`. */
| { kind: 'submitForm'; formSelector: string; durationMs: number }
/** Reveal the closing brand card. */
| { kind: 'showOutro'; brand: string; tagline: string; url: string; durationMs: number }
/** Fade away the opening vignette. */
| { kind: 'clearVignette'; durationMs: number };
/**
* A narration cue + the activities that play alongside it.
*
* text : the narration line for this cue.
* gapBeforeMs : silent wall-time before the caption appears (= silence in
* audio between the previous cue ending and this one).
* during : activities that play WHILE the caption is on screen. The
* sum of declared durations must be ≤ the measured audio
* duration; the runner pads short blocks so the caption stays
* on for the full cue. Sum > measured is a hard error.
* tail : activities that run AFTER the caption hides, before the
* next cue's gapBefore starts. Use it for dwells/transitions
* that aren't tied to spoken words.
*/
export interface Cue {
text: string;
gapBeforeMs: number;
during?: Activity[];
tail?: Activity[];
}
/**
* Top-level storyboard. `pre` runs once before the first cue's gapBefore;
* `post` runs once after the last cue's tail finishes. The cue list is what
* gets handed to the synth step.
*/
export interface Storyboard {
/** Optional setup activities, run before any cue. */
pre?: Activity[];
/** Narration cues, in playback order. */
cues: Cue[];
/** Optional wrap-up activities, run after the last cue's tail. */
post?: Activity[];
}

170
video/src/storyboard.ts Normal file
View file

@ -0,0 +1,170 @@
import {
AI_ZOOM_SCALE,
BRAND_NAME,
BRAND_TAGLINE,
BRAND_URL,
PROMPT_TEXT,
TT_CARD_SELECTOR,
TT_DRAG_TO_MIN,
TT_SLIDER_MAX,
} from './config.js';
import { el, type Storyboard } from './script.js';
/**
* The demo video, top to bottom.
*
* Audio is generated first (one batched Qwen call), so each cue's actual
* duration is known before recording. The runner sizes each cue's wall-time
* to the measured audio length, padding short `during` blocks with a
* trailing wait. Inter-cue spacing is controlled here via `gapBeforeMs`
* (silence in audio) plus optional `tail` activities (visual movement after
* the caption hides, before the next cue's gap).
*
* Sum of `during` declared durations MUST be ≤ measured cue duration. If
* synth comes back tighter than the activities can fit, the runner throws
* with a pointer to the offending cue — bump that cue's text, lengthen its
* gapBefore, or trim a during step.
*
* Reference durations (Qwen3-TTS / speaker=ryan, 2026-05-09 measured):
* cue 0 1920ms "Describe the life you want."
* cue 1 2720ms "Every matching neighbourhood, side by side."
* cue 2 2160ms "Tighten the commute to 20 minutes."
* cue 3 1840ms "Drill into a single block."
* cue 4 4480ms "Stats, listings, Street View, price history…"
* cue 5 1760ms "Take the shortlist into Excel."
* cue 6 4400ms "Perfect Postcode. Find where you actually want to live."
*/
/** Root of the AI prompt panel; its textarea and form are looked up inside it. */
const AI_BOX = '[data-tutorial="ai-filters"]';

// Pixel the deep zoom is aimed at. The follow-up click reuses the same spot,
// so both gestures read it from here.
const DRILL_X = 1140;
const DRILL_Y = 605;

/** Shorthand for a pure `wait` activity of `durationMs`. */
const pause = (durationMs: number) => ({ kind: 'wait' as const, durationMs });

export const storyboard: Storyboard = {
  // Silent setup: the camera pushes in on the AI box before the first caption
  // so the cold open doesn't feel rushed.
  pre: [
    { kind: 'clearVignette', durationMs: 0 },
    pause(200),
    {
      kind: 'zoomTo',
      target: el(AI_BOX),
      scale: AI_ZOOM_SCALE,
      durationMs: 1300,
    },
    pause(140),
  ],

  cues: [
    // -- Scene 1: AI prompt ----------------------------------------------
    // Short cue (1920ms reference): the caption plays on its own, then the
    // prompt is typed and submitted silently in the tail.
    {
      text: 'Describe the life you want.',
      gapBeforeMs: 0,
      tail: [
        pause(140),
        {
          kind: 'type',
          selector: `${AI_BOX} textarea`,
          text: PROMPT_TEXT,
          durationMs: 3000,
        },
        pause(140),
        { kind: 'submitForm', formSelector: `${AI_BOX} form`, durationMs: 1700 },
        pause(700),
      ],
    },

    // -- Scene 2: zoom out reveal ---------------------------------------
    {
      text: 'Every matching neighbourhood, side by side.',
      gapBeforeMs: 400,
      during: [{ kind: 'zoomReset', durationMs: 1400 }],
      tail: [pause(1200)],
    },

    // -- Scene 3: travel-time slider ------------------------------------
    {
      text: `Tighten the commute to ${TT_DRAG_TO_MIN} minutes.`,
      gapBeforeMs: 500,
      during: [
        {
          kind: 'dragSlider',
          thumbSelector: `${TT_CARD_SELECTOR} [role="slider"] >> nth=1`,
          trackSelector: `${TT_CARD_SELECTOR} [data-orientation="horizontal"] >> nth=0`,
          toFraction: TT_DRAG_TO_MIN / TT_SLIDER_MAX,
          durationMs: 1400,
        },
      ],
      tail: [pause(1200)],
    },

    // -- Scene 4a: deep zoom into a hexagon -----------------------------
    // Tight fit: cursorScale (200ms) + mapZoom (1500ms) against an 1840ms
    // reference cue. The click and the payoff dwell live in the tail.
    {
      text: 'Drill into a single block.',
      gapBeforeMs: 500,
      during: [
        { kind: 'cursorScale', scale: 1.4, durationMs: 200 },
        {
          kind: 'mapZoom',
          target: { kind: 'point', x: DRILL_X, y: DRILL_Y },
          steps: 18,
          durationMs: 1500,
        },
      ],
      tail: [
        // Give the post-zoom /api/postcodes response and a redraw time to
        // land — otherwise the click can fire on a stale frame and miss the
        // polygon.
        pause(1200),
        {
          kind: 'click',
          target: { kind: 'point', x: DRILL_X, y: DRILL_Y },
          durationMs: 700,
        },
        { kind: 'cursorScale', scale: 1, durationMs: 280 },
        // Linger so the climax cue lands on the right-pane reveal.
        pause(1500),
      ],
    },

    // -- Scene 4b: right-pane payoff -----------------------------------
    // 4480ms reference cue with no `during`: the camera holds on the
    // populated right pane for the whole line, then dwells before export.
    {
      text: 'Stats, listings, Street View, price history — all in one pane.',
      gapBeforeMs: 0,
      tail: [pause(1200)],
    },

    // -- Scene 5: export ------------------------------------------------
    // 1760ms reference cue; zoomReset + click total 1700ms, leaving 60ms of
    // padding.
    {
      text: 'Take the shortlist into Excel.',
      gapBeforeMs: 500,
      during: [
        { kind: 'zoomReset', durationMs: 900 },
        {
          kind: 'click',
          target: el('button[title="Export to Excel"]'),
          durationMs: 800,
        },
      ],
      tail: [pause(800)],
    },

    // -- Scene 6: outro -------------------------------------------------
    {
      text: `${BRAND_NAME}. ${BRAND_TAGLINE}`,
      gapBeforeMs: 600,
      during: [
        {
          kind: 'showOutro',
          brand: BRAND_NAME,
          tagline: BRAND_TAGLINE,
          url: BRAND_URL,
          durationMs: 0,
        },
      ],
      tail: [pause(1500)],
    },
  ],
};

View file

@ -1,24 +1,19 @@
import type { Page } from 'playwright';
import { installCursor, installZoomWrapper } from './dom.js';
import { DashboardRecorder } from './dashboard.js';
import { installCursor, installZoomWrapper } from './dom.js';
import { sleep } from './motion.js';
import { dashboardUrl } from './routes.js';
import {
prepareAiBox,
sceneAiCloseUp,
sceneClusterClick,
sceneExportAndOutro,
sceneTravelTimeSlider,
sceneZoomOutResults,
type SceneCtx,
} from './scenes.js';
import { runStoryboard, type RunnerResult } from './runner.js';
import type { ScriptCtx, Storyboard } from './script.js';
export interface TimelineResult {
sceneStartMs: number;
sceneEndMs: number;
}
export type TimelineResult = RunnerResult;
export async function prepareTimeline(page: Page): Promise<SceneCtx> {
/**
* Boot the dashboard, wait for the first map response, and inject the
* recording chrome (cursor, zoom wrapper, caption layer). Also opens the
* AI prompt textarea so the storyboard can begin typing immediately.
*/
export async function prepareTimeline(page: Page): Promise<ScriptCtx> {
const dashboard = new DashboardRecorder(page);
const initialMapVersion = dashboard.getMapDataVersion();
await page.goto(dashboardUrl(), { waitUntil: 'domcontentloaded' });
@ -29,33 +24,46 @@ export async function prepareTimeline(page: Page): Promise<SceneCtx> {
await page.locator('canvas').first().waitFor({ state: 'attached', timeout: 15000 });
await dashboard.waitForMapSettled(initialMapVersion, 15000);
await new Promise((r) => setTimeout(r, 400));
await sleep(400);
await installZoomWrapper(page);
await installCursor(page);
const ctx: SceneCtx = { page, dashboard, cursor: { x: 200, y: 240 } };
const ctx: ScriptCtx = { page, dashboard, cursor: { x: 200, y: 240 } };
await page.mouse.move(ctx.cursor.x, ctx.cursor.y);
await prepareAiBox(ctx);
await sleep(80);
return ctx;
}
export async function runTimeline(ctx: SceneCtx): Promise<TimelineResult> {
const sceneStartMs = Date.now();
let mark = sceneStartMs;
mark = await runScene('AI close-up', mark, () => sceneAiCloseUp(ctx));
mark = await runScene('Zoom out', mark, () => sceneZoomOutResults(ctx));
mark = await runScene('TT slider', mark, () => sceneTravelTimeSlider(ctx));
mark = await runScene('Cluster click', mark, () => sceneClusterClick(ctx));
mark = await runScene('Export + outro', mark, () => sceneExportAndOutro(ctx));
return { sceneStartMs, sceneEndMs: mark };
export async function runTimeline(
ctx: ScriptCtx,
storyboard: Storyboard
): Promise<TimelineResult> {
return runStoryboard(ctx, storyboard);
}
async function runScene(label: string, prev: number, scene: () => Promise<void>): Promise<number> {
await scene();
const now = Date.now();
console.log(`[scene] ${label}: ${((now - prev) / 1000).toFixed(2)}s wall`);
return now;
/**
* Open the AI prompt before the timed scene starts. This is preparation
* work, not part of the storyboard, because waiting for the textarea to
* appear has indeterminate duration.
*/
async function prepareAiBox(ctx: ScriptCtx): Promise<void> {
const { page } = ctx;
// The AI panel has to be on screen before anything inside it is probed.
const aiRoot = page.locator('[data-tutorial="ai-filters"]').first();
await aiRoot.waitFor({ state: 'visible', timeout: 15000 });
const textarea = page.locator('[data-tutorial="ai-filters"] textarea');
// First attempt: if the textarea is not showing, click the centre of the
// panel's first button with the mouse. A failing isVisible() call is
// treated as "not visible".
if (!(await textarea.isVisible().catch(() => false))) {
const aiButton = aiRoot.locator('button').first();
await aiButton.waitFor({ state: 'visible', timeout: 8000 });
const btnBox = await aiButton.boundingBox();
// No bounding box means there is nothing to aim at; the fallback below covers that case.
if (btnBox) await page.mouse.click(btnBox.x + btnBox.width / 2, btnBox.y + btnBox.height / 2);
}
// Fallback: if the textarea is still not showing, click the first button
// inside the panel directly in the DOM.
if (!(await textarea.isVisible().catch(() => false))) {
await page.evaluate(() => {
document.querySelector<HTMLElement>('[data-tutorial="ai-filters"] button')?.click();
});
}
// Whichever path ran, wait for the textarea to be visible before returning.
await textarea.waitFor({ state: 'visible', timeout: 15000 });
// Brief pause once the textarea is visible.
await sleep(100);
}

View file

@ -1,8 +1,6 @@
import { execSync } from 'node:child_process';
import { renameSync, statSync } from 'node:fs';
import { MAX_DURATION_S, OUTPUT_FPS, VIDEO_SIZE, WEBM_BITRATE } from './config.js';
const LEAD_IN_S = 0.12;
import { LEAD_IN_S, MAX_DURATION_S, OUTPUT_FPS, VIDEO_SIZE, WEBM_BITRATE } from './config.js';
export function trimRecording(
rawPath: string,

View file

@ -10,6 +10,7 @@
"skipLibCheck": true,
"forceConsistentCasingInFileNames": true,
"resolveJsonModule": true,
"types": ["node"],
"declaration": false,
"sourceMap": true
},

188
video/tts/mux.py Normal file
View file

@ -0,0 +1,188 @@
"""Mux per-cue WAVs into recording.mp4 at their narration offsets.
Reads two manifests:
* ``output/audio/index.json`` (synth output) — per-cue WAV filename + measured
duration. Generated BEFORE recording in one batched Qwen3-TTS call.
* ``output/narration.json`` (recorder output) — per-cue ``videoTimeMs`` against
the trimmed video. Generated DURING recording.
Joins them by ``cueIndex`` (index in the cue list, 1:1 between manifests),
runs ffmpeg with one ``adelay`` per cue plus a single ``amix``, copies the
video stream, and writes ``output/recording.narrated.mp4``.
Run from the ``video/`` directory after recording:
uv run --project tts python tts/mux.py
"""
from __future__ import annotations
import argparse
import json
import shutil
import subprocess
import sys
from pathlib import Path
def parse_args() -> argparse.Namespace:
    """Parse the mux command line.

    All path options default to locations under ``output/``; ``--replace``
    is a boolean switch that is off unless given.
    """
    cli = argparse.ArgumentParser(description=__doc__)

    # (flag, default path, help text), registered in this order.
    path_options = (
        ("--audio-dir", "output/audio", None),
        (
            "--narration",
            "output/narration.json",
            "Per-cue videoTimeMs manifest written by the recorder.",
        ),
        ("--video", "output/recording.mp4", None),
        ("--out", "output/recording.narrated.mp4", None),
    )
    for flag, default, help_text in path_options:
        cli.add_argument(flag, type=Path, default=Path(default), help=help_text)

    cli.add_argument(
        "--replace",
        action="store_true",
        help="After muxing, atomically replace --video with --out.",
    )
    return cli.parse_args()
def main() -> int:
    """Mux the synthesized cue WAVs onto the recorded video.

    Joins the synth manifest (``index.json`` in ``--audio-dir``) with the
    recorder manifest (``--narration``) by cue index, delays each WAV to its
    ``videoTimeMs``, mixes them into one track, and writes ``--out`` with the
    video stream copied untouched.

    Returns:
        0 on success (including the "no cues, copy the video unchanged"
        path), 1 when a prerequisite is missing or the manifests disagree,
        or ffmpeg's own exit code when the encode fails.

    Raises:
        SystemExit: if two cues would overlap in the final mix.
    """
    args = parse_args()

    if not shutil.which("ffmpeg"):
        print("[mux] ffmpeg not on PATH", file=sys.stderr)
        return 1

    audio_index_path = args.audio_dir / "index.json"
    if not audio_index_path.exists():
        print(
            f"[mux] {audio_index_path} not found; run tts/synth.py first",
            file=sys.stderr,
        )
        return 1
    if not args.narration.exists():
        print(
            f"[mux] {args.narration} not found; the recorder must run before mux",
            file=sys.stderr,
        )
        return 1
    if not args.video.exists():
        print(f"[mux] video not found: {args.video}", file=sys.stderr)
        return 1

    # Both manifests carry narration text with non-ASCII punctuation, so read
    # them as UTF-8 explicitly instead of relying on the locale default.
    audio_index = json.loads(audio_index_path.read_text(encoding="utf-8"))
    audio_items = [it for it in audio_index.get("items", []) if it.get("wav")]
    if not audio_items:
        print("[mux] synth produced no cues; copying video unchanged", file=sys.stderr)
        shutil.copyfile(args.video, args.out)
        return 0

    narration = json.loads(args.narration.read_text(encoding="utf-8"))
    nar_cues = list(narration.get("cues", []))
    if len(nar_cues) != len(audio_items):
        print(
            f"[mux] cue count mismatch: synth has {len(audio_items)} cues, "
            f"recorder logged {len(nar_cues)}. Re-run preflight + synth + record.",
            file=sys.stderr,
        )
        return 1

    # Look synth entries up by cueIndex; the recorder's cue list is in cue
    # order, so position i in it pairs with synth cue i.
    audio_by_index = {int(it["cueIndex"]): it for it in audio_items}
    items = []
    for i, nar in enumerate(nar_cues):
        audio = audio_by_index.get(i)
        if audio is None:
            print(f"[mux] no synth wav for cue {i}", file=sys.stderr)
            return 1
        items.append(
            {
                "cueIndex": i,
                "wav": audio["wav"],
                "durationMs": int(audio["durationMs"]),
                "videoTimeMs": int(nar["videoTimeMs"]),
                "text": nar.get("text", ""),
            }
        )

    # Refuse to mux overlapping cues — amix would silently mash voices on top
    # of each other. Sort by start so the order matches what we'll actually
    # play, then check that each cue ends before the next one starts.
    ordered = sorted(items, key=lambda it: it["videoTimeMs"])
    overlaps: list[str] = []
    for prev, nxt in zip(ordered, ordered[1:]):
        prev_end = prev["videoTimeMs"] + prev["durationMs"]
        nxt_start = nxt["videoTimeMs"]
        if prev_end > nxt_start:
            overlaps.append(
                f"cue {prev['cueIndex']} ends at {prev_end}ms but cue {nxt['cueIndex']} "
                f"starts at {nxt_start}ms (overlap {prev_end - nxt_start}ms)"
            )
    if overlaps:
        raise SystemExit(
            "[mux] refusing to produce overlapping narration:\n  - "
            + "\n  - ".join(overlaps)
        )

    # Input 0 is the video; inputs 1..N are the cue WAVs in cue order.
    cmd: list[str] = ["ffmpeg", "-y", "-loglevel", "warning", "-i", str(args.video)]
    for it in items:
        cmd += ["-i", str(args.audio_dir / it["wav"])]

    filter_parts: list[str] = []
    mix_inputs: list[str] = []
    for n, it in enumerate(items, start=1):
        delay_ms = max(0, it["videoTimeMs"])
        label = f"a{n}"
        # adelay needs one delay per channel; "all=1" applies the same delay
        # to every channel, which is what we want for mono narration.
        filter_parts.append(
            f"[{n}:a]aresample=async=1,adelay={delay_ms}|{delay_ms}:all=1[{label}]"
        )
        mix_inputs.append(f"[{label}]")

    # The mixed track ends when the last cue finishes speaking, which is
    # normally before the video does. Without padding, -shortest would cut the
    # output there and drop the video tail. apad extends the audio with
    # silence so -shortest stops at the end of the video instead.
    mix = (
        f"{''.join(mix_inputs)}amix=inputs={len(items)}"
        f":duration=longest:dropout_transition=0:normalize=0,apad[aout]"
    )
    filter_complex = ";".join(filter_parts + [mix])

    cmd += [
        "-filter_complex",
        filter_complex,
        "-map",
        "0:v:0",
        "-map",
        "[aout]",
        "-c:v",
        "copy",
        "-c:a",
        "aac",
        "-b:a",
        "192k",
        "-shortest",
        "-movflags",
        "+faststart",
        str(args.out),
    ]

    print(f"[mux] muxing {len(items)} narration cues into {args.out}", flush=True)
    result = subprocess.run(cmd, check=False)
    if result.returncode != 0:
        print(f"[mux] ffmpeg exited {result.returncode}", file=sys.stderr)
        return result.returncode

    if args.replace:
        args.out.replace(args.video)
        print(f"[mux] replaced {args.video} with narrated copy", flush=True)

    return 0
if __name__ == "__main__":
raise SystemExit(main())

208
video/tts/synth.py Normal file
View file

@ -0,0 +1,208 @@
"""Synthesize the full narration in ONE batched Qwen3-TTS call.
Reads ``output/narration-script.json`` (emitted by ``dist/preflight.js``) and
runs ``Qwen3TTSModel.generate_custom_voice`` with all cue texts as a single
batched list — that way every cue shares the same model state, which keeps
prosody and timbre consistent across cues. Per-cue WAVs and an index manifest
go to ``output/audio/`` for the recording step (which reads measured cue
durations) and the mux step (which drops each WAV at its videoTime).
Run from the ``video/`` directory:
uv run --project tts python tts/synth.py
"""
from __future__ import annotations
import argparse
import json
import os
import sys
from pathlib import Path
import soundfile as sf
import torch
from qwen_tts import Qwen3TTSModel
DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
DEFAULT_SPEAKER = "ryan"
DEFAULT_LANGUAGE = "English"
def parse_args() -> argparse.Namespace:
    """Parse the synth command line.

    ``--model``, ``--speaker``, ``--language`` and ``--device`` take their
    defaults from the ``TTS_MODEL``, ``TTS_SPEAKER``, ``TTS_LANGUAGE`` and
    ``TTS_DEVICE`` environment variables when those are set.
    """
    cli = argparse.ArgumentParser(description=__doc__)

    cli.add_argument(
        "--script",
        type=Path,
        default=Path("output/narration-script.json"),
        help="Narration script emitted by dist/preflight.js.",
    )
    cli.add_argument(
        "--out-dir",
        type=Path,
        default=Path("output/audio"),
        help="Directory to write WAV files and index.json into.",
    )

    # (flag, environment variable, built-in fallback, help text)
    env_backed = (
        ("--model", "TTS_MODEL", DEFAULT_MODEL, None),
        (
            "--speaker",
            "TTS_SPEAKER",
            DEFAULT_SPEAKER,
            "CustomVoice preset speaker name (use --list-speakers to enumerate).",
        ),
        ("--language", "TTS_LANGUAGE", DEFAULT_LANGUAGE, None),
        ("--device", "TTS_DEVICE", "cuda:0", None),
    )
    for flag, env_var, fallback, help_text in env_backed:
        cli.add_argument(flag, default=os.environ.get(env_var, fallback), help=help_text)

    cli.add_argument(
        "--list-speakers",
        action="store_true",
        help="Load the model, print available speaker names, and exit.",
    )
    return cli.parse_args()
def load_model(model_id: str, device: str) -> Qwen3TTSModel:
    """Load ``model_id`` onto ``device`` and return the model.

    Uses bfloat16 when the device name starts with ``cuda`` and float32
    otherwise, and logs the choice before loading.
    """
    if device.startswith("cuda"):
        dtype = torch.bfloat16
    else:
        dtype = torch.float32
    print(f"[synth] loading {model_id} on {device} ({dtype})", flush=True)
    model = Qwen3TTSModel.from_pretrained(model_id, device_map=device, dtype=dtype)
    return model
def cached_index_matches(
    index_path: Path,
    cues: list[dict],
    speaker: str,
    language: str,
) -> bool:
    """Return True iff index_path's cue list lines up with `cues` 1:1.

    Compared fields: ``cueIndex``, ``text``, ``gapBeforeMs`` plus the synth
    settings (``speaker``, ``language``). All cue WAV files must also exist
    on disk. Mismatched length, reordered cues, or a missing WAV invalidate
    the cache.

    The index file is treated as untrusted: anything unreadable or not shaped
    like a manifest (not a JSON object, items that are not objects,
    non-numeric fields, a non-string ``wav``) counts as a cache miss rather
    than an error, so a damaged cache only costs a re-synthesis.

    Args:
        index_path: Path to the cached ``index.json``; WAV names in it are
            resolved relative to its parent directory.
        cues: Cues from the current narration script.
        speaker: Speaker the caller is about to synthesize with.
        language: Language the caller is about to synthesize with.

    Returns:
        True when the cached audio can be reused as-is, False otherwise.
    """
    if not index_path.exists():
        return False
    try:
        cached = json.loads(index_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both malformed JSON and undecodable bytes.
        return False
    if not isinstance(cached, dict):
        return False
    if cached.get("speaker") != speaker or cached.get("language") != language:
        return False

    cached_items = cached.get("items", [])
    if not isinstance(cached_items, list) or len(cached_items) != len(cues):
        return False

    for live, prev in zip(cues, cached_items):
        if not isinstance(prev, dict):
            return False
        # Values from the live script are read outside the try block on
        # purpose: a malformed script is a real error and should surface.
        live_index = int(live["cueIndex"])
        live_text = live["text"].strip()
        live_gap = int(live.get("gapBeforeMs", 0))
        try:
            prev_index = int(prev.get("cueIndex", -1))
            prev_gap = int(prev.get("gapBeforeMs", -1))
        except (TypeError, ValueError):
            return False
        if live_index != prev_index:
            return False
        if live_text != str(prev.get("text", "")).strip():
            return False
        if live_gap != prev_gap:
            return False
        wav = prev.get("wav")
        if not isinstance(wav, str) or not wav:
            return False
        if not (index_path.parent / wav).exists():
            return False
    return True
def _cached_model(index_path: Path):
    """Return the model id recorded in an existing index.json, or None."""
    try:
        cached = json.loads(index_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return None
    return cached.get("model") if isinstance(cached, dict) else None


def main() -> int:
    """Synthesize every cue of the narration script in one batched call.

    Writes one WAV per cue plus ``index.json`` into ``--out-dir``. When the
    audio already on disk was produced from the same script, speaker,
    language and model, generation is skipped.

    Returns:
        0 on success (including the cache-hit and ``--list-speakers``
        paths), 1 when the script is missing or empty, or when the model
        returns a different number of clips than cues.
    """
    args = parse_args()

    if args.list_speakers:
        model = load_model(args.model, args.device)
        speakers = model.get_supported_speakers()
        print(json.dumps(speakers, indent=2, ensure_ascii=False))
        return 0

    if not args.script.exists():
        print(f"[synth] script not found: {args.script}", file=sys.stderr)
        return 1

    # The script carries narration text with non-ASCII punctuation; read it
    # as UTF-8 explicitly instead of relying on the locale default.
    script = json.loads(args.script.read_text(encoding="utf-8"))
    cues = [c for c in script.get("items", []) if c.get("text", "").strip()]
    if not cues:
        print("[synth] script has no cues; nothing to generate.", file=sys.stderr)
        return 1

    args.out_dir.mkdir(parents=True, exist_ok=True)
    index_path = args.out_dir / "index.json"

    # Skip generation when the existing audio matches the script — same cue
    # texts and same gapBeforeMs values in the same order. Saves ~30s of GPU
    # time when iterating on activity timing without changing narration.
    # The model id is checked here as well: audio from a different checkpoint
    # must not be reused after TTS_MODEL / --model changes.
    if (
        cached_index_matches(index_path, cues, args.speaker, args.language)
        and _cached_model(index_path) == args.model
    ):
        print(
            f"[synth] cached audio in {args.out_dir} matches the current script — skipping generation",
            flush=True,
        )
        return 0

    model = load_model(args.model, args.device)

    texts = [c["text"].strip() for c in cues]
    print(f"[synth] generating {len(texts)} cues in one batched call", flush=True)
    for i, t in enumerate(texts):
        print(f"[synth]   {i:2d}: {t}", flush=True)

    # ONE batched call. generate_custom_voice handles text=List[str] natively
    # and broadcasts the speaker/language across all items, so the entire
    # narration is decoded in one model pass — same RNG state, same batch,
    # consistent voice from cue to cue.
    wavs, sr = model.generate_custom_voice(
        text=texts,
        language=args.language,
        speaker=args.speaker,
    )
    if len(wavs) != len(texts):
        print(
            f"[synth] model returned {len(wavs)} wavs for {len(texts)} cues",
            file=sys.stderr,
        )
        return 1

    items = []
    for cue, audio in zip(cues, wavs):
        # Tensor outputs are moved to host memory as float arrays before writing.
        if hasattr(audio, "cpu"):
            audio = audio.cpu().float().numpy()
        wav_name = f"cue_{cue['cueIndex']:03d}.wav"
        wav_path = args.out_dir / wav_name
        sf.write(str(wav_path), audio, sr)
        # Duration is derived from the sample count, not estimated.
        duration_ms = int(round(len(audio) * 1000 / sr))
        items.append(
            {
                "cueIndex": cue["cueIndex"],
                "text": cue["text"],
                "gapBeforeMs": int(cue.get("gapBeforeMs", 0)),
                "wav": wav_name,
                "sampleRate": sr,
                "durationMs": duration_ms,
            }
        )
        print(
            f"[synth] wrote {wav_name}  {duration_ms:>5d}ms  «{cue['text']}»",
            flush=True,
        )

    out_index = {
        "speaker": args.speaker,
        "language": args.language,
        "model": args.model,
        "items": items,
    }
    index_path.write_text(json.dumps(out_index, indent=2), encoding="utf-8")

    total_ms = sum(it["gapBeforeMs"] + it["durationMs"] for it in items)
    print(
        f"[synth] {len(items)} cues, {total_ms}ms of audio (incl. gaps) -> {args.out_dir}",
        flush=True,
    )
    return 0
if __name__ == "__main__":
raise SystemExit(main())