More FE changes
This commit is contained in:
parent
f114ada255
commit
a48eb945e0
48 changed files with 4127 additions and 1751 deletions
|
|
@ -10,15 +10,17 @@
|
|||
# ./render.sh # full pipeline (uses cached auth.json if fresh)
|
||||
# ./render.sh --fresh-auth # force re-auth even if auth.json exists
|
||||
# ./render.sh --no-encode # stop at WebM, skip MP4 encode
|
||||
# ./render.sh --no-audio # skip Qwen3-TTS narration; publish silent MP4
|
||||
# FORCE_AUTH=1 ./render.sh # same as --fresh-auth
|
||||
# APP_URL=http://localhost:3001 ./render.sh # override frontend URL
|
||||
# TTS_SPEAKER=aiden ./render.sh # override CustomVoice speaker
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# -- config (override via env) -------------------------------------------------
|
||||
APP_URL="${APP_URL:-http://host.docker.internal:3001}"
|
||||
PB_URL="${PB_URL:-http://host.docker.internal:8090}"
|
||||
API_URL="${API_URL:-http://host.docker.internal:8001}"
|
||||
export APP_URL="${APP_URL:-http://host.docker.internal:3001}"
|
||||
export PB_URL="${PB_URL:-http://host.docker.internal:8090}"
|
||||
export API_URL="${API_URL:-http://host.docker.internal:8001}"
|
||||
PB_ADMIN_EMAIL="${PB_ADMIN_EMAIL:-admin@propertymap.local}"
|
||||
PB_ADMIN_PASSWORD="${PB_ADMIN_PASSWORD:-propertymap-dev-2024}"
|
||||
PB_EMAIL="${PB_EMAIL:-demo-video@local.test}"
|
||||
|
|
@ -34,14 +36,28 @@ PUBLISH_DIR="${PUBLISH_DIR:-../frontend/public/video}"
|
|||
# caption visible.
|
||||
POSTER_TIME_S="${POSTER_TIME_S:-16}"
|
||||
|
||||
# Recorder/encoder knobs read by src/config.ts. config.ts treats these as
|
||||
# required, so they live here (the only entry point) rather than as defaults
|
||||
# scattered across TS modules. Override per-run via env.
|
||||
export ASPECT="${ASPECT:-16x9}"
|
||||
export CAPTURE_SCALE="${CAPTURE_SCALE:-1}"
|
||||
export WEBM_BITRATE="${WEBM_BITRATE:-$(awk -v s="$CAPTURE_SCALE" 'BEGIN{print (s+0>1)?"18M":"8M"}')}"
|
||||
export PROMPT_TEXT="${PROMPT_TEXT:-Flats or terraces <£450k, 35 min to Manchester, low crime}"
|
||||
export AI_ZOOM_SCALE="${AI_ZOOM_SCALE:-2.4}"
|
||||
export MAX_DURATION_S="${MAX_DURATION_S:-45}"
|
||||
export MIN_DURATION_S="${MIN_DURATION_S:-10}"
|
||||
export OUTPUT_FPS="${OUTPUT_FPS:-50}"
|
||||
|
||||
FRESH_AUTH="${FORCE_AUTH:-0}"
|
||||
DO_ENCODE=1
|
||||
DO_AUDIO=1
|
||||
for arg in "$@"; do
|
||||
case "$arg" in
|
||||
--fresh-auth) FRESH_AUTH=1 ;;
|
||||
--no-encode) DO_ENCODE=0 ;;
|
||||
--no-audio) DO_AUDIO=0 ;;
|
||||
-h|--help)
|
||||
sed -n '3,18p' "$0"
|
||||
sed -n '3,20p' "$0"
|
||||
exit 0 ;;
|
||||
*) echo "Unknown arg: $arg" >&2; exit 2 ;;
|
||||
esac
|
||||
|
|
@ -124,12 +140,36 @@ else
|
|||
say "Reusing existing auth.json"
|
||||
fi
|
||||
|
||||
# -- record -------------------------------------------------------------------
|
||||
say "Recording"
|
||||
# -- preflight + synth (Qwen3-TTS) -------------------------------------------
|
||||
# Synth runs BEFORE recording: one batched generate_custom_voice call across
|
||||
# all cues so the voice stays consistent. The recorder reads
|
||||
# output/audio/index.json for measured per-cue durations and sizes each
|
||||
# cue's wall-clock to fit; --no-audio skips synth and the recorder falls
|
||||
# back to a worst-case estimate.
|
||||
mkdir -p output
|
||||
# Wipe last run's leaking artifacts so the rename step picks up *this* run.
|
||||
rm -f output/recording.webm output/recording.mp4 output/page@*.webm output/page@*.webm.untrimmed
|
||||
rm -f output/narration-script.json output/narration.json
|
||||
# output/audio/ is preserved; tts/synth.py decides whether the cached WAVs
|
||||
# still match the script and skips generation when they do.
|
||||
|
||||
say "Preflight: emitting narration script"
|
||||
node dist/preflight.js
|
||||
|
||||
if [ "$DO_AUDIO" = "1" ]; then
|
||||
if ! command -v uv >/dev/null 2>&1; then
|
||||
fail "uv not on PATH (required for Qwen3-TTS synth). Install uv or rerun with --no-audio."
|
||||
fi
|
||||
say "Synthesising narration with Qwen3-TTS (speaker=${TTS_SPEAKER:-ryan}) — one batched call"
|
||||
uv sync --project tts || fail "uv sync failed in video/tts"
|
||||
uv run --project tts python tts/synth.py || fail "tts/synth.py failed"
|
||||
if [ ! -s output/audio/index.json ]; then
|
||||
fail "synth did not produce output/audio/index.json"
|
||||
fi
|
||||
fi
|
||||
|
||||
# -- record -------------------------------------------------------------------
|
||||
say "Recording"
|
||||
APP_URL="$APP_URL" node dist/record.js
|
||||
|
||||
if [ ! -s output/recording.webm ]; then
|
||||
|
|
@ -163,6 +203,20 @@ if [ "$DO_ENCODE" = "1" ]; then
|
|||
node dist/verify.js output/recording.mp4 output/poster.jpg
|
||||
fi
|
||||
|
||||
# -- mux narration ------------------------------------------------------------
|
||||
# Synth already produced per-cue WAVs (in output/audio/); the recorder logged
|
||||
# each cue's videoTime against the trimmed timeline. Drop the WAVs onto the
|
||||
# mp4 with one ffmpeg adelay+amix and replace the silent recording in place.
|
||||
if [ "$DO_ENCODE" = "1" ] && [ "$DO_AUDIO" = "1" ]; then
|
||||
if [ ! -s output/narration.json ]; then
|
||||
fail "narration.json missing — recorder did not log cues"
|
||||
fi
|
||||
say "Muxing narration into output/recording.mp4"
|
||||
uv run --project tts python tts/mux.py --replace \
|
||||
|| fail "tts/mux.py failed"
|
||||
node dist/verify.js output/recording.mp4
|
||||
fi
|
||||
|
||||
# -- publish to homepage ------------------------------------------------------
|
||||
# Only publish when we did the encode (otherwise we'd be copying a stale
|
||||
# mp4 next to a fresh webm). --no-encode skips this whole block.
|
||||
|
|
|
|||
|
|
@ -1,5 +1,16 @@
|
|||
import { chromium, type Browser, type BrowserContext, type Page } from 'playwright';
|
||||
import { AUTH_STATE_PATH, CAPTURE_SCALE, OUTPUT_DIR, VIDEO_SIZE, VIEWPORT } from './config.js';
|
||||
import {
|
||||
chromium,
|
||||
type Browser,
|
||||
type BrowserContext,
|
||||
type Page,
|
||||
} from "playwright";
|
||||
import {
|
||||
AUTH_STATE_PATH,
|
||||
CAPTURE_SCALE,
|
||||
OUTPUT_DIR,
|
||||
VIDEO_SIZE,
|
||||
VIEWPORT,
|
||||
} from "./config.js";
|
||||
|
||||
export interface RecordingBrowser {
|
||||
browser: Browser;
|
||||
|
|
@ -10,22 +21,22 @@ export async function launchRecordingBrowser(): Promise<RecordingBrowser> {
|
|||
const browser = await chromium.launch({
|
||||
headless: true,
|
||||
args: [
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
'--enable-gpu',
|
||||
'--use-gl=angle',
|
||||
'--use-angle=gl-egl',
|
||||
'--ignore-gpu-blocklist',
|
||||
'--enable-webgl',
|
||||
'--enable-webgl2',
|
||||
'--enable-gpu-rasterization',
|
||||
'--enable-zero-copy',
|
||||
'--disable-software-rasterizer',
|
||||
'--disable-frame-rate-limit',
|
||||
'--disable-gpu-vsync',
|
||||
'--disable-features=CalculateNativeWinOcclusion,IntensiveWakeUpThrottling',
|
||||
'--disable-renderer-backgrounding',
|
||||
'--disable-background-timer-throttling',
|
||||
'--disable-backgrounding-occluded-windows',
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--enable-gpu",
|
||||
"--use-gl=angle",
|
||||
"--use-angle=gl-egl",
|
||||
"--ignore-gpu-blocklist",
|
||||
"--enable-webgl",
|
||||
"--enable-webgl2",
|
||||
"--enable-gpu-rasterization",
|
||||
"--enable-zero-copy",
|
||||
"--disable-software-rasterizer",
|
||||
"--disable-frame-rate-limit",
|
||||
"--disable-gpu-vsync",
|
||||
"--disable-features=CalculateNativeWinOcclusion,IntensiveWakeUpThrottling",
|
||||
"--disable-renderer-backgrounding",
|
||||
"--disable-background-timer-throttling",
|
||||
"--disable-backgrounding-occluded-windows",
|
||||
],
|
||||
});
|
||||
|
||||
|
|
@ -41,27 +52,34 @@ export async function launchRecordingBrowser(): Promise<RecordingBrowser> {
|
|||
|
||||
export async function assertHardwareWebGL(page: Page): Promise<void> {
|
||||
const info = await page.evaluate(() => {
|
||||
const canvas = document.createElement('canvas');
|
||||
const gl = canvas.getContext('webgl2') ?? canvas.getContext('webgl');
|
||||
if (!gl) return { webgl: false, vendor: '', renderer: '' };
|
||||
const canvas = document.createElement("canvas");
|
||||
const gl = canvas.getContext("webgl2");
|
||||
if (!gl) return { webgl: false, vendor: "", renderer: "" };
|
||||
|
||||
const ext = gl.getExtension('WEBGL_debug_renderer_info');
|
||||
const ext = gl.getExtension("WEBGL_debug_renderer_info");
|
||||
const vendor = String(
|
||||
ext ? gl.getParameter(ext.UNMASKED_VENDOR_WEBGL) : gl.getParameter(gl.VENDOR)
|
||||
ext
|
||||
? gl.getParameter(ext.UNMASKED_VENDOR_WEBGL)
|
||||
: gl.getParameter(gl.VENDOR),
|
||||
);
|
||||
const renderer = String(
|
||||
ext ? gl.getParameter(ext.UNMASKED_RENDERER_WEBGL) : gl.getParameter(gl.RENDERER)
|
||||
ext
|
||||
? gl.getParameter(ext.UNMASKED_RENDERER_WEBGL)
|
||||
: gl.getParameter(gl.RENDERER),
|
||||
);
|
||||
return { webgl: true, vendor, renderer };
|
||||
});
|
||||
|
||||
console.log(`[gpu] WebGL renderer: ${info.webgl ? `${info.vendor} / ${info.renderer}` : 'none'}`);
|
||||
console.log(
|
||||
`[gpu] WebGL renderer: ${info.webgl ? `${info.vendor} / ${info.renderer}` : "none"}`,
|
||||
);
|
||||
if (
|
||||
process.env.ALLOW_SOFTWARE_GL !== '1' &&
|
||||
(!info.webgl || /SwiftShader|llvmpipe|software/i.test(`${info.vendor} ${info.renderer}`))
|
||||
process.env.ALLOW_SOFTWARE_GL !== "1" &&
|
||||
(!info.webgl ||
|
||||
/SwiftShader|llvmpipe|software/i.test(`${info.vendor} ${info.renderer}`))
|
||||
) {
|
||||
throw new Error(
|
||||
'Recording browser did not get hardware WebGL. Set ALLOW_SOFTWARE_GL=1 to bypass this guard.'
|
||||
"Recording browser did not get hardware WebGL. Set ALLOW_SOFTWARE_GL=1 to bypass this guard.",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -71,41 +89,45 @@ async function suppressDevServerNoise(context: BrowserContext) {
|
|||
const RealWS = window.WebSocket;
|
||||
window.WebSocket = new Proxy(RealWS, {
|
||||
construct(target, args) {
|
||||
const url = String(args[0] ?? '');
|
||||
const proto = (args[1] as string | string[] | undefined) ?? '';
|
||||
const protoStr = Array.isArray(proto) ? proto.join(',') : proto;
|
||||
const url = String(args[0] ?? "");
|
||||
const proto = (args[1] as string | string[] | undefined) ?? "";
|
||||
const protoStr = Array.isArray(proto) ? proto.join(",") : proto;
|
||||
if (
|
||||
protoStr.includes('vite-hmr') ||
|
||||
protoStr.includes('webpack') ||
|
||||
url.includes('/ws') ||
|
||||
url.includes('sockjs-node')
|
||||
protoStr.includes("vite-hmr") ||
|
||||
protoStr.includes("webpack") ||
|
||||
url.includes("/ws") ||
|
||||
url.includes("sockjs-node")
|
||||
) {
|
||||
const fake = new EventTarget() as WebSocket;
|
||||
Object.defineProperties(fake, {
|
||||
readyState: { value: RealWS.CLOSED },
|
||||
url: { value: url },
|
||||
protocol: { value: '' },
|
||||
extensions: { value: '' },
|
||||
protocol: { value: "" },
|
||||
extensions: { value: "" },
|
||||
bufferedAmount: { value: 0 },
|
||||
binaryType: { value: 'blob', writable: true },
|
||||
binaryType: { value: "blob", writable: true },
|
||||
});
|
||||
fake.send = () => {};
|
||||
fake.close = () => fake.dispatchEvent(new Event('close'));
|
||||
queueMicrotask(() => fake.dispatchEvent(new Event('close')));
|
||||
fake.close = () => fake.dispatchEvent(new Event("close"));
|
||||
queueMicrotask(() => fake.dispatchEvent(new Event("close")));
|
||||
return fake;
|
||||
}
|
||||
return Reflect.construct(target, args);
|
||||
},
|
||||
});
|
||||
|
||||
Object.defineProperty(window.location, 'reload', {
|
||||
Object.defineProperty(window.location, "reload", {
|
||||
value: () => {},
|
||||
configurable: true,
|
||||
});
|
||||
window.addEventListener('error', (e) => e.stopImmediatePropagation(), true);
|
||||
window.addEventListener('unhandledrejection', (e) => e.stopImmediatePropagation(), true);
|
||||
window.addEventListener("error", (e) => e.stopImmediatePropagation(), true);
|
||||
window.addEventListener(
|
||||
"unhandledrejection",
|
||||
(e) => e.stopImmediatePropagation(),
|
||||
true,
|
||||
);
|
||||
|
||||
const styleEl = document.createElement('style');
|
||||
const styleEl = document.createElement("style");
|
||||
styleEl.textContent = `
|
||||
vite-error-overlay,
|
||||
wds-overlay,
|
||||
|
|
@ -126,12 +148,12 @@ async function suppressDevServerNoise(context: BrowserContext) {
|
|||
|
||||
const killOverlay = (node: Element) => {
|
||||
const tag = node.tagName?.toLowerCase();
|
||||
const id = (node as HTMLElement).id?.toLowerCase() ?? '';
|
||||
const id = (node as HTMLElement).id?.toLowerCase() ?? "";
|
||||
if (
|
||||
tag === 'vite-error-overlay' ||
|
||||
tag === 'wds-overlay' ||
|
||||
id.includes('webpack-dev-server-client') ||
|
||||
id.includes('webpack-error')
|
||||
tag === "vite-error-overlay" ||
|
||||
tag === "wds-overlay" ||
|
||||
id.includes("webpack-dev-server-client") ||
|
||||
id.includes("webpack-error")
|
||||
) {
|
||||
(node as HTMLElement).remove();
|
||||
}
|
||||
|
|
@ -143,10 +165,11 @@ async function suppressDevServerNoise(context: BrowserContext) {
|
|||
});
|
||||
}
|
||||
});
|
||||
if (document.body) obs.observe(document.body, { childList: true, subtree: true });
|
||||
if (document.body)
|
||||
obs.observe(document.body, { childList: true, subtree: true });
|
||||
else {
|
||||
document.addEventListener('DOMContentLoaded', () =>
|
||||
obs.observe(document.body, { childList: true, subtree: true })
|
||||
document.addEventListener("DOMContentLoaded", () =>
|
||||
obs.observe(document.body, { childList: true, subtree: true }),
|
||||
);
|
||||
}
|
||||
});
|
||||
|
|
|
|||
|
|
@ -1,46 +1,66 @@
|
|||
export const APP_URL = process.env.APP_URL ?? 'http://host.docker.internal:3001';
|
||||
export const DASHBOARD_PATH = '/dashboard';
|
||||
/**
 * Read a mandatory environment variable.
 *
 * @param name - Name of the variable to look up in `process.env`.
 * @returns The variable's value.
 * @throws Error when the variable is unset or set to the empty string.
 */
function requiredEnv(name: string): string {
  const raw = process.env[name];
  // Both `undefined` (unset) and "" (set but empty) count as missing.
  if (raw) {
    return raw;
  }
  throw new Error(`${name} is required`);
}
|
||||
|
||||
export const AUTH_STATE_PATH = 'auth.json';
|
||||
export const OUTPUT_DIR = 'output';
|
||||
/**
 * Read a mandatory environment variable and parse it as a finite number.
 *
 * @param name - Name of the variable to look up (via `requiredEnv`).
 * @returns The parsed numeric value.
 * @throws Error when the variable is missing (raised by `requiredEnv`), is
 *   blank, or does not parse to a finite number (NaN / ±Infinity).
 */
function requiredNumberEnv(name: string): number {
  const raw = requiredEnv(name);
  // Number("   ") evaluates to 0, so a whitespace-only value would otherwise
  // slip through the finite check as a silent zero. Treat it as invalid.
  const value = raw.trim() === "" ? Number.NaN : Number(raw);
  if (!Number.isFinite(value)) {
    throw new Error(`${name} must be a finite number`);
  }
  return value;
}
|
||||
|
||||
const aspect = process.env.ASPECT ?? '16x9';
|
||||
export const APP_URL = requiredEnv("APP_URL");
|
||||
export const DASHBOARD_PATH = "/dashboard";
|
||||
|
||||
export const AUTH_STATE_PATH = "auth.json";
|
||||
export const OUTPUT_DIR = "output";
|
||||
|
||||
const aspect = requiredEnv("ASPECT");
|
||||
if (aspect !== "16x9" && aspect !== "9x16") {
|
||||
throw new Error("ASPECT must be '16x9' or '9x16'");
|
||||
}
|
||||
export const VIEWPORT =
|
||||
aspect === '9x16' ? { width: 1080, height: 1920 } : { width: 1920, height: 1080 };
|
||||
export const CAPTURE_SCALE = Math.max(1, Number(process.env.CAPTURE_SCALE ?? 1));
|
||||
aspect === "9x16"
|
||||
? { width: 1080, height: 1920 }
|
||||
: { width: 1920, height: 1080 };
|
||||
export const CAPTURE_SCALE = Math.max(1, requiredNumberEnv("CAPTURE_SCALE"));
|
||||
export const VIDEO_SIZE = {
|
||||
width: VIEWPORT.width,
|
||||
height: VIEWPORT.height,
|
||||
};
|
||||
export const WEBM_BITRATE = process.env.WEBM_BITRATE ?? (CAPTURE_SCALE > 1 ? '18M' : '8M');
|
||||
export const WEBM_BITRATE = requiredEnv("WEBM_BITRATE");
|
||||
|
||||
// Cold-open prompt. Punchy version of the user's intent, short enough to type
|
||||
// on camera without making the opening scene drag.
|
||||
export const PROMPT_TEXT =
|
||||
process.env.PROMPT_TEXT ?? 'Flats or terraces <£450k, 35 min to Manchester, low crime';
|
||||
export const PROMPT_TEXT = requiredEnv("PROMPT_TEXT");
|
||||
|
||||
// Filters returned by the AI stub. Keys MUST match real feature names from
|
||||
// /api/features (verified against the running server's schema).
|
||||
export const STUBBED_FILTERS: Record<string, [number, number] | string[]> = {
|
||||
'Property type': ['Flats/Maisonettes', 'Terraced'],
|
||||
'Estimated current price': [175000, 450000],
|
||||
'Serious crime per 1k residents (avg/yr)': [0, 55],
|
||||
'Noise (dB)': [50, 68],
|
||||
"Property type": ["Flats/Maisonettes", "Terraced"],
|
||||
"Estimated current price": [175000, 450000],
|
||||
"Serious crime per 1k residents (avg/yr)": [0, 55],
|
||||
"Noise (dB)": [50, 68],
|
||||
};
|
||||
|
||||
// Travel-time filters returned by the AI stub. Slug matches the real
|
||||
// /api/travel-destinations?mode=transit response.
|
||||
export const STUBBED_TRAVEL_TIME_FILTERS: {
|
||||
mode: 'transit' | 'car' | 'bicycle' | 'walking';
|
||||
mode: "transit" | "car" | "bicycle" | "walking";
|
||||
slug: string;
|
||||
label: string;
|
||||
min?: number;
|
||||
max?: number;
|
||||
}[] = [
|
||||
{
|
||||
mode: 'transit',
|
||||
slug: 'manchester',
|
||||
label: 'Manchester city centre',
|
||||
mode: "transit",
|
||||
slug: "manchester",
|
||||
label: "Manchester city centre",
|
||||
max: 35,
|
||||
},
|
||||
];
|
||||
|
|
@ -55,7 +75,7 @@ export const TT_DRAG_TO_MIN = 20;
|
|||
|
||||
// Cold-open zoom: how aggressively to magnify the AI box.
|
||||
// 2.4 fills most of the viewport with the prompt card without blowing up text.
|
||||
export const AI_ZOOM_SCALE = Number(process.env.AI_ZOOM_SCALE ?? 2.4);
|
||||
export const AI_ZOOM_SCALE = requiredNumberEnv("AI_ZOOM_SCALE");
|
||||
|
||||
// Initial map view used while we navigate. The AI scene zooms in on the
|
||||
// sidebar so this only matters once we zoom out.
|
||||
|
|
@ -67,13 +87,18 @@ export const INITIAL_MAP_VIEW = {
|
|||
|
||||
// Verification guard only. The renderer does not use this as an editing cap:
|
||||
// if the storyboard needs more than 15 seconds to avoid jumps, keep the frames.
|
||||
export const MAX_DURATION_S = Number(process.env.MAX_DURATION_S ?? 45);
|
||||
export const MIN_DURATION_S = Number(process.env.MIN_DURATION_S ?? 10);
|
||||
export const MAX_DURATION_S = requiredNumberEnv("MAX_DURATION_S");
|
||||
export const MIN_DURATION_S = requiredNumberEnv("MIN_DURATION_S");
|
||||
|
||||
// Target fps of the FINAL output.
|
||||
export const OUTPUT_FPS = Number(process.env.OUTPUT_FPS ?? 50);
|
||||
export const OUTPUT_FPS = requiredNumberEnv("OUTPUT_FPS");
|
||||
|
||||
// Frames of head-room kept in front of sceneStart when trimming. Shared by
|
||||
// the video trim and the narration manifest so cue offsets line up with the
|
||||
// trimmed timeline.
|
||||
export const LEAD_IN_S = 0.12;
|
||||
|
||||
// Brand strings for the outro card.
|
||||
export const BRAND_NAME = 'Perfect Postcode';
|
||||
export const BRAND_TAGLINE = 'Find where you actually want to live.';
|
||||
export const BRAND_URL = 'https://perfect-postcode.co.uk';
|
||||
export const BRAND_NAME = "Perfect Postcode";
|
||||
export const BRAND_TAGLINE = "Find where you actually want to live.";
|
||||
export const BRAND_URL = "https://perfect-postcode.co.uk";
|
||||
|
|
|
|||
|
|
@ -20,8 +20,10 @@ export async function installCursor(page: Page): Promise<void> {
|
|||
pointer-events: none;
|
||||
z-index: 2147483646;
|
||||
transform: translate(-2px, -2px);
|
||||
transform-origin: 2px 2px;
|
||||
transition: transform 60ms linear, scale 120ms ease-out;
|
||||
will-change: transform;
|
||||
will-change: transform, scale;
|
||||
scale: 1;
|
||||
}
|
||||
#__demo-cursor svg {
|
||||
filter: drop-shadow(0 2px 4px rgba(0,0,0,0.35));
|
||||
|
|
@ -225,6 +227,30 @@ export async function showCaption(page: Page, text: string): Promise<void> {
|
|||
}, text);
|
||||
}
|
||||
|
||||
/**
 * Resize the injected demo cursor (`#__demo-cursor`) with an animated
 * transition.
 *
 * The cursor is resized through the standalone CSS `scale` property rather
 * than through `transform`, and the transition is rewritten inline on every
 * call so the caller picks the pace of each resize. Negative durations are
 * clamped to 0. If the cursor element is not in the page, nothing happens.
 *
 * @param page - Page whose cursor element should be resized.
 * @param scale - Target value for the CSS `scale` property.
 * @param durationMs - Duration of the scale transition in milliseconds.
 */
export async function setCursorScale(
  page: Page,
  scale: number,
  durationMs: number
): Promise<void> {
  // The callback is executed via page.evaluate, so it only uses the argument
  // object handed to it and never closes over outer variables.
  await page.evaluate(
    (opts) => {
      const el = document.getElementById('__demo-cursor');
      if (el) {
        const clampedMs = Math.max(0, opts.durationMs);
        el.style.transition =
          `transform 60ms linear, scale ${clampedMs}ms cubic-bezier(0.22, 1, 0.36, 1)`;
        el.style.scale = String(opts.scale);
      }
    },
    { scale, durationMs }
  );
}
|
||||
|
||||
export async function hideCaption(page: Page): Promise<void> {
|
||||
await page.evaluate(() => {
|
||||
document.getElementById('__demo-caption')?.classList.remove('visible');
|
||||
|
|
|
|||
|
|
@ -72,18 +72,31 @@ export async function smoothMove(
|
|||
|
||||
/**
|
||||
* "Fake" type: progressively set the textarea value, dispatching
|
||||
* React-compatible input events. This stays Node-driven so typing cadence is
|
||||
* stable even when the map is busy rendering.
|
||||
* React-compatible input events.
|
||||
*
|
||||
* Cadence is generated as a per-char weight ratio (so spaces and punctuation
|
||||
* read as natural pauses), then **rescaled** so that the sum of delays equals
|
||||
* `totalDurationMs` exactly. The runner depends on this: it budgets a
|
||||
* specific number of ms for the type step, and any divergence would cascade
|
||||
* into narration drift.
|
||||
*/
|
||||
export async function fakeType(
|
||||
page: Page,
|
||||
selector: string,
|
||||
text: string,
|
||||
delayMs: number
|
||||
totalDurationMs: number
|
||||
): Promise<void> {
|
||||
const steps = text.length;
|
||||
if (steps === 0) {
|
||||
if (totalDurationMs > 0) await sleep(totalDurationMs);
|
||||
return;
|
||||
}
|
||||
|
||||
const weights = computeTypingWeights(text);
|
||||
const weightSum = weights.reduce((a, b) => a + b, 0);
|
||||
const msPerWeight = totalDurationMs / weightSum;
|
||||
|
||||
for (let i = 1; i <= steps; i++) {
|
||||
const end = Math.ceil((text.length * i) / steps);
|
||||
await page.evaluate(
|
||||
({ selector, value }) => {
|
||||
const ta = document.querySelector(selector) as HTMLTextAreaElement | null;
|
||||
|
|
@ -97,28 +110,25 @@ export async function fakeType(
|
|||
setValue.call(ta, value);
|
||||
ta.dispatchEvent(new Event('input', { bubbles: true }));
|
||||
},
|
||||
{ selector, value: text.slice(0, end) }
|
||||
{ selector, value: text.slice(0, i) }
|
||||
);
|
||||
if (delayMs > 0 && i < steps) {
|
||||
await new Promise((resolve) =>
|
||||
setTimeout(resolve, humanTypingDelay(text[i - 1], text[i], i, delayMs))
|
||||
);
|
||||
if (i < steps) {
|
||||
const ms = Math.max(0, Math.round(weights[i - 1] * msPerWeight));
|
||||
if (ms > 0) await sleep(ms);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function humanTypingDelay(
|
||||
char: string,
|
||||
nextChar: string | undefined,
|
||||
index: number,
|
||||
baseDelayMs: number
|
||||
): number {
|
||||
function computeTypingWeights(text: string): number[] {
|
||||
const cadence = [0.82, 1.08, 0.94, 1.22, 0.88, 1.14, 0.98, 1.28];
|
||||
let delay = baseDelayMs * cadence[index % cadence.length];
|
||||
if (char === ' ') delay += baseDelayMs * 0.9;
|
||||
if (/[,.!?;:]/.test(char)) delay += baseDelayMs * 1.8;
|
||||
if (nextChar === ' ' && index % 4 === 0) delay += baseDelayMs * 0.55;
|
||||
return Math.round(delay);
|
||||
return Array.from(text, (char, index) => {
|
||||
let weight = cadence[index % cadence.length];
|
||||
if (char === ' ') weight += 0.9;
|
||||
if (/[,.!?;:]/.test(char)) weight += 1.8;
|
||||
const next = text[index + 1];
|
||||
if (next === ' ' && index % 4 === 0) weight += 0.55;
|
||||
return weight;
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
37
video/src/narration.ts
Normal file
37
video/src/narration.ts
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
import { writeFileSync } from 'node:fs';
|
||||
|
||||
export interface NarrationCue {
|
||||
text: string;
|
||||
videoTimeMs: number;
|
||||
durationMs: number;
|
||||
}
|
||||
|
||||
/**
 * In-memory collector for narration cues that can be written out as a JSON
 * manifest.
 *
 * Each cue carries its own explicit `videoTimeMs`; the log does not stamp
 * cues against a clock of its own. Cues with a negative `videoTimeMs` are
 * dropped on `add`.
 */
class NarrationLog {
  private cues: NarrationCue[] = [];

  /** Discard every cue collected so far. */
  reset(): void {
    this.cues = [];
  }

  /** Record a cue, ignoring any whose video time is negative. */
  add(cue: NarrationCue): void {
    const startsBeforeVideo = cue.videoTimeMs < 0;
    if (!startsBeforeVideo) {
      this.cues.push(cue);
    }
  }

  /**
   * Write `{ totalDurationMs, cues }` to `path` as pretty-printed JSON, with
   * cues ordered by ascending `videoTimeMs`.
   *
   * @returns The ordered cue list that was written. The internal list is
   *   left untouched (the sort runs on a copy).
   */
  flush(path: string, totalDurationMs: number): NarrationCue[] {
    const ordered = this.cues.slice();
    ordered.sort((left, right) => left.videoTimeMs - right.videoTimeMs);
    const json = JSON.stringify({ totalDurationMs, cues: ordered }, null, 2);
    writeFileSync(path, json);
    return ordered;
  }
}
|
||||
|
||||
export const narrationLog = new NarrationLog();
|
||||
32
video/src/preflight.ts
Normal file
32
video/src/preflight.ts
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
import { existsSync, mkdirSync, writeFileSync } from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import { OUTPUT_DIR } from './config.js';
|
||||
import { storyboard } from './storyboard.js';
|
||||
|
||||
/**
 * Write the narration script manifest (`narration-script.json`) into
 * OUTPUT_DIR.
 *
 * The manifest is a flat `{ items }` list built from `storyboard.cues`: one
 * entry per cue with its position (`cueIndex`), its trimmed text and its
 * `gapBeforeMs`. Only the storyboard and the filesystem are touched here;
 * no browser is involved.
 *
 * NOTE(review): the consumers of this file (the synth step and the runner's
 * index-based matching) are not visible from this function; confirm before
 * changing the manifest shape.
 */
function main(): void {
  const outputDirMissing = !existsSync(OUTPUT_DIR);
  if (outputDirMissing) {
    mkdirSync(OUTPUT_DIR, { recursive: true });
  }

  const scriptPath = join(OUTPUT_DIR, 'narration-script.json');
  const items = Array.from(storyboard.cues, (cue, cueIndex) => {
    const text = cue.text.trim();
    return { cueIndex, text, gapBeforeMs: cue.gapBeforeMs };
  });

  writeFileSync(scriptPath, JSON.stringify({ items }, null, 2));
  console.log(`Wrote ${items.length} narration cues to ${scriptPath}`);
}
|
||||
|
||||
main();
|
||||
|
|
@ -1,8 +1,10 @@
|
|||
import { existsSync, mkdirSync, statSync } from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import { AUTH_STATE_PATH, OUTPUT_DIR } from './config.js';
|
||||
import { AUTH_STATE_PATH, LEAD_IN_S, OUTPUT_DIR } from './config.js';
|
||||
import { assertHardwareWebGL, launchRecordingBrowser } from './browser.js';
|
||||
import { narrationLog } from './narration.js';
|
||||
import { installDemoRoutes } from './routes.js';
|
||||
import { storyboard } from './storyboard.js';
|
||||
import { prepareTimeline, runTimeline } from './timeline.js';
|
||||
import { trimRecording } from './video.js';
|
||||
|
||||
|
|
@ -37,7 +39,7 @@ async function main() {
|
|||
|
||||
await installDemoRoutes(page);
|
||||
const ctx = await prepareTimeline(page);
|
||||
const timeline = await runTimeline(ctx);
|
||||
const timeline = await runTimeline(ctx, storyboard);
|
||||
|
||||
await page.close();
|
||||
const rawPath = join(OUTPUT_DIR, 'recording.raw.webm');
|
||||
|
|
@ -54,6 +56,16 @@ async function main() {
|
|||
recordStartMs,
|
||||
...timeline,
|
||||
});
|
||||
|
||||
const totalDurationMs =
|
||||
timeline.sceneEndMs - timeline.sceneStartMs + LEAD_IN_S * 1000;
|
||||
const cues = narrationLog.flush(
|
||||
join(OUTPUT_DIR, 'narration.json'),
|
||||
totalDurationMs
|
||||
);
|
||||
console.log(
|
||||
`Wrote ${cues.length} narration cues to ${join(OUTPUT_DIR, 'narration.json')}`
|
||||
);
|
||||
console.log('Run "npm run encode" to produce output/recording.mp4');
|
||||
}
|
||||
|
||||
|
|
|
|||
275
video/src/runner.ts
Normal file
275
video/src/runner.ts
Normal file
|
|
@ -0,0 +1,275 @@
|
|||
import { existsSync, readFileSync } from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import type { Page } from 'playwright';
|
||||
import { LEAD_IN_S, OUTPUT_DIR } from './config.js';
|
||||
import {
|
||||
clearVignette,
|
||||
hideCaption,
|
||||
setCursorScale,
|
||||
showCaption,
|
||||
showOutro,
|
||||
zoomReset,
|
||||
zoomTo,
|
||||
} from './dom.js';
|
||||
import { fakeType, sleep, smoothDragSliderThumb, smoothMove } from './motion.js';
|
||||
import { narrationLog } from './narration.js';
|
||||
import type { Activity, Cue, ScriptCtx, Storyboard, Target } from './script.js';
|
||||
|
||||
export interface RunnerResult {
|
||||
/** Wall-clock when the first activity started. */
|
||||
sceneStartMs: number;
|
||||
/** Wall-clock when the last activity finished (after padding). */
|
||||
sceneEndMs: number;
|
||||
}
|
||||
|
||||
const MAP_ZOOM_WHEEL_DELTA = -120;
|
||||
const FALLBACK_MS_PER_WORD = 750;
|
||||
const FALLBACK_TAIL_BUFFER_MS = 800;
|
||||
|
||||
interface SynthCue {
|
||||
cueIndex: number;
|
||||
text: string;
|
||||
durationMs: number;
|
||||
}
|
||||
|
||||
/**
 * Play a whole storyboard against the recording page and report the scene's
 * time window.
 *
 * Order of work:
 *   1. reset the narration log and load per-cue synth data via
 *      `loadSynthIndex(storyboard)` (defined elsewhere in this file);
 *   2. run the optional `pre` activities;
 *   3. run every cue through `runCue`, pairing cue `i` with synth entry `i`;
 *   4. run the optional `post` activities.
 *
 * A shared `cursor` (milliseconds since scene start) is advanced by whatever
 * each `runStep` call returns and is mutated in place by `runCue`. The scene
 * end is derived from that cursor, not from a second clock reading.
 *
 * NOTE(review): how `loadSynthIndex` behaves when the audio manifest is
 * missing (the fallback constants above suggest a per-word estimate) is not
 * visible here; confirm there.
 *
 * @param ctx - Script context handed through to every activity.
 * @param storyboard - Pre/post activities and the ordered narration cues.
 * @returns `sceneStartMs` (wall-clock at start) and `sceneEndMs`
 *   (`sceneStartMs` plus the accumulated cursor).
 */
export async function runStoryboard(
  ctx: ScriptCtx,
  storyboard: Storyboard
): Promise<RunnerResult> {
  narrationLog.reset();

  const synth = loadSynthIndex(storyboard);
  const sceneStartMs = Date.now();
  const leadInMs = LEAD_IN_S * 1000;
  const cursor = { ms: 0 };

  // Shared by the pre and post phases: run steps back to back and let each
  // one report how far the cursor should move.
  const runPlainSteps = async (steps: readonly Activity[] = []): Promise<void> => {
    for (const step of steps) {
      cursor.ms += await runStep(ctx, step);
    }
  };

  await runPlainSteps(storyboard.pre);

  let cueIndex = 0;
  for (const cue of storyboard.cues) {
    await runCue(ctx, cue, synth[cueIndex], cursor, leadInMs);
    cueIndex += 1;
  }

  await runPlainSteps(storyboard.post);

  const sceneEndMs = sceneStartMs + cursor.ms;
  return { sceneStartMs, sceneEndMs };
}
|
||||
|
||||
/**
 * Play one narration cue: optional silent gap, caption on, `during`
 * activities padded out to the cue's audio length, caption off, then `tail`
 * activities.
 *
 * The cue is logged to `narrationLog` at the cursor position reached after
 * the gap (plus `leadInMs`). The cursor is then advanced by the larger of
 * the budgeted time and the real wall-clock time the cue took, so time spent
 * awaiting `showCaption` / `hideCaption` and timer overshoot is also
 * counted. Without that, each cue would leave the cursor slightly behind the
 * real recording and later cues' `videoTimeMs` would drift early.
 *
 * @param ctx - Script context; `ctx.page` receives the caption calls.
 * @param cue - Cue to play (text, gap, `during` and `tail` activities).
 * @param synth - Measured audio data for this cue (`durationMs`, `cueIndex`).
 * @param cursor - Milliseconds since scene start; mutated in place.
 * @param leadInMs - Offset added to the cursor when stamping the manifest.
 * @throws Error when the declared `during` budgets exceed the measured audio
 *   by more than 50ms. This is checked before any sleep, manifest entry or
 *   page interaction, so a bad storyboard fails without side effects.
 */
async function runCue(
  ctx: ScriptCtx,
  cue: Cue,
  synth: SynthCue,
  cursor: { ms: number },
  leadInMs: number
): Promise<void> {
  const measuredAudioMs = synth.durationMs;
  const during = cue.during ?? [];
  const declaredSum = during.reduce((s, a) => s + a.durationMs, 0);
  if (declaredSum > measuredAudioMs + 50) {
    throw new Error(
      `Cue ${synth.cueIndex} "${cue.text.slice(0, 40)}…" has ${declaredSum}ms of ` +
        `during activities but the measured audio is only ${measuredAudioMs}ms. ` +
        `Trim a during step, lengthen the cue text, or move work into tail.`
    );
  }

  if (cue.gapBeforeMs > 0) {
    const gapStart = Date.now();
    await sleep(cue.gapBeforeMs);
    // Never advance by less than the declared gap, but count any overshoot.
    cursor.ms += Math.max(cue.gapBeforeMs, Date.now() - gapStart);
  }

  narrationLog.add({
    text: cue.text,
    videoTimeMs: cursor.ms + leadInMs,
    durationMs: measuredAudioMs,
  });

  // From here on, track both the budgeted time and the real elapsed time.
  const cueStart = Date.now();
  let budgetedMs = 0;

  await showCaption(ctx.page, cue.text);

  // Time the during block as a whole: individual steps may overrun their
  // budgets, but what matters at the cue boundary is total wall-clock.
  const duringStart = Date.now();
  for (const step of during) {
    await runStep(ctx, step);
  }
  const duringElapsed = Date.now() - duringStart;
  if (duringElapsed < measuredAudioMs) {
    // Keep the caption up for as long as the audio plays.
    await sleep(measuredAudioMs - duringElapsed);
    budgetedMs += measuredAudioMs;
  } else {
    budgetedMs += duringElapsed;
  }

  await hideCaption(ctx.page);

  for (const step of cue.tail ?? []) {
    budgetedMs += await runStep(ctx, step);
  }

  cursor.ms += Math.max(budgetedMs, Date.now() - cueStart);
}
|
||||
|
||||
/**
 * Execute one activity and report how much wall-clock time it consumed.
 *
 * A step that finishes inside its declared budget is padded with a sleep and
 * reported as exactly `durationMs`. A step that takes longer is reported at
 * its real duration, and a drift line is logged once the overrun exceeds 50ms.
 *
 * @returns Milliseconds the caller should add to its timeline cursor.
 */
async function runStep(ctx: ScriptCtx, step: Activity): Promise<number> {
  const budgetMs = step.durationMs;

  const t0 = Date.now();
  await runActivity(ctx, step);
  const realMs = Date.now() - t0;

  // Under budget: idle for the remainder so the step occupies its full slot.
  const shortfallMs = budgetMs - realMs;
  if (shortfallMs > 0) {
    await sleep(shortfallMs);
    return budgetMs;
  }

  // Over budget: small overruns pass silently, larger ones are logged.
  const overTolerance = realMs > budgetMs + 50;
  if (overTolerance) {
    console.log(
      `[runner] step ${step.kind} ran ${realMs}ms over a ${step.durationMs}ms budget (drift +${realMs - step.durationMs}ms)`
    );
  }
  return realMs;
}
|
||||
|
||||
/**
 * Perform the side effect of a single activity, dispatching on `step.kind`.
 *
 * This function does not pad to `durationMs`; the caller (`runStep`) times
 * the call and pads or logs as needed.
 *
 * @throws Error if `step.kind` is not one of the handled activity kinds, or
 *         if a target cannot be resolved to screen coordinates.
 */
async function runActivity(ctx: ScriptCtx, step: Activity): Promise<void> {
  switch (step.kind) {
    case 'wait':
      // Pure pause: nothing to do here, runStep pads the full duration.
      return;
    case 'clearVignette':
      await clearVignette(ctx.page);
      return;
    case 'zoomTo': {
      const focus = await resolveTarget(ctx, step.target);
      await zoomTo(ctx.page, {
        scale: step.scale,
        focusX: focus.x,
        focusY: focus.y,
        durationMs: step.durationMs,
      });
      return;
    }
    case 'zoomReset':
      await zoomReset(ctx.page, step.durationMs);
      return;
    case 'cursorScale':
      await setCursorScale(ctx.page, step.scale, step.durationMs);
      return;
    case 'moveCursor': {
      const to = await resolveTarget(ctx, step.target);
      await smoothMove(ctx.page, ctx.cursor, to, { durationMs: step.durationMs });
      ctx.cursor = to;
      return;
    }
    case 'click': {
      const to = await resolveTarget(ctx, step.target);
      // Spend ~70% of the budget travelling (never less than 120ms); the
      // remainder is left for the click itself and runStep's padding.
      const moveMs = Math.max(120, Math.round(step.durationMs * 0.7));
      await smoothMove(ctx.page, ctx.cursor, to, { durationMs: moveMs });
      ctx.cursor = to;
      await ctx.page.mouse.click(to.x, to.y);
      return;
    }
    case 'type':
      await fakeType(ctx.page, step.selector, step.text, step.durationMs);
      return;
    case 'mapZoom': {
      const point = await resolveTarget(ctx, step.target);
      // NOTE(review): this moves the real pointer but leaves ctx.cursor
      // unchanged — confirm that is intended for the next smoothMove.
      await ctx.page.mouse.move(point.x, point.y);
      // Spread the wheel notches evenly across the declared duration.
      const perStepMs = Math.floor(step.durationMs / Math.max(1, step.steps));
      for (let i = 0; i < step.steps; i++) {
        await ctx.page.mouse.wheel(0, MAP_ZOOM_WHEEL_DELTA);
        if (perStepMs > 0) await sleep(perStepMs);
      }
      return;
    }
    case 'dragSlider':
      ctx.cursor = await smoothDragSliderThumb(
        ctx.page,
        step.thumbSelector,
        step.trackSelector,
        ctx.cursor,
        step.toFraction,
        step.durationMs
      );
      return;
    case 'submitForm':
      // requestSubmit() runs in the page; a missing form is a silent no-op.
      await ctx.page.evaluate((selector) => {
        document.querySelector<HTMLFormElement>(selector)?.requestSubmit();
      }, step.formSelector);
      return;
    case 'showOutro':
      await showOutro(ctx.page, step.brand, step.tagline, step.url);
      return;
    default: {
      // Exhaustiveness guard: adding a new Activity kind without a case
      // above becomes a compile error here, and a loud failure at runtime
      // instead of silently degrading into a `wait`.
      const unhandled: never = step;
      throw new Error(`[runner] unhandled activity: ${JSON.stringify(unhandled)}`);
    }
  }
}
|
||||
|
||||
/**
 * Turn a declarative `Target` into concrete screen coordinates.
 *
 * - `point`   : returned as given.
 * - `hexagon` : the first entry of `ctx.dashboard.visibleHexagonTargets(1)`.
 * - otherwise : the centre of the bounding box of `target.selector`.
 *
 * @throws Error when no hexagon is visible or the selector has no bounding box.
 */
async function resolveTarget(
  ctx: ScriptCtx,
  target: Target
): Promise<{ x: number; y: number }> {
  switch (target.kind) {
    case 'point':
      return { x: target.x, y: target.y };

    case 'hexagon': {
      const candidates = await ctx.dashboard.visibleHexagonTargets(1);
      if (candidates.length === 0) {
        throw new Error('No visible hexagon to target');
      }
      const first = candidates[0];
      return { x: first.x, y: first.y };
    }

    default: {
      const rect = await ctx.page.locator(target.selector).boundingBox();
      if (!rect) {
        throw new Error(`No bounding box for selector: ${target.selector}`);
      }
      const centreX = rect.x + rect.width / 2;
      const centreY = rect.y + rect.height / 2;
      return { x: centreX, y: centreY };
    }
  }
}
|
||||
|
||||
/**
 * Load synth's measured cue durations, index-aligned with `storyboard.cues`.
 *
 * When `<OUTPUT_DIR>/audio/index.json` exists, each storyboard cue is paired
 * with the manifest item carrying the same `cueIndex`. When the manifest is
 * missing (the `--no-audio` path), every cue gets a worst-case estimate
 * derived from its word count so the visual flow can still play without
 * speech to time against.
 *
 * @throws Error when the manifest exists but has no `items` array, or when
 *         it lacks an entry for one of the storyboard's cues.
 */
function loadSynthIndex(storyboard: Storyboard): SynthCue[] {
  const path = join(OUTPUT_DIR, 'audio', 'index.json');

  if (!existsSync(path)) {
    console.log(
      `[runner] no ${path} found — using worst-case fallback durations (${FALLBACK_MS_PER_WORD}ms/word + ${FALLBACK_TAIL_BUFFER_MS}ms buffer). Audio will be missing.`
    );
    return storyboard.cues.map((cue, cueIndex) => ({
      cueIndex,
      text: cue.text,
      durationMs:
        cue.text.split(/\s+/).filter(Boolean).length * FALLBACK_MS_PER_WORD +
        FALLBACK_TAIL_BUFFER_MS,
    }));
  }

  const raw = JSON.parse(readFileSync(path, 'utf-8')) as {
    items?: SynthCue[];
  } | null;
  // A truncated or hand-edited manifest may parse as JSON yet lack `items`;
  // report that explicitly instead of failing on `undefined.map`.
  const manifestItems = raw?.items;
  if (!Array.isArray(manifestItems)) {
    throw new Error(
      `Synth manifest ${path} has no "items" array. ` +
        `Re-run preflight + synth so the audio matches the storyboard.`
    );
  }

  const byIndex = new Map(manifestItems.map((it) => [it.cueIndex, it] as const));
  return storyboard.cues.map((cue, i) => {
    const m = byIndex.get(i);
    if (!m) {
      throw new Error(
        `Synth manifest is missing cue ${i} ("${cue.text.slice(0, 40)}…"). ` +
          `Re-run preflight + synth so the audio matches the storyboard.`
      );
    }
    return m;
  });
}
|
||||
|
||||
export type { Page };
|
||||
109
video/src/script.ts
Normal file
109
video/src/script.ts
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
import type { Page } from 'playwright';
|
||||
import type { DashboardRecorder } from './dashboard.js';
|
||||
|
||||
/**
|
||||
* Public scripting API for the demo video.
|
||||
*
|
||||
* The storyboard is a `Storyboard` — an ordered list of narration cues, each
|
||||
* carrying the activities that play alongside it. Audio is generated FIRST
|
||||
* (one batched Qwen call so the voice stays consistent across cues); the
|
||||
* runner then reads the measured per-cue durations and slots `during`
|
||||
* activities inside each cue's audio window.
|
||||
*
|
||||
* Why cue-anchored: the audio drives pacing. Re-running synth produces a new
|
||||
* set of measured durations and the storyboard self-aligns — you don't have
|
||||
* to retune activity numbers. Author intent stays declarative ("zoom + type
|
||||
* happen during this cue, dwell 4s after, then next cue starts").
|
||||
*/
|
||||
|
||||
export interface ScriptCtx {
|
||||
page: Page;
|
||||
dashboard: DashboardRecorder;
|
||||
cursor: { x: number; y: number };
|
||||
}
|
||||
|
||||
/** A point on screen, either absolute pixel coords or the centre of an element. */
|
||||
export type Target =
|
||||
| { kind: 'point'; x: number; y: number }
|
||||
| { kind: 'element'; selector: string }
|
||||
/**
|
||||
* Resolved at runtime to the centre of a visible hexagon/postcode polygon,
|
||||
* picked from the dashboard's most recent map response. Robust to any zoom
|
||||
* level — use this when the click MUST land on a polygon and a fixed pixel
|
||||
* coordinate would risk landing on a road or river at deep zoom.
|
||||
*/
|
||||
| { kind: 'hexagon' };
|
||||
|
||||
/** Build a `point` target from absolute pixel coordinates. */
export function at(x: number, y: number): Target {
  return { kind: 'point', x, y };
}
|
||||
/** Build an `element` target that resolves to the centre of `selector`. */
export function el(selector: string): Target {
  return { kind: 'element', selector };
}
|
||||
/** Build a `hexagon` target, resolved at runtime to a visible polygon. */
export function hex(): Target {
  return { kind: 'hexagon' };
}
|
||||
|
||||
/**
|
||||
* Activities are the runner's atomic operations. Each one has a fixed
|
||||
* `durationMs` budget; the runner pads steps that finish early and logs a warning when one overruns.
|
||||
*/
|
||||
export type Activity =
|
||||
/** Pure pause. Useful for spacing. */
|
||||
| { kind: 'wait'; durationMs: number }
|
||||
/** Smoothly zoom the dashboard wrapper so `target` lands at viewport centre. */
|
||||
| { kind: 'zoomTo'; target: Target; scale: number; durationMs: number }
|
||||
/** Animate the wrapper back to identity. */
|
||||
| { kind: 'zoomReset'; durationMs: number }
|
||||
/** Slide the cursor from its current position to `target`. */
|
||||
| { kind: 'moveCursor'; target: Target; durationMs: number }
|
||||
/** Move + click + ripple. `durationMs` is the whole gesture, including settle. */
|
||||
| { kind: 'click'; target: Target; durationMs: number }
|
||||
/** Type into a textarea/input over exactly `durationMs`. */
|
||||
| { kind: 'type'; selector: string; text: string; durationMs: number }
|
||||
/** Grow or shrink the visible cursor (CSS scale). */
|
||||
| { kind: 'cursorScale'; scale: number; durationMs: number }
|
||||
/**
|
||||
* Wheel-zoom the underlying map at `target`. `steps` controls intensity
|
||||
* (each step is one ~120px wheel notch).
|
||||
*/
|
||||
| { kind: 'mapZoom'; target: Target; steps: number; durationMs: number }
|
||||
/** Drag the right thumb of a Radix slider to a fraction in [0,1]. */
|
||||
| {
|
||||
kind: 'dragSlider';
|
||||
thumbSelector: string;
|
||||
trackSelector: string;
|
||||
toFraction: number;
|
||||
durationMs: number;
|
||||
}
|
||||
/** Submit a form found by selector and wait `durationMs`. */
|
||||
| { kind: 'submitForm'; formSelector: string; durationMs: number }
|
||||
/** Reveal the closing brand card. */
|
||||
| { kind: 'showOutro'; brand: string; tagline: string; url: string; durationMs: number }
|
||||
/** Fade away the opening vignette. */
|
||||
| { kind: 'clearVignette'; durationMs: number };
|
||||
|
||||
/**
|
||||
* A narration cue + the activities that play alongside it.
|
||||
*
|
||||
* gapBeforeMs : silent wall-time before the caption appears (= silence in
|
||||
* audio between the previous cue ending and this one).
|
||||
* during : activities that play WHILE the caption is on screen. The
|
||||
* sum of declared durations must be ≤ the measured audio
|
||||
* duration; the runner pads short blocks so the caption stays
|
||||
* on for the full cue. Sum > measured is a hard error.
|
||||
* tail : activities that run AFTER the caption hides, before the
|
||||
* next cue's gapBefore starts. Use it for dwells/transitions
|
||||
* that aren't tied to spoken words.
|
||||
*/
|
||||
export interface Cue {
|
||||
text: string;
|
||||
gapBeforeMs: number;
|
||||
during?: Activity[];
|
||||
tail?: Activity[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Top-level storyboard. `pre` runs once before the first cue's gapBefore;
|
||||
* `post` runs once after the last cue's tail finishes. The cue list is what
|
||||
* gets handed to the synth step.
|
||||
*/
|
||||
export interface Storyboard {
|
||||
pre?: Activity[];
|
||||
cues: Cue[];
|
||||
post?: Activity[];
|
||||
}
|
||||
170
video/src/storyboard.ts
Normal file
170
video/src/storyboard.ts
Normal file
|
|
@ -0,0 +1,170 @@
|
|||
import {
|
||||
AI_ZOOM_SCALE,
|
||||
BRAND_NAME,
|
||||
BRAND_TAGLINE,
|
||||
BRAND_URL,
|
||||
PROMPT_TEXT,
|
||||
TT_CARD_SELECTOR,
|
||||
TT_DRAG_TO_MIN,
|
||||
TT_SLIDER_MAX,
|
||||
} from './config.js';
|
||||
import { el, type Storyboard } from './script.js';
|
||||
|
||||
/**
|
||||
* The demo video, top to bottom.
|
||||
*
|
||||
* Audio is generated first (one batched Qwen call), so each cue's actual
|
||||
* duration is known before recording. The runner sizes each cue's wall-time
|
||||
* to the measured audio length, padding short `during` blocks with a
|
||||
* trailing wait. Inter-cue spacing is controlled here via `gapBeforeMs`
|
||||
* (silence in audio) plus optional `tail` activities (visual movement after
|
||||
* the caption hides, before the next cue's gap).
|
||||
*
|
||||
* Sum of `during` declared durations MUST be ≤ measured cue duration. If
|
||||
* synth comes back tighter than the activities can fit, the runner throws
|
||||
* with a pointer to the offending cue — lengthen that cue's text, trim a
|
||||
* during step, or move work into its tail.
|
||||
*
|
||||
* Reference durations (Qwen3-TTS / speaker=ryan, 2026-05-09 measured):
|
||||
* cue 0 1920ms "Describe the life you want."
|
||||
* cue 1 2720ms "Every matching neighbourhood, side by side."
|
||||
* cue 2 2160ms "Tighten the commute to 20 minutes."
|
||||
* cue 3 1840ms "Drill into a single block."
|
||||
* cue 4 4480ms "Stats, listings, Street View, price history…"
|
||||
* cue 5 1760ms "Take the shortlist into Excel."
|
||||
* cue 6 4400ms "Perfect Postcode. Find where you actually want to live."
|
||||
*/
|
||||
/** A pure pause of `durationMs`; a fresh object on every call. */
const pause = (durationMs: number) => ({ kind: 'wait' as const, durationMs });

/**
 * The fixed map pixel used for both the deep zoom and the follow-up click in
 * scene 4a. A factory, so each activity keeps its own target object.
 */
const drillPoint = () => ({ kind: 'point' as const, x: 1140, y: 605 });

export const storyboard: Storyboard = {
  // Silent setup: the camera pushes in to the AI box before the first
  // caption, which keeps the cold open from feeling rushed.
  pre: [
    { kind: 'clearVignette', durationMs: 0 },
    pause(200),
    {
      kind: 'zoomTo',
      target: el('[data-tutorial="ai-filters"]'),
      scale: AI_ZOOM_SCALE,
      durationMs: 1300,
    },
    pause(140),
  ],

  cues: [
    // -- Scene 1: AI prompt ----------------------------------------------
    // Short cue (1920ms reference): the caption plays alone, then typing and
    // submit happen silently in the tail — hear the brief, then watch the
    // prompt being typed.
    {
      text: 'Describe the life you want.',
      gapBeforeMs: 0,
      tail: [
        pause(140),
        {
          kind: 'type',
          selector: '[data-tutorial="ai-filters"] textarea',
          text: PROMPT_TEXT,
          durationMs: 3000,
        },
        pause(140),
        {
          kind: 'submitForm',
          formSelector: '[data-tutorial="ai-filters"] form',
          durationMs: 1700,
        },
        pause(700),
      ],
    },

    // -- Scene 2: zoom out reveal ---------------------------------------
    {
      text: 'Every matching neighbourhood, side by side.',
      gapBeforeMs: 400,
      during: [{ kind: 'zoomReset', durationMs: 1400 }],
      tail: [pause(1200)],
    },

    // -- Scene 3: travel-time slider ------------------------------------
    {
      text: `Tighten the commute to ${TT_DRAG_TO_MIN} minutes.`,
      gapBeforeMs: 500,
      during: [
        {
          kind: 'dragSlider',
          thumbSelector: `${TT_CARD_SELECTOR} [role="slider"] >> nth=1`,
          trackSelector: `${TT_CARD_SELECTOR} [data-orientation="horizontal"] >> nth=0`,
          toFraction: TT_DRAG_TO_MIN / TT_SLIDER_MAX,
          durationMs: 1400,
        },
      ],
      tail: [pause(1200)],
    },

    // -- Scene 4a: deep zoom into a hexagon -----------------------------
    // Cursor prep (200ms) plus mapZoom (1500ms) fit inside the 1840ms
    // reference cue; the click and the payoff dwell live in the tail.
    {
      text: 'Drill into a single block.',
      gapBeforeMs: 500,
      during: [
        { kind: 'cursorScale', scale: 1.4, durationMs: 200 },
        {
          kind: 'mapZoom',
          target: drillPoint(),
          steps: 18,
          durationMs: 1500,
        },
      ],
      tail: [
        // Give the post-zoom /api/postcodes response and a redraw time to
        // land before clicking — otherwise the click can fire on a stale
        // frame and miss the polygon.
        pause(1200),
        {
          kind: 'click',
          target: drillPoint(),
          durationMs: 700,
        },
        { kind: 'cursorScale', scale: 1, durationMs: 280 },
        // Linger so the climax cue lands on the right-pane reveal.
        pause(1500),
      ],
    },

    // -- Scene 4b: right-pane payoff -----------------------------------
    // Long cue (4480ms reference) with no during: the camera holds on the
    // populated right pane for the whole line, then dwells before export.
    {
      text: 'Stats, listings, Street View, price history — all in one pane.',
      gapBeforeMs: 0,
      tail: [pause(1200)],
    },

    // -- Scene 5: export ------------------------------------------------
    // zoomReset + click total 1700ms against a 1760ms reference cue.
    {
      text: 'Take the shortlist into Excel.',
      gapBeforeMs: 500,
      during: [
        { kind: 'zoomReset', durationMs: 900 },
        {
          kind: 'click',
          target: el('button[title="Export to Excel"]'),
          durationMs: 800,
        },
      ],
      tail: [pause(800)],
    },

    // -- Scene 6: outro -------------------------------------------------
    {
      text: `${BRAND_NAME}. ${BRAND_TAGLINE}`,
      gapBeforeMs: 600,
      during: [
        {
          kind: 'showOutro',
          brand: BRAND_NAME,
          tagline: BRAND_TAGLINE,
          url: BRAND_URL,
          durationMs: 0,
        },
      ],
      tail: [pause(1500)],
    },
  ],
};
|
||||
|
|
@ -1,24 +1,19 @@
|
|||
import type { Page } from 'playwright';
|
||||
import { installCursor, installZoomWrapper } from './dom.js';
|
||||
import { DashboardRecorder } from './dashboard.js';
|
||||
import { installCursor, installZoomWrapper } from './dom.js';
|
||||
import { sleep } from './motion.js';
|
||||
import { dashboardUrl } from './routes.js';
|
||||
import {
|
||||
prepareAiBox,
|
||||
sceneAiCloseUp,
|
||||
sceneClusterClick,
|
||||
sceneExportAndOutro,
|
||||
sceneTravelTimeSlider,
|
||||
sceneZoomOutResults,
|
||||
type SceneCtx,
|
||||
} from './scenes.js';
|
||||
import { runStoryboard, type RunnerResult } from './runner.js';
|
||||
import type { ScriptCtx, Storyboard } from './script.js';
|
||||
|
||||
export interface TimelineResult {
|
||||
sceneStartMs: number;
|
||||
sceneEndMs: number;
|
||||
}
|
||||
export type TimelineResult = RunnerResult;
|
||||
|
||||
export async function prepareTimeline(page: Page): Promise<SceneCtx> {
|
||||
/**
|
||||
* Boot the dashboard, wait for the first map response, and inject the
|
||||
* recording chrome (cursor, zoom wrapper, caption layer). Also opens the
|
||||
* AI prompt textarea so the storyboard can begin typing immediately.
|
||||
*/
|
||||
export async function prepareTimeline(page: Page): Promise<ScriptCtx> {
|
||||
const dashboard = new DashboardRecorder(page);
|
||||
const initialMapVersion = dashboard.getMapDataVersion();
|
||||
await page.goto(dashboardUrl(), { waitUntil: 'domcontentloaded' });
|
||||
|
|
@ -29,33 +24,46 @@ export async function prepareTimeline(page: Page): Promise<SceneCtx> {
|
|||
await page.locator('canvas').first().waitFor({ state: 'attached', timeout: 15000 });
|
||||
await dashboard.waitForMapSettled(initialMapVersion, 15000);
|
||||
|
||||
await new Promise((r) => setTimeout(r, 400));
|
||||
await sleep(400);
|
||||
await installZoomWrapper(page);
|
||||
await installCursor(page);
|
||||
|
||||
const ctx: SceneCtx = { page, dashboard, cursor: { x: 200, y: 240 } };
|
||||
const ctx: ScriptCtx = { page, dashboard, cursor: { x: 200, y: 240 } };
|
||||
await page.mouse.move(ctx.cursor.x, ctx.cursor.y);
|
||||
await prepareAiBox(ctx);
|
||||
await sleep(80);
|
||||
return ctx;
|
||||
}
|
||||
|
||||
export async function runTimeline(ctx: SceneCtx): Promise<TimelineResult> {
|
||||
const sceneStartMs = Date.now();
|
||||
let mark = sceneStartMs;
|
||||
|
||||
mark = await runScene('AI close-up', mark, () => sceneAiCloseUp(ctx));
|
||||
mark = await runScene('Zoom out', mark, () => sceneZoomOutResults(ctx));
|
||||
mark = await runScene('TT slider', mark, () => sceneTravelTimeSlider(ctx));
|
||||
mark = await runScene('Cluster click', mark, () => sceneClusterClick(ctx));
|
||||
mark = await runScene('Export + outro', mark, () => sceneExportAndOutro(ctx));
|
||||
|
||||
return { sceneStartMs, sceneEndMs: mark };
|
||||
export async function runTimeline(
|
||||
ctx: ScriptCtx,
|
||||
storyboard: Storyboard
|
||||
): Promise<TimelineResult> {
|
||||
return runStoryboard(ctx, storyboard);
|
||||
}
|
||||
|
||||
async function runScene(label: string, prev: number, scene: () => Promise<void>): Promise<number> {
|
||||
await scene();
|
||||
const now = Date.now();
|
||||
console.log(`[scene] ${label}: ${((now - prev) / 1000).toFixed(2)}s wall`);
|
||||
return now;
|
||||
/**
|
||||
* Open the AI prompt before the timed scene starts. This is preparation
|
||||
* work, not part of the storyboard, because waiting for the textarea to
|
||||
* appear has indeterminate duration.
|
||||
*/
|
||||
async function prepareAiBox(ctx: ScriptCtx): Promise<void> {
  const page = ctx.page;

  const panel = page.locator('[data-tutorial="ai-filters"]').first();
  await panel.waitFor({ state: 'visible', timeout: 15000 });

  const promptBox = page.locator('[data-tutorial="ai-filters"] textarea');
  // A failed visibility probe counts as "not open yet" rather than an error.
  const promptIsOpen = (): Promise<boolean> => promptBox.isVisible().catch(() => false);

  // First attempt: a real mouse click at the centre of the panel's first button.
  if (!(await promptIsOpen())) {
    const opener = panel.locator('button').first();
    await opener.waitFor({ state: 'visible', timeout: 8000 });
    const rect = await opener.boundingBox();
    if (rect) {
      const centreX = rect.x + rect.width / 2;
      const centreY = rect.y + rect.height / 2;
      await page.mouse.click(centreX, centreY);
    }
  }

  // Second attempt: click the button from inside the page if the textarea
  // is still not visible.
  if (!(await promptIsOpen())) {
    await page.evaluate(() => {
      document.querySelector<HTMLElement>('[data-tutorial="ai-filters"] button')?.click();
    });
  }

  // Either way, the textarea must be visible before the timed scene begins.
  await promptBox.waitFor({ state: 'visible', timeout: 15000 });
  await sleep(100);
}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,6 @@
|
|||
import { execSync } from 'node:child_process';
|
||||
import { renameSync, statSync } from 'node:fs';
|
||||
import { MAX_DURATION_S, OUTPUT_FPS, VIDEO_SIZE, WEBM_BITRATE } from './config.js';
|
||||
|
||||
const LEAD_IN_S = 0.12;
|
||||
import { LEAD_IN_S, MAX_DURATION_S, OUTPUT_FPS, VIDEO_SIZE, WEBM_BITRATE } from './config.js';
|
||||
|
||||
export function trimRecording(
|
||||
rawPath: string,
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@
|
|||
"skipLibCheck": true,
|
||||
"forceConsistentCasingInFileNames": true,
|
||||
"resolveJsonModule": true,
|
||||
"types": ["node"],
|
||||
"declaration": false,
|
||||
"sourceMap": true
|
||||
},
|
||||
|
|
|
|||
188
video/tts/mux.py
Normal file
188
video/tts/mux.py
Normal file
|
|
@ -0,0 +1,188 @@
|
|||
"""Mux per-cue WAVs into recording.mp4 at their narration offsets.
|
||||
|
||||
Reads two manifests:
|
||||
|
||||
* ``output/audio/index.json`` (synth output) — per-cue WAV filename + measured
|
||||
duration. Generated BEFORE recording in one batched Qwen3-TTS call.
|
||||
* ``output/narration.json`` (recorder output) — per-cue ``videoTimeMs`` against
|
||||
the trimmed video. Generated DURING recording.
|
||||
|
||||
Joins them by ``cueIndex`` (index in the cue list, 1:1 between manifests),
|
||||
runs ffmpeg with one ``adelay`` per cue plus a single ``amix``, copies the
|
||||
video stream, and writes ``output/recording.narrated.mp4``.
|
||||
|
||||
Run from the ``video/`` directory after recording:
|
||||
|
||||
uv run --project tts python tts/mux.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the mux step's command-line options.

    Returns a namespace with ``audio_dir``, ``narration``, ``video`` and
    ``out`` (all ``Path``) plus the boolean ``replace`` flag.
    """
    parser = argparse.ArgumentParser(description=__doc__)

    # (flag, default path, help text) — registered in this order.
    path_options = (
        ("--audio-dir", "output/audio", None),
        (
            "--narration",
            "output/narration.json",
            "Per-cue videoTimeMs manifest written by the recorder.",
        ),
        ("--video", "output/recording.mp4", None),
        ("--out", "output/recording.narrated.mp4", None),
    )
    for flag, default_path, help_text in path_options:
        parser.add_argument(flag, type=Path, default=Path(default_path), help=help_text)

    parser.add_argument(
        "--replace",
        action="store_true",
        help="After muxing, atomically replace --video with --out.",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Join the synth and recorder manifests and mux narration into the video.

    Returns a process exit code: 0 on success (including the "synth produced
    no cues" copy path), 1 when a prerequisite is missing or the manifests do
    not line up, or ffmpeg's own exit code when the mux fails. Overlapping
    cues abort with ``SystemExit`` and a message listing every overlap.
    """
    args = parse_args()

    if not shutil.which("ffmpeg"):
        print("[mux] ffmpeg not on PATH", file=sys.stderr)
        return 1

    audio_index_path = args.audio_dir / "index.json"
    if not audio_index_path.exists():
        print(
            f"[mux] {audio_index_path} not found; run tts/synth.py first",
            file=sys.stderr,
        )
        return 1

    if not args.narration.exists():
        print(
            f"[mux] {args.narration} not found; the recorder must run before mux",
            file=sys.stderr,
        )
        return 1

    if not args.video.exists():
        print(f"[mux] video not found: {args.video}", file=sys.stderr)
        return 1

    audio_index = json.loads(audio_index_path.read_text())
    # Only items that actually name a WAV file can be muxed.
    audio_items = [it for it in audio_index.get("items", []) if it.get("wav")]
    if not audio_items:
        print("[mux] synth produced no cues; copying video unchanged", file=sys.stderr)
        shutil.copyfile(args.video, args.out)
        return 0

    narration = json.loads(args.narration.read_text())
    nar_cues = list(narration.get("cues", []))
    if len(nar_cues) != len(audio_items):
        print(
            f"[mux] cue count mismatch: synth has {len(audio_items)} cues, "
            f"recorder logged {len(nar_cues)}. Re-run preflight + synth + record.",
            file=sys.stderr,
        )
        return 1

    # Index synth items by cueIndex so each recorder cue (logged in cue
    # order) is paired with its own WAV regardless of manifest ordering.
    audio_by_index = {int(it["cueIndex"]): it for it in audio_items}
    items = []
    for i, nar in enumerate(nar_cues):
        audio = audio_by_index.get(i)
        if audio is None:
            print(f"[mux] no synth wav for cue {i}", file=sys.stderr)
            return 1
        items.append(
            {
                "cueIndex": i,
                "wav": audio["wav"],
                "durationMs": int(audio["durationMs"]),
                "videoTimeMs": int(nar["videoTimeMs"]),
                "text": nar.get("text", ""),
            }
        )

    # Fail with a clear message if a listed WAV is gone, rather than letting
    # ffmpeg report a bare "No such file" for one of many inputs.
    missing = [it["wav"] for it in items if not (args.audio_dir / it["wav"]).exists()]
    if missing:
        print(
            f"[mux] missing WAV file(s) in {args.audio_dir}: {', '.join(missing)}; "
            "re-run tts/synth.py",
            file=sys.stderr,
        )
        return 1

    # Refuse to mux overlapping cues — amix would silently mash voices on top
    # of each other. Sort by start so the order matches what we'll actually
    # play, then check that each cue ends before the next one starts.
    ordered = sorted(items, key=lambda it: it["videoTimeMs"])
    overlaps: list[str] = []
    for prev, nxt in zip(ordered, ordered[1:]):
        prev_end = prev["videoTimeMs"] + prev["durationMs"]
        nxt_start = nxt["videoTimeMs"]
        if prev_end > nxt_start:
            overlaps.append(
                f"cue {prev['cueIndex']} ends at {prev_end}ms but cue {nxt['cueIndex']} "
                f"starts at {nxt_start}ms (overlap {prev_end - nxt_start}ms)"
            )
    if overlaps:
        raise SystemExit(
            "[mux] refusing to produce overlapping narration:\n  - "
            + "\n  - ".join(overlaps)
        )

    # Input 0 is the video; inputs 1..N are the cue WAVs in cue order.
    cmd: list[str] = ["ffmpeg", "-y", "-loglevel", "warning", "-i", str(args.video)]
    for it in items:
        cmd += ["-i", str(args.audio_dir / it["wav"])]

    filter_parts: list[str] = []
    mix_inputs: list[str] = []
    for n, it in enumerate(items, start=1):
        delay_ms = max(0, it["videoTimeMs"])
        label = f"a{n}"
        # adelay needs one delay per channel; "all=1" applies the same delay
        # to every channel, which is what we want for mono narration.
        filter_parts.append(
            f"[{n}:a]aresample=async=1,adelay={delay_ms}|{delay_ms}:all=1[{label}]"
        )
        mix_inputs.append(f"[{label}]")

    # apad appends silence after the last cue. Together with -shortest below
    # the output then ends with the VIDEO stream; without the padding,
    # -shortest would cut the video when the final cue's audio ends and drop
    # whatever plays after the last spoken line.
    mix = (
        f"{''.join(mix_inputs)}amix=inputs={len(items)}"
        f":duration=longest:dropout_transition=0:normalize=0,apad[aout]"
    )
    filter_complex = ";".join(filter_parts + [mix])

    cmd += [
        "-filter_complex",
        filter_complex,
        "-map",
        "0:v:0",
        "-map",
        "[aout]",
        "-c:v",
        "copy",
        "-c:a",
        "aac",
        "-b:a",
        "192k",
        "-shortest",
        "-movflags",
        "+faststart",
        str(args.out),
    ]

    print(f"[mux] muxing {len(items)} narration cues into {args.out}", flush=True)
    result = subprocess.run(cmd)
    if result.returncode != 0:
        print(f"[mux] ffmpeg exited {result.returncode}", file=sys.stderr)
        return result.returncode

    if args.replace:
        args.out.replace(args.video)
        print(f"[mux] replaced {args.video} with narrated copy", flush=True)

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
208
video/tts/synth.py
Normal file
208
video/tts/synth.py
Normal file
|
|
@ -0,0 +1,208 @@
|
|||
"""Synthesize the full narration in ONE batched Qwen3-TTS call.
|
||||
|
||||
Reads ``output/narration-script.json`` (emitted by ``dist/preflight.js``) and
|
||||
runs ``Qwen3TTSModel.generate_custom_voice`` with all cue texts as a single
|
||||
batched list — that way every cue shares the same model state, which keeps
|
||||
prosody and timbre consistent across cues. Per-cue WAVs and an index manifest
|
||||
go to ``output/audio/`` for the recording step (which reads measured cue
|
||||
durations) and the mux step (which drops each WAV at its videoTime).
|
||||
|
||||
Run from the ``video/`` directory:
|
||||
|
||||
uv run --project tts python tts/synth.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import soundfile as sf
|
||||
import torch
|
||||
from qwen_tts import Qwen3TTSModel
|
||||
|
||||
|
||||
DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
|
||||
DEFAULT_SPEAKER = "ryan"
|
||||
DEFAULT_LANGUAGE = "English"
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the synth step's command-line options.

    ``--model``, ``--speaker``, ``--language`` and ``--device`` take their
    defaults from the ``TTS_MODEL``, ``TTS_SPEAKER``, ``TTS_LANGUAGE`` and
    ``TTS_DEVICE`` environment variables when those are set.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    add = parser.add_argument
    env = os.environ.get

    # Input / output locations.
    add(
        "--script",
        type=Path,
        default=Path("output/narration-script.json"),
        help="Narration script emitted by dist/preflight.js.",
    )
    add(
        "--out-dir",
        type=Path,
        default=Path("output/audio"),
        help="Directory to write WAV files and index.json into.",
    )

    # Synthesis settings, each overridable through the environment.
    add("--model", default=env("TTS_MODEL", DEFAULT_MODEL))
    add(
        "--speaker",
        default=env("TTS_SPEAKER", DEFAULT_SPEAKER),
        help="CustomVoice preset speaker name (use --list-speakers to enumerate).",
    )
    add("--language", default=env("TTS_LANGUAGE", DEFAULT_LANGUAGE))
    add("--device", default=env("TTS_DEVICE", "cuda:0"))

    add(
        "--list-speakers",
        action="store_true",
        help="Load the model, print available speaker names, and exit.",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_model(model_id: str, device: str) -> Qwen3TTSModel:
    """Load the TTS model `model_id` onto `device`.

    Devices whose name starts with ``cuda`` are loaded in bfloat16; every
    other device is loaded in float32. The choice is logged before loading.

    Args:
        model_id: Model identifier handed to ``Qwen3TTSModel.from_pretrained``.
        device: Device string, used both as ``device_map`` and to pick the dtype.

    Returns:
        The loaded ``Qwen3TTSModel`` instance.
    """
    if device.startswith("cuda"):
        precision = torch.bfloat16
    else:
        precision = torch.float32
    print(f"[synth] loading {model_id} on {device} ({precision})", flush=True)
    return Qwen3TTSModel.from_pretrained(
        model_id,
        device_map=device,
        dtype=precision,
    )
|
||||
|
||||
|
||||
def _int_or_none(value):
    """Return ``int(value)``, or None when `value` cannot be converted."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


def cached_index_matches(
    index_path: Path,
    cues: list[dict],
    speaker: str,
    language: str,
) -> bool:
    """Return True iff index_path's cue list lines up with `cues` 1:1.

    Compared fields: ``cueIndex``, ``text``, ``gapBeforeMs`` plus the synth
    settings (``speaker``, ``language``). All cue WAV files must also exist
    on disk. Mismatched length, reordered cues, or a missing WAV invalidate
    the cache.

    A missing, unreadable, or malformed index (invalid JSON, a top-level value
    that is not an object, item entries that are not objects, non-numeric
    ``cueIndex``/``gapBeforeMs``, non-string ``wav``) is treated as a cache
    miss rather than an error, so a corrupt cache only costs a regeneration.

    Args:
        index_path: Path to the previously written ``index.json``.
        cues: Cues from the current narration script; each must carry
            ``cueIndex`` and ``text`` and may carry ``gapBeforeMs``.
        speaker: Speaker name the current run would synthesize with.
        language: Language the current run would synthesize with.

    Returns:
        True when the cached audio can be reused as-is, False otherwise.
    """
    try:
        # The index is JSON, so decode it as UTF-8 regardless of the locale.
        cached = json.loads(index_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return False
    if not isinstance(cached, dict):
        return False
    if cached.get("speaker") != speaker or cached.get("language") != language:
        return False

    cached_items = cached.get("items", [])
    if not isinstance(cached_items, list) or len(cached_items) != len(cues):
        return False

    for live, prev in zip(cues, cached_items):
        if not isinstance(prev, dict):
            return False
        # Values from the live script are converted strictly (a broken script
        # should surface); values from the cache are converted leniently.
        if int(live["cueIndex"]) != _int_or_none(prev.get("cueIndex")):
            return False
        if live["text"].strip() != str(prev.get("text", "")).strip():
            return False
        if int(live.get("gapBeforeMs", 0)) != _int_or_none(prev.get("gapBeforeMs")):
            return False
        wav = prev.get("wav")
        if not isinstance(wav, str) or not wav:
            return False
        # WAV names in the index are relative to the index's own directory.
        if not (index_path.parent / wav).is_file():
            return False
    return True
|
||||
|
||||
|
||||
def _index_model_matches(index_path: Path, model_id: str) -> bool:
    """Return True iff the index at `index_path` records `model_id` as its model."""
    try:
        cached = json.loads(index_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # Missing, unreadable, or non-JSON index: nothing to reuse.
        return False
    return isinstance(cached, dict) and cached.get("model") == model_id


def main() -> int:
    """Synthesize one WAV per narration cue and write ``index.json``.

    Reads the narration script, skips synthesis when the audio already on disk
    was produced from the same cues with the same model/speaker/language, and
    otherwise generates every cue in a single batched model call.

    Returns:
        Process exit status: 0 on success (including a cache hit and
        ``--list-speakers``); 1 when the script is missing or unreadable,
        contains no cues, or the model returns a different number of clips
        than there are cues.
    """
    args = parse_args()

    if args.list_speakers:
        model = load_model(args.model, args.device)
        speakers = model.get_supported_speakers()
        print(json.dumps(speakers, indent=2, ensure_ascii=False))
        return 0

    if not args.script.exists():
        print(f"[synth] script not found: {args.script}", file=sys.stderr)
        return 1

    try:
        # Decode explicitly as UTF-8: narration text can contain non-ASCII
        # characters and must not depend on the locale's default encoding.
        script = json.loads(args.script.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        print(f"[synth] cannot read script {args.script}: {exc}", file=sys.stderr)
        return 1

    raw_items = script.get("items", []) if isinstance(script, dict) else []
    # Keep only cues with non-blank text; a null/missing text counts as blank.
    cues = [
        c
        for c in raw_items
        if isinstance(c, dict) and str(c.get("text") or "").strip()
    ]
    if not cues:
        print("[synth] script has no cues; nothing to generate.", file=sys.stderr)
        return 1

    args.out_dir.mkdir(parents=True, exist_ok=True)
    index_path = args.out_dir / "index.json"

    # Skip generation when the existing audio matches the script — same cue
    # texts and same gapBeforeMs values in the same order. Saves ~30s of GPU
    # time when iterating on activity timing without changing narration.
    # The model is part of the cache key too: audio rendered by a different
    # checkpoint must not be reused after --model / TTS_MODEL changes.
    if _index_model_matches(index_path, args.model) and cached_index_matches(
        index_path, cues, args.speaker, args.language
    ):
        print(
            f"[synth] cached audio in {args.out_dir} matches the current script — skipping generation",
            flush=True,
        )
        return 0

    model = load_model(args.model, args.device)

    texts = [c["text"].strip() for c in cues]
    print(f"[synth] generating {len(texts)} cues in one batched call", flush=True)
    for i, t in enumerate(texts):
        print(f"[synth]   {i:2d}: {t}", flush=True)

    # ONE batched call. generate_custom_voice handles text=List[str] natively
    # and broadcasts the speaker/language across all items, so the entire
    # narration is decoded in one model pass — same RNG state, same batch,
    # consistent voice from cue to cue.
    wavs, sr = model.generate_custom_voice(
        text=texts,
        language=args.language,
        speaker=args.speaker,
    )
    if len(wavs) != len(texts):
        print(
            f"[synth] model returned {len(wavs)} wavs for {len(texts)} cues",
            file=sys.stderr,
        )
        return 1

    # Drop the old index before overwriting any WAV. If this run dies halfway
    # through, no stale index is left behind to vouch for a mix of old and new
    # clips on the next cache check.
    index_path.unlink(missing_ok=True)

    items = []
    for cue, audio in zip(cues, wavs):
        if hasattr(audio, "cpu"):
            # Tensor output: move to host memory as float32 before writing.
            audio = audio.cpu().float().numpy()
        # int() so a cueIndex that arrives as e.g. 3.0 still formats with :03d.
        wav_name = f"cue_{int(cue['cueIndex']):03d}.wav"
        wav_path = args.out_dir / wav_name
        sf.write(str(wav_path), audio, sr)
        duration_ms = int(round(len(audio) * 1000 / sr))
        items.append(
            {
                "cueIndex": cue["cueIndex"],
                "text": cue["text"],
                "gapBeforeMs": int(cue.get("gapBeforeMs", 0)),
                "wav": wav_name,
                "sampleRate": sr,
                "durationMs": duration_ms,
            }
        )
        print(
            f"[synth] wrote {wav_name}  {duration_ms:>5d}ms  «{cue['text']}»",
            flush=True,
        )

    out_index = {
        "speaker": args.speaker,
        "language": args.language,
        "model": args.model,
        "items": items,
    }
    # Written last, so its presence implies every WAV it lists was written.
    index_path.write_text(json.dumps(out_index, indent=2), encoding="utf-8")
    total_ms = sum(it["gapBeforeMs"] + it["durationMs"] for it in items)
    print(
        f"[synth] {len(items)} cues, {total_ms}ms of audio (incl. gaps) -> {args.out_dir}",
        flush=True,
    )
    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue