LGTM
This commit is contained in:
parent
9248e26af2
commit
f2a2651b8a
95 changed files with 3993 additions and 1471 deletions
|
|
@ -9,8 +9,6 @@
|
|||
"bootstrap-admin": "tsc && node dist/pb-admin.js",
|
||||
"setup-auth": "tsc && node dist/auth.js",
|
||||
"record": "tsc && node dist/record.js",
|
||||
"record:vertical": "tsc && ASPECT=9x16 node dist/record.js",
|
||||
"encode": "ffmpeg -y -i output/recording.webm -c:v libx264 -pix_fmt yuv420p -crf 14 -preset fast -movflags +faststart output/recording.mp4",
|
||||
"verify-output": "tsc && node dist/verify.js",
|
||||
"render": "./render.sh"
|
||||
},
|
||||
|
|
|
|||
248
video/render.sh
248
video/render.sh
|
|
@ -1,6 +1,11 @@
|
|||
#!/usr/bin/env bash
|
||||
#
|
||||
# End-to-end re-render of the dashboard demo video.
|
||||
# End-to-end re-render of the dashboard demo videos.
|
||||
#
|
||||
# All per-storyboard knobs (aspect, fps, bitrate, prompt text, voice persona,
|
||||
# poster timestamp, brand strings…) live on the Storyboard objects in
|
||||
# src/storyboard.ts. To add a vertical cut or change the voice, edit that
|
||||
# file — this script only handles target/auth/transport concerns.
|
||||
#
|
||||
# Two targets:
|
||||
# local (default) — assumes the docker-compose stack on host.docker.internal,
|
||||
|
|
@ -17,7 +22,6 @@
|
|||
# ./render.sh --no-audio # skip Qwen3-TTS narration; silent MP4
|
||||
# FORCE_AUTH=1 ./render.sh # same as --fresh-auth
|
||||
# APP_URL=http://localhost:3001 ./render.sh # override frontend URL
|
||||
# TTS_SPEAKER=aiden ./render.sh # override CustomVoice speaker
|
||||
#
|
||||
# Cred env vars (read for both targets, but prod has no fallback defaults):
|
||||
# LOGIN_EMAIL, LOGIN_PASSWORD — the dashboard account to record as
|
||||
|
|
@ -48,7 +52,7 @@ case "$TARGET" in
|
|||
*) echo "Unknown --target: $TARGET (expected: local, prod)" >&2; exit 2 ;;
|
||||
esac
|
||||
|
||||
# -- config (override via env) -------------------------------------------------
|
||||
# -- environment (target-specific URLs and credentials) ----------------------
|
||||
if [ "$TARGET" = "prod" ]; then
|
||||
# Prod serves frontend, /api/*, and /pb/* off the same domain.
|
||||
export APP_URL="${APP_URL:-https://perfect-postcode.co.uk}"
|
||||
|
|
@ -81,23 +85,6 @@ AUTH_TTL_HOURS="${AUTH_TTL_HOURS:-24}" # re-auth if cache older than this
|
|||
# the built bundle, so updating this path is what makes the new clip appear
|
||||
# on the homepage. Override if the dashboard ever moves.
|
||||
PUBLISH_DIR="${PUBLISH_DIR:-../frontend/public/video}"
|
||||
# When in the output timeline to grab the poster frame.
|
||||
# Right-pane inspection (~16s output) is the clearest paused-state preview:
|
||||
# Manchester map, filters applied, right pane populated, larger narration
|
||||
# caption visible.
|
||||
POSTER_TIME_S="${POSTER_TIME_S:-16}"
|
||||
|
||||
# Recorder/encoder knobs read by src/config.ts. config.ts treats these as
|
||||
# required, so they live here (the only entry point) rather than as defaults
|
||||
# scattered across TS modules. Override per-run via env.
|
||||
export ASPECT="${ASPECT:-16x9}"
|
||||
export CAPTURE_SCALE="${CAPTURE_SCALE:-1}"
|
||||
export WEBM_BITRATE="${WEBM_BITRATE:-$(awk -v s="$CAPTURE_SCALE" 'BEGIN{print (s+0>1)?"18M":"8M"}')}"
|
||||
export PROMPT_TEXT="${PROMPT_TEXT:-Flats or terraces <£450k, 35 min to Manchester, low crime}"
|
||||
export AI_ZOOM_SCALE="${AI_ZOOM_SCALE:-2.4}"
|
||||
export MAX_DURATION_S="${MAX_DURATION_S:-60}"
|
||||
export MIN_DURATION_S="${MIN_DURATION_S:-10}"
|
||||
export OUTPUT_FPS="${OUTPUT_FPS:-50}"
|
||||
|
||||
FRESH_AUTH="${FORCE_AUTH:-0}"
|
||||
DO_ENCODE=1
|
||||
|
|
@ -109,7 +96,7 @@ for arg in "${@:-}"; do
|
|||
--no-encode) DO_ENCODE=0 ;;
|
||||
--no-audio) DO_AUDIO=0 ;;
|
||||
-h|--help)
|
||||
sed -n '3,30p' "$0"
|
||||
sed -n '3,32p' "$0"
|
||||
exit 0 ;;
|
||||
*) echo "Unknown arg: $arg" >&2; exit 2 ;;
|
||||
esac
|
||||
|
|
@ -207,22 +194,57 @@ else
|
|||
say "Reusing existing $AUTH_STATE_FILE"
|
||||
fi
|
||||
|
||||
# -- preflight + synth (Qwen3-TTS) -------------------------------------------
|
||||
# Synth runs BEFORE recording: one batched generate_custom_voice call across
|
||||
# all cues so the voice stays consistent. The recorder reads
|
||||
# output/audio/index.json for measured per-cue durations and sizes each
|
||||
# cue's wall-clock to fit; --no-audio skips synth and the recorder falls
|
||||
# back to a worst-case estimate.
|
||||
# -- preflight ---------------------------------------------------------------
|
||||
# preflight emits per-storyboard narration scripts AND output/storyboards.json
|
||||
# (the index this script loops over below). Run it BEFORE wiping per-storyboard
|
||||
# files so we know what slugs to target.
|
||||
mkdir -p output
|
||||
# Wipe last run's leaking artifacts so the rename step picks up *this* run.
|
||||
rm -f output/recording.webm output/recording.mp4 output/page@*.webm output/page@*.webm.untrimmed
|
||||
rm -f output/narration-script.json output/narration.json
|
||||
# output/audio/ is preserved; tts/synth.py decides whether the cached WAVs
|
||||
# still match the script and skips generation when they do.
|
||||
|
||||
say "Preflight: emitting narration script"
|
||||
say "Preflight: emitting narration scripts and storyboard index"
|
||||
node dist/preflight.js
|
||||
|
||||
if [ ! -s output/storyboards.json ]; then
|
||||
fail "preflight did not produce output/storyboards.json"
|
||||
fi
|
||||
|
||||
# Pull the storyboard slugs out of the index. Use Node so we don't grow a jq
|
||||
# dependency just for one read.
|
||||
mapfile -t STORYBOARDS < <(node -e '
|
||||
const idx = JSON.parse(require("fs").readFileSync("output/storyboards.json","utf8"));
|
||||
for (const s of idx.storyboards) console.log(s.name);
|
||||
')
|
||||
if [ "${#STORYBOARDS[@]}" -eq 0 ]; then
|
||||
fail "storyboards.json contains no storyboards"
|
||||
fi
|
||||
say "Storyboards to render: ${STORYBOARDS[*]}"
|
||||
|
||||
# Per-storyboard poster timestamp lookup (slug → seconds). Defined once so each
|
||||
# loop body can call it; every call re-reads output/storyboards.json via Node.
|
||||
poster_time_for() {
|
||||
node -e '
|
||||
const idx = JSON.parse(require("fs").readFileSync("output/storyboards.json","utf8"));
|
||||
const sb = idx.storyboards.find(s => s.name === process.argv[1]);
|
||||
if (!sb) { process.exit(1); }
|
||||
process.stdout.write(String(sb.posterTimeS));
|
||||
' "$1"
|
||||
}
|
||||
|
||||
# -- per-storyboard wipe of leaking artefacts --------------------------------
|
||||
# output/<sb>/audio/ is preserved; tts/synth.py decides whether the cached
|
||||
# WAVs still match the script and skips generation when they do.
|
||||
for sb in "${STORYBOARDS[@]}"; do
|
||||
rm -f "output/$sb/recording.webm" "output/$sb/recording.mp4" \
|
||||
"output/$sb/page@"*.webm "output/$sb/page@"*.webm.untrimmed \
|
||||
"output/$sb/recording.raw.webm" "output/$sb/recording.raw.webm.untrimmed" \
|
||||
"output/$sb/recording.narrated.mp4" "output/$sb/poster.jpg" \
|
||||
"output/$sb/narration.json"
|
||||
done
|
||||
|
||||
# -- synth (Qwen3-TTS) -------------------------------------------------------
|
||||
# Synth runs BEFORE recording: one batched generate_voice_clone call per
|
||||
# storyboard so the voice stays consistent within each video. The recorder
|
||||
# reads output/<sb>/audio/index.json for measured per-cue durations and
|
||||
# sizes each cue's wall-clock to fit; --no-audio skips synth and the recorder
|
||||
# falls back to a worst-case estimate.
|
||||
if [ "$DO_AUDIO" = "1" ]; then
|
||||
if ! command -v uv >/dev/null 2>&1; then
|
||||
fail "uv not on PATH (required for Qwen3-TTS synth). Install uv or rerun with --no-audio."
|
||||
|
|
@ -236,95 +258,103 @@ if [ "$DO_AUDIO" = "1" ]; then
|
|||
if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L >/dev/null 2>&1; then
|
||||
uv_sync_extras+=(--extra gpu)
|
||||
fi
|
||||
say "Synthesising narration with Qwen3-TTS (speaker=${TTS_SPEAKER:-ryan}) — one batched call"
|
||||
say "Synchronising tts/ Python deps"
|
||||
uv sync --project tts ${uv_sync_extras[@]+"${uv_sync_extras[@]}"} || fail "uv sync failed in video/tts"
|
||||
uv run --project tts python tts/synth.py || fail "tts/synth.py failed"
|
||||
if [ ! -s output/audio/index.json ]; then
|
||||
fail "synth did not produce output/audio/index.json"
|
||||
fi
|
||||
|
||||
for sb in "${STORYBOARDS[@]}"; do
|
||||
say "Synthesising narration for [$sb] — one batched call"
|
||||
uv run --project tts python tts/synth.py --storyboard "$sb" \
|
||||
|| fail "tts/synth.py failed for $sb"
|
||||
if [ ! -s "output/$sb/audio/index.json" ]; then
|
||||
fail "synth did not produce output/$sb/audio/index.json"
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
# -- record -------------------------------------------------------------------
|
||||
say "Recording"
|
||||
# -- record ------------------------------------------------------------------
|
||||
# record.ts iterates over storyboards in-process and writes per-storyboard
|
||||
# recording.webm + narration.json. One Node invocation handles all of them
|
||||
# so we don't spin up Playwright + GPU/WebGL + auth more than necessary.
|
||||
say "Recording all storyboards"
|
||||
APP_URL="$APP_URL" node dist/record.js
|
||||
|
||||
if [ ! -s output/recording.webm ]; then
|
||||
fail "recording.webm missing or empty"
|
||||
fi
|
||||
node dist/verify.js output/recording.webm
|
||||
|
||||
# -- encode -------------------------------------------------------------------
|
||||
if [ "$DO_ENCODE" = "1" ]; then
|
||||
if ! command -v ffmpeg >/dev/null 2>&1; then
|
||||
fail "ffmpeg not on PATH; rerun with --no-encode if you only need the WebM"
|
||||
for sb in "${STORYBOARDS[@]}"; do
|
||||
if [ ! -s "output/$sb/recording.webm" ]; then
|
||||
fail "[$sb] recording.webm missing or empty"
|
||||
fi
|
||||
say "Encoding to MP4"
|
||||
ffmpeg -y -loglevel warning -i output/recording.webm \
|
||||
-c:v libx264 -pix_fmt yuv420p -crf 14 -preset fast \
|
||||
-movflags +faststart \
|
||||
output/recording.mp4
|
||||
node dist/verify.js "$sb" "output/$sb/recording.webm"
|
||||
done
|
||||
|
||||
# Poster: a single high-quality JPEG extracted from a representative
|
||||
# moment in the output timeline. Used as the homepage <video poster=...>,
|
||||
# which is what the visitor sees before pressing play.
|
||||
# - -ss AFTER -i = output-side seek, frame-accurate (input-side seek
|
||||
# would land on the nearest keyframe, drifting back up to ~2s).
|
||||
# - -update 1 tells ffmpeg the output is a single image, not a sequence.
|
||||
# - -q:v 2 = high JPEG quality (~95%); poster file is ~120KB at 1080p.
|
||||
say "Extracting poster frame at ${POSTER_TIME_S}s"
|
||||
ffmpeg -y -loglevel warning -i output/recording.mp4 -ss "$POSTER_TIME_S" \
|
||||
-frames:v 1 -update 1 -q:v 2 \
|
||||
output/poster.jpg
|
||||
|
||||
node dist/verify.js output/recording.mp4 output/poster.jpg
|
||||
# -- encode + mux + publish (per storyboard) ---------------------------------
|
||||
if [ "$DO_ENCODE" = "1" ] && ! command -v ffmpeg >/dev/null 2>&1; then
|
||||
fail "ffmpeg not on PATH; rerun with --no-encode if you only need the WebM"
|
||||
fi
|
||||
|
||||
# -- mux narration ------------------------------------------------------------
|
||||
# Synth already produced per-cue WAVs (in output/audio/); the recorder logged
|
||||
# each cue's videoTime against the trimmed timeline. Drop the WAVs onto the
|
||||
# mp4 with one ffmpeg adelay+amix and replace the silent recording in place.
|
||||
if [ "$DO_ENCODE" = "1" ] && [ "$DO_AUDIO" = "1" ]; then
|
||||
if [ ! -s output/narration.json ]; then
|
||||
fail "narration.json missing — recorder did not log cues"
|
||||
for sb in "${STORYBOARDS[@]}"; do
|
||||
if [ "$DO_ENCODE" = "1" ]; then
|
||||
say "[$sb] Encoding to MP4"
|
||||
ffmpeg -y -loglevel warning -i "output/$sb/recording.webm" \
|
||||
-c:v libx264 -pix_fmt yuv420p -crf 14 -preset fast \
|
||||
-movflags +faststart \
|
||||
"output/$sb/recording.mp4"
|
||||
|
||||
# Poster: a single high-quality JPEG extracted from a representative
|
||||
# moment in the output timeline. Used as the homepage <video poster=...>.
|
||||
# - -ss AFTER -i = output-side seek, frame-accurate (input-side seek
|
||||
# would land on the nearest keyframe, drifting back up to ~2s).
|
||||
# - -update 1 tells ffmpeg the output is a single image, not a sequence.
|
||||
# - -q:v 2 = high JPEG quality (~95%); poster file is ~120KB at 1080p.
|
||||
poster_t="$(poster_time_for "$sb")"
|
||||
say "[$sb] Extracting poster frame at ${poster_t}s"
|
||||
ffmpeg -y -loglevel warning -i "output/$sb/recording.mp4" -ss "$poster_t" \
|
||||
-frames:v 1 -update 1 -q:v 2 \
|
||||
"output/$sb/poster.jpg"
|
||||
|
||||
node dist/verify.js "$sb" "output/$sb/recording.mp4" "output/$sb/poster.jpg"
|
||||
fi
|
||||
say "Muxing narration into output/recording.mp4"
|
||||
uv run --project tts python tts/mux.py --replace \
|
||||
|| fail "tts/mux.py failed"
|
||||
node dist/verify.js output/recording.mp4
|
||||
fi
|
||||
|
||||
# -- publish to homepage ------------------------------------------------------
|
||||
# Only publish when we did the encode (otherwise we'd be copying a stale
|
||||
# mp4 next to a fresh webm). --no-encode skips this whole block.
|
||||
if [ "$DO_ENCODE" = "1" ]; then
|
||||
if [ ! -d "$PUBLISH_DIR" ]; then
|
||||
say "Creating $PUBLISH_DIR"
|
||||
mkdir -p "$PUBLISH_DIR"
|
||||
if [ "$DO_ENCODE" = "1" ] && [ "$DO_AUDIO" = "1" ]; then
|
||||
if [ ! -s "output/$sb/narration.json" ]; then
|
||||
fail "[$sb] narration.json missing — recorder did not log cues"
|
||||
fi
|
||||
say "[$sb] Muxing narration into output/$sb/recording.mp4"
|
||||
uv run --project tts python tts/mux.py --storyboard "$sb" --replace \
|
||||
|| fail "tts/mux.py failed for $sb"
|
||||
node dist/verify.js "$sb" "output/$sb/recording.mp4"
|
||||
fi
|
||||
say "Publishing to $PUBLISH_DIR"
|
||||
cp output/recording.mp4 "$PUBLISH_DIR/recording.mp4"
|
||||
cp output/poster.jpg "$PUBLISH_DIR/poster.jpg"
|
||||
node dist/verify.js "$PUBLISH_DIR/recording.mp4" "$PUBLISH_DIR/poster.jpg"
|
||||
fi
|
||||
|
||||
# -- report -------------------------------------------------------------------
|
||||
# Only publish when we did the encode (otherwise we'd be copying a stale
|
||||
# mp4 next to a fresh webm). --no-encode skips publish.
|
||||
if [ "$DO_ENCODE" = "1" ]; then
|
||||
if [ ! -d "$PUBLISH_DIR" ]; then
|
||||
say "Creating $PUBLISH_DIR"
|
||||
mkdir -p "$PUBLISH_DIR"
|
||||
fi
|
||||
say "[$sb] Publishing to $PUBLISH_DIR/$sb.{mp4,jpg}"
|
||||
cp "output/$sb/recording.mp4" "$PUBLISH_DIR/$sb.mp4"
|
||||
cp "output/$sb/poster.jpg" "$PUBLISH_DIR/$sb.jpg"
|
||||
node dist/verify.js "$sb" "$PUBLISH_DIR/$sb.mp4" "$PUBLISH_DIR/$sb.jpg"
|
||||
fi
|
||||
done
|
||||
|
||||
# -- report ------------------------------------------------------------------
|
||||
say "Done"
|
||||
if command -v ffprobe >/dev/null 2>&1; then
|
||||
for f in output/recording.webm output/recording.mp4 output/poster.jpg \
|
||||
"$PUBLISH_DIR/recording.mp4" "$PUBLISH_DIR/poster.jpg"; do
|
||||
[ -f "$f" ] || continue
|
||||
size=$(stat -c '%s' "$f" 2>/dev/null || stat -f '%z' "$f")
|
||||
case "$f" in
|
||||
*.mp4|*.webm)
|
||||
dur=$(ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "$f")
|
||||
printf ' %s %ss %s bytes\n' "$f" "$(printf '%.2f' "$dur")" "$size"
|
||||
;;
|
||||
*)
|
||||
printf ' %s %s bytes\n' "$f" "$size"
|
||||
;;
|
||||
esac
|
||||
for sb in "${STORYBOARDS[@]}"; do
|
||||
for f in "output/$sb/recording.webm" "output/$sb/recording.mp4" \
|
||||
"output/$sb/poster.jpg" \
|
||||
"$PUBLISH_DIR/$sb.mp4" "$PUBLISH_DIR/$sb.jpg"; do
|
||||
[ -f "$f" ] || continue
|
||||
size=$(stat -c '%s' "$f" 2>/dev/null || stat -f '%z' "$f")
|
||||
case "$f" in
|
||||
*.mp4|*.webm)
|
||||
dur=$(ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "$f")
|
||||
printf ' %s %ss %s bytes\n' "$f" "$(printf '%.2f' "$dur")" "$size"
|
||||
;;
|
||||
*)
|
||||
printf ' %s %s bytes\n' "$f" "$size"
|
||||
;;
|
||||
esac
|
||||
done
|
||||
done
|
||||
else
|
||||
ls -la output/recording.* output/poster.jpg \
|
||||
"$PUBLISH_DIR/recording.mp4" "$PUBLISH_DIR/poster.jpg" 2>/dev/null || true
|
||||
fi
|
||||
|
|
|
|||
|
|
@ -3,48 +3,52 @@ import {
|
|||
type Browser,
|
||||
type BrowserContext,
|
||||
type Page,
|
||||
} from "playwright";
|
||||
import {
|
||||
AUTH_STATE_PATH,
|
||||
CAPTURE_SCALE,
|
||||
OUTPUT_DIR,
|
||||
VIDEO_SIZE,
|
||||
VIEWPORT,
|
||||
} from "./config.js";
|
||||
} from 'playwright';
|
||||
import { AUTH_STATE_PATH } from './config.js';
|
||||
import { viewportFor, type Storyboard } from './script.js';
|
||||
|
||||
export interface RecordingBrowser {
|
||||
browser: Browser;
|
||||
context: BrowserContext;
|
||||
}
|
||||
|
||||
export async function launchRecordingBrowser(): Promise<RecordingBrowser> {
|
||||
export interface LaunchOptions {
|
||||
/** Directory the playwright recorder writes the raw .webm into. */
|
||||
recordDir: string;
|
||||
}
|
||||
|
||||
export async function launchRecordingBrowser(
|
||||
storyboard: Storyboard,
|
||||
opts: LaunchOptions
|
||||
): Promise<RecordingBrowser> {
|
||||
const browser = await chromium.launch({
|
||||
headless: true,
|
||||
args: [
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--enable-gpu",
|
||||
"--use-gl=angle",
|
||||
"--use-angle=gl-egl",
|
||||
"--ignore-gpu-blocklist",
|
||||
"--enable-webgl",
|
||||
"--enable-webgl2",
|
||||
"--enable-gpu-rasterization",
|
||||
"--enable-zero-copy",
|
||||
"--disable-software-rasterizer",
|
||||
"--disable-frame-rate-limit",
|
||||
"--disable-gpu-vsync",
|
||||
"--disable-features=CalculateNativeWinOcclusion,IntensiveWakeUpThrottling",
|
||||
"--disable-renderer-backgrounding",
|
||||
"--disable-background-timer-throttling",
|
||||
"--disable-backgrounding-occluded-windows",
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
'--enable-gpu',
|
||||
'--use-gl=angle',
|
||||
'--use-angle=gl-egl',
|
||||
'--ignore-gpu-blocklist',
|
||||
'--enable-webgl',
|
||||
'--enable-webgl2',
|
||||
'--enable-gpu-rasterization',
|
||||
'--enable-zero-copy',
|
||||
'--disable-software-rasterizer',
|
||||
'--disable-frame-rate-limit',
|
||||
'--disable-gpu-vsync',
|
||||
'--disable-features=CalculateNativeWinOcclusion,IntensiveWakeUpThrottling',
|
||||
'--disable-renderer-backgrounding',
|
||||
'--disable-background-timer-throttling',
|
||||
'--disable-backgrounding-occluded-windows',
|
||||
],
|
||||
});
|
||||
|
||||
const viewport = viewportFor(storyboard.video);
|
||||
const context = await browser.newContext({
|
||||
storageState: AUTH_STATE_PATH,
|
||||
viewport: VIEWPORT,
|
||||
deviceScaleFactor: CAPTURE_SCALE,
|
||||
recordVideo: { dir: OUTPUT_DIR, size: VIDEO_SIZE },
|
||||
viewport,
|
||||
deviceScaleFactor: storyboard.video.captureScale,
|
||||
recordVideo: { dir: opts.recordDir, size: viewport },
|
||||
});
|
||||
await suppressDevServerNoise(context);
|
||||
return { browser, context };
|
||||
|
|
@ -52,11 +56,11 @@ export async function launchRecordingBrowser(): Promise<RecordingBrowser> {
|
|||
|
||||
export async function assertHardwareWebGL(page: Page): Promise<void> {
|
||||
const info = await page.evaluate(() => {
|
||||
const canvas = document.createElement("canvas");
|
||||
const gl = canvas.getContext("webgl2");
|
||||
if (!gl) return { webgl: false, vendor: "", renderer: "" };
|
||||
const canvas = document.createElement('canvas');
|
||||
const gl = canvas.getContext('webgl2');
|
||||
if (!gl) return { webgl: false, vendor: '', renderer: '' };
|
||||
|
||||
const ext = gl.getExtension("WEBGL_debug_renderer_info");
|
||||
const ext = gl.getExtension('WEBGL_debug_renderer_info');
|
||||
const vendor = String(
|
||||
ext
|
||||
? gl.getParameter(ext.UNMASKED_VENDOR_WEBGL)
|
||||
|
|
@ -71,15 +75,15 @@ export async function assertHardwareWebGL(page: Page): Promise<void> {
|
|||
});
|
||||
|
||||
console.log(
|
||||
`[gpu] WebGL renderer: ${info.webgl ? `${info.vendor} / ${info.renderer}` : "none"}`,
|
||||
`[gpu] WebGL renderer: ${info.webgl ? `${info.vendor} / ${info.renderer}` : 'none'}`,
|
||||
);
|
||||
if (
|
||||
process.env.ALLOW_SOFTWARE_GL !== "1" &&
|
||||
process.env.ALLOW_SOFTWARE_GL !== '1' &&
|
||||
(!info.webgl ||
|
||||
/SwiftShader|llvmpipe|software/i.test(`${info.vendor} ${info.renderer}`))
|
||||
) {
|
||||
throw new Error(
|
||||
"Recording browser did not get hardware WebGL. Set ALLOW_SOFTWARE_GL=1 to bypass this guard.",
|
||||
'Recording browser did not get hardware WebGL. Set ALLOW_SOFTWARE_GL=1 to bypass this guard.',
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -89,45 +93,45 @@ async function suppressDevServerNoise(context: BrowserContext) {
|
|||
const RealWS = window.WebSocket;
|
||||
window.WebSocket = new Proxy(RealWS, {
|
||||
construct(target, args) {
|
||||
const url = String(args[0] ?? "");
|
||||
const proto = (args[1] as string | string[] | undefined) ?? "";
|
||||
const protoStr = Array.isArray(proto) ? proto.join(",") : proto;
|
||||
const url = String(args[0] ?? '');
|
||||
const proto = (args[1] as string | string[] | undefined) ?? '';
|
||||
const protoStr = Array.isArray(proto) ? proto.join(',') : proto;
|
||||
if (
|
||||
protoStr.includes("vite-hmr") ||
|
||||
protoStr.includes("webpack") ||
|
||||
url.includes("/ws") ||
|
||||
url.includes("sockjs-node")
|
||||
protoStr.includes('vite-hmr') ||
|
||||
protoStr.includes('webpack') ||
|
||||
url.includes('/ws') ||
|
||||
url.includes('sockjs-node')
|
||||
) {
|
||||
const fake = new EventTarget() as WebSocket;
|
||||
Object.defineProperties(fake, {
|
||||
readyState: { value: RealWS.CLOSED },
|
||||
url: { value: url },
|
||||
protocol: { value: "" },
|
||||
extensions: { value: "" },
|
||||
protocol: { value: '' },
|
||||
extensions: { value: '' },
|
||||
bufferedAmount: { value: 0 },
|
||||
binaryType: { value: "blob", writable: true },
|
||||
binaryType: { value: 'blob', writable: true },
|
||||
});
|
||||
fake.send = () => {};
|
||||
fake.close = () => fake.dispatchEvent(new Event("close"));
|
||||
queueMicrotask(() => fake.dispatchEvent(new Event("close")));
|
||||
fake.close = () => fake.dispatchEvent(new Event('close'));
|
||||
queueMicrotask(() => fake.dispatchEvent(new Event('close')));
|
||||
return fake;
|
||||
}
|
||||
return Reflect.construct(target, args);
|
||||
},
|
||||
});
|
||||
|
||||
Object.defineProperty(window.location, "reload", {
|
||||
Object.defineProperty(window.location, 'reload', {
|
||||
value: () => {},
|
||||
configurable: true,
|
||||
});
|
||||
window.addEventListener("error", (e) => e.stopImmediatePropagation(), true);
|
||||
window.addEventListener('error', (e) => e.stopImmediatePropagation(), true);
|
||||
window.addEventListener(
|
||||
"unhandledrejection",
|
||||
'unhandledrejection',
|
||||
(e) => e.stopImmediatePropagation(),
|
||||
true,
|
||||
);
|
||||
|
||||
const styleEl = document.createElement("style");
|
||||
const styleEl = document.createElement('style');
|
||||
styleEl.textContent = `
|
||||
vite-error-overlay,
|
||||
wds-overlay,
|
||||
|
|
@ -148,12 +152,12 @@ async function suppressDevServerNoise(context: BrowserContext) {
|
|||
|
||||
const killOverlay = (node: Element) => {
|
||||
const tag = node.tagName?.toLowerCase();
|
||||
const id = (node as HTMLElement).id?.toLowerCase() ?? "";
|
||||
const id = (node as HTMLElement).id?.toLowerCase() ?? '';
|
||||
if (
|
||||
tag === "vite-error-overlay" ||
|
||||
tag === "wds-overlay" ||
|
||||
id.includes("webpack-dev-server-client") ||
|
||||
id.includes("webpack-error")
|
||||
tag === 'vite-error-overlay' ||
|
||||
tag === 'wds-overlay' ||
|
||||
id.includes('webpack-dev-server-client') ||
|
||||
id.includes('webpack-error')
|
||||
) {
|
||||
(node as HTMLElement).remove();
|
||||
}
|
||||
|
|
@ -168,7 +172,7 @@ async function suppressDevServerNoise(context: BrowserContext) {
|
|||
if (document.body)
|
||||
obs.observe(document.body, { childList: true, subtree: true });
|
||||
else {
|
||||
document.addEventListener("DOMContentLoaded", () =>
|
||||
document.addEventListener('DOMContentLoaded', () =>
|
||||
obs.observe(document.body, { childList: true, subtree: true }),
|
||||
);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,101 +6,19 @@ function requiredEnv(name: string): string {
|
|||
return value;
|
||||
}
|
||||
|
||||
function requiredNumberEnv(name: string): number {
|
||||
const value = Number(requiredEnv(name));
|
||||
if (!Number.isFinite(value)) {
|
||||
throw new Error(`${name} must be a finite number`);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
// Environment-only knobs. Per-storyboard tuning (aspect, fps, bitrate,
|
||||
// voice, prompts, brand…) lives on the Storyboard object itself — see
|
||||
// src/storyboard.ts.
|
||||
|
||||
export const APP_URL = requiredEnv("APP_URL");
|
||||
export const DASHBOARD_PATH = "/dashboard";
|
||||
export const APP_URL = requiredEnv('APP_URL');
|
||||
export const DASHBOARD_PATH = '/dashboard';
|
||||
|
||||
// Per-target storage state. render.sh sets AUTH_STATE_FILE to auth.local.json
|
||||
// or auth.prod.json so a stale local token can't be reused against prod.
|
||||
export const AUTH_STATE_PATH = process.env.AUTH_STATE_FILE ?? "auth.json";
|
||||
export const OUTPUT_DIR = "output";
|
||||
|
||||
const aspect = requiredEnv("ASPECT");
|
||||
if (aspect !== "16x9" && aspect !== "9x16") {
|
||||
throw new Error("ASPECT must be '16x9' or '9x16'");
|
||||
}
|
||||
export const VIEWPORT =
|
||||
aspect === "9x16"
|
||||
? { width: 1080, height: 1920 }
|
||||
: { width: 1920, height: 1080 };
|
||||
export const CAPTURE_SCALE = Math.max(1, requiredNumberEnv("CAPTURE_SCALE"));
|
||||
export const VIDEO_SIZE = {
|
||||
width: VIEWPORT.width,
|
||||
height: VIEWPORT.height,
|
||||
};
|
||||
export const WEBM_BITRATE = requiredEnv("WEBM_BITRATE");
|
||||
|
||||
// Cold-open prompt. Punchy version of the user's intent, short enough to type
|
||||
// on camera without making the opening scene drag.
|
||||
export const PROMPT_TEXT = requiredEnv("PROMPT_TEXT");
|
||||
|
||||
// Filters returned by the AI stub. Keys MUST match real feature names from
|
||||
// /api/features (verified against the running server's schema).
|
||||
export const STUBBED_FILTERS: Record<string, [number, number] | string[]> = {
|
||||
"Property type": ["Flats/Maisonettes", "Terraced"],
|
||||
"Estimated current price": [175000, 450000],
|
||||
"Serious crime per 1k residents (avg/yr)": [0, 55],
|
||||
"Noise (dB)": [50, 68],
|
||||
};
|
||||
|
||||
// Travel-time filters returned by the AI stub. Slug matches the real
|
||||
// /api/travel-destinations?mode=transit response.
|
||||
export const STUBBED_TRAVEL_TIME_FILTERS: {
|
||||
mode: "transit" | "car" | "bicycle" | "walking";
|
||||
slug: string;
|
||||
label: string;
|
||||
min?: number;
|
||||
max?: number;
|
||||
}[] = [
|
||||
{
|
||||
mode: "transit",
|
||||
slug: "manchester",
|
||||
label: "Manchester city centre",
|
||||
max: 35,
|
||||
},
|
||||
];
|
||||
|
||||
// The travel-time card we'll drag manually after AI applies. The Filters
|
||||
// component renders each travel-time entry with `data-filter-name="tt_${i}"`,
|
||||
// and our stub only sets one entry, so it's tt_0.
|
||||
export const TT_CARD_SELECTOR = '[data-filter-name="tt_0"]';
|
||||
export const TT_SLIDER_MAX = 120;
|
||||
export const TT_DRAG_FROM_MIN = 35; // matches AI stub max above
|
||||
export const TT_DRAG_TO_MIN = 20;
|
||||
|
||||
// Cold-open zoom: how aggressively to magnify the AI box.
|
||||
// 2.4 fills most of the viewport with the prompt card without blowing up text.
|
||||
export const AI_ZOOM_SCALE = requiredNumberEnv("AI_ZOOM_SCALE");
|
||||
|
||||
// Initial map view used while we navigate. The AI scene zooms in on the
|
||||
// sidebar so this only matters once we zoom out.
|
||||
export const INITIAL_MAP_VIEW = {
|
||||
lat: 53.4795,
|
||||
lon: -2.2451,
|
||||
zoom: 11.5,
|
||||
};
|
||||
|
||||
// Verification guard only. The renderer does not use this as an editing cap:
|
||||
// if the storyboard needs more than 15 seconds to avoid jumps, keep the frames.
|
||||
export const MAX_DURATION_S = requiredNumberEnv("MAX_DURATION_S");
|
||||
export const MIN_DURATION_S = requiredNumberEnv("MIN_DURATION_S");
|
||||
|
||||
// Target fps of the FINAL output.
|
||||
export const OUTPUT_FPS = requiredNumberEnv("OUTPUT_FPS");
|
||||
export const AUTH_STATE_PATH = process.env.AUTH_STATE_FILE ?? 'auth.json';
|
||||
export const OUTPUT_DIR = 'output';
|
||||
|
||||
// Frames of head-room kept in front of sceneStart when trimming. Shared by
|
||||
// the video trim and the narration manifest so cue offsets line up with the
|
||||
// trimmed timeline.
|
||||
// trimmed timeline. Not tuned per storyboard — same lead-in for any cut.
|
||||
export const LEAD_IN_S = 0.12;
|
||||
|
||||
// Brand strings for the outro card.
|
||||
export const BRAND_NAME = "Perfect Postcode";
|
||||
export const BRAND_TAGLINE = "Find where you actually want to live.";
|
||||
export const BRAND_URL = "https://perfect-postcode.co.uk";
|
||||
|
|
|
|||
|
|
@ -1,32 +1,83 @@
|
|||
import { existsSync, mkdirSync, writeFileSync } from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import { OUTPUT_DIR } from './config.js';
|
||||
import { storyboard } from './storyboard.js';
|
||||
import type { Storyboard } from './script.js';
|
||||
import { storyboards } from './storyboard.js';
|
||||
|
||||
/**
|
||||
* Emit the narration script for the synth step.
|
||||
* Emit per-storyboard narration scripts for the synth step.
|
||||
*
|
||||
* Synth (tts/synth.py) runs BEFORE recording, so it needs the full ordered
|
||||
* narration list — text + per-cue gaps — without depending on Playwright,
|
||||
* the dashboard, or auth. Walk the storyboard cues, write a flat manifest,
|
||||
* exit.
|
||||
* narration list — text + per-cue gaps + voice config — without depending
|
||||
* on Playwright, the dashboard, or auth. Walk each storyboard's cues, write
|
||||
* a flat manifest under `output/<name>/narration-script.json`, then write
|
||||
* an index manifest at `output/storyboards.json` so render.sh knows which
|
||||
* storyboard slugs to loop over.
|
||||
*
|
||||
* The cue index in this manifest is the source of truth: the runner later
|
||||
* The cue index in each manifest is the source of truth: the runner later
|
||||
* matches storyboard cues to measured durations by index.
|
||||
*/
|
||||
function main(): void {
|
||||
if (!existsSync(OUTPUT_DIR)) mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
// Em/en-dashes and ellipses make Qwen3-TTS produce dramatic pauses, sighs,
|
||||
// or audible breaths — the captions still render the original (unicode-rich)
|
||||
// text from the storyboard; only the synth input is sanitised.
|
||||
function normalizeForTts(text: string): string {
|
||||
return text
|
||||
.replace(/\s*[—–]\s*/g, ', ')
|
||||
.replace(/…/g, '.')
|
||||
.replace(/\.{3,}/g, '.')
|
||||
.replace(/\s{2,}/g, ' ')
|
||||
.trim();
|
||||
}
|
||||
|
||||
function emitScript(storyboard: Storyboard): string {
|
||||
const dir = join(OUTPUT_DIR, storyboard.name);
|
||||
if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
|
||||
|
||||
const items = storyboard.cues.map((cue, cueIndex) => ({
|
||||
cueIndex,
|
||||
text: cue.text.trim(),
|
||||
text: normalizeForTts(cue.text),
|
||||
gapBeforeMs: cue.gapBeforeMs,
|
||||
}));
|
||||
|
||||
const manifest = { items };
|
||||
const path = join(OUTPUT_DIR, 'narration-script.json');
|
||||
// The voice block is consumed by tts/synth.py — see _resolve_reference and
|
||||
// the cache check there for which fields invalidate cached audio.
|
||||
const manifest = {
|
||||
storyboard: storyboard.name,
|
||||
voice: {
|
||||
instruct: storyboard.voice.instruct,
|
||||
language: storyboard.voice.language,
|
||||
temperature: storyboard.voice.temperature ?? 0.6,
|
||||
topP: storyboard.voice.topP ?? 0.9,
|
||||
seed: storyboard.voice.seed ?? 42,
|
||||
},
|
||||
items,
|
||||
};
|
||||
const path = join(dir, 'narration-script.json');
|
||||
writeFileSync(path, JSON.stringify(manifest, null, 2));
|
||||
console.log(`Wrote ${items.length} narration cues to ${path}`);
|
||||
console.log(`[preflight] [${storyboard.name}] wrote ${items.length} cues → ${path}`);
|
||||
return path;
|
||||
}
|
||||
|
||||
/**
 * Preflight entry point: ensure OUTPUT_DIR exists, call emitScript for every
 * entry in `storyboards`, then write `storyboards.json` into OUTPUT_DIR with
 * one record per storyboard (name, aspect, outputFps, min/max duration,
 * poster timestamp).
 */
function main(): void {
|
||||
if (!existsSync(OUTPUT_DIR)) mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
|
||||
for (const sb of storyboards) emitScript(sb);
|
||||
|
||||
// Index for shell loops — each entry has every field render.sh needs to
|
||||
// address per-storyboard outputs without re-parsing the TS source.
|
||||
const index = {
|
||||
storyboards: storyboards.map((sb) => ({
|
||||
name: sb.name,
|
||||
aspect: sb.video.aspect,
|
||||
outputFps: sb.video.outputFps,
|
||||
minDurationS: sb.video.minDurationS,
|
||||
maxDurationS: sb.video.maxDurationS,
|
||||
posterTimeS: sb.video.posterTimeS,
|
||||
})),
|
||||
};
|
||||
const indexPath = join(OUTPUT_DIR, 'storyboards.json');
|
||||
writeFileSync(indexPath, JSON.stringify(index, null, 2));
|
||||
console.log(`[preflight] wrote storyboard index → ${indexPath}`);
|
||||
}
|
||||
|
||||
main();
|
||||
|
|
|
|||
|
|
@ -1,11 +1,15 @@
|
|||
import { chromium } from 'playwright';
|
||||
import { APP_URL, AUTH_STATE_PATH, DASHBOARD_PATH, VIEWPORT } from './config.js';
|
||||
import { APP_URL, AUTH_STATE_PATH, DASHBOARD_PATH } from './config.js';
|
||||
import { viewportFor } from './script.js';
|
||||
import { storyboards } from './storyboard.js';
|
||||
|
||||
async function main() {
|
||||
// probe is a debug utility — pin it to the first storyboard's viewport.
|
||||
const viewport = viewportFor(storyboards[0].video);
|
||||
const browser = await chromium.launch({ headless: true });
|
||||
const context = await browser.newContext({
|
||||
storageState: AUTH_STATE_PATH,
|
||||
viewport: VIEWPORT,
|
||||
viewport,
|
||||
});
|
||||
const page = await context.newPage();
|
||||
page.on('request', (r) => {
|
||||
|
|
|
|||
|
|
@ -4,18 +4,20 @@ import { AUTH_STATE_PATH, LEAD_IN_S, OUTPUT_DIR } from './config.js';
|
|||
import { assertHardwareWebGL, launchRecordingBrowser } from './browser.js';
|
||||
import { narrationLog } from './narration.js';
|
||||
import { installDemoRoutes } from './routes.js';
|
||||
import { storyboard } from './storyboard.js';
|
||||
import type { Storyboard } from './script.js';
|
||||
import { storyboards } from './storyboard.js';
|
||||
import { prepareTimeline, runTimeline } from './timeline.js';
|
||||
import { trimRecording } from './video.js';
|
||||
|
||||
async function main() {
|
||||
if (!existsSync(AUTH_STATE_PATH)) {
|
||||
console.error(`No ${AUTH_STATE_PATH} found. Run "npm run setup-auth" first.`);
|
||||
process.exit(1);
|
||||
}
|
||||
if (!existsSync(OUTPUT_DIR)) mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
async function recordOne(storyboard: Storyboard): Promise<void> {
|
||||
const dir = join(OUTPUT_DIR, storyboard.name);
|
||||
if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
|
||||
|
||||
const { browser, context } = await launchRecordingBrowser();
|
||||
console.log(`\n=== [${storyboard.name}] recording ===`);
|
||||
|
||||
const { browser, context } = await launchRecordingBrowser(storyboard, {
|
||||
recordDir: dir,
|
||||
});
|
||||
const page = await context.newPage();
|
||||
await assertHardwareWebGL(page);
|
||||
const recordedVideo = page.video();
|
||||
|
|
@ -37,22 +39,21 @@ async function main() {
|
|||
if (u.includes('ai-filters')) console.log(`[req] ${r.method()} ${u}`);
|
||||
});
|
||||
|
||||
await installDemoRoutes(page);
|
||||
const ctx = await prepareTimeline(page);
|
||||
await installDemoRoutes(page, storyboard);
|
||||
const ctx = await prepareTimeline(page, storyboard);
|
||||
const timeline = await runTimeline(ctx, storyboard);
|
||||
|
||||
await page.close();
|
||||
const rawPath = join(OUTPUT_DIR, 'recording.raw.webm');
|
||||
const rawPath = join(dir, 'recording.raw.webm');
|
||||
if (recordedVideo) await recordedVideo.saveAs(rawPath);
|
||||
await context.close();
|
||||
await browser.close();
|
||||
|
||||
if (!recordedVideo || !statSync(rawPath).size) {
|
||||
console.error('no recorded webm found');
|
||||
process.exit(1);
|
||||
throw new Error(`[${storyboard.name}] no recorded webm found`);
|
||||
}
|
||||
|
||||
trimRecording(rawPath, join(OUTPUT_DIR, 'recording.webm'), {
|
||||
trimRecording(rawPath, join(dir, 'recording.webm'), storyboard, {
|
||||
recordStartMs,
|
||||
...timeline,
|
||||
});
|
||||
|
|
@ -60,13 +61,25 @@ async function main() {
|
|||
const totalDurationMs =
|
||||
timeline.sceneEndMs - timeline.sceneStartMs + LEAD_IN_S * 1000;
|
||||
const cues = narrationLog.flush(
|
||||
join(OUTPUT_DIR, 'narration.json'),
|
||||
join(dir, 'narration.json'),
|
||||
totalDurationMs
|
||||
);
|
||||
console.log(
|
||||
`Wrote ${cues.length} narration cues to ${join(OUTPUT_DIR, 'narration.json')}`
|
||||
`[${storyboard.name}] wrote ${cues.length} narration cues → ${join(dir, 'narration.json')}`
|
||||
);
|
||||
console.log('Run "npm run encode" to produce output/recording.mp4');
|
||||
}
|
||||
|
||||
/**
 * Recorder entry point. Exits with status 1 when the saved auth state file is
 * missing, makes sure OUTPUT_DIR exists, then records each storyboard in
 * array order via recordOne.
 */
async function main(): Promise<void> {
|
||||
if (!existsSync(AUTH_STATE_PATH)) {
|
||||
console.error(`No ${AUTH_STATE_PATH} found. Run "npm run setup-auth" first.`);
|
||||
process.exit(1);
|
||||
}
|
||||
if (!existsSync(OUTPUT_DIR)) mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
|
||||
// Sequential on purpose of the loop shape: each recordOne is awaited before
// the next storyboard starts.
for (const sb of storyboards) {
|
||||
await recordOne(sb);
|
||||
}
|
||||
console.log(`\n=== recorded ${storyboards.length} storyboard(s) ===`);
|
||||
}
|
||||
|
||||
main().catch((err) => {
|
||||
|
|
|
|||
|
|
@ -1,35 +1,33 @@
|
|||
import type { Page } from 'playwright';
|
||||
import {
|
||||
APP_URL,
|
||||
DASHBOARD_PATH,
|
||||
INITIAL_MAP_VIEW,
|
||||
STUBBED_FILTERS,
|
||||
STUBBED_TRAVEL_TIME_FILTERS,
|
||||
} from './config.js';
|
||||
import { APP_URL, DASHBOARD_PATH } from './config.js';
|
||||
import type { Storyboard } from './script.js';
|
||||
|
||||
export async function installDemoRoutes(page: Page) {
|
||||
await Promise.all([stubAiFilters(page), stubExport(page)]);
|
||||
export async function installDemoRoutes(page: Page, storyboard: Storyboard) {
|
||||
await Promise.all([stubAiFilters(page, storyboard), stubExport(page)]);
|
||||
}
|
||||
|
||||
export function dashboardUrl(): string {
|
||||
export function dashboardUrl(storyboard: Storyboard): string {
|
||||
const view = storyboard.content.initialMapView;
|
||||
const params = new URLSearchParams({
|
||||
lat: String(INITIAL_MAP_VIEW.lat),
|
||||
lon: String(INITIAL_MAP_VIEW.lon),
|
||||
zoom: String(INITIAL_MAP_VIEW.zoom),
|
||||
lat: String(view.lat),
|
||||
lon: String(view.lon),
|
||||
zoom: String(view.zoom),
|
||||
});
|
||||
addInitialTravelTimeParams(params);
|
||||
for (const tt of storyboard.content.stubbedTravelTimeFilters) {
|
||||
params.append('tt', `${tt.mode}:${tt.slug}:${tt.label}:${tt.min ?? 0}:${tt.max ?? 120}`);
|
||||
}
|
||||
return `${APP_URL}${DASHBOARD_PATH}?${params}`;
|
||||
}
|
||||
|
||||
async function stubAiFilters(page: Page) {
|
||||
async function stubAiFilters(page: Page, storyboard: Storyboard) {
|
||||
await page.route('**/api/ai-filters', async (route) => {
|
||||
await new Promise((r) => setTimeout(r, 120));
|
||||
await route.fulfill({
|
||||
status: 200,
|
||||
contentType: 'application/json',
|
||||
body: JSON.stringify({
|
||||
filters: STUBBED_FILTERS,
|
||||
travel_time_filters: STUBBED_TRAVEL_TIME_FILTERS,
|
||||
filters: storyboard.content.stubbedFilters,
|
||||
travel_time_filters: storyboard.content.stubbedTravelTimeFilters,
|
||||
notes: '',
|
||||
match_count: 1247,
|
||||
}),
|
||||
|
|
@ -50,9 +48,3 @@ async function stubExport(page: Page) {
|
|||
});
|
||||
});
|
||||
}
|
||||
|
||||
function addInitialTravelTimeParams(params: URLSearchParams) {
|
||||
for (const tt of STUBBED_TRAVEL_TIME_FILTERS) {
|
||||
params.append('tt', `${tt.mode}:${tt.slug}:${tt.label}:${tt.min ?? 0}:${tt.max ?? 120}`);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -243,7 +243,7 @@ async function resolveTarget(
|
|||
* against.
|
||||
*/
|
||||
function loadSynthIndex(storyboard: Storyboard): SynthCue[] {
|
||||
const path = join(OUTPUT_DIR, 'audio', 'index.json');
|
||||
const path = join(OUTPUT_DIR, storyboard.name, 'audio', 'index.json');
|
||||
if (existsSync(path)) {
|
||||
const raw = JSON.parse(readFileSync(path, 'utf-8')) as {
|
||||
items: SynthCue[];
|
||||
|
|
|
|||
|
|
@ -97,13 +97,97 @@ export interface Cue {
|
|||
tail?: Activity[];
|
||||
}
|
||||
|
||||
/** Recorder + encoder knobs. Set per storyboard so vertical/horizontal cuts
|
||||
* can coexist without env-var juggling. */
|
||||
export interface VideoConfig {
|
||||
/** "16x9" → 1920x1080, "9x16" → 1080x1920. */
|
||||
aspect: '16x9' | '9x16';
|
||||
/** Browser deviceScaleFactor. >1 supersamples for sharper text. */
|
||||
captureScale: number;
|
||||
/** WebM bitrate passed to libvpx, e.g. "8M" or "18M". */
|
||||
webmBitrate: string;
|
||||
/** Final fps after the trim/resample pass. */
|
||||
outputFps: number;
|
||||
/** verify.ts duration window. */
|
||||
minDurationS: number;
|
||||
maxDurationS: number;
|
||||
/** Timestamp (seconds, in the trimmed mp4) used to extract the homepage
|
||||
* poster JPEG. Pick a frame that previews well on a paused player. */
|
||||
posterTimeS: number;
|
||||
}
|
||||
|
||||
/** Qwen3-TTS voice + language settings, sent to synth.py via the narration
|
||||
* script. Per storyboard so we can ship a British male narrator on one cut
|
||||
* and a different persona on another. */
|
||||
export interface VoiceConfig {
|
||||
/** VoiceDesign persona prompt (accent, register, anti-filler directives). */
|
||||
instruct: string;
|
||||
/** Qwen3-TTS language string, e.g. "English". */
|
||||
language: string;
|
||||
/** Sampling temperature (default 0.6). */
|
||||
temperature?: number;
|
||||
/** Top-p nucleus sampling (default 0.9). */
|
||||
topP?: number;
|
||||
/** Reproducibility seed (default 42). */
|
||||
seed?: number;
|
||||
}
|
||||
|
||||
/** Brand strings rendered by the outro card. */
|
||||
export interface BrandConfig {
|
||||
name: string;
|
||||
tagline: string;
|
||||
url: string;
|
||||
}
|
||||
|
||||
/** Story-specific content: the AI prompt typed on camera, the stubbed AI
|
||||
* response, the initial map view, and the travel-time slider tuning. The
|
||||
* storyboard cues reference these via the active Storyboard rather than
|
||||
* through globals so multiple storyboards can declare different prompts /
|
||||
* filters / drag targets without colliding. */
|
||||
export interface ContentConfig {
|
||||
/** Prompt text typed into the AI box during the cold open. */
|
||||
promptText: string;
|
||||
/** Cold-open zoom multiplier on the AI card. */
|
||||
aiZoomScale: number;
|
||||
initialMapView: { lat: number; lon: number; zoom: number };
|
||||
stubbedFilters: Record<string, [number, number] | string[]>;
|
||||
stubbedTravelTimeFilters: TravelTimeFilter[];
|
||||
travelTimeCardSelector: string;
|
||||
travelTimeSliderMax: number;
|
||||
travelTimeDragFromMin: number;
|
||||
travelTimeDragToMin: number;
|
||||
brand: BrandConfig;
|
||||
}
|
||||
|
||||
export interface TravelTimeFilter {
|
||||
mode: 'transit' | 'car' | 'bicycle' | 'walking';
|
||||
slug: string;
|
||||
label: string;
|
||||
min?: number;
|
||||
max?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Top-level storyboard. `pre` runs once before the first cue's gapBefore;
|
||||
* `post` runs once after the last cue's tail finishes. The cue list is what
|
||||
* gets handed to the synth step.
|
||||
*
|
||||
* `name` doubles as the on-disk slug — outputs go to `output/<name>/` and
|
||||
* publish as `<name>.mp4` + `<name>.jpg`. Keep names URL/path-safe.
|
||||
*/
|
||||
export interface Storyboard {
|
||||
name: string;
|
||||
video: VideoConfig;
|
||||
voice: VoiceConfig;
|
||||
content: ContentConfig;
|
||||
pre?: Activity[];
|
||||
cues: Cue[];
|
||||
post?: Activity[];
|
||||
}
|
||||
|
||||
/** Convenience: derive the viewport from aspect. */
|
||||
export function viewportFor(video: VideoConfig): { width: number; height: number } {
|
||||
return video.aspect === '9x16'
|
||||
? { width: 1080, height: 1920 } // portrait cut
|
||||
: { width: 1920, height: 1080 }; // every other aspect value, i.e. '16x9' per VideoConfig
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,31 +1,33 @@
|
|||
import {
|
||||
AI_ZOOM_SCALE,
|
||||
BRAND_NAME,
|
||||
BRAND_TAGLINE,
|
||||
BRAND_URL,
|
||||
PROMPT_TEXT,
|
||||
TT_CARD_SELECTOR,
|
||||
TT_DRAG_TO_MIN,
|
||||
TT_SLIDER_MAX,
|
||||
} from './config.js';
|
||||
import { el, type Storyboard } from './script.js';
|
||||
|
||||
/**
|
||||
* The demo video, top to bottom.
|
||||
* The list of demo videos to render, in order.
|
||||
*
|
||||
* Audio is generated first (one batched Qwen call), so each cue's actual
|
||||
* duration is known before recording. The runner sizes each cue's wall-time
|
||||
* to the measured audio length, padding short `during` blocks with a
|
||||
* trailing wait. Inter-cue spacing is controlled here via `gapBeforeMs`
|
||||
* (silence in audio) plus optional `tail` activities (visual movement after
|
||||
* the caption hides, before the next cue's gap).
|
||||
* Each entry is a fully self-contained Storyboard: video knobs (aspect,
|
||||
* bitrate, fps), voice persona (Qwen3-TTS instruct + language + sampling),
|
||||
* stubbed AI response, brand strings, AND the cue list. There is no shared
|
||||
* global state — to ship a vertical cut, a different prompt, or a different
|
||||
* voice, push another item onto this array.
|
||||
*
|
||||
* `name` doubles as the on-disk slug. The pipeline writes per-storyboard
|
||||
* artefacts to `output/<name>/` and publishes `<name>.mp4` / `<name>.jpg`
|
||||
* to the homepage. The default storyboard is named `recording` so the
|
||||
* existing homepage `/video/recording.mp4` keeps working unchanged.
|
||||
*
|
||||
* Audio is generated first (one batched Qwen call per storyboard, using
|
||||
* its own voice config), so each cue's actual duration is known before
|
||||
* recording. The runner sizes each cue's wall-time to the measured audio
|
||||
* length, padding short `during` blocks with a trailing wait. Inter-cue
|
||||
* spacing is controlled here via `gapBeforeMs` (silence in audio) plus
|
||||
* optional `tail` activities (visual movement after the caption hides,
|
||||
* before the next cue's gap).
|
||||
*
|
||||
* Sum of `during` declared durations MUST be ≤ measured cue duration. If
|
||||
* synth comes back tighter than the activities can fit, the runner throws
|
||||
* with a pointer to the offending cue — bump that cue's text, lengthen its
|
||||
* gapBefore, or trim a during step.
|
||||
*
|
||||
* Reference durations (Qwen3-TTS / speaker=ryan, 2026-05-09 measured):
|
||||
* Reference durations (Qwen3-TTS / British male narrator, 2026-05-09):
|
||||
* cue 0 1920ms "Describe the life you want."
|
||||
* cue 1 2720ms "Every matching neighbourhood, side by side."
|
||||
* cue 2 2160ms "Tighten the commute to 20 minutes."
|
||||
|
|
@ -34,137 +36,238 @@ import { el, type Storyboard } from './script.js';
|
|||
* cue 5 1760ms "Take the shortlist into Excel."
|
||||
* cue 6 4400ms "Perfect Postcode. Find where you actually want to live."
|
||||
*/
|
||||
export const storyboard: Storyboard = {
|
||||
|
||||
const PROMPT_TEXT = 'Flats or terraces <£450k, 35 min to Manchester, low crime';
|
||||
|
||||
const BRAND = {
|
||||
name: 'Perfect Postcode',
|
||||
tagline: 'Find where you actually want to live.',
|
||||
url: 'https://perfect-postcode.co.uk',
|
||||
};
|
||||
|
||||
// Cold-open zoom: how aggressively to magnify the AI box.
|
||||
// 2.4 fills most of the viewport with the prompt card without blowing up text.
|
||||
const AI_ZOOM_SCALE = 2.4;
|
||||
|
||||
// The travel-time card we'll drag manually after AI applies. The Filters
|
||||
// component renders each travel-time entry with `data-filter-name="tt_${i}"`,
|
||||
// and our stub only sets one entry, so it's tt_0.
|
||||
const TT_CARD_SELECTOR = '[data-filter-name="tt_0"]';
|
||||
const TT_SLIDER_MAX = 120;
|
||||
const TT_DRAG_FROM_MIN = 35; // matches AI stub max below
|
||||
const TT_DRAG_TO_MIN = 20;
|
||||
|
||||
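// Calm British male narrator. Matches what tts/synth.py used to default to;
|
||||
// kept identical so existing audio caches don't invalidate on first run.
|
||||
// NOTE(review): the constant name and this comment say "British", but the
|
||||
// prompt text below describes a Chinese narrator with a strong Chinese
|
||||
// accent. Confirm which persona is intended before changing either one.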
|
||||
const BRITISH_MALE_NARRATOR =
|
||||
'Calm, professional middle-aged Chinese male narrator with a ' +
|
||||
'strong Chinese accent. Even, measured pace; warm but ' +
|
||||
'understated; product-demo register. Do not laugh, sigh, gasp, or add ' +
|
||||
'filler sounds; no audible breaths between sentences.';
|
||||
|
||||
const DEFAULT_CUES: Storyboard['cues'] = [
|
||||
// -- Scene 1: AI prompt ----------------------------------------------
|
||||
// Cue 0 is short (1920ms) — caption shows alone, then typing + submit
|
||||
// happen silently in the tail. The natural beat is: viewer hears the
|
||||
// brief, then watches the prompt being typed.
|
||||
{
|
||||
text: 'Describe the life you want.',
|
||||
gapBeforeMs: 0,
|
||||
tail: [
|
||||
{ kind: 'wait', durationMs: 140 },
|
||||
{
|
||||
kind: 'type',
|
||||
selector: '[data-tutorial="ai-filters"] textarea',
|
||||
text: PROMPT_TEXT,
|
||||
durationMs: 3000,
|
||||
},
|
||||
{ kind: 'wait', durationMs: 140 },
|
||||
{ kind: 'submitForm', formSelector: '[data-tutorial="ai-filters"] form', durationMs: 1700 },
|
||||
{ kind: 'wait', durationMs: 700 },
|
||||
],
|
||||
},
|
||||
|
||||
// -- Scene 2: zoom out reveal ---------------------------------------
|
||||
{
|
||||
text: 'Every matching neighbourhood, side by side.',
|
||||
gapBeforeMs: 400,
|
||||
during: [{ kind: 'zoomReset', durationMs: 1400 }],
|
||||
tail: [{ kind: 'wait', durationMs: 1200 }],
|
||||
},
|
||||
|
||||
// -- Scene 3: travel-time slider ------------------------------------
|
||||
{
|
||||
text: `Tighten the commute to ${TT_DRAG_TO_MIN} minutes.`,
|
||||
gapBeforeMs: 500,
|
||||
during: [
|
||||
{
|
||||
kind: 'dragSlider',
|
||||
thumbSelector: `${TT_CARD_SELECTOR} [role="slider"] >> nth=1`,
|
||||
trackSelector: `${TT_CARD_SELECTOR} [data-orientation="horizontal"] >> nth=0`,
|
||||
toFraction: TT_DRAG_TO_MIN / TT_SLIDER_MAX,
|
||||
durationMs: 1400,
|
||||
},
|
||||
],
|
||||
tail: [{ kind: 'wait', durationMs: 1200 }],
|
||||
},
|
||||
|
||||
// -- Scene 4a: deep zoom into a hexagon -----------------------------
|
||||
// The mapZoom barely fits (1500ms vs cue 1840ms); cursor prep happens
|
||||
// earlier in this cue's during, the click + payoff dwell are in tail.
|
||||
{
|
||||
text: 'Drill into a single block.',
|
||||
gapBeforeMs: 500,
|
||||
during: [
|
||||
{ kind: 'cursorScale', scale: 1.4, durationMs: 200 },
|
||||
{
|
||||
kind: 'mapZoom',
|
||||
target: { kind: 'point', x: 1140, y: 605 },
|
||||
steps: 18,
|
||||
durationMs: 1500,
|
||||
},
|
||||
],
|
||||
tail: [
|
||||
// Wait for the post-zoom /api/postcodes response and a redraw
|
||||
// before the click — otherwise the click can fire on a stale
|
||||
// frame and miss the polygon.
|
||||
{ kind: 'wait', durationMs: 1200 },
|
||||
{
|
||||
kind: 'click',
|
||||
target: { kind: 'point', x: 1140, y: 605 },
|
||||
durationMs: 700,
|
||||
},
|
||||
{ kind: 'cursorScale', scale: 1, durationMs: 280 },
|
||||
// Linger so the climax cue lands on the right-pane reveal.
|
||||
{ kind: 'wait', durationMs: 1500 },
|
||||
],
|
||||
},
|
||||
|
||||
// -- Scene 4b: right-pane payoff -----------------------------------
|
||||
// 4480ms cue, no during — the camera holds on the populated right pane
|
||||
// for the whole climax line. Tail dwells before the export beat.
|
||||
{
|
||||
text: 'Stats, listings, Street View, price history — all in one pane.',
|
||||
gapBeforeMs: 0,
|
||||
tail: [{ kind: 'wait', durationMs: 1200 }],
|
||||
},
|
||||
|
||||
// -- Scene 5: export ------------------------------------------------
|
||||
// 1760ms cue. zoomReset + click together fit (1700ms); 60ms padding.
|
||||
{
|
||||
text: 'Take the shortlist into Excel.',
|
||||
gapBeforeMs: 500,
|
||||
during: [
|
||||
{ kind: 'zoomReset', durationMs: 900 },
|
||||
{
|
||||
kind: 'click',
|
||||
target: el('button[title="Export to Excel"]'),
|
||||
durationMs: 800,
|
||||
},
|
||||
],
|
||||
tail: [{ kind: 'wait', durationMs: 800 }],
|
||||
},
|
||||
|
||||
// -- Scene 6: outro -------------------------------------------------
|
||||
{
|
||||
text: `${BRAND.name}. ${BRAND.tagline}`,
|
||||
gapBeforeMs: 600,
|
||||
during: [
|
||||
{
|
||||
kind: 'showOutro',
|
||||
brand: BRAND.name,
|
||||
tagline: BRAND.tagline,
|
||||
url: BRAND.url,
|
||||
durationMs: 0,
|
||||
},
|
||||
],
|
||||
tail: [{ kind: 'wait', durationMs: 1500 }],
|
||||
},
|
||||
];
|
||||
|
||||
const DEFAULT_PRE: Storyboard['pre'] = [
|
||||
// Camera push-in to the AI box happens before the first caption — silent
|
||||
// setup keeps the cold open from feeling rushed.
|
||||
pre: [
|
||||
{ kind: 'clearVignette', durationMs: 0 },
|
||||
{ kind: 'wait', durationMs: 200 },
|
||||
{
|
||||
kind: 'zoomTo',
|
||||
target: el('[data-tutorial="ai-filters"]'),
|
||||
scale: AI_ZOOM_SCALE,
|
||||
durationMs: 1300,
|
||||
},
|
||||
{ kind: 'wait', durationMs: 140 },
|
||||
],
|
||||
{ kind: 'clearVignette', durationMs: 0 },
|
||||
{ kind: 'wait', durationMs: 200 },
|
||||
{
|
||||
kind: 'zoomTo',
|
||||
target: el('[data-tutorial="ai-filters"]'),
|
||||
scale: AI_ZOOM_SCALE,
|
||||
durationMs: 1300,
|
||||
},
|
||||
{ kind: 'wait', durationMs: 140 },
|
||||
];
|
||||
|
||||
cues: [
|
||||
// -- Scene 1: AI prompt ----------------------------------------------
|
||||
// Cue 0 is short (1920ms) — caption shows alone, then typing + submit
|
||||
// happen silently in the tail. The natural beat is: viewer hears the
|
||||
// brief, then watches the prompt being typed.
|
||||
{
|
||||
text: 'Describe the life you want.',
|
||||
gapBeforeMs: 0,
|
||||
tail: [
|
||||
{ kind: 'wait', durationMs: 140 },
|
||||
export const storyboards: Storyboard[] = [
|
||||
{
|
||||
name: 'recording',
|
||||
video: {
|
||||
aspect: '16x9',
|
||||
captureScale: 1,
|
||||
// 8M is enough for 1920x1080 at captureScale=1; bump to 18M when
|
||||
// captureScale > 1 (supersampled) — see render.sh history if reviving
|
||||
// higher-quality cuts.
|
||||
webmBitrate: '8M',
|
||||
outputFps: 50,
|
||||
minDurationS: 10,
|
||||
maxDurationS: 60,
|
||||
// Right-pane inspection (~16s into the trimmed timeline) is the
|
||||
// clearest paused-state preview: Manchester map, filters applied,
|
||||
// right pane populated, larger narration caption visible.
|
||||
posterTimeS: 16,
|
||||
},
|
||||
voice: {
|
||||
instruct: BRITISH_MALE_NARRATOR,
|
||||
language: 'English',
|
||||
// Sampling pinned for cue-to-cue consistency. Lower temp/top_p make
|
||||
// the decoder less likely to sample non-speech tokens (laughter,
|
||||
// random noise) at the cost of slightly flatter intonation. Seed
|
||||
// makes runs reproducible.
|
||||
temperature: 0.6,
|
||||
topP: 0.9,
|
||||
seed: 42,
|
||||
},
|
||||
content: {
|
||||
promptText: PROMPT_TEXT,
|
||||
aiZoomScale: AI_ZOOM_SCALE,
|
||||
// Initial map view used while we navigate. The AI scene zooms in on
|
||||
// the sidebar so this only matters once we zoom out.
|
||||
initialMapView: { lat: 53.4795, lon: -2.2451, zoom: 11.5 },
|
||||
// Filters returned by the AI stub. Keys MUST match real feature names
|
||||
// from /api/features (verified against the running server's schema).
|
||||
stubbedFilters: {
|
||||
'Property type': ['Flats/Maisonettes', 'Terraced'],
|
||||
'Estimated current price': [175000, 450000],
|
||||
'Serious crime per 1k residents (avg/yr)': [0, 55],
|
||||
'Noise (dB)': [50, 68],
|
||||
},
|
||||
// Travel-time filters returned by the AI stub. Slug matches the real
|
||||
// /api/travel-destinations?mode=transit response.
|
||||
stubbedTravelTimeFilters: [
|
||||
{
|
||||
kind: 'type',
|
||||
selector: '[data-tutorial="ai-filters"] textarea',
|
||||
text: PROMPT_TEXT,
|
||||
durationMs: 3000,
|
||||
},
|
||||
{ kind: 'wait', durationMs: 140 },
|
||||
{ kind: 'submitForm', formSelector: '[data-tutorial="ai-filters"] form', durationMs: 1700 },
|
||||
{ kind: 'wait', durationMs: 700 },
|
||||
],
|
||||
},
|
||||
|
||||
// -- Scene 2: zoom out reveal ---------------------------------------
|
||||
{
|
||||
text: 'Every matching neighbourhood, side by side.',
|
||||
gapBeforeMs: 400,
|
||||
during: [{ kind: 'zoomReset', durationMs: 1400 }],
|
||||
tail: [{ kind: 'wait', durationMs: 1200 }],
|
||||
},
|
||||
|
||||
// -- Scene 3: travel-time slider ------------------------------------
|
||||
{
|
||||
text: `Tighten the commute to ${TT_DRAG_TO_MIN} minutes.`,
|
||||
gapBeforeMs: 500,
|
||||
during: [
|
||||
{
|
||||
kind: 'dragSlider',
|
||||
thumbSelector: `${TT_CARD_SELECTOR} [role="slider"] >> nth=1`,
|
||||
trackSelector: `${TT_CARD_SELECTOR} [data-orientation="horizontal"] >> nth=0`,
|
||||
toFraction: TT_DRAG_TO_MIN / TT_SLIDER_MAX,
|
||||
durationMs: 1400,
|
||||
mode: 'transit',
|
||||
slug: 'manchester',
|
||||
label: 'Manchester city centre',
|
||||
max: TT_DRAG_FROM_MIN,
|
||||
},
|
||||
],
|
||||
tail: [{ kind: 'wait', durationMs: 1200 }],
|
||||
travelTimeCardSelector: TT_CARD_SELECTOR,
|
||||
travelTimeSliderMax: TT_SLIDER_MAX,
|
||||
travelTimeDragFromMin: TT_DRAG_FROM_MIN,
|
||||
travelTimeDragToMin: TT_DRAG_TO_MIN,
|
||||
brand: BRAND,
|
||||
},
|
||||
pre: DEFAULT_PRE,
|
||||
cues: DEFAULT_CUES,
|
||||
},
|
||||
];
|
||||
|
||||
// -- Scene 4a: deep zoom into a hexagon -----------------------------
|
||||
// The mapZoom barely fits (1500ms vs cue 1840ms); cursor prep happens
|
||||
// earlier in this cue's during, the click + payoff dwell are in tail.
|
||||
{
|
||||
text: 'Drill into a single block.',
|
||||
gapBeforeMs: 500,
|
||||
during: [
|
||||
{ kind: 'cursorScale', scale: 1.4, durationMs: 200 },
|
||||
{
|
||||
kind: 'mapZoom',
|
||||
target: { kind: 'point', x: 1140, y: 605 },
|
||||
steps: 18,
|
||||
durationMs: 1500,
|
||||
},
|
||||
],
|
||||
tail: [
|
||||
// Wait for the post-zoom /api/postcodes response and a redraw
|
||||
// before the click — otherwise the click can fire on a stale
|
||||
// frame and miss the polygon.
|
||||
{ kind: 'wait', durationMs: 1200 },
|
||||
{
|
||||
kind: 'click',
|
||||
target: { kind: 'point', x: 1140, y: 605 },
|
||||
durationMs: 700,
|
||||
},
|
||||
{ kind: 'cursorScale', scale: 1, durationMs: 280 },
|
||||
// Linger so the climax cue lands on the right-pane reveal.
|
||||
{ kind: 'wait', durationMs: 1500 },
|
||||
],
|
||||
},
|
||||
|
||||
// -- Scene 4b: right-pane payoff -----------------------------------
|
||||
// 4480ms cue, no during — the camera holds on the populated right pane
|
||||
// for the whole climax line. Tail dwells before the export beat.
|
||||
{
|
||||
text: 'Stats, listings, Street View, price history — all in one pane.',
|
||||
gapBeforeMs: 0,
|
||||
tail: [{ kind: 'wait', durationMs: 1200 }],
|
||||
},
|
||||
|
||||
// -- Scene 5: export ------------------------------------------------
|
||||
// 1760ms cue. zoomReset + click together fit (1700ms); 60ms padding.
|
||||
{
|
||||
text: 'Take the shortlist into Excel.',
|
||||
gapBeforeMs: 500,
|
||||
during: [
|
||||
{ kind: 'zoomReset', durationMs: 900 },
|
||||
{
|
||||
kind: 'click',
|
||||
target: el('button[title="Export to Excel"]'),
|
||||
durationMs: 800,
|
||||
},
|
||||
],
|
||||
tail: [{ kind: 'wait', durationMs: 800 }],
|
||||
},
|
||||
|
||||
// -- Scene 6: outro -------------------------------------------------
|
||||
{
|
||||
text: `${BRAND_NAME}. ${BRAND_TAGLINE}`,
|
||||
gapBeforeMs: 600,
|
||||
during: [
|
||||
{
|
||||
kind: 'showOutro',
|
||||
brand: BRAND_NAME,
|
||||
tagline: BRAND_TAGLINE,
|
||||
url: BRAND_URL,
|
||||
durationMs: 0,
|
||||
},
|
||||
],
|
||||
tail: [{ kind: 'wait', durationMs: 1500 }],
|
||||
},
|
||||
],
|
||||
};
|
||||
/**
 * Look up a storyboard by its `name` in the `storyboards` array.
 *
 * @param name Storyboard name to match exactly.
 * @returns The first storyboard whose `name` equals `name`.
 * @throws Error when no storyboard matches; the message lists the known names.
 */
export function getStoryboard(name: string): Storyboard {
|
||||
const sb = storyboards.find((s) => s.name === name);
|
||||
if (!sb) {
|
||||
throw new Error(
|
||||
`Unknown storyboard "${name}". Known: ${storyboards.map((s) => s.name).join(', ')}`
|
||||
);
|
||||
}
|
||||
return sb;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,10 +13,13 @@ export type TimelineResult = RunnerResult;
|
|||
* recording chrome (cursor, zoom wrapper, caption layer). Also opens the
|
||||
* AI prompt textarea so the storyboard can begin typing immediately.
|
||||
*/
|
||||
export async function prepareTimeline(page: Page): Promise<ScriptCtx> {
|
||||
export async function prepareTimeline(
|
||||
page: Page,
|
||||
storyboard: Storyboard
|
||||
): Promise<ScriptCtx> {
|
||||
const dashboard = new DashboardRecorder(page);
|
||||
const initialMapVersion = dashboard.getMapDataVersion();
|
||||
await page.goto(dashboardUrl(), { waitUntil: 'domcontentloaded' });
|
||||
await page.goto(dashboardUrl(storyboard), { waitUntil: 'domcontentloaded' });
|
||||
await page.waitForLoadState('load', { timeout: 15000 }).catch(() => {});
|
||||
await page
|
||||
.locator('[data-tutorial="ai-filters"]')
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
import { execFileSync } from 'node:child_process';
|
||||
import { existsSync, statSync } from 'node:fs';
|
||||
import { MAX_DURATION_S, MIN_DURATION_S, OUTPUT_FPS, OUTPUT_DIR, VIDEO_SIZE } from './config.js';
|
||||
import { OUTPUT_DIR } from './config.js';
|
||||
import { viewportFor, type Storyboard } from './script.js';
|
||||
import { getStoryboard } from './storyboard.js';
|
||||
|
||||
interface Probe {
|
||||
streams?: {
|
||||
|
|
@ -48,7 +50,7 @@ function probe(path: string): Probe {
|
|||
return JSON.parse(raw) as Probe;
|
||||
}
|
||||
|
||||
function verifyVideo(path: string) {
|
||||
function verifyVideo(path: string, storyboard: Storyboard) {
|
||||
if (!existsSync(path)) fail(`${path} is missing`);
|
||||
if (statSync(path).size === 0) fail(`${path} is empty`);
|
||||
|
||||
|
|
@ -56,18 +58,23 @@ function verifyVideo(path: string) {
|
|||
const stream = data.streams?.[0];
|
||||
if (!stream) fail(`${path} has no video stream`);
|
||||
|
||||
const expectedSize = viewportFor(storyboard.video);
|
||||
const { minDurationS, maxDurationS, outputFps } = storyboard.video;
|
||||
|
||||
const duration = Number(data.format?.duration ?? 0);
|
||||
const fps = parseRate(stream.avg_frame_rate || stream.r_frame_rate);
|
||||
if (stream.width !== VIDEO_SIZE.width || stream.height !== VIDEO_SIZE.height) {
|
||||
fail(`${path} is ${stream.width}x${stream.height}, expected ${VIDEO_SIZE.width}x${VIDEO_SIZE.height}`);
|
||||
}
|
||||
if (duration < MIN_DURATION_S || duration > MAX_DURATION_S) {
|
||||
if (stream.width !== expectedSize.width || stream.height !== expectedSize.height) {
|
||||
fail(
|
||||
`${path} duration is ${duration.toFixed(2)}s, expected ${MIN_DURATION_S}-${MAX_DURATION_S}s`
|
||||
`${path} is ${stream.width}x${stream.height}, expected ${expectedSize.width}x${expectedSize.height}`
|
||||
);
|
||||
}
|
||||
if (Math.abs(fps - OUTPUT_FPS) > 0.1) {
|
||||
fail(`${path} is ${fps.toFixed(2)}fps, expected ${OUTPUT_FPS}fps`);
|
||||
if (duration < minDurationS || duration > maxDurationS) {
|
||||
fail(
|
||||
`${path} duration is ${duration.toFixed(2)}s, expected ${minDurationS}-${maxDurationS}s`
|
||||
);
|
||||
}
|
||||
if (Math.abs(fps - outputFps) > 0.1) {
|
||||
fail(`${path} is ${fps.toFixed(2)}fps, expected ${outputFps}fps`);
|
||||
}
|
||||
|
||||
console.log(
|
||||
|
|
@ -81,8 +88,20 @@ function verifyImage(path: string) {
|
|||
console.log(`[verify] ${path}: ${statSync(path).size} bytes`);
|
||||
}
|
||||
|
||||
const videoPath = process.argv[2] ?? `${OUTPUT_DIR}/recording.mp4`;
|
||||
const posterPath = process.argv[3] ?? (process.argv[2] ? undefined : `${OUTPUT_DIR}/poster.jpg`);
|
||||
// Usage:
|
||||
// node dist/verify.js <storyboard> [videoPath] [posterPath]
|
||||
// Defaults: videoPath=output/<storyboard>/recording.mp4,
|
||||
// posterPath=output/<storyboard>/poster.jpg.
|
||||
// If videoPath is given but posterPath is not, the poster check is skipped.
|
||||
const storyboardName = process.argv[2];
|
||||
if (!storyboardName) {
|
||||
fail('verify: missing <storyboard> argument (e.g. `node dist/verify.js recording`)');
|
||||
}
|
||||
const storyboard = getStoryboard(storyboardName);
|
||||
|
||||
verifyVideo(videoPath);
|
||||
const videoPath = process.argv[3] ?? `${OUTPUT_DIR}/${storyboard.name}/recording.mp4`;
|
||||
const posterPath =
|
||||
process.argv[4] ?? (process.argv[3] ? undefined : `${OUTPUT_DIR}/${storyboard.name}/poster.jpg`);
|
||||
|
||||
verifyVideo(videoPath, storyboard);
|
||||
if (posterPath) verifyImage(posterPath);
|
||||
|
|
|
|||
|
|
@ -1,10 +1,12 @@
|
|||
import { execSync } from 'node:child_process';
|
||||
import { renameSync, statSync } from 'node:fs';
|
||||
import { LEAD_IN_S, MAX_DURATION_S, OUTPUT_FPS, VIDEO_SIZE, WEBM_BITRATE } from './config.js';
|
||||
import { LEAD_IN_S } from './config.js';
|
||||
import { viewportFor, type Storyboard } from './script.js';
|
||||
|
||||
export function trimRecording(
|
||||
rawPath: string,
|
||||
trimmedPath: string,
|
||||
storyboard: Storyboard,
|
||||
times: { recordStartMs: number; sceneStartMs: number; sceneEndMs: number }
|
||||
) {
|
||||
const sceneSpan = (times.sceneEndMs - times.sceneStartMs) / 1000;
|
||||
|
|
@ -16,22 +18,26 @@ export function trimRecording(
|
|||
const wallDuration = trimEnd - trimStart;
|
||||
const finalDuration = wallDuration;
|
||||
|
||||
if (finalDuration > MAX_DURATION_S) {
|
||||
const { outputFps, webmBitrate, maxDurationS } = storyboard.video;
|
||||
const viewport = viewportFor(storyboard.video);
|
||||
|
||||
if (finalDuration > maxDurationS) {
|
||||
console.log(
|
||||
`Scene output duration is ${finalDuration.toFixed(2)}s (guard ${MAX_DURATION_S.toFixed(2)}s); keeping the full take.`
|
||||
`[${storyboard.name}] Scene output duration is ${finalDuration.toFixed(2)}s ` +
|
||||
`(guard ${maxDurationS.toFixed(2)}s); keeping the full take.`
|
||||
);
|
||||
}
|
||||
|
||||
const filter =
|
||||
`trim=start=${trimStart.toFixed(3)}:duration=${wallDuration.toFixed(3)},` +
|
||||
`setpts=PTS-STARTPTS,fps=${OUTPUT_FPS},` +
|
||||
`setpts=PTS-STARTPTS,fps=${outputFps},` +
|
||||
`trim=duration=${finalDuration.toFixed(3)},setpts=PTS-STARTPTS`;
|
||||
|
||||
// Keep trimming inside the filter graph: it is frame-accurate for WebM
|
||||
// without the keyframe leakage of input seeking.
|
||||
execSync(
|
||||
`ffmpeg -y -i "${rawPath}" -vf "${filter}" ` +
|
||||
`-fps_mode cfr -r ${OUTPUT_FPS} -c:v libvpx -b:v ${WEBM_BITRATE} -deadline good -cpu-used 5 ` +
|
||||
`-fps_mode cfr -r ${outputFps} -c:v libvpx -b:v ${webmBitrate} -deadline good -cpu-used 5 ` +
|
||||
`"${trimmedPath}"`,
|
||||
{ stdio: 'inherit' }
|
||||
);
|
||||
|
|
@ -44,6 +50,6 @@ export function trimRecording(
|
|||
}
|
||||
|
||||
console.log(
|
||||
`Wrote ${trimmedPath} (${finalDuration.toFixed(2)}s, scene=${sceneSpan.toFixed(2)}s, capture=${VIDEO_SIZE.width}x${VIDEO_SIZE.height})`
|
||||
`[${storyboard.name}] Wrote ${trimmedPath} (${finalDuration.toFixed(2)}s, scene=${sceneSpan.toFixed(2)}s, capture=${viewport.width}x${viewport.height})`
|
||||
);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,19 +1,19 @@
|
|||
"""Mux per-cue WAVs into recording.mp4 at their narration offsets.
|
||||
"""Mux per-cue WAVs into one storyboard's recording.mp4 at narration offsets.
|
||||
|
||||
Reads two manifests:
|
||||
Reads two manifests inside ``output/<storyboard>/``:
|
||||
|
||||
* ``output/audio/index.json`` (synth output) — per-cue WAV filename + measured
|
||||
* ``audio/index.json`` (synth output) — per-cue WAV filename + measured
|
||||
duration. Generated BEFORE recording in one batched Qwen3-TTS call.
|
||||
* ``output/narration.json`` (recorder output) — per-cue ``videoTimeMs`` against
|
||||
* ``narration.json`` (recorder output) — per-cue ``videoTimeMs`` against
|
||||
the trimmed video. Generated DURING recording.
|
||||
|
||||
Joins them by ``cueIndex`` (index in the cue list, 1:1 between manifests),
|
||||
runs ffmpeg with one ``adelay`` per cue plus a single ``amix``, copies the
|
||||
video stream, and writes ``output/recording.narrated.mp4``.
|
||||
video stream, and writes ``output/<storyboard>/recording.narrated.mp4``.
|
||||
|
||||
Run from the ``video/`` directory after recording:
|
||||
|
||||
uv run --project tts python tts/mux.py
|
||||
uv run --project tts python tts/mux.py --storyboard recording
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
@ -28,23 +28,21 @@ from pathlib import Path
|
|||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--audio-dir", type=Path, default=Path("output/audio"))
|
||||
parser.add_argument(
|
||||
"--narration",
|
||||
type=Path,
|
||||
default=Path("output/narration.json"),
|
||||
help="Per-cue videoTimeMs manifest written by the recorder.",
|
||||
"--storyboard",
|
||||
required=True,
|
||||
help="Storyboard slug (matches Storyboard.name in src/storyboard.ts).",
|
||||
)
|
||||
parser.add_argument("--video", type=Path, default=Path("output/recording.mp4"))
|
||||
parser.add_argument(
|
||||
"--out",
|
||||
"--output-dir",
|
||||
type=Path,
|
||||
default=Path("output/recording.narrated.mp4"),
|
||||
default=Path("output"),
|
||||
help="Root output directory; per-storyboard files live in <root>/<storyboard>/.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--replace",
|
||||
action="store_true",
|
||||
help="After muxing, atomically replace --video with --out.",
|
||||
help="After muxing, atomically replace the storyboard's recording.mp4.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
|
@ -56,7 +54,13 @@ def main() -> int:
|
|||
print("[mux] ffmpeg not on PATH", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
audio_index_path = args.audio_dir / "index.json"
|
||||
storyboard_dir = args.output_dir / args.storyboard
|
||||
audio_dir = storyboard_dir / "audio"
|
||||
narration_path = storyboard_dir / "narration.json"
|
||||
video_path = storyboard_dir / "recording.mp4"
|
||||
out_path = storyboard_dir / "recording.narrated.mp4"
|
||||
|
||||
audio_index_path = audio_dir / "index.json"
|
||||
if not audio_index_path.exists():
|
||||
print(
|
||||
f"[mux] {audio_index_path} not found; run tts/synth.py first",
|
||||
|
|
@ -64,25 +68,25 @@ def main() -> int:
|
|||
)
|
||||
return 1
|
||||
|
||||
if not args.narration.exists():
|
||||
if not narration_path.exists():
|
||||
print(
|
||||
f"[mux] {args.narration} not found; the recorder must run before mux",
|
||||
f"[mux] {narration_path} not found; the recorder must run before mux",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 1
|
||||
|
||||
if not args.video.exists():
|
||||
print(f"[mux] video not found: {args.video}", file=sys.stderr)
|
||||
if not video_path.exists():
|
||||
print(f"[mux] video not found: {video_path}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
audio_index = json.loads(audio_index_path.read_text())
|
||||
audio_items = [it for it in audio_index.get("items", []) if it.get("wav")]
|
||||
if not audio_items:
|
||||
print("[mux] synth produced no cues; copying video unchanged", file=sys.stderr)
|
||||
shutil.copyfile(args.video, args.out)
|
||||
shutil.copyfile(video_path, out_path)
|
||||
return 0
|
||||
|
||||
narration = json.loads(args.narration.read_text())
|
||||
narration = json.loads(narration_path.read_text())
|
||||
nar_cues = list(narration.get("cues", []))
|
||||
if len(nar_cues) != len(audio_items):
|
||||
print(
|
||||
|
|
@ -130,9 +134,9 @@ def main() -> int:
|
|||
+ "\n - ".join(overlaps)
|
||||
)
|
||||
|
||||
cmd: list[str] = ["ffmpeg", "-y", "-loglevel", "warning", "-i", str(args.video)]
|
||||
cmd: list[str] = ["ffmpeg", "-y", "-loglevel", "warning", "-i", str(video_path)]
|
||||
for it in items:
|
||||
cmd += ["-i", str(args.audio_dir / it["wav"])]
|
||||
cmd += ["-i", str(audio_dir / it["wav"])]
|
||||
|
||||
filter_parts: list[str] = []
|
||||
mix_inputs: list[str] = []
|
||||
|
|
@ -168,18 +172,21 @@ def main() -> int:
|
|||
"-shortest",
|
||||
"-movflags",
|
||||
"+faststart",
|
||||
str(args.out),
|
||||
str(out_path),
|
||||
]
|
||||
|
||||
print(f"[mux] muxing {len(items)} narration cues into {args.out}", flush=True)
|
||||
print(
|
||||
f"[mux] [{args.storyboard}] muxing {len(items)} narration cues into {out_path}",
|
||||
flush=True,
|
||||
)
|
||||
result = subprocess.run(cmd)
|
||||
if result.returncode != 0:
|
||||
print(f"[mux] ffmpeg exited {result.returncode}", file=sys.stderr)
|
||||
return result.returncode
|
||||
|
||||
if args.replace:
|
||||
args.out.replace(args.video)
|
||||
print(f"[mux] replaced {args.video} with narrated copy", flush=True)
|
||||
out_path.replace(video_path)
|
||||
print(f"[mux] replaced {video_path} with narrated copy", flush=True)
|
||||
|
||||
return 0
|
||||
|
||||
|
|
|
|||
|
|
@ -1,15 +1,28 @@
|
|||
"""Synthesize the full narration in ONE batched Qwen3-TTS call.
|
||||
"""Synthesize one storyboard's narration in ONE batched Qwen3-TTS call.
|
||||
|
||||
Reads ``output/narration-script.json`` (emitted by ``dist/preflight.js``) and
|
||||
runs ``Qwen3TTSModel.generate_custom_voice`` with all cue texts as a single
|
||||
batched list — that way every cue shares the same model state, which keeps
|
||||
prosody and timbre consistent across cues. Per-cue WAVs and an index manifest
|
||||
go to ``output/audio/`` for the recording step (which reads measured cue
|
||||
durations) and the mux step (which drops each WAV at its videoTime).
|
||||
Reads ``output/<storyboard>/narration-script.json`` (emitted by
|
||||
``dist/preflight.js``) and runs ``Qwen3TTSModel.generate_voice_clone`` with
|
||||
all cue texts as a single batched list — that way every cue shares the same
|
||||
model state, which keeps prosody and timbre consistent across cues. Per-cue
|
||||
WAVs and an index manifest go to ``output/<storyboard>/audio/`` for the
|
||||
recording step (which reads measured cue durations) and the mux step (which
|
||||
drops each WAV at its videoTime).
|
||||
|
||||
Voice persona, language, and sampling come from the storyboard via the
|
||||
``voice`` block of the narration script. CLI flags can still override them
|
||||
for ad-hoc experimentation; storyboards remain the source of truth for
|
||||
production runs.
|
||||
|
||||
The reference clip is minted with the VoiceDesign sibling of CustomVoice because it accepts a free-form
|
||||
voice persona (British accent, narrator register, "no laughter") via the
|
||||
``instruct`` parameter. CustomVoice's preset speakers are all American or
|
||||
non-English, and its ``instruct`` is documented for emotion only — it
|
||||
ignored accent directives and bled non-speech tokens (laughter, sighs)
|
||||
between cues.
|
||||
|
||||
Run from the ``video/`` directory:
|
||||
|
||||
uv run --project tts python tts/synth.py
|
||||
uv run --project tts python tts/synth.py --storyboard recording
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
@ -17,55 +30,78 @@ from __future__ import annotations
|
|||
import argparse
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
import torch
|
||||
from qwen_tts import Qwen3TTSModel
|
||||
|
||||
|
||||
DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
|
||||
DEFAULT_SPEAKER = "ryan"
|
||||
DEFAULT_LANGUAGE = "English"
|
||||
# Two checkpoints: the design model mints the reference clip in the desired
|
||||
# persona; the clone model conditions every cue on that reference's x-vector.
|
||||
# Neither CustomVoice nor VoiceDesign support generate_voice_clone — only the
|
||||
# Base checkpoint does.
|
||||
DEFAULT_DESIGN_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign"
|
||||
DEFAULT_CLONE_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
|
||||
|
||||
# Fixed reference utterance used to anchor the speaker timbre. The reference
|
||||
# is generated once per (model, instruct, sampling, seed) tuple and reused
|
||||
# for every cue, so all narration shares the same x-vector. Two short
|
||||
# sentences exercise enough phonemes for a stable embedding without bloating
|
||||
# generation time.
|
||||
REFERENCE_TEXT = (
|
||||
"Welcome to the demonstration. This is the narrator voice you'll hear throughout the video."
|
||||
)
|
||||
|
||||
|
||||
def _safe_load_json(path: Path) -> object | None:
|
||||
try:
|
||||
return json.loads(path.read_text())
|
||||
except (FileNotFoundError, json.JSONDecodeError):
|
||||
return None
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument(
|
||||
"--script",
|
||||
"--storyboard",
|
||||
required=True,
|
||||
help="Storyboard slug (matches Storyboard.name in src/storyboard.ts).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=Path,
|
||||
default=Path("output/narration-script.json"),
|
||||
help="Narration script emitted by dist/preflight.js.",
|
||||
default=Path("output"),
|
||||
help="Root output directory; per-storyboard files live in <root>/<storyboard>/.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--out-dir",
|
||||
"--design-model",
|
||||
default=os.environ.get("TTS_DESIGN_MODEL", DEFAULT_DESIGN_MODEL),
|
||||
help="Checkpoint used to mint the voice reference (VoiceDesign by default).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--clone-model",
|
||||
default=os.environ.get("TTS_CLONE_MODEL", DEFAULT_CLONE_MODEL),
|
||||
help="Checkpoint used to clone the cue audio from the reference (Base by default).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--reference-audio",
|
||||
type=Path,
|
||||
default=Path("output/audio"),
|
||||
help="Directory to write WAV files and index.json into.",
|
||||
default=(Path(os.environ["TTS_REFERENCE_AUDIO"]) if os.environ.get("TTS_REFERENCE_AUDIO") else None),
|
||||
help="Path to an existing reference WAV. If set, skip VoiceDesign and clone from this.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
default=os.environ.get("TTS_MODEL", DEFAULT_MODEL),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--speaker",
|
||||
default=os.environ.get("TTS_SPEAKER", DEFAULT_SPEAKER),
|
||||
help="CustomVoice preset speaker name (use --list-speakers to enumerate).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--language",
|
||||
default=os.environ.get("TTS_LANGUAGE", DEFAULT_LANGUAGE),
|
||||
"--reference-text",
|
||||
default=os.environ.get("TTS_REFERENCE_TEXT"),
|
||||
help="Transcript of --reference-audio. Required if --reference-audio is set.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--device",
|
||||
default=os.environ.get("TTS_DEVICE", "cuda:0"),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--list-speakers",
|
||||
action="store_true",
|
||||
help="Load the model, print available speaker names, and exit.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
|
|
@ -78,15 +114,18 @@ def load_model(model_id: str, device: str) -> Qwen3TTSModel:
|
|||
def cached_index_matches(
|
||||
index_path: Path,
|
||||
cues: list[dict],
|
||||
speaker: str,
|
||||
instruct: str,
|
||||
language: str,
|
||||
seed: int,
|
||||
temperature: float,
|
||||
top_p: float,
|
||||
) -> bool:
|
||||
"""Return True iff index_path's cue list lines up with `cues` 1:1.
|
||||
|
||||
Compared fields: ``cueIndex``, ``text``, ``gapBeforeMs`` plus the synth
|
||||
settings (``speaker``, ``language``). All cue WAV files must also exist
|
||||
on disk. Mismatched length, reordered cues, or a missing WAV invalidate
|
||||
the cache.
|
||||
settings (``instruct``, ``language``, ``seed``, ``temperature``, ``top_p``).
|
||||
All cue WAV files must also exist on disk. Mismatched length, reordered
|
||||
cues, or a missing WAV invalidate the cache.
|
||||
"""
|
||||
if not index_path.exists():
|
||||
return False
|
||||
|
|
@ -94,7 +133,13 @@ def cached_index_matches(
|
|||
cached = json.loads(index_path.read_text())
|
||||
except json.JSONDecodeError:
|
||||
return False
|
||||
if cached.get("speaker") != speaker or cached.get("language") != language:
|
||||
if cached.get("instruct") != instruct or cached.get("language") != language:
|
||||
return False
|
||||
if int(cached.get("seed", -1)) != seed:
|
||||
return False
|
||||
if float(cached.get("temperature", -1)) != temperature:
|
||||
return False
|
||||
if float(cached.get("topP", -1)) != top_p:
|
||||
return False
|
||||
cached_items = cached.get("items", [])
|
||||
if len(cached_items) != len(cues):
|
||||
|
|
@ -112,52 +157,179 @@ def cached_index_matches(
|
|||
return True
|
||||
|
||||
|
||||
def seed_everything(seed: int) -> None:
    """Seed every random number generator the synth step draws from.

    Seeds Python's ``random``, NumPy's global RNG, and torch (CPU), in that
    order, then all CUDA devices when CUDA is available.

    Args:
        seed: Integer seed applied to each generator.
    """
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)

    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.manual_seed_all(seed)
|
||||
|
||||
|
||||
def _resolve_reference(
|
||||
args: argparse.Namespace,
|
||||
audio_dir: Path,
|
||||
instruct: str,
|
||||
language: str,
|
||||
seed: int,
|
||||
temperature: float,
|
||||
top_p: float,
|
||||
) -> tuple[Path, str]:
|
||||
"""Return (ref_wav_path, ref_text) for the clone step.
|
||||
|
||||
If --reference-audio is supplied, validate and use it directly. Otherwise
|
||||
mint one via VoiceDesign (cached on disk; cache invalidates when the
|
||||
persona/sampling/seed changes). The design model is unloaded before
|
||||
returning so the clone model can claim the GPU.
|
||||
"""
|
||||
if args.reference_audio is not None:
|
||||
if not args.reference_audio.exists():
|
||||
raise SystemExit(f"[synth] --reference-audio does not exist: {args.reference_audio}")
|
||||
if not args.reference_text:
|
||||
raise SystemExit("[synth] --reference-text is required when --reference-audio is set")
|
||||
print(
|
||||
f"[synth] using user-supplied reference {args.reference_audio} «{args.reference_text}»",
|
||||
flush=True,
|
||||
)
|
||||
return args.reference_audio, args.reference_text
|
||||
|
||||
ref_wav_path = audio_dir / "_reference.wav"
|
||||
ref_meta_path = audio_dir / "_reference.meta.json"
|
||||
ref_meta = {
|
||||
"model": args.design_model,
|
||||
"instruct": instruct,
|
||||
"language": language,
|
||||
"seed": seed,
|
||||
"temperature": temperature,
|
||||
"topP": top_p,
|
||||
"text": REFERENCE_TEXT,
|
||||
}
|
||||
if (
|
||||
ref_wav_path.exists()
|
||||
and ref_meta_path.exists()
|
||||
and _safe_load_json(ref_meta_path) == ref_meta
|
||||
):
|
||||
print(f"[synth] reusing cached voice reference {ref_wav_path.name}", flush=True)
|
||||
return ref_wav_path, REFERENCE_TEXT
|
||||
|
||||
print(
|
||||
f"[synth] minting voice reference via VoiceDesign: «{REFERENCE_TEXT}»",
|
||||
flush=True,
|
||||
)
|
||||
design_model = load_model(args.design_model, args.device)
|
||||
seed_everything(seed)
|
||||
ref_wavs, ref_sr = design_model.generate_voice_design(
|
||||
text=[REFERENCE_TEXT],
|
||||
language=language,
|
||||
instruct=instruct,
|
||||
do_sample=True,
|
||||
temperature=temperature,
|
||||
top_p=top_p,
|
||||
)
|
||||
ref_audio = ref_wavs[0]
|
||||
if hasattr(ref_audio, "cpu"):
|
||||
ref_audio = ref_audio.cpu().float().numpy()
|
||||
sf.write(str(ref_wav_path), ref_audio, ref_sr)
|
||||
ref_meta_path.write_text(json.dumps(ref_meta, indent=2))
|
||||
|
||||
# Free the design model before loading the clone model — both are 1.7B,
|
||||
# we don't want them resident at the same time.
|
||||
del design_model
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
return ref_wav_path, REFERENCE_TEXT
|
||||
|
||||
|
||||
def main() -> int:
|
||||
args = parse_args()
|
||||
|
||||
if args.list_speakers:
|
||||
model = load_model(args.model, args.device)
|
||||
speakers = model.get_supported_speakers()
|
||||
print(json.dumps(speakers, indent=2, ensure_ascii=False))
|
||||
return 0
|
||||
storyboard_dir = args.output_dir / args.storyboard
|
||||
script_path = storyboard_dir / "narration-script.json"
|
||||
audio_dir = storyboard_dir / "audio"
|
||||
|
||||
if not args.script.exists():
|
||||
print(f"[synth] script not found: {args.script}", file=sys.stderr)
|
||||
if not script_path.exists():
|
||||
print(f"[synth] script not found: {script_path}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
script = json.loads(args.script.read_text())
|
||||
script = json.loads(script_path.read_text())
|
||||
cues = [c for c in script.get("items", []) if c.get("text", "").strip()]
|
||||
if not cues:
|
||||
print("[synth] script has no cues; nothing to generate.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
args.out_dir.mkdir(parents=True, exist_ok=True)
|
||||
voice = script.get("voice")
|
||||
if not voice:
|
||||
print(
|
||||
f"[synth] {script_path} has no `voice` block — re-run preflight.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 1
|
||||
instruct = voice["instruct"]
|
||||
language = voice["language"]
|
||||
temperature = float(voice.get("temperature", 0.6))
|
||||
top_p = float(voice.get("topP", 0.9))
|
||||
seed = int(voice.get("seed", 42))
|
||||
|
||||
audio_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Skip generation when the existing audio matches the script — same cue
|
||||
# texts and same gapBeforeMs values in the same order. Saves ~30s of GPU
|
||||
# time when iterating on activity timing without changing narration.
|
||||
if cached_index_matches(args.out_dir / "index.json", cues, args.speaker, args.language):
|
||||
# texts and same gapBeforeMs values in the same order, AND same synth
|
||||
# settings (instruct/seed/temperature/top_p). Saves ~30s of GPU time when
|
||||
# iterating on activity timing without changing narration or persona.
|
||||
if cached_index_matches(
|
||||
audio_dir / "index.json",
|
||||
cues,
|
||||
instruct,
|
||||
language,
|
||||
seed,
|
||||
temperature,
|
||||
top_p,
|
||||
):
|
||||
print(
|
||||
f"[synth] cached audio in {args.out_dir} matches the current script — skipping generation",
|
||||
f"[synth] [{args.storyboard}] cached audio matches the current script — skipping generation",
|
||||
flush=True,
|
||||
)
|
||||
return 0
|
||||
|
||||
model = load_model(args.model, args.device)
|
||||
|
||||
texts = [c["text"].strip() for c in cues]
|
||||
print(f"[synth] generating {len(texts)} cues in one batched call", flush=True)
|
||||
print(f"[synth] [{args.storyboard}] persona: {instruct}", flush=True)
|
||||
print(
|
||||
f"[synth] [{args.storyboard}] sampling: temperature={temperature} top_p={top_p} seed={seed} language={language}",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
# Two-stage generation:
|
||||
# 1. VoiceDesign mints a single reference clip in the target persona
|
||||
# (or the user supplies one via --reference-audio).
|
||||
# 2. Base + generate_voice_clone(x_vector_only_mode=True) conditions
|
||||
# every cue on the reference's speaker embedding.
|
||||
# Without (2), batched generation drifts timbre across cues — a persona
|
||||
# prompt anchors style but not identity, so each batch item picks its
|
||||
# own voice. The reference WAV is cached so subsequent runs only load
|
||||
# the clone model (saves ~20s + 3.4 GB of disk download).
|
||||
ref_wav_path, ref_text = _resolve_reference(
|
||||
args, audio_dir, instruct, language, seed, temperature, top_p
|
||||
)
|
||||
|
||||
print(
|
||||
f"[synth] cloning {len(texts)} cues from reference (x_vector_only) — one batched call",
|
||||
flush=True,
|
||||
)
|
||||
for i, t in enumerate(texts):
|
||||
print(f"[synth] {i:2d}: {t}", flush=True)
|
||||
|
||||
# ONE batched call. generate_custom_voice handles text=List[str] natively
|
||||
# and broadcasts the speaker/language across all items, so the entire
|
||||
# narration is decoded in one model pass — same RNG state, same batch,
|
||||
# consistent voice from cue to cue.
|
||||
wavs, sr = model.generate_custom_voice(
|
||||
clone_model = load_model(args.clone_model, args.device)
|
||||
seed_everything(seed)
|
||||
wavs, sr = clone_model.generate_voice_clone(
|
||||
text=texts,
|
||||
language=args.language,
|
||||
speaker=args.speaker,
|
||||
language=language,
|
||||
ref_audio=str(ref_wav_path),
|
||||
ref_text=ref_text,
|
||||
x_vector_only_mode=True,
|
||||
non_streaming_mode=True,
|
||||
do_sample=True,
|
||||
temperature=temperature,
|
||||
top_p=top_p,
|
||||
)
|
||||
if len(wavs) != len(texts):
|
||||
print(
|
||||
|
|
@ -171,7 +343,7 @@ def main() -> int:
|
|||
if hasattr(audio, "cpu"):
|
||||
audio = audio.cpu().float().numpy()
|
||||
wav_name = f"cue_{cue['cueIndex']:03d}.wav"
|
||||
wav_path = args.out_dir / wav_name
|
||||
wav_path = audio_dir / wav_name
|
||||
sf.write(str(wav_path), audio, sr)
|
||||
duration_ms = int(round(len(audio) * 1000 / sr))
|
||||
items.append(
|
||||
|
|
@ -190,15 +362,21 @@ def main() -> int:
|
|||
)
|
||||
|
||||
out_index = {
|
||||
"speaker": args.speaker,
|
||||
"language": args.language,
|
||||
"model": args.model,
|
||||
"storyboard": args.storyboard,
|
||||
"instruct": instruct,
|
||||
"language": language,
|
||||
"designModel": args.design_model,
|
||||
"cloneModel": args.clone_model,
|
||||
"referenceText": ref_text,
|
||||
"seed": seed,
|
||||
"temperature": temperature,
|
||||
"topP": top_p,
|
||||
"items": items,
|
||||
}
|
||||
(args.out_dir / "index.json").write_text(json.dumps(out_index, indent=2))
|
||||
(audio_dir / "index.json").write_text(json.dumps(out_index, indent=2))
|
||||
total_ms = sum(it["gapBeforeMs"] + it["durationMs"] for it in items)
|
||||
print(
|
||||
f"[synth] {len(items)} cues, {total_ms}ms of audio (incl. gaps) -> {args.out_dir}",
|
||||
f"[synth] [{args.storyboard}] {len(items)} cues, {total_ms}ms of audio (incl. gaps) -> {audio_dir}",
|
||||
flush=True,
|
||||
)
|
||||
return 0
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue