This commit is contained in:
Andras Schmelczer 2026-05-12 22:00:56 +01:00
parent 8708bf000d
commit 11711c57e6
38 changed files with 5361 additions and 265 deletions

View file

@ -2,10 +2,10 @@
#
# End-to-end re-render of the dashboard demo videos.
#
# All per-storyboard knobs (aspect, fps, bitrate, prompt text, voice persona,
# poster timestamp, brand strings…) live on the Storyboard objects in
# src/storyboard.ts. To add a vertical cut or change the voice, edit that
# file — this script only handles target/auth/transport concerns.
# All per-storyboard knobs (aspect, fps, bitrate, prompt text, localized
# narration, voice persona, poster timestamp, brand strings…) live in
# src/storyboard.ts. A single visual storyboard can expand into multiple
# language variants there; this script renders every emitted slug.
#
# Two targets:
# local (default) — assumes the docker-compose stack on host.docker.internal,

View file

@ -50,6 +50,9 @@ export async function launchRecordingBrowser(
deviceScaleFactor: storyboard.video.captureScale,
recordVideo: { dir: opts.recordDir, size: viewport },
});
await context.addInitScript((appLanguage) => {
if (appLanguage) localStorage.setItem('language', appLanguage);
}, storyboard.content.appLanguage ?? 'en');
await suppressDevServerNoise(context);
return { browser, context };
}

View file

@ -307,12 +307,19 @@ export async function showOutro(
document.getElementById('__demo-caption')?.classList.remove('visible');
const el = document.createElement('div');
el.id = '__demo-outro';
el.innerHTML = `
<div id="__demo-outro-card">
<div id="__demo-outro-brand">${brand}</div>
<div id="__demo-outro-tagline">${tagline}</div>
<div id="__demo-outro-url">${url}</div>
</div>`;
const card = document.createElement('div');
card.id = '__demo-outro-card';
const brandEl = document.createElement('div');
brandEl.id = '__demo-outro-brand';
brandEl.textContent = brand;
const taglineEl = document.createElement('div');
taglineEl.id = '__demo-outro-tagline';
taglineEl.textContent = tagline;
const urlEl = document.createElement('div');
urlEl.id = '__demo-outro-url';
urlEl.textContent = url;
card.append(brandEl, taglineEl, urlEl);
el.appendChild(card);
document.body.appendChild(el);
requestAnimationFrame(() => {
requestAnimationFrame(() => el.classList.add('visible'));

View file

@ -46,6 +46,7 @@ function emitScript(storyboard: Storyboard): string {
voice: {
instruct: storyboard.voice.instruct,
language: storyboard.voice.language,
referenceText: storyboard.voice.referenceText,
temperature: storyboard.voice.temperature ?? 0.6,
topP: storyboard.voice.topP ?? 0.9,
seed: storyboard.voice.seed ?? 42,
@ -68,6 +69,7 @@ function main(): void {
const index = {
storyboards: storyboards.map((sb) => ({
name: sb.name,
locale: sb.locale ?? sb.content.appLanguage,
aspect: sb.video.aspect,
outputFps: sb.video.outputFps,
minDurationS: sb.video.minDurationS,

View file

@ -25,6 +25,7 @@ export interface RunnerResult {
// Wheel delta for map zooming. NOTE(review): its use site is outside this
// view — presumably one negative wheel "notch" per zoom step; confirm.
const MAP_ZOOM_WHEEL_DELTA = -120;
// Narration time budgeted per word (or word-equivalent unit) when a cue's
// duration has to be estimated from its text.
const FALLBACK_MS_PER_WORD = 750;
// Fixed padding, in milliseconds, added once to every estimated cue duration.
const FALLBACK_TAIL_BUFFER_MS = 800;
// How many Han characters count as one word-equivalent in the estimate, since
// Chinese text is not whitespace-delimited.
const CJK_CHARS_PER_FALLBACK_WORD = 2;
interface SynthCue {
cueIndex: number;
@ -266,10 +267,15 @@ function loadSynthIndex(storyboard: Storyboard): SynthCue[] {
return storyboard.cues.map((cue, cueIndex) => ({
cueIndex,
text: cue.text,
durationMs:
cue.text.split(/\s+/).filter(Boolean).length * FALLBACK_MS_PER_WORD +
FALLBACK_TAIL_BUFFER_MS,
durationMs: estimateFallbackDurationMs(cue.text),
}));
}
/**
 * Estimates how long a cue takes to narrate, based purely on its text.
 *
 * Used for the fallback cue list built in `loadSynthIndex`, where each cue's
 * `durationMs` is derived from `cue.text` instead of measured audio.
 *
 * The text is measured in "units", each worth `FALLBACK_MS_PER_WORD`:
 * - every whitespace-separated token without Han characters is one unit;
 * - Han text contributes one unit per `CJK_CHARS_PER_FALLBACK_WORD`
 *   characters (rounded up), and never less than one unit per token that
 *   contains Han characters.
 *
 * The two contributions are summed so mixed-script cues (e.g. a Latin brand
 * name followed by a Chinese tagline) are not under-estimated. Pure-Latin and
 * unspaced pure-Han text yield the same result as taking the larger of the
 * word count and the Han unit count.
 *
 * @param text Narration text of a single cue.
 * @returns Estimated duration in milliseconds; at least one unit plus
 *   `FALLBACK_TAIL_BUFFER_MS`, even for empty text.
 */
function estimateFallbackDurationMs(text: string): number {
  const containsHan = /\p{Script=Han}/u;
  const tokens = text.split(/\s+/).filter(Boolean);
  const hanTokenCount = tokens.filter((token) => containsHan.test(token)).length;
  const spacedWordCount = tokens.length - hanTokenCount;
  const hanCharCount = text.match(/\p{Script=Han}/gu)?.length ?? 0;
  // A token holding Han characters is worth at least one unit, so sparse,
  // space-separated Han text never scores below its plain token count.
  const hanUnits = Math.max(
    hanTokenCount,
    Math.ceil(hanCharCount / CJK_CHARS_PER_FALLBACK_WORD),
  );
  const units = Math.max(spacedWordCount + hanUnits, 1);
  return units * FALLBACK_MS_PER_WORD + FALLBACK_TAIL_BUFFER_MS;
}
export type { Page };

View file

@ -124,6 +124,8 @@ export interface VoiceConfig {
instruct: string;
/** Qwen3-TTS language string, e.g. "English". */
language: string;
/** Reference utterance used when minting a generated voice for this language. */
referenceText?: string;
/** Sampling temperature (default 0.6). */
temperature?: number;
/** Top-p nucleus sampling (default 0.9). */
@ -147,6 +149,8 @@ export interface BrandConfig {
export interface ContentConfig {
/** Prompt text typed into the AI box during the cold open. */
promptText: string;
/** Frontend i18n language code to set before loading the dashboard. */
appLanguage?: string;
/** Cold-open zoom multiplier on the AI card. */
aiZoomScale: number;
initialMapView: { lat: number; lon: number; zoom: number };
@ -177,6 +181,8 @@ export interface TravelTimeFilter {
*/
export interface Storyboard {
name: string;
/** Optional language/variant code, used for manifests and logging. */
locale?: string;
video: VideoConfig;
voice: VoiceConfig;
content: ContentConfig;

View file

@ -6,8 +6,9 @@ import { el, type Storyboard } from './script.js';
* Each entry is a fully self-contained Storyboard: video knobs (aspect,
* bitrate, fps), voice persona (Qwen3-TTS instruct + language + sampling),
* stubbed AI response, brand strings, AND the cue list. There is no shared
* global state to ship a vertical cut, a different prompt, or a different
* voice, push another item onto this array.
* global state. The exported array can contain generated variants, so a
* shared visual storyboard can render once per language without repeating
* its activity sequence.
*
* `name` doubles as the on-disk slug. The pipeline writes per-storyboard
* artefacts to `output/<name>/` and publishes `<name>.mp4` / `<name>.jpg`
@ -23,15 +24,6 @@ import { el, type Storyboard } from './script.js';
* before the next cue's gap).
*/
const PROMPT_TEXT = 'Flats <£300k, 35 min to commute Manchester close to an outstanding school in a quite low crime area';
const BRAND = {
name: 'Perfect Postcode',
tagline: 'Your best chance to find your next perfect home.',
url: 'https://perfect-postcode.co.uk',
};
const AI_ZOOM_SCALE = 2.4;
const TT_CARD_SELECTOR = '[data-filter-name="tt_0"]';
@ -39,103 +31,238 @@ const TT_SLIDER_MAX = 120;
const TT_DRAG_FROM_MIN = 35;
const TT_DRAG_TO_MIN = 20;
const BRITISH_MALE_NARRATOR =
'Calm and cheerful young British male narrator from the North of England with a ' +
'strong Manchester accent.';
// Locale codes for which a recording storyboard variant is defined; used as
// the key type of RECORDING_LOCALIZATIONS.
type RecordingLocale = 'en' | 'de' | 'zh' | 'hi';
const DEFAULT_CUES: Storyboard['cues'] = [
{
text: 'Start by describing the type of place you\'re looking for',
gapBeforeMs: 0,
tail: [
{
kind: 'type',
selector: '[data-tutorial="ai-filters"] textarea',
text: PROMPT_TEXT,
durationMs: 3000,
},
{ kind: 'submitForm', formSelector: '[data-tutorial="ai-filters"] form', durationMs: 1700 },
],
},
{
text: 'The dashboard will show you the likeliest places that will meet your expectations',
gapBeforeMs: 400,
during: [{ kind: 'zoomReset', durationMs: 1400 }],
tail: [{ kind: 'wait', durationMs: 500 }],
},
/**
 * All language-dependent strings for one variant of the recording storyboard.
 * The activity sequence is shared; only the values below differ per locale.
 */
interface RecordingLocalization {
/** Storyboard `name` for this variant (the storyboard name doubles as the on-disk slug). */
name: string;
/** Copied to `content.appLanguage`: the frontend i18n language code. */
appLanguage: string;
/** Copied to `voice.language`: the TTS language string, e.g. "English". */
ttsLanguage: string;
/** Copied to `voice.instruct`: the narrator persona description. */
voiceInstruct: string;
/** Copied to `voice.referenceText`: the utterance used when minting the voice reference. */
voiceReferenceText: string;
/** Prompt typed into the AI filter textarea and stored as `content.promptText`. */
promptText: string;
/** Label of the stubbed transit travel-time filter. */
travelTimeLabel: string;
/** `title` attribute used to locate the export button, so it must match the UI in `appLanguage`. */
exportButtonTitle: string;
/** Brand strings stored on `content.brand` and shown by the outro cue. */
brand: {
name: string;
tagline: string;
url: string;
};
/** Narration text for each cue, in playback order (the outro cue is built from `brand`). */
cues: {
describe: string;
dashboard: string;
filters: string;
details: string;
shortlist: string;
};
}
{
text: `Adjust the filters to narrow down to the best candidates`,
gapBeforeMs: 500,
during: [
{
kind: 'dragSlider',
thumbSelector: `${TT_CARD_SELECTOR} [role="slider"] >> nth=1`,
trackSelector: `${TT_CARD_SELECTOR} [data-orientation="horizontal"] >> nth=0`,
toFraction: TT_DRAG_TO_MIN / TT_SLIDER_MAX,
durationMs: 1000,
},
],
tail: [{ kind: 'wait', durationMs: 400 }],
},
// Site URL shared by the `brand.url` of every locale entry.
const BRAND_URL = 'https://perfect-postcode.co.uk';
{
text: 'And now it\'s time to dig into the details. Looks good to me!',
gapBeforeMs: 500,
during: [
{ kind: 'cursorScale', scale: 1.4, durationMs: 200 },
{
kind: 'mapZoom',
target: { kind: 'point', x: 1140, y: 605 },
steps: 18,
durationMs: 1500,
},
],
tail: [
// Wait for the post-zoom /api/postcodes response and a redraw
// before the click — otherwise the click can fire on a stale
// frame and miss the polygon.
{ kind: 'wait', durationMs: 500 },
{
kind: 'click',
target: { kind: 'point', x: 1140, y: 605 },
durationMs: 700,
},
{ kind: 'cursorScale', scale: 1, durationMs: 280 },
// Linger so the climax cue lands on the right-pane reveal.
{ kind: 'wait', durationMs: 1500 },
],
/**
 * Per-locale copy for the recording storyboard: slug, app/TTS language,
 * narrator persona, prompt, UI labels, brand strings and cue narration.
 *
 * `exportButtonTitle` is used to build a `button[title="…"]` selector, so it
 * has to match the title rendered by the app in that locale's `appLanguage`.
 */
const RECORDING_LOCALIZATIONS: Record<RecordingLocale, RecordingLocalization> = {
// English: the base variant; its slug has no locale suffix.
en: {
name: 'recording',
appLanguage: 'en',
ttsLanguage: 'English',
voiceInstruct:
'Calm and cheerful young British male narrator from the North of England with a ' +
'strong Manchester accent.',
voiceReferenceText:
"Welcome to the demonstration. This is the narrator voice you'll hear throughout the video.",
promptText:
'Flats <£300k, 35 min to commute Manchester close to an outstanding school in a quite low crime area',
travelTimeLabel: 'Manchester city centre',
exportButtonTitle: 'Export to Excel',
brand: {
name: 'Perfect Postcode',
tagline: 'Your best chance to find your next perfect home.',
url: BRAND_URL,
},
cues: {
describe: "Start by describing the type of place you're looking for",
dashboard: 'The dashboard will show you the likeliest places that will meet your expectations',
filters: 'Adjust the filters to narrow down to the best candidates',
details: "And now it's time to dig into the details. Looks good to me!",
shortlist:
'Now you can take your shortlist and start looking for your next home in your perfect postcode.',
},
},
// German: UI, prompt, narration and tagline are all localized.
de: {
name: 'recording-de',
appLanguage: 'de',
ttsLanguage: 'German',
voiceInstruct:
'Calm and cheerful German male narrator with clear standard German pronunciation ' +
'and a friendly, practical delivery.',
voiceReferenceText:
'Willkommen zur Demonstration. Diese Sprecherstimme hören Sie im gesamten Video.',
promptText:
'Wohnungen unter £300k, 35 Min. Pendelzeit nach Manchester, nahe einer herausragenden Schule in einer sehr kriminalitätsarmen Gegend',
travelTimeLabel: 'Stadtzentrum Manchester',
exportButtonTitle: 'Als Excel exportieren',
brand: {
name: 'Perfect Postcode',
tagline: 'Ihre beste Chance, Ihr nächstes perfektes Zuhause zu finden.',
url: BRAND_URL,
},
cues: {
describe: 'Beschreiben Sie zuerst, wonach Sie suchen.',
dashboard: 'Das Dashboard zeigt die Orte, die Ihre Erwartungen am ehesten erfüllen.',
filters: 'Passen Sie die Filter an, um die besten Kandidaten einzugrenzen.',
details: 'Jetzt geht es in die Details. Sieht gut aus!',
shortlist:
'Jetzt können Sie Ihre Auswahl nehmen und Ihr nächstes Zuhause in Ihrem perfekten Postcode suchen.',
},
},
// Chinese: UI, prompt, narration and tagline are all localized.
zh: {
name: 'recording-zh',
appLanguage: 'zh',
ttsLanguage: 'Chinese',
voiceInstruct:
'Calm and cheerful Mandarin Chinese male narrator with clear standard Mandarin ' +
'pronunciation and a friendly, practical delivery.',
voiceReferenceText: '欢迎观看演示。整段视频都会使用这位旁白的声音。',
promptText: '30万英镑以内的公寓35分钟通勤到曼彻斯特靠近优秀学校犯罪率很低的区域',
travelTimeLabel: '曼彻斯特市中心',
exportButtonTitle: '导出为 Excel',
brand: {
name: 'Perfect Postcode',
tagline: '帮你更有把握找到下一个理想家。',
url: BRAND_URL,
},
cues: {
describe: '先描述你想找什么样的地方',
dashboard: '仪表板会显示最符合你期望的地点',
filters: '调整筛选条件,缩小到最合适的候选区域',
details: '现在深入查看细节。看起来不错!',
shortlist: '现在你可以带着候选清单,开始寻找理想邮编里的下一个家。',
},
},
// NOTE(review): `hi` sets the app UI to Hindi (hence the Hindi export button
// title) but keeps the prompt, narration, tagline and TTS language in English
// with an Indian-accent persona — presumably intentional; confirm.
hi: {
name: 'recording-hi',
appLanguage: 'hi',
ttsLanguage: 'English',
voiceInstruct:
'Calm and cheerful Indian male narrator speaking English with a strong Indian accent ' +
'and a friendly, practical delivery.',
voiceReferenceText:
"Welcome to the demonstration. This is the narrator voice you'll hear throughout the video.",
promptText:
'Flats <£300k, 35 min to commute Manchester close to an outstanding school in a quite low crime area',
travelTimeLabel: 'Manchester city centre',
exportButtonTitle: 'Excel में निर्यात करें',
brand: {
name: 'Perfect Postcode',
tagline: 'Your best chance to find your next perfect home.',
url: BRAND_URL,
},
cues: {
describe: "Start by describing the type of place you're looking for",
dashboard: 'The dashboard will show you the likeliest places that will meet your expectations',
filters: 'Adjust the filters to narrow down to the best candidates',
details: "And now it's time to dig into the details. Looks good to me!",
shortlist:
'Now you can take your shortlist and start looking for your next home in your perfect postcode.',
},
},
};
{
text: 'Now you can take your shortlist and start looking for your next home in your perfect postcode.',
gapBeforeMs: 500,
during: [
{ kind: 'zoomReset', durationMs: 900 },
{
kind: 'click',
target: el('button[title="Export to Excel"]'),
durationMs: 800,
},
],
tail: [{ kind: 'wait', durationMs: 800 }],
},
function createCues(locale: RecordingLocale): Storyboard['cues'] {
const copy = RECORDING_LOCALIZATIONS[locale];
{
text: `${BRAND.name}. ${BRAND.tagline}`,
gapBeforeMs: 600,
during: [
{
kind: 'showOutro',
brand: BRAND.name,
tagline: BRAND.tagline,
url: BRAND.url,
durationMs: 0,
},
],
tail: [{ kind: 'wait', durationMs: 1500 }],
},
];
return [
{
text: copy.cues.describe,
gapBeforeMs: 0,
tail: [
{
kind: 'type',
selector: '[data-tutorial="ai-filters"] textarea',
text: copy.promptText,
durationMs: 3000,
},
{ kind: 'submitForm', formSelector: '[data-tutorial="ai-filters"] form', durationMs: 1700 },
],
},
{
text: copy.cues.dashboard,
gapBeforeMs: 400,
during: [{ kind: 'zoomReset', durationMs: 1400 }],
tail: [{ kind: 'wait', durationMs: 500 }],
},
{
text: copy.cues.filters,
gapBeforeMs: 500,
during: [
{
kind: 'dragSlider',
thumbSelector: `${TT_CARD_SELECTOR} [role="slider"] >> nth=1`,
trackSelector: `${TT_CARD_SELECTOR} [data-orientation="horizontal"] >> nth=0`,
toFraction: TT_DRAG_TO_MIN / TT_SLIDER_MAX,
durationMs: 1000,
},
],
tail: [{ kind: 'wait', durationMs: 400 }],
},
{
text: copy.cues.details,
gapBeforeMs: 500,
during: [
{ kind: 'cursorScale', scale: 1.4, durationMs: 200 },
{
kind: 'mapZoom',
target: { kind: 'point', x: 1140, y: 605 },
steps: 18,
durationMs: 1500,
},
],
tail: [
// Wait for the post-zoom /api/postcodes response and a redraw
// before the click — otherwise the click can fire on a stale
// frame and miss the polygon.
{ kind: 'wait', durationMs: 500 },
{
kind: 'click',
target: { kind: 'point', x: 1140, y: 605 },
durationMs: 700,
},
{ kind: 'cursorScale', scale: 1, durationMs: 280 },
// Linger so the climax cue lands on the right-pane reveal.
{ kind: 'wait', durationMs: 1500 },
],
},
{
text: copy.cues.shortlist,
gapBeforeMs: 500,
during: [
{ kind: 'zoomReset', durationMs: 900 },
{
kind: 'click',
target: el(`button[title="${copy.exportButtonTitle}"]`),
durationMs: 800,
},
],
tail: [{ kind: 'wait', durationMs: 800 }],
},
{
text: `${copy.brand.name}. ${copy.brand.tagline}`,
gapBeforeMs: 600,
during: [
{
kind: 'showOutro',
brand: copy.brand.name,
tagline: copy.brand.tagline,
url: copy.brand.url,
durationMs: 0,
},
],
tail: [{ kind: 'wait', durationMs: 1500 }],
},
];
}
const DEFAULT_PRE: Storyboard['pre'] = [
{ kind: 'clearVignette', durationMs: 0 },
@ -149,9 +276,12 @@ const DEFAULT_PRE: Storyboard['pre'] = [
{ kind: 'wait', durationMs: 140 },
];
export const storyboards: Storyboard[] = [
{
name: 'recording',
function createRecordingStoryboard(locale: RecordingLocale): Storyboard {
const copy = RECORDING_LOCALIZATIONS[locale];
return {
name: copy.name,
locale,
video: {
aspect: '16x9',
captureScale: 1,
@ -168,23 +298,25 @@ export const storyboards: Storyboard[] = [
posterTimeS: 16,
},
voice: {
instruct: BRITISH_MALE_NARRATOR,
language: 'English',
instruct: copy.voiceInstruct,
language: copy.ttsLanguage,
referenceText: copy.voiceReferenceText,
temperature: 0.6,
topP: 0.9,
seed: 42,
},
content: {
promptText: PROMPT_TEXT,
promptText: copy.promptText,
appLanguage: copy.appLanguage,
aiZoomScale: AI_ZOOM_SCALE,
initialMapView: { lat: 53.4795, lon: -2.2451, zoom: 11.5 },
// Filters returned by the AI stub. Keys MUST match real feature names
// from /api/features (verified against the running server's schema).
stubbedFilters: {
'Property type': ['Flats/Maisonettes', 'Terraced'],
'Estimated current price': [175000, 450000],
'Property type': ['Flats/Maisonettes'],
'Estimated current price': [0, 300000],
'Serious crime per 1k residents (avg/yr)': [0, 55],
'Noise (dB)': [50, 68],
'Outstanding primary schools within 2km': [1, 10],
},
// Travel-time filters returned by the AI stub. Slug matches the real
// /api/travel-destinations?mode=transit response.
@ -192,7 +324,7 @@ export const storyboards: Storyboard[] = [
{
mode: 'transit',
slug: 'manchester',
label: 'Manchester city centre',
label: copy.travelTimeLabel,
max: TT_DRAG_FROM_MIN,
},
],
@ -200,12 +332,16 @@ export const storyboards: Storyboard[] = [
travelTimeSliderMax: TT_SLIDER_MAX,
travelTimeDragFromMin: TT_DRAG_FROM_MIN,
travelTimeDragToMin: TT_DRAG_TO_MIN,
brand: BRAND,
brand: copy.brand,
},
pre: DEFAULT_PRE,
cues: DEFAULT_CUES,
},
];
cues: createCues(locale),
};
}
/**
 * Every storyboard the pipeline knows about: the shared recording storyboard
 * expanded once per locale, in the order en, de, zh, hi.
 */
export const storyboards: Storyboard[] = Array.from(
  ['en', 'de', 'zh', 'hi'] as const,
  (localeCode) => createRecordingStoryboard(localeCode),
);
export function getStoryboard(name: string): Storyboard {
const sb = storyboards.find((s) => s.name === name);

View file

@ -116,6 +116,10 @@ def cached_index_matches(
cues: list[dict],
instruct: str,
language: str,
reference_text: str,
design_model: str,
clone_model: str,
reference_audio: str,
seed: int,
temperature: float,
top_p: float,
@ -123,7 +127,8 @@ def cached_index_matches(
"""Return True iff index_path's cue list lines up with `cues` 1:1.
Compared fields: ``cueIndex``, ``text``, ``gapBeforeMs`` plus the synth
settings (``instruct``, ``language``, ``seed``, ``temperature``, ``top_p``).
settings (``instruct``, ``language``, reference text, models, ``seed``,
``temperature``, ``top_p``).
All cue WAV files must also exist on disk. Mismatched length, reordered
cues, or a missing WAV invalidate the cache.
"""
@ -135,6 +140,12 @@ def cached_index_matches(
return False
if cached.get("instruct") != instruct or cached.get("language") != language:
return False
if cached.get("referenceText") != reference_text:
return False
if cached.get("designModel") != design_model or cached.get("cloneModel") != clone_model:
return False
if cached.get("referenceAudio", "") != reference_audio:
return False
if int(cached.get("seed", -1)) != seed:
return False
if float(cached.get("temperature", -1)) != temperature:
@ -170,6 +181,7 @@ def _resolve_reference(
audio_dir: Path,
instruct: str,
language: str,
reference_text: str,
seed: int,
temperature: float,
top_p: float,
@ -178,8 +190,8 @@ def _resolve_reference(
If --reference-audio is supplied, validate and use it directly. Otherwise
mint one via VoiceDesign (cached on disk; cache invalidates when the
persona/sampling/seed changes). The design model is unloaded before
returning so the clone model can claim the GPU.
persona/language/reference/sampling/seed changes). The design model is
unloaded before returning so the clone model can claim the GPU.
"""
if args.reference_audio is not None:
if not args.reference_audio.exists():
@ -201,7 +213,7 @@ def _resolve_reference(
"seed": seed,
"temperature": temperature,
"topP": top_p,
"text": REFERENCE_TEXT,
"text": reference_text,
}
if (
ref_wav_path.exists()
@ -209,16 +221,16 @@ def _resolve_reference(
and _safe_load_json(ref_meta_path) == ref_meta
):
print(f"[synth] reusing cached voice reference {ref_wav_path.name}", flush=True)
return ref_wav_path, REFERENCE_TEXT
return ref_wav_path, reference_text
print(
f"[synth] minting voice reference via VoiceDesign: «{REFERENCE_TEXT}»",
f"[synth] minting voice reference via VoiceDesign: «{reference_text}»",
flush=True,
)
design_model = load_model(args.design_model, args.device)
seed_everything(seed)
ref_wavs, ref_sr = design_model.generate_voice_design(
text=[REFERENCE_TEXT],
text=[reference_text],
language=language,
instruct=instruct,
do_sample=True,
@ -237,7 +249,7 @@ def _resolve_reference(
if torch.cuda.is_available():
torch.cuda.empty_cache()
return ref_wav_path, REFERENCE_TEXT
return ref_wav_path, reference_text
def main() -> int:
@ -266,21 +278,30 @@ def main() -> int:
return 1
instruct = voice["instruct"]
language = voice["language"]
reference_text = str(voice.get("referenceText") or REFERENCE_TEXT)
temperature = float(voice.get("temperature", 0.6))
top_p = float(voice.get("topP", 0.9))
seed = int(voice.get("seed", 42))
reference_audio_cache_key = (
str(args.reference_audio.resolve()) if args.reference_audio is not None else ""
)
audio_dir.mkdir(parents=True, exist_ok=True)
# Skip generation when the existing audio matches the script — same cue
# texts and same gapBeforeMs values in the same order, AND same synth
# settings (instruct/seed/temperature/top_p). Saves ~30s of GPU time when
# iterating on activity timing without changing narration or persona.
# settings (instruct/language/reference/model/seed/temperature/top_p).
# Saves ~30s of GPU time when iterating on activity timing without
# changing narration or persona.
if cached_index_matches(
audio_dir / "index.json",
cues,
instruct,
language,
reference_text,
args.design_model,
args.clone_model,
reference_audio_cache_key,
seed,
temperature,
top_p,
@ -308,7 +329,7 @@ def main() -> int:
# own voice. The reference WAV is cached so subsequent runs only load
# the clone model (saves ~20s + 3.4 GB of disk download).
ref_wav_path, ref_text = _resolve_reference(
args, audio_dir, instruct, language, seed, temperature, top_p
args, audio_dir, instruct, language, reference_text, seed, temperature, top_p
)
print(
@ -367,6 +388,7 @@ def main() -> int:
"language": language,
"designModel": args.design_model,
"cloneModel": args.clone_model,
"referenceAudio": reference_audio_cache_key,
"referenceText": ref_text,
"seed": seed,
"temperature": temperature,