LGTM

2026-05-11 21:38:26 +01:00 · 2026-05-11 21:38:26 +01:00 · f2a2651b8a
commit f2a2651b8a
parent 9248e26af2
95 changed files with 3993 additions and 1471 deletions
--- a/video/tts/mux.py
+++ b/video/tts/mux.py
@ -1,19 +1,19 @@
-"""Mux per-cue WAVs into recording.mp4 at their narration offsets.
+"""Mux per-cue WAVs into one storyboard's recording.mp4 at narration offsets.

-Reads two manifests:
+Reads two manifests inside ``output/<storyboard>/``:

-* ``output/audio/index.json`` (synth output) — per-cue WAV filename + measured
+* ``audio/index.json`` (synth output) — per-cue WAV filename + measured
  duration. Generated BEFORE recording in one batched Qwen3-TTS call.
-* ``output/narration.json`` (recorder output) — per-cue ``videoTimeMs`` against
+* ``narration.json`` (recorder output) — per-cue ``videoTimeMs`` against
  the trimmed video. Generated DURING recording.

 Joins them by ``cueIndex`` (index in the cue list, 1:1 between manifests),
 runs ffmpeg with one ``adelay`` per cue plus a single ``amix``, copies the
-video stream, and writes ``output/recording.narrated.mp4``.
+video stream, and writes ``output/<storyboard>/recording.narrated.mp4``.

 Run from the ``video/`` directory after recording:

-    uv run --project tts python tts/mux.py
+    uv run --project tts python tts/mux.py --storyboard recording
 """

 from __future__ import annotations
@ -28,23 +28,21 @@ from pathlib import Path

 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--audio-dir", type=Path, default=Path("output/audio"))
    parser.add_argument(
-        "--narration",
-        type=Path,
-        default=Path("output/narration.json"),
-        help="Per-cue videoTimeMs manifest written by the recorder.",
+        "--storyboard",
+        required=True,
+        help="Storyboard slug (matches Storyboard.name in src/storyboard.ts).",
    )
-    parser.add_argument("--video", type=Path, default=Path("output/recording.mp4"))
    parser.add_argument(
-        "--out",
+        "--output-dir",
        type=Path,
-        default=Path("output/recording.narrated.mp4"),
+        default=Path("output"),
+        help="Root output directory; per-storyboard files live in <root>/<storyboard>/.",
    )
    parser.add_argument(
        "--replace",
        action="store_true",
-        help="After muxing, atomically replace --video with --out.",
+        help="After muxing, atomically replace the storyboard's recording.mp4.",
    )
    return parser.parse_args()

@ -56,7 +54,13 @@ def main() -> int:
        print("[mux] ffmpeg not on PATH", file=sys.stderr)
        return 1

-    audio_index_path = args.audio_dir / "index.json"
+    storyboard_dir = args.output_dir / args.storyboard
+    audio_dir = storyboard_dir / "audio"
+    narration_path = storyboard_dir / "narration.json"
+    video_path = storyboard_dir / "recording.mp4"
+    out_path = storyboard_dir / "recording.narrated.mp4"
+
+    audio_index_path = audio_dir / "index.json"
    if not audio_index_path.exists():
        print(
            f"[mux] {audio_index_path} not found; run tts/synth.py first",
@ -64,25 +68,25 @@ def main() -> int:
        )
        return 1

-    if not args.narration.exists():
+    if not narration_path.exists():
        print(
-            f"[mux] {args.narration} not found; the recorder must run before mux",
+            f"[mux] {narration_path} not found; the recorder must run before mux",
            file=sys.stderr,
        )
        return 1

-    if not args.video.exists():
-        print(f"[mux] video not found: {args.video}", file=sys.stderr)
+    if not video_path.exists():
+        print(f"[mux] video not found: {video_path}", file=sys.stderr)
        return 1

    audio_index = json.loads(audio_index_path.read_text())
    audio_items = [it for it in audio_index.get("items", []) if it.get("wav")]
    if not audio_items:
        print("[mux] synth produced no cues; copying video unchanged", file=sys.stderr)
-        shutil.copyfile(args.video, args.out)
+        shutil.copyfile(video_path, out_path)
        return 0

-    narration = json.loads(args.narration.read_text())
+    narration = json.loads(narration_path.read_text())
    nar_cues = list(narration.get("cues", []))
    if len(nar_cues) != len(audio_items):
        print(
@ -130,9 +134,9 @@ def main() -> int:
            + "\n  - ".join(overlaps)
        )

-    cmd: list[str] = ["ffmpeg", "-y", "-loglevel", "warning", "-i", str(args.video)]
+    cmd: list[str] = ["ffmpeg", "-y", "-loglevel", "warning", "-i", str(video_path)]
    for it in items:
-        cmd += ["-i", str(args.audio_dir / it["wav"])]
+        cmd += ["-i", str(audio_dir / it["wav"])]

    filter_parts: list[str] = []
    mix_inputs: list[str] = []
@ -168,18 +172,21 @@ def main() -> int:
        "-shortest",
        "-movflags",
        "+faststart",
-        str(args.out),
+        str(out_path),
    ]

-    print(f"[mux] muxing {len(items)} narration cues into {args.out}", flush=True)
+    print(
+        f"[mux] [{args.storyboard}] muxing {len(items)} narration cues into {out_path}",
+        flush=True,
+    )
    result = subprocess.run(cmd)
    if result.returncode != 0:
        print(f"[mux] ffmpeg exited {result.returncode}", file=sys.stderr)
        return result.returncode

    if args.replace:
-        args.out.replace(args.video)
-        print(f"[mux] replaced {args.video} with narrated copy", flush=True)
+        out_path.replace(video_path)
+        print(f"[mux] replaced {video_path} with narrated copy", flush=True)

    return 0

--- a/video/tts/synth.py
+++ b/video/tts/synth.py
@ -1,15 +1,28 @@
-"""Synthesize the full narration in ONE batched Qwen3-TTS call.
+"""Synthesize one storyboard's narration in ONE batched Qwen3-TTS call.

-Reads ``output/narration-script.json`` (emitted by ``dist/preflight.js``) and
-runs ``Qwen3TTSModel.generate_custom_voice`` with all cue texts as a single
-batched list — that way every cue shares the same model state, which keeps
-prosody and timbre consistent across cues. Per-cue WAVs and an index manifest
-go to ``output/audio/`` for the recording step (which reads measured cue
-durations) and the mux step (which drops each WAV at its videoTime).
+Reads ``output/<storyboard>/narration-script.json`` (emitted by
+``dist/preflight.js``), mints one voice reference clip with
+``Qwen3TTSModel.generate_voice_design`` (or takes ``--reference-audio``), then
+runs ``generate_voice_clone`` with all cue texts as a single batched list, so
+every cue is conditioned on the same speaker embedding. Per-cue WAVs and an
+index manifest go to ``output/<storyboard>/audio/`` for the recording step
+(reads measured cue durations) and the mux step (drops each WAV at its videoTime).
+
+Voice persona, language, and sampling come from the storyboard via the
+``voice`` block of the narration script; no CLI flag overrides them, so
+storyboards remain the source of truth. CLI flags (and ``TTS_*`` env vars)
+only select the checkpoints, the device, and an optional reference clip.
+
+We use the VoiceDesign sibling of CustomVoice because it accepts a free-form
+voice persona (British accent, narrator register, "no laughter") via the
+``instruct`` parameter. CustomVoice's preset speakers are all American or
+non-English, and its ``instruct`` is documented for emotion only — it
+ignored accent directives and bled non-speech tokens (laughter, sighs)
+between cues.

 Run from the ``video/`` directory:

-    uv run --project tts python tts/synth.py
+    uv run --project tts python tts/synth.py --storyboard recording
 """

 from __future__ import annotations
@ -17,55 +30,78 @@ from __future__ import annotations
 import argparse
 import json
 import os
+import random
 import sys
 from pathlib import Path

+import numpy as np
 import soundfile as sf
 import torch
 from qwen_tts import Qwen3TTSModel


-DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
-DEFAULT_SPEAKER = "ryan"
-DEFAULT_LANGUAGE = "English"
+# Two checkpoints: the design model mints the reference clip in the desired
+# persona; the clone model conditions every cue on that reference's x-vector.
+# Neither CustomVoice nor VoiceDesign supports generate_voice_clone — only the
+# Base checkpoint does.
+DEFAULT_DESIGN_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign"
+DEFAULT_CLONE_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
+
+# Fixed reference utterance used to anchor the speaker timbre. The reference
+# is generated once per (model, instruct, language, sampling, seed) tuple and
+# reused for every cue, so all narration shares the same x-vector. Two short
+# sentences exercise enough phonemes for a stable embedding without bloating
+# generation time.
+REFERENCE_TEXT = (
+    "Welcome to the demonstration. This is the narrator voice you'll hear throughout the video."
+)
+
+
+def _safe_load_json(path: Path) -> object | None:
+    try:
+        return json.loads(path.read_text())
+    except (FileNotFoundError, json.JSONDecodeError):
+        return None


 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
-        "--script",
+        "--storyboard",
+        required=True,
+        help="Storyboard slug (matches Storyboard.name in src/storyboard.ts).",
+    )
+    parser.add_argument(
+        "--output-dir",
        type=Path,
-        default=Path("output/narration-script.json"),
-        help="Narration script emitted by dist/preflight.js.",
+        default=Path("output"),
+        help="Root output directory; per-storyboard files live in <root>/<storyboard>/.",
    )
    parser.add_argument(
-        "--out-dir",
+        "--design-model",
+        default=os.environ.get("TTS_DESIGN_MODEL", DEFAULT_DESIGN_MODEL),
+        help="Checkpoint used to mint the voice reference (VoiceDesign by default).",
+    )
+    parser.add_argument(
+        "--clone-model",
+        default=os.environ.get("TTS_CLONE_MODEL", DEFAULT_CLONE_MODEL),
+        help="Checkpoint used to clone the cue audio from the reference (Base by default).",
+    )
+    parser.add_argument(
+        "--reference-audio",
        type=Path,
-        default=Path("output/audio"),
-        help="Directory to write WAV files and index.json into.",
+        default=(Path(os.environ["TTS_REFERENCE_AUDIO"]) if os.environ.get("TTS_REFERENCE_AUDIO") else None),
+        help="Path to an existing reference WAV. If set, skip VoiceDesign and clone from this.",
    )
    parser.add_argument(
-        "--model",
-        default=os.environ.get("TTS_MODEL", DEFAULT_MODEL),
-    )
-    parser.add_argument(
-        "--speaker",
-        default=os.environ.get("TTS_SPEAKER", DEFAULT_SPEAKER),
-        help="CustomVoice preset speaker name (use --list-speakers to enumerate).",
-    )
-    parser.add_argument(
-        "--language",
-        default=os.environ.get("TTS_LANGUAGE", DEFAULT_LANGUAGE),
+        "--reference-text",
+        default=os.environ.get("TTS_REFERENCE_TEXT"),
+        help="Transcript of --reference-audio. Required if --reference-audio is set.",
    )
    parser.add_argument(
        "--device",
        default=os.environ.get("TTS_DEVICE", "cuda:0"),
    )
-    parser.add_argument(
-        "--list-speakers",
-        action="store_true",
-        help="Load the model, print available speaker names, and exit.",
-    )
    return parser.parse_args()


@ -78,15 +114,18 @@ def load_model(model_id: str, device: str) -> Qwen3TTSModel:
 def cached_index_matches(
    index_path: Path,
    cues: list[dict],
-    speaker: str,
+    instruct: str,
    language: str,
+    seed: int,
+    temperature: float,
+    top_p: float,
 ) -> bool:
    """Return True iff index_path's cue list lines up with `cues` 1:1.

    Compared fields: ``cueIndex``, ``text``, ``gapBeforeMs`` plus the synth
-    settings (``speaker``, ``language``). All cue WAV files must also exist
-    on disk. Mismatched length, reordered cues, or a missing WAV invalidate
-    the cache.
+    settings (``instruct``, ``language``, ``seed``, ``temperature``, ``top_p``).
+    All cue WAV files must also exist on disk. Mismatched length, reordered
+    cues, or a missing WAV invalidate the cache.
    """
    if not index_path.exists():
        return False
@ -94,7 +133,13 @@ def cached_index_matches(
        cached = json.loads(index_path.read_text())
    except json.JSONDecodeError:
        return False
-    if cached.get("speaker") != speaker or cached.get("language") != language:
+    if cached.get("instruct") != instruct or cached.get("language") != language:
+        return False
+    if int(cached.get("seed", -1)) != seed:
+        return False
+    if float(cached.get("temperature", -1)) != temperature:
+        return False
+    if float(cached.get("topP", -1)) != top_p:
        return False
    cached_items = cached.get("items", [])
    if len(cached_items) != len(cues):
@ -112,52 +157,179 @@ def cached_index_matches(
    return True


+def seed_everything(seed: int) -> None:
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+
+
+def _resolve_reference(
+    args: argparse.Namespace,
+    audio_dir: Path,
+    instruct: str,
+    language: str,
+    seed: int,
+    temperature: float,
+    top_p: float,
+) -> tuple[Path, str]:
+    """Return (ref_wav_path, ref_text) for the clone step.
+
+    If --reference-audio is supplied, validate and use it directly. Otherwise
+    mint one via VoiceDesign (cached on disk; cache invalidates when the
+    persona/sampling/seed changes). The design model is unloaded before
+    returning so the clone model can claim the GPU.
+    """
+    if args.reference_audio is not None:
+        if not args.reference_audio.exists():
+            raise SystemExit(f"[synth] --reference-audio does not exist: {args.reference_audio}")
+        if not args.reference_text:
+            raise SystemExit("[synth] --reference-text is required when --reference-audio is set")
+        print(
+            f"[synth] using user-supplied reference {args.reference_audio} «{args.reference_text}»",
+            flush=True,
+        )
+        return args.reference_audio, args.reference_text
+
+    ref_wav_path = audio_dir / "_reference.wav"
+    ref_meta_path = audio_dir / "_reference.meta.json"
+    ref_meta = {
+        "model": args.design_model,
+        "instruct": instruct,
+        "language": language,
+        "seed": seed,
+        "temperature": temperature,
+        "topP": top_p,
+        "text": REFERENCE_TEXT,
+    }
+    if (
+        ref_wav_path.exists()
+        and ref_meta_path.exists()
+        and _safe_load_json(ref_meta_path) == ref_meta
+    ):
+        print(f"[synth] reusing cached voice reference {ref_wav_path.name}", flush=True)
+        return ref_wav_path, REFERENCE_TEXT
+
+    print(
+        f"[synth] minting voice reference via VoiceDesign: «{REFERENCE_TEXT}»",
+        flush=True,
+    )
+    design_model = load_model(args.design_model, args.device)
+    seed_everything(seed)
+    ref_wavs, ref_sr = design_model.generate_voice_design(
+        text=[REFERENCE_TEXT],
+        language=language,
+        instruct=instruct,
+        do_sample=True,
+        temperature=temperature,
+        top_p=top_p,
+    )
+    ref_audio = ref_wavs[0]
+    if hasattr(ref_audio, "cpu"):
+        ref_audio = ref_audio.cpu().float().numpy()
+    sf.write(str(ref_wav_path), ref_audio, ref_sr)
+    ref_meta_path.write_text(json.dumps(ref_meta, indent=2))
+
+    # Free the design model before loading the clone model — both are 1.7B,
+    # we don't want them resident at the same time.
+    del design_model
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    return ref_wav_path, REFERENCE_TEXT
+
+
 def main() -> int:
    args = parse_args()

-    if args.list_speakers:
-        model = load_model(args.model, args.device)
-        speakers = model.get_supported_speakers()
-        print(json.dumps(speakers, indent=2, ensure_ascii=False))
-        return 0
+    storyboard_dir = args.output_dir / args.storyboard
+    script_path = storyboard_dir / "narration-script.json"
+    audio_dir = storyboard_dir / "audio"

-    if not args.script.exists():
-        print(f"[synth] script not found: {args.script}", file=sys.stderr)
+    if not script_path.exists():
+        print(f"[synth] script not found: {script_path}", file=sys.stderr)
        return 1

-    script = json.loads(args.script.read_text())
+    script = json.loads(script_path.read_text())
    cues = [c for c in script.get("items", []) if c.get("text", "").strip()]
    if not cues:
        print("[synth] script has no cues; nothing to generate.", file=sys.stderr)
        return 1

-    args.out_dir.mkdir(parents=True, exist_ok=True)
+    voice = script.get("voice")
+    if not voice:
+        print(
+            f"[synth] {script_path} has no `voice` block — re-run preflight.",
+            file=sys.stderr,
+        )
+        return 1
+    instruct = voice["instruct"]
+    language = voice["language"]
+    temperature = float(voice.get("temperature", 0.6))
+    top_p = float(voice.get("topP", 0.9))
+    seed = int(voice.get("seed", 42))
+
+    audio_dir.mkdir(parents=True, exist_ok=True)

    # Skip generation when the existing audio matches the script — same cue
-    # texts and same gapBeforeMs values in the same order. Saves ~30s of GPU
-    # time when iterating on activity timing without changing narration.
-    if cached_index_matches(args.out_dir / "index.json", cues, args.speaker, args.language):
+    # texts and same gapBeforeMs values in the same order, AND same synth
+    # settings (instruct/language/seed/temperature/top_p). Saves ~30s of GPU
+    # time when iterating on activity timing without changing narration or persona.
+    if cached_index_matches(
+        audio_dir / "index.json",
+        cues,
+        instruct,
+        language,
+        seed,
+        temperature,
+        top_p,
+    ):
        print(
-            f"[synth] cached audio in {args.out_dir} matches the current script — skipping generation",
+            f"[synth] [{args.storyboard}] cached audio matches the current script — skipping generation",
            flush=True,
        )
        return 0

-    model = load_model(args.model, args.device)
-
    texts = [c["text"].strip() for c in cues]
-    print(f"[synth] generating {len(texts)} cues in one batched call", flush=True)
+    print(f"[synth] [{args.storyboard}] persona: {instruct}", flush=True)
+    print(
+        f"[synth] [{args.storyboard}] sampling: temperature={temperature} top_p={top_p} seed={seed} language={language}",
+        flush=True,
+    )
+
+    # Two-stage generation:
+    #   1. VoiceDesign mints a single reference clip in the target persona
+    #      (or the user supplies one via --reference-audio).
+    #   2. Base + generate_voice_clone(x_vector_only_mode=True) conditions
+    #      every cue on the reference's speaker embedding.
+    # Without (2), batched generation drifts timbre across cues — a persona
+    # prompt anchors style but not identity, so each batch item picks its
+    # own voice. The reference WAV is cached so subsequent runs only load
+    # the clone model (saves ~20s + 3.4 GB of disk download).
+    ref_wav_path, ref_text = _resolve_reference(
+        args, audio_dir, instruct, language, seed, temperature, top_p
+    )
+
+    print(
+        f"[synth] cloning {len(texts)} cues from reference (x_vector_only) — one batched call",
+        flush=True,
+    )
    for i, t in enumerate(texts):
        print(f"[synth]   {i:2d}: {t}", flush=True)

-    # ONE batched call. generate_custom_voice handles text=List[str] natively
-    # and broadcasts the speaker/language across all items, so the entire
-    # narration is decoded in one model pass — same RNG state, same batch,
-    # consistent voice from cue to cue.
-    wavs, sr = model.generate_custom_voice(
+    clone_model = load_model(args.clone_model, args.device)
+    seed_everything(seed)
+    wavs, sr = clone_model.generate_voice_clone(
        text=texts,
-        language=args.language,
-        speaker=args.speaker,
+        language=language,
+        ref_audio=str(ref_wav_path),
+        ref_text=ref_text,
+        x_vector_only_mode=True,
+        non_streaming_mode=True,
+        do_sample=True,
+        temperature=temperature,
+        top_p=top_p,
    )
    if len(wavs) != len(texts):
        print(
@ -171,7 +343,7 @@ def main() -> int:
        if hasattr(audio, "cpu"):
            audio = audio.cpu().float().numpy()
        wav_name = f"cue_{cue['cueIndex']:03d}.wav"
-        wav_path = args.out_dir / wav_name
+        wav_path = audio_dir / wav_name
        sf.write(str(wav_path), audio, sr)
        duration_ms = int(round(len(audio) * 1000 / sr))
        items.append(
@ -190,15 +362,21 @@ def main() -> int:
        )

    out_index = {
-        "speaker": args.speaker,
-        "language": args.language,
-        "model": args.model,
+        "storyboard": args.storyboard,
+        "instruct": instruct,
+        "language": language,
+        "designModel": args.design_model,
+        "cloneModel": args.clone_model,
+        "referenceText": ref_text,
+        "seed": seed,
+        "temperature": temperature,
+        "topP": top_p,
        "items": items,
    }
-    (args.out_dir / "index.json").write_text(json.dumps(out_index, indent=2))
+    (audio_dir / "index.json").write_text(json.dumps(out_index, indent=2))
    total_ms = sum(it["gapBeforeMs"] + it["durationMs"] for it in items)
    print(
-        f"[synth] {len(items)} cues, {total_ms}ms of audio (incl. gaps) -> {args.out_dir}",
+        f"[synth] [{args.storyboard}] {len(items)} cues, {total_ms}ms of audio (incl. gaps) -> {audio_dir}",
        flush=True,
    )
    return 0