More FE changes

2026-05-09 09:43:41 +01:00 · 2026-05-09 09:43:41 +01:00 · a48eb945e0
commit a48eb945e0
parent f114ada255
48 changed files with 4127 additions and 1751 deletions
--- a/video/tts/mux.py
+++ b/video/tts/mux.py
@ -0,0 +1,188 @@
+"""Mux per-cue WAVs into recording.mp4 at their narration offsets.
+
+Reads two manifests:
+
+* ``output/audio/index.json`` (synth output) — per-cue WAV filename + measured
+  duration. Generated BEFORE recording in one batched Qwen3-TTS call.
+* ``output/narration.json`` (recorder output) — per-cue ``videoTimeMs`` against
+  the trimmed video. Generated DURING recording.
+
+Joins them by ``cueIndex`` (index in the cue list, 1:1 between manifests),
+runs ffmpeg with one ``adelay`` per cue plus a single ``amix``, copies the
+video stream, and writes ``output/recording.narrated.mp4``.
+
+Run from the ``video/`` directory after recording:
+
+    uv run --project tts python tts/mux.py
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line flags for the mux step.
+
+    Every path flag defaults to a location under ``output/``; ``--replace``
+    is an opt-in boolean switch.
+    """
+    cli = argparse.ArgumentParser(description=__doc__)
+
+    # (flag, default path, help text) for the Path-typed options, registered
+    # in the order they should appear in ``--help``.
+    path_flags = (
+        ("--audio-dir", "output/audio", None),
+        (
+            "--narration",
+            "output/narration.json",
+            "Per-cue videoTimeMs manifest written by the recorder.",
+        ),
+        ("--video", "output/recording.mp4", None),
+        ("--out", "output/recording.narrated.mp4", None),
+    )
+    for flag, default_path, help_text in path_flags:
+        cli.add_argument(flag, type=Path, default=Path(default_path), help=help_text)
+
+    cli.add_argument(
+        "--replace",
+        help="After muxing, atomically replace --video with --out.",
+        action="store_true",
+    )
+    return cli.parse_args()
+
+
+def _load_manifest(path: Path) -> dict:
+    """Read a JSON manifest as UTF-8 regardless of the platform locale."""
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _find_overlaps(items: list[dict]) -> list[str]:
+    """Describe each pair of neighbouring cues whose audio would overlap."""
+    # Sort by start so the order matches what will actually play, then check
+    # that each cue ends before the next one starts.
+    ordered = sorted(items, key=lambda it: it["videoTimeMs"])
+    overlaps: list[str] = []
+    for prev, nxt in zip(ordered, ordered[1:]):
+        prev_end = prev["videoTimeMs"] + prev["durationMs"]
+        nxt_start = nxt["videoTimeMs"]
+        if prev_end > nxt_start:
+            overlaps.append(
+                f"cue {prev['cueIndex']} ends at {prev_end}ms but cue {nxt['cueIndex']} "
+                f"starts at {nxt_start}ms (overlap {prev_end - nxt_start}ms)"
+            )
+    return overlaps
+
+
+def _build_ffmpeg_cmd(
+    video: Path, audio_dir: Path, items: list[dict], out: Path
+) -> list[str]:
+    """Assemble the ffmpeg argv that delays, mixes and muxes the cue WAVs."""
+    cmd: list[str] = ["ffmpeg", "-y", "-loglevel", "warning", "-i", str(video)]
+    for it in items:
+        cmd += ["-i", str(audio_dir / it["wav"])]
+
+    filter_parts: list[str] = []
+    mix_inputs: list[str] = []
+    # Input 0 is the video, so the cue WAVs are inputs 1..N.
+    for n, it in enumerate(items, start=1):
+        delay_ms = max(0, it["videoTimeMs"])
+        label = f"a{n}"
+        # adelay needs one delay per channel; "all=1" applies the same delay
+        # to every channel, which is what we want for mono narration.
+        filter_parts.append(
+            f"[{n}:a]aresample=async=1,adelay={delay_ms}|{delay_ms}:all=1[{label}]"
+        )
+        mix_inputs.append(f"[{label}]")
+
+    # The mixed track ends when the last cue ends. Pad it with trailing
+    # silence (apad) so that, combined with -shortest below, the output is cut
+    # at the end of the VIDEO rather than at the end of the last cue.
+    mix = (
+        f"{''.join(mix_inputs)}amix=inputs={len(items)}"
+        f":duration=longest:dropout_transition=0:normalize=0,apad[aout]"
+    )
+    filter_complex = ";".join(filter_parts + [mix])
+
+    cmd += [
+        "-filter_complex",
+        filter_complex,
+        "-map",
+        "0:v:0",
+        "-map",
+        "[aout]",
+        "-c:v",
+        "copy",
+        "-c:a",
+        "aac",
+        "-b:a",
+        "192k",
+        "-shortest",
+        "-movflags",
+        "+faststart",
+        str(out),
+    ]
+    return cmd
+
+
+def main() -> int:
+    """Join the synth and recorder manifests and mux the narration with ffmpeg.
+
+    Returns:
+        The process exit status: 0 on success (including the "no cues" case,
+        where the video is copied unchanged), 1 for a missing input or
+        inconsistent manifests, or ffmpeg's own non-zero exit status.
+
+    Raises:
+        SystemExit: if two cues would play on top of each other.
+    """
+    args = parse_args()
+
+    if not shutil.which("ffmpeg"):
+        print("[mux] ffmpeg not on PATH", file=sys.stderr)
+        return 1
+
+    audio_index_path = args.audio_dir / "index.json"
+    if not audio_index_path.exists():
+        print(
+            f"[mux] {audio_index_path} not found; run tts/synth.py first",
+            file=sys.stderr,
+        )
+        return 1
+
+    if not args.narration.exists():
+        print(
+            f"[mux] {args.narration} not found; the recorder must run before mux",
+            file=sys.stderr,
+        )
+        return 1
+
+    if not args.video.exists():
+        print(f"[mux] video not found: {args.video}", file=sys.stderr)
+        return 1
+
+    # ffmpeg -y would overwrite the file it is still reading from.
+    if args.out.resolve() == args.video.resolve():
+        print(
+            "[mux] --out must differ from --video; use --replace to overwrite the input",
+            file=sys.stderr,
+        )
+        return 1
+
+    audio_index = _load_manifest(audio_index_path)
+    audio_items = [it for it in audio_index.get("items", []) if it.get("wav")]
+    if not audio_items:
+        print("[mux] synth produced no cues; copying video unchanged", file=sys.stderr)
+        shutil.copyfile(args.video, args.out)
+        return 0
+
+    narration = _load_manifest(args.narration)
+    nar_cues = list(narration.get("cues", []))
+    if len(nar_cues) != len(audio_items):
+        print(
+            f"[mux] cue count mismatch: synth has {len(audio_items)} cues, "
+            f"recorder logged {len(nar_cues)}. Re-run preflight + synth + record.",
+            file=sys.stderr,
+        )
+        return 1
+
+    # Index the audio items by cueIndex, then pair each recorder cue (by its
+    # position in the recorder's list) with the WAV of the same index.
+    try:
+        audio_by_index = {int(it["cueIndex"]): it for it in audio_items}
+        items = []
+        for i, nar in enumerate(nar_cues):
+            audio = audio_by_index.get(i)
+            if audio is None:
+                print(f"[mux] no synth wav for cue {i}", file=sys.stderr)
+                return 1
+            items.append(
+                {
+                    "cueIndex": i,
+                    "wav": audio["wav"],
+                    "durationMs": int(audio["durationMs"]),
+                    "videoTimeMs": int(nar["videoTimeMs"]),
+                    "text": nar.get("text", ""),
+                }
+            )
+    except (KeyError, TypeError, ValueError) as err:
+        print(f"[mux] malformed manifest entry: {err!r}", file=sys.stderr)
+        return 1
+
+    # Refuse to mux overlapping cues — amix would silently mash voices on top
+    # of each other.
+    overlaps = _find_overlaps(items)
+    if overlaps:
+        raise SystemExit(
+            "[mux] refusing to produce overlapping narration:\n  - "
+            + "\n  - ".join(overlaps)
+        )
+
+    # Fail with a clear message instead of an opaque ffmpeg error.
+    missing = [it["wav"] for it in items if not (args.audio_dir / it["wav"]).is_file()]
+    if missing:
+        print(
+            f"[mux] missing cue audio in {args.audio_dir}: {', '.join(missing)}; "
+            "re-run tts/synth.py",
+            file=sys.stderr,
+        )
+        return 1
+
+    cmd = _build_ffmpeg_cmd(args.video, args.audio_dir, items, args.out)
+
+    print(f"[mux] muxing {len(items)} narration cues into {args.out}", flush=True)
+    result = subprocess.run(cmd)
+    if result.returncode != 0:
+        print(f"[mux] ffmpeg exited {result.returncode}", file=sys.stderr)
+        return result.returncode
+
+    if args.replace:
+        args.out.replace(args.video)
+        print(f"[mux] replaced {args.video} with narrated copy", flush=True)
+
+    return 0
+
+
+# Script entry point: pass main()'s return value to SystemExit so it becomes
+# the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/video/tts/synth.py
+++ b/video/tts/synth.py
@ -0,0 +1,208 @@
+"""Synthesize the full narration in ONE batched Qwen3-TTS call.
+
+Reads ``output/narration-script.json`` (emitted by ``dist/preflight.js``) and
+runs ``Qwen3TTSModel.generate_custom_voice`` with all cue texts as a single
+batched list — that way every cue shares the same model state, which keeps
+prosody and timbre consistent across cues. Per-cue WAVs and an index manifest
+go to ``output/audio/`` for the recording step (which reads measured cue
+durations) and the mux step (which drops each WAV at its videoTime).
+
+Run from the ``video/`` directory:
+
+    uv run --project tts python tts/synth.py
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+
+import soundfile as sf
+import torch
+from qwen_tts import Qwen3TTSModel
+
+
+# Built-in synth settings, used when neither the CLI flag nor the matching
+# TTS_MODEL / TTS_SPEAKER / TTS_LANGUAGE environment variable is given.
+DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
+DEFAULT_SPEAKER = "ryan"
+DEFAULT_LANGUAGE = "English"
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line flags for the synth step.
+
+    Model, speaker, language and device fall back to the ``TTS_MODEL``,
+    ``TTS_SPEAKER``, ``TTS_LANGUAGE`` and ``TTS_DEVICE`` environment
+    variables before the built-in defaults.
+    """
+    env = os.environ
+    cli = argparse.ArgumentParser(description=__doc__)
+
+    # Input / output locations.
+    cli.add_argument(
+        "--script",
+        default=Path("output/narration-script.json"),
+        type=Path,
+        help="Narration script emitted by dist/preflight.js.",
+    )
+    cli.add_argument(
+        "--out-dir",
+        default=Path("output/audio"),
+        type=Path,
+        help="Directory to write WAV files and index.json into.",
+    )
+
+    # (flag, environment variable, built-in fallback, help text) for the
+    # string-valued synth settings, in ``--help`` order.
+    env_backed = (
+        ("--model", "TTS_MODEL", DEFAULT_MODEL, None),
+        (
+            "--speaker",
+            "TTS_SPEAKER",
+            DEFAULT_SPEAKER,
+            "CustomVoice preset speaker name (use --list-speakers to enumerate).",
+        ),
+        ("--language", "TTS_LANGUAGE", DEFAULT_LANGUAGE, None),
+        ("--device", "TTS_DEVICE", "cuda:0", None),
+    )
+    for flag, env_var, fallback, help_text in env_backed:
+        cli.add_argument(flag, default=env.get(env_var, fallback), help=help_text)
+
+    cli.add_argument(
+        "--list-speakers",
+        help="Load the model, print available speaker names, and exit.",
+        action="store_true",
+    )
+    return cli.parse_args()
+
+
+def load_model(model_id: str, device: str) -> Qwen3TTSModel:
+    """Load the pretrained TTS model ``model_id`` onto ``device``.
+
+    Devices whose name starts with ``cuda`` get bfloat16 weights; anything
+    else gets float32.
+    """
+    if device.startswith("cuda"):
+        precision = torch.bfloat16
+    else:
+        precision = torch.float32
+    print(f"[synth] loading {model_id} on {device} ({precision})", flush=True)
+    return Qwen3TTSModel.from_pretrained(
+        model_id,
+        device_map=device,
+        dtype=precision,
+    )
+
+
+def cached_index_matches(
+    index_path: Path,
+    cues: list[dict],
+    speaker: str,
+    language: str,
+    model: str | None = None,
+) -> bool:
+    """Return True iff index_path's cue list lines up with `cues` 1:1.
+
+    Compared fields: ``cueIndex``, ``text`` (whitespace-stripped) and
+    ``gapBeforeMs``, plus the synth settings (``speaker``, ``language`` and,
+    when given, ``model``). All cue WAV files must also exist on disk next to
+    the index. A missing, unreadable or malformed index, mismatched length,
+    reordered cues, or a missing WAV invalidate the cache.
+
+    Args:
+        index_path: Path of the ``index.json`` written by a previous run.
+        cues: Cues from the current narration script.
+        speaker: Speaker the audio is about to be generated with.
+        language: Language the audio is about to be generated with.
+        model: Model id the audio is about to be generated with. ``None``
+            skips the model comparison.
+    """
+    try:
+        cached = json.loads(index_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        # Missing/unreadable file, undecodable bytes, or invalid JSON.
+        return False
+    if not isinstance(cached, dict):
+        return False
+    if cached.get("speaker") != speaker or cached.get("language") != language:
+        return False
+    # Audio rendered by a different model is stale even if the text matches.
+    if model is not None and cached.get("model") != model:
+        return False
+    cached_items = cached.get("items", [])
+    if not isinstance(cached_items, list) or len(cached_items) != len(cues):
+        return False
+    for live, prev in zip(cues, cached_items):
+        if not isinstance(prev, dict):
+            return False
+        try:
+            prev_index = int(prev.get("cueIndex", -1))
+            prev_gap = int(prev.get("gapBeforeMs", -1))
+        except (TypeError, ValueError):
+            # A hand-edited or corrupted entry only invalidates the cache.
+            return False
+        if int(live["cueIndex"]) != prev_index:
+            return False
+        if live["text"].strip() != str(prev.get("text", "")).strip():
+            return False
+        if int(live.get("gapBeforeMs", 0)) != prev_gap:
+            return False
+        wav = prev.get("wav")
+        if not isinstance(wav, str) or not wav:
+            return False
+        if not (index_path.parent / wav).is_file():
+            return False
+    return True
+
+
+def main() -> int:
+    """Synthesize every cue of the narration script into WAVs plus index.json.
+
+    Returns:
+        The process exit status: 0 on success (also when cached audio is
+        reused or when only listing speakers), 1 when the script is missing,
+        contains no cues, or the model returns the wrong number of clips.
+    """
+    args = parse_args()
+
+    if args.list_speakers:
+        model = load_model(args.model, args.device)
+        speakers = model.get_supported_speakers()
+        print(json.dumps(speakers, indent=2, ensure_ascii=False))
+        return 0
+
+    if not args.script.exists():
+        print(f"[synth] script not found: {args.script}", file=sys.stderr)
+        return 1
+
+    # Read as UTF-8 explicitly so narration text does not depend on the
+    # platform's locale encoding.
+    script = json.loads(args.script.read_text(encoding="utf-8"))
+    cues = [c for c in script.get("items", []) if c.get("text", "").strip()]
+    if not cues:
+        print("[synth] script has no cues; nothing to generate.", file=sys.stderr)
+        return 1
+
+    args.out_dir.mkdir(parents=True, exist_ok=True)
+    index_path = args.out_dir / "index.json"
+
+    # Skip generation when the existing audio matches the script — same cue
+    # texts and same gapBeforeMs values in the same order, rendered with the
+    # same speaker, language and model. Saves ~30s of GPU time when iterating
+    # on activity timing without changing narration.
+    if cached_index_matches(
+        index_path, cues, args.speaker, args.language, model=args.model
+    ):
+        print(
+            f"[synth] cached audio in {args.out_dir} matches the current script — skipping generation",
+            flush=True,
+        )
+        return 0
+
+    model = load_model(args.model, args.device)
+
+    texts = [c["text"].strip() for c in cues]
+    print(f"[synth] generating {len(texts)} cues in one batched call", flush=True)
+    for i, t in enumerate(texts):
+        print(f"[synth]   {i:2d}: {t}", flush=True)
+
+    # ONE batched call. generate_custom_voice handles text=List[str] natively
+    # and broadcasts the speaker/language across all items, so the entire
+    # narration is decoded in one model pass — same RNG state, same batch,
+    # consistent voice from cue to cue.
+    wavs, sr = model.generate_custom_voice(
+        text=texts,
+        language=args.language,
+        speaker=args.speaker,
+    )
+    if len(wavs) != len(texts):
+        print(
+            f"[synth] model returned {len(wavs)} wavs for {len(texts)} cues",
+            file=sys.stderr,
+        )
+        return 1
+
+    items = []
+    for cue, audio in zip(cues, wavs):
+        # Tensor-like results are moved to the CPU and converted to a float
+        # numpy array before being written out.
+        if hasattr(audio, "cpu"):
+            audio = audio.cpu().float().numpy()
+        wav_name = f"cue_{cue['cueIndex']:03d}.wav"
+        wav_path = args.out_dir / wav_name
+        sf.write(str(wav_path), audio, sr)
+        duration_ms = int(round(len(audio) * 1000 / sr))
+        items.append(
+            {
+                "cueIndex": cue["cueIndex"],
+                "text": cue["text"],
+                "gapBeforeMs": int(cue.get("gapBeforeMs", 0)),
+                "wav": wav_name,
+                "sampleRate": sr,
+                "durationMs": duration_ms,
+            }
+        )
+        print(
+            f"[synth] wrote {wav_name}  {duration_ms:>5d}ms  «{cue['text']}»",
+            flush=True,
+        )
+
+    out_index = {
+        "speaker": args.speaker,
+        "language": args.language,
+        "model": args.model,
+        "items": items,
+    }
+    index_path.write_text(json.dumps(out_index, indent=2), encoding="utf-8")
+    total_ms = sum(it["gapBeforeMs"] + it["durationMs"] for it in items)
+    print(
+        f"[synth] {len(items)} cues, {total_ms}ms of audio (incl. gaps) -> {args.out_dir}",
+        flush=True,
+    )
+    return 0
+
+
+# Script entry point: pass main()'s return value to SystemExit so it becomes
+# the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())