# perfect-postcode/video/tts/synth.py

"""Synthesize the full narration in ONE batched Qwen3-TTS call.

Reads ``output/narration-script.json`` (emitted by ``dist/preflight.js``) and
runs ``Qwen3TTSModel.generate_custom_voice`` with all cue texts as a single
batched list — that way every cue shares the same model state, which keeps
prosody and timbre consistent across cues. Per-cue WAVs and an index manifest
go to ``output/audio/`` for the recording step (which reads measured cue
durations) and the mux step (which drops each WAV at its videoTime).

Run from the ``video/`` directory:

    uv run --project tts python tts/synth.py
"""

from __future__ import annotations

import argparse
import json
import os
import sys
from pathlib import Path

import soundfile as sf
import torch
from qwen_tts import Qwen3TTSModel


# Fallbacks for the --model / --speaker / --language CLI flags, used when the
# matching TTS_MODEL / TTS_SPEAKER / TTS_LANGUAGE environment variable is unset.
DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
DEFAULT_SPEAKER = "ryan"
DEFAULT_LANGUAGE = "English"


def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse ``sys.argv``.

    ``--model``, ``--speaker``, ``--language`` and ``--device`` take their
    defaults from the ``TTS_MODEL``, ``TTS_SPEAKER``, ``TTS_LANGUAGE`` and
    ``TTS_DEVICE`` environment variables when those are set.
    """
    env = os.environ.get
    # (flag, add_argument keyword arguments), in the order they appear in --help.
    option_table = (
        (
            "--script",
            {
                "type": Path,
                "default": Path("output/narration-script.json"),
                "help": "Narration script emitted by dist/preflight.js.",
            },
        ),
        (
            "--out-dir",
            {
                "type": Path,
                "default": Path("output/audio"),
                "help": "Directory to write WAV files and index.json into.",
            },
        ),
        ("--model", {"default": env("TTS_MODEL", DEFAULT_MODEL)}),
        (
            "--speaker",
            {
                "default": env("TTS_SPEAKER", DEFAULT_SPEAKER),
                "help": "CustomVoice preset speaker name (use --list-speakers to enumerate).",
            },
        ),
        ("--language", {"default": env("TTS_LANGUAGE", DEFAULT_LANGUAGE)}),
        ("--device", {"default": env("TTS_DEVICE", "cuda:0")}),
        (
            "--list-speakers",
            {
                "action": "store_true",
                "help": "Load the model, print available speaker names, and exit.",
            },
        ),
    )

    cli = argparse.ArgumentParser(description=__doc__)
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    return cli.parse_args()


def load_model(model_id: str, device: str) -> Qwen3TTSModel:
    """Load ``model_id`` onto ``device`` and return the model.

    Devices whose name starts with ``"cuda"`` load in bfloat16; every other
    device loads in float32. The choice is logged before loading starts.
    """
    if device.startswith("cuda"):
        dtype = torch.bfloat16
    else:
        dtype = torch.float32
    print(f"[synth] loading {model_id} on {device} ({dtype})", flush=True)
    model = Qwen3TTSModel.from_pretrained(model_id, device_map=device, dtype=dtype)
    return model


def cached_index_matches(
    index_path: Path,
    cues: list[dict],
    speaker: str,
    language: str,
    *,
    model: str | None = None,
) -> bool:
    """Return True iff index_path's cue list lines up with `cues` 1:1.

    Compared fields: ``cueIndex``, ``text`` (whitespace-stripped),
    ``gapBeforeMs`` plus the synth settings (``speaker``, ``language``).
    All cue WAV files must also exist on disk, resolved relative to the
    index file's directory. Mismatched length, reordered cues, or a missing
    WAV invalidate the cache.

    Any problem with the cached index itself — missing or unreadable file,
    invalid JSON, a top-level value that is not an object, non-object items,
    non-integer ``cueIndex``/``gapBeforeMs`` — is treated as a cache miss
    rather than an error, so a damaged index simply triggers regeneration.

    Args:
        index_path: Path to the cached ``index.json``.
        cues: Live cue dicts; each must carry ``cueIndex`` and ``text``.
        speaker: Speaker the cache must have been generated with.
        language: Language the cache must have been generated with.
        model: When given, the cached ``model`` field must equal it too.
            ``None`` (the default) skips the model comparison.

    Returns:
        True when the cache can be reused, False otherwise.
    """
    try:
        cached = json.loads(index_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: file missing/unreadable. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return False
    if not isinstance(cached, dict):
        return False
    if cached.get("speaker") != speaker or cached.get("language") != language:
        return False
    if model is not None and cached.get("model") != model:
        return False

    cached_items = cached.get("items", [])
    if not isinstance(cached_items, list) or len(cached_items) != len(cues):
        return False

    for live, prev in zip(cues, cached_items):
        if not isinstance(prev, dict):
            return False
        # Problems in the live cues are the caller's bug and still raise;
        # only the cached side is parsed defensively.
        live_index = int(live["cueIndex"])
        live_gap = int(live.get("gapBeforeMs", 0))
        try:
            # The -1 defaults make a cached item lacking either field a miss.
            prev_index = int(prev.get("cueIndex", -1))
            prev_gap = int(prev.get("gapBeforeMs", -1))
        except (TypeError, ValueError):
            return False
        if live_index != prev_index or live_gap != prev_gap:
            return False
        if live["text"].strip() != str(prev.get("text", "")).strip():
            return False
        wav = prev.get("wav")
        if not isinstance(wav, str) or not wav or not (index_path.parent / wav).exists():
            return False
    return True


def main() -> int:
    """Synthesize every narration cue and write WAVs plus ``index.json``.

    Flow:
      1. With ``--list-speakers``: load the model, print its supported
         speaker names as JSON, and stop.
      2. Load the narration script and keep the items whose ``text`` is
         non-blank.
      3. Stop early when ``cached_index_matches`` accepts the existing
         ``index.json`` in the output directory.
      4. Otherwise generate all cues in one batched call, write one
         ``cue_NNN.wav`` per cue and a fresh ``index.json``.

    Returns:
        ``0`` on success (including the speaker listing and a cache hit);
        ``1`` when the script is missing, unreadable, not a JSON object, or
        has no cues, or when the model returns a different number of
        waveforms than there are cues.
    """
    args = parse_args()

    if args.list_speakers:
        model = load_model(args.model, args.device)
        speakers = model.get_supported_speakers()
        print(json.dumps(speakers, indent=2, ensure_ascii=False))
        return 0

    if not args.script.exists():
        print(f"[synth] script not found: {args.script}", file=sys.stderr)
        return 1

    # Read as UTF-8 explicitly: the locale default (e.g. cp1252 on Windows)
    # would garble or reject non-ASCII narration text.
    try:
        script = json.loads(args.script.read_text(encoding="utf-8"))
    except (OSError, ValueError) as err:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"[synth] could not read script {args.script}: {err}", file=sys.stderr)
        return 1
    if not isinstance(script, dict):
        print(f"[synth] script {args.script} is not a JSON object", file=sys.stderr)
        return 1

    cues = [c for c in script.get("items", []) if c.get("text", "").strip()]
    if not cues:
        print("[synth] script has no cues; nothing to generate.", file=sys.stderr)
        return 1

    args.out_dir.mkdir(parents=True, exist_ok=True)
    index_path = args.out_dir / "index.json"

    # Skip generation when the existing audio matches the script — same cue
    # texts and same gapBeforeMs values in the same order. Saves ~30s of GPU
    # time when iterating on activity timing without changing narration.
    # NOTE(review): args.model is not part of this check, so switching
    # --model / TTS_MODEL alone reuses audio generated by the previous model.
    if cached_index_matches(index_path, cues, args.speaker, args.language):
        print(
            f"[synth] cached audio in {args.out_dir} matches the current script — skipping generation",
            flush=True,
        )
        return 0

    model = load_model(args.model, args.device)

    texts = [c["text"].strip() for c in cues]
    print(f"[synth] generating {len(texts)} cues in one batched call", flush=True)
    for i, t in enumerate(texts):
        print(f"[synth]   {i:2d}: {t}", flush=True)

    # ONE batched call. generate_custom_voice handles text=List[str] natively
    # and broadcasts the speaker/language across all items, so the entire
    # narration is decoded in one model pass — same RNG state, same batch,
    # consistent voice from cue to cue.
    wavs, sr = model.generate_custom_voice(
        text=texts,
        language=args.language,
        speaker=args.speaker,
    )
    if len(wavs) != len(texts):
        print(
            f"[synth] model returned {len(wavs)} wavs for {len(texts)} cues",
            file=sys.stderr,
        )
        return 1

    # Drop the old index before overwriting any WAV. If this run is
    # interrupted mid-write, no index is left behind describing audio that no
    # longer matches it (which a later cache check would otherwise accept).
    index_path.unlink(missing_ok=True)

    items = []
    for cue, audio in zip(cues, wavs):
        # Tensor-like outputs are moved to the CPU and converted to NumPy;
        # anything without a .cpu() method is written as-is.
        if hasattr(audio, "cpu"):
            audio = audio.cpu().float().numpy()
        wav_name = f"cue_{cue['cueIndex']:03d}.wav"
        wav_path = args.out_dir / wav_name
        sf.write(str(wav_path), audio, sr)
        # assumes len(audio) is the sample count (mono, samples-first) — TODO confirm
        duration_ms = int(round(len(audio) * 1000 / sr))
        items.append(
            {
                "cueIndex": cue["cueIndex"],
                "text": cue["text"],
                "gapBeforeMs": int(cue.get("gapBeforeMs", 0)),
                "wav": wav_name,
                "sampleRate": sr,
                "durationMs": duration_ms,
            }
        )
        print(
            f"[synth] wrote {wav_name}  {duration_ms:>5d}ms  «{cue['text']}»",
            flush=True,
        )

    out_index = {
        "speaker": args.speaker,
        "language": args.language,
        "model": args.model,
        "items": items,
    }
    # The index is written last, so its presence implies every WAV it lists
    # was written by this run.
    index_path.write_text(json.dumps(out_index, indent=2), encoding="utf-8")
    total_ms = sum(it["gapBeforeMs"] + it["durationMs"] for it in items)
    print(
        f"[synth] {len(items)} cues, {total_ms}ms of audio (incl. gaps) -> {args.out_dir}",
        flush=True,
    )
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())