perfect-postcode/video/tts/synth.py

"""Synthesize one storyboard's narration in ONE batched Qwen3-TTS call.

Reads ``output/<storyboard>/narration-script.json`` (emitted by
``dist/preflight.js``), obtains a reference clip of the narrator voice
(minted once via ``Qwen3TTSModel.generate_voice_design`` and cached on disk,
or supplied with ``--reference-audio``), then runs
``Qwen3TTSModel.generate_voice_clone`` with all cue texts as a single batched
list. Every cue is conditioned on that reference's speaker embedding, which
keeps timbre consistent across cues. Per-cue
WAVs and an index manifest go to ``output/<storyboard>/audio/`` for the
recording step (which reads measured cue durations) and the mux step (which
drops each WAV at its videoTime).

Voice persona, language, and sampling come from the storyboard via the
``voice`` block of the narration script. CLI flags can still override them
for ad-hoc experimentation; storyboards remain the source of truth for
production runs.

We use the VoiceDesign sibling of CustomVoice because it accepts a free-form
voice persona (British accent, narrator register, "no laughter") via the
``instruct`` parameter. CustomVoice's preset speakers are all American or
non-English, and its ``instruct`` is documented for emotion only — it
ignored accent directives and bled non-speech tokens (laughter, sighs)
between cues.

Run from the ``video/`` directory:

    uv run --project tts python tts/synth.py --storyboard recording
"""

from __future__ import annotations

import argparse
import json
import os
import random
import sys
from pathlib import Path

import numpy as np
import soundfile as sf
import torch
from qwen_tts import Qwen3TTSModel


# Generation is split across two checkpoints. The design model produces a
# reference clip in the requested persona; the clone model then conditions
# every cue on the x-vector of that reference. generate_voice_clone is only
# available on the Base checkpoint — neither CustomVoice nor VoiceDesign
# supports it.
DEFAULT_CLONE_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
DEFAULT_DESIGN_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign"

# Utterance spoken by the minted reference clip, which anchors the speaker
# timbre. One reference is generated per (model, instruct, language,
# sampling, seed) combination and reused for every cue, so all narration
# shares a single x-vector. Two short sentences cover enough phonemes for a
# stable embedding while keeping generation time low.
REFERENCE_TEXT = (
    "Welcome to the demonstration. "
    "This is the narrator voice you'll hear throughout the video."
)


def _safe_load_json(path: Path) -> object | None:
    """Return the JSON document stored at ``path``, or ``None`` if unusable.

    "Unusable" covers every way the read can fail, so callers can treat the
    result as a plain cache probe that never raises:

    * the path is missing, is a directory, or cannot be read (``OSError``);
    * the bytes are not valid UTF-8 (``UnicodeDecodeError``);
    * the text is not valid JSON (``json.JSONDecodeError``).

    The last two are both ``ValueError`` subclasses, hence the broad-looking
    but deliberate ``except`` clause.
    """
    try:
        # Decode explicitly as UTF-8 rather than the locale's default
        # encoding so the result does not depend on the host configuration.
        return json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return None


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the synth command line.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behaviour existing callers
            rely on; passing a list lets callers drive the parser directly.

    Returns:
        Namespace with ``storyboard``, ``output_dir``, ``design_model``,
        ``clone_model``, ``reference_audio``, ``reference_text`` and
        ``device``. Every option except ``--storyboard`` and ``--output-dir``
        falls back to a ``TTS_*`` environment variable before its built-in
        default.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring contains paragraphs and an indented usage
        # example; the default formatter would reflow them into one blob.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--storyboard",
        required=True,
        help="Storyboard slug (matches Storyboard.name in src/storyboard.ts).",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("output"),
        help="Root output directory; per-storyboard files live in <root>/<storyboard>/.",
    )
    parser.add_argument(
        "--design-model",
        default=os.environ.get("TTS_DESIGN_MODEL", DEFAULT_DESIGN_MODEL),
        help="Checkpoint used to mint the voice reference (VoiceDesign by default).",
    )
    parser.add_argument(
        "--clone-model",
        default=os.environ.get("TTS_CLONE_MODEL", DEFAULT_CLONE_MODEL),
        help="Checkpoint used to clone the cue audio from the reference (Base by default).",
    )
    # An unset or empty TTS_REFERENCE_AUDIO means "no user-supplied reference".
    env_reference_audio = os.environ.get("TTS_REFERENCE_AUDIO")
    parser.add_argument(
        "--reference-audio",
        type=Path,
        default=Path(env_reference_audio) if env_reference_audio else None,
        help="Path to an existing reference WAV. If set, skip VoiceDesign and clone from this.",
    )
    parser.add_argument(
        "--reference-text",
        default=os.environ.get("TTS_REFERENCE_TEXT"),
        help="Transcript of --reference-audio. Required if --reference-audio is set.",
    )
    parser.add_argument(
        "--device",
        default=os.environ.get("TTS_DEVICE", "cuda:0"),
        help="Device the models are loaded on (default: cuda:0).",
    )
    return parser.parse_args(argv)


def load_model(model_id: str, device: str) -> Qwen3TTSModel:
    """Load the checkpoint ``model_id`` onto ``device`` and return it.

    Devices whose name starts with ``cuda`` load in bfloat16; every other
    device loads in float32. The chosen combination is logged before loading.
    """
    if device.startswith("cuda"):
        weights_dtype = torch.bfloat16
    else:
        weights_dtype = torch.float32
    print(f"[synth] loading {model_id} on {device} ({weights_dtype})", flush=True)
    model = Qwen3TTSModel.from_pretrained(
        model_id,
        device_map=device,
        dtype=weights_dtype,
    )
    return model


def cached_index_matches(
    index_path: Path,
    cues: list[dict],
    instruct: str,
    language: str,
    seed: int,
    temperature: float,
    top_p: float,
) -> bool:
    """Return True iff index_path's cue list lines up with `cues` 1:1.

    Compared fields: ``cueIndex``, ``text`` (whitespace-stripped),
    ``gapBeforeMs`` plus the synth settings (``instruct``, ``language``,
    ``seed``, ``temperature``, ``top_p``). All cue WAV files must also exist
    on disk as regular files next to the index. Mismatched length, reordered
    cues, or a missing WAV invalidate the cache.

    The index is treated as untrusted input: an unreadable file, invalid
    JSON, a non-object root, a non-list ``items``, non-object items, or
    fields of the wrong type all count as a cache miss (``False``) instead of
    raising, so a corrupted cache simply triggers regeneration.

    Args:
        index_path: Path of the cached ``index.json``.
        cues: Live cues from the narration script; each must carry
            ``cueIndex`` and ``text`` (``gapBeforeMs`` defaults to 0).
        instruct: Voice persona prompt the cache must have been built with.
        language: Language the cache must have been built with.
        seed: Sampling seed the cache must have been built with.
        temperature: Sampling temperature the cache must have been built with.
        top_p: Nucleus-sampling threshold the cache must have been built with.
    """
    # Read directly instead of exists()-then-read: one fewer syscall and no
    # window for the file to vanish in between. UnicodeDecodeError and
    # JSONDecodeError are both ValueError subclasses.
    try:
        cached = json.loads(index_path.read_text())
    except (OSError, ValueError):
        return False
    if not isinstance(cached, dict):
        return False

    # Synth settings. The -1 defaults can never equal a real setting, so a
    # missing key is a mismatch. Coercion failures (null, non-numeric
    # strings, infinities) mean the cache is malformed.
    try:
        settings_match = (
            cached.get("instruct") == instruct
            and cached.get("language") == language
            and int(cached.get("seed", -1)) == seed
            and float(cached.get("temperature", -1)) == temperature
            and float(cached.get("topP", -1)) == top_p
        )
    except (TypeError, ValueError, OverflowError):
        return False
    if not settings_match:
        return False

    cached_items = cached.get("items", [])
    if not isinstance(cached_items, list) or len(cached_items) != len(cues):
        return False

    for live, prev in zip(cues, cached_items):
        if not isinstance(prev, dict):
            return False
        try:
            prev_index = int(prev.get("cueIndex", -1))
            prev_gap = int(prev.get("gapBeforeMs", -1))
        except (TypeError, ValueError, OverflowError):
            return False
        if int(live["cueIndex"]) != prev_index:
            return False
        if live["text"].strip() != str(prev.get("text", "")).strip():
            return False
        if int(live.get("gapBeforeMs", 0)) != prev_gap:
            return False
        wav = prev.get("wav")
        # is_file() rather than exists(): a directory (e.g. wav == ".") is
        # not usable audio.
        if not isinstance(wav, str) or not wav:
            return False
        if not (index_path.parent / wav).is_file():
            return False
    return True


def seed_everything(seed: int) -> None:
    """Seed Python's ``random``, NumPy and PyTorch with the same ``seed``.

    When CUDA is available, the generators of all CUDA devices are seeded
    as well.
    """
    for reseed in (random.seed, np.random.seed, torch.manual_seed):
        reseed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def _resolve_reference(
    args: argparse.Namespace,
    audio_dir: Path,
    instruct: str,
    language: str,
    seed: int,
    temperature: float,
    top_p: float,
) -> tuple[Path, str]:
    """Return (ref_wav_path, ref_text) for the clone step.

    If --reference-audio is supplied, validate and use it directly. Otherwise
    mint one via VoiceDesign (cached on disk; cache invalidates when the
    persona/sampling/seed changes). The design model is unloaded before
    returning so the clone model can claim the GPU.

    Args:
        args: Parsed CLI namespace; reads ``reference_audio``,
            ``reference_text``, ``design_model`` and ``device``.
        audio_dir: Directory holding ``_reference.wav`` and
            ``_reference.meta.json``; it must already exist.
        instruct: Voice persona prompt passed to VoiceDesign.
        language: Language passed to VoiceDesign.
        seed: Seed applied right before the reference is generated.
        temperature: Sampling temperature for the reference.
        top_p: Nucleus-sampling threshold for the reference.

    Raises:
        SystemExit: ``--reference-audio`` points at a missing path, or is set
            without ``--reference-text``.
    """
    if args.reference_audio is not None:
        if not args.reference_audio.exists():
            raise SystemExit(f"[synth] --reference-audio does not exist: {args.reference_audio}")
        if not args.reference_text:
            raise SystemExit("[synth] --reference-text is required when --reference-audio is set")
        print(
            f"[synth] using user-supplied reference {args.reference_audio} «{args.reference_text}»",
            flush=True,
        )
        return args.reference_audio, args.reference_text

    ref_wav_path = audio_dir / "_reference.wav"
    ref_meta_path = audio_dir / "_reference.meta.json"
    # Everything that influences the minted clip. The cached WAV is reused
    # only when the stored metadata equals this dict exactly.
    ref_meta = {
        "model": args.design_model,
        "instruct": instruct,
        "language": language,
        "seed": seed,
        "temperature": temperature,
        "topP": top_p,
        "text": REFERENCE_TEXT,
    }
    if (
        ref_wav_path.exists()
        and ref_meta_path.exists()
        and _safe_load_json(ref_meta_path) == ref_meta
    ):
        print(f"[synth] reusing cached voice reference {ref_wav_path.name}", flush=True)
        return ref_wav_path, REFERENCE_TEXT

    print(
        f"[synth] minting voice reference via VoiceDesign: «{REFERENCE_TEXT}»",
        flush=True,
    )
    design_model = load_model(args.design_model, args.device)
    seed_everything(seed)
    ref_wavs, ref_sr = design_model.generate_voice_design(
        text=[REFERENCE_TEXT],
        language=language,
        instruct=instruct,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )
    ref_audio = ref_wavs[0]
    # Tensor output is moved to host memory as float32 before writing;
    # anything without a .cpu() method is written as returned.
    if hasattr(ref_audio, "cpu"):
        ref_audio = ref_audio.cpu().float().numpy()

    # Drop the previous metadata BEFORE overwriting the WAV. Otherwise an
    # interruption between the two writes leaves the old metadata paired
    # with the new audio, and a later run using the old settings would get a
    # false cache hit.
    ref_meta_path.unlink(missing_ok=True)
    sf.write(str(ref_wav_path), ref_audio, ref_sr)
    ref_meta_path.write_text(json.dumps(ref_meta, indent=2))

    # Free the design model before loading the clone model — both are 1.7B,
    # we don't want them resident at the same time.
    del design_model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return ref_wav_path, REFERENCE_TEXT


def _cache_reflects_cli_overrides(index_path: Path, args: argparse.Namespace) -> bool:
    """Return True iff the cached index was built with the current models and reference text."""
    cached = _safe_load_json(index_path)
    if not isinstance(cached, dict):
        return False
    if cached.get("cloneModel") != args.clone_model:
        return False
    if args.reference_audio is not None:
        # User-supplied reference: the design model is never loaded, so only
        # the transcript is compared.
        # NOTE(review): the index does not record the reference audio path,
        # so swapping the WAV while keeping the same transcript is not
        # detected here.
        return cached.get("referenceText") == args.reference_text
    return (
        cached.get("designModel") == args.design_model
        and cached.get("referenceText") == REFERENCE_TEXT
    )


def main() -> int:
    """Synthesize every narrated cue of one storyboard.

    Reads ``<output-dir>/<storyboard>/narration-script.json``, resolves the
    voice reference, clones all cue texts in one batched call, and writes
    ``cue_NNN.wav`` files plus ``index.json`` into the storyboard's ``audio/``
    directory. Generation is skipped when the existing index already matches
    the script, the synth settings, and the model/reference CLI overrides.

    Returns:
        Process exit code: 0 on success or cache hit; 1 when the script is
        missing, has no narrated cues, has no ``voice`` block, or the model
        returns a different number of clips than cues.
    """
    args = parse_args()

    storyboard_dir = args.output_dir / args.storyboard
    script_path = storyboard_dir / "narration-script.json"
    audio_dir = storyboard_dir / "audio"
    index_path = audio_dir / "index.json"

    if not script_path.exists():
        print(f"[synth] script not found: {script_path}", file=sys.stderr)
        return 1

    # JSON is UTF-8 by specification (RFC 8259); decode explicitly so
    # non-ASCII narration does not depend on the locale's default encoding.
    script = json.loads(script_path.read_text(encoding="utf-8"))
    # Cues whose text is empty or whitespace-only are not synthesized.
    cues = [c for c in script.get("items", []) if c.get("text", "").strip()]
    if not cues:
        print("[synth] script has no cues; nothing to generate.", file=sys.stderr)
        return 1

    voice = script.get("voice")
    if not voice:
        print(
            f"[synth] {script_path} has no `voice` block — re-run preflight.",
            file=sys.stderr,
        )
        return 1
    instruct = voice["instruct"]
    language = voice["language"]
    temperature = float(voice.get("temperature", 0.6))
    top_p = float(voice.get("topP", 0.9))
    seed = int(voice.get("seed", 42))

    audio_dir.mkdir(parents=True, exist_ok=True)

    # Skip generation when the existing audio matches the script — same cue
    # texts and same gapBeforeMs values in the same order, AND same synth
    # settings (instruct/seed/temperature/top_p), AND same models/reference
    # text, so --clone-model / --design-model / --reference-text overrides
    # are never silently ignored. Saves ~30s of GPU time when iterating on
    # activity timing without changing narration or persona.
    if cached_index_matches(
        index_path,
        cues,
        instruct,
        language,
        seed,
        temperature,
        top_p,
    ) and _cache_reflects_cli_overrides(index_path, args):
        print(
            f"[synth] [{args.storyboard}] cached audio matches the current script — skipping generation",
            flush=True,
        )
        return 0

    texts = [c["text"].strip() for c in cues]
    print(f"[synth] [{args.storyboard}] persona: {instruct}", flush=True)
    print(
        f"[synth] [{args.storyboard}] sampling: temperature={temperature} top_p={top_p} seed={seed} language={language}",
        flush=True,
    )

    # Two-stage generation:
    #   1. VoiceDesign mints a single reference clip in the target persona
    #      (or the user supplies one via --reference-audio).
    #   2. Base + generate_voice_clone(x_vector_only_mode=True) conditions
    #      every cue on the reference's speaker embedding.
    # Without (2), batched generation drifts timbre across cues — a persona
    # prompt anchors style but not identity, so each batch item picks its
    # own voice. The reference WAV is cached so subsequent runs only load
    # the clone model (saves ~20s + 3.4 GB of disk download).
    ref_wav_path, ref_text = _resolve_reference(
        args, audio_dir, instruct, language, seed, temperature, top_p
    )

    print(
        f"[synth] cloning {len(texts)} cues from reference (x_vector_only) — one batched call",
        flush=True,
    )
    for i, t in enumerate(texts):
        print(f"[synth]   {i:2d}: {t}", flush=True)

    clone_model = load_model(args.clone_model, args.device)
    seed_everything(seed)
    wavs, sr = clone_model.generate_voice_clone(
        text=texts,
        language=language,
        ref_audio=str(ref_wav_path),
        ref_text=ref_text,
        x_vector_only_mode=True,
        non_streaming_mode=True,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )
    if len(wavs) != len(texts):
        print(
            f"[synth] model returned {len(wavs)} wavs for {len(texts)} cues",
            file=sys.stderr,
        )
        return 1

    # Drop the previous index BEFORE overwriting any cue WAV. Otherwise an
    # interruption mid-write leaves the old index vouching for new audio.
    index_path.unlink(missing_ok=True)

    items = []
    for cue, audio in zip(cues, wavs):
        if hasattr(audio, "cpu"):
            audio = audio.cpu().float().numpy()
        # Coerce like cached_index_matches does, so a cueIndex that arrives
        # as e.g. 3.0 or "3" still formats with :03d.
        wav_name = f"cue_{int(cue['cueIndex']):03d}.wav"
        wav_path = audio_dir / wav_name
        sf.write(str(wav_path), audio, sr)
        duration_ms = int(round(len(audio) * 1000 / sr))
        items.append(
            {
                "cueIndex": cue["cueIndex"],
                "text": cue["text"],
                "gapBeforeMs": int(cue.get("gapBeforeMs", 0)),
                "wav": wav_name,
                "sampleRate": sr,
                "durationMs": duration_ms,
            }
        )
        print(
            f"[synth] wrote {wav_name}  {duration_ms:>5d}ms  «{cue['text']}»",
            flush=True,
        )

    out_index = {
        "storyboard": args.storyboard,
        "instruct": instruct,
        "language": language,
        "designModel": args.design_model,
        "cloneModel": args.clone_model,
        "referenceText": ref_text,
        "seed": seed,
        "temperature": temperature,
        "topP": top_p,
        "items": items,
    }
    index_path.write_text(json.dumps(out_index, indent=2))
    total_ms = sum(it["gapBeforeMs"] + it["durationMs"] for it in items)
    print(
        f"[synth] [{args.storyboard}] {len(items)} cues, {total_ms}ms of audio (incl. gaps) -> {audio_dir}",
        flush=True,
    )
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())