lgtm

2026-05-12 22:00:56 +01:00 · 2026-05-12 22:00:56 +01:00 · 11711c57e6
commit 11711c57e6
parent 8708bf000d
38 changed files with 5361 additions and 265 deletions
--- a/video/tts/synth.py
+++ b/video/tts/synth.py
@ -116,6 +116,10 @@ def cached_index_matches(
    cues: list[dict],
    instruct: str,
    language: str,
+    reference_text: str,
+    design_model: str,
+    clone_model: str,
+    reference_audio: str,
    seed: int,
    temperature: float,
    top_p: float,
@ -123,7 +127,8 @@ def cached_index_matches(
    """Return True iff index_path's cue list lines up with `cues` 1:1.

    Compared fields: ``cueIndex``, ``text``, ``gapBeforeMs`` plus the synth
-    settings (``instruct``, ``language``, ``seed``, ``temperature``, ``top_p``).
+    settings (``instruct``, ``language``, reference text, reference audio,
+    design/clone models, ``seed``, ``temperature``, ``top_p``).
    All cue WAV files must also exist on disk. Mismatched length, reordered
    cues, or a missing WAV invalidate the cache.
    """
@ -135,6 +140,12 @@ def cached_index_matches(
        return False
    if cached.get("instruct") != instruct or cached.get("language") != language:
        return False
+    if cached.get("referenceText") != reference_text:
+        return False
+    if cached.get("designModel") != design_model or cached.get("cloneModel") != clone_model:
+        return False
+    if cached.get("referenceAudio", "") != reference_audio:
+        return False
    if int(cached.get("seed", -1)) != seed:
        return False
    if float(cached.get("temperature", -1)) != temperature:
@ -170,6 +181,7 @@ def _resolve_reference(
    audio_dir: Path,
    instruct: str,
    language: str,
+    reference_text: str,
    seed: int,
    temperature: float,
    top_p: float,
@ -178,8 +190,8 @@ def _resolve_reference(

    If --reference-audio is supplied, validate and use it directly. Otherwise
    mint one via VoiceDesign (cached on disk; cache invalidates when the
-    persona/sampling/seed changes). The design model is unloaded before
-    returning so the clone model can claim the GPU.
+    persona/language/reference/sampling/seed changes). The design model is
+    unloaded before returning so the clone model can claim the GPU.
    """
    if args.reference_audio is not None:
        if not args.reference_audio.exists():
@ -201,7 +213,7 @@ def _resolve_reference(
        "seed": seed,
        "temperature": temperature,
        "topP": top_p,
-        "text": REFERENCE_TEXT,
+        "text": reference_text,
    }
    if (
        ref_wav_path.exists()
@ -209,16 +221,16 @@ def _resolve_reference(
        and _safe_load_json(ref_meta_path) == ref_meta
    ):
        print(f"[synth] reusing cached voice reference {ref_wav_path.name}", flush=True)
-        return ref_wav_path, REFERENCE_TEXT
+        return ref_wav_path, reference_text

    print(
-        f"[synth] minting voice reference via VoiceDesign: «{REFERENCE_TEXT}»",
+        f"[synth] minting voice reference via VoiceDesign: «{reference_text}»",
        flush=True,
    )
    design_model = load_model(args.design_model, args.device)
    seed_everything(seed)
    ref_wavs, ref_sr = design_model.generate_voice_design(
-        text=[REFERENCE_TEXT],
+        text=[reference_text],
        language=language,
        instruct=instruct,
        do_sample=True,
@ -237,7 +249,7 @@ def _resolve_reference(
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

-    return ref_wav_path, REFERENCE_TEXT
+    return ref_wav_path, reference_text


 def main() -> int:
@ -266,21 +278,30 @@ def main() -> int:
        return 1
    instruct = voice["instruct"]
    language = voice["language"]
+    reference_text = str(voice.get("referenceText") or REFERENCE_TEXT)
    temperature = float(voice.get("temperature", 0.6))
    top_p = float(voice.get("topP", 0.9))
    seed = int(voice.get("seed", 42))
+    reference_audio_cache_key = (
+        str(args.reference_audio.resolve()) if args.reference_audio is not None else ""
+    )

    audio_dir.mkdir(parents=True, exist_ok=True)

    # Skip generation when the existing audio matches the script — same cue
    # texts and same gapBeforeMs values in the same order, AND same synth
-    # settings (instruct/seed/temperature/top_p). Saves ~30s of GPU time when
-    # iterating on activity timing without changing narration or persona.
+    # settings (instruct/language/reference/model/seed/temperature/top_p).
+    # Saves ~30s of GPU time when iterating on activity timing without
+    # changing narration or persona.
    if cached_index_matches(
        audio_dir / "index.json",
        cues,
        instruct,
        language,
+        reference_text,
+        args.design_model,
+        args.clone_model,
+        reference_audio_cache_key,
        seed,
        temperature,
        top_p,
@ -308,7 +329,7 @@ def main() -> int:
    # own voice. The reference WAV is cached so subsequent runs only load
    # the clone model (saves ~20s + 3.4 GB of disk download).
    ref_wav_path, ref_text = _resolve_reference(
-        args, audio_dir, instruct, language, seed, temperature, top_p
+        args, audio_dir, instruct, language, reference_text, seed, temperature, top_p
    )

    print(
@ -367,6 +388,7 @@ def main() -> int:
        "language": language,
        "designModel": args.design_model,
        "cloneModel": args.clone_model,
+        "referenceAudio": reference_audio_cache_key,
        "referenceText": ref_text,
        "seed": seed,
        "temperature": temperature,