This commit is contained in:
Andras Schmelczer 2026-05-12 22:00:56 +01:00
parent 8708bf000d
commit 11711c57e6
38 changed files with 5361 additions and 265 deletions

View file

@ -116,6 +116,10 @@ def cached_index_matches(
cues: list[dict],
instruct: str,
language: str,
reference_text: str,
design_model: str,
clone_model: str,
reference_audio: str,
seed: int,
temperature: float,
top_p: float,
@ -123,7 +127,8 @@ def cached_index_matches(
"""Return True iff index_path's cue list lines up with `cues` 1:1.
Compared fields: ``cueIndex``, ``text``, ``gapBeforeMs`` plus the synth
settings (``instruct``, ``language``, ``seed``, ``temperature``, ``top_p``).
settings (``instruct``, ``language``, reference text, models, reference
audio, ``seed``, ``temperature``, ``top_p``).
All cue WAV files must also exist on disk. Mismatched length, reordered
cues, or a missing WAV invalidate the cache.
"""
@ -135,6 +140,12 @@ def cached_index_matches(
return False
if cached.get("instruct") != instruct or cached.get("language") != language:
return False
if cached.get("referenceText") != reference_text:
return False
if cached.get("designModel") != design_model or cached.get("cloneModel") != clone_model:
return False
if cached.get("referenceAudio", "") != reference_audio:
return False
if int(cached.get("seed", -1)) != seed:
return False
if float(cached.get("temperature", -1)) != temperature:
@ -170,6 +181,7 @@ def _resolve_reference(
audio_dir: Path,
instruct: str,
language: str,
reference_text: str,
seed: int,
temperature: float,
top_p: float,
@ -178,8 +190,8 @@ def _resolve_reference(
If --reference-audio is supplied, validate and use it directly. Otherwise
mint one via VoiceDesign (cached on disk; cache invalidates when the
persona/sampling/seed changes). The design model is unloaded before
returning so the clone model can claim the GPU.
persona/language/reference/sampling/seed changes). The design model is
unloaded before returning so the clone model can claim the GPU.
"""
if args.reference_audio is not None:
if not args.reference_audio.exists():
@ -201,7 +213,7 @@ def _resolve_reference(
"seed": seed,
"temperature": temperature,
"topP": top_p,
"text": REFERENCE_TEXT,
"text": reference_text,
}
if (
ref_wav_path.exists()
@ -209,16 +221,16 @@ def _resolve_reference(
and _safe_load_json(ref_meta_path) == ref_meta
):
print(f"[synth] reusing cached voice reference {ref_wav_path.name}", flush=True)
return ref_wav_path, REFERENCE_TEXT
return ref_wav_path, reference_text
print(
f"[synth] minting voice reference via VoiceDesign: «{REFERENCE_TEXT}»",
f"[synth] minting voice reference via VoiceDesign: «{reference_text}»",
flush=True,
)
design_model = load_model(args.design_model, args.device)
seed_everything(seed)
ref_wavs, ref_sr = design_model.generate_voice_design(
text=[REFERENCE_TEXT],
text=[reference_text],
language=language,
instruct=instruct,
do_sample=True,
@ -237,7 +249,7 @@ def _resolve_reference(
if torch.cuda.is_available():
torch.cuda.empty_cache()
return ref_wav_path, REFERENCE_TEXT
return ref_wav_path, reference_text
def main() -> int:
@ -266,21 +278,30 @@ def main() -> int:
return 1
instruct = voice["instruct"]
language = voice["language"]
reference_text = str(voice.get("referenceText") or REFERENCE_TEXT)
temperature = float(voice.get("temperature", 0.6))
top_p = float(voice.get("topP", 0.9))
seed = int(voice.get("seed", 42))
reference_audio_cache_key = (
str(args.reference_audio.resolve()) if args.reference_audio is not None else ""
)
audio_dir.mkdir(parents=True, exist_ok=True)
# Skip generation when the existing audio matches the script — same cue
# texts and same gapBeforeMs values in the same order, AND same synth
# settings (instruct/seed/temperature/top_p). Saves ~30s of GPU time when
# iterating on activity timing without changing narration or persona.
# settings (instruct/language/reference/model/seed/temperature/top_p).
# Saves ~30s of GPU time when iterating on activity timing without
# changing narration or persona.
if cached_index_matches(
audio_dir / "index.json",
cues,
instruct,
language,
reference_text,
args.design_model,
args.clone_model,
reference_audio_cache_key,
seed,
temperature,
top_p,
@ -308,7 +329,7 @@ def main() -> int:
# own voice. The reference WAV is cached so subsequent runs only load
# the clone model (saves ~20s + 3.4 GB of disk download).
ref_wav_path, ref_text = _resolve_reference(
args, audio_dir, instruct, language, seed, temperature, top_p
args, audio_dir, instruct, language, reference_text, seed, temperature, top_p
)
print(
@ -367,6 +388,7 @@ def main() -> int:
"language": language,
"designModel": args.design_model,
"cloneModel": args.clone_model,
"referenceAudio": reference_audio_cache_key,
"referenceText": ref_text,
"seed": seed,
"temperature": temperature,