lgtm
This commit is contained in:
parent
8708bf000d
commit
11711c57e6
38 changed files with 5361 additions and 265 deletions
|
|
@ -116,6 +116,10 @@ def cached_index_matches(
|
|||
cues: list[dict],
|
||||
instruct: str,
|
||||
language: str,
|
||||
reference_text: str,
|
||||
design_model: str,
|
||||
clone_model: str,
|
||||
reference_audio: str,
|
||||
seed: int,
|
||||
temperature: float,
|
||||
top_p: float,
|
||||
|
|
@ -123,7 +127,8 @@ def cached_index_matches(
|
|||
"""Return True iff index_path's cue list lines up with `cues` 1:1.
|
||||
|
||||
Compared fields: ``cueIndex``, ``text``, ``gapBeforeMs`` plus the synth
|
||||
settings (``instruct``, ``language``, ``seed``, ``temperature``, ``top_p``).
|
||||
settings (``instruct``, ``language``, reference text/audio, models, ``seed``,
|
||||
``temperature``, ``top_p``).
|
||||
All cue WAV files must also exist on disk. Mismatched length, reordered
|
||||
cues, or a missing WAV invalidate the cache.
|
||||
"""
|
||||
|
|
@ -135,6 +140,12 @@ def cached_index_matches(
|
|||
return False
|
||||
if cached.get("instruct") != instruct or cached.get("language") != language:
|
||||
return False
|
||||
if cached.get("referenceText") != reference_text:
|
||||
return False
|
||||
if cached.get("designModel") != design_model or cached.get("cloneModel") != clone_model:
|
||||
return False
|
||||
if cached.get("referenceAudio", "") != reference_audio:
|
||||
return False
|
||||
if int(cached.get("seed", -1)) != seed:
|
||||
return False
|
||||
if float(cached.get("temperature", -1)) != temperature:
|
||||
|
|
@ -170,6 +181,7 @@ def _resolve_reference(
|
|||
audio_dir: Path,
|
||||
instruct: str,
|
||||
language: str,
|
||||
reference_text: str,
|
||||
seed: int,
|
||||
temperature: float,
|
||||
top_p: float,
|
||||
|
|
@ -178,8 +190,8 @@ def _resolve_reference(
|
|||
|
||||
If --reference-audio is supplied, validate and use it directly. Otherwise
|
||||
mint one via VoiceDesign (cached on disk; cache invalidates when the
|
||||
persona/sampling/seed changes). The design model is unloaded before
|
||||
returning so the clone model can claim the GPU.
|
||||
persona/language/reference/sampling/seed changes). The design model is
|
||||
unloaded before returning so the clone model can claim the GPU.
|
||||
"""
|
||||
if args.reference_audio is not None:
|
||||
if not args.reference_audio.exists():
|
||||
|
|
@ -201,7 +213,7 @@ def _resolve_reference(
|
|||
"seed": seed,
|
||||
"temperature": temperature,
|
||||
"topP": top_p,
|
||||
"text": REFERENCE_TEXT,
|
||||
"text": reference_text,
|
||||
}
|
||||
if (
|
||||
ref_wav_path.exists()
|
||||
|
|
@ -209,16 +221,16 @@ def _resolve_reference(
|
|||
and _safe_load_json(ref_meta_path) == ref_meta
|
||||
):
|
||||
print(f"[synth] reusing cached voice reference {ref_wav_path.name}", flush=True)
|
||||
return ref_wav_path, REFERENCE_TEXT
|
||||
return ref_wav_path, reference_text
|
||||
|
||||
print(
|
||||
f"[synth] minting voice reference via VoiceDesign: «{REFERENCE_TEXT}»",
|
||||
f"[synth] minting voice reference via VoiceDesign: «{reference_text}»",
|
||||
flush=True,
|
||||
)
|
||||
design_model = load_model(args.design_model, args.device)
|
||||
seed_everything(seed)
|
||||
ref_wavs, ref_sr = design_model.generate_voice_design(
|
||||
text=[REFERENCE_TEXT],
|
||||
text=[reference_text],
|
||||
language=language,
|
||||
instruct=instruct,
|
||||
do_sample=True,
|
||||
|
|
@ -237,7 +249,7 @@ def _resolve_reference(
|
|||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
return ref_wav_path, REFERENCE_TEXT
|
||||
return ref_wav_path, reference_text
|
||||
|
||||
|
||||
def main() -> int:
|
||||
|
|
@ -266,21 +278,30 @@ def main() -> int:
|
|||
return 1
|
||||
instruct = voice["instruct"]
|
||||
language = voice["language"]
|
||||
reference_text = str(voice.get("referenceText") or REFERENCE_TEXT)
|
||||
temperature = float(voice.get("temperature", 0.6))
|
||||
top_p = float(voice.get("topP", 0.9))
|
||||
seed = int(voice.get("seed", 42))
|
||||
reference_audio_cache_key = (
|
||||
str(args.reference_audio.resolve()) if args.reference_audio is not None else ""
|
||||
)
|
||||
|
||||
audio_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Skip generation when the existing audio matches the script — same cue
|
||||
# texts and same gapBeforeMs values in the same order, AND same synth
|
||||
# settings (instruct/seed/temperature/top_p). Saves ~30s of GPU time when
|
||||
# iterating on activity timing without changing narration or persona.
|
||||
# settings (instruct/language/reference/model/seed/temperature/top_p).
|
||||
# Saves ~30s of GPU time when iterating on activity timing without
|
||||
# changing narration or persona.
|
||||
if cached_index_matches(
|
||||
audio_dir / "index.json",
|
||||
cues,
|
||||
instruct,
|
||||
language,
|
||||
reference_text,
|
||||
args.design_model,
|
||||
args.clone_model,
|
||||
reference_audio_cache_key,
|
||||
seed,
|
||||
temperature,
|
||||
top_p,
|
||||
|
|
@ -308,7 +329,7 @@ def main() -> int:
|
|||
# own voice. The reference WAV is cached so subsequent runs only load
|
||||
# the clone model (saves ~20s + 3.4 GB of disk download).
|
||||
ref_wav_path, ref_text = _resolve_reference(
|
||||
args, audio_dir, instruct, language, seed, temperature, top_p
|
||||
args, audio_dir, instruct, language, reference_text, seed, temperature, top_p
|
||||
)
|
||||
|
||||
print(
|
||||
|
|
@ -367,6 +388,7 @@ def main() -> int:
|
|||
"language": language,
|
||||
"designModel": args.design_model,
|
||||
"cloneModel": args.clone_model,
|
||||
"referenceAudio": reference_audio_cache_key,
|
||||
"referenceText": ref_text,
|
||||
"seed": seed,
|
||||
"temperature": temperature,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue