This commit is contained in:
Andras Schmelczer 2026-05-26 19:45:13 +01:00
parent c645b0f1d4
commit 39ef5c6646
79 changed files with 5660 additions and 2199 deletions

View file

@ -168,6 +168,61 @@ def cached_index_matches(
return True
def load_reusable_items(
    index_path: Path,
    cues: list[dict],
    instruct: str,
    language: str,
    reference_text: str,
    design_model: str,
    clone_model: str,
    reference_audio: str,
    seed: int,
    temperature: float,
    top_p: float,
) -> dict[int, dict]:
    """Return cue-indexed cached items that match the current synth settings.

    Unlike ``cached_index_matches`` this accepts a partial index, so a long
    CPU synthesis run can be resumed cue-by-cue after an interruption.

    The cache is strictly best-effort: a missing, unreadable, unparsable or
    structurally unexpected index is treated as a cache miss (``{}``), and a
    single malformed item is skipped without discarding the other items.

    Args:
        index_path: Path of the JSON index; cached wav names are resolved
            relative to its parent directory.
        cues: Current cues. Each must provide ``cueIndex`` and ``text``;
            ``gapBeforeMs`` defaults to 0 when absent.
        instruct: Expected value of the index's ``instruct`` field.
        language: Expected value of the index's ``language`` field.
        reference_text: Expected value of the index's ``referenceText`` field.
        design_model: Expected value of the index's ``designModel`` field.
        clone_model: Expected value of the index's ``cloneModel`` field.
        reference_audio: Expected value of the index's ``referenceAudio``
            field (a missing field is compared as ``""``).
        seed: Expected value of the index's ``seed`` field.
        temperature: Expected value of the index's ``temperature`` field.
        top_p: Expected value of the index's ``topP`` field.

    Returns:
        A mapping from cue index to the cached item dict, containing only
        items whose wav file exists and whose text and gap match the
        corresponding current cue. Empty when nothing can be reused.
    """
    if not index_path.exists():
        return {}
    try:
        cached = json.loads(index_path.read_text())
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # OSError covers the file vanishing or being unreadable after the
        # existence check. Either way there is nothing to reuse.
        return {}
    if not isinstance(cached, dict):
        # Valid JSON of the wrong shape (list, string, null, ...).
        return {}

    expected_fields = {
        "instruct": instruct,
        "language": language,
        "referenceText": reference_text,
        "designModel": design_model,
        "cloneModel": clone_model,
    }
    if any(cached.get(key) != value for key, value in expected_fields.items()):
        return {}
    if cached.get("referenceAudio", "") != reference_audio:
        return {}

    try:
        sampling_matches = (
            int(cached.get("seed", -1)) == seed
            and float(cached.get("temperature", -1)) == temperature
            and float(cached.get("topP", -1)) == top_p
        )
    except (TypeError, ValueError, OverflowError):
        # e.g. "seed": null or a non-numeric string: cannot match.
        return {}
    if not sampling_matches:
        return {}

    cached_items = cached.get("items", [])
    if not isinstance(cached_items, list):
        return {}

    cue_by_index = {int(c["cueIndex"]): c for c in cues}
    reusable: dict[int, dict] = {}
    for item in cached_items:
        if not isinstance(item, dict):
            continue
        try:
            cue_index = int(item.get("cueIndex", -1))
            cached_gap = int(item.get("gapBeforeMs", -1))
        except (TypeError, ValueError, OverflowError):
            continue
        cue = cue_by_index.get(cue_index)
        wav = item.get("wav")
        if cue is None or not isinstance(wav, str) or not wav:
            continue
        # The audio itself must still be on disk next to the index.
        if not (index_path.parent / wav).exists():
            continue
        if cue["text"].strip() != str(item.get("text", "")).strip():
            continue
        if int(cue.get("gapBeforeMs", 0)) != cached_gap:
            continue
        reusable[cue_index] = item
    return reusable
def seed_everything(seed: int) -> None:
random.seed(seed)
np.random.seed(seed)
@ -333,34 +388,74 @@ def main() -> int:
)
print(
f"[synth] cloning {len(texts)} cues from reference (x_vector_only) — one batched call",
f"[synth] cloning {len(texts)} cues from reference (x_vector_only)",
flush=True,
)
for i, t in enumerate(texts):
print(f"[synth] {i:2d}: {t}", flush=True)
clone_model = load_model(args.clone_model, args.device)
seed_everything(seed)
wavs, sr = clone_model.generate_voice_clone(
text=texts,
language=language,
ref_audio=str(ref_wav_path),
ref_text=ref_text,
x_vector_only_mode=True,
non_streaming_mode=True,
do_sample=True,
temperature=temperature,
top_p=top_p,
out_index_base = {
"storyboard": args.storyboard,
"instruct": instruct,
"language": language,
"designModel": args.design_model,
"cloneModel": args.clone_model,
"referenceAudio": reference_audio_cache_key,
"referenceText": ref_text,
"seed": seed,
"temperature": temperature,
"topP": top_p,
}
index_path = audio_dir / "index.json"
reusable = load_reusable_items(
index_path,
cues,
instruct,
language,
reference_text,
args.design_model,
args.clone_model,
reference_audio_cache_key,
seed,
temperature,
top_p,
)
if len(wavs) != len(texts):
print(
f"[synth] model returned {len(wavs)} wavs for {len(texts)} cues",
file=sys.stderr,
)
return 1
def write_index(items: list[dict]) -> None:
index_path.write_text(json.dumps({**out_index_base, "items": items}, indent=2))
items = []
for cue, audio in zip(cues, wavs):
for cue_index, cue in enumerate(cues):
cached_item = reusable.get(int(cue["cueIndex"]))
if cached_item:
items.append(cached_item)
write_index(items)
print(
f"[synth] reusing {cached_item['wav']} {int(cached_item['durationMs']):>5d}ms «{cue['text']}»",
flush=True,
)
continue
seed_everything(seed + cue_index)
wavs, sr = clone_model.generate_voice_clone(
text=[texts[cue_index]],
language=language,
ref_audio=str(ref_wav_path),
ref_text=ref_text,
x_vector_only_mode=True,
non_streaming_mode=True,
do_sample=True,
temperature=temperature,
top_p=top_p,
)
if len(wavs) != 1:
print(
f"[synth] model returned {len(wavs)} wavs for cue {cue_index}",
file=sys.stderr,
)
return 1
audio = wavs[0]
if hasattr(audio, "cpu"):
audio = audio.cpu().float().numpy()
wav_name = f"cue_{cue['cueIndex']:03d}.wav"
@ -377,25 +472,13 @@ def main() -> int:
"durationMs": duration_ms,
}
)
write_index(items)
print(
f"[synth] wrote {wav_name} {duration_ms:>5d}ms «{cue['text']}»",
flush=True,
)
out_index = {
"storyboard": args.storyboard,
"instruct": instruct,
"language": language,
"designModel": args.design_model,
"cloneModel": args.clone_model,
"referenceAudio": reference_audio_cache_key,
"referenceText": ref_text,
"seed": seed,
"temperature": temperature,
"topP": top_p,
"items": items,
}
(audio_dir / "index.json").write_text(json.dumps(out_index, indent=2))
write_index(items)
total_ms = sum(it["gapBeforeMs"] + it["durationMs"] for it in items)
print(
f"[synth] [{args.storyboard}] {len(items)} cues, {total_ms}ms of audio (incl. gaps) -> {audio_dir}",