alright
This commit is contained in:
parent
c645b0f1d4
commit
39ef5c6646
79 changed files with 5660 additions and 2199 deletions
|
|
@ -169,7 +169,6 @@ def main() -> int:
|
|||
"aac",
|
||||
"-b:a",
|
||||
"192k",
|
||||
"-shortest",
|
||||
"-movflags",
|
||||
"+faststart",
|
||||
str(out_path),
|
||||
|
|
|
|||
|
|
@ -168,6 +168,61 @@ def cached_index_matches(
|
|||
return True
|
||||
|
||||
|
||||
def load_reusable_items(
    index_path: Path,
    cues: list[dict],
    instruct: str,
    language: str,
    reference_text: str,
    design_model: str,
    clone_model: str,
    reference_audio: str,
    seed: int,
    temperature: float,
    top_p: float,
) -> dict[int, dict]:
    """Return cue-indexed cached items that match the current synth settings.

    Unlike ``cached_index_matches`` this accepts a partial index, so a long
    CPU synthesis run can be resumed cue-by-cue after an interruption.

    The index may have been left behind by an interrupted or older run, so
    every malformed part of it is treated as "nothing reusable" rather than as
    a fatal error: the worst outcome of a bad cache is re-synthesizing a cue.

    Args:
        index_path: Location of the JSON index; wav paths stored in the index
            are resolved relative to its parent directory.
        cues: Current cues; each must provide ``cueIndex`` and ``text`` and
            may provide ``gapBeforeMs`` (defaults to 0).
        instruct, language, reference_text, design_model, clone_model,
        reference_audio, seed, temperature, top_p: Current synthesis settings.
            If any of them differs from what the index recorded, nothing is
            reused.

    Returns:
        A mapping from cue index to the cached item dict, restricted to items
        whose cue still exists with the same stripped text and gap, whose wav
        file is present on disk, and which carry a numeric ``durationMs``.
        Empty when the index is missing, unreadable, malformed, or was
        produced with different settings.
    """
    try:
        # Default encoding on purpose: it mirrors how the index is written.
        cached = json.loads(index_path.read_text())
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return {}
    if not isinstance(cached, dict):
        return {}

    exact_settings = {
        "instruct": instruct,
        "language": language,
        "referenceText": reference_text,
        "designModel": design_model,
        "cloneModel": clone_model,
    }
    if any(cached.get(key) != value for key, value in exact_settings.items()):
        return {}
    if cached.get("referenceAudio", "") != reference_audio:
        return {}

    # Numeric settings are coerced before comparing; a value that cannot be
    # coerced (null, arbitrary string, ...) yields None and so never matches.
    if _coerce_or_none(cached.get("seed", -1), int) != seed:
        return {}
    if _coerce_or_none(cached.get("temperature", -1), float) != temperature:
        return {}
    if _coerce_or_none(cached.get("topP", -1), float) != top_p:
        return {}

    cached_items = cached.get("items", [])
    if not isinstance(cached_items, list):
        return {}

    cue_by_index = {int(c["cueIndex"]): c for c in cues}
    reusable: dict[int, dict] = {}
    for item in cached_items:
        if not isinstance(item, dict):
            continue
        cue_index = _coerce_or_none(item.get("cueIndex", -1), int)
        cue = cue_by_index.get(cue_index)
        if cue is None:
            continue
        wav = item.get("wav")
        if not isinstance(wav, str) or not wav:
            continue
        if not (index_path.parent / wav).exists():
            continue
        # A reused item is consumed without re-measuring the audio, so it has
        # to bring its own duration along.
        if not isinstance(item.get("durationMs"), (int, float)):
            continue
        if cue["text"].strip() != str(item.get("text", "")).strip():
            continue
        cached_gap = _coerce_or_none(item.get("gapBeforeMs", -1), int)
        if int(cue.get("gapBeforeMs", 0)) != cached_gap:
            continue
        reusable[cue_index] = item
    return reusable


def _coerce_or_none(value, caster):
    """Return ``caster(value)``, or ``None`` when the value cannot be coerced."""
    try:
        return caster(value)
    except (TypeError, ValueError, OverflowError):
        return None
|
||||
|
||||
|
||||
def seed_everything(seed: int) -> None:
|
||||
random.seed(seed)
|
||||
np.random.seed(seed)
|
||||
|
|
@ -333,34 +388,74 @@ def main() -> int:
|
|||
)
|
||||
|
||||
print(
|
||||
f"[synth] cloning {len(texts)} cues from reference (x_vector_only) — one batched call",
|
||||
f"[synth] cloning {len(texts)} cues from reference (x_vector_only)",
|
||||
flush=True,
|
||||
)
|
||||
for i, t in enumerate(texts):
|
||||
print(f"[synth] {i:2d}: {t}", flush=True)
|
||||
|
||||
clone_model = load_model(args.clone_model, args.device)
|
||||
seed_everything(seed)
|
||||
wavs, sr = clone_model.generate_voice_clone(
|
||||
text=texts,
|
||||
language=language,
|
||||
ref_audio=str(ref_wav_path),
|
||||
ref_text=ref_text,
|
||||
x_vector_only_mode=True,
|
||||
non_streaming_mode=True,
|
||||
do_sample=True,
|
||||
temperature=temperature,
|
||||
top_p=top_p,
|
||||
out_index_base = {
|
||||
"storyboard": args.storyboard,
|
||||
"instruct": instruct,
|
||||
"language": language,
|
||||
"designModel": args.design_model,
|
||||
"cloneModel": args.clone_model,
|
||||
"referenceAudio": reference_audio_cache_key,
|
||||
"referenceText": ref_text,
|
||||
"seed": seed,
|
||||
"temperature": temperature,
|
||||
"topP": top_p,
|
||||
}
|
||||
index_path = audio_dir / "index.json"
|
||||
reusable = load_reusable_items(
|
||||
index_path,
|
||||
cues,
|
||||
instruct,
|
||||
language,
|
||||
reference_text,
|
||||
args.design_model,
|
||||
args.clone_model,
|
||||
reference_audio_cache_key,
|
||||
seed,
|
||||
temperature,
|
||||
top_p,
|
||||
)
|
||||
if len(wavs) != len(texts):
|
||||
print(
|
||||
f"[synth] model returned {len(wavs)} wavs for {len(texts)} cues",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 1
|
||||
|
||||
def write_index(items: list[dict]) -> None:
|
||||
index_path.write_text(json.dumps({**out_index_base, "items": items}, indent=2))
|
||||
|
||||
items = []
|
||||
for cue, audio in zip(cues, wavs):
|
||||
for cue_index, cue in enumerate(cues):
|
||||
cached_item = reusable.get(int(cue["cueIndex"]))
|
||||
if cached_item:
|
||||
items.append(cached_item)
|
||||
write_index(items)
|
||||
print(
|
||||
f"[synth] reusing {cached_item['wav']} {int(cached_item['durationMs']):>5d}ms «{cue['text']}»",
|
||||
flush=True,
|
||||
)
|
||||
continue
|
||||
|
||||
seed_everything(seed + cue_index)
|
||||
wavs, sr = clone_model.generate_voice_clone(
|
||||
text=[texts[cue_index]],
|
||||
language=language,
|
||||
ref_audio=str(ref_wav_path),
|
||||
ref_text=ref_text,
|
||||
x_vector_only_mode=True,
|
||||
non_streaming_mode=True,
|
||||
do_sample=True,
|
||||
temperature=temperature,
|
||||
top_p=top_p,
|
||||
)
|
||||
if len(wavs) != 1:
|
||||
print(
|
||||
f"[synth] model returned {len(wavs)} wavs for cue {cue_index}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 1
|
||||
audio = wavs[0]
|
||||
if hasattr(audio, "cpu"):
|
||||
audio = audio.cpu().float().numpy()
|
||||
wav_name = f"cue_{cue['cueIndex']:03d}.wav"
|
||||
|
|
@ -377,25 +472,13 @@ def main() -> int:
|
|||
"durationMs": duration_ms,
|
||||
}
|
||||
)
|
||||
write_index(items)
|
||||
print(
|
||||
f"[synth] wrote {wav_name} {duration_ms:>5d}ms «{cue['text']}»",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
out_index = {
|
||||
"storyboard": args.storyboard,
|
||||
"instruct": instruct,
|
||||
"language": language,
|
||||
"designModel": args.design_model,
|
||||
"cloneModel": args.clone_model,
|
||||
"referenceAudio": reference_audio_cache_key,
|
||||
"referenceText": ref_text,
|
||||
"seed": seed,
|
||||
"temperature": temperature,
|
||||
"topP": top_p,
|
||||
"items": items,
|
||||
}
|
||||
(audio_dir / "index.json").write_text(json.dumps(out_index, indent=2))
|
||||
write_index(items)
|
||||
total_ms = sum(it["gapBeforeMs"] + it["durationMs"] for it in items)
|
||||
print(
|
||||
f"[synth] [{args.storyboard}] {len(items)} cues, {total_ms}ms of audio (incl. gaps) -> {audio_dir}",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue