LGTM
This commit is contained in:
parent
9248e26af2
commit
f2a2651b8a
95 changed files with 3993 additions and 1471 deletions
|
|
@ -1,19 +1,19 @@
|
|||
"""Mux per-cue WAVs into recording.mp4 at their narration offsets.
|
||||
"""Mux per-cue WAVs into one storyboard's recording.mp4 at narration offsets.
|
||||
|
||||
Reads two manifests:
|
||||
Reads two manifests inside ``output/<storyboard>/``:
|
||||
|
||||
* ``output/audio/index.json`` (synth output) — per-cue WAV filename + measured
|
||||
* ``audio/index.json`` (synth output) — per-cue WAV filename + measured
|
||||
duration. Generated BEFORE recording in one batched Qwen3-TTS call.
|
||||
* ``output/narration.json`` (recorder output) — per-cue ``videoTimeMs`` against
|
||||
* ``narration.json`` (recorder output) — per-cue ``videoTimeMs`` against
|
||||
the trimmed video. Generated DURING recording.
|
||||
|
||||
Joins them by ``cueIndex`` (index in the cue list, 1:1 between manifests),
|
||||
runs ffmpeg with one ``adelay`` per cue plus a single ``amix``, copies the
|
||||
video stream, and writes ``output/recording.narrated.mp4``.
|
||||
video stream, and writes ``output/<storyboard>/recording.narrated.mp4``.
|
||||
|
||||
Run from the ``video/`` directory after recording:
|
||||
|
||||
uv run --project tts python tts/mux.py
|
||||
uv run --project tts python tts/mux.py --storyboard recording
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
@ -28,23 +28,21 @@ from pathlib import Path
|
|||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--audio-dir", type=Path, default=Path("output/audio"))
|
||||
parser.add_argument(
|
||||
"--narration",
|
||||
type=Path,
|
||||
default=Path("output/narration.json"),
|
||||
help="Per-cue videoTimeMs manifest written by the recorder.",
|
||||
"--storyboard",
|
||||
required=True,
|
||||
help="Storyboard slug (matches Storyboard.name in src/storyboard.ts).",
|
||||
)
|
||||
parser.add_argument("--video", type=Path, default=Path("output/recording.mp4"))
|
||||
parser.add_argument(
|
||||
"--out",
|
||||
"--output-dir",
|
||||
type=Path,
|
||||
default=Path("output/recording.narrated.mp4"),
|
||||
default=Path("output"),
|
||||
help="Root output directory; per-storyboard files live in <root>/<storyboard>/.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--replace",
|
||||
action="store_true",
|
||||
help="After muxing, atomically replace --video with --out.",
|
||||
help="After muxing, atomically replace the storyboard's recording.mp4.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
|
@ -56,7 +54,13 @@ def main() -> int:
|
|||
print("[mux] ffmpeg not on PATH", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
audio_index_path = args.audio_dir / "index.json"
|
||||
storyboard_dir = args.output_dir / args.storyboard
|
||||
audio_dir = storyboard_dir / "audio"
|
||||
narration_path = storyboard_dir / "narration.json"
|
||||
video_path = storyboard_dir / "recording.mp4"
|
||||
out_path = storyboard_dir / "recording.narrated.mp4"
|
||||
|
||||
audio_index_path = audio_dir / "index.json"
|
||||
if not audio_index_path.exists():
|
||||
print(
|
||||
f"[mux] {audio_index_path} not found; run tts/synth.py first",
|
||||
|
|
@ -64,25 +68,25 @@ def main() -> int:
|
|||
)
|
||||
return 1
|
||||
|
||||
if not args.narration.exists():
|
||||
if not narration_path.exists():
|
||||
print(
|
||||
f"[mux] {args.narration} not found; the recorder must run before mux",
|
||||
f"[mux] {narration_path} not found; the recorder must run before mux",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 1
|
||||
|
||||
if not args.video.exists():
|
||||
print(f"[mux] video not found: {args.video}", file=sys.stderr)
|
||||
if not video_path.exists():
|
||||
print(f"[mux] video not found: {video_path}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
audio_index = json.loads(audio_index_path.read_text())
|
||||
audio_items = [it for it in audio_index.get("items", []) if it.get("wav")]
|
||||
if not audio_items:
|
||||
print("[mux] synth produced no cues; copying video unchanged", file=sys.stderr)
|
||||
shutil.copyfile(args.video, args.out)
|
||||
shutil.copyfile(video_path, out_path)
|
||||
return 0
|
||||
|
||||
narration = json.loads(args.narration.read_text())
|
||||
narration = json.loads(narration_path.read_text())
|
||||
nar_cues = list(narration.get("cues", []))
|
||||
if len(nar_cues) != len(audio_items):
|
||||
print(
|
||||
|
|
@ -130,9 +134,9 @@ def main() -> int:
|
|||
+ "\n - ".join(overlaps)
|
||||
)
|
||||
|
||||
cmd: list[str] = ["ffmpeg", "-y", "-loglevel", "warning", "-i", str(args.video)]
|
||||
cmd: list[str] = ["ffmpeg", "-y", "-loglevel", "warning", "-i", str(video_path)]
|
||||
for it in items:
|
||||
cmd += ["-i", str(args.audio_dir / it["wav"])]
|
||||
cmd += ["-i", str(audio_dir / it["wav"])]
|
||||
|
||||
filter_parts: list[str] = []
|
||||
mix_inputs: list[str] = []
|
||||
|
|
@ -168,18 +172,21 @@ def main() -> int:
|
|||
"-shortest",
|
||||
"-movflags",
|
||||
"+faststart",
|
||||
str(args.out),
|
||||
str(out_path),
|
||||
]
|
||||
|
||||
print(f"[mux] muxing {len(items)} narration cues into {args.out}", flush=True)
|
||||
print(
|
||||
f"[mux] [{args.storyboard}] muxing {len(items)} narration cues into {out_path}",
|
||||
flush=True,
|
||||
)
|
||||
result = subprocess.run(cmd)
|
||||
if result.returncode != 0:
|
||||
print(f"[mux] ffmpeg exited {result.returncode}", file=sys.stderr)
|
||||
return result.returncode
|
||||
|
||||
if args.replace:
|
||||
args.out.replace(args.video)
|
||||
print(f"[mux] replaced {args.video} with narrated copy", flush=True)
|
||||
out_path.replace(video_path)
|
||||
print(f"[mux] replaced {video_path} with narrated copy", flush=True)
|
||||
|
||||
return 0
|
||||
|
||||
|
|
|
|||
|
|
@ -1,15 +1,28 @@
|
|||
"""Synthesize the full narration in ONE batched Qwen3-TTS call.
|
||||
"""Synthesize one storyboard's narration in ONE batched Qwen3-TTS call.
|
||||
|
||||
Reads ``output/narration-script.json`` (emitted by ``dist/preflight.js``) and
|
||||
runs ``Qwen3TTSModel.generate_custom_voice`` with all cue texts as a single
|
||||
batched list — that way every cue shares the same model state, which keeps
|
||||
prosody and timbre consistent across cues. Per-cue WAVs and an index manifest
|
||||
go to ``output/audio/`` for the recording step (which reads measured cue
|
||||
durations) and the mux step (which drops each WAV at its videoTime).
|
||||
Reads ``output/<storyboard>/narration-script.json`` (emitted by
|
||||
``dist/preflight.js``), mints one reference clip with
``Qwen3TTSModel.generate_voice_design``, and runs ``generate_voice_clone`` with
|
||||
all cue texts as a single batched list — that way every cue shares the same
|
||||
model state, which keeps prosody and timbre consistent across cues. Per-cue
|
||||
WAVs and an index manifest go to ``output/<storyboard>/audio/`` for the
|
||||
recording step (which reads measured cue durations) and the mux step (which
|
||||
drops each WAV at its videoTime).
|
||||
|
||||
Voice persona, language, and sampling come from the storyboard via the
|
||||
``voice`` block of the narration script. CLI flags can still override them
|
||||
for ad-hoc experimentation; storyboards remain the source of truth for
|
||||
production runs.
|
||||
|
||||
We use the VoiceDesign sibling of CustomVoice because it accepts a free-form
|
||||
voice persona (British accent, narrator register, "no laughter") via the
|
||||
``instruct`` parameter. CustomVoice's preset speakers are all American or
|
||||
non-English, and its ``instruct`` is documented for emotion only — it
|
||||
ignored accent directives and bled non-speech tokens (laughter, sighs)
|
||||
between cues.
|
||||
|
||||
Run from the ``video/`` directory:
|
||||
|
||||
uv run --project tts python tts/synth.py
|
||||
uv run --project tts python tts/synth.py --storyboard recording
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
@ -17,55 +30,78 @@ from __future__ import annotations
|
|||
import argparse
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
import torch
|
||||
from qwen_tts import Qwen3TTSModel
|
||||
|
||||
|
||||
DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
|
||||
DEFAULT_SPEAKER = "ryan"
|
||||
DEFAULT_LANGUAGE = "English"
|
||||
# Two checkpoints: the design model mints the reference clip in the desired
|
||||
# persona; the clone model conditions every cue on that reference's x-vector.
|
||||
# Neither CustomVoice nor VoiceDesign support generate_voice_clone — only the
|
||||
# Base checkpoint does.
|
||||
DEFAULT_DESIGN_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign"
|
||||
DEFAULT_CLONE_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
|
||||
|
||||
# Fixed reference utterance used to anchor the speaker timbre. The reference
|
||||
# is generated once per (model, instruct, sampling, seed) tuple and reused
|
||||
# for every cue, so all narration shares the same x-vector. Two short
|
||||
# sentences exercise enough phonemes for a stable embedding without bloating
|
||||
# generation time.
|
||||
REFERENCE_TEXT = (
|
||||
"Welcome to the demonstration. This is the narrator voice you'll hear throughout the video."
|
||||
)
|
||||
|
||||
|
||||
def _safe_load_json(path: Path) -> object | None:
|
||||
try:
|
||||
return json.loads(path.read_text())
|
||||
except (FileNotFoundError, json.JSONDecodeError):
|
||||
return None
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument(
|
||||
"--script",
|
||||
"--storyboard",
|
||||
required=True,
|
||||
help="Storyboard slug (matches Storyboard.name in src/storyboard.ts).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
type=Path,
|
||||
default=Path("output/narration-script.json"),
|
||||
help="Narration script emitted by dist/preflight.js.",
|
||||
default=Path("output"),
|
||||
help="Root output directory; per-storyboard files live in <root>/<storyboard>/.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--out-dir",
|
||||
"--design-model",
|
||||
default=os.environ.get("TTS_DESIGN_MODEL", DEFAULT_DESIGN_MODEL),
|
||||
help="Checkpoint used to mint the voice reference (VoiceDesign by default).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--clone-model",
|
||||
default=os.environ.get("TTS_CLONE_MODEL", DEFAULT_CLONE_MODEL),
|
||||
help="Checkpoint used to clone the cue audio from the reference (Base by default).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--reference-audio",
|
||||
type=Path,
|
||||
default=Path("output/audio"),
|
||||
help="Directory to write WAV files and index.json into.",
|
||||
default=(Path(os.environ["TTS_REFERENCE_AUDIO"]) if os.environ.get("TTS_REFERENCE_AUDIO") else None),
|
||||
help="Path to an existing reference WAV. If set, skip VoiceDesign and clone from this.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
default=os.environ.get("TTS_MODEL", DEFAULT_MODEL),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--speaker",
|
||||
default=os.environ.get("TTS_SPEAKER", DEFAULT_SPEAKER),
|
||||
help="CustomVoice preset speaker name (use --list-speakers to enumerate).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--language",
|
||||
default=os.environ.get("TTS_LANGUAGE", DEFAULT_LANGUAGE),
|
||||
"--reference-text",
|
||||
default=os.environ.get("TTS_REFERENCE_TEXT"),
|
||||
help="Transcript of --reference-audio. Required if --reference-audio is set.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--device",
|
||||
default=os.environ.get("TTS_DEVICE", "cuda:0"),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--list-speakers",
|
||||
action="store_true",
|
||||
help="Load the model, print available speaker names, and exit.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
|
|
@ -78,15 +114,18 @@ def load_model(model_id: str, device: str) -> Qwen3TTSModel:
|
|||
def cached_index_matches(
|
||||
index_path: Path,
|
||||
cues: list[dict],
|
||||
speaker: str,
|
||||
instruct: str,
|
||||
language: str,
|
||||
seed: int,
|
||||
temperature: float,
|
||||
top_p: float,
|
||||
) -> bool:
|
||||
"""Return True iff index_path's cue list lines up with `cues` 1:1.
|
||||
|
||||
Compared fields: ``cueIndex``, ``text``, ``gapBeforeMs`` plus the synth
|
||||
settings (``speaker``, ``language``). All cue WAV files must also exist
|
||||
on disk. Mismatched length, reordered cues, or a missing WAV invalidate
|
||||
the cache.
|
||||
settings (``instruct``, ``language``, ``seed``, ``temperature``, ``top_p``).
|
||||
All cue WAV files must also exist on disk. Mismatched length, reordered
|
||||
cues, or a missing WAV invalidate the cache.
|
||||
"""
|
||||
if not index_path.exists():
|
||||
return False
|
||||
|
|
@ -94,7 +133,13 @@ def cached_index_matches(
|
|||
cached = json.loads(index_path.read_text())
|
||||
except json.JSONDecodeError:
|
||||
return False
|
||||
if cached.get("speaker") != speaker or cached.get("language") != language:
|
||||
if cached.get("instruct") != instruct or cached.get("language") != language:
|
||||
return False
|
||||
if int(cached.get("seed", -1)) != seed:
|
||||
return False
|
||||
if float(cached.get("temperature", -1)) != temperature:
|
||||
return False
|
||||
if float(cached.get("topP", -1)) != top_p:
|
||||
return False
|
||||
cached_items = cached.get("items", [])
|
||||
if len(cached_items) != len(cues):
|
||||
|
|
@ -112,52 +157,179 @@ def cached_index_matches(
|
|||
return True
|
||||
|
||||
|
||||
def seed_everything(seed: int) -> None:
    """Seed every random number generator this script draws from.

    Seeds Python's ``random`` module, NumPy's global legacy RNG, and torch;
    when CUDA is available, all CUDA devices are seeded as well.

    ``np.random.seed`` only accepts integers in ``[0, 2**32 - 1]``, so the
    seed is folded into that range for NumPy alone. Seeds already inside
    the range are passed through unchanged.
    """
    random.seed(seed)
    # Python's % always yields a non-negative result for a positive modulus,
    # so negative and >32-bit seeds both land inside NumPy's accepted range.
    np.random.seed(seed % 2**32)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
|
||||
|
||||
|
||||
def _resolve_reference(
    args: argparse.Namespace,
    audio_dir: Path,
    instruct: str,
    language: str,
    seed: int,
    temperature: float,
    top_p: float,
) -> tuple[Path, str]:
    """Return ``(ref_wav_path, ref_text)`` for the clone step.

    If ``args.reference_audio`` is set, it is validated and returned together
    with ``args.reference_text``. Otherwise a reference clip is generated with
    the design model and stored as ``_reference.wav`` in ``audio_dir``, next
    to a ``_reference.meta.json`` sidecar recording the settings it was
    generated with. A later call with identical settings reuses the clip.

    Args:
        args: Parsed CLI namespace; reads ``reference_audio``,
            ``reference_text``, ``design_model`` and ``device``.
        audio_dir: Directory holding the cached reference clip and sidecar.
        instruct: Voice persona prompt passed to the design model.
        language: Language passed to the design model.
        seed: Seed applied via ``seed_everything`` before generation.
        temperature: Sampling temperature for the design model.
        top_p: Nucleus-sampling threshold for the design model.

    Raises:
        SystemExit: If ``--reference-audio`` points at a missing path, or is
            given without ``--reference-text``.
    """
    if args.reference_audio is not None:
        if not args.reference_audio.exists():
            raise SystemExit(f"[synth] --reference-audio does not exist: {args.reference_audio}")
        if not args.reference_text:
            raise SystemExit("[synth] --reference-text is required when --reference-audio is set")
        print(
            f"[synth] using user-supplied reference {args.reference_audio} «{args.reference_text}»",
            flush=True,
        )
        return args.reference_audio, args.reference_text

    ref_wav_path = audio_dir / "_reference.wav"
    ref_meta_path = audio_dir / "_reference.meta.json"
    ref_meta = {
        "model": args.design_model,
        "instruct": instruct,
        "language": language,
        "seed": seed,
        "temperature": temperature,
        "topP": top_p,
        "text": REFERENCE_TEXT,
    }
    # _safe_load_json returns None for a missing sidecar, which never equals
    # ref_meta, so no separate existence check on the sidecar is needed.
    if ref_wav_path.exists() and _safe_load_json(ref_meta_path) == ref_meta:
        print(f"[synth] reusing cached voice reference {ref_wav_path.name}", flush=True)
        return ref_wav_path, REFERENCE_TEXT

    print(
        f"[synth] minting voice reference via VoiceDesign: «{REFERENCE_TEXT}»",
        flush=True,
    )
    # Invalidate the old sidecar BEFORE the WAV is overwritten. Otherwise a
    # crash between the two writes leaves a sidecar describing the previous
    # settings beside a WAV generated with the new ones, and a later run with
    # the previous settings would wrongly reuse that WAV.
    ref_meta_path.unlink(missing_ok=True)

    design_model = load_model(args.design_model, args.device)
    seed_everything(seed)
    ref_wavs, ref_sr = design_model.generate_voice_design(
        text=[REFERENCE_TEXT],
        language=language,
        instruct=instruct,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )
    ref_audio = ref_wavs[0]
    if hasattr(ref_audio, "cpu"):
        # Tensor output: move to host memory as float32 before writing.
        ref_audio = ref_audio.cpu().float().numpy()
    sf.write(str(ref_wav_path), ref_audio, ref_sr)
    # The sidecar is written last, so its presence implies a complete WAV.
    ref_meta_path.write_text(json.dumps(ref_meta, indent=2))

    # Free the design model before loading the clone model — both are 1.7B,
    # we don't want them resident at the same time.
    del design_model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return ref_wav_path, REFERENCE_TEXT
|
||||
|
||||
|
||||
def main() -> int:
|
||||
args = parse_args()
|
||||
|
||||
if args.list_speakers:
|
||||
model = load_model(args.model, args.device)
|
||||
speakers = model.get_supported_speakers()
|
||||
print(json.dumps(speakers, indent=2, ensure_ascii=False))
|
||||
return 0
|
||||
storyboard_dir = args.output_dir / args.storyboard
|
||||
script_path = storyboard_dir / "narration-script.json"
|
||||
audio_dir = storyboard_dir / "audio"
|
||||
|
||||
if not args.script.exists():
|
||||
print(f"[synth] script not found: {args.script}", file=sys.stderr)
|
||||
if not script_path.exists():
|
||||
print(f"[synth] script not found: {script_path}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
script = json.loads(args.script.read_text())
|
||||
script = json.loads(script_path.read_text())
|
||||
cues = [c for c in script.get("items", []) if c.get("text", "").strip()]
|
||||
if not cues:
|
||||
print("[synth] script has no cues; nothing to generate.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
args.out_dir.mkdir(parents=True, exist_ok=True)
|
||||
voice = script.get("voice")
|
||||
if not voice:
|
||||
print(
|
||||
f"[synth] {script_path} has no `voice` block — re-run preflight.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 1
|
||||
instruct = voice["instruct"]
|
||||
language = voice["language"]
|
||||
temperature = float(voice.get("temperature", 0.6))
|
||||
top_p = float(voice.get("topP", 0.9))
|
||||
seed = int(voice.get("seed", 42))
|
||||
|
||||
audio_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Skip generation when the existing audio matches the script — same cue
|
||||
# texts and same gapBeforeMs values in the same order. Saves ~30s of GPU
|
||||
# time when iterating on activity timing without changing narration.
|
||||
if cached_index_matches(args.out_dir / "index.json", cues, args.speaker, args.language):
|
||||
# texts and same gapBeforeMs values in the same order, AND same synth
|
||||
# settings (instruct/seed/temperature/top_p). Saves ~30s of GPU time when
|
||||
# iterating on activity timing without changing narration or persona.
|
||||
if cached_index_matches(
|
||||
audio_dir / "index.json",
|
||||
cues,
|
||||
instruct,
|
||||
language,
|
||||
seed,
|
||||
temperature,
|
||||
top_p,
|
||||
):
|
||||
print(
|
||||
f"[synth] cached audio in {args.out_dir} matches the current script — skipping generation",
|
||||
f"[synth] [{args.storyboard}] cached audio matches the current script — skipping generation",
|
||||
flush=True,
|
||||
)
|
||||
return 0
|
||||
|
||||
model = load_model(args.model, args.device)
|
||||
|
||||
texts = [c["text"].strip() for c in cues]
|
||||
print(f"[synth] generating {len(texts)} cues in one batched call", flush=True)
|
||||
print(f"[synth] [{args.storyboard}] persona: {instruct}", flush=True)
|
||||
print(
|
||||
f"[synth] [{args.storyboard}] sampling: temperature={temperature} top_p={top_p} seed={seed} language={language}",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
# Two-stage generation:
|
||||
# 1. VoiceDesign mints a single reference clip in the target persona
|
||||
# (or the user supplies one via --reference-audio).
|
||||
# 2. Base + generate_voice_clone(x_vector_only_mode=True) conditions
|
||||
# every cue on the reference's speaker embedding.
|
||||
# Without (2), batched generation drifts timbre across cues — a persona
|
||||
# prompt anchors style but not identity, so each batch item picks its
|
||||
# own voice. The reference WAV is cached so subsequent runs only load
|
||||
# the clone model (saves ~20s + 3.4 GB of disk download).
|
||||
ref_wav_path, ref_text = _resolve_reference(
|
||||
args, audio_dir, instruct, language, seed, temperature, top_p
|
||||
)
|
||||
|
||||
print(
|
||||
f"[synth] cloning {len(texts)} cues from reference (x_vector_only) — one batched call",
|
||||
flush=True,
|
||||
)
|
||||
for i, t in enumerate(texts):
|
||||
print(f"[synth] {i:2d}: {t}", flush=True)
|
||||
|
||||
# ONE batched call. generate_custom_voice handles text=List[str] natively
|
||||
# and broadcasts the speaker/language across all items, so the entire
|
||||
# narration is decoded in one model pass — same RNG state, same batch,
|
||||
# consistent voice from cue to cue.
|
||||
wavs, sr = model.generate_custom_voice(
|
||||
clone_model = load_model(args.clone_model, args.device)
|
||||
seed_everything(seed)
|
||||
wavs, sr = clone_model.generate_voice_clone(
|
||||
text=texts,
|
||||
language=args.language,
|
||||
speaker=args.speaker,
|
||||
language=language,
|
||||
ref_audio=str(ref_wav_path),
|
||||
ref_text=ref_text,
|
||||
x_vector_only_mode=True,
|
||||
non_streaming_mode=True,
|
||||
do_sample=True,
|
||||
temperature=temperature,
|
||||
top_p=top_p,
|
||||
)
|
||||
if len(wavs) != len(texts):
|
||||
print(
|
||||
|
|
@ -171,7 +343,7 @@ def main() -> int:
|
|||
if hasattr(audio, "cpu"):
|
||||
audio = audio.cpu().float().numpy()
|
||||
wav_name = f"cue_{cue['cueIndex']:03d}.wav"
|
||||
wav_path = args.out_dir / wav_name
|
||||
wav_path = audio_dir / wav_name
|
||||
sf.write(str(wav_path), audio, sr)
|
||||
duration_ms = int(round(len(audio) * 1000 / sr))
|
||||
items.append(
|
||||
|
|
@ -190,15 +362,21 @@ def main() -> int:
|
|||
)
|
||||
|
||||
out_index = {
|
||||
"speaker": args.speaker,
|
||||
"language": args.language,
|
||||
"model": args.model,
|
||||
"storyboard": args.storyboard,
|
||||
"instruct": instruct,
|
||||
"language": language,
|
||||
"designModel": args.design_model,
|
||||
"cloneModel": args.clone_model,
|
||||
"referenceText": ref_text,
|
||||
"seed": seed,
|
||||
"temperature": temperature,
|
||||
"topP": top_p,
|
||||
"items": items,
|
||||
}
|
||||
(args.out_dir / "index.json").write_text(json.dumps(out_index, indent=2))
|
||||
(audio_dir / "index.json").write_text(json.dumps(out_index, indent=2))
|
||||
total_ms = sum(it["gapBeforeMs"] + it["durationMs"] for it in items)
|
||||
print(
|
||||
f"[synth] {len(items)} cues, {total_ms}ms of audio (incl. gaps) -> {args.out_dir}",
|
||||
f"[synth] [{args.storyboard}] {len(items)} cues, {total_ms}ms of audio (incl. gaps) -> {audio_dir}",
|
||||
flush=True,
|
||||
)
|
||||
return 0
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue