More FE changes
This commit is contained in:
parent
f114ada255
commit
a48eb945e0
48 changed files with 4127 additions and 1751 deletions
188
video/tts/mux.py
Normal file
188
video/tts/mux.py
Normal file
|
|
@ -0,0 +1,188 @@
|
|||
"""Mux per-cue WAVs into recording.mp4 at their narration offsets.
|
||||
|
||||
Reads two manifests:
|
||||
|
||||
* ``output/audio/index.json`` (synth output) — per-cue WAV filename + measured
|
||||
duration. Generated BEFORE recording in one batched Qwen3-TTS call.
|
||||
* ``output/narration.json`` (recorder output) — per-cue ``videoTimeMs`` against
|
||||
the trimmed video. Generated DURING recording.
|
||||
|
||||
Joins them by ``cueIndex`` (index in the cue list, 1:1 between manifests),
|
||||
runs ffmpeg with one ``adelay`` per cue plus a single ``amix``, copies the
|
||||
video stream, and writes ``output/recording.narrated.mp4``.
|
||||
|
||||
Run from the ``video/`` directory after recording:
|
||||
|
||||
uv run --project tts python tts/mux.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Build the mux command line and parse ``sys.argv``.

    Every option has a default relative to the current working directory:

    * ``--audio-dir`` -- directory with the synth WAVs and their ``index.json``.
    * ``--narration`` -- per-cue ``videoTimeMs`` manifest from the recorder.
    * ``--video`` -- recording that receives the narration track.
    * ``--out`` -- path of the narrated file.
    * ``--replace`` -- flag; move ``--out`` over ``--video`` once muxing is done.
    """
    cli = argparse.ArgumentParser(description=__doc__)

    # Inputs.
    cli.add_argument("--audio-dir", default=Path("output/audio"), type=Path)
    cli.add_argument(
        "--narration",
        default=Path("output/narration.json"),
        type=Path,
        help="Per-cue videoTimeMs manifest written by the recorder.",
    )
    cli.add_argument("--video", default=Path("output/recording.mp4"), type=Path)

    # Outputs.
    cli.add_argument(
        "--out",
        default=Path("output/recording.narrated.mp4"),
        type=Path,
    )
    cli.add_argument(
        "--replace",
        help="After muxing, atomically replace --video with --out.",
        action="store_true",
    )

    return cli.parse_args()
|
||||
|
||||
|
||||
def _find_overlaps(items: list[dict]) -> list[str]:
    """Return one message per pair of consecutive cues whose audio overlaps.

    Cues are ordered by ``videoTimeMs``; a pair overlaps when the earlier
    cue's start plus its ``durationMs`` runs past the later cue's start.
    """
    ordered = sorted(items, key=lambda it: it["videoTimeMs"])
    overlaps: list[str] = []
    for prev, nxt in zip(ordered, ordered[1:]):
        prev_end = prev["videoTimeMs"] + prev["durationMs"]
        nxt_start = nxt["videoTimeMs"]
        if prev_end > nxt_start:
            overlaps.append(
                f"cue {prev['cueIndex']} ends at {prev_end}ms but cue {nxt['cueIndex']} "
                f"starts at {nxt_start}ms (overlap {prev_end - nxt_start}ms)"
            )
    return overlaps


def _build_ffmpeg_cmd(
    video: Path, audio_dir: Path, items: list[dict], out: Path
) -> list[str]:
    """Build the ffmpeg argv that mixes every cue WAV into ``video``.

    Input 0 is the video; inputs 1..N are the cue WAVs in ``items`` order.
    Each WAV is delayed to its ``videoTimeMs`` and all of them are mixed into
    one audio stream that is encoded as AAC next to the copied video stream.
    """
    cmd: list[str] = ["ffmpeg", "-y", "-loglevel", "warning", "-i", str(video)]
    for it in items:
        cmd += ["-i", str(audio_dir / it["wav"])]

    filter_parts: list[str] = []
    mix_inputs: list[str] = []
    for n, it in enumerate(items, start=1):
        delay_ms = max(0, it["videoTimeMs"])
        label = f"a{n}"
        # adelay needs one delay per channel; "all=1" applies the same delay
        # to every channel, which is what we want for mono narration.
        filter_parts.append(
            f"[{n}:a]aresample=async=1,adelay={delay_ms}|{delay_ms}:all=1[{label}]"
        )
        mix_inputs.append(f"[{label}]")

    # The mixed audio stops when the last cue stops. Without padding,
    # "-shortest" below would cut the output there and drop whatever video
    # follows the final cue. apad extends the audio with silence so that
    # "-shortest" ends the output at the end of the video stream instead.
    mix = (
        f"{''.join(mix_inputs)}amix=inputs={len(items)}"
        f":duration=longest:dropout_transition=0:normalize=0,apad[aout]"
    )
    filter_complex = ";".join(filter_parts + [mix])

    cmd += [
        "-filter_complex",
        filter_complex,
        "-map",
        "0:v:0",
        "-map",
        "[aout]",
        "-c:v",
        "copy",
        "-c:a",
        "aac",
        "-b:a",
        "192k",
        "-shortest",
        "-movflags",
        "+faststart",
        str(out),
    ]
    return cmd


def main() -> int:
    """Mux the synthesized cue WAVs into the recording.

    Validates that ffmpeg and all three inputs exist, joins the synth index
    with the recorder's narration manifest, refuses overlapping cues, then
    runs ffmpeg. With ``--replace`` the narrated file is moved over the
    original video afterwards.

    Returns:
        Process exit code: 0 on success, 1 on a missing prerequisite or a
        manifest mismatch, otherwise ffmpeg's own non-zero exit code.

    Raises:
        SystemExit: when two cues would overlap in the output.
    """
    args = parse_args()

    if not shutil.which("ffmpeg"):
        print("[mux] ffmpeg not on PATH", file=sys.stderr)
        return 1

    audio_index_path = args.audio_dir / "index.json"
    if not audio_index_path.exists():
        print(
            f"[mux] {audio_index_path} not found; run tts/synth.py first",
            file=sys.stderr,
        )
        return 1

    if not args.narration.exists():
        print(
            f"[mux] {args.narration} not found; the recorder must run before mux",
            file=sys.stderr,
        )
        return 1

    if not args.video.exists():
        print(f"[mux] video not found: {args.video}", file=sys.stderr)
        return 1

    # Read manifests as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding (cue text may be non-ASCII).
    audio_index = json.loads(audio_index_path.read_text(encoding="utf-8"))
    audio_items = [it for it in audio_index.get("items", []) if it.get("wav")]
    if not audio_items:
        print("[mux] synth produced no cues; copying video unchanged", file=sys.stderr)
        shutil.copyfile(args.video, args.out)
        return 0

    narration = json.loads(args.narration.read_text(encoding="utf-8"))
    nar_cues = list(narration.get("cues", []))
    if len(nar_cues) != len(audio_items):
        print(
            f"[mux] cue count mismatch: synth has {len(audio_items)} cues, "
            f"recorder logged {len(nar_cues)}. Re-run preflight + synth + record.",
            file=sys.stderr,
        )
        return 1

    # Look audio items up by cueIndex and pair each with the recorder cue at
    # the same list position.
    # NOTE(review): this assumes synth cueIndex values are exactly 0..N-1 in
    # recorder order -- confirm against the recorder's manifest format.
    audio_by_index = {int(it["cueIndex"]): it for it in audio_items}
    items = []
    for i, nar in enumerate(nar_cues):
        audio = audio_by_index.get(i)
        if audio is None:
            print(f"[mux] no synth wav for cue {i}", file=sys.stderr)
            return 1
        items.append(
            {
                "cueIndex": i,
                "wav": audio["wav"],
                "durationMs": int(audio["durationMs"]),
                "videoTimeMs": int(nar["videoTimeMs"]),
                "text": nar.get("text", ""),
            }
        )

    # Refuse to mux overlapping cues -- amix would silently mash voices on top
    # of each other.
    overlaps = _find_overlaps(items)
    if overlaps:
        raise SystemExit(
            "[mux] refusing to produce overlapping narration:\n - "
            + "\n - ".join(overlaps)
        )

    cmd = _build_ffmpeg_cmd(args.video, args.audio_dir, items, args.out)

    print(f"[mux] muxing {len(items)} narration cues into {args.out}", flush=True)
    result = subprocess.run(cmd)
    if result.returncode != 0:
        print(f"[mux] ffmpeg exited {result.returncode}", file=sys.stderr)
        return result.returncode

    if args.replace:
        args.out.replace(args.video)
        print(f"[mux] replaced {args.video} with narrated copy", flush=True)

    return 0
|
||||
|
||||
|
||||
# Script entry point: the integer returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
208
video/tts/synth.py
Normal file
208
video/tts/synth.py
Normal file
|
|
@ -0,0 +1,208 @@
|
|||
"""Synthesize the full narration in ONE batched Qwen3-TTS call.
|
||||
|
||||
Reads ``output/narration-script.json`` (emitted by ``dist/preflight.js``) and
|
||||
runs ``Qwen3TTSModel.generate_custom_voice`` with all cue texts as a single
|
||||
batched list — that way every cue shares the same model state, which keeps
|
||||
prosody and timbre consistent across cues. Per-cue WAVs and an index manifest
|
||||
go to ``output/audio/`` for the recording step (which reads measured cue
|
||||
durations) and the mux step (which drops each WAV at its videoTime).
|
||||
|
||||
Run from the ``video/`` directory:
|
||||
|
||||
uv run --project tts python tts/synth.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import soundfile as sf
|
||||
import torch
|
||||
from qwen_tts import Qwen3TTSModel
|
||||
|
||||
|
||||
# Fallbacks for --model / --speaker / --language. parse_args() uses them only
# when neither the flag nor the matching TTS_* environment variable is set.
DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"  # model id given to from_pretrained
DEFAULT_SPEAKER = "ryan"  # CustomVoice preset speaker name
DEFAULT_LANGUAGE = "English"
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Build the synth command line and parse ``sys.argv``.

    ``--model``, ``--speaker``, ``--language`` and ``--device`` take their
    defaults from the ``TTS_MODEL``, ``TTS_SPEAKER``, ``TTS_LANGUAGE`` and
    ``TTS_DEVICE`` environment variables, falling back to the module-level
    ``DEFAULT_*`` constants (and ``cuda:0`` for the device).
    """
    env = os.environ.get
    cli = argparse.ArgumentParser(description=__doc__)

    # Where to read the script from and where to put the audio.
    cli.add_argument(
        "--script",
        default=Path("output/narration-script.json"),
        type=Path,
        help="Narration script emitted by dist/preflight.js.",
    )
    cli.add_argument(
        "--out-dir",
        default=Path("output/audio"),
        type=Path,
        help="Directory to write WAV files and index.json into.",
    )

    # Synthesis settings, each overridable through the environment.
    cli.add_argument("--model", default=env("TTS_MODEL", DEFAULT_MODEL))
    cli.add_argument(
        "--speaker",
        default=env("TTS_SPEAKER", DEFAULT_SPEAKER),
        help="CustomVoice preset speaker name (use --list-speakers to enumerate).",
    )
    cli.add_argument("--language", default=env("TTS_LANGUAGE", DEFAULT_LANGUAGE))
    cli.add_argument("--device", default=env("TTS_DEVICE", "cuda:0"))

    cli.add_argument(
        "--list-speakers",
        help="Load the model, print available speaker names, and exit.",
        action="store_true",
    )

    return cli.parse_args()
|
||||
|
||||
|
||||
def load_model(model_id: str, device: str) -> Qwen3TTSModel:
    """Load the TTS model ``model_id`` onto ``device``.

    bfloat16 is selected when ``device`` starts with ``"cuda"``; every other
    device gets float32. The choice is logged before loading starts.
    """
    if device.startswith("cuda"):
        precision = torch.bfloat16
    else:
        precision = torch.float32
    print(f"[synth] loading {model_id} on {device} ({precision})", flush=True)
    return Qwen3TTSModel.from_pretrained(model_id, dtype=precision, device_map=device)
|
||||
|
||||
|
||||
def cached_index_matches(
    index_path: Path,
    cues: list[dict],
    speaker: str,
    language: str,
) -> bool:
    """Return True iff index_path's cue list lines up with `cues` 1:1.

    Compared fields: ``cueIndex``, ``text`` (ignoring surrounding whitespace),
    ``gapBeforeMs`` plus the synth settings (``speaker``, ``language``). All
    cue WAV files must also exist next to the index. Mismatched length,
    reordered cues, or a missing WAV invalidate the cache.

    A missing, unreadable or malformed index (bad JSON, wrong encoding,
    non-object root, non-object items, non-numeric cached fields) is treated
    as "no cache" rather than an error, so a corrupt file simply triggers a
    fresh synthesis.

    Args:
        index_path: Path of the cached ``index.json``.
        cues: Live cue dicts; each must have ``cueIndex`` and ``text``.
        speaker: Speaker the caller intends to synthesize with.
        language: Language the caller intends to synthesize with.
    """
    try:
        cached = json.loads(index_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return False
    if not isinstance(cached, dict):
        return False

    if cached.get("speaker") != speaker or cached.get("language") != language:
        return False

    cached_items = cached.get("items", [])
    if not isinstance(cached_items, list) or len(cached_items) != len(cues):
        return False

    for live, prev in zip(cues, cached_items):
        if not isinstance(prev, dict):
            return False
        try:
            # -1 defaults can never equal a real value, so a missing cached
            # field always counts as a mismatch.
            prev_index = int(prev.get("cueIndex", -1))
            prev_gap = int(prev.get("gapBeforeMs", -1))
        except (TypeError, ValueError):
            return False
        if int(live["cueIndex"]) != prev_index:
            return False
        if live["text"].strip() != str(prev.get("text", "")).strip():
            return False
        if int(live.get("gapBeforeMs", 0)) != prev_gap:
            return False
        wav = prev.get("wav")
        if not isinstance(wav, str) or not wav:
            return False
        if not (index_path.parent / wav).exists():
            return False
    return True
|
||||
|
||||
|
||||
def _cached_model(index_path: Path) -> str | None:
    """Return the ``model`` recorded in ``index_path``, or None if unavailable."""
    try:
        cached = json.loads(index_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return None
    return cached.get("model") if isinstance(cached, dict) else None


def main() -> int:
    """Synthesize every narration cue and write WAVs plus ``index.json``.

    With ``--list-speakers`` the model is loaded, its supported speakers are
    printed as JSON and nothing else happens. Otherwise the script is read,
    cues with empty text are dropped, and -- unless the cached audio already
    matches the script and the synth settings -- all remaining cue texts are
    generated in one batched model call.

    Returns:
        Process exit code: 0 on success (including a cache hit), 1 when the
        script is missing or empty or the model returns the wrong number of
        waveforms.
    """
    args = parse_args()

    if args.list_speakers:
        model = load_model(args.model, args.device)
        speakers = model.get_supported_speakers()
        print(json.dumps(speakers, indent=2, ensure_ascii=False))
        return 0

    if not args.script.exists():
        print(f"[synth] script not found: {args.script}", file=sys.stderr)
        return 1

    # Read as UTF-8 explicitly: cue text may be non-ASCII and the platform
    # default encoding is not guaranteed to be UTF-8.
    script = json.loads(args.script.read_text(encoding="utf-8"))
    cues = [c for c in script.get("items", []) if c.get("text", "").strip()]
    if not cues:
        print("[synth] script has no cues; nothing to generate.", file=sys.stderr)
        return 1

    args.out_dir.mkdir(parents=True, exist_ok=True)
    index_path = args.out_dir / "index.json"

    # Skip generation when the existing audio matches the script -- same cue
    # texts and same gapBeforeMs values in the same order. Saves ~30s of GPU
    # time when iterating on activity timing without changing narration.
    # The recorded model must match too: audio produced by a different
    # checkpoint is stale even when the text is unchanged.
    if _cached_model(index_path) == args.model and cached_index_matches(
        index_path, cues, args.speaker, args.language
    ):
        print(
            f"[synth] cached audio in {args.out_dir} matches the current script — skipping generation",
            flush=True,
        )
        return 0

    model = load_model(args.model, args.device)

    texts = [c["text"].strip() for c in cues]
    print(f"[synth] generating {len(texts)} cues in one batched call", flush=True)
    for i, t in enumerate(texts):
        print(f"[synth] {i:2d}: {t}", flush=True)

    # ONE batched call. generate_custom_voice handles text=List[str] natively
    # and broadcasts the speaker/language across all items, so the entire
    # narration is decoded in one model pass — same RNG state, same batch,
    # consistent voice from cue to cue.
    wavs, sr = model.generate_custom_voice(
        text=texts,
        language=args.language,
        speaker=args.speaker,
    )
    if len(wavs) != len(texts):
        print(
            f"[synth] model returned {len(wavs)} wavs for {len(texts)} cues",
            file=sys.stderr,
        )
        return 1

    items = []
    for cue, audio in zip(cues, wavs):
        # Tensor output is moved to host memory as a float numpy array.
        if hasattr(audio, "cpu"):
            audio = audio.cpu().float().numpy()
        # Coerce so a cueIndex that arrives as a numeric string still formats
        # with :03d (the cache check coerces the same way).
        cue_index = int(cue["cueIndex"])
        wav_name = f"cue_{cue_index:03d}.wav"
        wav_path = args.out_dir / wav_name
        sf.write(str(wav_path), audio, sr)
        # NOTE(review): len(audio) is the sample count only if each waveform
        # is 1-D or samples-first -- confirm against the qwen_tts output shape.
        duration_ms = int(round(len(audio) * 1000 / sr))
        items.append(
            {
                "cueIndex": cue_index,
                "text": cue["text"],
                "gapBeforeMs": int(cue.get("gapBeforeMs", 0)),
                "wav": wav_name,
                "sampleRate": sr,
                "durationMs": duration_ms,
            }
        )
        print(
            f"[synth] wrote {wav_name} {duration_ms:>5d}ms «{cue['text']}»",
            flush=True,
        )

    out_index = {
        "speaker": args.speaker,
        "language": args.language,
        "model": args.model,
        "items": items,
    }
    # Written last so index.json only ever describes WAVs that are on disk.
    index_path.write_text(json.dumps(out_index, indent=2), encoding="utf-8")
    total_ms = sum(it["gapBeforeMs"] + it["durationMs"] for it in items)
    print(
        f"[synth] {len(items)} cues, {total_ms}ms of audio (incl. gaps) -> {args.out_dir}",
        flush=True,
    )
    return 0
|
||||
|
||||
|
||||
# Script entry point: the integer returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Loading…
Add table
Add a link
Reference in a new issue