This commit is contained in:
Andras Schmelczer 2026-05-06 23:13:58 +01:00
parent 94f9c0d594
commit 5c3b87f2d5
69 changed files with 1334 additions and 213 deletions

View file

@@ -81,11 +81,7 @@ def find_bad_files(
bad: list[BadFile] = []
stats: dict[str, dict] = {}
modes = sorted(
d
for d in os.listdir(base_dir)
if (base_dir / d).is_dir()
)
modes = sorted(d for d in os.listdir(base_dir) if (base_dir / d).is_dir())
for mode in modes:
mode_dir = base_dir / mode
@@ -149,7 +145,9 @@ def find_duplicates(base_dir: Path) -> tuple[list[BadFile], dict[str, dict]]:
# Keep the file with the most rows
files.sort(key=lambda x: x[1], reverse=True)
for filename, rows in files[1:]:
dupes.append(BadFile(mode=mode, filename=filename, slug=slug, rows=rows))
dupes.append(
BadFile(mode=mode, filename=filename, slug=slug, rows=rows)
)
mode_dupes += 1
duped_slugs = sum(1 for fs in slug_files.values() if len(fs) > 1)
@@ -197,7 +195,9 @@ def main() -> None:
bad_files, stats = find_bad_files(args.travel_times, args.threshold_pct)
print("=== Per-mode summary ===\n")
print(f"{'Mode':<10} {'Total':>6} {'Bad':>5} {'Threshold':>10} {'Median':>8} {'Range':>20}")
print(
f"{'Mode':<10} {'Total':>6} {'Bad':>5} {'Threshold':>10} {'Median':>8} {'Range':>20}"
)
print("-" * 65)
for mode, s in sorted(stats.items()):
rng = f"{s['min']:,}{s['max']:,}"
@@ -231,7 +231,9 @@ def main() -> None:
total_removable = sum(s["removable"] for s in dupe_stats.values())
if total_removable > 0:
print(f"\n=== Duplicates ({total_removable} removable files) ===\n")
print(f"{'Mode':<10} {'Total':>6} {'Unique':>7} {'Duped slugs':>12} {'Removable':>10}")
print(
f"{'Mode':<10} {'Total':>6} {'Unique':>7} {'Duped slugs':>12} {'Removable':>10}"
)
print("-" * 50)
for mode, s in sorted(dupe_stats.items()):
if s["removable"] > 0:
@@ -242,9 +244,15 @@ def main() -> None:
if args.dedup:
# Exclude files already deleted by --delete
deleted_set = {(bf.mode, bf.filename) for bf in bad_files} if args.delete else set()
to_delete = [df for df in dupe_files if (df.mode, df.filename) not in deleted_set]
print(f"\nRemoving {len(to_delete)} duplicate files (keeping largest per slug)...")
deleted_set = (
{(bf.mode, bf.filename) for bf in bad_files} if args.delete else set()
)
to_delete = [
df for df in dupe_files if (df.mode, df.filename) not in deleted_set
]
print(
f"\nRemoving {len(to_delete)} duplicate files (keeping largest per slug)..."
)
deleted = _delete_files(args.travel_times, to_delete)
print(f"Deleted {deleted}/{len(to_delete)} files.")
else: