Rerun data pipelines

2026-05-10 14:49:53 +01:00 · 2026-05-10 14:49:53 +01:00 · fc10381692
commit fc10381692
parent 4c95815dc8
27 changed files with 2143 additions and 215 deletions
--- a/pipeline/download/crime.py
+++ b/pipeline/download/crime.py
@ -0,0 +1,393 @@
+"""Download police.uk crime archive ZIPs.
+
+The archive page lists rolling monthly snapshots. Newer snapshots overlap older
+ones, so extraction keeps files already written by newer archives.
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import html
+import json
+import re
+import shutil
+import sys
+import zipfile
+from dataclasses import asdict, dataclass
+from datetime import UTC, datetime
+from pathlib import Path, PurePosixPath
+from urllib.parse import urljoin
+
+import httpx
+from tqdm import tqdm
+
+# Index page that lists the monthly archive ZIPs.
+ARCHIVE_URL = "https://data.police.uk/data/archive/"
+# Matches one archive entry in the index HTML and captures, by name: the link
+# href, the YYYY-MM month embedded in it, the link label, the size text shown
+# in parentheses, the contained-range paragraph and the md5sum paragraph.
+# DOTALL lets the lazy `.*?` parts span line breaks in the markup.
+ARCHIVE_LINK_RE = re.compile(
+    r'<div class="download">\s*.*?'
+    r'<a href="(?P<href>/data/archive/(?P<month>\d{4}-\d{2})\.zip)">'
+    r"(?P<label>[^<]+)</a>\s*\((?P<size>[^)]+)\)</span>\s*"
+    r'<p class="contained-range">\s*(?P<contained_range>.*?)\s*</p>\s*'
+    r'<p class="md5sum">(?P<md5>.*?)</p>',
+    re.DOTALL,
+)
+# A well-formed MD5 digest: exactly 32 hexadecimal characters.
+VALID_MD5_RE = re.compile(r"^[0-9a-fA-F]{32}$")
+# Shape check only (four digits, a dash, two digits); the month number itself
+# is not range-checked by this pattern.
+MONTH_RE = re.compile(r"^\d{4}-\d{2}$")
+
+
+@dataclass(frozen=True)
+class CrimeArchive:
+    """One monthly archive entry parsed from the police.uk archive index."""
+
+    # YYYY-MM captured from the ZIP link's href.
+    month: str
+    # Link text with markup removed and whitespace collapsed.
+    label: str
+    # Link href resolved against the index URL.
+    url: str
+    # Final path component of the href; used as the local ZIP file name.
+    filename: str
+    # Size text shown in parentheses next to the link (kept as a string).
+    size: str
+    # Text of the "contained-range" paragraph for this entry.
+    contained_range: str
+    # Lower-cased 32-hex-character digest, or None when the published value
+    # does not look like an MD5.
+    md5: str | None
+    # The md5sum text as published (cleaned and lower-cased), valid or not.
+    raw_md5: str
+
+
+def _clean_text(value: str) -> str:
+    """Replace tags with spaces, decode HTML entities and collapse whitespace."""
+    without_tags = re.sub(r"<[^>]+>", " ", value)
+    decoded = html.unescape(without_tags)
+    collapsed = re.sub(r"\s+", " ", decoded)
+    return collapsed.strip()
+
+
+def parse_archives(page_html: str, base_url: str = ARCHIVE_URL) -> list[CrimeArchive]:
+    """Return every archive entry found in the index HTML, in page order.
+
+    Hrefs are resolved against ``base_url``. An md5sum value that is not 32
+    hex characters is still kept in ``raw_md5`` while ``md5`` is set to None.
+    """
+
+    def to_archive(entry: re.Match[str]) -> CrimeArchive:
+        link = html.unescape(entry.group("href"))
+        published_md5 = _clean_text(entry.group("md5")).lower()
+        is_valid = VALID_MD5_RE.fullmatch(published_md5)
+        return CrimeArchive(
+            month=entry.group("month"),
+            label=_clean_text(entry.group("label")),
+            url=urljoin(base_url, link),
+            filename=Path(link).name,
+            size=_clean_text(entry.group("size")),
+            contained_range=_clean_text(entry.group("contained_range")),
+            md5=published_md5 if is_valid else None,
+            raw_md5=published_md5,
+        )
+
+    return [to_archive(entry) for entry in ARCHIVE_LINK_RE.finditer(page_html)]
+
+
+def fetch_archives(archive_url: str = ARCHIVE_URL) -> list[CrimeArchive]:
+    """Download the archive index page and return its parsed entries.
+
+    Raises RuntimeError when the page yields no archive links; HTTP error
+    statuses surface through ``raise_for_status``.
+    """
+    client_options = {
+        "follow_redirects": True,
+        "timeout": httpx.Timeout(30.0, read=60.0),
+        "headers": {"User-Agent": "perfect-postcode-data-pipeline/1.0"},
+    }
+    with httpx.Client(**client_options) as client:
+        response = client.get(archive_url)
+        response.raise_for_status()
+        page_html = response.text
+
+    archives = parse_archives(page_html, archive_url)
+    if archives:
+        return archives
+    raise RuntimeError(f"No monthly archive ZIPs found at {archive_url}")
+
+
+def filter_archives(
+    archives: list[CrimeArchive],
+    *,
+    from_month: str | None = None,
+    to_month: str | None = None,
+    limit: int | None = None,
+) -> list[CrimeArchive]:
+    """Keep archives whose month lies within the inclusive YYYY-MM bounds.
+
+    Months are compared as strings, input order is preserved, and ``limit``
+    (when given) is applied as a slice after the month filter.
+    """
+    selected: list[CrimeArchive] = []
+    for candidate in archives:
+        if from_month is not None and candidate.month < from_month:
+            continue
+        if to_month is not None and candidate.month > to_month:
+            continue
+        selected.append(candidate)
+
+    return selected if limit is None else selected[:limit]
+
+
+def file_md5(path: Path) -> str:
+    """Return the hex MD5 digest of the file at ``path``.
+
+    The file is read in 1 MiB chunks so a large archive is never held in
+    memory at once. MD5 serves only as a transfer-integrity check against the
+    published checksum, so it is requested with ``usedforsecurity=False``;
+    without that flag ``hashlib.md5()`` can be refused on FIPS-restricted
+    Python builds.
+    """
+    digest = hashlib.md5(usedforsecurity=False)
+    with path.open("rb") as file:
+        while chunk := file.read(1024 * 1024):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def download_archive(
+    archive: CrimeArchive,
+    archive_dir: Path,
+    *,
+    verify: bool,
+    force: bool,
+    timeout: float,
+) -> Path:
+    """Download one archive ZIP, resuming an existing .part file when possible.
+
+    Args:
+        archive: Entry describing the ZIP (URL, file name, optional MD5).
+        archive_dir: Directory that receives ``<filename>`` and, while the
+            transfer is in flight, ``<filename>.part``.
+        verify: Compare the file's MD5 with ``archive.md5`` when one is known.
+        force: Delete any existing ZIP and partial file before downloading.
+        timeout: Per-read timeout in seconds for the streamed response.
+
+    Returns:
+        Path of the downloaded (or already present) ZIP.
+
+    Raises:
+        RuntimeError: The freshly downloaded file fails MD5 verification; the
+            bad file is removed first.
+        httpx.HTTPStatusError: The server answered with an error status.
+    """
+    dest = archive_dir / archive.filename
+    partial = dest.with_suffix(dest.suffix + ".part")
+
+    if force:
+        dest.unlink(missing_ok=True)
+        partial.unlink(missing_ok=True)
+
+    if dest.exists():
+        if verify and archive.md5 is not None:
+            actual_md5 = file_md5(dest)
+            if actual_md5 == archive.md5:
+                print(f"{archive.filename}: already downloaded")
+                return dest
+            print(
+                f"{archive.filename}: checksum mismatch, downloading again",
+                file=sys.stderr,
+            )
+            dest.unlink()
+            partial.unlink(missing_ok=True)
+        else:
+            print(f"{archive.filename}: already downloaded")
+            return dest
+
+    resume_from = partial.stat().st_size if partial.exists() else 0
+
+    # At most two passes: a resumed request, then (only after a 416) a fresh one.
+    while True:
+        headers = {"Range": f"bytes={resume_from}-"} if resume_from else {}
+
+        with httpx.stream(
+            "GET",
+            archive.url,
+            headers=headers,
+            follow_redirects=True,
+            timeout=httpx.Timeout(30.0, read=timeout),
+        ) as response:
+            if resume_from and response.status_code == 416:
+                # The .part file is at least as long as the remote object
+                # (e.g. a finished transfer that was never renamed, or a
+                # replaced upstream file). Resuming can never succeed, so
+                # discard it and start again instead of failing on every run.
+                print(
+                    f"{archive.filename}: partial download cannot be resumed, "
+                    "starting over",
+                    file=sys.stderr,
+                )
+                partial.unlink(missing_ok=True)
+                resume_from = 0
+                continue
+
+            if response.status_code == 206 and resume_from:
+                mode = "ab"
+                initial = resume_from
+            else:
+                # Either a fresh download or the server ignored the Range
+                # header and is sending the whole file: overwrite the partial.
+                response.raise_for_status()
+                mode = "wb"
+                initial = 0
+
+            total_header = int(response.headers.get("content-length", 0))
+            total = initial + total_header if total_header else None
+            with (
+                partial.open(mode) as output,
+                tqdm(
+                    total=total,
+                    initial=initial,
+                    unit="B",
+                    unit_scale=True,
+                    unit_divisor=1024,
+                    desc=archive.filename,
+                ) as progress,
+            ):
+                for chunk in response.iter_bytes(chunk_size=1024 * 1024):
+                    output.write(chunk)
+                    progress.update(len(chunk))
+        break
+
+    # Only a completed transfer is ever visible under the final name.
+    partial.replace(dest)
+
+    if verify and archive.md5 is not None:
+        actual_md5 = file_md5(dest)
+        if actual_md5 != archive.md5:
+            dest.unlink(missing_ok=True)
+            raise RuntimeError(
+                f"{archive.filename}: MD5 mismatch: expected {archive.md5}, got {actual_md5}"
+            )
+
+    return dest
+
+
+def _safe_csv_members(
+    archive: zipfile.ZipFile,
+) -> list[tuple[zipfile.ZipInfo, PurePosixPath]]:
+    """List the ZIP's CSV entries that are safe to extract, in archive order.
+
+    Directories, absolute paths, paths with a ``..`` component and entries
+    whose suffix is not ``.csv`` (case-insensitive) are left out.
+    """
+
+    def is_safe_csv(info: zipfile.ZipInfo, rel_path: PurePosixPath) -> bool:
+        if info.is_dir() or rel_path.is_absolute():
+            return False
+        if ".." in rel_path.parts:
+            return False
+        return rel_path.suffix.lower() == ".csv"
+
+    candidates = ((info, PurePosixPath(info.filename)) for info in archive.infolist())
+    return [pair for pair in candidates if is_safe_csv(*pair)]
+
+
+def extract_csvs(
+    zip_path: Path,
+    output_dir: Path,
+    *,
+    overwrite: bool = False,
+) -> tuple[int, int]:
+    """Extract CSVs from one ZIP. Returns (extracted, skipped).
+
+    Entries keep their relative path below ``output_dir``. A CSV that already
+    exists is skipped unless ``overwrite`` is true.
+
+    Each CSV is written to a ``.part`` sibling and renamed once complete.
+    Writing straight to the final name would leave a truncated CSV behind
+    after an interrupted run, and the next run would skip it as "existing".
+    """
+    extracted = 0
+    skipped = 0
+
+    with zipfile.ZipFile(zip_path) as archive:
+        for info, rel_path in _safe_csv_members(archive):
+            dest = output_dir.joinpath(*rel_path.parts)
+            if dest.exists() and not overwrite:
+                skipped += 1
+                continue
+
+            dest.parent.mkdir(parents=True, exist_ok=True)
+            # The temporary name does not end in .csv, so a leftover from a
+            # crash is never mistaken for data.
+            staging = dest.with_name(dest.name + ".part")
+            try:
+                with archive.open(info) as source, staging.open("wb") as target:
+                    shutil.copyfileobj(source, target)
+                staging.replace(dest)
+            except BaseException:
+                # Also covers KeyboardInterrupt: never leave half a file.
+                staging.unlink(missing_ok=True)
+                raise
+            extracted += 1
+
+    return extracted, skipped
+
+
+def write_manifest(
+    output_dir: Path, archive_url: str, archives: list[CrimeArchive]
+) -> None:
+    """Write ``archive_manifest.json`` into ``output_dir``.
+
+    The manifest records the index URL, the current UTC time in ISO format
+    and every archive entry as a plain dict.
+    """
+    entries = [asdict(archive) for archive in archives]
+    payload = json.dumps(
+        {
+            "source": archive_url,
+            "fetched_at": datetime.now(UTC).isoformat(),
+            "archives": entries,
+        },
+        indent=2,
+    )
+    (output_dir / "archive_manifest.json").write_text(payload + "\n")
+
+
+def _month_arg(value: str) -> str:
+    """Validate a ``YYYY-MM`` command-line value for argparse ``type=``.
+
+    Returns ``value`` unchanged when it has the YYYY-MM shape and the month
+    part is 01-12. Otherwise raises ``argparse.ArgumentTypeError``, which
+    argparse turns into a usage error. The range check matters because the
+    bounds are later compared as plain strings, so a typo such as ``2024-13``
+    would otherwise be accepted and silently act as a bound no archive has.
+    """
+    if not MONTH_RE.fullmatch(value) or not 1 <= int(value[5:]) <= 12:
+        raise argparse.ArgumentTypeError("month must be in YYYY-MM format")
+    return value
+
+
+def main() -> None:
+    """Command-line entry point: list, download and optionally extract archives.
+
+    Fetches the archive index, applies the month/limit filters, then for each
+    selected archive downloads the ZIP into ``<output>/_archives`` and, unless
+    ``--no-extract`` is given, extracts its CSVs below ``<output>``. Exits via
+    ``SystemExit`` when the filters match nothing.
+    """
+    parser = argparse.ArgumentParser(
+        description="Download all monthly police.uk crime archive ZIPs"
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        required=True,
+        help="Directory for extracted CSVs; ZIPs are kept under _archives/",
+    )
+    parser.add_argument(
+        "--archive-url",
+        default=ARCHIVE_URL,
+        help=f"Archive index URL (default: {ARCHIVE_URL})",
+    )
+    parser.add_argument(
+        "--from-month",
+        type=_month_arg,
+        help="Only download archives from this YYYY-MM onwards",
+    )
+    parser.add_argument(
+        "--to-month",
+        type=_month_arg,
+        help="Only download archives up to this YYYY-MM",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        help="Download at most this many archives after filtering",
+    )
+    parser.add_argument(
+        "--list",
+        action="store_true",
+        help="Print the archive URLs that would be downloaded and exit",
+    )
+    # --no-extract and --no-verify are stored inverted: args.extract and
+    # args.verify default to True and the flags switch them off.
+    parser.add_argument(
+        "--no-extract",
+        dest="extract",
+        action="store_false",
+        help="Download ZIPs only; do not extract CSVs",
+    )
+    parser.add_argument(
+        "--overwrite-extracted",
+        action="store_true",
+        help="Overwrite CSVs when extracting overlapping archive snapshots",
+    )
+    parser.add_argument(
+        "--no-verify",
+        dest="verify",
+        action="store_false",
+        help="Skip MD5 verification",
+    )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Redownload archives even if ZIP files already exist",
+    )
+    parser.add_argument(
+        "--timeout",
+        type=float,
+        default=600.0,
+        help="Per-read timeout in seconds for large ZIP downloads",
+    )
+    args = parser.parse_args()
+
+    print("Fetching police.uk archive index...")
+    archives = filter_archives(
+        fetch_archives(args.archive_url),
+        from_month=args.from_month,
+        to_month=args.to_month,
+        limit=args.limit,
+    )
+    if not archives:
+        raise SystemExit("No archives matched the requested filters")
+
+    # Archives that publish an md5sum value which is not a valid digest; they
+    # are still downloaded, just without checksum verification.
+    bad_md5 = [
+        archive.filename for archive in archives if archive.raw_md5 and not archive.md5
+    ]
+    if bad_md5:
+        # Name at most five files to keep the warning readable.
+        print(
+            "Warning: ignoring malformed MD5 values for "
+            + ", ".join(bad_md5[:5])
+            + ("..." if len(bad_md5) > 5 else ""),
+            file=sys.stderr,
+        )
+
+    print(f"Found {len(archives)} monthly archive ZIPs")
+    if args.list:
+        # Listing mode: tab-separated month, URL and published md5; nothing is
+        # written to disk.
+        for archive in archives:
+            print(f"{archive.month}\t{archive.url}\t{archive.raw_md5}")
+        return
+
+    args.output.mkdir(parents=True, exist_ok=True)
+    archive_dir = args.output / "_archives"
+    archive_dir.mkdir(parents=True, exist_ok=True)
+    write_manifest(args.output, args.archive_url, archives)
+
+    total_extracted = 0
+    total_skipped = 0
+    # Archives are processed in index-page order; with overwrite off, a CSV
+    # written by an earlier archive in this loop is kept over later copies.
+    for index, archive in enumerate(archives, start=1):
+        print(f"[{index}/{len(archives)}] {archive.label} ({archive.size})")
+        zip_path = download_archive(
+            archive,
+            archive_dir,
+            verify=args.verify,
+            force=args.force,
+            timeout=args.timeout,
+        )
+        if args.extract:
+            extracted, skipped = extract_csvs(
+                zip_path,
+                args.output,
+                overwrite=args.overwrite_extracted,
+            )
+            total_extracted += extracted
+            total_skipped += skipped
+            print(
+                f"{archive.filename}: extracted {extracted} CSVs"
+                + (f", skipped {skipped} existing CSVs" if skipped else "")
+            )
+
+    if args.extract:
+        print(
+            f"Done. ZIPs saved in {archive_dir}; extracted {total_extracted} CSVs"
+            + (f" and skipped {total_skipped} existing CSVs" if total_skipped else "")
+            + "."
+        )
+    else:
+        print(f"Done. ZIPs saved in {archive_dir}.")
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/download/map_assets.py
+++ b/pipeline/download/map_assets.py
@ -1,9 +1,15 @@
 import argparse
+import base64
+import json
+import re
 import sys
 import urllib.request
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from io import BytesIO
 from pathlib import Path

+from PIL import Image, ImageDraw
+
 from pipeline.transform.transform_poi import NAPTAN_EMOJIS, _CATEGORIES

 GLYPHS_BASE = "https://protomaps.github.io/basemaps-assets/fonts"
@ -14,53 +20,80 @@ POI_ICON_BASE = "https://geolytix.github.io/MapIcons"
 # Font stacks used by @protomaps/basemaps with lang='en'
 FONT_STACKS = ["Noto Sans Regular", "Noto Sans Italic", "Noto Sans Medium"]

-# Fallback emoji not in any category
-_FALLBACK_EMOJIS = ["📍"]
-
 POI_ICON_PATHS = [
-    "asda/asda_express_24px.svg",
-    "asda/asda_green_basket_24px.svg",
-    "asda/asda_green_trolley_24px.svg",
-    "asda/asda_living_24px.svg",
-    "asda/asda_pfs_24px.svg",
-    "asda/asda_primary.svg",
-    "asda/asda_superstore_green_trolley_24px.svg",
-    "brands/aldi_24px.svg",
-    "brands/amazon_fresh_alt_24px.svg",
-    "brands/booths_24px.svg",
-    "brands/budgens_24px.svg",
-    "brands/centra_24px.svg",
-    "brands/cook.svg",
-    "brands/coop_24px.svg",
-    "brands/costco_24px.svg",
-    "brands/dunnes_stores_24px.svg",
-    "brands/farmfoods_updated_24px.svg",
-    "brands/heron_24px.svg",
-    "brands/iceland_24px.svg",
-    "brands/iceland_food_warehouse_24px.svg",
-    "brands/lidl_24px.svg",
-    "brands/little_waitrose_24px.svg",
-    "brands/makro_24px.svg",
-    "brands/mns_24px.svg",
-    "brands/mns_food_24px.svg",
-    "brands/mns_high_street_24px.svg",
-    "brands/mns_hospital_24px.svg",
-    "brands/mns_moto_24px.svg",
-    "brands/mns_outlet_24px.svg",
-    "brands/morrisons_24px.svg",
-    "brands/morrisons_daily_24px.svg",
-    "brands/sainsburys_24px.svg",
-    "brands/sainsburys_local_24px.svg",
-    "brands/spar_24px.svg",
-    "brands/tesco_24px.svg",
-    "brands/tesco_express_24px.svg",
-    "brands/tesco_extra_24px.svg",
-    "brands/waitrose_24px.svg",
-    "brands/wholefoods_24px.svg",
-    "logos/planet_organic_24px.svg",
+    "brands_2023/supermarkets/farmfoods.svg",
+    "brands_2023/supermarkets/heron_foods.svg",
+    "brands_2023/supermarkets/little_waitrose.svg",
+    "brands_2024/amazon_fresh.svg",
+    "brands_2024/booths.svg",
+    "brands_2024/budgens.svg",
+    "brands_2024/cook.svg",
+    "brands_2024/dunnes_stores.svg",
+    "brands_2024/iceland.svg",
+    "brands_2024/makro.svg",
+    "brands_2024/mns.svg",
+    "brands_2024/morrisons_daily.svg",
+    "brands_2024/sainsburys_local.svg",
+    "brands_2024/wholefoods.svg",
+    "logos/aldi.svg",
+    "logos/asda.svg",
+    "logos/centra.svg",
+    "logos/coop.svg",
+    "logos/lidl.svg",
+    "logos/morrisons.svg",
+    "logos/planet_organic.svg",
+    "logos/sainsburys.svg",
+    "logos/spar.svg",
+    "logos/tesco.svg",
+    "logos/tesco_express.svg",
+    "logos/tesco_extra.svg",
+    "logos/waitrose.svg",
    "public_transport/london_tube.svg",
+    "visuals/mns.svg",
 ]

+# Icons that are built from an upstream marker file rather than copied as-is.
+# Each entry is (kind, source path below POI_ICON_BASE, destination path below
+# the POI icons directory); `kind` selects the conversion applied by
+# download_derived_poi_icon().
+DERIVED_POI_ICON_PATHS = [
+    ("costco_logo", "brands/costco.svg", "logos/costco.svg"),
+    (
+        "embedded_png",
+        "brands/iceland_food_warehouse_24px.svg",
+        "logos/the_food_warehouse.png",
+    ),
+]
+
+# Per-icon crop rectangles as (x, y, width, height). crop_poi_svg_icons()
+# writes each one into the matching SVG's viewBox; keys are icon paths
+# relative to the POI icons directory.
+POI_ICON_SVG_CROPS = {
+    "brands_2023/supermarkets/farmfoods.svg": (1.293, 7.314, 15.48, 3.293),
+    "brands_2023/supermarkets/heron_foods.svg": (0.062, 6.68, 17.995, 5.325),
+    "brands_2023/supermarkets/little_waitrose.svg": (0.916, 5.645, 16.365, 6.719),
+    "brands_2024/amazon_fresh.svg": (3.817, 1.646, 16.367, 16.358),
+    "brands_2024/booths.svg": (1.456, 7.143, 15.313, 3.512),
+    "brands_2024/budgens.svg": (2.251, 2.278, 13.6, 13.612),
+    "brands_2024/cook.svg": (5.028, 5.493, 13.945, 9.648),
+    "brands_2024/dunnes_stores.svg": (4.375, 7.732, 15.249, 5.055),
+    "brands_2024/iceland.svg": (1.136, 6.823, 16.067, 4.302),
+    "brands_2024/makro.svg": (4.411, 6.098, 16.397, 5.428),
+    "brands_2024/mns.svg": (4.042, 6.986, 16.171, 6.724),
+    "brands_2024/morrisons_daily.svg": (3.341, 4.414, 17.317, 8.248),
+    "brands_2024/sainsburys_local.svg": (4.58, 1.61, 14.84, 14.849),
+    "brands_2024/wholefoods.svg": (4.17, 2.193, 15.659, 15.668),
+    "logos/aldi.svg": (4.813, 2.563, 14.374, 14.383),
+    "logos/asda.svg": (3.91, 7.135, 16.181, 5.442),
+    "logos/centra.svg": (3.36, 7.35, 17.28, 4.651),
+    "logos/coop.svg": (6.407, 4.658, 11.187, 11.793),
+    "logos/costco.svg": (70.61, 144.908, 256.67, 85.825),
+    "logos/lidl.svg": (4.938, 2.973, 13.985, 13.985),
+    "logos/morrisons.svg": (5.231, 2.985, 13.538, 13.398),
+    "logos/planet_organic.svg": (5.528, 3.564, 12.943, 12.943),
+    "logos/sainsburys.svg": (7.502, 3.572, 8.996, 12.646),
+    "logos/spar.svg": (4.933, 2.968, 14.133, 13.853),
+    "logos/tesco.svg": (4.338, 6.865, 15.324, 5.359),
+    "logos/tesco_express.svg": (5.231, 5.933, 13.538, 8.345),
+    "logos/tesco_extra.svg": (4.933, 5.775, 14.133, 8.519),
+    "logos/waitrose.svg": (5.528, 6.09, 12.943, 9.855),
+}
+
+# Value written for the longer side when an SVG's width/height attributes are
+# rewritten; the shorter side is scaled to keep the viewBox aspect ratio.
+POI_ICON_SVG_INTRINSIC_MAX = 512
+

 def collect_twemoji_codes() -> list[str]:
    """Derive twemoji hex codes from transform_poi categories.
@ -76,9 +109,6 @@ def collect_twemoji_codes() -> list[str]:
    for emoji in NAPTAN_EMOJIS.values():
        emojis.add(emoji)

-    for emoji in _FALLBACK_EMOJIS:
-        emojis.add(emoji)
-
    # First codepoint hex, matching frontend logic
    return sorted({f"{ord(e[0]):x}" for e in emojis})

@ -97,6 +127,214 @@ def download_file(url: str, dest: Path) -> tuple[bool, str]:
        return False, url


+def download_text(url: str, timeout: float = 60.0) -> str:
+    """Fetch ``url`` and return the response body decoded as UTF-8.
+
+    Args:
+        url: Address to fetch.
+        timeout: Seconds to wait on blocking network operations. Without a
+            timeout ``urlopen`` can block indefinitely on a stalled
+            connection and hang the whole asset download.
+
+    Raises:
+        urllib.error.URLError: The request failed (``HTTPError`` for HTTP
+            error statuses).
+        UnicodeDecodeError: The body is not valid UTF-8.
+    """
+    with urllib.request.urlopen(url, timeout=timeout) as response:
+        return response.read().decode("utf-8")
+
+
+def build_costco_logo(marker_svg: str) -> str:
+    """Wrap the logo group of the Costco marker SVG in a standalone SVG.
+
+    The group is located by its opening path data and by the document's
+    closing ``</g></g></svg>``; ValueError is raised when either is missing.
+    """
+    group_opening = '<g><path d=" M 316.312'
+    document_tail = "</g></g></svg>"
+
+    start = marker_svg.find(group_opening)
+    end = marker_svg.rfind(document_tail)
+    if -1 in (start, end):
+        raise ValueError("Costco marker SVG layout changed")
+
+    # Keep the first "</g>" of the tail so the extracted group is closed.
+    logo_group = marker_svg[start : end + len("</g>")]
+    pieces = [
+        '<?xml version="1.0" encoding="UTF-8"?>\n',
+        '<svg xmlns="http://www.w3.org/2000/svg" viewBox="70 145 260 90" ',
+        'width="260pt" height="90pt" preserveAspectRatio="xMidYMid meet">\n',
+        f"{logo_group}\n",
+        "</svg>\n",
+    ]
+    return "".join(pieces)
+
+
+def trim_white_png(png_bytes: bytes) -> bytes:
+    """Make near-white pixels transparent, crop to what remains, return PNG bytes.
+
+    A pixel counts as near-white when red, green and blue are all above 245.
+    The image is then cropped to the bounding box of its alpha channel; if
+    that box is empty the image is left uncropped.
+    """
+    with_alpha = Image.open(BytesIO(png_bytes)).convert("RGBA")
+    access = with_alpha.load()
+
+    for row in range(with_alpha.height):
+        for col in range(with_alpha.width):
+            red, green, blue, _alpha = access[col, row]
+            if min(red, green, blue) > 245:
+                access[col, row] = (red, green, blue, 0)
+
+    visible_box = with_alpha.getchannel("A").getbbox()
+    cropped = with_alpha.crop(visible_box) if visible_box else with_alpha
+
+    buffer = BytesIO()
+    cropped.save(buffer, format="PNG")
+    return buffer.getvalue()
+
+
+def extract_embedded_png(marker_svg: str) -> bytes:
+    """Decode the first base64 payload in the SVG and return it trimmed.
+
+    Raises ValueError when the markup contains no ``base64,`` payload.
+    """
+    found = re.search(r"base64,([^\"']+)", marker_svg)
+    if found is None:
+        raise ValueError("POI marker SVG did not contain an embedded PNG")
+
+    encoded = found.group(1)
+    raw_png = base64.b64decode(encoded)
+    return trim_white_png(raw_png)
+
+
+def svg_intrinsic_size(width: float, height: float) -> tuple[int, int]:
+    """Scale (width, height) so the longer side equals the intrinsic maximum.
+
+    The shorter side is rounded and never drops below 1. A non-positive
+    width or height yields a square of the maximum size.
+    """
+    limit = POI_ICON_SVG_INTRINSIC_MAX
+
+    if width <= 0 or height <= 0:
+        return (limit, limit)
+
+    if width >= height:
+        scaled_height = max(1, round(limit * height / width))
+        return (limit, scaled_height)
+
+    scaled_width = max(1, round(limit * width / height))
+    return (scaled_width, limit)
+
+
+def _set_root_attribute(root_tag: str, name: str, value: str) -> str:
+    """Set ``name="value"`` on an ``<svg ...>`` opening tag, adding it if absent."""
+    # The lookbehind stops `width` from matching inside `stroke-width` or a
+    # namespaced attribute; either quote style is accepted for the old value.
+    pattern = re.compile(
+        r"(?<![\w:.-])" + re.escape(name) + r"""\s*=\s*(["']).*?\1""",
+        re.DOTALL,
+    )
+    attribute = f'{name}="{value}"'
+    if pattern.search(root_tag):
+        return pattern.sub(lambda _match: attribute, root_tag, count=1)
+    return re.sub(r"<svg\b", lambda _match: f"<svg {attribute}", root_tag, count=1)
+
+
+def set_svg_geometry(svg_text: str, crop: tuple[float, float, float, float]) -> str:
+    """Return ``svg_text`` with the root element's viewBox, width and height set.
+
+    Args:
+        svg_text: SVG markup.
+        crop: ``(x, y, width, height)`` written as the new viewBox; width and
+            height also determine the intrinsic size via
+            ``svg_intrinsic_size``.
+
+    Only the first ``<svg ...>`` opening tag is edited. Searching the whole
+    document for ``width="..."`` would also hit ``stroke-width="..."`` or a
+    child element's attributes whenever they come first, corrupting the
+    drawing. Markup without an ``<svg`` tag is returned unchanged.
+    """
+    x, y, width, height = crop
+    view_box = f"{x:g} {y:g} {width:g} {height:g}"
+    intrinsic_width, intrinsic_height = svg_intrinsic_size(width, height)
+
+    root = re.search(r"<svg\b[^>]*>", svg_text)
+    if root is None:
+        return svg_text
+
+    root_tag = root.group(0)
+    for name, value in (
+        ("viewBox", view_box),
+        ("width", str(intrinsic_width)),
+        ("height", str(intrinsic_height)),
+    ):
+        root_tag = _set_root_attribute(root_tag, name, value)
+
+    return svg_text[: root.start()] + root_tag + svg_text[root.end() :]
+
+
+def get_svg_view_box(svg_text: str) -> tuple[float, float, float, float] | None:
+    """Return the first double-quoted ``viewBox`` as ``(x, y, width, height)``.
+
+    Values may be separated by whitespace and/or commas. Returns None when
+    there is no viewBox, when it does not hold exactly four values, or when a
+    value is not a number. The last case used to raise ValueError, which
+    would abort processing of every remaining icon.
+    """
+    match = re.search(r'viewBox="([^"]+)"', svg_text)
+    if not match:
+        return None
+    tokens = [part for part in re.split(r"[\s,]+", match.group(1).strip()) if part]
+    if len(tokens) != 4:
+        return None
+    try:
+        parts = [float(token) for token in tokens]
+    except ValueError:
+        return None
+    return (parts[0], parts[1], parts[2], parts[3])
+
+
+def crop_poi_svg_icons(poi_icons_dir: Path) -> None:
+    """Rewrite the geometry of the downloaded POI SVG icons in place.
+
+    First pass: every icon listed in ``POI_ICON_SVG_CROPS`` that exists on
+    disk gets its configured crop (the Dunnes Stores icon also has its two
+    near-white fills replaced with ``#111111``). Second pass: every ``*.svg``
+    below the directory that has a readable viewBox is rewritten using that
+    same viewBox.
+    """
+    for icon_path, crop in POI_ICON_SVG_CROPS.items():
+        target = poi_icons_dir / icon_path
+        if target.exists():
+            markup = target.read_text(encoding="utf-8")
+            if icon_path == "brands_2024/dunnes_stores.svg":
+                for pale_fill in ('fill="#fffcfc"', 'fill="#fcfcfc"'):
+                    markup = markup.replace(pale_fill, 'fill="#111111"')
+            target.write_text(set_svg_geometry(markup, crop), encoding="utf-8")
+
+    for svg_file in poi_icons_dir.rglob("*.svg"):
+        markup = svg_file.read_text(encoding="utf-8")
+        current_box = get_svg_view_box(markup)
+        if current_box:
+            svg_file.write_text(
+                set_svg_geometry(markup, current_box), encoding="utf-8"
+            )
+
+
+def download_derived_poi_icon(
+    kind: str, source_path: str, dest: Path
+) -> tuple[bool, str]:
+    """Download one marker and write the icon derived from it to ``dest``.
+
+    ``kind`` is ``"costco_logo"`` (SVG rebuilt around the logo group) or
+    ``"embedded_png"`` (PNG extracted from the SVG). Returns ``(ok, url)``;
+    failures, including an unknown kind, are reported on stderr and returned
+    as ``(False, url)`` rather than raised.
+    """
+    url = f"{POI_ICON_BASE}/{source_path}"
+    dest.parent.mkdir(parents=True, exist_ok=True)
+
+    try:
+        markup = download_text(url)
+        if kind == "costco_logo":
+            dest.write_text(build_costco_logo(markup), encoding="utf-8")
+        elif kind == "embedded_png":
+            dest.write_bytes(extract_embedded_png(markup))
+        else:
+            raise ValueError(f"Unknown derived POI icon kind: {kind}")
+    except urllib.error.HTTPError as http_error:
+        print(f"  {http_error.code} {url}", file=sys.stderr)
+        return False, url
+    except Exception as error:
+        print(f"  ERROR {url}: {error}", file=sys.stderr)
+        return False, url
+
+    return True, url
+
+
+# Slategray accent used by civic POI icons (school, library, building, …) in
+# protomaps' v4 sprite. We match it so the townhall blends in with its peers.
+# Values are (red, green, blue), keyed by sprite theme.
+_TOWNHALL_COLOR = {
+    "light": (135, 128, 171),
+    "dark": (118, 118, 127),
+}
+# Side length of the square glyph at sprite scale 1; it is multiplied by the
+# scale for @2x sheets, and the glyph's drawing coordinates are expressed on
+# a grid of this size.
+_TOWNHALL_LOGICAL_SIZE = 17
+
+
+def _render_townhall_glyph(size_px: int, color: tuple[int, int, int]) -> Image.Image:
+    """Draw the townhall glyph as a ``size_px`` square RGBA image.
+
+    The shapes are filled with ``color`` at full opacity on a transparent
+    background.
+    """
+    # Draw at 8× resolution and downsample with Lanczos so the pediment's
+    # diagonals come out anti-aliased; PIL's polygon fill is otherwise aliased.
+    super_factor = 8
+    canvas = size_px * super_factor
+    img = Image.new("RGBA", (canvas, canvas), (0, 0, 0, 0))
+    draw = ImageDraw.Draw(img)
+    fill = (*color, 255)
+
+    def s(v: float) -> float:
+        # Map a coordinate on the logical grid onto the oversized canvas.
+        return v * canvas / _TOWNHALL_LOGICAL_SIZE
+
+    # Triangular pediment.
+    draw.polygon([(s(8.5), s(1)), (s(15), s(6.5)), (s(2), s(6.5))], fill=fill)
+    # Horizontal band directly beneath the pediment.
+    draw.rectangle([(s(1), s(6.5)), (s(16), s(8.5))], fill=fill)
+    # Three columns, each 1.5 grid units wide.
+    for column_x in (3, 8, 13):
+        draw.rectangle([(s(column_x), s(8.5)), (s(column_x + 1.5), s(14))], fill=fill)
+    # Full-width base.
+    draw.rectangle([(s(0), s(14)), (s(17), s(15.5))], fill=fill)
+
+    return img.resize((size_px, size_px), Image.LANCZOS)
+
+
+def inject_townhall_sprite(sprites_dir: Path) -> None:
+    """Add a ``townhall`` icon to every downloaded sprite sheet.
+
+    Protomaps' v4 sprite omits `townhall` even though the basemap style
+    references it, so the glyph is appended here and MapLibre can resolve the
+    name natively at runtime. For each theme and pixel ratio whose JSON and
+    PNG both exist, the sheet is extended downwards by one glyph row and the
+    JSON gains a matching ``townhall`` entry. Both files are rewritten in
+    place.
+    """
+    variants = (("", 1), ("@2x", 2))
+
+    for theme in ("light", "dark"):
+        for suffix, scale in variants:
+            json_path = sprites_dir / f"{theme}{suffix}.json"
+            png_path = sprites_dir / f"{theme}{suffix}.png"
+            if not (json_path.exists() and png_path.exists()):
+                continue
+
+            manifest = json.loads(json_path.read_text())
+            sheet = Image.open(png_path).convert("RGBA")
+
+            side = _TOWNHALL_LOGICAL_SIZE * scale
+            glyph = _render_townhall_glyph(side, _TOWNHALL_COLOR[theme])
+
+            # The glyph goes in a new row below the existing sheet.
+            glyph_top = sheet.height
+            canvas_size = (max(sheet.width, side), glyph_top + side)
+            extended = Image.new("RGBA", canvas_size, (0, 0, 0, 0))
+            extended.paste(sheet, (0, 0))
+            extended.paste(glyph, (0, glyph_top))
+            extended.save(png_path, optimize=True)
+
+            manifest["townhall"] = {
+                "x": 0,
+                "y": glyph_top,
+                "width": side,
+                "height": side,
+                "pixelRatio": scale,
+            }
+            json_path.write_text(json.dumps(manifest))
+
+
 def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
@ -147,7 +385,7 @@ def main():
    # Skip already-downloaded files
    remaining = [(url, dest) for url, dest in tasks]

-    print(f"Downloading {len(remaining)} assets")
+    print(f"Downloading {len(remaining) + len(DERIVED_POI_ICON_PATHS)} assets")

    ok = 0
    fail = 0
@ -162,6 +400,18 @@ def main():
            else:
                fail += 1

+    for kind, source_path, dest_path in DERIVED_POI_ICON_PATHS:
+        success, _url = download_derived_poi_icon(
+            kind, source_path, poi_icons_dir / dest_path
+        )
+        if success:
+            ok += 1
+        else:
+            fail += 1
+
+    crop_poi_svg_icons(poi_icons_dir)
+    inject_townhall_sprite(sprites_dir)
+
    print(f"Done: {ok} downloaded, {fail} failed")


--- a/pipeline/download/noise.py
+++ b/pipeline/download/noise.py
@ -18,6 +18,7 @@ endpoint is broken for that coverage).

 import argparse
 import tempfile
+import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path

@ -29,8 +30,10 @@ from pyproj import Transformer
 from rasterio.merge import merge
 from rasterio.transform import rowcol

-# Noise sources: (label, column_name, WCS base URL, coverage ID, WCS version)
-# Road/rail work with WCS 1.0.0; airport requires WCS 2.0.1.
+# Noise sources:
+# (label, column_name, WCS base URL, coverage ID, WCS version, allow_missing_tiles)
+# Road/rail work with WCS 1.0.0; airport requires WCS 2.0.1 and returns 500
+# for many sparse/no-coverage tiles, which should become nulls.
 NOISE_SOURCES = [
    (
        "Road",
@ -38,6 +41,7 @@ NOISE_SOURCES = [
        "https://environment.data.gov.uk/spatialdata/road-noise-all-metrics-england-round-4/wcs",
        "Road_Noise_Lden_England_Round_4_All",
        "1.0.0",
+        False,
    ),
    (
        "Rail",
@ -45,6 +49,7 @@ NOISE_SOURCES = [
        "https://environment.data.gov.uk/spatialdata/noise-data/wcs",
        "Rail_Noise_Lden_England_Round_4_All",
        "1.0.0",
+        False,
    ),
    (
        "Airport",
@ -52,6 +57,7 @@ NOISE_SOURCES = [
        "https://environment.data.gov.uk/spatialdata/airport-noise-all-metrics-england-round-4/wcs",
        "dac9cba4-abe7-43bd-b8e9-8a83da52edd8__Airport_Noise_ALL_Lden",
        "2.0.1",
+        True,
    ),
 ]

@ -74,6 +80,14 @@ NATIVE_RESOLUTION = 10
 # and keeps download size ~100x smaller than native 10m)
 RESOLUTION = 100

+# Retry/split behaviour for slow Defra WCS requests. Some 100km eastern tiles
+# intermittently return 504s; smaller fallback requests usually succeed.
+# MAX_RETRIES is the number of attempts per tile; the pause before the next
+# attempt is RETRY_BACKOFF_SECONDS multiplied by the attempt number.
+MAX_RETRIES = 3
+RETRY_BACKOFF_SECONDS = 5
+# A tile side is halved only while it is longer than this (same units as the
+# tile bounds; presumably metres on the British National Grid, to be confirmed).
+MIN_TILE_SIZE = 25_000
+
+# Tile bounds as (min_e, min_n, max_e, max_n).
+type Tile = tuple[int, int, int, int]
+

 def _wcs_get_coverage_url(
    wcs_base: str,
@ -117,6 +131,53 @@ def _bng_from_latlon(lat: np.ndarray, lon: np.ndarray) -> tuple[np.ndarray, np.n
    return _TO_BNG.transform(lon, lat)  # pyproj takes (x=lon, y=lat)


+def _looks_like_tiff(response: httpx.Response) -> bool:
+    """Tell whether the response carries a TIFF, by header or by magic bytes."""
+    declared_type = response.headers.get("content-type", "")
+    if "tiff" in declared_type:
+        return True
+
+    # Little-endian ("II") and big-endian ("MM") TIFF signatures.
+    signature = response.content[:4]
+    return signature in (b"II*\x00", b"MM\x00*")
+
+
+def _fetch_tile_bytes(
+    wcs_base: str,
+    coverage_id: str,
+    min_e: int,
+    min_n: int,
+    max_e: int,
+    max_n: int,
+    wcs_version: str = "1.0.0",
+) -> bytes | None:
+    """Request one WCS tile and return its raw bytes.
+
+    Returns None when the response does not look like a GeoTIFF. HTTP error
+    statuses are raised by ``raise_for_status``.
+    """
+    request_url = _wcs_get_coverage_url(
+        wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
+    )
+    with httpx.Client(timeout=300, follow_redirects=True) as client:
+        response = client.get(request_url)
+        response.raise_for_status()
+
+    return response.content if _looks_like_tiff(response) else None
+
+
+def _split_tile(min_e: int, min_n: int, max_e: int, max_n: int) -> list[Tile]:
+    """Split a tile into up to four sub-tiles by halving its long sides.
+
+    An axis is halved only when its extent exceeds ``MIN_TILE_SIZE``;
+    zero-area pieces are dropped. Sub-tiles are ordered east-major, with
+    northings varying fastest.
+    """
+
+    def edges(low: int, high: int) -> list[int]:
+        if high - low > MIN_TILE_SIZE:
+            return [low, (low + high) // 2, high]
+        return [low, high]
+
+    east = edges(min_e, max_e)
+    north = edges(min_n, max_n)
+    return [
+        (e0, n0, e1, n1)
+        for e0, e1 in zip(east, east[1:])
+        for n0, n1 in zip(north, north[1:])
+        if e1 > e0 and n1 > n0
+    ]
+
+
+def _tile_path(tile_dir: Path, min_e: int, min_n: int, max_e: int, max_n: int) -> Path:
+    """Return the cache file path for a tile, named after its four bounds."""
+    file_name = f"tile_{min_e}_{min_n}_{max_e}_{max_n}.tif"
+    return tile_dir / file_name
+
+
 def _download_tile(
    wcs_base: str,
    coverage_id: str,
@ -124,30 +185,63 @@ def _download_tile(
    min_n: int,
    max_e: int,
    max_n: int,
-    tile_path: Path,
+    tile_dir: Path,
    wcs_version: str = "1.0.0",
-) -> Path | None:
-    """Download a single WCS tile. Returns path if successful, None otherwise."""
-    url = _wcs_get_coverage_url(
-        wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
+    split_failures: bool = True,
+) -> tuple[list[Path], list[Tile]]:
+    """Download a WCS tile, splitting on repeated server failures."""
+    tile_path = _tile_path(tile_dir, min_e, min_n, max_e, max_n)
+    if tile_path.exists() and tile_path.stat().st_size > 0:
+        return [tile_path], []
+
+    last_error: Exception | None = None
+    for attempt in range(1, MAX_RETRIES + 1):
+        try:
+            content = _fetch_tile_bytes(
+                wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
+            )
+            if content is None:
+                return [], []
+            tile_path.write_bytes(content)
+            return [tile_path], []
+        except (httpx.HTTPStatusError, httpx.TimeoutException, httpx.ConnectError) as e:
+            last_error = e
+            if attempt < MAX_RETRIES:
+                sleep_for = RETRY_BACKOFF_SECONDS * attempt
+                print(
+                    f"  Retrying tile ({min_e},{min_n})-({max_e},{max_n}) "
+                    f"after {type(e).__name__} ({attempt}/{MAX_RETRIES})"
+                )
+                time.sleep(sleep_for)
+
+    subtiles = _split_tile(min_e, min_n, max_e, max_n) if split_failures else []
+    if len(subtiles) > 1:
+        print(
+            f"  Splitting tile ({min_e},{min_n})-({max_e},{max_n}) "
+            f"into {len(subtiles)} smaller requests after: {last_error}"
+        )
+        paths: list[Path] = []
+        failures: list[Tile] = []
+        for e0, n0, e1, n1 in subtiles:
+            child_paths, child_failures = _download_tile(
+                wcs_base,
+                coverage_id,
+                e0,
+                n0,
+                e1,
+                n1,
+                tile_dir,
+                wcs_version,
+                split_failures,
+            )
+            paths.extend(child_paths)
+            failures.extend(child_failures)
+        return paths, failures
+
+    print(
+        f"  Failed to download tile ({min_e},{min_n})-({max_e},{max_n}): {last_error}"
    )
-    try:
-        with httpx.Client(timeout=300, follow_redirects=True) as client:
-            resp = client.get(url)
-            resp.raise_for_status()
-
-        content_type = resp.headers.get("content-type", "")
-        if "tiff" not in content_type and resp.content[:4] not in (
-            b"II*\x00",
-            b"MM\x00*",
-        ):
-            return None
-
-        tile_path.write_bytes(resp.content)
-        return tile_path
-    except (httpx.HTTPStatusError, httpx.TimeoutException, httpx.ConnectError) as e:
-        print(f"  Failed to download tile ({min_e},{min_n})-({max_e},{max_n}): {e}")
-        return None
+    return [], [(min_e, min_n, max_e, max_n)]


 def download_raster(
@ -156,6 +250,7 @@ def download_raster(
    coverage_id: str,
    label: str,
    wcs_version: str = "1.0.0",
+    allow_missing_tiles: bool = False,
 ) -> list[Path]:
    """Download noise GeoTIFF raster covering England, returning paths to saved files."""
    tiles = []
@ -168,13 +263,13 @@ def download_raster(
    print(
        f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)..."
    )
-    paths = []
+    paths: list[Path] = []
+    failures: list[Tile] = []
    completed = 0

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {}
        for min_e, min_n, max_e, max_n in tiles:
-            tile_path = tile_dir / f"tile_{min_e}_{min_n}.tif"
            fut = executor.submit(
                _download_tile,
                wcs_base,
@ -183,23 +278,41 @@ def download_raster(
                min_n,
                max_e,
                max_n,
-                tile_path,
+                tile_dir,
                wcs_version,
+                not allow_missing_tiles,
            )
            futures[fut] = (min_e, min_n)

        for fut in as_completed(futures):
            completed += 1
-            result = fut.result()
-            if result is not None:
-                paths.append(result)
+            result_paths, result_failures = fut.result()
+            paths.extend(result_paths)
+            failures.extend(result_failures)
            print(
-                f"\r  [{completed}/{len(tiles)}] Downloaded {len(paths)} valid tiles",
+                f"\r  [{completed}/{len(tiles)}] Downloaded {len(paths)} GeoTIFFs",
                end="",
                flush=True,
            )

-    print(f"\n[{label}] Downloaded {len(paths)}/{len(tiles)} tiles")
+    if failures:
+        preview = ", ".join(
+            f"({e0},{n0})-({e1},{n1})" for e0, n0, e1, n1 in failures[:5]
+        )
+        suffix = "..." if len(failures) > 5 else ""
+        if allow_missing_tiles:
+            print(
+                f"\n[{label}] WARNING: skipped {len(failures)} missing/no-data "
+                f"tile requests: {preview}{suffix}"
+            )
+            print(f"[{label}] Downloaded {len(paths)} GeoTIFFs from {len(tiles)} tiles")
+            return paths
+        raise RuntimeError(
+            f"[{label}] Failed to download {len(failures)} tile requests after "
+            f"retries and splitting: {preview}{suffix}"
+        )
+
+    print(f"\n[{label}] Downloaded {len(paths)} GeoTIFFs from {len(tiles)} tiles")
    return paths


@ -281,11 +394,23 @@ def main() -> None:
    result = postcodes.select("postcode")

    with tempfile.TemporaryDirectory() as tmp:
-        for label, col_name, wcs_base, coverage_id, wcs_version in NOISE_SOURCES:
+        for (
+            label,
+            col_name,
+            wcs_base,
+            coverage_id,
+            wcs_version,
+            allow_missing_tiles,
+        ) in NOISE_SOURCES:
            tile_dir = Path(tmp) / label.lower()
            tile_dir.mkdir()
            tile_paths = download_raster(
-                tile_dir, wcs_base, coverage_id, label, wcs_version
+                tile_dir,
+                wcs_base,
+                coverage_id,
+                label,
+                wcs_version,
+                allow_missing_tiles,
            )

            if not tile_paths:
--- a/pipeline/download/places.py
+++ b/pipeline/download/places.py
@ -6,6 +6,7 @@ Reuses the same england-latest.osm.pbf as pois.py.
 """

 import argparse
+import re
 from pathlib import Path

 import osmium
@ -44,11 +45,37 @@ _STATION_STRIP = (
    " underground station",
    " railway station",
    " dlr station",
+    " station dlr",
+    " dlr",
    " overground station",
    " tram stop",
    " station",
 )

+# Captures the three-character code that follows the "ZZDL" marker in an ATCO
+# id, e.g. "SHA" in "4900ZZDLSHA3"; ids without the marker do not match.
+_DLR_CODE_RE = re.compile(r"ZZDL([A-Z0-9]{3})")
+
+
+def _is_dlr_station(tags: dict[str, str]) -> bool:
+    """Report whether OSM tags describe a Docklands Light Railway station.
+
+    True when the network or operator mentions "docklands" or "dlr", or when
+    the name ends with " dlr" or contains " dlr " (all case-insensitive).
+    """
+    markers = ("docklands", "dlr")
+    for field in ("network", "operator"):
+        value = tags.get(field, "").lower()
+        if any(marker in value for marker in markers):
+            return True
+    station_name = tags.get("name", "").lower()
+    return station_name.endswith(" dlr") or " dlr " in station_name
+
+
+def _is_tram_station(tags: dict[str, str]) -> bool:
+    """Report whether OSM tags describe a tram stop rather than a DLR station.
+
+    DLR stations are never trams; otherwise a ``station=light_rail`` tag or a
+    network mentioning "tramlink"/"tram" marks the node as a tram stop.
+    """
+    if _is_dlr_station(tags):
+        return False
+    if tags.get("station", "") == "light_rail":
+        return True
+    network_name = tags.get("network", "").lower()
+    return any(word in network_name for word in ("tramlink", "tram"))
+

 def _station_display_name(name: str, tags: dict[str, str]) -> str:
    """Build a descriptive station name like 'Bank tube station'."""
@ -78,6 +105,96 @@ def _station_display_name(name: str, tags: dict[str, str]) -> str:
    return f"{name} {suffix}"


+def _station_name_score(name: str) -> tuple[int, int]:
+    """Rank a candidate station name; lower tuples are preferred.
+
+    The first element is 1 when the name ends in a generic station/DLR suffix
+    and 0 otherwise; the second is the name length, so shorter names win ties.
+    """
+    generic_suffixes = (
+        " underground station",
+        " tube station",
+        " dlr station",
+        " railway station",
+        " rail station",
+        " station dlr",
+        " station",
+        " dlr",
+    )
+    has_generic_suffix = name.lower().endswith(generic_suffixes)
+    return (int(has_generic_suffix), len(name))
+
+
+def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
+    """Extract station-level DLR destinations from NaPTAN access nodes.
+
+    Rows are grouped by the three-character code captured by ``_DLR_CODE_RE``.
+    Each group becomes one place dict whose position is the mean of the
+    group's coordinates and whose name is built from the best-scoring raw name.
+    Rows without a name, without coordinates, or outside the "Tube station" /
+    "Rail station" categories are ignored.
+
+    Returns:
+        Place dicts sorted by display name.
+
+    Raises:
+        ValueError: if the parquet file lacks any required column.
+    """
+    df = pl.read_parquet(naptan_path)
+    required = {"id", "name", "category", "lat", "lng"}
+    missing = required - set(df.columns)
+    if missing:
+        raise ValueError(f"NaPTAN file is missing columns: {sorted(missing)}")
+
+    rows: dict[str, dict] = {}
+    for row in df.iter_rows(named=True):
+        atco_id = str(row["id"] or "")
+        match = _DLR_CODE_RE.search(atco_id)
+        if not match:
+            continue
+        if row["category"] not in {"Tube station", "Rail station"}:
+            continue
+
+        raw_name = str(row["name"] or "")
+        if not raw_name:
+            continue
+
+        # Null coordinates would make float() raise TypeError and abort the
+        # whole run; a node without a position cannot contribute to the mean.
+        if row["lat"] is None or row["lng"] is None:
+            continue
+        lat = float(row["lat"])
+        lon = float(row["lng"])
+
+        code = match.group(1)
+        current = rows.get(code)
+        if current is None:
+            rows[code] = {
+                "raw_name": raw_name,
+                "lat_sum": lat,
+                "lon_sum": lon,
+                "count": 1,
+            }
+            continue
+
+        current["lat_sum"] += lat
+        current["lon_sum"] += lon
+        current["count"] += 1
+        # Keep whichever raw name scores lowest (no generic suffix, shortest).
+        if _station_name_score(raw_name) < _station_name_score(current["raw_name"]):
+            current["raw_name"] = raw_name
+
+    stations = []
+    for station in rows.values():
+        count = station["count"]
+        display_name = _station_display_name(station["raw_name"], {"network": "DLR"})
+        stations.append(
+            {
+                "name": display_name,
+                "place_type": "station",
+                "lat": station["lat_sum"] / count,
+                "lon": station["lon_sum"] / count,
+                "population": 0,
+                "travel_destination": True,
+            }
+        )
+
+    return sorted(stations, key=lambda station: station["name"])
+
+
+def _append_naptan_dlr_stations(places: list[dict], naptan_path: Path) -> int:
+    """Append NaPTAN DLR stations whose names are not already in *places*.
+
+    Names are compared case-insensitively via ``casefold``. *places* is mutated
+    in place; the return value is the number of stations appended.
+    """
+    seen = {str(place["name"]).casefold() for place in places}
+    size_before = len(places)
+    for candidate in _naptan_dlr_stations(naptan_path):
+        folded = candidate["name"].casefold()
+        if folded not in seen:
+            seen.add(folded)
+            places.append(candidate)
+    return len(places) - size_before
+
+
 class PlaceHandler(osmium.SimpleHandler):
    def __init__(self, progress: tqdm, england_polygon) -> None:
        super().__init__()
@ -145,14 +262,7 @@ class PlaceHandler(osmium.SimpleHandler):
        # Railway stations (tube, national rail, DLR, overground, Elizabeth line)
        if n.tags.get("railway") == "station":
            tags = dict(n.tags)
-            station_tag = tags.get("station", "")
-            network = tags.get("network", "").lower()
-            # Skip tram stops
-            if (
-                station_tag == "light_rail"
-                or "tramlink" in network
-                or "tram" in network
-            ):
+            if _is_tram_station(tags):
                return
            display_name = _station_display_name(name, tags)
            self._add(
@ -178,6 +288,11 @@ def main() -> None:
        required=True,
        help="England boundary GeoJSON file",
    )
+    parser.add_argument(
+        "--naptan",
+        type=Path,
+        help="Optional NaPTAN parquet file used to add DLR station destinations",
+    )
    args = parser.parse_args()

    pbf_file = args.pbf
@ -195,6 +310,9 @@ def main() -> None:
        handler.apply_file(str(pbf_file), locations=True)

    print(f"Extracted {len(handler.places):,} place nodes")
+    if args.naptan:
+        added = _append_naptan_dlr_stations(handler.places, args.naptan)
+        print(f"Added {added:,} DLR station destinations from NaPTAN")

    if handler.places:
        df = pl.DataFrame(handler.places)
--- a/pipeline/download/test_crime.py
+++ b/pipeline/download/test_crime.py
@ -0,0 +1,60 @@
+from zipfile import ZipFile
+
+from pipeline.download.crime import extract_csvs, parse_archives
+
+
+def test_parse_archives_reads_monthly_zip_links_only():
+    """Only YYYY-MM archive links are parsed; a malformed md5 is kept raw only."""
+    # Fixture mixes two monthly archives with a latest.zip link and a
+    # neighbourhood download that must both be ignored.
+    html = """
+    <p><a href="/data/archive/latest.zip">latest.zip</a></p>
+    <div class="archive crime">
+      <div class="download">
+        <i class="icon-file"></i> <span><a href="/data/archive/2026-03.zip">March 2026</a> (1.6&nbsp;GB)</span>
+        <p class="contained-range">Contains data from Apr 2023 to Mar 2026</p>
+        <p class="md5sum">6dde462489389445877f3988ef3f4f4b</p>
+      </div>
+      <div class="download">
+        <i class="icon-file"></i> <span><a href="/data/archive/2019-06.zip">June 2019</a> (1.6&nbsp;GB)</span>
+        <p class="contained-range">Contains data from Jul 2016 to Jun 2019</p>
+        <p class="md5sum">d6494297b24c1434bdb2504e95261bf8-100</p>
+      </div>
+    </div>
+    <div class="archive neighbourhood">
+      <div class="download">
+        <span><a href="/data/neighbourhood.zip">Neighbourhood crime</a> (2.2 MB)</span>
+        <small class="md5sum">6b80e2b97d87f6668b7a45953924d191</small>
+      </div>
+    </div>
+    """
+
+    archives = parse_archives(html, "https://data.police.uk/data/archive/")
+
+    assert [archive.filename for archive in archives] == [
+        "2026-03.zip",
+        "2019-06.zip",
+    ]
+    assert archives[0].url == "https://data.police.uk/data/archive/2026-03.zip"
+    assert archives[0].md5 == "6dde462489389445877f3988ef3f4f4b"
+    # The "-100" suffix makes the checksum invalid: md5 is None, raw text kept.
+    assert archives[1].md5 is None
+    assert archives[1].raw_md5 == "d6494297b24c1434bdb2504e95261bf8-100"
+
+
+def test_extract_csvs_preserves_existing_newer_files(tmp_path):
+    """Existing CSVs are not overwritten, and unsafe or non-CSV members are not extracted."""
+    zip_path = tmp_path / "older.zip"
+    output = tmp_path / "crime"
+    # Pre-create a file as if a newer archive had already been extracted.
+    existing = output / "2023-01" / "2023-01-city-street.csv"
+    existing.parent.mkdir(parents=True)
+    existing.write_text("newer\n")
+
+    with ZipFile(zip_path, "w") as archive:
+        archive.writestr("2023-01/2023-01-city-street.csv", "older\n")
+        archive.writestr("2022-12/2022-12-city-street.csv", "old\n")
+        # Path-traversal member and a non-CSV member.
+        archive.writestr("../escape.csv", "bad\n")
+        archive.writestr("notes.txt", "ignored\n")
+
+    extracted, skipped = extract_csvs(zip_path, output)
+
+    # One new file written, one existing file left alone.
+    assert extracted == 1
+    assert skipped == 1
+    assert existing.read_text() == "newer\n"
+    assert (output / "2022-12" / "2022-12-city-street.csv").read_text() == "old\n"
+    assert not (tmp_path / "escape.csv").exists()
--- a/pipeline/download/test_noise.py
+++ b/pipeline/download/test_noise.py
@ -0,0 +1,89 @@
+import httpx
+import pytest
+
+from pipeline.download import noise
+
+
+def test_download_tile_splits_after_retries(monkeypatch, tmp_path):
+    """A tile that keeps timing out is split into four quadrants that succeed."""
+    # One attempt only, so no retry sleep happens before splitting.
+    monkeypatch.setattr(noise, "MAX_RETRIES", 1)
+    monkeypatch.setattr(noise, "MIN_TILE_SIZE", 50)
+
+    def fake_fetch_tile_bytes(
+        wcs_base,
+        coverage_id,
+        min_e,
+        min_n,
+        max_e,
+        max_n,
+        wcs_version="1.0.0",
+    ):
+        # Fail any request wider or taller than 50 units; smaller ones succeed.
+        if max_e - min_e > 50 or max_n - min_n > 50:
+            raise httpx.TimeoutException("too large")
+        return b"II*\x00fake-tiff"
+
+    monkeypatch.setattr(noise, "_fetch_tile_bytes", fake_fetch_tile_bytes)
+
+    paths, failures = noise._download_tile("base", "coverage", 0, 0, 100, 100, tmp_path)
+
+    assert failures == []
+    assert len(paths) == 4
+    assert sorted(path.name for path in paths) == [
+        "tile_0_0_50_50.tif",
+        "tile_0_50_50_100.tif",
+        "tile_50_0_100_50.tif",
+        "tile_50_50_100_100.tif",
+    ]
+
+
+def test_download_tile_reports_unsplittable_failure(monkeypatch, tmp_path):
+    """A failing tile already at MIN_TILE_SIZE is reported as a failure, not split."""
+    monkeypatch.setattr(noise, "MAX_RETRIES", 1)
+    # The 100-unit tile is not larger than MIN_TILE_SIZE, so it cannot be split.
+    monkeypatch.setattr(noise, "MIN_TILE_SIZE", 100)
+
+    def fake_fetch_tile_bytes(*args, **kwargs):
+        raise httpx.ConnectError("offline")
+
+    monkeypatch.setattr(noise, "_fetch_tile_bytes", fake_fetch_tile_bytes)
+
+    paths, failures = noise._download_tile("base", "coverage", 0, 0, 100, 100, tmp_path)
+
+    assert paths == []
+    assert failures == [(0, 0, 100, 100)]
+
+
+def test_download_raster_tolerates_missing_tiles_when_allowed(monkeypatch, tmp_path):
+    """With allow_missing_tiles=True a failed tile yields an empty result, not an error."""
+    # Shrink the grid bounds and tile size to 0..100.
+    monkeypatch.setattr(noise, "BNG_MIN_E", 0)
+    monkeypatch.setattr(noise, "BNG_MAX_E", 100)
+    monkeypatch.setattr(noise, "BNG_MIN_N", 0)
+    monkeypatch.setattr(noise, "BNG_MAX_N", 100)
+    monkeypatch.setattr(noise, "TILE_SIZE", 100)
+
+    def fake_download_tile(*args, **kwargs):
+        # No paths, one failed tile.
+        return [], [(0, 0, 100, 100)]
+
+    monkeypatch.setattr(noise, "_download_tile", fake_download_tile)
+
+    paths = noise.download_raster(
+        tmp_path,
+        "base",
+        "coverage",
+        "Airport",
+        allow_missing_tiles=True,
+    )
+
+    assert paths == []
+
+
+def test_download_raster_raises_on_missing_strict_tiles(monkeypatch, tmp_path):
+    """Without allow_missing_tiles a failed tile raises a labelled RuntimeError."""
+    # Shrink the grid bounds and tile size to 0..100.
+    monkeypatch.setattr(noise, "BNG_MIN_E", 0)
+    monkeypatch.setattr(noise, "BNG_MAX_E", 100)
+    monkeypatch.setattr(noise, "BNG_MIN_N", 0)
+    monkeypatch.setattr(noise, "BNG_MAX_N", 100)
+    monkeypatch.setattr(noise, "TILE_SIZE", 100)
+
+    def fake_download_tile(*args, **kwargs):
+        # No paths, one failed tile.
+        return [], [(0, 0, 100, 100)]
+
+    monkeypatch.setattr(noise, "_download_tile", fake_download_tile)
+
+    with pytest.raises(RuntimeError, match=r"\[Road\] Failed to download"):
+        noise.download_raster(tmp_path, "base", "coverage", "Road")
--- a/pipeline/download/test_places.py
+++ b/pipeline/download/test_places.py
@ -0,0 +1,81 @@
+import polars as pl
+
+from pipeline.download.places import (
+    _is_dlr_station,
+    _is_tram_station,
+    _naptan_dlr_stations,
+    _station_display_name,
+)
+
+
+def test_dlr_light_rail_is_not_treated_as_tram():
+    """A light_rail station on the DLR network is kept and named as a DLR station."""
+    dlr_tags = {
+        "name": "Lewisham DLR",
+        "railway": "station",
+        "station": "light_rail",
+        "network": "Docklands Light Railway",
+    }
+
+    assert _is_dlr_station(dlr_tags)
+    assert not _is_tram_station(dlr_tags)
+    # Existing "DLR" / "Station DLR" suffixes are normalised to "DLR station".
+    assert _station_display_name("Lewisham DLR", dlr_tags) == "Lewisham DLR station"
+    assert (
+        _station_display_name("Tower Gateway Station DLR", dlr_tags)
+        == "Tower Gateway DLR station"
+    )
+
+
+def test_tram_light_rail_is_still_excluded():
+    """A light_rail station on a tram network is still classified as a tram stop."""
+    tram_tags = {
+        "name": "East Croydon",
+        "railway": "station",
+        "station": "light_rail",
+        "network": "London Trams",
+    }
+
+    assert not _is_dlr_station(tram_tags)
+    assert _is_tram_station(tram_tags)
+
+
+def test_naptan_dlr_stations_are_deduplicated_by_atco_code(tmp_path):
+    """Rows sharing a ZZDL code collapse to one station with averaged coordinates."""
+    naptan = tmp_path / "naptan.parquet"
+    # Two Shadwell rows share code SHA; the last two ids have no ZZDL marker
+    # (a bus stop and a ZZLU id) and must be ignored.
+    pl.DataFrame(
+        {
+            "id": [
+                "4900ZZDLSHA3",
+                "9400ZZDLSHA",
+                "4900ZZDLGRE1",
+                "490002076RV",
+                "4900ZZLUBNK",
+            ],
+            "name": [
+                "Shadwell DLR",
+                "Shadwell DLR Station",
+                "Greenwich Station",
+                "Tower Gateway Station DLR",
+                "Bank",
+            ],
+            "category": [
+                "Tube station",
+                "Tube station",
+                "Rail station",
+                "Bus stop",
+                "Tube station",
+            ],
+            "lat": [51.51156, 51.511693, 51.47794, 51.510575, 51.5131],
+            "lng": [-0.055595, -0.056643, -0.01442, -0.07514, -0.0894],
+        }
+    ).write_parquet(naptan)
+
+    stations = _naptan_dlr_stations(naptan)
+
+    # Output is sorted by display name.
+    assert [station["name"] for station in stations] == [
+        "Greenwich DLR station",
+        "Shadwell DLR station",
+    ]
+    shadwell = next(
+        station for station in stations if station["name"].startswith("Shadwell")
+    )
+    # Latitude is the mean of the two Shadwell rows.
+    assert shadwell["lat"] == (51.51156 + 51.511693) / 2
+    assert shadwell["place_type"] == "station"
+    assert shadwell["travel_destination"] is True
--- a/pipeline/download/transit_network.py
+++ b/pipeline/download/transit_network.py
@ -56,6 +56,7 @@ NR_AUTH_URL = "https://opendata.nationalrail.co.uk/authenticate"
 NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/timetable"

 USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"
+TRANSXCHANGE2GTFS_PACKAGE = "transxchange2gtfs@1.12.0"


 def _download_http(
@ -473,10 +474,50 @@ def convert_tfl_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
    download_naptan()

    print("Converting TfL TransXChange → GTFS...")
+    # The shim patches known packaging/runtime issues in the pinned npm package
+    # before loading its CLI from npx's temporary install.
+    shim_path = Path(__file__).with_name("transxchange2gtfs_shim.js")
    subprocess.run(
-        ["npx", "--yes", "transxchange2gtfs", str(txc_path), str(dest)],
+        [
+            "npx",
+            "--yes",
+            "--package",
+            TRANSXCHANGE2GTFS_PACKAGE,
+            "sh",
+            "-c",
+            "\n".join(
+                [
+                    'bin="$(command -v transxchange2gtfs)"',
+                    'script="$(readlink -f "$bin")"',
+                    'pkg_dir="$(dirname "$(dirname "$script")")"',
+                    'shim="$1"',
+                    "shift",
+                    'exec node "$shim" "$pkg_dir" "$@"',
+                ]
+            ),
+            "transxchange2gtfs",
+            str(shim_path.resolve()),
+            str(txc_path.resolve()),
+            str(dest.resolve()),
+        ],
        check=True,
    )
+    required_files = {
+        "agency.txt",
+        "calendar.txt",
+        "calendar_dates.txt",
+        "routes.txt",
+        "stop_times.txt",
+        "stops.txt",
+        "trips.txt",
+    }
+    if not dest.exists() or not zipfile.is_zipfile(dest):
+        raise RuntimeError(f"transxchange2gtfs did not create a valid GTFS zip: {dest}")
+    with zipfile.ZipFile(dest) as z:
+        missing = required_files - set(z.namelist())
+    if missing:
+        missing_str = ", ".join(sorted(missing))
+        raise RuntimeError(f"TfL GTFS zip is missing required files: {missing_str}")
    size_mb = dest.stat().st_size / (1024 * 1024)
    print(f"  Saved to {dest} ({size_mb:.1f} MB)")
    return dest
--- a/pipeline/download/transxchange2gtfs_shim.js
+++ b/pipeline/download/transxchange2gtfs_shim.js
@ -0,0 +1,76 @@
+#!/usr/bin/env node
+"use strict";
+
+const fs = require("fs");
+const path = require("path");
+const { createRequire } = require("module");
+
+// Arguments after `node <shim>`: the package directory, then the converter's
+// own arguments (at least an input path and an output path).
+const [pkgDirArg, ...converterArgs] = process.argv.slice(2);
+
+if (!pkgDirArg || converterArgs.length < 2) {
+  console.error(
+    "Usage: transxchange2gtfs_shim.js <package-dir> <input...> <output>",
+  );
+  // Exit status 2 for a usage error.
+  process.exit(2);
+}
+
+// Absolute package root; every patched file path is joined onto this.
+const pkgDir = path.resolve(pkgDirArg);
+
+/**
+ * Replace the first occurrence of `before` with `after` in a package file.
+ *
+ * A file that no longer contains `before` but already contains `after` is
+ * treated as patched and left untouched, so the shim can run repeatedly
+ * against the same install. A file containing neither string throws.
+ *
+ * @param {string} relativePath File path relative to the package directory.
+ * @param {string} before Exact text expected in the unpatched file.
+ * @param {string} after Replacement text, inserted verbatim.
+ * @throws {Error} When neither `before` nor `after` is found in the file.
+ */
+function replaceOnce(relativePath, before, after) {
+  const file = path.join(pkgDir, relativePath);
+  const original = fs.readFileSync(file, "utf8");
+  if (original.includes(before)) {
+    // A replacer function inserts `after` literally; a plain string argument
+    // would have `$&`, `$1`, `$$` and similar interpreted as patterns.
+    fs.writeFileSync(file, original.replace(before, () => after));
+    return;
+  }
+  if (!original.includes(after)) {
+    throw new Error(`Could not patch ${relativePath}: expected text not found`);
+  }
+}
+
+// The published 1.12.0 package has a few compatibility issues with current
+// TfL TransXChange exports:
+// - the bin script points at dist/src/cli.js, but the package ships dist/cli.js
+// - the compiled date-holidays import expects a synthetic default export
+// - some TfL journeys reference timing links without matching route-link geometry
+//
+// GTFS shapes are optional for R5 routing. Clear shape references and omit
+// shapes.txt so missing route geometry does not drop otherwise usable trips.
+// The final patch also removes the transfers.txt entry from the output.
+// Apply every source patch to the installed package, in order. Each call
+// rewrites one exact snippet of the compiled output via replaceOnce.
+function patchPackage() {
+  // Tolerate a missing route link: add 0 instead of reading .Distance on it.
+  replaceOnce(
+    "dist/transxchange/TransXChangeJourneyStream.js",
+    "distanceSoFarM += routeLink.Distance;",
+    "distanceSoFarM += routeLink ? routeLink.Distance : 0;",
+  );
+  // Emit an empty string where the md5 of route/route-link ids was written
+  // (presumably the trip's shape id; confirm against TripsStream.js).
+  replaceOnce(
+    "dist/gtfs/TripsStream.js",
+    "(0, crypto_1.createHash)('md5').update(JSON.stringify({ routeId: journey.route, routeLinkSeq: journey.routeLinkIds })).digest(\"hex\"));",
+    "\"\");",
+  );
+  // Emit an empty string in place of stop.shapeDistTraveled.
+  replaceOnce(
+    "dist/gtfs/StopTimesStream.js",
+    "stop.shapeDistTraveled, stop.exactTime ? \"1\" : \"0\");",
+    "\"\", stop.exactTime ? \"1\" : \"0\");",
+  );
+  // Drop the shapes.txt entry that follows stops.txt.
+  replaceOnce(
+    "dist/Container.js",
+    "\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex)),\n            \"shapes.txt\": journeyStream.pipe(new ShapesStream_1.ShapesStream())",
+    "\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
+  );
+  // Drop the transfers.txt entry between routes.txt and stops.txt.
+  replaceOnce(
+    "dist/Container.js",
+    "\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n            \"transfers.txt\": transxchange.pipe(new TransfersStream_1.TransfersStream(naptanIndex, locationIndex)),\n            \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
+    "\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n            \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
+  );
+}
+
+// Patch the files on disk before any package module is loaded.
+patchPackage();
+
+// Resolve date-holidays from the package's own location and give it a
+// `default` property pointing at itself when it has none.
+// NOTE(review): this relies on the CLI later receiving the same cached module
+// object; confirm against Node's module caching behaviour.
+const pkgRequire = createRequire(path.join(pkgDir, "package.json"));
+const Holidays = pkgRequire("date-holidays");
+if (!Holidays.default) {
+  Holidays.default = Holidays;
+}
+
+// Rewrite argv so the CLI sees only its own arguments, then load it directly
+// from dist/cli.js.
+process.argv = [process.argv[0], "transxchange2gtfs", ...converterArgs];
+require(path.join(pkgDir, "dist", "cli.js"));