Test changes

2026-05-09 11:35:38 +01:00 · 2026-05-09 11:35:38 +01:00 · be02fc16bb
commit be02fc16bb
parent 4c95815dc8
41 changed files with 4224 additions and 759 deletions
--- a/pipeline/download/map_assets.py
+++ b/pipeline/download/map_assets.py
@ -1,9 +1,15 @@
 import argparse
+import base64
+import json
+import re
 import sys
 import urllib.request
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from io import BytesIO
 from pathlib import Path

+from PIL import Image, ImageDraw
+
 from pipeline.transform.transform_poi import NAPTAN_EMOJIS, _CATEGORIES

 GLYPHS_BASE = "https://protomaps.github.io/basemaps-assets/fonts"
@ -14,53 +20,80 @@ POI_ICON_BASE = "https://geolytix.github.io/MapIcons"
 # Font stacks used by @protomaps/basemaps with lang='en'
 FONT_STACKS = ["Noto Sans Regular", "Noto Sans Italic", "Noto Sans Medium"]

-# Fallback emoji not in any category
-_FALLBACK_EMOJIS = ["📍"]
-
 POI_ICON_PATHS = [
-    "asda/asda_express_24px.svg",
-    "asda/asda_green_basket_24px.svg",
-    "asda/asda_green_trolley_24px.svg",
-    "asda/asda_living_24px.svg",
-    "asda/asda_pfs_24px.svg",
-    "asda/asda_primary.svg",
-    "asda/asda_superstore_green_trolley_24px.svg",
-    "brands/aldi_24px.svg",
-    "brands/amazon_fresh_alt_24px.svg",
-    "brands/booths_24px.svg",
-    "brands/budgens_24px.svg",
-    "brands/centra_24px.svg",
-    "brands/cook.svg",
-    "brands/coop_24px.svg",
-    "brands/costco_24px.svg",
-    "brands/dunnes_stores_24px.svg",
-    "brands/farmfoods_updated_24px.svg",
-    "brands/heron_24px.svg",
-    "brands/iceland_24px.svg",
-    "brands/iceland_food_warehouse_24px.svg",
-    "brands/lidl_24px.svg",
-    "brands/little_waitrose_24px.svg",
-    "brands/makro_24px.svg",
-    "brands/mns_24px.svg",
-    "brands/mns_food_24px.svg",
-    "brands/mns_high_street_24px.svg",
-    "brands/mns_hospital_24px.svg",
-    "brands/mns_moto_24px.svg",
-    "brands/mns_outlet_24px.svg",
-    "brands/morrisons_24px.svg",
-    "brands/morrisons_daily_24px.svg",
-    "brands/sainsburys_24px.svg",
-    "brands/sainsburys_local_24px.svg",
-    "brands/spar_24px.svg",
-    "brands/tesco_24px.svg",
-    "brands/tesco_express_24px.svg",
-    "brands/tesco_extra_24px.svg",
-    "brands/waitrose_24px.svg",
-    "brands/wholefoods_24px.svg",
-    "logos/planet_organic_24px.svg",
+    "brands_2023/supermarkets/farmfoods.svg",
+    "brands_2023/supermarkets/heron_foods.svg",
+    "brands_2023/supermarkets/little_waitrose.svg",
+    "brands_2024/amazon_fresh.svg",
+    "brands_2024/booths.svg",
+    "brands_2024/budgens.svg",
+    "brands_2024/cook.svg",
+    "brands_2024/dunnes_stores.svg",
+    "brands_2024/iceland.svg",
+    "brands_2024/makro.svg",
+    "brands_2024/mns.svg",
+    "brands_2024/morrisons_daily.svg",
+    "brands_2024/sainsburys_local.svg",
+    "brands_2024/wholefoods.svg",
+    "logos/aldi.svg",
+    "logos/asda.svg",
+    "logos/centra.svg",
+    "logos/coop.svg",
+    "logos/lidl.svg",
+    "logos/morrisons.svg",
+    "logos/planet_organic.svg",
+    "logos/sainsburys.svg",
+    "logos/spar.svg",
+    "logos/tesco.svg",
+    "logos/tesco_express.svg",
+    "logos/tesco_extra.svg",
+    "logos/waitrose.svg",
    "public_transport/london_tube.svg",
+    "visuals/mns.svg",
 ]

+# Icons that are derived from a downloaded marker rather than copied verbatim.
+# Each entry is (kind, source path under POI_ICON_BASE, destination path under
+# the POI icons directory); `kind` selects the conversion performed by
+# download_derived_poi_icon.
+DERIVED_POI_ICON_PATHS = [
+    ("costco_logo", "brands/costco.svg", "logos/costco.svg"),
+    (
+        "embedded_png",
+        "brands/iceland_food_warehouse_24px.svg",
+        "logos/the_food_warehouse.png",
+    ),
+]
+
+# Crop rectangle per icon path, as (x, y, width, height). crop_poi_svg_icons
+# passes each tuple to set_svg_geometry, which writes it into the viewBox.
+POI_ICON_SVG_CROPS = {
+    "brands_2023/supermarkets/farmfoods.svg": (1.293, 7.314, 15.48, 3.293),
+    "brands_2023/supermarkets/heron_foods.svg": (0.062, 6.68, 17.995, 5.325),
+    "brands_2023/supermarkets/little_waitrose.svg": (0.916, 5.645, 16.365, 6.719),
+    "brands_2024/amazon_fresh.svg": (3.817, 1.646, 16.367, 16.358),
+    "brands_2024/booths.svg": (1.456, 7.143, 15.313, 3.512),
+    "brands_2024/budgens.svg": (2.251, 2.278, 13.6, 13.612),
+    "brands_2024/cook.svg": (5.028, 5.493, 13.945, 9.648),
+    "brands_2024/dunnes_stores.svg": (4.375, 7.732, 15.249, 5.055),
+    "brands_2024/iceland.svg": (1.136, 6.823, 16.067, 4.302),
+    "brands_2024/makro.svg": (4.411, 6.098, 16.397, 5.428),
+    "brands_2024/mns.svg": (4.042, 6.986, 16.171, 6.724),
+    "brands_2024/morrisons_daily.svg": (3.341, 4.414, 17.317, 8.248),
+    "brands_2024/sainsburys_local.svg": (4.58, 1.61, 14.84, 14.849),
+    "brands_2024/wholefoods.svg": (4.17, 2.193, 15.659, 15.668),
+    "logos/aldi.svg": (4.813, 2.563, 14.374, 14.383),
+    "logos/asda.svg": (3.91, 7.135, 16.181, 5.442),
+    "logos/centra.svg": (3.36, 7.35, 17.28, 4.651),
+    "logos/coop.svg": (6.407, 4.658, 11.187, 11.793),
+    "logos/costco.svg": (70.61, 144.908, 256.67, 85.825),
+    "logos/lidl.svg": (4.938, 2.973, 13.985, 13.985),
+    "logos/morrisons.svg": (5.231, 2.985, 13.538, 13.398),
+    "logos/planet_organic.svg": (5.528, 3.564, 12.943, 12.943),
+    "logos/sainsburys.svg": (7.502, 3.572, 8.996, 12.646),
+    "logos/spar.svg": (4.933, 2.968, 14.133, 13.853),
+    "logos/tesco.svg": (4.338, 6.865, 15.324, 5.359),
+    "logos/tesco_express.svg": (5.231, 5.933, 13.538, 8.345),
+    "logos/tesco_extra.svg": (4.933, 5.775, 14.133, 8.519),
+    "logos/waitrose.svg": (5.528, 6.09, 12.943, 9.855),
+}
+
+# Value written to the longer of width/height on every rewritten SVG; the
+# shorter side is scaled proportionally by svg_intrinsic_size.
+POI_ICON_SVG_INTRINSIC_MAX = 512
+

 def collect_twemoji_codes() -> list[str]:
    """Derive twemoji hex codes from transform_poi categories.
@ -76,9 +109,6 @@ def collect_twemoji_codes() -> list[str]:
    for emoji in NAPTAN_EMOJIS.values():
        emojis.add(emoji)

-    for emoji in _FALLBACK_EMOJIS:
-        emojis.add(emoji)
-
    # First codepoint hex, matching frontend logic
    return sorted({f"{ord(e[0]):x}" for e in emojis})

@ -97,6 +127,214 @@ def download_file(url: str, dest: Path) -> tuple[bool, str]:
        return False, url


+def download_text(url: str) -> str:
+    """Fetch `url` and return the response body decoded as UTF-8."""
+    with urllib.request.urlopen(url) as reply:
+        payload = reply.read()
+    return payload.decode("utf-8")
+
+
+def build_costco_logo(marker_svg: str) -> str:
+    """Wrap the logo group of the Costco marker SVG in a standalone SVG document.
+
+    Raises ValueError when either anchor string is missing from `marker_svg`.
+    """
+    group_open = marker_svg.find('<g><path d=" M 316.312')
+    tail = marker_svg.rfind("</g></g></svg>")
+    if min(group_open, tail) < 0:
+        raise ValueError("Costco marker SVG layout changed")
+
+    # Keep the first "</g>" of the tail so the extracted group stays closed.
+    group_close = tail + len("</g>")
+    document_lines = [
+        '<?xml version="1.0" encoding="UTF-8"?>',
+        '<svg xmlns="http://www.w3.org/2000/svg" viewBox="70 145 260 90" '
+        'width="260pt" height="90pt" preserveAspectRatio="xMidYMid meet">',
+        marker_svg[group_open:group_close],
+        "</svg>",
+    ]
+    return "\n".join(document_lines) + "\n"
+
+
+def trim_white_png(png_bytes: bytes) -> bytes:
+    """Turn near-white pixels transparent, crop to what remains, re-encode as PNG."""
+    picture = Image.open(BytesIO(png_bytes)).convert("RGBA")
+    access = picture.load()
+
+    # A pixel counts as near-white when R, G and B are all above 245; its
+    # colour is kept and only the alpha is forced to zero.
+    for row in range(picture.height):
+        for col in range(picture.width):
+            r, g, b, _alpha = access[col, row]
+            if min(r, g, b) > 245:
+                access[col, row] = (r, g, b, 0)
+
+    # Bounding box of the non-zero alpha; falsy when nothing opaque is left.
+    bounds = picture.getchannel("A").getbbox()
+    if bounds:
+        picture = picture.crop(bounds)
+
+    buffer = BytesIO()
+    picture.save(buffer, format="PNG")
+    return buffer.getvalue()
+
+
+def extract_embedded_png(marker_svg: str) -> bytes:
+    """Decode the first base64 payload found in the SVG and return it trimmed.
+
+    Raises ValueError when the SVG text contains no `base64,` payload.
+    """
+    found = re.search(r"base64,([^\"']+)", marker_svg)
+    if found is None:
+        raise ValueError("POI marker SVG did not contain an embedded PNG")
+    encoded = found.group(1)
+    raw_png = base64.b64decode(encoded)
+    return trim_white_png(raw_png)
+
+
+def svg_intrinsic_size(width: float, height: float) -> tuple[int, int]:
+    """Scale (width, height) so the longer side equals POI_ICON_SVG_INTRINSIC_MAX.
+
+    Non-positive dimensions fall back to a square of that size. The shorter
+    side is rounded and never drops below 1.
+    """
+    longest = POI_ICON_SVG_INTRINSIC_MAX
+    if width <= 0 or height <= 0:
+        return (longest, longest)
+    if width >= height:
+        shorter = max(1, round(longest * height / width))
+        return (longest, shorter)
+    shorter = max(1, round(longest * width / height))
+    return (shorter, longest)
+
+
+def _set_svg_root_attr(svg_text: str, name: str, value: str) -> str:
+    """Set attribute `name` to `value` on the first `<svg ...>` opening tag only."""
+    root = re.search(r"<svg\b[^>]*>", svg_text)
+    if root is None:
+        # No root tag to edit; leave the text untouched.
+        return svg_text
+
+    tag = root.group(0)
+    assignment = f'{name}="{value}"'
+    # The lookbehind stops `width` from matching inside `stroke-width` or a
+    # namespaced attribute such as `foo:width`.
+    attr_re = re.compile(rf'(?<![\w:-]){re.escape(name)}="[^"]*"')
+    if attr_re.search(tag):
+        # A callable replacement keeps `assignment` literal.
+        new_tag = attr_re.sub(lambda _m: assignment, tag, count=1)
+    else:
+        new_tag = f"<svg {assignment}{tag[len('<svg'):]}"
+    return svg_text[: root.start()] + new_tag + svg_text[root.end() :]
+
+
+def set_svg_geometry(svg_text: str, crop: tuple[float, float, float, float]) -> str:
+    """Return `svg_text` with its root viewBox, width and height rewritten.
+
+    `crop` is (x, y, width, height) and becomes the viewBox. The width and
+    height attributes are set to the values from svg_intrinsic_size for that
+    crop. Only the root `<svg>` tag is edited: the previous document-wide
+    substitution could overwrite the first `stroke-width="..."` (or a child
+    element's width/height) when the root tag lacked the attribute, and then
+    never added the attribute to the root at all.
+    """
+    x, y, width, height = crop
+    view_box = f"{x:g} {y:g} {width:g} {height:g}"
+    intrinsic_width, intrinsic_height = svg_intrinsic_size(width, height)
+
+    svg_text = _set_svg_root_attr(svg_text, "viewBox", view_box)
+    svg_text = _set_svg_root_attr(svg_text, "width", str(intrinsic_width))
+    svg_text = _set_svg_root_attr(svg_text, "height", str(intrinsic_height))
+    return svg_text
+
+
+def get_svg_view_box(svg_text: str) -> tuple[float, float, float, float] | None:
+    """Parse the first `viewBox="..."` attribute found in `svg_text`.
+
+    Returns the four numbers as floats, or None when there is no viewBox, it
+    does not hold exactly four whitespace/comma separated tokens, or a token
+    is not a valid number.
+    """
+    match = re.search(r'viewBox="([^"]+)"', svg_text)
+    if not match:
+        return None
+    tokens = [tok for tok in re.split(r"[\s,]+", match.group(1).strip()) if tok]
+    if len(tokens) != 4:
+        return None
+    try:
+        x, y, width, height = (float(tok) for tok in tokens)
+    except ValueError:
+        # A malformed viewBox (e.g. "0 0 24px 24px") is reported as absent
+        # instead of raising out of the caller's loop over every icon.
+        return None
+    return (x, y, width, height)
+
+
+def crop_poi_svg_icons(poi_icons_dir: Path) -> None:
+    """Rewrite the downloaded POI SVGs in place with normalised geometry."""
+    # Pass 1: apply the configured crop to each listed icon that exists.
+    for relative_path, crop_box in POI_ICON_SVG_CROPS.items():
+        icon_file = poi_icons_dir / relative_path
+        if icon_file.exists():
+            markup = icon_file.read_text(encoding="utf-8")
+            if relative_path == "brands_2024/dunnes_stores.svg":
+                # Swap the two near-white fills of this icon for a dark one.
+                for pale_fill in ('fill="#fffcfc"', 'fill="#fcfcfc"'):
+                    markup = markup.replace(pale_fill, 'fill="#111111"')
+            icon_file.write_text(set_svg_geometry(markup, crop_box), encoding="utf-8")
+
+    # Pass 2: every SVG under the directory (cropped or not) that has a
+    # parseable viewBox gets width/height regenerated from that viewBox.
+    for icon_file in poi_icons_dir.rglob("*.svg"):
+        markup = icon_file.read_text(encoding="utf-8")
+        current_box = get_svg_view_box(markup)
+        if current_box:
+            icon_file.write_text(
+                set_svg_geometry(markup, current_box), encoding="utf-8"
+            )
+
+
+def download_derived_poi_icon(
+    kind: str, source_path: str, dest: Path
+) -> tuple[bool, str]:
+    """Fetch a marker SVG and write the asset derived from it to `dest`.
+
+    Returns (succeeded, source URL). Failures, including an unknown `kind`,
+    are reported on stderr and signalled through the flag instead of raised.
+    """
+    url = f"{POI_ICON_BASE}/{source_path}"
+    dest.parent.mkdir(parents=True, exist_ok=True)
+
+    try:
+        marker_svg = download_text(url)
+        match kind:
+            case "costco_logo":
+                dest.write_text(build_costco_logo(marker_svg), encoding="utf-8")
+            case "embedded_png":
+                dest.write_bytes(extract_embedded_png(marker_svg))
+            case _:
+                raise ValueError(f"Unknown derived POI icon kind: {kind}")
+    except urllib.error.HTTPError as http_err:
+        print(f"  {http_err.code} {url}", file=sys.stderr)
+        return False, url
+    except Exception as err:
+        print(f"  ERROR {url}: {err}", file=sys.stderr)
+        return False, url
+    return True, url
+
+
+# Slategray accent used by civic POI icons (school, library, building, …) in
+# protomaps' v4 sprite. We match it so the townhall blends in with its peers.
+# Keys are the sprite theme names; values are RGB triples.
+_TOWNHALL_COLOR = {
+    "light": (135, 128, 171),
+    "dark": (118, 118, 127),
+}
+# Side length of the glyph's square design grid. The drawing coordinates in
+# _render_townhall_glyph are expressed on this grid, and the sprite entry is
+# this many pixels wide at scale 1 (doubled for the @2x sheet).
+_TOWNHALL_LOGICAL_SIZE = 17
+
+
+def _render_townhall_glyph(size_px: int, color: tuple[int, int, int]) -> Image.Image:
+    """Draw the townhall glyph as a `size_px` x `size_px` RGBA image.
+
+    The shapes are filled with `color` at full opacity on a transparent
+    background.
+    """
+    # Draw at 8× resolution and downsample with Lanczos so the pediment's
+    # diagonals come out anti-aliased; PIL's polygon fill is otherwise aliased.
+    super_factor = 8
+    canvas = size_px * super_factor
+    img = Image.new("RGBA", (canvas, canvas), (0, 0, 0, 0))
+    draw = ImageDraw.Draw(img)
+    fill = (*color, 255)
+
+    # Map a coordinate on the _TOWNHALL_LOGICAL_SIZE grid to canvas pixels.
+    def s(v: float) -> float:
+        return v * canvas / _TOWNHALL_LOGICAL_SIZE
+
+    # Triangular pediment.
+    draw.polygon([(s(8.5), s(1)), (s(15), s(6.5)), (s(2), s(6.5))], fill=fill)
+    # Horizontal band directly beneath the pediment.
+    draw.rectangle([(s(1), s(6.5)), (s(16), s(8.5))], fill=fill)
+    # Three columns, each 1.5 grid units wide.
+    for column_x in (3, 8, 13):
+        draw.rectangle([(s(column_x), s(8.5)), (s(column_x + 1.5), s(14))], fill=fill)
+    # Full-width base.
+    draw.rectangle([(s(0), s(14)), (s(17), s(15.5))], fill=fill)
+
+    return img.resize((size_px, size_px), Image.LANCZOS)
+
+
+def inject_townhall_sprite(sprites_dir: Path) -> None:
+    """Append a townhall glyph to each downloaded sprite sheet.
+
+    Protomaps' v4 sprite omits `townhall` even though the basemap style
+    references it; we add the icon here so MapLibre can resolve the name
+    natively at runtime.
+
+    For every theme/scale pair whose `.json` and `.png` both exist, the glyph
+    is pasted on a new row below the existing sheet and registered in the
+    manifest. Sheets whose manifest already has a `townhall` entry are left
+    alone, so re-running over existing sprites does not grow them.
+    """
+    for theme in ("light", "dark"):
+        color = _TOWNHALL_COLOR[theme]
+        for suffix, scale in (("", 1), ("@2x", 2)):
+            json_path = sprites_dir / f"{theme}{suffix}.json"
+            png_path = sprites_dir / f"{theme}{suffix}.png"
+            if not json_path.exists() or not png_path.exists():
+                continue
+
+            manifest = json.loads(json_path.read_text(encoding="utf-8"))
+            if "townhall" in manifest:
+                # Already injected by an earlier run (or shipped upstream):
+                # appending again would add another glyph row every run.
+                continue
+
+            # Close the source file before the same path is overwritten below.
+            with Image.open(png_path) as source:
+                sheet = source.convert("RGBA")
+
+            glyph_size = _TOWNHALL_LOGICAL_SIZE * scale
+            glyph = _render_townhall_glyph(glyph_size, color)
+
+            new_width = max(sheet.width, glyph_size)
+            new_height = sheet.height + glyph_size
+            extended = Image.new("RGBA", (new_width, new_height), (0, 0, 0, 0))
+            extended.paste(sheet, (0, 0))
+            extended.paste(glyph, (0, sheet.height))
+            extended.save(png_path, optimize=True)
+
+            manifest["townhall"] = {
+                "x": 0,
+                "y": sheet.height,
+                "width": glyph_size,
+                "height": glyph_size,
+                "pixelRatio": scale,
+            }
+            json_path.write_text(json.dumps(manifest), encoding="utf-8")
+
+
 def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
@ -147,7 +385,7 @@ def main():
    # Skip already-downloaded files
    remaining = [(url, dest) for url, dest in tasks]

-    print(f"Downloading {len(remaining)} assets")
+    print(f"Downloading {len(remaining) + len(DERIVED_POI_ICON_PATHS)} assets")

    ok = 0
    fail = 0
@ -162,6 +400,18 @@ def main():
            else:
                fail += 1

+    for kind, source_path, dest_path in DERIVED_POI_ICON_PATHS:
+        success, _url = download_derived_poi_icon(
+            kind, source_path, poi_icons_dir / dest_path
+        )
+        if success:
+            ok += 1
+        else:
+            fail += 1
+
+    crop_poi_svg_icons(poi_icons_dir)
+    inject_townhall_sprite(sprites_dir)
+
    print(f"Done: {ok} downloaded, {fail} failed")


--- a/pipeline/download/places.py
+++ b/pipeline/download/places.py
@ -6,6 +6,7 @@ Reuses the same england-latest.osm.pbf as pois.py.
 """

 import argparse
+import re
 from pathlib import Path

 import osmium
@ -44,11 +45,37 @@ _STATION_STRIP = (
    " underground station",
    " railway station",
    " dlr station",
+    " station dlr",
+    " dlr",
    " overground station",
    " tram stop",
    " station",
 )

+# Captures the three characters that follow "ZZDL" in a NaPTAN id (e.g. "SHA"
+# in "4900ZZDLSHA3"); _naptan_dlr_stations groups rows by this captured code.
+_DLR_CODE_RE = re.compile(r"ZZDL([A-Z0-9]{3})")
+
+
+def _is_dlr_station(tags: dict[str, str]) -> bool:
+    """Return True when the tags point at a Docklands Light Railway station.
+
+    A station qualifies when its network or operator mentions "docklands" or
+    "dlr", or when its name ends in " dlr" or contains " dlr " (all compared
+    case-insensitively).
+    """
+    lowered = {
+        key: tags.get(key, "").lower() for key in ("name", "network", "operator")
+    }
+    station_name = lowered["name"]
+    named_dlr = station_name.endswith(" dlr") or " dlr " in station_name
+    affiliated = any(
+        marker in lowered[key]
+        for key in ("network", "operator")
+        for marker in ("docklands", "dlr")
+    )
+    return affiliated or named_dlr
+
+
+def _is_tram_station(tags: dict[str, str]) -> bool:
+    """Return True for light-rail or tram stations that are not DLR stations."""
+    if _is_dlr_station(tags):
+        return False
+    if tags.get("station", "") == "light_rail":
+        return True
+    network_name = tags.get("network", "").lower()
+    # "tram" is a substring of "tramlink", so one containment test covers both.
+    return "tram" in network_name
+

 def _station_display_name(name: str, tags: dict[str, str]) -> str:
    """Build a descriptive station name like 'Bank tube station'."""
@ -78,6 +105,96 @@ def _station_display_name(name: str, tags: dict[str, str]) -> str:
    return f"{name} {suffix}"


+def _station_name_score(name: str) -> tuple[int, int]:
+    """Return a sort key for a candidate station name.
+
+    The first element is 1 when the lower-cased name ends in one of the
+    station-type suffixes below and 0 otherwise; the second is the length of
+    the name. _naptan_dlr_stations keeps the name with the smaller key.
+    """
+    generic_endings = (
+        " underground station",
+        " tube station",
+        " dlr station",
+        " railway station",
+        " rail station",
+        " station dlr",
+        " station",
+        " dlr",
+    )
+    has_generic_ending = name.lower().endswith(generic_endings)
+    return (1 if has_generic_ending else 0, len(name))
+
+
+def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
+    """Extract station-level DLR destinations from NaPTAN access nodes.
+
+    Rows are kept when their id contains a "ZZDL" code, their category is
+    "Tube station" or "Rail station", and their name is non-empty. Rows that
+    share a code are merged into one station located at the mean of their
+    coordinates. Returns place dicts sorted by display name.
+
+    Raises ValueError when the parquet file lacks any of the columns
+    id, name, category, lat or lng.
+    """
+    df = pl.read_parquet(naptan_path)
+    required = {"id", "name", "category", "lat", "lng"}
+    missing = required - set(df.columns)
+    if missing:
+        raise ValueError(f"NaPTAN file is missing columns: {sorted(missing)}")
+
+    # Accumulator per station code: best raw name so far plus coordinate sums.
+    rows: dict[str, dict] = {}
+    for row in df.iter_rows(named=True):
+        atco_id = str(row["id"] or "")
+        match = _DLR_CODE_RE.search(atco_id)
+        if not match:
+            continue
+        if row["category"] not in {"Tube station", "Rail station"}:
+            continue
+
+        code = match.group(1)
+        raw_name = str(row["name"] or "")
+        if not raw_name:
+            continue
+
+        # NOTE(review): float() raises on a null lat/lng — assumes the NaPTAN
+        # parquet never has missing coordinates for these rows; confirm.
+        lat = float(row["lat"])
+        lon = float(row["lng"])
+        current = rows.get(code)
+        if current is None:
+            rows[code] = {
+                "raw_name": raw_name,
+                "lat_sum": lat,
+                "lon_sum": lon,
+                "count": 1,
+            }
+            continue
+
+        current["lat_sum"] += lat
+        current["lon_sum"] += lon
+        current["count"] += 1
+        # Strict "<" means the earlier row's name wins when the scores tie.
+        if _station_name_score(raw_name) < _station_name_score(current["raw_name"]):
+            current["raw_name"] = raw_name
+
+    stations = []
+    for station in rows.values():
+        count = station["count"]
+        # The synthetic network tag makes the display name come out as a DLR
+        # station regardless of how the raw NaPTAN name is phrased.
+        display_name = _station_display_name(station["raw_name"], {"network": "DLR"})
+        stations.append(
+            {
+                "name": display_name,
+                "place_type": "station",
+                "lat": station["lat_sum"] / count,
+                "lon": station["lon_sum"] / count,
+                "population": 0,
+                "travel_destination": True,
+            }
+        )
+
+    return sorted(stations, key=lambda station: station["name"])
+
+
+def _append_naptan_dlr_stations(places: list[dict], naptan_path: Path) -> int:
+    """Append NaPTAN DLR stations not already named in `places`.
+
+    Names are compared case-insensitively via casefold. `places` is extended
+    in place and the number of appended stations is returned.
+    """
+    seen = {str(entry["name"]).casefold() for entry in places}
+    size_before = len(places)
+    for candidate in _naptan_dlr_stations(naptan_path):
+        folded = candidate["name"].casefold()
+        if folded not in seen:
+            seen.add(folded)
+            places.append(candidate)
+    return len(places) - size_before
+
+
 class PlaceHandler(osmium.SimpleHandler):
    def __init__(self, progress: tqdm, england_polygon) -> None:
        super().__init__()
@ -145,14 +262,7 @@ class PlaceHandler(osmium.SimpleHandler):
        # Railway stations (tube, national rail, DLR, overground, Elizabeth line)
        if n.tags.get("railway") == "station":
            tags = dict(n.tags)
-            station_tag = tags.get("station", "")
-            network = tags.get("network", "").lower()
-            # Skip tram stops
-            if (
-                station_tag == "light_rail"
-                or "tramlink" in network
-                or "tram" in network
-            ):
+            if _is_tram_station(tags):
                return
            display_name = _station_display_name(name, tags)
            self._add(
@ -178,6 +288,11 @@ def main() -> None:
        required=True,
        help="England boundary GeoJSON file",
    )
+    parser.add_argument(
+        "--naptan",
+        type=Path,
+        help="Optional NaPTAN parquet file used to add DLR station destinations",
+    )
    args = parser.parse_args()

    pbf_file = args.pbf
@ -195,6 +310,9 @@ def main() -> None:
        handler.apply_file(str(pbf_file), locations=True)

    print(f"Extracted {len(handler.places):,} place nodes")
+    if args.naptan:
+        added = _append_naptan_dlr_stations(handler.places, args.naptan)
+        print(f"Added {added:,} DLR station destinations from NaPTAN")

    if handler.places:
        df = pl.DataFrame(handler.places)
--- a/pipeline/download/test_places.py
+++ b/pipeline/download/test_places.py
@ -0,0 +1,81 @@
+import polars as pl
+
+from pipeline.download.places import (
+    _is_dlr_station,
+    _is_tram_station,
+    _naptan_dlr_stations,
+    _station_display_name,
+)
+
+
+def test_dlr_light_rail_is_not_treated_as_tram():
+    """A station=light_rail node on the DLR network is kept and named as DLR."""
+    dlr_tags = {
+        "name": "Lewisham DLR",
+        "railway": "station",
+        "station": "light_rail",
+        "network": "Docklands Light Railway",
+    }
+
+    assert _is_dlr_station(dlr_tags)
+    assert not _is_tram_station(dlr_tags)
+    # Trailing "DLR" / "Station DLR" in the raw name must not be duplicated.
+    assert _station_display_name("Lewisham DLR", dlr_tags) == "Lewisham DLR station"
+    assert (
+        _station_display_name("Tower Gateway Station DLR", dlr_tags)
+        == "Tower Gateway DLR station"
+    )
+
+
+def test_tram_light_rail_is_still_excluded():
+    """A station=light_rail node on a tram network is still classed as a tram."""
+    tram_tags = {
+        "name": "East Croydon",
+        "railway": "station",
+        "station": "light_rail",
+        "network": "London Trams",
+    }
+
+    assert not _is_dlr_station(tram_tags)
+    assert _is_tram_station(tram_tags)
+
+
+def test_naptan_dlr_stations_are_deduplicated_by_atco_code(tmp_path):
+    """Rows sharing a ZZDL code merge into one station; non-DLR rows are dropped."""
+    naptan = tmp_path / "naptan.parquet"
+    # Two Shadwell rows share the code "SHA"; the last two ids contain no
+    # "ZZDL" code and must not appear in the result.
+    pl.DataFrame(
+        {
+            "id": [
+                "4900ZZDLSHA3",
+                "9400ZZDLSHA",
+                "4900ZZDLGRE1",
+                "490002076RV",
+                "4900ZZLUBNK",
+            ],
+            "name": [
+                "Shadwell DLR",
+                "Shadwell DLR Station",
+                "Greenwich Station",
+                "Tower Gateway Station DLR",
+                "Bank",
+            ],
+            "category": [
+                "Tube station",
+                "Tube station",
+                "Rail station",
+                "Bus stop",
+                "Tube station",
+            ],
+            "lat": [51.51156, 51.511693, 51.47794, 51.510575, 51.5131],
+            "lng": [-0.055595, -0.056643, -0.01442, -0.07514, -0.0894],
+        }
+    ).write_parquet(naptan)
+
+    stations = _naptan_dlr_stations(naptan)
+
+    assert [station["name"] for station in stations] == [
+        "Greenwich DLR station",
+        "Shadwell DLR station",
+    ]
+    shadwell = next(
+        station for station in stations if station["name"].startswith("Shadwell")
+    )
+    # The merged station sits at the mean latitude of its two source rows.
+    assert shadwell["lat"] == (51.51156 + 51.511693) / 2
+    assert shadwell["place_type"] == "station"
+    assert shadwell["travel_destination"] is True
--- a/pipeline/download/transit_network.py
+++ b/pipeline/download/transit_network.py
@ -56,6 +56,7 @@ NR_AUTH_URL = "https://opendata.nationalrail.co.uk/authenticate"
 NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/timetable"

 USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"
+TRANSXCHANGE2GTFS_PACKAGE = "transxchange2gtfs@1.12.0"


 def _download_http(
@ -473,10 +474,50 @@ def convert_tfl_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
    download_naptan()

    print("Converting TfL TransXChange → GTFS...")
+    # The shim patches known packaging/runtime issues in the pinned npm package
+    # before loading its CLI from npx's temporary install.
+    shim_path = Path(__file__).with_name("transxchange2gtfs_shim.js")
    subprocess.run(
-        ["npx", "--yes", "transxchange2gtfs", str(txc_path), str(dest)],
+        [
+            "npx",
+            "--yes",
+            "--package",
+            TRANSXCHANGE2GTFS_PACKAGE,
+            "sh",
+            "-c",
+            "\n".join(
+                [
+                    'bin="$(command -v transxchange2gtfs)"',
+                    'script="$(readlink -f "$bin")"',
+                    'pkg_dir="$(dirname "$(dirname "$script")")"',
+                    'shim="$1"',
+                    "shift",
+                    'exec node "$shim" "$pkg_dir" "$@"',
+                ]
+            ),
+            "transxchange2gtfs",
+            str(shim_path.resolve()),
+            str(txc_path.resolve()),
+            str(dest.resolve()),
+        ],
        check=True,
    )
+    required_files = {
+        "agency.txt",
+        "calendar.txt",
+        "calendar_dates.txt",
+        "routes.txt",
+        "stop_times.txt",
+        "stops.txt",
+        "trips.txt",
+    }
+    if not dest.exists() or not zipfile.is_zipfile(dest):
+        raise RuntimeError(f"transxchange2gtfs did not create a valid GTFS zip: {dest}")
+    with zipfile.ZipFile(dest) as z:
+        missing = required_files - set(z.namelist())
+    if missing:
+        missing_str = ", ".join(sorted(missing))
+        raise RuntimeError(f"TfL GTFS zip is missing required files: {missing_str}")
    size_mb = dest.stat().st_size / (1024 * 1024)
    print(f"  Saved to {dest} ({size_mb:.1f} MB)")
    return dest
--- a/pipeline/download/transxchange2gtfs_shim.js
+++ b/pipeline/download/transxchange2gtfs_shim.js
@ -0,0 +1,76 @@
+#!/usr/bin/env node
+"use strict";
+
+const fs = require("fs");
+const path = require("path");
+const { createRequire } = require("module");
+
+// The first CLI argument is the installed package directory; everything after
+// it is forwarded to the converter unchanged.
+const [pkgDirArg, ...converterArgs] = process.argv.slice(2);
+
+// The converter needs at least one input and one output path.
+if (!pkgDirArg || converterArgs.length < 2) {
+  console.error(
+    "Usage: transxchange2gtfs_shim.js <package-dir> <input...> <output>",
+  );
+  process.exit(2);
+}
+
+const pkgDir = path.resolve(pkgDirArg);
+
+/**
+ * Replace the first occurrence of `before` with `after` in a package file.
+ *
+ * `relativePath` is resolved against `pkgDir`. If `before` is absent but
+ * `after` is present the file is treated as already patched and left alone;
+ * if neither is present an Error is thrown.
+ */
+function replaceOnce(relativePath, before, after) {
+  const file = path.join(pkgDir, relativePath);
+  const original = fs.readFileSync(file, "utf8");
+  if (original.includes(before)) {
+    // A replacer function keeps `after` literal; a plain string replacement
+    // would expand `$&`, `$1`, `$$`, … if a patch ever contained them.
+    fs.writeFileSync(file, original.replace(before, () => after));
+    return;
+  }
+  if (original.includes(after)) {
+    return;
+  }
+  throw new Error(`Could not patch ${relativePath}: expected text not found`);
+}
+
+// The published 1.12.0 package has a few compatibility issues with current
+// TfL TransXChange exports:
+// - the bin script points at dist/src/cli.js, but the package ships dist/cli.js
+// - the compiled date-holidays import expects a synthetic default export
+// - some TfL journeys reference timing links without matching route-link geometry
+//
+// GTFS shapes are optional for R5 routing. Clear shape references and omit
+// shapes.txt so missing route geometry does not drop otherwise usable trips.
+//
+// Each replaceOnce call below is a literal text substitution in the compiled
+// output, so the `before` strings must match the shipped files exactly.
+function patchPackage() {
+  // Treat a missing route link as contributing zero distance.
+  replaceOnce(
+    "dist/transxchange/TransXChangeJourneyStream.js",
+    "distanceSoFarM += routeLink.Distance;",
+    "distanceSoFarM += routeLink ? routeLink.Distance : 0;",
+  );
+  // Emit an empty string in place of the md5 hash derived from the route links.
+  replaceOnce(
+    "dist/gtfs/TripsStream.js",
+    "(0, crypto_1.createHash)('md5').update(JSON.stringify({ routeId: journey.route, routeLinkSeq: journey.routeLinkIds })).digest(\"hex\"));",
+    "\"\");",
+  );
+  // Emit an empty string in place of stop.shapeDistTraveled.
+  replaceOnce(
+    "dist/gtfs/StopTimesStream.js",
+    "stop.shapeDistTraveled, stop.exactTime ? \"1\" : \"0\");",
+    "\"\", stop.exactTime ? \"1\" : \"0\");",
+  );
+  // Drop the shapes.txt output stream.
+  replaceOnce(
+    "dist/Container.js",
+    "\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex)),\n            \"shapes.txt\": journeyStream.pipe(new ShapesStream_1.ShapesStream())",
+    "\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
+  );
+  // Drop the transfers.txt output stream.
+  // NOTE(review): the header comment above does not explain why transfers.txt
+  // is removed as well — confirm the intent.
+  replaceOnce(
+    "dist/Container.js",
+    "\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n            \"transfers.txt\": transxchange.pipe(new TransfersStream_1.TransfersStream(naptanIndex, locationIndex)),\n            \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
+    "\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n            \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
+  );
+}
+
+patchPackage();
+
+// Resolve date-holidays relative to the package directory, then give the
+// module a `default` property pointing at itself when it lacks one.
+const pkgRequire = createRequire(path.join(pkgDir, "package.json"));
+const Holidays = pkgRequire("date-holidays");
+if (!Holidays.default) {
+  Holidays.default = Holidays;
+}
+
+// Rewrite argv so the CLI sees only the converter arguments, then load
+// dist/cli.js directly instead of going through the package's bin script.
+process.argv = [process.argv[0], "transxchange2gtfs", ...converterArgs];
+require(path.join(pkgDir, "dist", "cli.js"));
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -7,6 +7,15 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping

 MIN_FLOOR_AREA_M2 = 10

+# IoD score columns that _build replaces with 0-100 less-deprived percentiles
+# (via _less_deprived_percentile_expr) before joining onto the properties.
+_IOD_PERCENTILE_COLUMNS = [
+    "Education, Skills and Training Score",
+    "Income Score (rate)",
+    "Employment Score (rate)",
+    "Health Deprivation and Disability Score",
+    "Indoors Sub-domain Score",
+    "Outdoors Sub-domain Score",
+]
+

 _AREA_COLUMNS = [
    "Postcode",
@ -51,6 +60,14 @@ _AREA_COLUMNS = [
    "Number of parks within 1km",
    "Distance to nearest train or tube station (km)",
    "Distance to nearest park (km)",
+    "Distance to nearest grocery store (km)",
+    "Distance to nearest tube station (km)",
+    "Distance to nearest rail station (km)",
+    "Distance to nearest Waitrose (km)",
+    "Distance to nearest Tesco (km)",
+    "Distance to nearest cafe (km)",
+    "Distance to nearest pub (km)",
+    "Distance to nearest restaurant (km)",
    # Environment
    "Noise (dB)",
    "Max available download speed (Mbps)",
@ -76,6 +93,34 @@ _AREA_COLUMNS = [
 ]


+def _is_dynamic_poi_metric_column(column: str) -> bool:
+    """Return True for POI metric columns that are matched by naming pattern.
+
+    Matches "Distance to nearest <x> POI (km)" and
+    "Number of <x> POIs within 2km" / "... within 5km".
+    """
+    if column.startswith("Distance to nearest "):
+        if column.endswith(" POI (km)"):
+            return True
+    if column.startswith("Number of "):
+        return column.endswith((" POIs within 2km", " POIs within 5km"))
+    return False
+
+
+def _less_deprived_percentile_expr(column: str) -> pl.Expr:
+    """Convert an IoD deprivation score to a 0-100 less-deprived percentile.
+
+    The returned expression is aliased to `column`, so it overwrites the score
+    in place. Branches are evaluated in order:
+
+    - null scores stay null;
+    - the minimum score maps to 100.0 (checked first, so a column whose values
+      are all equal comes out as 100.0 throughout);
+    - the maximum score maps to 0.0;
+    - otherwise (descending average rank - 1) / (non-null count - 1) * 100,
+      rounded to one decimal place.
+    """
+    non_null_count = pl.col(column).count()
+    descending_rank = pl.col(column).rank("average", descending=True)
+    return (
+        pl.when(pl.col(column).is_null())
+        .then(None)
+        # Pin the extremes explicitly: with ties, the average rank of the
+        # lowest/highest scores would not land exactly on 100/0.
+        .when(pl.col(column) == pl.col(column).min())
+        .then(100.0)
+        .when(pl.col(column) == pl.col(column).max())
+        .then(0.0)
+        # The count guard keeps the division below away from a zero denominator.
+        .when(non_null_count > 1)
+        .then(((descending_rank - 1) / (non_null_count - 1) * 100).round(1))
+        .otherwise(100.0)
+        .alias(column)
+    )
+
+
 def _build(
    epc_pp_path: Path,
    arcgis_path: Path,
@ -134,20 +179,11 @@ def _build(
    )
    wide = wide.join(arcgis, on="postcode", how="left")

-    iod = pl.scan_parquet(iod_path)
+    iod = pl.scan_parquet(iod_path).with_columns(
+        *(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
+    )
    wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")

-    # Invert deprivation scores so that higher values = less deprived (better)
-    iod_score_cols = [
-        "Education, Skills and Training Score",
-        "Income Score (rate)",
-        "Employment Score (rate)",
-        "Health Deprivation and Disability Score",
-        "Indoors Sub-domain Score",
-        "Outdoors Sub-domain Score",
-    ]
-    wide = wide.with_columns(*(pl.col(c).max() - pl.col(c) for c in iod_score_cols))
-
    ethnicity = pl.scan_parquet(ethnicity_path)
    wide = wide.join(
        ethnicity,
@ -351,6 +387,14 @@ def _build(
                "parks_1km": "Number of parks within 1km",
                "train_tube_nearest_km": "Distance to nearest train or tube station (km)",
                "parks_nearest_km": "Distance to nearest park (km)",
+                "grocery_store_nearest_km": "Distance to nearest grocery store (km)",
+                "tube_station_nearest_km": "Distance to nearest tube station (km)",
+                "rail_station_nearest_km": "Distance to nearest rail station (km)",
+                "waitrose_nearest_km": "Distance to nearest Waitrose (km)",
+                "tesco_nearest_km": "Distance to nearest Tesco (km)",
+                "cafe_nearest_km": "Distance to nearest cafe (km)",
+                "pub_nearest_km": "Distance to nearest pub (km)",
+                "restaurant_nearest_km": "Distance to nearest restaurant (km)",
                "latest_price": "Last known price",
                "number_habitable_rooms": "Number of bedrooms & living rooms",
                "noise_lden_db": "Noise (dB)",
@ -381,10 +425,14 @@ def _build(

    # Split into postcode-level and property-level dataframes
    area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
+    area_cols.extend(
+        c for c in df.columns if _is_dynamic_poi_metric_column(c) and c not in area_cols
+    )
+    area_col_set = set(area_cols)
    postcode_df = df.select(area_cols).group_by("Postcode").first()
    print(f"Postcode rows: {postcode_df.height} (unique postcodes)")

-    property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
+    property_cols = [c for c in df.columns if c not in area_col_set or c == "Postcode"]
    properties_df = df.select(property_cols)
    print(f"Property rows: {properties_df.height}")

--- a/pipeline/transform/poi_proximity.py
+++ b/pipeline/transform/poi_proximity.py
@ -1,6 +1,8 @@
 """Compute POI proximity counts and distances per postcode from ArcGIS + filtered POIs."""

 import argparse
+import re
+import unicodedata
 from pathlib import Path

 import polars as pl
@ -15,9 +17,25 @@ POI_GROUPS_2KM = {
    "groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
 }

-# Groups for which to compute distance to nearest POI (from filtered POIs)
+# Groups for which to compute distance to nearest POI (from filtered POIs).
+# Keep `train_tube` for the existing backend feature; the individual POI
+# distance filters below power the frontend dropdown.
+# Each key K is emitted by min_distance_per_postcode as a `K_nearest_km` column.
 DISTANCE_GROUPS = {
    "train_tube": ["Tube station", "Rail station"],
+    "grocery_store": [
+        "Greengrocer",
+        "Supermarket",
+        "Convenience Store",
+        "Waitrose",
+        "Tesco",
+    ],
+    "tube_station": ["Tube station"],
+    "rail_station": ["Rail station"],
+    "waitrose": ["Waitrose"],
+    "tesco": ["Tesco"],
+    "cafe": ["Café"],
+    "pub": ["Pub"],
+    "restaurant": ["Restaurant"],
 }

 # OS Open Greenspace function types used for park counts and distance calculation.
@ -27,6 +45,69 @@ GREENSPACE_PARK_FUNCTIONS = {
    "parks": ["Public Park Or Garden", "Playing Field", "Play Space"],
 }

+# Threshold-group categories get a dynamic filter only when they have strictly
+# more than this many POIs (_build_poi_category_groups compares with `>`).
+GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
+# POI groups whose categories all get a dynamic filter, whatever their POI count.
+DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure"}
+# POI groups whose categories must pass the POI-count threshold above.
+DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS = {"Groceries"}
+
+
+def _poi_category_slug(category: str) -> str:
+    """Return a lowercase ASCII slug for a POI category name.
+
+    Accented characters are decomposed (NFKD) and reduced to their ASCII base,
+    every run of characters outside ``[a-z0-9]`` becomes a single underscore,
+    and leading/trailing underscores are removed. Falls back to ``"poi"`` when
+    nothing usable remains.
+    """
+    decomposed = unicodedata.normalize("NFKD", category)
+    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
+    collapsed = re.sub(r"[^a-z0-9]+", "_", ascii_only.lower())
+    return collapsed.strip("_") or "poi"
+
+
+def _build_poi_category_groups(
+    pois: pl.DataFrame,
+) -> tuple[dict[str, list[str]], dict[str, str]]:
+    """Build one proximity group for each POI category selected for filters.
+
+    A category is selected when its POI group is in
+    ``DYNAMIC_FILTER_ALL_GROUPS``, or when its group is in
+    ``DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS`` and that (group, category) pair
+    has strictly more than ``GROCERY_DYNAMIC_FILTER_MIN_POIS`` rows.
+
+    Args:
+        pois: POI rows with at least ``group`` and ``category`` columns.
+
+    Returns:
+        ``(groups, display_names)``. ``groups`` maps a unique ``poi_<slug>``
+        key to a one-element list holding the category; ``display_names`` maps
+        the same key to the original category text.
+
+    Raises:
+        ValueError: If ``pois`` has no ``group`` column.
+    """
+    if "group" not in pois.columns:
+        raise ValueError("POI dataframe must include a 'group' column")
+
+    categories = (
+        pois.group_by("group", "category")
+        .len()
+        .filter(
+            pl.col("group").is_in(list(DYNAMIC_FILTER_ALL_GROUPS))
+            | (
+                pl.col("group").is_in(list(DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS))
+                & (pl.col("len") > GROCERY_DYNAMIC_FILTER_MIN_POIS)
+            )
+        )
+        .select("category")
+        # A category listed under several selected groups must yield a single
+        # key; otherwise two keys share one display name and the renamed
+        # metric columns collide.
+        .unique()
+        .sort("category")
+        .to_series()
+        .to_list()
+    )
+    groups: dict[str, list[str]] = {}
+    display_names: dict[str, str] = {}
+
+    for category in categories:
+        if not isinstance(category, str) or not category:
+            continue
+        base_slug = f"poi_{_poi_category_slug(category)}"
+        group_key = base_slug
+        suffix = 2
+        # Probe suffixes until the key is free, so a suffixed key can never
+        # overwrite a category whose own slug already ends in that number.
+        while group_key in groups:
+            group_key = f"{base_slug}_{suffix}"
+            suffix += 1
+        groups[group_key] = [category]
+        display_names[group_key] = category
+
+    return groups, display_names
+
+
+def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
+    """Map raw dynamic POI metric column names to human-readable labels.
+
+    For every ``group_key -> category`` pair three entries are produced, in
+    this order: ``<key>_nearest_km``, ``<key>_2km`` and ``<key>_5km``.
+    """
+    labels: dict[str, str] = {}
+    for key, label in display_names.items():
+        labels.update(
+            {
+                f"{key}_nearest_km": f"Distance to nearest {label} POI (km)",
+                f"{key}_2km": f"Number of {label} POIs within 2km",
+                f"{key}_5km": f"Number of {label} POIs within 5km",
+            }
+        )
+    return labels
+

 def main():
    parser = argparse.ArgumentParser(
@ -56,12 +137,35 @@ def main():
    )

    pois = pl.read_parquet(args.pois)
+    poi_category_groups, poi_display_names = _build_poi_category_groups(pois)

    # Count amenity POIs within 2km
    counts_2km = count_pois_per_postcode(
        postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2
    )

+    # Dynamic POI filters: nearest distance plus counts within 2km and 5km for
+    # the selected public transport, grocery, and leisure categories.
+    dynamic_counts_2km = count_pois_per_postcode(
+        postcodes, pois, groups=poi_category_groups, radius_km=2
+    )
+    dynamic_counts_5km = count_pois_per_postcode(
+        postcodes, pois, groups=poi_category_groups, radius_km=5
+    )
+    dynamic_distances = min_distance_per_postcode(
+        postcodes, pois, groups=poi_category_groups
+    )
+    dynamic_renames = _dynamic_poi_metric_renames(poi_display_names)
+    dynamic_counts_2km = dynamic_counts_2km.rename(
+        {k: v for k, v in dynamic_renames.items() if k in dynamic_counts_2km.columns}
+    )
+    dynamic_counts_5km = dynamic_counts_5km.rename(
+        {k: v for k, v in dynamic_renames.items() if k in dynamic_counts_5km.columns}
+    )
+    dynamic_distances = dynamic_distances.rename(
+        {k: v for k, v in dynamic_renames.items() if k in dynamic_distances.columns}
+    )
+
-    # Distance to nearest train/tube station (from filtered POIs)
+    # Distance to nearest POI for every DISTANCE_GROUPS entry (from filtered POIs)
    distances = min_distance_per_postcode(postcodes, pois, groups=DISTANCE_GROUPS)

@ -77,6 +181,9 @@ def main():
    # Join all results on postcode
    result = (
        counts_2km.join(distances, on="postcode")
+        .join(dynamic_counts_2km, on="postcode")
+        .join(dynamic_counts_5km, on="postcode")
+        .join(dynamic_distances, on="postcode")
        .join(park_counts_1km, on="postcode")
        .join(park_distances, on="postcode")
    )
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@ -0,0 +1,33 @@
+import polars as pl
+
+from pipeline.transform.merge import (
+    _is_dynamic_poi_metric_column,
+    _less_deprived_percentile_expr,
+)
+
+
+def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
+    """The lowest score maps to 100, the highest to 0, and nulls stay null."""
+    column = "Income Score (rate)"
+    scores = pl.DataFrame({column: [1.0, 2.0, 3.0, None]}).lazy()
+
+    percentiles = scores.with_columns(_less_deprived_percentile_expr(column))
+
+    assert percentiles.collect()[column].to_list() == [100.0, 50.0, 0.0, None]
+
+
+def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
+    """Tied minimum scores all map to 100 and tied maximum scores all map to 0."""
+    column = "Income Score (rate)"
+    scores = pl.DataFrame({column: [1.0, 1.0, 2.0, 3.0, 3.0]}).lazy()
+
+    percentiles = scores.with_columns(_less_deprived_percentile_expr(column))
+
+    expected = [100.0, 100.0, 50.0, 0.0, 0.0]
+    assert percentiles.collect()[column].to_list() == expected
+
+
+def test_dynamic_poi_metric_columns_are_area_level() -> None:
+    """Dynamic POI metric labels are recognised; a non-POI count label is not."""
+    dynamic_labels = (
+        "Distance to nearest Cafe POI (km)",
+        "Number of Cafe POIs within 2km",
+        "Number of Cafe POIs within 5km",
+    )
+    for label in dynamic_labels:
+        assert _is_dynamic_poi_metric_column(label)
+    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")
--- a/pipeline/transform/test_poi_proximity.py
+++ b/pipeline/transform/test_poi_proximity.py
@ -0,0 +1,41 @@
+import polars as pl
+
+from pipeline.transform.poi_proximity import _build_poi_category_groups
+
+
+def test_dynamic_poi_groups_include_requested_categories_only() -> None:
+    """Transport/leisure categories and groceries above 100 POIs are the only ones kept."""
+    # (group, category, number of POI rows); Waitrose sits exactly on the
+    # threshold and must therefore be excluded.
+    spec = [
+        ("Public Transport", "Rail station", 1),
+        ("Public Transport", "Bus stop", 1),
+        ("Leisure", "Café", 1),
+        ("Leisure", "Restaurant", 1),
+        ("Groceries", "Tesco", 101),
+        ("Groceries", "Waitrose", 100),
+        ("Education", "School", 200),
+        ("Health", "Pharmacy", 200),
+    ]
+    group_col: list[str] = []
+    category_col: list[str] = []
+    for group, category, count in spec:
+        group_col += [group] * count
+        category_col += [category] * count
+    n_rows = len(group_col)
+    pois = pl.DataFrame(
+        {
+            "group": group_col,
+            "category": category_col,
+            "lat": [51.5] * n_rows,
+            "lng": [-0.1] * n_rows,
+        }
+    )
+
+    groups, display_names = _build_poi_category_groups(pois)
+
+    expected_names = {"Bus stop", "Café", "Rail station", "Restaurant", "Tesco"}
+    assert set(display_names.values()) == expected_names
+    for excluded_key in ("poi_waitrose", "poi_school", "poi_pharmacy"):
+        assert excluded_key not in groups
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -1128,12 +1128,18 @@ GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
 def normalize_grocery_retailer(retailer: str | None) -> str:
+    """Return the display name for a grocery retailer.
+
+    Returns "" when ``retailer`` is None. Raises ValueError when the retailer
+    has no entry in GROCERY_RETAILER_DISPLAY_NAMES (the previous version passed
+    the raw value through unchanged).
+    """
    if retailer is None:
        return ""
-    return GROCERY_RETAILER_DISPLAY_NAMES.get(retailer, retailer)
+    display_name = GROCERY_RETAILER_DISPLAY_NAMES.get(retailer)
+    if display_name is None:
+        raise ValueError(f"Missing grocery retailer display name for {retailer!r}")
+    return display_name


 def normalize_grocery_icon_category(fascia: str | None, retailer: str | None) -> str:
+    """Return the icon category name for a grocery POI.
+
+    A truthy ``fascia`` is looked up in GROCERY_FASCIA_ICON_NAMES and raises
+    ValueError when it has no entry; otherwise the result is
+    ``normalize_grocery_retailer(retailer)``.
+    """
    if fascia:
-        return GROCERY_FASCIA_ICON_NAMES.get(fascia, normalize_grocery_retailer(fascia))
+        icon_name = GROCERY_FASCIA_ICON_NAMES.get(fascia)
+        if icon_name is None:
+            raise ValueError(f"Missing grocery fascia icon name for {fascia!r}")
+        return icon_name
    return normalize_grocery_retailer(retailer)


--- a/pipeline/utils/poi_counts.py
+++ b/pipeline/utils/poi_counts.py
@ -2,9 +2,12 @@

 import numpy as np
 import polars as pl
+from scipy.spatial import cKDTree

 from .haversine import haversine_km

+EARTH_RADIUS_KM = 6371.0088
+

 def _build_poi_grid(
    pois: pl.DataFrame, grid_size: float = 0.05
@ -49,6 +52,21 @@ def _get_nearby_indices(
    return np.concatenate(nearby_indices)


+def _project_lat_lng_km(
+    lats: np.ndarray, lngs: np.ndarray, origin_lat: float
+) -> np.ndarray:
+    """Project WGS84 coordinates to local km coordinates for nearest-neighbour lookup.
+
+    Longitude is scaled by ``cos(origin_lat)`` and latitude is used directly,
+    both multiplied by ``EARTH_RADIUS_KM``. For 1-D inputs the result has one
+    ``(x, y)`` row per coordinate.
+    """
+    # One cos() at the origin latitude: every point shares the same east-west scale.
+    east_west_scale = np.cos(np.radians(origin_lat))
+    x_km = EARTH_RADIUS_KM * np.radians(lngs) * east_west_scale
+    y_km = EARTH_RADIUS_KM * np.radians(lats)
+    return np.column_stack((x_km, y_km))
+
+
 def count_pois_per_postcode(
    postcodes_df: pl.DataFrame,
    pois: pl.DataFrame,
@ -136,7 +154,7 @@ def min_distance_per_postcode(
 ) -> pl.DataFrame:
    """
    For each postcode, compute the distance (km) to the closest POI per group.
-    Returns NaN where no POI of that group exists within the grid search range (~5.5km).
+    Returns NaN where no POI of that group exists.
    """
    print("Computing minimum POI distances per postcode...")

@ -144,51 +162,84 @@ def min_distance_per_postcode(
    n_pois = len(pois)
    print(f"  {n_postcodes:,} postcodes, {n_pois:,} POIs")

-    grid_size = 0.05
-    print("  Building POI spatial grid...")
-    poi_lats, poi_lngs, poi_cats, poi_grid = _build_poi_grid(pois, grid_size)
-    print(f"  POI grid has {len(poi_grid):,} occupied cells")
-
-    category_masks = {}
-    for group, categories in groups.items():
-        mask = np.isin(poi_cats, categories)
-        category_masks[group] = mask
-        print(f"  {group}: {mask.sum():,} POIs")
-
    pc_lats = postcodes_df["lat"].to_numpy()
    pc_lons = postcodes_df["lon"].to_numpy()
    pc_codes = postcodes_df["postcode"].to_list()
+    valid_pc_mask = np.isfinite(pc_lats) & np.isfinite(pc_lons)
+    valid_pc_indices = np.flatnonzero(valid_pc_mask)

    result_min_dist = {
        group: np.full(n_postcodes, np.nan, dtype=np.float32) for group in groups
    }

-    batch_size = 50000
-    n_batches = (n_postcodes + batch_size - 1) // batch_size
-    print(f"  Processing {n_postcodes:,} postcodes in {n_batches} batches...")
+    if n_pois == 0 or len(valid_pc_indices) == 0:
+        print("  No valid postcode/POI coordinates; returning NaN distances")
+        return pl.DataFrame(
+            {
+                "postcode": pc_codes,
+                **{
+                    f"{group}_nearest_km": values
+                    for group, values in result_min_dist.items()
+                },
+            }
+        )

-    for batch_idx in range(n_batches):
-        start_idx = batch_idx * batch_size
-        end_idx = min(start_idx + batch_size, n_postcodes)
+    poi_lats = pois["lat"].to_numpy()
+    poi_lngs = pois["lng"].to_numpy()
+    poi_cats = pois["category"].to_numpy()
+    valid_poi_mask = np.isfinite(poi_lats) & np.isfinite(poi_lngs)
+    origin_lat = float(np.nanmean(pc_lats[valid_pc_mask]))
+    query_xy = _project_lat_lng_km(
+        pc_lats[valid_pc_indices], pc_lons[valid_pc_indices], origin_lat
+    )

-        if batch_idx % 5 == 0:
-            print(
-                f"  Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}"
-            )
+    batch_size = 200_000
+    n_batches = (len(valid_pc_indices) + batch_size - 1) // batch_size

-        for i in range(start_idx, end_idx):
-            nearby = _get_nearby_indices(pc_lats[i], pc_lons[i], poi_grid, grid_size)
-            if nearby is None:
-                continue
+    for group, categories in groups.items():
+        group_indices = np.flatnonzero(valid_poi_mask & np.isin(poi_cats, categories))
+        print(f"  {group}: {len(group_indices):,} POIs")
+        if len(group_indices) == 0:
+            continue

-            distances = haversine_km(
-                poi_lats[nearby], poi_lngs[nearby], pc_lats[i], pc_lons[i]
-            )
+        poi_xy = _project_lat_lng_km(
+            poi_lats[group_indices], poi_lngs[group_indices], origin_lat
+        )
+        tree = cKDTree(poi_xy)
+        k = min(8, len(group_indices))

-            for group, cat_mask in category_masks.items():
-                group_mask = cat_mask[nearby]
-                if group_mask.any():
-                    result_min_dist[group][i] = distances[group_mask].min()
+        for batch_idx in range(n_batches):
+            start_idx = batch_idx * batch_size
+            end_idx = min(start_idx + batch_size, len(valid_pc_indices))
+            batch_pc_indices = valid_pc_indices[start_idx:end_idx]
+            batch_xy = query_xy[start_idx:end_idx]
+
+            if batch_idx == 0 or (batch_idx + 1) % 5 == 0:
+                print(
+                    f"    Batch {batch_idx + 1}/{n_batches}: postcodes {start_idx:,} - {end_idx:,}"
+                )
+
+            _, nearest = tree.query(batch_xy, k=k)
+            nearest = np.asarray(nearest)
+
+            if k == 1:
+                candidate_indices = group_indices[nearest]
+                distances = haversine_km(
+                    poi_lats[candidate_indices],
+                    poi_lngs[candidate_indices],
+                    pc_lats[batch_pc_indices],
+                    pc_lons[batch_pc_indices],
+                )
+            else:
+                candidate_indices = group_indices[nearest]
+                distances = haversine_km(
+                    poi_lats[candidate_indices],
+                    poi_lngs[candidate_indices],
+                    pc_lats[batch_pc_indices, None],
+                    pc_lons[batch_pc_indices, None],
+                ).min(axis=1)
+
+            result_min_dist[group][batch_pc_indices] = distances.astype(np.float32)

    result_data = {"postcode": pc_codes}
    for group in groups:
--- a/pipeline/utils/test_poi_counts.py
+++ b/pipeline/utils/test_poi_counts.py
@ -113,9 +113,9 @@ def test_min_distance_finds_nearest(postcodes, pois):
    # Restaurant is co-located — distance ~0
    assert ec1a["restaurants_nearest_km"][0] < 0.01

-    # Far-away postcode should have NaN (no POIs within grid range)
+    # Far-away postcode should still get the global nearest distance.
    zz99 = result.filter(pl.col("postcode") == "ZZ99 9ZZ")
-    assert np.isnan(zz99["train_tube_nearest_km"][0])
+    assert zz99["train_tube_nearest_km"][0] > 300


 def test_min_distance_no_pois_returns_nan(postcodes):