all good

2026-05-17 10:16:30 +01:00 · 2026-05-17 10:16:30 +01:00 · 017902b8e6
commit 017902b8e6
parent 47d89f6fad
82 changed files with 331466 additions and 54841 deletions
--- a/pipeline/download/places.py
+++ b/pipeline/download/places.py
@@ -10,9 +10,12 @@ import argparse
 import re
 from pathlib import Path

+import numpy as np
 import osmium
 import polars as pl
+from scipy.spatial import cKDTree
 from shapely.geometry import Point
+from pyproj import Transformer
 from tqdm import tqdm

 from pipeline.utils.england_geometry import (
@@ -39,6 +42,12 @@ SEARCH_PLACE_TYPES = {
    "island",
 }
 TRAVEL_DESTINATION_PLACE_TYPES = {"city"}
+ENGLAND_COUNTRY_CODE = "E92000001"
+LONDON_REGION_CODE = "E12000007"
+LONDON_LAD_PREFIX = "E09"
+LONDON_COUNTY_CODES = {"E13000001", "E13000002"}
+DISPLAY_CITY_NEAREST_POSTCODE_MAX_M = 3_000
+WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)

 # Suffixes to strip from raw station names before appending the typed suffix.
 _STATION_STRIP = (
@@ -55,6 +64,7 @@ _STATION_STRIP = (

 _DLR_CODE_RE = re.compile(r"ZZDL([A-Z0-9]{3})")
 _POSTCODE_RE = re.compile(r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b", re.I)
+_LONDON_TOKEN_RE = re.compile(r"(^|[^a-z])london([^a-z]|$)", re.I)

 _NOISY_PROVIDER_SUFFIXES = (
    " higher education corporation",
@@ -152,8 +162,7 @@ def _find_header_row(rows: list[tuple]) -> int:
    for idx, row in enumerate(rows):
        keys = [_header_key(value) for value in row]
        has_legal_name = any(
-            all(token in key for token in ("provider", "legal", "name"))
-            for key in keys
+            all(token in key for token in ("provider", "legal", "name")) for key in keys
        )
        has_university_title = any(
            all(token in key for token in ("right", "use", "university"))
@@ -235,13 +244,94 @@ def _postcode_lookup(postcodes_path: Path) -> dict[str, tuple[float, float]]:
    df = pl.read_parquet(
        postcodes_path,
        columns=["pcds", "lat", "long", "ctry25cd", "doterm"],
-    ).filter((pl.col("ctry25cd") == "E92000001") & pl.col("doterm").is_null())
+    ).filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
    return {
        _normalize_postcode(postcode): (float(lat), float(lon))
        for postcode, lat, lon in df.select(["pcds", "lat", "long"]).iter_rows()
    }


+def _display_city_from_tags(tags: dict[str, str]) -> str | None:
+    """Use explicit OSM context where available, before we fall back to admin data."""
+    for key in (
+        "is_in",
+        "is_in:city",
+        "is_in:town",
+        "is_in:county",
+        "addr:city",
+    ):
+        value = tags.get(key)
+        if value and _LONDON_TOKEN_RE.search(value):
+            return "London"
+    return None
+
+
+def _is_london_admin_expr() -> pl.Expr:
+    return (
+        (pl.col("rgn25cd") == LONDON_REGION_CODE)
+        | pl.col("lad25cd").str.starts_with(LONDON_LAD_PREFIX).fill_null(False)
+        | pl.col("cty25cd").is_in(LONDON_COUNTY_CODES)
+    )
+
+
+def _london_postcode_tree(postcodes_path: Path) -> tuple[cKDTree, np.ndarray]:
+    required = [
+        "doterm",
+        "ctry25cd",
+        "east1m",
+        "north1m",
+        "rgn25cd",
+        "lad25cd",
+        "cty25cd",
+    ]
+    df = (
+        pl.read_parquet(postcodes_path, columns=required)
+        .filter(
+            (pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null()
+        )
+        .filter(pl.col("east1m").is_not_null() & pl.col("north1m").is_not_null())
+        .with_columns(_is_london_admin_expr().alias("is_london"))
+        .select("east1m", "north1m", "is_london")
+    )
+    if df.is_empty():
+        raise ValueError(f"No active England postcodes in {postcodes_path}")
+
+    coords = np.column_stack(
+        [
+            df["east1m"].to_numpy().astype(np.float64),
+            df["north1m"].to_numpy().astype(np.float64),
+        ]
+    )
+    london_flags = df["is_london"].to_numpy().astype(bool)
+    return cKDTree(coords), london_flags
+
+
+def _assign_london_display_city(
+    places: list[dict],
+    postcodes_path: Path,
+    max_distance_m: float = DISPLAY_CITY_NEAREST_POSTCODE_MAX_M,
+) -> int:
+    """Tag places whose nearest active postcode is inside Greater London."""
+    if not places:
+        return 0
+
+    tree, london_flags = _london_postcode_tree(postcodes_path)
+    lons = np.array([float(place["lon"]) for place in places], dtype=np.float64)
+    lats = np.array([float(place["lat"]) for place in places], dtype=np.float64)
+    eastings, northings = WGS84_TO_BNG.transform(lons, lats)
+    place_coords = np.column_stack([eastings, northings])
+    distances, indices = tree.query(place_coords)
+
+    assigned = 0
+    for idx, place in enumerate(places):
+        if place.get("display_city") or place.get("place_type") == "city":
+            continue
+        if distances[idx] <= max_distance_m and london_flags[indices[idx]]:
+            place["display_city"] = "London"
+            assigned += 1
+    return assigned
+
+
 def _ofs_universities(
    raw: pl.DataFrame, postcode_coords: dict[str, tuple[float, float]]
 ) -> tuple[list[dict], int]:
@@ -277,6 +367,7 @@ def _ofs_universities(
                "lon": lon,
                "population": 0,
                "travel_destination": True,
+                "display_city": None,
            }
        )

@@ -354,6 +445,7 @@ def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
                "lon": station["lon_sum"] / count,
                "population": 0,
                "travel_destination": True,
+                "display_city": None,
            }
        )

@@ -388,6 +480,7 @@ class PlaceHandler(osmium.SimpleHandler):
        lon: float,
        population: int,
        travel_destination: bool,
+        display_city: str | None = None,
    ) -> None:
        self.places.append(
            {
@@ -397,6 +490,7 @@ class PlaceHandler(osmium.SimpleHandler):
                "lon": lon,
                "population": population,
                "travel_destination": travel_destination,
+                "display_city": display_city,
            }
        )
        self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False)
@@ -414,18 +508,19 @@ class PlaceHandler(osmium.SimpleHandler):
        if not self._england.contains(Point(lon, lat)):
            return

-        name = n.tags.get("name:en", n.tags.get("name", ""))
+        tags = dict(n.tags)
+        name = tags.get("name:en", tags.get("name", ""))
        if not name:
            return

-        pop_str = n.tags.get("population", "")
+        pop_str = tags.get("population", "")
        try:
            population = int(pop_str)
        except ValueError:
            population = 0

        # place=* nodes
-        place_type = n.tags.get("place")
+        place_type = tags.get("place")
        if place_type in SEARCH_PLACE_TYPES:
            self._add(
                name,
@@ -434,12 +529,14 @@ class PlaceHandler(osmium.SimpleHandler):
                lon,
                population,
                travel_destination=place_type in TRAVEL_DESTINATION_PLACE_TYPES,
+                display_city=None
+                if place_type == "city"
+                else _display_city_from_tags(tags),
            )
            return

        # Railway stations (tube, national rail, DLR, overground, Elizabeth line)
-        if n.tags.get("railway") == "station":
-            tags = dict(n.tags)
+        if tags.get("railway") == "station":
            if _is_tram_station(tags):
                return
            display_name = _station_display_name(name, tags)
@@ -450,6 +547,7 @@ class PlaceHandler(osmium.SimpleHandler):
                lon,
                population,
                travel_destination=True,
+                display_city=_display_city_from_tags(tags),
            )
            return

@@ -479,7 +577,10 @@ def main() -> None:
    parser.add_argument(
        "--postcodes",
        type=Path,
-        help="Postcode parquet used to geocode OfS university contact postcodes",
+        help=(
+            "Postcode parquet used to geocode OfS university contact postcodes "
+            "and assign Greater London display labels"
+        ),
    )
    args = parser.parse_args()

@@ -507,14 +608,18 @@ def main() -> None:
        added, skipped = _append_ofs_universities(
            handler.places, args.university_register, args.postcodes
        )
-        print(
-            f"Added {added:,} university travel destinations from the OfS register"
-        )
+        print(f"Added {added:,} university travel destinations from the OfS register")
        if skipped:
            print(f"Skipped {skipped:,} OfS university rows without usable coordinates")

    if handler.places:
+        if args.postcodes:
+            assigned = _assign_london_display_city(handler.places, args.postcodes)
+            print(f"Assigned London display labels to {assigned:,} places")
+        for place in handler.places:
+            place.setdefault("display_city", None)
        df = pl.DataFrame(handler.places)
+        df = df.with_columns(pl.col("display_city").cast(pl.Utf8))
        args.output.parent.mkdir(parents=True, exist_ok=True)
        df.write_parquet(args.output)
        print(f"Saved to {args.output}")
--- a/pipeline/download/test_places.py
+++ b/pipeline/download/test_places.py
@@ -1,6 +1,9 @@
 import polars as pl
+from pyproj import Transformer

 from pipeline.download.places import (
+    _assign_london_display_city,
+    _display_city_from_tags,
    _is_dlr_station,
    _is_tram_station,
    _naptan_dlr_stations,
@@ -9,6 +12,22 @@ from pipeline.download.places import (
    _station_display_name,
 )

+WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
+
+
+def _postcode_row(postcode: str, lat: float, lon: float, *, london: bool) -> dict:
+    easting, northing = WGS84_TO_BNG.transform(lon, lat)
+    return {
+        "pcds": postcode,
+        "doterm": None,
+        "ctry25cd": "E92000001",
+        "east1m": int(round(easting)),
+        "north1m": int(round(northing)),
+        "rgn25cd": "E12000007" if london else "E12000008",
+        "lad25cd": "E09000008" if london else "E07000208",
+        "cty25cd": "E13000002" if london else "E10000030",
+    }
+

 def test_dlr_light_rail_is_not_treated_as_tram():
    dlr_tags = {
@@ -144,5 +163,56 @@ def test_ofs_universities_extracts_university_title_rows_with_postcode_coords():
            "lon": -1.2643,
            "population": 0,
            "travel_destination": True,
+            "display_city": None,
        }
    ]
+
+
+def test_display_city_from_tags_uses_explicit_london_context():
+    assert _display_city_from_tags({"is_in": "Croydon, London, UK"}) == "London"
+    assert _display_city_from_tags({"is_in": "Croydon, Cambridgeshire, UK"}) is None
+
+
+def test_assign_london_display_city_uses_nearest_active_postcode_admin(tmp_path):
+    postcodes = tmp_path / "postcodes.parquet"
+    pl.DataFrame(
+        [
+            _postcode_row("CR0 1SZ", 51.371273, -0.101793, london=True),
+            _postcode_row("KT19 8AG", 51.3326, -0.2678, london=False),
+        ]
+    ).write_parquet(postcodes)
+
+    places = [
+        {
+            "name": "Croydon",
+            "place_type": "town",
+            "lat": 51.3713049,
+            "lon": -0.101957,
+            "population": 173314,
+            "travel_destination": False,
+            "display_city": None,
+        },
+        {
+            "name": "East Croydon railway station",
+            "place_type": "station",
+            "lat": 51.375845,
+            "lon": -0.092732,
+            "population": 0,
+            "travel_destination": True,
+            "display_city": None,
+        },
+        {
+            "name": "Epsom",
+            "place_type": "town",
+            "lat": 51.3326,
+            "lon": -0.2678,
+            "population": 31489,
+            "travel_destination": False,
+            "display_city": None,
+        },
+    ]
+
+    assigned = _assign_london_display_city(places, postcodes)
+
+    assert assigned == 2
+    assert [place["display_city"] for place in places] == ["London", "London", None]