This commit is contained in:
Andras Schmelczer 2026-05-17 10:16:30 +01:00
parent 47d89f6fad
commit 017902b8e6
82 changed files with 331466 additions and 54841 deletions

View file

@ -10,9 +10,12 @@ import argparse
import re
from pathlib import Path
import numpy as np
import osmium
import polars as pl
from scipy.spatial import cKDTree
from shapely.geometry import Point
from pyproj import Transformer
from tqdm import tqdm
from pipeline.utils.england_geometry import (
@ -39,6 +42,12 @@ SEARCH_PLACE_TYPES = {
"island",
}
# OSM place=* values that are flagged as travel destinations when added.
TRAVEL_DESTINATION_PLACE_TYPES = {"city"}
# Value of the postcode table's `ctry25cd` column that selects England rows.
ENGLAND_COUNTRY_CODE = "E92000001"
# Administrative codes treated as "Greater London" when matched against the
# postcode table's `rgn25cd` (exact), `lad25cd` (prefix) and `cty25cd`
# (membership) columns respectively.
LONDON_REGION_CODE = "E12000007"
LONDON_LAD_PREFIX = "E09"
LONDON_COUNTY_CODES = {"E13000001", "E13000002"}
# Default cut-off, in metres, between a place and its nearest postcode; beyond
# this distance the postcode's London flag is not applied to the place.
DISPLAY_CITY_NEAREST_POSTCODE_MAX_M = 3_000
# WGS84 -> British National Grid (EPSG:27700). always_xy=True means
# transform() takes (lon, lat) and returns (easting, northing).
WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
# Suffixes to strip from raw station names before appending the typed suffix.
_STATION_STRIP = (
@ -55,6 +64,7 @@ _STATION_STRIP = (
# Captures the three upper-case alphanumerics that follow a literal "ZZDL"
# (presumably a DLR station code inside a NaPTAN identifier; verify at caller).
_DLR_CODE_RE = re.compile(r"ZZDL([A-Z0-9]{3})")
# Captures a UK-postcode-shaped token, case-insensitively, with optional
# whitespace between the outward and inward parts.
_POSTCODE_RE = re.compile(r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b", re.I)
# Matches "london" as a standalone word: it must not be preceded or followed by
# an ASCII letter, so e.g. "Londonderry" does not match. Case-insensitive.
_LONDON_TOKEN_RE = re.compile(r"(^|[^a-z])london([^a-z]|$)", re.I)
_NOISY_PROVIDER_SUFFIXES = (
" higher education corporation",
@ -152,8 +162,7 @@ def _find_header_row(rows: list[tuple]) -> int:
for idx, row in enumerate(rows):
keys = [_header_key(value) for value in row]
has_legal_name = any(
all(token in key for token in ("provider", "legal", "name"))
for key in keys
all(token in key for token in ("provider", "legal", "name")) for key in keys
)
has_university_title = any(
all(token in key for token in ("right", "use", "university"))
@ -235,13 +244,94 @@ def _postcode_lookup(postcodes_path: Path) -> dict[str, tuple[float, float]]:
df = pl.read_parquet(
postcodes_path,
columns=["pcds", "lat", "long", "ctry25cd", "doterm"],
).filter((pl.col("ctry25cd") == "E92000001") & pl.col("doterm").is_null())
).filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
return {
_normalize_postcode(postcode): (float(lat), float(lon))
for postcode, lat, lon in df.select(["pcds", "lat", "long"]).iter_rows()
}
def _display_city_from_tags(tags: dict[str, str]) -> str | None:
    """Derive a display city from explicit OSM context tags.

    Only the ``is_in*`` family and ``addr:city`` are consulted; this runs
    before any fallback to administrative data.

    Args:
        tags: OSM tag mapping for a single node.

    Returns:
        ``"London"`` when any consulted tag mentions London as a standalone
        word, otherwise ``None``.
    """
    context_keys = ("is_in", "is_in:city", "is_in:town", "is_in:county", "addr:city")
    context_values = (tags.get(context_key) for context_key in context_keys)
    # Missing or empty tag values are skipped before the regex is applied.
    mentions_london = any(
        text and _LONDON_TOKEN_RE.search(text) for text in context_values
    )
    return "London" if mentions_london else None
def _is_london_admin_expr() -> pl.Expr:
    """Build a boolean expression flagging postcode rows inside Greater London.

    A row counts as London when any one of its administrative codes says so:
    ``rgn25cd`` equals ``LONDON_REGION_CODE``, ``lad25cd`` starts with
    ``LONDON_LAD_PREFIX``, or ``cty25cd`` is one of ``LONDON_COUNTY_CODES``.

    Returns:
        A polars expression over the ``rgn25cd``, ``lad25cd`` and ``cty25cd``
        columns that evaluates to ``True`` or ``False`` and never to null.
    """
    by_region = pl.col("rgn25cd") == LONDON_REGION_CODE
    by_local_authority = pl.col("lad25cd").str.starts_with(LONDON_LAD_PREFIX)
    by_county = pl.col("cty25cd").is_in(LONDON_COUNTY_CODES)
    # A test on a null code can yield null, and `null | False` stays null.
    # Filling the combined result (rather than a single branch) guarantees a
    # real boolean: a missing code just provides no evidence of London, while
    # a positive match in any other column still wins.
    return (by_region | by_local_authority | by_county).fill_null(False)
def _london_postcode_tree(postcodes_path: Path) -> tuple[cKDTree, np.ndarray]:
    """Index active England postcodes spatially, with a London flag per point.

    Rows are kept when ``ctry25cd`` is the England code, ``doterm`` is null
    and both grid coordinates are present.

    Args:
        postcodes_path: Parquet file providing the ``doterm``, ``ctry25cd``,
            ``east1m``, ``north1m``, ``rgn25cd``, ``lad25cd`` and ``cty25cd``
            columns.

    Returns:
        A KD-tree over ``(east1m, north1m)`` and a boolean array, aligned with
        the tree's points, that is true where the postcode is in London.

    Raises:
        ValueError: If no rows survive the filters.
    """
    wanted_columns = [
        "doterm",
        "ctry25cd",
        "east1m",
        "north1m",
        "rgn25cd",
        "lad25cd",
        "cty25cd",
    ]
    is_active_england = (pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col(
        "doterm"
    ).is_null()
    has_grid_reference = (
        pl.col("east1m").is_not_null() & pl.col("north1m").is_not_null()
    )

    postcodes = pl.read_parquet(postcodes_path, columns=wanted_columns)
    postcodes = postcodes.filter(is_active_england)
    postcodes = postcodes.filter(has_grid_reference)
    postcodes = postcodes.with_columns(_is_london_admin_expr().alias("is_london"))
    postcodes = postcodes.select("east1m", "north1m", "is_london")

    if postcodes.is_empty():
        raise ValueError(f"No active England postcodes in {postcodes_path}")

    # One float64 column per axis, stacked into an (n, 2) array for the tree.
    grid_xy = np.column_stack(
        [
            postcodes[axis].to_numpy().astype(np.float64)
            for axis in ("east1m", "north1m")
        ]
    )
    london_flags = postcodes["is_london"].to_numpy().astype(bool)
    return cKDTree(grid_xy), london_flags
def _assign_london_display_city(
    places: list[dict],
    postcodes_path: Path,
    max_distance_m: float = DISPLAY_CITY_NEAREST_POSTCODE_MAX_M,
) -> int:
    """Label places whose nearest active postcode lies in Greater London.

    Places are modified in place. Entries that already have a truthy
    ``display_city`` and entries whose ``place_type`` is ``"city"`` are never
    touched.

    Args:
        places: Place dicts with at least ``lat`` and ``lon`` keys.
        postcodes_path: Postcode parquet used to build the nearest-postcode index.
        max_distance_m: Largest allowed distance to the nearest postcode for
            its London flag to be applied.

    Returns:
        The number of places newly given ``display_city = "London"``.
    """
    if not places:
        return 0

    tree, london_flags = _london_postcode_tree(postcodes_path)

    count = len(places)
    lon_values = np.fromiter(
        (float(entry["lon"]) for entry in places), dtype=np.float64, count=count
    )
    lat_values = np.fromiter(
        (float(entry["lat"]) for entry in places), dtype=np.float64, count=count
    )
    # The transformer takes (lon, lat) and yields grid eastings/northings.
    east, north = WGS84_TO_BNG.transform(lon_values, lat_values)
    nearest_distance, nearest_index = tree.query(np.column_stack([east, north]))

    # True where the closest postcode is both near enough and flagged London.
    near_london_postcode = (nearest_distance <= max_distance_m) & london_flags[
        nearest_index
    ]

    newly_labelled = 0
    for entry, qualifies in zip(places, near_london_postcode):
        if entry.get("display_city") or entry.get("place_type") == "city":
            continue
        if not qualifies:
            continue
        entry["display_city"] = "London"
        newly_labelled += 1
    return newly_labelled
def _ofs_universities(
raw: pl.DataFrame, postcode_coords: dict[str, tuple[float, float]]
) -> tuple[list[dict], int]:
@ -277,6 +367,7 @@ def _ofs_universities(
"lon": lon,
"population": 0,
"travel_destination": True,
"display_city": None,
}
)
@ -354,6 +445,7 @@ def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
"lon": station["lon_sum"] / count,
"population": 0,
"travel_destination": True,
"display_city": None,
}
)
@ -388,6 +480,7 @@ class PlaceHandler(osmium.SimpleHandler):
lon: float,
population: int,
travel_destination: bool,
display_city: str | None = None,
) -> None:
self.places.append(
{
@ -397,6 +490,7 @@ class PlaceHandler(osmium.SimpleHandler):
"lon": lon,
"population": population,
"travel_destination": travel_destination,
"display_city": display_city,
}
)
self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False)
@ -414,18 +508,19 @@ class PlaceHandler(osmium.SimpleHandler):
if not self._england.contains(Point(lon, lat)):
return
name = n.tags.get("name:en", n.tags.get("name", ""))
tags = dict(n.tags)
name = tags.get("name:en", tags.get("name", ""))
if not name:
return
pop_str = n.tags.get("population", "")
pop_str = tags.get("population", "")
try:
population = int(pop_str)
except ValueError:
population = 0
# place=* nodes
place_type = n.tags.get("place")
place_type = tags.get("place")
if place_type in SEARCH_PLACE_TYPES:
self._add(
name,
@ -434,12 +529,14 @@ class PlaceHandler(osmium.SimpleHandler):
lon,
population,
travel_destination=place_type in TRAVEL_DESTINATION_PLACE_TYPES,
display_city=None
if place_type == "city"
else _display_city_from_tags(tags),
)
return
# Railway stations (tube, national rail, DLR, overground, Elizabeth line)
if n.tags.get("railway") == "station":
tags = dict(n.tags)
if tags.get("railway") == "station":
if _is_tram_station(tags):
return
display_name = _station_display_name(name, tags)
@ -450,6 +547,7 @@ class PlaceHandler(osmium.SimpleHandler):
lon,
population,
travel_destination=True,
display_city=_display_city_from_tags(tags),
)
return
@ -479,7 +577,10 @@ def main() -> None:
parser.add_argument(
"--postcodes",
type=Path,
help="Postcode parquet used to geocode OfS university contact postcodes",
help=(
"Postcode parquet used to geocode OfS university contact postcodes "
"and assign Greater London display labels"
),
)
args = parser.parse_args()
@ -507,14 +608,18 @@ def main() -> None:
added, skipped = _append_ofs_universities(
handler.places, args.university_register, args.postcodes
)
print(
f"Added {added:,} university travel destinations from the OfS register"
)
print(f"Added {added:,} university travel destinations from the OfS register")
if skipped:
print(f"Skipped {skipped:,} OfS university rows without usable coordinates")
if handler.places:
if args.postcodes:
assigned = _assign_london_display_city(handler.places, args.postcodes)
print(f"Assigned London display labels to {assigned:,} places")
for place in handler.places:
place.setdefault("display_city", None)
df = pl.DataFrame(handler.places)
df = df.with_columns(pl.col("display_city").cast(pl.Utf8))
args.output.parent.mkdir(parents=True, exist_ok=True)
df.write_parquet(args.output)
print(f"Saved to {args.output}")

View file

@ -1,6 +1,9 @@
import polars as pl
from pyproj import Transformer
from pipeline.download.places import (
_assign_london_display_city,
_display_city_from_tags,
_is_dlr_station,
_is_tram_station,
_naptan_dlr_stations,
@ -9,6 +12,22 @@ from pipeline.download.places import (
_station_display_name,
)
# Projects WGS84 (lon, lat) to British National Grid (EPSG:27700) so postcode
# fixtures can carry easting/northing values derived from their lat/lon.
WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
def _postcode_row(postcode: str, lat: float, lon: float, *, london: bool) -> dict:
    """Build one active England postcode fixture row.

    Args:
        postcode: Value for the ``pcds`` column.
        lat: WGS84 latitude, projected to ``north1m``.
        lon: WGS84 longitude, projected to ``east1m``.
        london: Selects the London or the non-London set of region,
            local-authority and county codes.

    Returns:
        A dict with the columns the London postcode index reads.
    """
    if london:
        region, local_authority, county = "E12000007", "E09000008", "E13000002"
    else:
        region, local_authority, county = "E12000008", "E07000208", "E10000030"

    grid_east, grid_north = WGS84_TO_BNG.transform(lon, lat)
    return {
        "pcds": postcode,
        "doterm": None,
        "ctry25cd": "E92000001",
        "east1m": int(round(grid_east)),
        "north1m": int(round(grid_north)),
        "rgn25cd": region,
        "lad25cd": local_authority,
        "cty25cd": county,
    }
def test_dlr_light_rail_is_not_treated_as_tram():
dlr_tags = {
@ -144,5 +163,56 @@ def test_ofs_universities_extracts_university_title_rows_with_postcode_coords():
"lon": -1.2643,
"population": 0,
"travel_destination": True,
"display_city": None,
}
]
def test_display_city_from_tags_uses_explicit_london_context():
    """An ``is_in`` tag naming London yields "London"; other contexts yield None."""
    london_context = {"is_in": "Croydon, London, UK"}
    other_context = {"is_in": "Croydon, Cambridgeshire, UK"}

    assert _display_city_from_tags(london_context) == "London"
    assert _display_city_from_tags(other_context) is None
def test_assign_london_display_city_uses_nearest_active_postcode_admin(tmp_path):
    """Places nearest a London postcode get labelled; one nearest a non-London postcode does not."""

    def unlabelled_place(name, place_type, lat, lon, population, travel_destination):
        # Same key order as the place dicts produced by the pipeline.
        return {
            "name": name,
            "place_type": place_type,
            "lat": lat,
            "lon": lon,
            "population": population,
            "travel_destination": travel_destination,
            "display_city": None,
        }

    postcode_rows = [
        _postcode_row("CR0 1SZ", 51.371273, -0.101793, london=True),
        _postcode_row("KT19 8AG", 51.3326, -0.2678, london=False),
    ]
    postcodes = tmp_path / "postcodes.parquet"
    pl.DataFrame(postcode_rows).write_parquet(postcodes)

    croydon = unlabelled_place("Croydon", "town", 51.3713049, -0.101957, 173314, False)
    east_croydon = unlabelled_place(
        "East Croydon railway station", "station", 51.375845, -0.092732, 0, True
    )
    epsom = unlabelled_place("Epsom", "town", 51.3326, -0.2678, 31489, False)
    places = [croydon, east_croydon, epsom]

    assigned = _assign_london_display_city(places, postcodes)

    assert assigned == 2
    labels = [place["display_city"] for place in places]
    assert labels == ["London", "London", None]