all good
This commit is contained in:
parent
47d89f6fad
commit
017902b8e6
82 changed files with 331466 additions and 54841 deletions
|
|
@ -10,9 +10,12 @@ import argparse
|
|||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import osmium
|
||||
import polars as pl
|
||||
from scipy.spatial import cKDTree
|
||||
from shapely.geometry import Point
|
||||
from pyproj import Transformer
|
||||
from tqdm import tqdm
|
||||
|
||||
from pipeline.utils.england_geometry import (
|
||||
|
|
@ -39,6 +42,12 @@ SEARCH_PLACE_TYPES = {
|
|||
"island",
|
||||
}
|
||||
TRAVEL_DESTINATION_PLACE_TYPES = {"city"}
|
||||
ENGLAND_COUNTRY_CODE = "E92000001"
|
||||
LONDON_REGION_CODE = "E12000007"
|
||||
LONDON_LAD_PREFIX = "E09"
|
||||
LONDON_COUNTY_CODES = {"E13000001", "E13000002"}
|
||||
DISPLAY_CITY_NEAREST_POSTCODE_MAX_M = 3_000
|
||||
WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
|
||||
|
||||
# Suffixes to strip from raw station names before appending the typed suffix.
|
||||
_STATION_STRIP = (
|
||||
|
|
@ -55,6 +64,7 @@ _STATION_STRIP = (
|
|||
|
||||
_DLR_CODE_RE = re.compile(r"ZZDL([A-Z0-9]{3})")
|
||||
_POSTCODE_RE = re.compile(r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b", re.I)
|
||||
_LONDON_TOKEN_RE = re.compile(r"(^|[^a-z])london([^a-z]|$)", re.I)
|
||||
|
||||
_NOISY_PROVIDER_SUFFIXES = (
|
||||
" higher education corporation",
|
||||
|
|
@ -152,8 +162,7 @@ def _find_header_row(rows: list[tuple]) -> int:
|
|||
for idx, row in enumerate(rows):
|
||||
keys = [_header_key(value) for value in row]
|
||||
has_legal_name = any(
|
||||
all(token in key for token in ("provider", "legal", "name"))
|
||||
for key in keys
|
||||
all(token in key for token in ("provider", "legal", "name")) for key in keys
|
||||
)
|
||||
has_university_title = any(
|
||||
all(token in key for token in ("right", "use", "university"))
|
||||
|
|
@ -235,13 +244,94 @@ def _postcode_lookup(postcodes_path: Path) -> dict[str, tuple[float, float]]:
|
|||
df = pl.read_parquet(
|
||||
postcodes_path,
|
||||
columns=["pcds", "lat", "long", "ctry25cd", "doterm"],
|
||||
).filter((pl.col("ctry25cd") == "E92000001") & pl.col("doterm").is_null())
|
||||
).filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
|
||||
return {
|
||||
_normalize_postcode(postcode): (float(lat), float(lon))
|
||||
for postcode, lat, lon in df.select(["pcds", "lat", "long"]).iter_rows()
|
||||
}
|
||||
|
||||
|
||||
def _display_city_from_tags(tags: dict[str, str]) -> str | None:
    """Derive a display city from explicit OSM locality tags.

    The ``is_in``, ``is_in:city``, ``is_in:town``, ``is_in:county`` and
    ``addr:city`` tags are inspected; if any non-empty value matches
    ``_LONDON_TOKEN_RE`` the place is labelled as London.

    Args:
        tags: Tag key/value mapping of a single OSM element.

    Returns:
        ``"London"`` when one of the inspected tags matches, otherwise
        ``None`` so that callers can fall back to another source.
    """
    context_keys = ("is_in", "is_in:city", "is_in:town", "is_in:county", "addr:city")
    candidate_values = (tags.get(tag_key) for tag_key in context_keys)
    mentions_london = any(
        text and _LONDON_TOKEN_RE.search(text) for text in candidate_values
    )
    return "London" if mentions_london else None
|
||||
|
||||
|
||||
def _is_london_admin_expr() -> pl.Expr:
    """Build a boolean polars expression flagging Greater London rows.

    A row is flagged when its ``rgn25cd`` equals ``LONDON_REGION_CODE``, its
    ``lad25cd`` starts with ``LONDON_LAD_PREFIX``, or its ``cty25cd`` is one
    of ``LONDON_COUNTY_CODES``.

    Returns:
        An expression over the ``rgn25cd``, ``lad25cd`` and ``cty25cd``
        columns that evaluates to ``True`` or ``False`` for every row; rows
        where no check matches and some code is null evaluate to ``False``.
    """
    in_london_region = pl.col("rgn25cd") == LONDON_REGION_CODE
    in_london_borough = pl.col("lad25cd").str.starts_with(LONDON_LAD_PREFIX)
    in_london_county = pl.col("cty25cd").is_in(LONDON_COUNTY_CODES)
    # Polars ORs booleans with Kleene logic: ``null | False`` is null. Guarding
    # only one operand would let a null code in another column leak through
    # as a null flag, so the null fill is applied to the combined result.
    return (in_london_region | in_london_borough | in_london_county).fill_null(False)
|
||||
|
||||
|
||||
def _london_postcode_tree(postcodes_path: Path) -> tuple[cKDTree, np.ndarray]:
    """Index active England postcodes spatially and flag the London ones.

    Rows are kept when ``ctry25cd`` equals ``ENGLAND_COUNTRY_CODE``, ``doterm``
    is null, and both ``east1m`` and ``north1m`` are present.

    Args:
        postcodes_path: Parquet file providing the ``doterm``, ``ctry25cd``,
            ``east1m``, ``north1m``, ``rgn25cd``, ``lad25cd`` and ``cty25cd``
            columns.

    Returns:
        A ``cKDTree`` built over ``(east1m, north1m)`` pairs, and a boolean
        array whose entry at each position tells whether
        ``_is_london_admin_expr`` matched the row at the same position.

    Raises:
        ValueError: If no row survives the filters.
    """
    wanted_columns = [
        "doterm",
        "ctry25cd",
        "east1m",
        "north1m",
        "rgn25cd",
        "lad25cd",
        "cty25cd",
    ]
    is_active_english = (pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col(
        "doterm"
    ).is_null()
    has_grid_position = (
        pl.col("east1m").is_not_null() & pl.col("north1m").is_not_null()
    )

    frame = pl.read_parquet(postcodes_path, columns=wanted_columns)
    frame = frame.filter(is_active_english)
    frame = frame.filter(has_grid_position)
    frame = frame.with_columns(_is_london_admin_expr().alias("is_london"))
    frame = frame.select("east1m", "north1m", "is_london")

    if frame.is_empty():
        raise ValueError(f"No active England postcodes in {postcodes_path}")

    # One float64 column per axis, stacked into an (n_rows, 2) array.
    axis_values = [
        frame[axis_name].to_numpy().astype(np.float64)
        for axis_name in ("east1m", "north1m")
    ]
    grid_points = np.column_stack(axis_values)
    is_london = frame["is_london"].to_numpy().astype(bool)
    return cKDTree(grid_points), is_london
|
||||
|
||||
|
||||
def _assign_london_display_city(
    places: list[dict],
    postcodes_path: Path,
    max_distance_m: float = DISPLAY_CITY_NEAREST_POSTCODE_MAX_M,
) -> int:
    """Label places as London when their nearest postcode is a London one.

    Each place's ``lon``/``lat`` is projected with ``WGS84_TO_BNG`` and matched
    to the nearest point in the tree from ``_london_postcode_tree``. Places
    that already have a truthy ``display_city``, or whose ``place_type`` is
    ``"city"``, are left untouched.

    Args:
        places: Place dicts with ``lat`` and ``lon`` entries; updated in place.
        postcodes_path: Postcode parquet passed to ``_london_postcode_tree``.
        max_distance_m: Largest nearest-postcode distance (in the tree's
            coordinate units) at which a label is still assigned.

    Returns:
        The number of places whose ``display_city`` was set to ``"London"``.
    """
    if not places:
        return 0

    tree, london_flags = _london_postcode_tree(postcodes_path)

    longitudes = np.array([float(entry["lon"]) for entry in places], dtype=np.float64)
    latitudes = np.array([float(entry["lat"]) for entry in places], dtype=np.float64)
    # The transformer is created with always_xy=True, hence (lon, lat) order.
    xs, ys = WGS84_TO_BNG.transform(longitudes, latitudes)
    nearest_distance, nearest_row = tree.query(np.column_stack([xs, ys]))

    labelled_count = 0
    for entry, gap, row in zip(places, nearest_distance, nearest_row):
        already_labelled = bool(entry.get("display_city"))
        if already_labelled or entry.get("place_type") == "city":
            continue
        within_reach = gap <= max_distance_m
        if within_reach and london_flags[row]:
            entry["display_city"] = "London"
            labelled_count += 1
    return labelled_count
|
||||
|
||||
|
||||
def _ofs_universities(
|
||||
raw: pl.DataFrame, postcode_coords: dict[str, tuple[float, float]]
|
||||
) -> tuple[list[dict], int]:
|
||||
|
|
@ -277,6 +367,7 @@ def _ofs_universities(
|
|||
"lon": lon,
|
||||
"population": 0,
|
||||
"travel_destination": True,
|
||||
"display_city": None,
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -354,6 +445,7 @@ def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
|
|||
"lon": station["lon_sum"] / count,
|
||||
"population": 0,
|
||||
"travel_destination": True,
|
||||
"display_city": None,
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -388,6 +480,7 @@ class PlaceHandler(osmium.SimpleHandler):
|
|||
lon: float,
|
||||
population: int,
|
||||
travel_destination: bool,
|
||||
display_city: str | None = None,
|
||||
) -> None:
|
||||
self.places.append(
|
||||
{
|
||||
|
|
@ -397,6 +490,7 @@ class PlaceHandler(osmium.SimpleHandler):
|
|||
"lon": lon,
|
||||
"population": population,
|
||||
"travel_destination": travel_destination,
|
||||
"display_city": display_city,
|
||||
}
|
||||
)
|
||||
self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False)
|
||||
|
|
@ -414,18 +508,19 @@ class PlaceHandler(osmium.SimpleHandler):
|
|||
if not self._england.contains(Point(lon, lat)):
|
||||
return
|
||||
|
||||
name = n.tags.get("name:en", n.tags.get("name", ""))
|
||||
tags = dict(n.tags)
|
||||
name = tags.get("name:en", tags.get("name", ""))
|
||||
if not name:
|
||||
return
|
||||
|
||||
pop_str = n.tags.get("population", "")
|
||||
pop_str = tags.get("population", "")
|
||||
try:
|
||||
population = int(pop_str)
|
||||
except ValueError:
|
||||
population = 0
|
||||
|
||||
# place=* nodes
|
||||
place_type = n.tags.get("place")
|
||||
place_type = tags.get("place")
|
||||
if place_type in SEARCH_PLACE_TYPES:
|
||||
self._add(
|
||||
name,
|
||||
|
|
@ -434,12 +529,14 @@ class PlaceHandler(osmium.SimpleHandler):
|
|||
lon,
|
||||
population,
|
||||
travel_destination=place_type in TRAVEL_DESTINATION_PLACE_TYPES,
|
||||
display_city=None
|
||||
if place_type == "city"
|
||||
else _display_city_from_tags(tags),
|
||||
)
|
||||
return
|
||||
|
||||
# Railway stations (tube, national rail, DLR, overground, Elizabeth line)
|
||||
if n.tags.get("railway") == "station":
|
||||
tags = dict(n.tags)
|
||||
if tags.get("railway") == "station":
|
||||
if _is_tram_station(tags):
|
||||
return
|
||||
display_name = _station_display_name(name, tags)
|
||||
|
|
@ -450,6 +547,7 @@ class PlaceHandler(osmium.SimpleHandler):
|
|||
lon,
|
||||
population,
|
||||
travel_destination=True,
|
||||
display_city=_display_city_from_tags(tags),
|
||||
)
|
||||
return
|
||||
|
||||
|
|
@ -479,7 +577,10 @@ def main() -> None:
|
|||
parser.add_argument(
|
||||
"--postcodes",
|
||||
type=Path,
|
||||
help="Postcode parquet used to geocode OfS university contact postcodes",
|
||||
help=(
|
||||
"Postcode parquet used to geocode OfS university contact postcodes "
|
||||
"and assign Greater London display labels"
|
||||
),
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
|
@ -507,14 +608,18 @@ def main() -> None:
|
|||
added, skipped = _append_ofs_universities(
|
||||
handler.places, args.university_register, args.postcodes
|
||||
)
|
||||
print(
|
||||
f"Added {added:,} university travel destinations from the OfS register"
|
||||
)
|
||||
print(f"Added {added:,} university travel destinations from the OfS register")
|
||||
if skipped:
|
||||
print(f"Skipped {skipped:,} OfS university rows without usable coordinates")
|
||||
|
||||
if handler.places:
|
||||
if args.postcodes:
|
||||
assigned = _assign_london_display_city(handler.places, args.postcodes)
|
||||
print(f"Assigned London display labels to {assigned:,} places")
|
||||
for place in handler.places:
|
||||
place.setdefault("display_city", None)
|
||||
df = pl.DataFrame(handler.places)
|
||||
df = df.with_columns(pl.col("display_city").cast(pl.Utf8))
|
||||
args.output.parent.mkdir(parents=True, exist_ok=True)
|
||||
df.write_parquet(args.output)
|
||||
print(f"Saved to {args.output}")
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
import polars as pl
|
||||
from pyproj import Transformer
|
||||
|
||||
from pipeline.download.places import (
|
||||
_assign_london_display_city,
|
||||
_display_city_from_tags,
|
||||
_is_dlr_station,
|
||||
_is_tram_station,
|
||||
_naptan_dlr_stations,
|
||||
|
|
@ -9,6 +12,22 @@ from pipeline.download.places import (
|
|||
_station_display_name,
|
||||
)
|
||||
|
||||
WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
|
||||
|
||||
|
||||
def _postcode_row(postcode: str, lat: float, lon: float, *, london: bool) -> dict:
    """Build one synthetic active England postcode row for the tests.

    Args:
        postcode: Value stored under ``pcds``.
        lat: Latitude, projected together with ``lon`` via ``WGS84_TO_BNG``.
        lon: Longitude.
        london: Selects which of two fixed sets of region, local-authority
            and county codes is written to the row.

    Returns:
        A dict with ``pcds``, ``doterm``, ``ctry25cd``, ``east1m``,
        ``north1m``, ``rgn25cd``, ``lad25cd`` and ``cty25cd`` keys.
    """
    grid_x, grid_y = WGS84_TO_BNG.transform(lon, lat)
    if london:
        region, district, county = "E12000007", "E09000008", "E13000002"
    else:
        region, district, county = "E12000008", "E07000208", "E10000030"
    return dict(
        pcds=postcode,
        doterm=None,
        ctry25cd="E92000001",
        east1m=int(round(grid_x)),
        north1m=int(round(grid_y)),
        rgn25cd=region,
        lad25cd=district,
        cty25cd=county,
    )
|
||||
|
||||
|
||||
def test_dlr_light_rail_is_not_treated_as_tram():
|
||||
dlr_tags = {
|
||||
|
|
@ -144,5 +163,56 @@ def test_ofs_universities_extracts_university_title_rows_with_postcode_coords():
|
|||
"lon": -1.2643,
|
||||
"population": 0,
|
||||
"travel_destination": True,
|
||||
"display_city": None,
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def test_display_city_from_tags_uses_explicit_london_context():
    """An ``is_in`` value naming London yields "London"; one without it yields None."""
    london_context = {"is_in": "Croydon, London, UK"}
    other_context = {"is_in": "Croydon, Cambridgeshire, UK"}

    assert _display_city_from_tags(london_context) == "London"
    assert _display_city_from_tags(other_context) is None
|
||||
|
||||
|
||||
def test_assign_london_display_city_uses_nearest_active_postcode_admin(tmp_path):
    """Places nearest a London-coded postcode get the label; others stay None.

    The fixture holds one London-coded postcode (CR0 1SZ) and one non-London
    postcode (KT19 8AG). The two Croydon places lie nearest the former, while
    Epsom shares its coordinates with the latter.
    """
    postcodes = tmp_path / "postcodes.parquet"
    postcode_rows = [
        _postcode_row("CR0 1SZ", 51.371273, -0.101793, london=True),
        _postcode_row("KT19 8AG", 51.3326, -0.2678, london=False),
    ]
    pl.DataFrame(postcode_rows).write_parquet(postcodes)

    def make_place(name, place_type, lat, lon, population, travel_destination):
        # Every fixture place starts without a display city.
        return {
            "name": name,
            "place_type": place_type,
            "lat": lat,
            "lon": lon,
            "population": population,
            "travel_destination": travel_destination,
            "display_city": None,
        }

    places = [
        make_place("Croydon", "town", 51.3713049, -0.101957, 173314, False),
        make_place(
            "East Croydon railway station", "station", 51.375845, -0.092732, 0, True
        ),
        make_place("Epsom", "town", 51.3326, -0.2678, 31489, False),
    ]

    assigned = _assign_london_display_city(places, postcodes)

    labels = [place["display_city"] for place in places]
    assert assigned == 2
    assert labels == ["London", "London", None]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue