perfect-postcode/pipeline/download/test_places.py
2026-05-17 10:16:30 +01:00

218 lines
6.5 KiB
Python

import math

import polars as pl
from pyproj import Transformer

from pipeline.download.places import (
    _assign_london_display_city,
    _display_city_from_tags,
    _is_dlr_station,
    _is_tram_station,
    _naptan_dlr_stations,
    _ofs_universities,
    _select_university_name,
    _station_display_name,
)
# Module-level transformer from EPSG:4326 (WGS84 longitude/latitude) to
# EPSG:27700 (British National Grid). It is built with always_xy=True, so
# callers pass coordinates in (lon, lat) order, as _postcode_row does.
WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
def _postcode_row(postcode: str, lat: float, lon: float, *, london: bool) -> dict:
    """Build one synthetic postcode row for the fixtures in this module.

    The latitude/longitude pair is projected with ``WGS84_TO_BNG`` and each
    projected coordinate is rounded to the nearest integer for the
    ``east1m`` / ``north1m`` fields.

    Args:
        postcode: Value stored under ``pcds``.
        lat: Latitude of the postcode.
        lon: Longitude of the postcode.
        london: Chooses which of the two hard-coded sets of region, local
            authority and county codes the row carries.

    Returns:
        A dict with the keys ``pcds``, ``doterm``, ``ctry25cd``, ``east1m``,
        ``north1m``, ``rgn25cd``, ``lad25cd`` and ``cty25cd``; ``doterm`` is
        always ``None``.
    """
    # The transformer takes its arguments in (lon, lat) order.
    x_coord, y_coord = WGS84_TO_BNG.transform(lon, lat)

    if london:
        region_code, authority_code, county_code = (
            "E12000007",
            "E09000008",
            "E13000002",
        )
    else:
        region_code, authority_code, county_code = (
            "E12000008",
            "E07000208",
            "E10000030",
        )

    return {
        "pcds": postcode,
        "doterm": None,
        "ctry25cd": "E92000001",
        "east1m": int(round(x_coord)),
        "north1m": int(round(y_coord)),
        "rgn25cd": region_code,
        "lad25cd": authority_code,
        "cty25cd": county_code,
    }
def test_dlr_light_rail_is_not_treated_as_tram():
    """A light-rail station tagged with the DLR network counts as DLR, not tram.

    Also checks the display names produced for two raw station names that
    are passed together with the same DLR tags.
    """
    tags = {
        "name": "Lewisham DLR",
        "railway": "station",
        "station": "light_rail",
        "network": "Docklands Light Railway",
    }

    assert _is_dlr_station(tags)
    assert not _is_tram_station(tags)

    # Raw station name -> display name expected for it.
    expected_display_names = {
        "Lewisham DLR": "Lewisham DLR station",
        "Tower Gateway Station DLR": "Tower Gateway DLR station",
    }
    for raw_name, display_name in expected_display_names.items():
        assert _station_display_name(raw_name, tags) == display_name
def test_tram_light_rail_is_still_excluded():
    """A light-rail station on the London Trams network is a tram, not DLR."""
    tags = {
        "name": "East Croydon",
        "railway": "station",
        "station": "light_rail",
        "network": "London Trams",
    }

    assert _is_tram_station(tags)
    assert not _is_dlr_station(tags)
def test_naptan_dlr_stations_are_deduplicated_by_atco_code(tmp_path):
    """Check that ``_naptan_dlr_stations`` merges duplicate DLR rows.

    A five-row NaPTAN-style parquet file is written to ``tmp_path``. It holds
    two Shadwell rows, one Greenwich row, a row categorised as a bus stop and
    a row for Bank. The function is expected to return exactly two stations,
    Greenwich then Shadwell, with the Shadwell latitude equal to the mean of
    its two source rows.

    Args:
        tmp_path: pytest's per-test temporary directory fixture.
    """
    naptan = tmp_path / "naptan.parquet"
    pl.DataFrame(
        {
            "id": [
                "4900ZZDLSHA3",
                "9400ZZDLSHA",
                "4900ZZDLGRE1",
                "490002076RV",
                "4900ZZLUBNK",
            ],
            "name": [
                "Shadwell DLR",
                "Shadwell DLR Station",
                "Greenwich Station",
                "Tower Gateway Station DLR",
                "Bank",
            ],
            "category": [
                "Tube station",
                "Tube station",
                "Rail station",
                "Bus stop",
                "Tube station",
            ],
            "lat": [51.51156, 51.511693, 51.47794, 51.510575, 51.5131],
            "lng": [-0.055595, -0.056643, -0.01442, -0.07514, -0.0894],
        }
    ).write_parquet(naptan)

    stations = _naptan_dlr_stations(naptan)

    assert [station["name"] for station in stations] == [
        "Greenwich DLR station",
        "Shadwell DLR station",
    ]

    shadwell = next(
        station for station in stations if station["name"].startswith("Shadwell")
    )
    # The merged latitude is a computed float mean, so compare with a
    # tolerance: exact equality would depend on how the implementation sums
    # and stores the values.
    assert math.isclose(shadwell["lat"], (51.51156 + 51.511693) / 2)
    assert shadwell["place_type"] == "station"
    assert shadwell["travel_destination"] is True
def test_select_university_name_prefers_public_trading_name_for_noisy_legal_name():
    """Check the name chosen for several legal-name / trading-name pairs.

    Each case is ``(legal name, trading names field, expected result)``; the
    trading names field may hold several names separated by newlines or the
    placeholder "Not applicable".
    """
    cases = [
        (
            "The Chancellor, Masters and Scholars of the University of Oxford",
            "Oxford University\nThe University of Oxford",
            "Oxford University",
        ),
        (
            "Bournemouth University Higher Education Corporation",
            "Bournemouth University",
            "Bournemouth University",
        ),
        (
            "The University of Surrey",
            "Not applicable",
            "University of Surrey",
        ),
    ]

    for legal_name, trading_names, expected in cases:
        assert _select_university_name(legal_name, trading_names) == expected
def test_ofs_universities_extracts_university_title_rows_with_postcode_coords():
    """Check ``_ofs_universities`` on a small register with preamble rows.

    The frame has two preamble rows, a column-title row, one provider marked
    "Yes" for university title and one marked "No". Only the "Yes" provider
    is expected back, located via the supplied postcode lookup, with nothing
    reported as skipped.
    """
    preamble_rows = [
        ["OfS Register", None, None, None],
        ["Note row", None, None, None],
    ]
    column_titles = [
        "Provider's legal name",
        "Provider's trading name(s)",
        "Provider's contact address",
        "Does the provider have the right to use university in its title?",
    ]
    oxford_row = [
        "The Chancellor, Masters and Scholars of the University of Oxford",
        "Oxford University\nThe University of Oxford",
        "University Offices\nWellington Square\nOxford\nOX1 2JD\nUnited Kingdom",
        "Yes",
    ]
    college_row = [
        "Example College",
        "Not applicable",
        "Example Street\nLondon\nSW1A 1AA\nUnited Kingdom",
        "No",
    ]
    raw_register = pl.DataFrame(
        [*preamble_rows, column_titles, oxford_row, college_row],
        orient="row",
    )
    # The lookup key is the address postcode "OX1 2JD" without its space.
    postcode_coords = {"OX12JD": (51.7585, -1.2643)}

    universities, skipped = _ofs_universities(raw_register, postcode_coords)

    expected_oxford = {
        "name": "Oxford University",
        "place_type": "university",
        "lat": 51.7585,
        "lon": -1.2643,
        "population": 0,
        "travel_destination": True,
        "display_city": None,
    }
    assert skipped == 0
    assert universities == [expected_oxford]
def test_display_city_from_tags_uses_explicit_london_context():
    """An ``is_in`` tag yields "London" only when it lists London."""
    london_tags = {"is_in": "Croydon, London, UK"}
    other_tags = {"is_in": "Croydon, Cambridgeshire, UK"}

    london_city = _display_city_from_tags(london_tags)
    assert london_city == "London"

    other_city = _display_city_from_tags(other_tags)
    assert other_city is None
def test_assign_london_display_city_uses_nearest_active_postcode_admin(tmp_path):
    """Check that places near a London postcode row are labelled "London".

    Two postcode rows are written to a parquet file: CR0 1SZ built with
    ``london=True`` and KT19 8AG built with ``london=False``. The Croydon
    town and East Croydon station sit near the first row, while Epsom shares
    the coordinates of the second. The function is expected to report two
    assignments and to set ``display_city`` in place on the place dicts.

    Args:
        tmp_path: pytest's per-test temporary directory fixture.
    """
    postcodes = tmp_path / "postcodes.parquet"
    postcode_rows = [
        _postcode_row("CR0 1SZ", 51.371273, -0.101793, london=True),
        _postcode_row("KT19 8AG", 51.3326, -0.2678, london=False),
    ]
    pl.DataFrame(postcode_rows).write_parquet(postcodes)

    croydon_town = {
        "name": "Croydon",
        "place_type": "town",
        "lat": 51.3713049,
        "lon": -0.101957,
        "population": 173314,
        "travel_destination": False,
        "display_city": None,
    }
    east_croydon_station = {
        "name": "East Croydon railway station",
        "place_type": "station",
        "lat": 51.375845,
        "lon": -0.092732,
        "population": 0,
        "travel_destination": True,
        "display_city": None,
    }
    epsom_town = {
        "name": "Epsom",
        "place_type": "town",
        "lat": 51.3326,
        "lon": -0.2678,
        "population": 31489,
        "travel_destination": False,
        "display_city": None,
    }
    places = [croydon_town, east_croydon_station, epsom_town]

    assigned = _assign_london_display_city(places, postcodes)

    assert assigned == 2
    display_cities = [place["display_city"] for place in places]
    assert display_cities == ["London", "London", None]