381 lines
12 KiB
Python
381 lines
12 KiB
Python
import numpy as np
|
|
import polars as pl
|
|
from pyproj import Transformer
|
|
from scipy.spatial import cKDTree
|
|
|
|
from pipeline.download.places import (
|
|
_assign_london_display_city,
|
|
_build_street_places,
|
|
_display_city_from_tags,
|
|
_is_dlr_station,
|
|
_is_tram_station,
|
|
_london_postcode_tree,
|
|
_naptan_dlr_stations,
|
|
_normalize_street_name,
|
|
_ofs_universities,
|
|
_outcode_of_postcode,
|
|
_outcode_tree,
|
|
_pois_to_places,
|
|
_select_university_name,
|
|
_station_display_name,
|
|
_street_centroid,
|
|
)
|
|
|
|
# Shared transformer from EPSG:4326 to EPSG:27700 (named WGS84 -> BNG here).
# always_xy=True fixes the axis order to (lon, lat) in and (easting, northing)
# out, which is the order the calls below pass and unpack coordinates in.
WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
|
|
|
|
|
|
def _postcode_row(postcode: str, lat: float, lon: float, *, london: bool) -> dict:
    """Build one postcode fixture record positioned at the given WGS84 point.

    The latitude/longitude pair is projected through ``WGS84_TO_BNG`` and the
    result is stored as whole-metre ``east1m`` / ``north1m`` values.

    Args:
        postcode: Value stored under ``pcds``.
        lat: Latitude of the postcode in degrees.
        lon: Longitude of the postcode in degrees.
        london: Selects which of two fixed ``rgn25cd`` / ``lad25cd`` /
            ``cty25cd`` code sets the row carries.

    Returns:
        A dict with the columns the tests write to the postcode parquet file.
        ``doterm`` is always ``None`` (presumably "not terminated", i.e. an
        active postcode -- confirm against the production schema).
    """
    # The transformer is always_xy, so longitude is passed first.
    bng_x, bng_y = WGS84_TO_BNG.transform(lon, lat)

    if london:
        region, district, county = "E12000007", "E09000008", "E13000002"
    else:
        region, district, county = "E12000008", "E07000208", "E10000030"

    return {
        "pcds": postcode,
        "doterm": None,
        "ctry25cd": "E92000001",
        "east1m": int(round(bng_x)),
        "north1m": int(round(bng_y)),
        "rgn25cd": region,
        "lad25cd": district,
        "cty25cd": county,
    }
|
|
|
|
|
|
def test_dlr_light_rail_is_not_treated_as_tram():
    """A light-rail station on the DLR network is DLR, not tram.

    Also pins the display names produced for two raw station names when the
    DLR tags are supplied.
    """
    tags = dict(
        name="Lewisham DLR",
        railway="station",
        station="light_rail",
        network="Docklands Light Railway",
    )

    assert _is_dlr_station(tags)
    assert not _is_tram_station(tags)

    # Raw station name -> display name expected for these DLR tags.
    expected_display_names = {
        "Lewisham DLR": "Lewisham DLR station",
        "Tower Gateway Station DLR": "Tower Gateway DLR station",
    }
    for raw_name, display_name in expected_display_names.items():
        assert _station_display_name(raw_name, tags) == display_name
|
|
|
|
|
|
def test_tram_light_rail_is_still_excluded():
    """A light-rail station on the London Trams network is tram, not DLR."""
    tags = dict(
        name="East Croydon",
        railway="station",
        station="light_rail",
        network="London Trams",
    )

    looks_like_dlr = _is_dlr_station(tags)
    assert not looks_like_dlr

    looks_like_tram = _is_tram_station(tags)
    assert looks_like_tram
|
|
|
|
|
|
def test_naptan_dlr_stations_are_deduplicated_by_atco_code(tmp_path):
    """NaPTAN rows for one DLR station collapse into a single place.

    The fixture holds five stops. The assertions expect exactly two stations
    back, in the order Greenwich then Shadwell: the two Shadwell rows are
    merged (their latitudes averaged), and the "Tower Gateway Station DLR"
    bus stop and "Bank" tube station rows are dropped.

    Args:
        tmp_path: pytest temporary directory; the NaPTAN parquet fixture is
            written there.
    """
    naptan = tmp_path / "naptan.parquet"
    pl.DataFrame(
        {
            "id": [
                "4900ZZDLSHA3",
                "9400ZZDLSHA",
                "4900ZZDLGRE1",
                "490002076RV",
                "4900ZZLUBNK",
            ],
            "name": [
                "Shadwell DLR",
                "Shadwell DLR Station",
                "Greenwich Station",
                "Tower Gateway Station DLR",
                "Bank",
            ],
            "category": [
                "Tram & Metro stop",
                "Tube station",
                "Rail station",
                "Bus stop",
                "Tube station",
            ],
            "lat": [51.51156, 51.511693, 51.47794, 51.510575, 51.5131],
            "lng": [-0.055595, -0.056643, -0.01442, -0.07514, -0.0894],
        }
    ).write_parquet(naptan)

    stations = _naptan_dlr_stations(naptan)

    assert [station["name"] for station in stations] == [
        "Greenwich DLR station",
        "Shadwell DLR station",
    ]
    shadwell = next(
        station for station in stations if station["name"].startswith("Shadwell")
    )
    # Compare the averaged latitude with a tolerance rather than bit-for-bit:
    # exact float equality would tie the test to the precise order of
    # arithmetic used by the implementation. 1e-9 degrees is far below any
    # meaningful positional difference.
    expected_lat = (51.51156 + 51.511693) / 2
    assert abs(shadwell["lat"] - expected_lat) <= 1e-9
    assert shadwell["place_type"] == "station"
    assert shadwell["travel_destination"] is True
|
|
|
|
|
|
def test_select_university_name_prefers_public_trading_name_for_noisy_legal_name():
    """Pin the chosen public name for three legal-name / trading-name pairs."""
    # (legal name, trading name field, expected selected name)
    cases = (
        (
            "The Chancellor, Masters and Scholars of the University of Oxford",
            "Oxford University\nThe University of Oxford",
            "Oxford University",
        ),
        (
            "Bournemouth University Higher Education Corporation",
            "Bournemouth University",
            "Bournemouth University",
        ),
        (
            "The University of Surrey",
            "Not applicable",
            "University of Surrey",
        ),
    )

    for legal_name, trading_names, expected in cases:
        assert _select_university_name(legal_name, trading_names) == expected
|
|
|
|
|
|
def test_ofs_universities_extracts_university_title_rows_with_postcode_coords():
    """Only the provider with university title becomes a place, at its postcode.

    The raw register fixture has two preamble rows, a header row, one provider
    flagged "Yes" for university title and one flagged "No". The postcode
    lookup passed in only knows the Oxford postcode (keyed without a space).
    """
    preamble = [
        ["OfS Register", None, None, None],
        ["Note row", None, None, None],
    ]
    header = [
        "Provider's legal name",
        "Provider's trading name(s)",
        "Provider's contact address",
        "Does the provider have the right to use university in its title?",
    ]
    oxford = [
        "The Chancellor, Masters and Scholars of the University of Oxford",
        "Oxford University\nThe University of Oxford",
        "University Offices\nWellington Square\nOxford\nOX1 2JD\nUnited Kingdom",
        "Yes",
    ]
    college = [
        "Example College",
        "Not applicable",
        "Example Street\nLondon\nSW1A 1AA\nUnited Kingdom",
        "No",
    ]
    raw_register = pl.DataFrame([*preamble, header, oxford, college], orient="row")
    postcode_coords = {"OX12JD": (51.7585, -1.2643)}

    universities, skipped = _ofs_universities(raw_register, postcode_coords)

    expected_oxford = {
        "name": "Oxford University",
        "place_type": "university",
        "lat": 51.7585,
        "lon": -1.2643,
        "population": 0,
        "travel_destination": True,
        "display_city": None,
    }
    assert skipped == 0
    assert universities == [expected_oxford]
|
|
|
|
|
|
def test_street_centroid_averages_vertices():
    """The centroid is the per-axis mean of the vertices; no vertices gives None."""
    centroid = _street_centroid([(51.0, -0.1), (53.0, -0.3)])

    assert centroid is not None
    lat, lon = centroid
    # Compare with a tolerance: -0.1 and -0.3 are not exactly representable,
    # so an exact match against -0.2 depends on how the mean happens to round.
    tolerance = 1e-9
    assert abs(lat - 52.0) <= tolerance
    assert abs(lon - -0.2) <= tolerance

    assert _street_centroid([]) is None
|
|
|
|
|
|
def test_normalize_street_name_and_outcode():
    """Pin street-name normalisation and outcode extraction on sample inputs."""
    # The padded, mixed-case sample comes back trimmed and lower-cased.
    normalized = _normalize_street_name(" High Street ")
    assert normalized == "high street"

    # Postcode -> expected outcode; an empty postcode yields an empty outcode.
    for postcode, outcode in (("NW1 6XE", "NW1"), ("", "")):
        assert _outcode_of_postcode(postcode) == outcode
|
|
|
|
|
|
def test_build_street_places_groups_segments_by_name_and_outcode():
    """Street segments merge per (name, nearest outcode) into one place each.

    Four segments are supplied: two "High Street" segments near the NW1
    postcode, one "High Street" near CR0 and one "Baker Street" near NW1.
    The assertions expect three street places, with the two NW1 High Street
    segments merged at the mean of their coordinates.
    """
    # Two postcodes: NW1 (north) and CR0 (south). The tree lives in BNG metres
    # (matching _outcode_tree); streets are transformed before querying.
    east, north = WGS84_TO_BNG.transform([-0.14, -0.10], [51.53, 51.37])
    tree = cKDTree(np.column_stack([east, north]))
    outcodes = ["NW1", "CR0"]

    streets = [
        {"name": "High Street", "lat": 51.531, "lon": -0.141},  # NW1
        {"name": "High Street", "lat": 51.529, "lon": -0.139},  # NW1 (same road, 2nd segment)
        {"name": "High Street", "lat": 51.371, "lon": -0.101},  # CR0 (different road, same name)
        {"name": "Baker Street", "lat": 51.5305, "lon": -0.1405},  # NW1
    ]

    places = _build_street_places(streets, tree, outcodes)

    # 3 distinct streets: High Street/NW1 (2 segments merged), High Street/CR0, Baker Street/NW1.
    assert len(places) == 3
    assert all(place["place_type"] == "street" for place in places)

    nw1_high = next(
        place
        for place in places
        if place["name"] == "High Street" and place["lat"] > 51.5
    )
    # Compare the merged centroid with a tolerance rather than bit-for-bit, so
    # the test does not depend on the exact order of float operations used to
    # average the segments. 1e-9 degrees is far below any real displacement.
    tolerance = 1e-9
    assert abs(nw1_high["lat"] - (51.531 + 51.529) / 2) <= tolerance
    assert abs(nw1_high["lon"] - (-0.141 + -0.139) / 2) <= tolerance
    # The same-named CR0 road stays separate.
    assert any(
        place["name"] == "High Street" and place["lat"] < 51.4 for place in places
    )
|
|
|
|
|
|
def test_pois_to_places_keeps_high_value_named_pois_only():
    """Only named parks and hospitals survive; cafes, repeats and blanks do not."""
    # One tuple per POI: (name, category, lat, lng).
    rows = [
        ("Hyde Park", "leisure/park", 51.507, -0.165),
        ("St Thomas' Hospital", "amenity/hospital", 51.498, -0.118),
        ("Joe's Cafe", "amenity/cafe", 51.5, -0.1),  # everyday amenity → excluded
        ("Hyde Park", "leisure/park", 51.507, -0.165),  # duplicate → excluded
        ("", "leisure/park", 51.6, -0.2),  # empty name → excluded
    ]
    # Transpose the row tuples into the column lists the frame is built from.
    names, categories, lats, lngs = (list(column) for column in zip(*rows))
    pois = pl.DataFrame(
        {"name": names, "category": categories, "lat": lats, "lng": lngs}
    )

    places = _pois_to_places(pois)

    kept = [(place["name"], place["place_type"]) for place in places]
    assert kept == [
        ("Hyde Park", "park"),
        ("St Thomas' Hospital", "hospital"),
    ]
    assert all(place["travel_destination"] is False for place in places)
|
|
|
|
|
|
def test_pois_to_places_keeps_distinct_same_named_pois():
    """Two same-named POIs that are far apart are both kept."""
    # Two genuinely distinct POIs sharing a name, far apart (London vs Bristol).
    london_coords = (51.54, -0.04)
    bristol_coords = (51.46, -2.60)
    pois = pl.DataFrame(
        {
            "name": ["Victoria Park"] * 2,
            "category": ["leisure/park"] * 2,
            "lat": [london_coords[0], bristol_coords[0]],
            "lng": [london_coords[1], bristol_coords[1]],
        }
    )

    places = _pois_to_places(pois)

    assert len(places) == 2
    returned_coords = {(place["lat"], place["lon"]) for place in places}
    assert returned_coords == {london_coords, bristol_coords}
|
|
|
|
|
|
def test_pois_to_places_still_dedupes_colocated():
    """Two same-named POIs a few metres apart collapse into a single place."""
    # The same physical POI mapped twice a few metres apart collapses to one.
    columns = {
        "name": ["Victoria Park"] * 2,
        "category": ["leisure/park"] * 2,
        "lat": [51.5400, 51.5401],
        "lng": [-0.0400, -0.0399],
    }

    places = _pois_to_places(pl.DataFrame(columns))

    assert len(places) == 1
|
|
|
|
|
|
def test_display_city_from_tags_uses_explicit_london_context():
    """An ``is_in`` tag naming London yields "London"; one without it yields None."""
    in_london = _display_city_from_tags({"is_in": "Croydon, London, UK"})
    assert in_london == "London"

    outside_london = _display_city_from_tags({"is_in": "Croydon, Cambridgeshire, UK"})
    assert outside_london is None
|
|
|
|
|
|
def test_assign_london_display_city_uses_nearest_active_postcode_admin(tmp_path):
    """Places near a London-coded postcode get ``display_city`` "London".

    The postcode fixture has one London-coded row (CR0 1SZ) and one
    non-London row (KT19 8AG). The assertions expect the call to report two
    assignments and to have updated the ``places`` dicts in place: the two
    Croydon places become "London" while Epsom stays ``None``.

    Args:
        tmp_path: pytest temporary directory; the postcode parquet fixture is
            written there.
    """

    def place(name, place_type, lat, lon, population, travel_destination):
        # Every fixture place starts without a display city.
        return {
            "name": name,
            "place_type": place_type,
            "lat": lat,
            "lon": lon,
            "population": population,
            "travel_destination": travel_destination,
            "display_city": None,
        }

    postcodes = tmp_path / "postcodes.parquet"
    postcode_rows = [
        _postcode_row("CR0 1SZ", 51.371273, -0.101793, london=True),
        _postcode_row("KT19 8AG", 51.3326, -0.2678, london=False),
    ]
    pl.DataFrame(postcode_rows).write_parquet(postcodes)

    places = [
        place("Croydon", "town", 51.3713049, -0.101957, 173314, False),
        place("East Croydon railway station", "station", 51.375845, -0.092732, 0, True),
        place("Epsom", "town", 51.3326, -0.2678, 31489, False),
    ]

    assigned = _assign_london_display_city(places, postcodes)

    assert assigned == 2
    display_cities = [entry["display_city"] for entry in places]
    assert display_cities == ["London", "London", None]
|
|
|
|
|
|
def test_no_grid_reference_sentinel_is_excluded_from_coordinate_trees(tmp_path):
    """A postcode carrying the no-grid-reference sentinel never gets indexed.

    The fixture holds one sentinel row and one real Croydon postcode; both
    trees built from it are expected to contain the real postcode only.

    Args:
        tmp_path: pytest temporary directory; the postcode parquet fixture is
            written there.
    """

    def row(pcds, lat, long, east1m, north1m):
        # Both fixture rows share the same status and admin codes and differ
        # only in postcode and coordinates.
        return {
            "pcds": pcds,
            "lat": lat,
            "long": long,
            "doterm": None,
            "ctry25cd": "E92000001",
            "east1m": east1m,
            "north1m": north1m,
            "rgn25cd": "E12000007",
            "lad25cd": "E09000008",
            "cty25cd": "E13000002",
        }

    # ONS NSPL stores postcodes with no grid reference at the sentinel
    # lat=99.999999, long=0.0, whose paired BNG coords collapse to the (0, 0) origin.
    # Such an active postcode must never enter the nearest-neighbour indexes.
    sentinel = row("ZZ99 9ZZ", 99.999999, 0.0, 0, 0)

    croydon_lat, croydon_lon = 51.371273, -0.101793
    croydon_easting, croydon_northing = WGS84_TO_BNG.transform(croydon_lon, croydon_lat)
    real = row(
        "CR0 1SZ",
        croydon_lat,
        croydon_lon,
        int(round(croydon_easting)),
        int(round(croydon_northing)),
    )

    postcodes = tmp_path / "postcodes.parquet"
    pl.DataFrame([sentinel, real]).write_parquet(postcodes)

    # Outcode tree: only the real postcode survives, so a London-area query
    # cannot be tagged with the sentinel's outcode.
    # NOTE(review): this query passes (lat, lon), while
    # test_build_street_places_groups_segments_by_name_and_outcode describes the
    # outcode tree as living in BNG metres. With a single indexed point the
    # nearest neighbour is index 0 either way -- confirm the intended
    # coordinate system.
    tree, outcodes = _outcode_tree(postcodes)
    assert tree.n == 1
    assert outcodes == ["CR0"]
    _, nearest = tree.query([[croydon_lat, croydon_lon]])
    assert outcodes[nearest[0]] == "CR0"

    # BNG London tree: only the real postcode survives, so the (0, 0) origin can never
    # be the nearest neighbour of a real place.
    bng_tree, london_flags = _london_postcode_tree(postcodes)
    assert bng_tree.n == 1
    assert london_flags.tolist() == [True]
    _, bng_nearest = bng_tree.query([[croydon_easting, croydon_northing]])
    assert bng_nearest[0] == 0
|