perfect-postcode/pipeline/download/test_places.py

import math

import numpy as np
import polars as pl
from pyproj import Transformer
from scipy.spatial import cKDTree

from pipeline.download.places import (
    _assign_london_display_city,
    _build_street_places,
    _display_city_from_tags,
    _is_dlr_station,
    _is_tram_station,
    _naptan_dlr_stations,
    _normalize_street_name,
    _ofs_universities,
    _outcode_of_postcode,
    _pois_to_places,
    _select_university_name,
    _station_display_name,
    _street_centroid,
)

# Shared transformer from EPSG:4326 (WGS84) to EPSG:27700 (British National
# Grid), used to derive easting/northing values for postcode fixture rows.
# ``always_xy=True`` makes ``transform`` take (x, y) arguments, i.e. (lon, lat)
# for the source CRS.
WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)


def _postcode_row(postcode: str, lat: float, lon: float, *, london: bool) -> dict:
    """Build one postcode fixture row for the given latitude/longitude.

    The position is projected with ``WGS84_TO_BNG`` and rounded to integers
    for the ``east1m``/``north1m`` columns. ``london`` selects which of two
    hard-coded ``rgn25cd``/``lad25cd``/``cty25cd`` code sets the row carries.
    """
    # The transformer is configured for (x, y) input, so longitude goes first.
    x, y = WGS84_TO_BNG.transform(lon, lat)

    if london:
        rgn, lad, cty = "E12000007", "E09000008", "E13000002"
    else:
        rgn, lad, cty = "E12000008", "E07000208", "E10000030"

    return {
        "pcds": postcode,
        "doterm": None,
        "ctry25cd": "E92000001",
        "east1m": int(round(x)),
        "north1m": int(round(y)),
        "rgn25cd": rgn,
        "lad25cd": lad,
        "cty25cd": cty,
    }


def test_dlr_light_rail_is_not_treated_as_tram():
    """A light-rail station tagged with the DLR network is DLR, not tram."""
    tags = {
        "name": "Lewisham DLR",
        "railway": "station",
        "station": "light_rail",
        "network": "Docklands Light Railway",
    }

    assert _is_dlr_station(tags)
    assert not _is_tram_station(tags)

    # (raw name, expected display name) pairs, checked in order.
    expected_display_names = [
        ("Lewisham DLR", "Lewisham DLR station"),
        ("Tower Gateway Station DLR", "Tower Gateway DLR station"),
    ]
    for raw_name, display_name in expected_display_names:
        assert _station_display_name(raw_name, tags) == display_name


def test_tram_light_rail_is_still_excluded():
    """A light-rail station on the London Trams network is tram, not DLR."""
    croydon_tram = {
        "name": "East Croydon",
        "railway": "station",
        "station": "light_rail",
        "network": "London Trams",
    }

    # Same railway/station tags as a DLR stop; only the network differs.
    assert not _is_dlr_station(croydon_tram)
    assert _is_tram_station(croydon_tram)


def test_naptan_dlr_stations_are_deduplicated_by_atco_code(tmp_path):
    """Duplicate NaPTAN rows for one DLR station collapse into a single place.

    The fixture parquet holds two Shadwell rows, one Greenwich row, a bus stop
    whose name mentions DLR, and a "Bank" row with a ``ZZLU`` id. Only the
    Greenwich and Shadwell stations are expected back, in that order, with
    Shadwell's latitude being the mean of its two source rows.
    """
    naptan = tmp_path / "naptan.parquet"
    pl.DataFrame(
        {
            "id": [
                "4900ZZDLSHA3",
                "9400ZZDLSHA",
                "4900ZZDLGRE1",
                "490002076RV",
                "4900ZZLUBNK",
            ],
            "name": [
                "Shadwell DLR",
                "Shadwell DLR Station",
                "Greenwich Station",
                "Tower Gateway Station DLR",
                "Bank",
            ],
            "category": [
                "Tube station",
                "Tube station",
                "Rail station",
                "Bus stop",
                "Tube station",
            ],
            "lat": [51.51156, 51.511693, 51.47794, 51.510575, 51.5131],
            "lng": [-0.055595, -0.056643, -0.01442, -0.07514, -0.0894],
        }
    ).write_parquet(naptan)

    stations = _naptan_dlr_stations(naptan)

    assert [station["name"] for station in stations] == [
        "Greenwich DLR station",
        "Shadwell DLR station",
    ]
    shadwell = next(
        station for station in stations if station["name"].startswith("Shadwell")
    )
    # Compare the averaged coordinate with a tolerance: exact float equality
    # would tie the test to the implementation's summation order/method.
    expected_lat = (51.51156 + 51.511693) / 2
    assert math.isclose(shadwell["lat"], expected_lat, rel_tol=0.0, abs_tol=1e-9)
    assert shadwell["place_type"] == "station"
    assert shadwell["travel_destination"] is True


def test_select_university_name_prefers_public_trading_name_for_noisy_legal_name():
    """Check the chosen display name for several legal/trading name pairs."""
    # (legal name, trading name(s), expected selection), checked in order.
    cases = [
        (
            "The Chancellor, Masters and Scholars of the University of Oxford",
            "Oxford University\nThe University of Oxford",
            "Oxford University",
        ),
        (
            "Bournemouth University Higher Education Corporation",
            "Bournemouth University",
            "Bournemouth University",
        ),
        (
            "The University of Surrey",
            "Not applicable",
            "University of Surrey",
        ),
    ]

    for legal_name, trading_names, expected in cases:
        assert _select_university_name(legal_name, trading_names) == expected


def test_ofs_universities_extracts_university_title_rows_with_postcode_coords():
    """Only the "Yes" university-title row is returned, located via its postcode.

    The raw register has two leading non-data rows, a header row, one provider
    with the university-title flag set to "Yes" and one with "No".
    """
    header_row = [
        "Provider's legal name",
        "Provider's trading name(s)",
        "Provider's contact address",
        "Does the provider have the right to use university in its title?",
    ]
    oxford_row = [
        "The Chancellor, Masters and Scholars of the University of Oxford",
        "Oxford University\nThe University of Oxford",
        "University Offices\nWellington Square\nOxford\nOX1 2JD\nUnited Kingdom",
        "Yes",
    ]
    college_row = [
        "Example College",
        "Not applicable",
        "Example Street\nLondon\nSW1A 1AA\nUnited Kingdom",
        "No",
    ]
    register = pl.DataFrame(
        [
            ["OfS Register", None, None, None],
            ["Note row", None, None, None],
            header_row,
            oxford_row,
            college_row,
        ],
        orient="row",
    )
    # The lookup is keyed by the postcode written without its space.
    postcode_coords = {"OX12JD": (51.7585, -1.2643)}

    universities, skipped = _ofs_universities(register, postcode_coords)

    expected_oxford = {
        "name": "Oxford University",
        "place_type": "university",
        "lat": 51.7585,
        "lon": -1.2643,
        "population": 0,
        "travel_destination": True,
        "display_city": None,
    }
    assert skipped == 0
    assert universities == [expected_oxford]


def test_street_centroid_averages_vertices():
    """The centroid is the per-axis mean of the vertices; no vertices gives None."""
    centroid = _street_centroid([(51.0, -0.1), (53.0, -0.3)])

    # Keep the original expectation that a tuple is returned, but compare the
    # components with a tolerance: exact equality on an averaged float only
    # holds if the implementation happens to round the same way.
    assert isinstance(centroid, tuple)
    lat, lon = centroid
    assert math.isclose(lat, 52.0, rel_tol=0.0, abs_tol=1e-9)
    assert math.isclose(lon, -0.2, rel_tol=0.0, abs_tol=1e-9)

    assert _street_centroid([]) is None


def test_normalize_street_name_and_outcode():
    """Street names are lower-cased with whitespace collapsed; outcodes are extracted."""
    normalized = _normalize_street_name("  High   Street ")
    assert normalized == "high street"

    # (postcode, expected outcode); an empty postcode yields an empty outcode.
    for postcode, outcode in (("NW1 6XE", "NW1"), ("", "")):
        assert _outcode_of_postcode(postcode) == outcode


def test_build_street_places_groups_segments_by_name_and_outcode():
    """Segments sharing a name and outcode merge; same name elsewhere stays apart.

    The KD-tree is built over two (lat, lon) points whose outcodes are given by
    the parallel ``outcodes`` list.
    """
    # Two postcodes: NW1 (north) and CR0 (south).
    tree = cKDTree(np.array([[51.53, -0.14], [51.37, -0.10]], dtype=np.float64))
    outcodes = ["NW1", "CR0"]

    streets = [
        {"name": "High Street", "lat": 51.531, "lon": -0.141},  # NW1
        {"name": "High Street", "lat": 51.529, "lon": -0.139},  # NW1 (same road, 2nd segment)
        {"name": "High Street", "lat": 51.371, "lon": -0.101},  # CR0 (different road, same name)
        {"name": "Baker Street", "lat": 51.5305, "lon": -0.1405},  # NW1
    ]

    places = _build_street_places(streets, tree, outcodes)

    # 3 distinct streets: High Street/NW1 (2 segments merged), High Street/CR0, Baker Street/NW1.
    assert len(places) == 3
    assert all(place["place_type"] == "street" for place in places)

    nw1_high = next(
        place
        for place in places
        if place["name"] == "High Street" and place["lat"] > 51.5
    )
    # Averaged coordinates are compared with a tolerance so the test does not
    # depend on the exact floating-point summation used by the implementation.
    expected_lat = (51.531 + 51.529) / 2
    expected_lon = (-0.141 + -0.139) / 2
    assert math.isclose(nw1_high["lat"], expected_lat, rel_tol=0.0, abs_tol=1e-9)
    assert math.isclose(nw1_high["lon"], expected_lon, rel_tol=0.0, abs_tol=1e-9)
    # The same-named CR0 road stays separate.
    assert any(
        place["name"] == "High Street" and place["lat"] < 51.4 for place in places
    )


def test_pois_to_places_keeps_high_value_named_pois_only():
    """Only the park and the hospital survive; cafe, duplicate and unnamed rows do not."""
    names = ["Hyde Park", "St Thomas' Hospital", "Joe's Cafe", "Hyde Park", ""]
    categories = [
        "leisure/park",
        "amenity/hospital",
        "amenity/cafe",  # everyday amenity → excluded
        "leisure/park",  # duplicate → excluded
        "leisure/park",  # empty name → excluded
    ]
    poi_frame = pl.DataFrame(
        {
            "name": names,
            "category": categories,
            "lat": [51.507, 51.498, 51.5, 51.507, 51.6],
            "lng": [-0.165, -0.118, -0.1, -0.165, -0.2],
        }
    )

    kept = _pois_to_places(poi_frame)

    names_and_types = [(entry["name"], entry["place_type"]) for entry in kept]
    assert names_and_types == [
        ("Hyde Park", "park"),
        ("St Thomas' Hospital", "hospital"),
    ]
    # None of the returned places may be flagged as a travel destination.
    for entry in kept:
        assert entry["travel_destination"] is False


def test_display_city_from_tags_uses_explicit_london_context():
    """An ``is_in`` tag listing London yields "London"; one without it yields None."""
    in_london = _display_city_from_tags({"is_in": "Croydon, London, UK"})
    assert in_london == "London"

    outside_london = _display_city_from_tags({"is_in": "Croydon, Cambridgeshire, UK"})
    assert outside_london is None


def test_assign_london_display_city_uses_nearest_active_postcode_admin(tmp_path):
    """Places near the London postcode get "London"; the one at the non-London postcode does not.

    The postcode parquet holds one London row (CR0 1SZ) and one non-London row
    (KT19 8AG). The call is expected to report two assignments and to set
    ``display_city`` on the place dicts themselves.
    """

    def make_place(name, place_type, lat, lon, population, *, travel_destination):
        # Every fixture place starts without a display city.
        return {
            "name": name,
            "place_type": place_type,
            "lat": lat,
            "lon": lon,
            "population": population,
            "travel_destination": travel_destination,
            "display_city": None,
        }

    postcodes = tmp_path / "postcodes.parquet"
    postcode_rows = [
        _postcode_row("CR0 1SZ", 51.371273, -0.101793, london=True),
        _postcode_row("KT19 8AG", 51.3326, -0.2678, london=False),
    ]
    pl.DataFrame(postcode_rows).write_parquet(postcodes)

    places = [
        make_place(
            "Croydon", "town", 51.3713049, -0.101957, 173314, travel_destination=False
        ),
        make_place(
            "East Croydon railway station",
            "station",
            51.375845,
            -0.092732,
            0,
            travel_destination=True,
        ),
        make_place("Epsom", "town", 51.3326, -0.2678, 31489, travel_destination=False),
    ]

    assigned = _assign_london_display_city(places, postcodes)

    assert assigned == 2
    display_cities = [entry["display_city"] for entry in places]
    assert display_cities == ["London", "London", None]