"""Tests for the place helpers imported from ``pipeline.download.places``."""

import math

import polars as pl
from pyproj import Transformer

from pipeline.download.places import (
    _assign_london_display_city,
    _display_city_from_tags,
    _is_dlr_station,
    _is_tram_station,
    _naptan_dlr_stations,
    _ofs_universities,
    _select_university_name,
    _station_display_name,
)

# Converts WGS84 coordinates (EPSG:4326) to EPSG:27700 eastings/northings.
# always_xy=True means transform() takes (lon, lat) in that order.
WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)


def _postcode_row(postcode: str, lat: float, lon: float, *, london: bool) -> dict:
    """Build one postcode fixture row from a WGS84 coordinate.

    Args:
        postcode: Value stored under ``"pcds"``.
        lat: WGS84 latitude in degrees.
        lon: WGS84 longitude in degrees.
        london: Selects which of two fixed sets of region / district / county
            codes the row carries.

    Returns:
        A dict with the projected coordinate rounded to whole units under
        ``"east1m"`` / ``"north1m"``, ``"doterm"`` set to ``None``, and the
        administrative code columns.
    """
    # NOTE(review): the column names look like an ONS postcode directory
    # schema and doterm=None presumably marks the postcode as active; confirm
    # against what _assign_london_display_city actually reads.
    easting, northing = WGS84_TO_BNG.transform(lon, lat)
    return {
        "pcds": postcode,
        "doterm": None,
        "ctry25cd": "E92000001",
        "east1m": int(round(easting)),
        "north1m": int(round(northing)),
        "rgn25cd": "E12000007" if london else "E12000008",
        "lad25cd": "E09000008" if london else "E07000208",
        "cty25cd": "E13000002" if london else "E10000030",
    }


def test_dlr_light_rail_is_not_treated_as_tram():
    """A light-rail station on the DLR network is a DLR station, not a tram."""
    dlr_tags = {
        "name": "Lewisham DLR",
        "railway": "station",
        "station": "light_rail",
        "network": "Docklands Light Railway",
    }

    assert _is_dlr_station(dlr_tags)
    assert not _is_tram_station(dlr_tags)
    assert _station_display_name("Lewisham DLR", dlr_tags) == "Lewisham DLR station"
    # A raw name that already contains "Station" must not end up with it twice.
    assert (
        _station_display_name("Tower Gateway Station DLR", dlr_tags)
        == "Tower Gateway DLR station"
    )


def test_tram_light_rail_is_still_excluded():
    """A light-rail station on the London Trams network is a tram, not DLR."""
    tram_tags = {
        "name": "East Croydon",
        "railway": "station",
        "station": "light_rail",
        "network": "London Trams",
    }

    assert not _is_dlr_station(tram_tags)
    assert _is_tram_station(tram_tags)


def test_naptan_dlr_stations_are_deduplicated_by_atco_code(tmp_path):
    """Five NaPTAN rows yield two DLR stations; the two Shadwell rows merge."""
    naptan = tmp_path / "naptan.parquet"
    pl.DataFrame(
        {
            "id": [
                "4900ZZDLSHA3",
                "9400ZZDLSHA",
                "4900ZZDLGRE1",
                "490002076RV",
                "4900ZZLUBNK",
            ],
            "name": [
                "Shadwell DLR",
                "Shadwell DLR Station",
                "Greenwich Station",
                "Tower Gateway Station DLR",
                "Bank",
            ],
            "category": [
                "Tube station",
                "Tube station",
                "Rail station",
                "Bus stop",
                "Tube station",
            ],
            "lat": [51.51156, 51.511693, 51.47794, 51.510575, 51.5131],
            "lng": [-0.055595, -0.056643, -0.01442, -0.07514, -0.0894],
        }
    ).write_parquet(naptan)

    stations = _naptan_dlr_stations(naptan)

    assert [station["name"] for station in stations] == [
        "Greenwich DLR station",
        "Shadwell DLR station",
    ]
    shadwell = next(
        station for station in stations if station["name"].startswith("Shadwell")
    )
    # The merged latitude is a computed mean, so compare with a tolerance
    # rather than bit-for-bit: a different but equivalent averaging strategy
    # in the implementation should not fail this test.
    assert math.isclose(
        shadwell["lat"], (51.51156 + 51.511693) / 2, rel_tol=0.0, abs_tol=1e-9
    )
    assert shadwell["place_type"] == "station"
    assert shadwell["travel_destination"] is True


def test_select_university_name_prefers_public_trading_name_for_noisy_legal_name():
    """Check the chosen display name for three legal-name / trading-name pairs."""
    # Several newline-separated trading names: the first one is expected.
    assert (
        _select_university_name(
            "The Chancellor, Masters and Scholars of the University of Oxford",
            "Oxford University\nThe University of Oxford",
        )
        == "Oxford University"
    )
    assert (
        _select_university_name(
            "Bournemouth University Higher Education Corporation",
            "Bournemouth University",
        )
        == "Bournemouth University"
    )
    # A "Not applicable" trading name: the legal name is expected, without
    # its leading "The".
    assert (
        _select_university_name("The University of Surrey", "Not applicable")
        == "University of Surrey"
    )


def test_ofs_universities_extracts_university_title_rows_with_postcode_coords():
    """Only the "Yes" university-title row is returned, located by postcode."""
    # Two preamble rows precede the header row, so the header is not the
    # first row of the frame.
    raw_register = pl.DataFrame(
        [
            ["OfS Register", None, None, None],
            ["Note row", None, None, None],
            [
                "Provider's legal name",
                "Provider's trading name(s)",
                "Provider's contact address",
                "Does the provider have the right to use university in its title?",
            ],
            [
                "The Chancellor, Masters and Scholars of the University of Oxford",
                "Oxford University\nThe University of Oxford",
                "University Offices\nWellington Square\nOxford\nOX1 2JD\nUnited Kingdom",
                "Yes",
            ],
            [
                "Example College",
                "Not applicable",
                "Example Street\nLondon\nSW1A 1AA\nUnited Kingdom",
                "No",
            ],
        ],
        orient="row",
    )

    # The lookup is keyed by the postcode with its space removed.
    universities, skipped = _ofs_universities(
        raw_register, {"OX12JD": (51.7585, -1.2643)}
    )

    assert skipped == 0
    assert universities == [
        {
            "name": "Oxford University",
            "place_type": "university",
            "lat": 51.7585,
            "lon": -1.2643,
            "population": 0,
            "travel_destination": True,
            "display_city": None,
        }
    ]


def test_display_city_from_tags_uses_explicit_london_context():
    """An ``is_in`` tag naming London gives "London"; one without gives None."""
    assert _display_city_from_tags({"is_in": "Croydon, London, UK"}) == "London"
    assert _display_city_from_tags({"is_in": "Croydon, Cambridgeshire, UK"}) is None


def test_assign_london_display_city_uses_nearest_active_postcode_admin(tmp_path):
    """Places near the London-coded postcode get "London"; Epsom stays None."""
    postcodes = tmp_path / "postcodes.parquet"
    pl.DataFrame(
        [
            _postcode_row("CR0 1SZ", 51.371273, -0.101793, london=True),
            _postcode_row("KT19 8AG", 51.3326, -0.2678, london=False),
        ]
    ).write_parquet(postcodes)
    places = [
        {
            "name": "Croydon",
            "place_type": "town",
            "lat": 51.3713049,
            "lon": -0.101957,
            "population": 173314,
            "travel_destination": False,
            "display_city": None,
        },
        {
            "name": "East Croydon railway station",
            "place_type": "station",
            "lat": 51.375845,
            "lon": -0.092732,
            "population": 0,
            "travel_destination": True,
            "display_city": None,
        },
        {
            "name": "Epsom",
            "place_type": "town",
            "lat": 51.3326,
            "lon": -0.2678,
            "population": 31489,
            "travel_destination": False,
            "display_city": None,
        },
    ]

    # The helper mutates the place dicts in place and returns how many it set.
    assigned = _assign_london_display_city(places, postcodes)

    assert assigned == 2
    assert [place["display_city"] for place in places] == ["London", "London", None]