import math

import polars as pl

from pipeline.download.places import (
    _is_dlr_station,
    _is_tram_station,
    _naptan_dlr_stations,
    _ofs_universities,
    _select_university_name,
    _station_display_name,
)


def test_dlr_light_rail_is_not_treated_as_tram():
    """A light-rail station on the DLR network is DLR, not tram.

    Also checks that display names end in "DLR station" without
    duplicating a "Station"/"DLR" token already present in the raw name.
    """
    dlr_tags = {
        "name": "Lewisham DLR",
        "railway": "station",
        "station": "light_rail",
        "network": "Docklands Light Railway",
    }

    assert _is_dlr_station(dlr_tags)
    assert not _is_tram_station(dlr_tags)
    assert _station_display_name("Lewisham DLR", dlr_tags) == "Lewisham DLR station"
    assert (
        _station_display_name("Tower Gateway Station DLR", dlr_tags)
        == "Tower Gateway DLR station"
    )


def test_tram_light_rail_is_still_excluded():
    """A light-rail station on the London Trams network is tram, not DLR."""
    tram_tags = {
        "name": "East Croydon",
        "railway": "station",
        "station": "light_rail",
        "network": "London Trams",
    }

    assert not _is_dlr_station(tram_tags)
    assert _is_tram_station(tram_tags)


def test_naptan_dlr_stations_are_deduplicated_by_atco_code(tmp_path):
    """Two NaPTAN rows for Shadwell collapse into one averaged station.

    Of the five fixture rows only Greenwich and Shadwell are expected back.
    NOTE(review): the remaining rows ("490002076RV", "4900ZZLUBNK") are
    presumably rejected because their ids lack the "ZZDL" marker -- confirm
    against ``_naptan_dlr_stations``.
    """
    naptan = tmp_path / "naptan.parquet"
    pl.DataFrame(
        {
            "id": [
                "4900ZZDLSHA3",
                "9400ZZDLSHA",
                "4900ZZDLGRE1",
                "490002076RV",
                "4900ZZLUBNK",
            ],
            "name": [
                "Shadwell DLR",
                "Shadwell DLR Station",
                "Greenwich Station",
                "Tower Gateway Station DLR",
                "Bank",
            ],
            "category": [
                "Tube station",
                "Tube station",
                "Rail station",
                "Bus stop",
                "Tube station",
            ],
            "lat": [51.51156, 51.511693, 51.47794, 51.510575, 51.5131],
            "lng": [-0.055595, -0.056643, -0.01442, -0.07514, -0.0894],
        }
    ).write_parquet(naptan)

    stations = _naptan_dlr_stations(naptan)

    assert [station["name"] for station in stations] == [
        "Greenwich DLR station",
        "Shadwell DLR station",
    ]
    shadwell = next(
        station for station in stations if station["name"].startswith("Shadwell")
    )
    # The latitude is computed by the implementation (mean of the two
    # Shadwell rows); compare with a tolerance rather than bit-for-bit so the
    # test does not depend on the aggregation's rounding/summation order.
    assert math.isclose(shadwell["lat"], (51.51156 + 51.511693) / 2)
    assert shadwell["place_type"] == "station"
    assert shadwell["travel_destination"] is True


def test_select_university_name_prefers_public_trading_name_for_noisy_legal_name():
    """The short public name wins over a verbose legal name.

    Covers three cases: a multi-line trading name (first entry is chosen),
    a legal name with a corporate suffix, and a "Not applicable" trading
    name where the legal name is used with its leading "The" dropped.
    """
    assert (
        _select_university_name(
            "The Chancellor, Masters and Scholars of the University of Oxford",
            "Oxford University\nThe University of Oxford",
        )
        == "Oxford University"
    )
    assert (
        _select_university_name(
            "Bournemouth University Higher Education Corporation",
            "Bournemouth University",
        )
        == "Bournemouth University"
    )
    assert (
        _select_university_name("The University of Surrey", "Not applicable")
        == "University of Surrey"
    )


def test_ofs_universities_extracts_university_title_rows_with_postcode_coords():
    """Only providers with university title are returned, located by postcode.

    The fixture mimics the raw register layout: two preamble rows, a header
    row, then data rows. The coordinate lookup is keyed by the postcode with
    its space removed ("OX12JD").
    """
    raw_register = pl.DataFrame(
        [
            ["OfS Register", None, None, None],
            ["Note row", None, None, None],
            [
                "Provider's legal name",
                "Provider's trading name(s)",
                "Provider's contact address",
                "Does the provider have the right to use university in its title?",
            ],
            [
                "The Chancellor, Masters and Scholars of the University of Oxford",
                "Oxford University\nThe University of Oxford",
                "University Offices\nWellington Square\nOxford\nOX1 2JD\nUnited Kingdom",
                "Yes",
            ],
            [
                "Example College",
                "Not applicable",
                "Example Street\nLondon\nSW1A 1AA\nUnited Kingdom",
                "No",
            ],
        ],
        orient="row",
    )

    universities, skipped = _ofs_universities(
        raw_register, {"OX12JD": (51.7585, -1.2643)}
    )

    assert skipped == 0
    assert universities == [
        {
            "name": "Oxford University",
            "place_type": "university",
            "lat": 51.7585,
            "lon": -1.2643,
            "population": 0,
            "travel_destination": True,
        }
    ]