91 lines
2.4 KiB
Python
91 lines
2.4 KiB
Python
import polars as pl
|
|
import pytest
|
|
|
|
from pipeline.download.naptan import (
|
|
canonical_station_name,
|
|
canonical_station_name_expr,
|
|
deduplicate_naptan,
|
|
)
|
|
|
|
|
|
def test_canonical_station_name_expr_normalizes_transport_suffixes():
    """Check the expression and scalar canonicalisers on sample station names.

    Each case pairs a raw station name with the key the test expects.  The
    polars expression is evaluated over a frame holding only a ``name``
    column, and the scalar helper must then reproduce the same keys.
    """
    cases = [
        ("Bank", "bank"),
        ("Bank Underground Station", "bank"),
        ("Bank DLR Station", "bank"),
        ("Pleasure Beach (Blackpool Tramway)", "pleasure beach"),
        ("Earl's Court Tube Station", "earls court"),
    ]
    raw_names = [raw for raw, _ in cases]
    expected_keys = [key for _, key in cases]

    frame = pl.DataFrame({"name": raw_names})
    key_column = frame.select(canonical_station_name_expr().alias("key"))["key"]
    keys_from_expr = key_column.to_list()

    assert keys_from_expr == expected_keys
    # The plain-Python helper has to agree with the expression row for row.
    keys_from_scalar = [canonical_station_name(raw) for raw in raw_names]
    assert keys_from_scalar == keys_from_expr
|
|
|
|
|
|
def test_deduplicate_naptan_merges_tube_station_variants_by_area():
    """Check that nearby Bank variants collapse into one averaged row.

    The fixture holds three "Bank" variants within a fraction of a degree of
    each other, a same-named station far to the north, and two identically
    named "Central Tube Station" rows with no locality that lie far apart.
    The assertions expect four surviving rows, with the remaining "Bank" row
    carrying the mean latitude of the three nearby variants.
    """
    # One tuple per stop: (id, name, lat, lng, locality).
    rows = [
        ("bank", "Bank", 51.5129, -0.0889, "LOC1"),
        ("bank-lu", "Bank Underground Station", 51.5134, -0.0890, "LOC1"),
        ("bank-dlr", "Bank DLR Station", 51.5132, -0.0885, "LOC2"),
        ("other-bank", "Bank Underground Station", 55.0140, -1.6781, "LOC1"),
        ("central-a", "Central Tube Station", 51.5, -0.1, None),
        ("central-b", "Central Tube Station", 53.0, -2.0, None),
    ]
    ids, names, lats, lngs, localities = (list(column) for column in zip(*rows))
    stops = pl.DataFrame(
        {
            "id": ids,
            "name": names,
            "category": ["Tube station"] * len(rows),
            "lat": lats,
            "lng": lngs,
            "locality": localities,
        }
    )

    # Sorting by latitude gives the name assertion a deterministic order.
    deduped = deduplicate_naptan(stops).sort("lat")

    expected_names = [
        "Central Tube Station",
        "Bank",
        "Central Tube Station",
        "Bank Underground Station",
    ]
    assert len(deduped) == 4
    assert deduped["name"].to_list() == expected_names

    merged_bank = deduped.filter(pl.col("name") == "Bank")
    merged_lat = merged_bank["lat"][0]
    expected_lat = (51.5129 + 51.5134 + 51.5132) / 3
    assert merged_lat == pytest.approx(expected_lat)
|
|
|
|
|
|
def test_deduplicate_naptan_does_not_merge_missing_locality_bus_stops():
    """Check that same-named bus stops with no locality both survive.

    Two "High Street" bus stops with null locality and different coordinates
    go in; the test expects both rows to come back out.
    """
    stop_count = 2
    bus_stops = pl.DataFrame(
        {
            "id": ["a", "b"],
            "name": ["High Street"] * stop_count,
            "category": ["Bus stop"] * stop_count,
            "lat": [51.5, 52.5],
            "lng": [-0.1, -1.1],
            "locality": [None] * stop_count,
        }
    )

    deduped = deduplicate_naptan(bus_stops)

    assert len(deduped) == 2
|