# perfect-postcode/pipeline/download/test_naptan.py

import polars as pl
import pytest

from pipeline.download.naptan import (
    canonical_station_name,
    canonical_station_name_expr,
    deduplicate_naptan,
)


def test_canonical_station_name_expr_normalizes_transport_suffixes():
    """Check the expression and scalar canonicalisers yield the same keys.

    Each case pairs a raw station name with the canonical key expected for it.
    """
    cases = [
        ("Bank", "bank"),
        ("Bank Underground Station", "bank"),
        ("Bank DLR Station", "bank"),
        ("Pleasure Beach (Blackpool Tramway)", "pleasure beach"),
        ("Earl's Court Tube Station", "earls court"),
    ]
    raw_names = [raw for raw, _ in cases]
    expected_keys = [key for _, key in cases]

    frame = pl.DataFrame({"name": raw_names})
    key_column = frame.select(canonical_station_name_expr().alias("key"))
    expr_keys = key_column.get_column("key").to_list()

    assert expr_keys == expected_keys
    # The scalar helper must agree with the expression, name for name.
    scalar_keys = [canonical_station_name(raw) for raw in raw_names]
    assert scalar_keys == expr_keys


def test_deduplicate_naptan_merges_tube_station_variants_by_area():
    """Nearby "Bank" name variants collapse to one row; distant ones survive.

    The fixture holds three closely spaced Bank records (one of them tagged
    with a different locality), one "Bank Underground Station" far to the
    north, and two identically named stations with no locality that sit far
    apart. Four rows are expected after deduplication.
    """
    stations = [
        # (id, name, lat, lng, locality)
        ("bank", "Bank", 51.5129, -0.0889, "LOC1"),
        ("bank-lu", "Bank Underground Station", 51.5134, -0.0890, "LOC1"),
        ("bank-dlr", "Bank DLR Station", 51.5132, -0.0885, "LOC2"),
        ("other-bank", "Bank Underground Station", 55.0140, -1.6781, "LOC1"),
        ("central-a", "Central Tube Station", 51.5, -0.1, None),
        ("central-b", "Central Tube Station", 53.0, -2.0, None),
    ]
    # Transpose the row-wise fixture into the column lists polars receives.
    ids, names, lats, lngs, localities = (list(col) for col in zip(*stations))
    frame = pl.DataFrame(
        {
            "id": ids,
            "name": names,
            "category": ["Tube station"] * len(stations),
            "lat": lats,
            "lng": lngs,
            "locality": localities,
        }
    )

    merged = deduplicate_naptan(frame).sort("lat")

    assert merged.height == 4
    expected_names_south_to_north = [
        "Central Tube Station",
        "Bank",
        "Central Tube Station",
        "Bank Underground Station",
    ]
    assert merged.get_column("name").to_list() == expected_names_south_to_north

    # The surviving "Bank" row sits at the mean latitude of the three variants.
    bank_rows = merged.filter(pl.col("name") == "Bank")
    mean_bank_lat = (51.5129 + 51.5134 + 51.5132) / 3
    assert bank_rows["lat"][0] == pytest.approx(mean_bank_lat)


def test_deduplicate_naptan_does_not_merge_missing_locality_bus_stops():
    """Two same-named bus stops with no locality, far apart, both survive."""
    stops = pl.DataFrame(
        {
            "id": ["a", "b"],
            "name": ["High Street"] * 2,
            "category": ["Bus stop"] * 2,
            "lat": [51.5, 52.5],
            "lng": [-0.1, -1.1],
            "locality": [None] * 2,
        }
    )

    deduplicated = deduplicate_naptan(stops)

    assert deduplicated.height == 2


def test_deduplicate_naptan_merges_colocated_missing_locality_bus_stations():
    """Co-located bus-station records without a locality collapse to one POI.

    Two NaPTAN records for the same bus station, neither carrying a locality
    and both within the merge area, are a true duplicate.
    """
    first_lat, second_lat = 51.4952, 51.4953
    records = pl.DataFrame(
        {
            "id": ["a", "b"],
            "name": ["Victoria Bus Station"] * 2,
            "category": ["Bus station"] * 2,
            "lat": [first_lat, second_lat],
            "lng": [-0.1441, -0.1440],
            "locality": [None] * 2,
        }
    )

    deduplicated = deduplicate_naptan(records)

    assert deduplicated.height == 1
    survivor = deduplicated.row(0, named=True)
    assert survivor["name"] == "Victoria Bus Station"
    assert survivor["category"] == "Bus station"
    # The merged latitude is the average of the two inputs.
    assert survivor["lat"] == pytest.approx((first_lat + second_lat) / 2)


def test_deduplicate_naptan_keeps_rail_station_with_only_station_node():
    """A rail station whose sole record is a non-entrance node is retained.

    Aberdare's only NaPTAN record is an RLY station node (StopType "RLY").
    """
    aberdare = {
        "id": "aberdare-rly",
        "name": "Aberdare Rail Station",
        "category": "Rail station",
        "lat": 51.7155,
        "lng": -3.4438,
        "locality": "ABERDARE",
        "entrance": False,
    }
    # Wrap each scalar in a one-element list to form a single-row frame.
    frame = pl.DataFrame({column: [value] for column, value in aberdare.items()})

    deduplicated = deduplicate_naptan(frame)

    assert deduplicated.height == 1
    survivor = deduplicated.row(0, named=True)
    assert survivor["name"] == "Aberdare Rail Station"
    assert survivor["category"] == "Rail station"


def test_deduplicate_naptan_merges_rail_entrances_into_station_node():
    """A station node plus its entrances collapse onto the station node.

    One station node (RLY) and two entrance nodes (RSE) should yield a single
    "Rail station" POI whose id is the station node's, not an entrance's.
    """
    node_count = 3
    nodes = pl.DataFrame(
        {
            "id": ["clapham-rly", "clapham-rse-a", "clapham-rse-b"],
            "name": ["Clapham Junction Rail Station"] * node_count,
            "category": ["Rail station"] * node_count,
            "lat": [51.4642, 51.4644, 51.4640],
            "lng": [-0.1705, -0.1702, -0.1708],
            "locality": ["CLAPHAM"] * node_count,
            # Only the first record is the station node itself.
            "entrance": [False, True, True],
        }
    )

    deduplicated = deduplicate_naptan(nodes)

    assert deduplicated.height == 1
    survivor = deduplicated.row(0, named=True)
    assert survivor["id"] == "clapham-rly"
    assert survivor["category"] == "Rail station"


def test_deduplicate_naptan_does_not_merge_rail_and_ferry_in_same_area():
    """Same name and area but different transport modes stay separate POIs."""
    shared_name = "Harbour Station"
    shared_locality = "HARBOUR"
    pois = pl.DataFrame(
        {
            "id": ["harbour-rail", "harbour-ferry"],
            "name": [shared_name, shared_name],
            "category": ["Rail station", "Ferry"],
            "lat": [51.5, 51.5001],
            "lng": [-0.1, -0.1001],
            "locality": [shared_locality, shared_locality],
            "entrance": [False, False],
        }
    )

    # Sort by category so the comparison does not depend on output order.
    by_category = deduplicate_naptan(pois).sort("category")

    assert by_category.height == 2
    assert by_category.get_column("category").to_list() == ["Ferry", "Rail station"]