# perfect-postcode/pipeline/download/naptan.py

"""Download NaPTAN data and extract railway/metro station POIs."""

import argparse
import io
import math
import re
import urllib.request
from dataclasses import dataclass
from pathlib import Path

import polars as pl

# Endpoint fetched by download_naptan; the query string asks for CSV output.
NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
# Category label given to the TMU and MET stop types in STOP_TYPES.
TUBE_STATION_CATEGORY = "Tube station"
# Largest separation, in degrees, at which two nodes sharing a normalized name
# are treated as the same station (see StationAccumulator.same_area). Despite
# the name, it applies to every row that goes through the station-area dedup,
# not only Tube stations.
TUBE_STATION_MERGE_RADIUS_DEGREES = 0.01


# Maps a NaPTAN StopType code to the category label written to the output.
# download_naptan drops every row whose StopType is not a key of this mapping.
STOP_TYPES = {
    "AIR": "Airport",
    # Ferry: FER/FBT are the terminal/berth nodes; FTD is a docking entrance.
    "FER": "Ferry",
    "FBT": "Ferry",
    "FTD": "Ferry",
    # Rail: RLY is the station node; RSE is a station entrance.
    "RLY": "Rail station",
    "RSE": "Rail station",
    "BCT": "Bus stop",
    "BCE": "Bus station",
    "TXR": "Taxi rank",
    "TMU": "Tube station",
    "MET": "Tube station",
}

# Stop types that are access/entrance nodes rather than the primary station or
# terminal node. During dedup the primary node (e.g. RLY/FER) wins, so a station
# with both a station node and entrances yields one POI carrying the station
# node's id and name. The POI's coordinates are the mean of all merged nodes
# (see StationAccumulator.merge), not the station node's own position.
ENTRANCE_STOP_TYPES = {"RSE", "FTD"}

# Categories whose entrances/variants are merged into a single station-level POI
# by normalized name + area (like Tube stations), so an RLY node and its RSE
# entrances collapse to one POI identified by the station node.
STATION_MERGE_CATEGORIES = {TUBE_STATION_CATEGORY, "Rail station", "Ferry"}


# Columns, in order, of every frame returned by the dedup helpers and of the
# parquet file written by download_naptan.
OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"]


def canonical_station_name(name: str | None) -> str:
    """Normalize station names so entrances/transport-mode variants collapse."""
    if not name:
        return ""

    normalized = name.lower()
    normalized = re.sub(r"\([^)]*\)", " ", normalized)
    normalized = re.sub(r"['’`]", "", normalized)
    normalized = normalized.replace("&", " and ")
    normalized = re.sub(r"[^a-z0-9]+", " ", normalized)
    words = normalized.split()

    suffixes = (
        ("underground", "station"),
        ("tube", "station"),
        ("dlr", "station"),
        ("metro", "station"),
        ("tram", "stop"),
        ("rail", "station"),
        ("railway", "station"),
        ("station",),
        ("stop",),
    )
    while True:
        suffix = next(
            (suffix for suffix in suffixes if words[-len(suffix) :] == list(suffix)),
            None,
        )
        if suffix is None:
            break
        del words[-len(suffix) :]

    return " ".join(words)


def canonical_station_name_expr(name_col: str = "name") -> pl.Expr:
    """Build a Polars expression mirroring ``canonical_station_name``.

    The expression lower-cases the column, drops parenthesised qualifiers and
    apostrophes, turns ``&`` into ``and``, collapses every other run of
    non-alphanumeric characters to one space, and then removes the whole run
    of trailing transport-mode phrases in one anchored replacement.

    Removing the run as a unit keeps this in step with the scalar function,
    which strips suffixes repeatedly: stacked suffixes ("x station stop") and
    names made only of suffixes ("tube station") reduce the same way in both.

    Unlike ``canonical_station_name``, which maps ``None`` to ``""``, no null
    handling is added here; null inputs are left to Polars' usual null
    propagation.

    Args:
        name_col: Name of the string column to normalize.

    Returns:
        An expression yielding the normalized station name.
    """
    expr = pl.col(name_col).str.to_lowercase()
    expr = expr.str.replace_all(r"\([^)]*\)", " ")
    expr = expr.str.replace_all(r"['’`]", "")
    expr = expr.str.replace_all(r"&", " and ")
    expr = expr.str.replace_all(r"[^a-z0-9]+", " ")
    expr = expr.str.replace_all(r"\s+", " ").str.strip_chars()

    # One or more suffix phrases, each starting at a word boundary (start of
    # string or whitespace), running up to the end of the string. A mode word
    # is only removed together with its "station"/"stop" noun.
    trailing_suffixes = (
        r"(?:(?:^|\s+)"
        r"(?:(?:underground|tube|dlr|metro|rail|railway)\s+station"
        r"|tram\s+stop"
        r"|station"
        r"|stop))+$"
    )
    expr = expr.str.replace_all(trailing_suffixes, "")
    return expr.str.strip_chars()


def _has_locality() -> pl.Expr:
    """Return a predicate that is true when ``locality`` is neither null nor empty."""
    locality = pl.col("locality")
    return locality.is_not_null() & (locality != "")


def _empty_output_frame() -> pl.DataFrame:
    """Return a zero-row frame with the output columns and their dtypes."""
    column_dtypes = (
        ("id", pl.String),
        ("name", pl.String),
        ("category", pl.String),
        ("lat", pl.Float64),
        ("lng", pl.Float64),
    )
    return pl.DataFrame(
        {column: pl.Series([], dtype=dtype) for column, dtype in column_dtypes}
    )


def station_name_score(name: str, entrance: bool = False) -> tuple[int, int, int]:
    """Rank a candidate station name; the lowest tuple is the preferred one.

    Tuples compare element by element, so the ranking prefers, in order: a
    primary station/terminal node over an entrance, a name without a
    transport-mode suffix, and finally the shorter name.

    Args:
        name: Candidate display name.
        entrance: Whether the name comes from an entrance node.

    Returns:
        ``(entrance flag, suffix flag, name length)``.
    """
    mode_suffixes = (
        " underground station",
        " tube station",
        " dlr station",
        " metro station",
        " tram stop",
        " station",
        " stop",
    )
    lowered = name.lower()
    has_mode_suffix = any(lowered.endswith(suffix) for suffix in mode_suffixes)
    return (int(entrance), int(has_mode_suffix), len(name))


@dataclass
class StationAccumulator:
    """Running aggregate of the rows merged into one station POI.

    Coordinates are kept as sums plus a row count so the centroid can be read
    at any time. ``id``, ``name`` and ``entrance`` always describe the
    best-ranked row merged so far; ``category`` stays that of the first row.
    """

    # Identifier and display name of the currently preferred row.
    id: str
    name: str
    # Category of the row that created the accumulator.
    category: str
    # Sums of the latitudes / longitudes of all merged rows.
    lat_sum: float
    lng_sum: float
    # Whether the currently preferred row is an entrance node.
    entrance: bool = False
    # Number of rows merged, including the first.
    count: int = 1

    @property
    def lat(self) -> float:
        """Mean latitude of the merged rows."""
        return self.lat_sum / self.count

    @property
    def lng(self) -> float:
        """Mean longitude of the merged rows."""
        return self.lng_sum / self.count

    def same_area(self, lat: float, lng: float) -> bool:
        """Return whether ``(lat, lng)`` lies within the merge radius of the centroid.

        The longitude difference is scaled by the cosine of the centroid
        latitude before the squared distance is compared with the squared
        radius.
        """
        centre_lat = self.lat
        north_south = centre_lat - lat
        east_west = (self.lng - lng) * math.cos(math.radians(centre_lat))
        squared_distance = north_south * north_south + east_west * east_west
        return squared_distance <= TUBE_STATION_MERGE_RADIUS_DEGREES**2

    def merge(self, row: dict[str, object]) -> None:
        """Fold ``row`` into the aggregate.

        The row's coordinates always join the centroid. Its id, name and
        entrance flag replace the current ones only when the row ranks
        strictly better under ``station_name_score``.
        """
        self.lat_sum += float(row["lat"])
        self.lng_sum += float(row["lng"])
        self.count += 1

        candidate_name = str(row["name"] or "")
        candidate_is_entrance = bool(row.get("entrance"))
        challenger = station_name_score(candidate_name, candidate_is_entrance)
        incumbent = station_name_score(self.name, self.entrance)
        if challenger >= incumbent:
            return

        self.id = str(row["id"] or "")
        self.name = candidate_name
        self.entrance = candidate_is_entrance


def _station_from_row(row: dict[str, object]) -> StationAccumulator:
    """Start a new accumulator from a single row.

    Null ``id``/``name``/``category`` values become empty strings; a missing
    or falsy ``entrance`` value counts as "not an entrance".
    """
    station_id = str(row["id"] or "")
    station_name = str(row["name"] or "")
    station_category = str(row["category"] or "")
    latitude = float(row["lat"])
    longitude = float(row["lng"])
    is_entrance = bool(row.get("entrance"))
    return StationAccumulator(
        id=station_id,
        name=station_name,
        category=station_category,
        lat_sum=latitude,
        lng_sum=longitude,
        entrance=is_entrance,
    )


def _deduplicate_station_areas(df: pl.DataFrame) -> pl.DataFrame:
    """Merge rows that share a category, a normalized name and a small area.

    Rows are processed in input order. A row joins the first existing station
    with the same (category, canonical name) key whose centroid is within the
    merge radius; otherwise it starts a new station. Rows whose canonical name
    is empty are never merged.

    Returns:
        One row per station, with the columns in ``OUTPUT_COLUMNS``.
    """
    if len(df) == 0:
        return _empty_output_frame()

    stations: list[StationAccumulator] = []
    # (category, canonical name) -> positions in ``stations`` with that key.
    index_by_key: dict[tuple[str, str], list[int]] = {}

    for row in df.iter_rows(named=True):
        canonical = canonical_station_name(str(row["name"] or ""))
        if not canonical:
            stations.append(_station_from_row(row))
            continue

        # Category is part of the key so different modes sharing a name and
        # area (e.g. a rail station and a ferry terminal) stay separate POIs.
        key = (str(row["category"] or ""), canonical)
        candidates = index_by_key.setdefault(key, [])
        lat = float(row["lat"])
        lng = float(row["lng"])

        for position in candidates:
            if stations[position].same_area(lat, lng):
                stations[position].merge(row)
                break
        else:
            candidates.append(len(stations))
            stations.append(_station_from_row(row))

    return pl.DataFrame(
        {
            column: [getattr(station, column) for station in stations]
            for column in OUTPUT_COLUMNS
        }
    ).select(OUTPUT_COLUMNS)


def _deduplicate_local_stops(df: pl.DataFrame) -> pl.DataFrame:
    """Deduplicate non-station stops.

    Rows with a locality are collapsed to one row per exact
    (name, category, locality), keeping the first id and the mean coordinates.
    Rows without a locality go through ``_deduplicate_station_areas`` instead.

    Groups are emitted in first-appearance order (``maintain_order=True``) so
    the result, and the parquet file built from it, is identical across runs
    on the same input.

    Returns:
        The deduplicated rows with the columns in ``OUTPUT_COLUMNS``:
        locality-keyed rows first, then the rows without a locality.
    """
    if df.is_empty():
        return _empty_output_frame()

    with_locality = df.filter(_has_locality())
    without_locality = df.filter(~_has_locality())

    frames: list[pl.DataFrame] = []
    if not with_locality.is_empty():
        # One record per exact stop name/category/locality.
        frames.append(
            with_locality.group_by(
                "name", "category", "locality", maintain_order=True
            )
            .agg(
                pl.col("id").first(),
                pl.col("lat").mean(),
                pl.col("lng").mean(),
            )
            .select(OUTPUT_COLUMNS)
        )
    if not without_locality.is_empty():
        # Stops with no locality can't be deduped by locality, so merge genuine
        # co-located duplicates (same normalized name + category within the
        # same small area) via the station-area logic, while keeping distinct
        # far-apart stops.
        frames.append(_deduplicate_station_areas(without_locality))

    # Defensive: pl.concat rejects an empty list.
    if not frames:
        return _empty_output_frame()

    return pl.concat(frames).select(OUTPUT_COLUMNS)


def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
    """Deduplicate NaPTAN stops, collapsing station entrances into stations.

    Rows whose category is in ``STATION_MERGE_CATEGORIES`` (tube, rail, ferry)
    are merged to one record per station by normalized name + area; within a
    merged station the primary node (e.g. RLY, FER) is preferred over an
    entrance node (RSE, FTD). All other rows are deduplicated by exact
    name + category + locality.

    Returns:
        The non-station rows followed by the station rows, with the columns
        in ``OUTPUT_COLUMNS``.
    """
    is_station = pl.col("category").is_in(list(STATION_MERGE_CATEGORIES))

    local_stops = _deduplicate_local_stops(df.filter(~is_station))
    merged_stations = _deduplicate_station_areas(df.filter(is_station))

    combined = pl.concat([local_stops, merged_stations])
    return combined.select(OUTPUT_COLUMNS)


def download_naptan(output: Path, *, timeout: float = 300.0) -> None:
    """Download the NaPTAN CSV, deduplicate it and write a parquet file.

    Keeps only rows with parseable coordinates and a stop type listed in
    ``STOP_TYPES``, maps stop types to category labels, deduplicates with
    ``deduplicate_naptan`` and writes the result to ``output``. Progress and
    per-category counts are printed to stdout.

    Args:
        output: Destination parquet path; parent directories are created.
        timeout: Seconds allowed for each blocking network operation. Without
            a timeout a stalled connection would hang the pipeline forever.

    Raises:
        urllib.error.URLError: If the request fails (including a connection
            timeout).
        TimeoutError: If reading the response body stalls past ``timeout``.
    """
    output.parent.mkdir(parents=True, exist_ok=True)

    print(f"Downloading NaPTAN data from {NAPTAN_CSV_URL}")
    with urllib.request.urlopen(NAPTAN_CSV_URL, timeout=timeout) as resp:
        raw = resp.read()

    print(f"Downloaded {len(raw) / (1024 * 1024):.1f} MB")

    df = (
        # infer_schema_length=0 reads every column as a string; coordinates
        # are cast explicitly and unparseable values become null.
        pl.read_csv(io.BytesIO(raw), infer_schema_length=0)
        .with_columns(
            pl.col("Latitude").cast(pl.Float64, strict=False),
            pl.col("Longitude").cast(pl.Float64, strict=False),
        )
        .drop_nulls(subset=["Latitude", "Longitude"])
        .filter(pl.col("StopType").is_in(list(STOP_TYPES.keys())))
        .select(
            pl.col("ATCOCode").alias("id"),
            pl.col("CommonName").alias("name"),
            pl.col("StopType").replace(STOP_TYPES).alias("category"),
            pl.col("Latitude").alias("lat"),
            pl.col("Longitude").alias("lng"),
            pl.col("NptgLocalityCode").alias("locality"),
            # Flag entrance nodes so dedup can prefer the primary node.
            pl.col("StopType").is_in(list(ENTRANCE_STOP_TYPES)).alias("entrance"),
        )
    )

    before = len(df)
    df = deduplicate_naptan(df)

    print(
        f"Deduplicated {before:,} → {len(df):,} stops "
        "(by name+category+locality; "
        "tube/rail/ferry stations by normalized name+area)"
    )

    df.write_parquet(output)
    size_mb = output.stat().st_size / (1024 * 1024)
    print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} stations)")

    counts = df.group_by("category").len().sort("len", descending=True)
    for row in counts.iter_rows(named=True):
        print(f"  {row['category']}: {row['len']:,}")


def main() -> None:
    """Parse the command line and run the download to the requested path."""
    cli = argparse.ArgumentParser(description="Download NaPTAN station data")
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    options = cli.parse_args()
    download_naptan(options.output)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()