perfect-postcode/pipeline/download/naptan.py

"""Download NaPTAN data and extract transport POIs (stations, stops, terminals)."""

import argparse
import io
import math
import re
import urllib.request
from dataclasses import dataclass
from pathlib import Path

import polars as pl

# NaPTAN access-nodes export, requested in CSV format; download_naptan fetches
# and parses it in one go.
NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
# Output category labels. Every TMU/MET row starts as TRAM_METRO_CATEGORY; a
# merged group flagged as London Underground is reported as
# TUBE_STATION_CATEGORY (see StationAccumulator.output_category).
TUBE_STATION_CATEGORY = "Tube station"
TRAM_METRO_CATEGORY = "Tram & Metro stop"
# Maximum distance, in degrees (longitude difference scaled by cos(latitude)),
# between a row and a group's centroid for them to count as the same station.
# Despite the "TUBE" in the name, StationAccumulator.same_area applies it to
# every row that goes through the station-area dedup.
TUBE_STATION_MERGE_RADIUS_DEGREES = 0.01

# London Underground ATCO codes are "<area><kind>ZZLU<station>": a 3-digit
# AdministrativeAreaCode (940 national, 490 London, plus 150/210/040/... for
# LU stations outside Greater London such as Epping or Amersham), then "0"
# (platform/entrance node) or "G" (station group node), then the system code.
# "ZZLU" is unique to London Underground, which cleanly separates genuine Tube
# stations from every other TMU/MET network (Metrolink, Supertram, T&W Metro,
# WM Metro, Blackpool Tramway, heritage railways, ...).
# The pattern is anchored at the start of the code and matched
# case-insensitively ("(?i)"); download_naptan uses it to derive the "is_lu"
# flag from the ATCOCode column.
LONDON_UNDERGROUND_ATCO_PATTERN = r"(?i)^\d{3}[0G]ZZLU"


# Maps NaPTAN StopType codes to the POI category written to the output.
# download_naptan keeps only rows whose StopType is a key of this mapping, so
# adding or removing a key changes which stops are exported at all.
STOP_TYPES = {
    "AIR": "Airport",
    # Ferry: FER/FBT are the terminal/berth nodes; FTD is a docking entrance.
    "FER": "Ferry",
    "FBT": "Ferry",
    "FTD": "Ferry",
    # Rail: RLY is the station node; RSE is a station entrance.
    "RLY": "Rail station",
    "RSE": "Rail station",
    "BCT": "Bus stop",
    # Bus/coach stations: BST is the station access-area node, BCS/BCQ are
    # bays/stands within the station and BCE is a station entrance. NaPTAN maps
    # very few BCE nodes (~80 GB-wide), so without BST/BCS/BCQ the category was
    # so sparse that 20% of England showed the nearest bus station >100km away.
    # Bays and entrances collapse to one POI per station via
    # STATION_MERGE_CATEGORIES below.
    "BST": "Bus station",
    "BCS": "Bus station",
    "BCQ": "Bus station",
    "BCE": "Bus station",
    "TXR": "Taxi rank",
    # Tram/Metro/Underground: TMU is an entrance node, MET the station access
    # area. Both start as "Tram & Metro stop"; merged stations whose ATCO codes
    # mark them as London Underground (ZZLU) are reclassified to "Tube station"
    # after dedup (see _deduplicate_station_areas). Heritage railways (RHDR,
    # Severn Valley, ...) are TMU/MET in NaPTAN with no machine-readable
    # "heritage" flag, so they remain in "Tram & Metro stop".
    "TMU": TRAM_METRO_CATEGORY,
    "MET": TRAM_METRO_CATEGORY,
}

# Stop types that are access/entrance nodes rather than the primary station or
# terminal node. During dedup the primary node (e.g. RLY/FER/MET) wins so a
# station with both a station node and entrances yields one POI at the station
# node.
# download_naptan turns membership in this set into the boolean "entrance"
# column, which station_name_score ranks ahead of every other criterion.
ENTRANCE_STOP_TYPES = {"RSE", "FTD", "TMU", "BCE"}

# Categories whose entrances/variants are merged into a single station-level POI
# by normalized name + area (like Tube stations), so an RLY node and its RSE
# entrances collapse to one POI at the station node.
# Rows in any other category are deduplicated by exact name+category+locality
# instead (see deduplicate_naptan).
STATION_MERGE_CATEGORIES = {
    TRAM_METRO_CATEGORY,
    TUBE_STATION_CATEGORY,
    "Rail station",
    "Ferry",
    "Bus station",
}


# Columns (and their order) of every deduplicated frame and of the parquet file
# written by download_naptan.
OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"]

# Trailing entrance designators ("North East Ent", "Main Entrance No 2",
# "West Station Entrance", ...) are stripped from canonical names so a
# station's individually-named entrance nodes collapse into the station.
# A trailing run of filler words is only stripped when it contains at least
# one entrance word, so "Maze Hill North" or "Platform 1" are untouched.
# These sets are consumed by _strip_entrance_suffix, which additionally treats
# any all-digit token as filler. All entries are lowercase because names are
# lowercased before tokenizing.
_ENTRANCE_NAME_WORDS = {"ent", "entrance", "entrances", "access"}
_ENTRANCE_FILLER_WORDS = {
    "north",
    "south",
    "east",
    "west",
    "ne",
    "nw",
    "se",
    "sw",
    "n",
    "s",
    "e",
    "w",
    "wt",
    "main",
    "side",
    "no",
    "station",
    "stop",
    "platform",
}

# Regex counterparts of _ENTRANCE_NAME_WORDS / _ENTRANCE_FILLER_WORDS, used by
# canonical_station_name_expr. The word lists are duplicated by hand, so any
# change to the sets must be mirrored here; "\d+" stands in for the isdigit()
# check in _strip_entrance_suffix.
_ENTRANCE_WORDS_RE = "(?:ent|entrance|entrances|access)"
_ENTRANCE_FILLER_RE = (
    r"(?:north|south|east|west|ne|nw|se|sw|n|s|e|w|wt|main|side|no|station|stop"
    r"|platform|\d+)"
)
# A trailing run of filler/entrance words containing at least one entrance
# word, anchored at the end of the string. Every token must be preceded by
# whitespace, so the pattern can never consume the first word of a name.
_ENTRANCE_SUFFIX_RE = (
    rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*"
    rf"\s+{_ENTRANCE_WORDS_RE}"
    rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*$"
)

# Bus-station bay/stand designators ("Stand A3", "Bay 2", "Stance 5", "Gate 4")
# are stripped so every bay of one station shares a canonical name. The
# designator word must be followed by a short alphanumeric token, so place names
# ending in a bare "Bay" (Colwyn Bay, Herne Bay) are untouched.
# _BAY_WORDS is used by canonical_station_name and _BAY_SUFFIX_RE by
# canonical_station_name_expr; keep the two word lists in sync.
_BAY_WORDS = {"stand", "stance", "bay", "gate"}
_BAY_SUFFIX_RE = r"\s+(?:stand|stance|bay|gate)\s+[a-z0-9]{1,3}$"


def _strip_entrance_suffix(words: list[str]) -> list[str]:
    """Remove a trailing entrance designator from a tokenized stop name.

    Walks the tokens from the end while they are entrance words, filler words
    or pure digits. The scanned tail is dropped only if it contained at least
    one entrance word; otherwise the input list is returned unchanged.
    """
    keep = len(words)
    found_entrance = False
    for token in reversed(words):
        if token in _ENTRANCE_NAME_WORDS:
            found_entrance = True
        elif not (token.isdigit() or token in _ENTRANCE_FILLER_WORDS):
            # First token that is part of the real name: stop scanning.
            break
        keep -= 1
    if not found_entrance:
        return words
    return words[:keep]


def canonical_station_name(name: str | None) -> str:
    """Return the dedup key for a station name.

    The name is lowercased, parentheticals and apostrophes are removed, "&"
    becomes "and" and every other non-alphanumeric run becomes a space. A
    trailing entrance designator, one trailing bay/stand designator and any
    number of trailing transport-mode suffixes are then stripped. Returns ""
    for empty/None input or when nothing is left after stripping.
    """
    if not name:
        return ""

    text = re.sub(r"\([^)]*\)", " ", name.lower())
    text = re.sub(r"['’`]", "", text).replace("&", " and ")
    tokens = _strip_entrance_suffix(re.sub(r"[^a-z0-9]+", " ", text).split())

    # "<name> bay 2" style designators; needs a name word in front so that
    # a two-word place name such as "colwyn bay" is never touched.
    if len(tokens) >= 3 and tokens[-2] in _BAY_WORDS and len(tokens[-1]) <= 3:
        tokens = tokens[:-2]

    # Order matters: multi-word suffixes are tried before their one-word tails.
    mode_suffixes = (
        ["underground", "station"],
        ["tube", "station"],
        ["dlr", "station"],
        ["metro", "station"],
        ["metrolink", "station"],
        ["metrolink", "stop"],
        ["tram", "stop"],
        ["rail", "station"],
        ["railway", "station"],
        ["station"],
        ["stop"],
        ["metrolink"],
    )
    stripped = True
    while stripped:
        stripped = False
        for suffix in mode_suffixes:
            if tokens[-len(suffix) :] == suffix:
                tokens = tokens[: -len(suffix)]
                stripped = True
                break

    return " ".join(tokens)


_QUALIFIER_RE = re.compile(r"\(([^)]*)\)")


def station_name_qualifier(name: str | None) -> str:
    """The canonicalized parenthetical of a station name, e.g. "Edgware Road
    (Bakerloo)" -> "bakerloo".

    Genuinely distinct same-named stations (the two Edgware Roads ~150m apart,
    Hammersmith's two stations) differ ONLY by this parenthetical, which
    `canonical_station_name` strips; it must block their merge while still
    letting unqualified entrance/variant rows collapse into either.
    """
    if not name:
        return ""
    parts = _QUALIFIER_RE.findall(name)
    if not parts:
        return ""
    text = " ".join(parts).lower().replace("&", " and ")
    return re.sub(r"[^a-z0-9]+", " ", text).strip()


def canonical_station_name_expr(name_col: str = "name") -> pl.Expr:
    """Build a polars expression that canonicalizes the station names in
    `name_col`, mirroring `canonical_station_name`.

    NOTE(review): the two are not exactly equivalent. Each suffix pattern here
    is applied once, in a fixed order, and requires a preceding word, whereas
    `canonical_station_name` strips mode suffixes repeatedly and can reduce a
    name to "". Names such as "Main Entrance" or "Foo Station Metrolink"
    therefore yield different keys - confirm whether callers rely on the two
    agreeing.
    """
    cleaned = pl.col(name_col).str.to_lowercase()

    # Character-level cleanup, then whitespace collapse.
    for pattern, replacement in (
        (r"\([^)]*\)", " "),
        (r"['’`]", ""),
        (r"&", " and "),
        (r"[^a-z0-9]+", " "),
        (r"\s+", " "),
    ):
        cleaned = cleaned.str.replace_all(pattern, replacement)
    cleaned = cleaned.str.strip_chars()

    # Trailing designators/suffixes, removed in this exact order.
    for suffix_pattern in (
        _ENTRANCE_SUFFIX_RE,
        _BAY_SUFFIX_RE,
        r"\s+(underground|tube|dlr|metro|metrolink|rail|railway)\s+station$",
        r"\s+(metrolink|tram)\s+stop$",
        r"\s+(station|stop)$",
        r"\s+metrolink$",
    ):
        cleaned = cleaned.str.replace_all(suffix_pattern, "")

    return cleaned.str.strip_chars()


def _has_locality() -> pl.Expr:
    """Expression that is true for rows with a non-null, non-empty "locality"."""
    locality = pl.col("locality")
    return locality.is_not_null() & (locality != "")


def _empty_output_frame() -> pl.DataFrame:
    """Return a zero-row frame with the output columns and their dtypes."""
    column_dtypes = {
        "id": pl.String,
        "name": pl.String,
        "category": pl.String,
        "lat": pl.Float64,
        "lng": pl.Float64,
    }
    return pl.DataFrame(
        {
            column: pl.Series([], dtype=dtype)
            for column, dtype in column_dtypes.items()
        }
    )


def station_name_score(name: str, entrance: bool = False) -> tuple[int, int, int]:
    """Rank a candidate display name; the lowest tuple wins.

    Ordering criteria, most significant first: primary station/terminal node
    before entrance node, name without a transport-mode suffix before one
    with, shorter name before longer.
    """
    mode_suffixes = (
        " underground station",
        " tube station",
        " dlr station",
        " metro station",
        " tram stop",
        " station",
        " stop",
    )
    has_mode_suffix = name.lower().endswith(mode_suffixes)
    return (int(entrance), 1 if has_mode_suffix else 0, len(name))


@dataclass
class StationAccumulator:
    """Running aggregate of the rows merged into one station POI.

    Coordinates are stored as sums plus a row count so the centroid can be
    updated incrementally. `id`, `name` and `entrance` always describe the
    best-ranked row merged so far.
    """

    id: str
    name: str
    category: str
    lat_sum: float
    lng_sum: float
    # True when the row currently supplying id/name is an entrance node.
    entrance: bool = False
    # True once any merged row carried a London Underground ATCO code.
    is_lu: bool = False
    count: int = 1
    # First non-empty parenthetical qualifier seen in the group ("" if none).
    qualifier: str = ""

    @property
    def lat(self) -> float:
        """Mean latitude of the merged rows."""
        return self.lat_sum / self.count

    @property
    def lng(self) -> float:
        """Mean longitude of the merged rows."""
        return self.lng_sum / self.count

    def same_area(self, lat: float, lng: float) -> bool:
        """Whether (lat, lng) lies within the merge radius of the centroid.

        The longitude difference is scaled by cos(latitude) so the radius
        covers a comparable east-west and north-south extent.
        """
        centre_lat = self.lat
        north_south = centre_lat - lat
        east_west = (self.lng - lng) * math.cos(math.radians(centre_lat))
        squared_distance = north_south * north_south + east_west * east_west
        return squared_distance <= TUBE_STATION_MERGE_RADIUS_DEGREES**2

    def qualifier_compatible(self, qualifier: str) -> bool:
        """Whether a row with this qualifier may join the group.

        Two different non-empty parentheticals mark distinct same-named
        stations; a missing qualifier on either side is always compatible.
        """
        if not qualifier or not self.qualifier:
            return True
        return qualifier == self.qualifier

    def merge(self, row: dict[str, object]) -> None:
        """Fold `row` into the group, possibly promoting it to display row."""
        self.lat_sum += float(row["lat"])
        self.lng_sum += float(row["lng"])
        self.count += 1
        self.is_lu = self.is_lu or bool(row.get("is_lu"))

        row_name = str(row["name"] or "")
        row_qualifier = station_name_qualifier(row_name)
        if not self.qualifier:
            self.qualifier = row_qualifier
        row_is_entrance = bool(row.get("entrance"))

        # A display name carrying the group's disambiguating parenthetical is
        # preferred: without it the two Edgware Roads would both render as
        # the bare "Edgware Road Underground Station".
        incoming_rank = (
            self._qualifier_penalty(row_qualifier),
        ) + station_name_score(row_name, row_is_entrance)
        held_rank = (
            self._qualifier_penalty(station_name_qualifier(self.name)),
        ) + station_name_score(self.name, self.entrance)
        if incoming_rank < held_rank:
            self.id = str(row["id"] or "")
            self.name = row_name
            self.entrance = row_is_entrance

    def _qualifier_penalty(self, name_qualifier: str) -> int:
        """1 if the group has a qualifier that this name does not carry, else 0."""
        if not self.qualifier:
            return 0
        return int(name_qualifier != self.qualifier)

    @property
    def output_category(self) -> str:
        """Category to emit for the merged station.

        A tram/metro group counts as a Tube station when ANY merged node had
        a London Underground ATCO code, not just the winning one, because LU
        entrance nodes often carry non-ZZLU codes (e.g. 4900VICT...).
        """
        is_tube = self.category == TRAM_METRO_CATEGORY and self.is_lu
        return TUBE_STATION_CATEGORY if is_tube else self.category


def _station_from_row(row: dict[str, object]) -> StationAccumulator:
    """Start a new single-row station group from `row`.

    Null id/name/category become ""; missing or falsy "entrance"/"is_lu"
    values become False.
    """
    station_id = str(row["id"] or "")
    station_name = str(row["name"] or "")
    return StationAccumulator(
        id=station_id,
        name=station_name,
        category=str(row["category"] or ""),
        lat_sum=float(row["lat"]),
        lng_sum=float(row["lng"]),
        entrance=bool(row.get("entrance")),
        is_lu=bool(row.get("is_lu")),
        qualifier=station_name_qualifier(station_name),
    )


def _deduplicate_station_areas(df: pl.DataFrame) -> pl.DataFrame:
    """Merge rows describing the same station into one POI each.

    A row joins the first existing group that has the same category and
    canonical name, lies within the merge radius of the group's centroid and
    has a compatible qualifier. Rows whose canonical name is empty are never
    merged. The result has OUTPUT_COLUMNS, with lat/lng averaged over each
    group.
    """
    if len(df) == 0:
        return _empty_output_frame()

    stations: list[StationAccumulator] = []
    # (category, canonical name) -> positions in `stations` of its groups.
    positions_by_key: dict[tuple[str, str], list[int]] = {}

    for row in df.iter_rows(named=True):
        # Category is part of the key so different modes sharing a name/area
        # (e.g. a rail station and a ferry terminal) stay separate POIs.
        category = str(row["category"] or "")
        raw_name = str(row["name"] or "")
        canonical = canonical_station_name(raw_name)
        if not canonical:
            stations.append(_station_from_row(row))
            continue

        qualifier = station_name_qualifier(raw_name)
        candidates = positions_by_key.setdefault((category, canonical), [])
        for position in candidates:
            group = stations[position]
            if group.same_area(
                float(row["lat"]), float(row["lng"])
            ) and group.qualifier_compatible(qualifier):
                group.merge(row)
                break
        else:
            # No nearby compatible group: this row starts a new one.
            candidates.append(len(stations))
            stations.append(_station_from_row(row))

    columns: dict[str, list[object]] = {column: [] for column in OUTPUT_COLUMNS}
    for station in stations:
        columns["id"].append(station.id)
        columns["name"].append(station.name)
        columns["category"].append(station.output_category)
        columns["lat"].append(station.lat)
        columns["lng"].append(station.lng)
    return pl.DataFrame(columns).select(OUTPUT_COLUMNS)


def _deduplicate_local_stops(df: pl.DataFrame) -> pl.DataFrame:
    """Deduplicate non-station stops.

    Rows with a locality collapse to one record per exact
    name/category/locality, keeping the first id and the mean coordinates.
    Rows without a locality go through the station-area logic instead.
    Returns a frame with OUTPUT_COLUMNS; an empty input gives an empty frame.
    """
    if len(df) == 0:
        return _empty_output_frame()

    has_loc = df.filter(_has_locality())
    no_loc = df.filter(~_has_locality())

    # First pass: one record per exact stop name/category/locality.
    frames = []
    if len(has_loc) > 0:
        frames.append(
            # maintain_order=True keeps groups in first-appearance order.
            # Without it polars returns groups in an unspecified order, so the
            # row order of the written file would differ between runs.
            has_loc.group_by("name", "category", "locality", maintain_order=True)
            .agg(
                pl.col("id").first(),
                pl.col("lat").mean(),
                pl.col("lng").mean(),
            )
            .select(OUTPUT_COLUMNS)
        )
    if len(no_loc) > 0:
        # Stops with no locality can't be deduped by locality, so merge genuine
        # co-located duplicates (same name+category within the same small area)
        # via the station-area logic, while keeping distinct far-apart stops.
        frames.append(_deduplicate_station_areas(no_loc))

    if not frames:
        return _empty_output_frame()

    return pl.concat(frames).select(OUTPUT_COLUMNS)


def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
    """Deduplicate NaPTAN stops, merging station/terminal entrances by area.

    Rows in STATION_MERGE_CATEGORIES (tram/metro, rail, ferry, bus station)
    become one record per station, matched on normalized name + area; the
    primary station/terminal node (e.g. RLY, FER, MET, BST) wins over an
    entrance node (RSE, FTD, TMU, BCE). A merged tram/metro station with a
    London Underground ATCO code anywhere in its group is emitted as
    "Tube station". All remaining rows are deduplicated by exact
    name+category+locality. Local stops come first in the result.
    """
    is_station = pl.col("category").is_in(list(STATION_MERGE_CATEGORIES))
    local_part = _deduplicate_local_stops(df.filter(~is_station))
    station_part = _deduplicate_station_areas(df.filter(is_station))
    return pl.concat([local_part, station_part]).select(OUTPUT_COLUMNS)


def filter_active_stops(df: pl.DataFrame) -> pl.DataFrame:
    """Drop stops whose Status is anything other than active.

    The NaPTAN export's Status column marks stops as active/inactive/pending;
    unfiltered, closed stations ("(closed)", "not in use") would ship as live
    POIs. A null Status is given the benefit of the doubt and kept. When the
    column is absent (older extracts) a warning is printed and the frame is
    returned unchanged.
    """
    if "Status" not in df.columns:
        print("WARNING: NaPTAN data has no Status column; keeping all stops")
        return df

    status = pl.col("Status")
    is_active = status.str.strip_chars().str.to_lowercase().is_in(["active", "act"])
    kept = df.filter(status.is_null() | is_active)

    dropped = len(df) - len(kept)
    if dropped:
        print(f"Dropped {dropped:,} non-active stops (Status != active)")
    return kept


def download_naptan(output: Path, *, timeout: float = 300.0) -> None:
    """Download the NaPTAN CSV export and write deduplicated POIs as parquet.

    Steps: fetch NAPTAN_CSV_URL, drop rows without parseable coordinates,
    keep the stop types listed in STOP_TYPES, drop non-active stops,
    deduplicate, then write `output` and print a per-category summary.

    Args:
        output: Destination parquet path; parent directories are created.
        timeout: Seconds any single blocking network operation (connect or
            read) may take. Without it a stalled connection would hang the
            pipeline indefinitely.

    Raises:
        urllib.error.URLError: If the request fails. A timeout may instead
            surface as a socket timeout error while the body is being read.
    """
    output.parent.mkdir(parents=True, exist_ok=True)

    print(f"Downloading NaPTAN data from {NAPTAN_CSV_URL}")
    with urllib.request.urlopen(NAPTAN_CSV_URL, timeout=timeout) as resp:
        raw = resp.read()

    print(f"Downloaded {len(raw) / (1024 * 1024):.1f} MB")

    df = (
        # infer_schema_length=0 reads every column as a string, so codes are
        # never reinterpreted as numbers; coordinates are cast explicitly and
        # unparseable ones become null (strict=False) before being dropped.
        pl.read_csv(io.BytesIO(raw), infer_schema_length=0)
        .with_columns(
            pl.col("Latitude").cast(pl.Float64, strict=False),
            pl.col("Longitude").cast(pl.Float64, strict=False),
        )
        .drop_nulls(subset=["Latitude", "Longitude"])
        .filter(pl.col("StopType").is_in(list(STOP_TYPES.keys())))
    )
    df = filter_active_stops(df).select(
        pl.col("ATCOCode").alias("id"),
        pl.col("CommonName").alias("name"),
        pl.col("StopType").replace(STOP_TYPES).alias("category"),
        pl.col("Latitude").alias("lat"),
        pl.col("Longitude").alias("lng"),
        pl.col("NptgLocalityCode").alias("locality"),
        pl.col("StopType").is_in(list(ENTRANCE_STOP_TYPES)).alias("entrance"),
        pl.col("ATCOCode")
        .str.contains(LONDON_UNDERGROUND_ATCO_PATTERN)
        .fill_null(False)
        .alias("is_lu"),
    )

    before = len(df)
    df = deduplicate_naptan(df)

    # Rail, ferry, bus and tram/metro stations are all merged by name+area,
    # not only tube stations, so the summary says "stations".
    print(
        f"Deduplicated {before:,} → {len(df):,} stops "
        "(by name+category+locality; stations by normalized name+area)"
    )

    df.write_parquet(output)
    size_mb = output.stat().st_size / (1024 * 1024)
    print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} stations)")

    counts = df.group_by("category").len().sort("len", descending=True)
    for row in counts.iter_rows(named=True):
        print(f"  {row['category']}: {row['len']:,}")


def main() -> None:
    """Command-line entry point: parse --output and run the download."""
    cli = argparse.ArgumentParser(description="Download NaPTAN station data")
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    download_naptan(cli.parse_args().output)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()