"""Download NaPTAN data and extract railway/metro station POIs."""
import argparse
import io
import math
import re
import urllib.request
from dataclasses import dataclass
from pathlib import Path
import polars as pl
# NaPTAN access-node export (CSV) fetched by download_naptan.
NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
# Output category names for the tram/metro family. Nodes start out as
# TRAM_METRO_CATEGORY; merged stations flagged as London Underground are
# reported as TUBE_STATION_CATEGORY.
TUBE_STATION_CATEGORY = "Tube station"
TRAM_METRO_CATEGORY = "Tram & Metro stop"
# Merge radius in degrees of latitude (longitude differences are scaled by
# cos(latitude) before comparison). Despite the name it is applied to every
# station-area merge, not only Tube stations.
TUBE_STATION_MERGE_RADIUS_DEGREES = 0.01
# London Underground ATCO codes have the form "<AAA><0|G>ZZLU...": a 3-digit
# AdministrativeAreaCode (940 national, 490 London, plus 150/210/040/... for
# LU stations outside Greater London such as Epping or Amersham), then "0"
# (platform/entrance node) or "G" (station group node), then the system code
# "ZZLU".
# "ZZLU" is unique to London Underground, which cleanly separates genuine Tube
# stations from every other TMU/MET network (Metrolink, Supertram, T&W Metro,
# WM Metro, Blackpool Tramway, heritage railways, ...).
# The pattern is anchored at the start of the code and is case-insensitive.
LONDON_UNDERGROUND_ATCO_PATTERN = r"(?i)^\d{3}[0G]ZZLU"
# Maps NaPTAN StopType codes to the output POI category name. download_naptan
# keeps only rows whose StopType is a key of this mapping, so any stop type
# not listed here is dropped from the output.
STOP_TYPES = {
"AIR": "Airport",
# Ferry: FER/FBT are the terminal/berth nodes; FTD is a docking entrance.
"FER": "Ferry",
"FBT": "Ferry",
"FTD": "Ferry",
# Rail: RLY is the station node; RSE is a station entrance.
"RLY": "Rail station",
"RSE": "Rail station",
"BCT": "Bus stop",
# Bus/coach stations: BST is the station access-area node, BCS/BCQ are
# bays/stands within the station and BCE is a station entrance. NaPTAN maps
# very few BCE nodes (~80 GB-wide), so without BST/BCS/BCQ the category was
# so sparse that 20% of England showed the nearest bus station >100km away.
# Bays and entrances collapse to one POI per station via
# STATION_MERGE_CATEGORIES below.
"BST": "Bus station",
"BCS": "Bus station",
"BCQ": "Bus station",
"BCE": "Bus station",
"TXR": "Taxi rank",
# Tram/Metro/Underground: TMU is an entrance node, MET the station access
# area. Both start as "Tram & Metro stop"; merged stations whose ATCO codes
# mark them as London Underground (ZZLU) are reclassified to "Tube station"
# after dedup (see _deduplicate_station_areas, which reads
# StationAccumulator.output_category). Heritage railways (RHDR,
# Severn Valley, ...) are TMU/MET in NaPTAN with no machine-readable
# "heritage" flag, so they remain in "Tram & Metro stop".
"TMU": TRAM_METRO_CATEGORY,
"MET": TRAM_METRO_CATEGORY,
}
# Stop types that are access/entrance nodes rather than the primary station or
# terminal node. During dedup the primary node (e.g. RLY/FER/MET) wins the
# merged POI's id and name, so a station with both a station node and
# entrances yields one POI identified by the station node. The POI's
# coordinates are the mean of all merged nodes, entrances included.
ENTRANCE_STOP_TYPES = {"RSE", "FTD", "TMU", "BCE"}
# Categories whose entrances/variants are merged into a single station-level POI
# by normalized name + area (like Tube stations), so an RLY node and its RSE
# entrances collapse to one POI carrying the station node's id and name.
STATION_MERGE_CATEGORIES = {
TRAM_METRO_CATEGORY,
TUBE_STATION_CATEGORY,
"Rail station",
"Ferry",
"Bus station",
}
# Columns (and their order) of every frame returned by the dedup helpers and
# written to the output parquet file.
OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"]
# Trailing entrance designators ("North East Ent", "Main Entrance No 2",
# "West Station Entrance", ...) are stripped from canonical names so a
# station's individually-named entrance nodes collapse into the station.
# A trailing run of filler words is only stripped when it contains at least
# one entrance word, so "Maze Hill North" or "Platform 1" are untouched.
#
# The word sets below drive the token-based _strip_entrance_suffix (used by
# canonical_station_name); the *_RE fragments spell out the same word lists
# for the regex used by canonical_station_name_expr. The two lists are
# maintained by hand, so any word added to one must be added to the other.
_ENTRANCE_NAME_WORDS = {"ent", "entrance", "entrances", "access"}
_ENTRANCE_FILLER_WORDS = {
"north",
"south",
"east",
"west",
"ne",
"nw",
"se",
"sw",
"n",
"s",
"e",
"w",
"wt",
"main",
"side",
"no",
"station",
"stop",
"platform",
}
# Regex alternation of the entrance words (mirrors _ENTRANCE_NAME_WORDS).
_ENTRANCE_WORDS_RE = "(?:ent|entrance|entrances|access)"
# Regex alternation of the filler words (mirrors _ENTRANCE_FILLER_WORDS); the
# trailing \d+ plays the role of the isdigit() check in _strip_entrance_suffix.
_ENTRANCE_FILLER_RE = (
r"(?:north|south|east|west|ne|nw|se|sw|n|s|e|w|wt|main|side|no|station|stop"
r"|platform|\d+)"
)
# Matches, anchored at the end of the name: any run of filler/entrance words,
# then one mandatory entrance word, then any further run of filler/entrance
# words. Every matched word must be preceded by whitespace, so the first word
# of a name is never part of the match.
_ENTRANCE_SUFFIX_RE = (
rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*"
rf"\s+{_ENTRANCE_WORDS_RE}"
rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*$"
)
# Bus-station bay/stand designators ("Stand A3", "Bay 2", "Stance 5",
# "Gate 4") are stripped so every bay of one station shares a canonical name.
# The designator word must be followed by a short alphanumeric token (1-3
# characters), so place names ending in a bare "Bay" (Colwyn Bay, Herne Bay)
# are untouched.
# _BAY_WORDS is used by canonical_station_name; _BAY_SUFFIX_RE is the regex
# form of the same rule used by canonical_station_name_expr.
_BAY_WORDS = {"stand", "stance", "bay", "gate"}
_BAY_SUFFIX_RE = r"\s+(?:stand|stance|bay|gate)\s+[a-z0-9]{1,3}$"
def _strip_entrance_suffix(words: list[str]) -> list[str]:
"""Drop a trailing entrance designator (direction/number filler around an
entrance word) from a tokenized stop name; no-op when no entrance word."""
idx = len(words)
saw_entrance = False
while idx > 0:
word = words[idx - 1]
if word in _ENTRANCE_NAME_WORDS:
saw_entrance = True
elif word.isdigit() or word in _ENTRANCE_FILLER_WORDS:
pass
else:
break
idx -= 1
return words[:idx] if saw_entrance else words
def canonical_station_name(name: str | None) -> str:
"""Normalize station names so entrances/transport-mode variants collapse."""
if not name:
return ""
normalized = name.lower()
normalized = re.sub(r"\([^)]*\)", " ", normalized)
normalized = re.sub(r"['’`]", "", normalized)
normalized = normalized.replace("&", " and ")
normalized = re.sub(r"[^a-z0-9]+", " ", normalized)
words = _strip_entrance_suffix(normalized.split())
if len(words) >= 3 and words[-2] in _BAY_WORDS and len(words[-1]) <= 3:
del words[-2:]
suffixes = (
("underground", "station"),
("tube", "station"),
("dlr", "station"),
("metro", "station"),
("metrolink", "station"),
("metrolink", "stop"),
("tram", "stop"),
("rail", "station"),
("railway", "station"),
("station",),
("stop",),
("metrolink",),
)
while True:
suffix = next(
(suffix for suffix in suffixes if words[-len(suffix) :] == list(suffix)),
None,
)
if suffix is None:
break
del words[-len(suffix) :]
return " ".join(words)
def canonical_station_name_expr(name_col: str = "name") -> pl.Expr:
    """Build a Polars expression that canonicalizes station names.

    Applies the same kind of normalization as ``canonical_station_name``:
    lower-casing, removal of parenthesised text and apostrophes, "&" to
    "and", punctuation to spaces, then removal of trailing entrance, bay
    and transport-mode suffixes.

    NOTE(review): this is not an exact mirror of ``canonical_station_name``.
    Each suffix pattern here is applied once, and every pattern requires a
    preceding word, so e.g. a bare "Station" is returned as "station" here
    while the Python function reduces it to "". Confirm whether callers
    rely on the two agreeing.

    Args:
        name_col: Name of the string column to normalize.

    Returns:
        A string expression yielding the canonical name.
    """
    character_rewrites = (
        (r"\([^)]*\)", " "),
        (r"['’`]", ""),
        (r"&", " and "),
        (r"[^a-z0-9]+", " "),
        (r"\s+", " "),
    )
    # Applied in this order, each anchored at the end of the string.
    trailing_patterns = (
        _ENTRANCE_SUFFIX_RE,
        _BAY_SUFFIX_RE,
        r"\s+(underground|tube|dlr|metro|metrolink|rail|railway)\s+station$",
        r"\s+(metrolink|tram)\s+stop$",
        r"\s+(station|stop)$",
        r"\s+metrolink$",
    )
    canonical = pl.col(name_col).str.to_lowercase()
    for pattern, replacement in character_rewrites:
        canonical = canonical.str.replace_all(pattern, replacement)
    # Trim before suffix matching so the end-of-string anchors line up.
    canonical = canonical.str.strip_chars()
    for pattern in trailing_patterns:
        canonical = canonical.str.replace_all(pattern, "")
    return canonical.str.strip_chars()
def _has_locality() -> pl.Expr:
    """Return a boolean expression: ``locality`` is neither null nor empty."""
    locality = pl.col("locality")
    return locality.is_not_null() & (locality != "")
def _empty_output_frame() -> pl.DataFrame:
    """Return a zero-row frame with the output columns and their dtypes.

    Used wherever a dedup step has nothing to emit, so concatenation with
    non-empty results still sees matching column types.
    """
    column_dtypes = {
        "id": pl.String,
        "name": pl.String,
        "category": pl.String,
        "lat": pl.Float64,
        "lng": pl.Float64,
    }
    return pl.DataFrame(
        {
            column: pl.Series([], dtype=dtype)
            for column, dtype in column_dtypes.items()
        }
    )
def station_name_score(name: str, entrance: bool = False) -> tuple[int, int, int]:
    """Rank a node as the representative of a merged station (lower wins).

    Tuples compare element by element, so the preference order is: a
    primary station/terminal node over an entrance node, then a name
    without a transport-mode suffix, then the shorter name.

    Args:
        name: The node's display name.
        entrance: True when the node is an entrance/access node.

    Returns:
        ``(entrance flag, suffix penalty, name length)``.
    """
    mode_suffixes = (
        " underground station",
        " tube station",
        " dlr station",
        " metro station",
        " tram stop",
        " station",
        " stop",
    )
    has_mode_suffix = name.lower().endswith(mode_suffixes)
    return (int(entrance), int(has_mode_suffix), len(name))
@dataclass
class StationAccumulator:
    """Running aggregate for one merged station/stop POI.

    Holds the id/name of the best-scoring node seen so far together with
    coordinate sums, so the POI position is the mean of every merged node.
    """

    # Identity of the currently winning node.
    id: str
    name: str
    # Category the nodes were grouped under (before any Tube reclassification).
    category: str
    # Sums of the coordinates of all merged nodes; divide by ``count``.
    lat_sum: float
    lng_sum: float
    # Whether the winning node is an entrance/access node.
    entrance: bool = False
    # True once any merged node was flagged as London Underground.
    is_lu: bool = False
    # Number of nodes merged so far.
    count: int = 1

    @property
    def lat(self) -> float:
        """Mean latitude of the nodes merged so far."""
        return self.lat_sum / self.count

    @property
    def lng(self) -> float:
        """Mean longitude of the nodes merged so far."""
        return self.lng_sum / self.count

    def same_area(self, lat: float, lng: float) -> bool:
        """Return True when the point lies within the merge radius.

        Distance is measured from the current mean position, in degrees,
        with the longitude difference scaled by cos(latitude).
        """
        centre_lat = self.lat
        lat_offset = centre_lat - lat
        lng_offset = (self.lng - lng) * math.cos(math.radians(centre_lat))
        squared_distance = lat_offset * lat_offset + lng_offset * lng_offset
        return squared_distance <= TUBE_STATION_MERGE_RADIUS_DEGREES**2

    def merge(self, row: dict[str, object]) -> None:
        """Fold another node (a named row) into this station.

        The row's coordinates always contribute to the mean position. Its
        id/name replace the current ones only when it scores strictly
        better under ``station_name_score``; ties keep the earlier node.
        """
        self.lat_sum += float(row["lat"])
        self.lng_sum += float(row["lng"])
        self.count += 1
        if row.get("is_lu"):
            self.is_lu = True

        candidate_name = str(row["name"] or "")
        candidate_is_entrance = bool(row.get("entrance"))
        challenger = station_name_score(candidate_name, candidate_is_entrance)
        incumbent = station_name_score(self.name, self.entrance)
        if challenger < incumbent:
            self.id = str(row["id"] or "")
            self.name = candidate_name
            self.entrance = candidate_is_entrance

    @property
    def output_category(self) -> str:
        """Category to emit for the merged POI.

        A merged tram/metro station counts as a Tube station when ANY of
        its nodes carried a London Underground ATCO code. The whole group
        is checked (not just the winning node) because LU entrance nodes
        often carry non-ZZLU codes (e.g. 4900VICT...).
        """
        reclassify = self.is_lu and self.category == TRAM_METRO_CATEGORY
        return TUBE_STATION_CATEGORY if reclassify else self.category
def _station_from_row(row: dict[str, object]) -> StationAccumulator:
    """Start a new accumulator from a single node row.

    Null id/name/category become ``""``; missing ``entrance``/``is_lu``
    keys count as False. The row's coordinates seed the running sums.
    """
    node_id = str(row["id"] or "")
    node_name = str(row["name"] or "")
    node_category = str(row["category"] or "")
    latitude = float(row["lat"])
    longitude = float(row["lng"])
    return StationAccumulator(
        id=node_id,
        name=node_name,
        category=node_category,
        lat_sum=latitude,
        lng_sum=longitude,
        entrance=bool(row.get("entrance")),
        is_lu=bool(row.get("is_lu")),
    )
def _deduplicate_station_areas(df: pl.DataFrame) -> pl.DataFrame:
    """Merge nodes sharing a category, canonical name and small area.

    Rows are processed in frame order. Each row joins the first existing
    station with the same (category, canonical name) whose current mean
    position is within the merge radius; otherwise it starts a new station.
    Rows whose canonical name is empty are never merged.

    Args:
        df: Frame with at least id, name, category, lat and lng columns
            (``entrance`` and ``is_lu`` are read when present).

    Returns:
        One row per merged station, in OUTPUT_COLUMNS order, with the
        category taken from ``StationAccumulator.output_category``.
    """
    if len(df) == 0:
        return _empty_output_frame()

    stations: list[StationAccumulator] = []
    clusters: dict[tuple[str, str], list[StationAccumulator]] = {}
    for row in df.iter_rows(named=True):
        canonical = canonical_station_name(str(row["name"] or ""))
        if not canonical:
            stations.append(_station_from_row(row))
            continue

        # Key by category so different modes sharing a name/area (e.g. a rail
        # station and a ferry terminal) are not merged into one POI.
        key = (str(row["category"] or ""), canonical)
        nearby = clusters.setdefault(key, [])
        match = None
        for candidate in nearby:
            if candidate.same_area(float(row["lat"]), float(row["lng"])):
                match = candidate
                break

        if match is not None:
            match.merge(row)
        else:
            created = _station_from_row(row)
            stations.append(created)
            nearby.append(created)

    columns = {
        "id": [entry.id for entry in stations],
        "name": [entry.name for entry in stations],
        "category": [entry.output_category for entry in stations],
        "lat": [entry.lat for entry in stations],
        "lng": [entry.lng for entry in stations],
    }
    return pl.DataFrame(columns).select(OUTPUT_COLUMNS)
def _deduplicate_local_stops(df: pl.DataFrame) -> pl.DataFrame:
    """Deduplicate non-station stops.

    Stops with a locality collapse to one record per exact
    (name, category, locality): the first id in frame order is kept and the
    coordinates are averaged. Stops without a locality go through the
    station-area merge instead.

    Args:
        df: Frame with id, name, category, lat, lng and locality columns.

    Returns:
        Deduplicated stops in OUTPUT_COLUMNS order. Rows appear in a
        deterministic order derived from the input order.
    """
    if len(df) == 0:
        return _empty_output_frame()
    with_locality = df.filter(_has_locality())
    without_locality = df.filter(~_has_locality())
    frames = []
    if len(with_locality) > 0:
        # maintain_order=True keeps groups in first-appearance order; without
        # it the group order (and so the written file) varies between runs.
        frames.append(
            with_locality.group_by(
                "name", "category", "locality", maintain_order=True
            )
            .agg(
                pl.col("id").first(),
                pl.col("lat").mean(),
                pl.col("lng").mean(),
            )
            .select(OUTPUT_COLUMNS)
        )
    if len(without_locality) > 0:
        # Stops with no locality can't be deduped by locality, so merge genuine
        # co-located duplicates (same canonical name + category within the same
        # small area) via the station-area logic, while keeping distinct
        # far-apart stops.
        frames.append(_deduplicate_station_areas(without_locality))
    if not frames:
        return _empty_output_frame()
    return pl.concat(frames).select(OUTPUT_COLUMNS)
def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
    """Deduplicate NaPTAN stops, merging station/terminal nodes by area.

    Rows whose category is in STATION_MERGE_CATEGORIES (tram/metro, Tube,
    rail, ferry, bus station) are merged to one record per station by
    normalized name + area; the primary station/terminal node (e.g. RLY,
    FER, MET, BST) supplies the id and name in preference to an entrance
    node (RSE, FTD, TMU, BCE). Merged tram/metro stations with a London
    Underground ATCO code anywhere in the group become "Tube station".
    All other stops are deduplicated by exact name+category+locality.

    Returns:
        Non-station stops followed by merged stations, in OUTPUT_COLUMNS
        order.
    """
    is_station = pl.col("category").is_in(list(STATION_MERGE_CATEGORIES))
    station_rows = df.filter(is_station)
    other_rows = df.filter(~is_station)
    deduplicated = [
        _deduplicate_local_stops(other_rows),
        _deduplicate_station_areas(station_rows),
    ]
    return pl.concat(deduplicated).select(OUTPUT_COLUMNS)
def filter_active_stops(df: pl.DataFrame) -> pl.DataFrame:
    """Drop NaPTAN stops whose Status is not active.

    The export's Status column marks stops as active/inactive/pending;
    without this filter closed stations ("(closed)", "not in use") would
    ship as live POIs. A row is kept when its Status is null (benefit of
    the doubt) or, after trimming and lower-casing, equals "active" or
    "act". When the column is missing altogether a warning is printed and
    the frame is returned unchanged, so older extracts still load.
    """
    if "Status" not in df.columns:
        print("WARNING: NaPTAN data has no Status column; keeping all stops")
        return df

    status = pl.col("Status")
    normalized_status = status.str.strip_chars().str.to_lowercase()
    kept = df.filter(status.is_null() | normalized_status.is_in(["active", "act"]))
    removed = len(df) - len(kept)
    if removed:
        print(f"Dropped {removed:,} non-active stops (Status != active)")
    return kept
def download_naptan(output: Path, *, timeout: float = 600.0) -> None:
    """Download the NaPTAN CSV, extract POIs and write them as parquet.

    Steps: fetch NAPTAN_CSV_URL; read every column as text; keep rows with
    parseable coordinates and a StopType listed in STOP_TYPES; drop
    non-active stops; map to the id/name/category/lat/lng schema (plus the
    helper columns locality, entrance and is_lu); deduplicate; write the
    result and print a per-category count.

    Args:
        output: Destination parquet path; parent directories are created.
        timeout: Socket timeout in seconds for the download. It bounds each
            blocking network operation so a stalled connection fails
            instead of hanging the job indefinitely.

    Raises:
        Errors raised by ``urllib.request.urlopen`` (including timeouts) and
        by reading/writing the data propagate to the caller.
    """
    output.parent.mkdir(parents=True, exist_ok=True)
    print(f"Downloading NaPTAN data from {NAPTAN_CSV_URL}")
    with urllib.request.urlopen(NAPTAN_CSV_URL, timeout=timeout) as resp:
        raw = resp.read()
    print(f"Downloaded {len(raw) / (1024 * 1024):.1f} MB")
    df = (
        # infer_schema_length=0 reads every column as a string; coordinates
        # are cast explicitly and unparseable values become null.
        pl.read_csv(io.BytesIO(raw), infer_schema_length=0)
        .with_columns(
            pl.col("Latitude").cast(pl.Float64, strict=False),
            pl.col("Longitude").cast(pl.Float64, strict=False),
        )
        .drop_nulls(subset=["Latitude", "Longitude"])
        .filter(pl.col("StopType").is_in(list(STOP_TYPES.keys())))
    )
    df = filter_active_stops(df).select(
        pl.col("ATCOCode").alias("id"),
        pl.col("CommonName").alias("name"),
        pl.col("StopType").replace(STOP_TYPES).alias("category"),
        pl.col("Latitude").alias("lat"),
        pl.col("Longitude").alias("lng"),
        pl.col("NptgLocalityCode").alias("locality"),
        pl.col("StopType").is_in(list(ENTRANCE_STOP_TYPES)).alias("entrance"),
        # A null ATCO code yields a null match; treat that as "not LU".
        pl.col("ATCOCode")
        .str.contains(LONDON_UNDERGROUND_ATCO_PATTERN)
        .fill_null(False)
        .alias("is_lu"),
    )
    before = len(df)
    df = deduplicate_naptan(df)
    print(
        f"Deduplicated {before:,} → {len(df):,} stops "
        "(by name+category+locality; tube stations by normalized name+area)"
    )
    df.write_parquet(output)
    size_mb = output.stat().st_size / (1024 * 1024)
    print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} stations)")
    counts = df.group_by("category").len().sort("len", descending=True)
    for row in counts.iter_rows(named=True):
        print(f"  {row['category']}: {row['len']:,}")
def main() -> None:
    """Command-line entry point: parse ``--output`` and run the download."""
    cli = argparse.ArgumentParser(description="Download NaPTAN station data")
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    options = cli.parse_args()
    download_naptan(options.output)


if __name__ == "__main__":
    main()