perfect-postcode/pipeline/download/naptan.py
Andras Schmelczer f59d01227b
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 15s
CI / Check (push) Failing after 1m58s
SPlit up
2026-06-12 21:51:37 +01:00

522 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Download NaPTAN data and extract public-transport POIs (airports, ferry terminals, rail and bus stations, bus stops, taxi ranks, tram/metro and Tube stations)."""
import argparse
import io
import math
import re
import urllib.request
from dataclasses import dataclass
from pathlib import Path
import polars as pl
# NaPTAN access-node export; the query string asks the API for CSV output.
NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"

# Output category labels. Every TMU/MET node starts out as TRAM_METRO_CATEGORY;
# StationAccumulator.output_category promotes a merged group to
# TUBE_STATION_CATEGORY when any of its nodes is flagged as London Underground.
TUBE_STATION_CATEGORY = "Tube station"
TRAM_METRO_CATEGORY = "Tram & Metro stop"

# Merge radius, in degrees, used by StationAccumulator.same_area (longitude
# differences are scaled by cos(latitude) before comparing). Despite the
# "TUBE" in the name it applies to every row that goes through
# _deduplicate_station_areas, not only Tube stations.
TUBE_STATION_MERGE_RADIUS_DEGREES = 0.01

# London Underground ATCO codes are "<area><kind>ZZLU<station>": a 3-digit
# AdministrativeAreaCode (940 national, 490 London, plus 150/210/040/... for
# LU stations outside Greater London such as Epping or Amersham), then "0"
# (platform/entrance node) or "G" (station group node), then the system code.
# "ZZLU" is unique to London Underground, which cleanly separates genuine Tube
# stations from every other TMU/MET network (Metrolink, Supertram, T&W Metro,
# WM Metro, Blackpool Tramway, heritage railways, ...).
# The pattern is case-insensitive and anchored at the start of the code only.
LONDON_UNDERGROUND_ATCO_PATTERN = r"(?i)^\d{3}[0G]ZZLU"

# NaPTAN StopType code -> output category label. Only stop types listed here
# survive the StopType filter in download_naptan.
STOP_TYPES = {
    "AIR": "Airport",
    # Ferry: FER/FBT are the terminal/berth nodes; FTD is a docking entrance.
    "FER": "Ferry",
    "FBT": "Ferry",
    "FTD": "Ferry",
    # Rail: RLY is the station node; RSE is a station entrance.
    "RLY": "Rail station",
    "RSE": "Rail station",
    "BCT": "Bus stop",
    # Bus/coach stations: BST is the station access-area node, BCS/BCQ are
    # bays/stands within the station and BCE is a station entrance. NaPTAN maps
    # very few BCE nodes (~80 GB-wide), so without BST/BCS/BCQ the category was
    # so sparse that 20% of England showed the nearest bus station >100km away.
    # Bays and entrances collapse to one POI per station via
    # STATION_MERGE_CATEGORIES below.
    "BST": "Bus station",
    "BCS": "Bus station",
    "BCQ": "Bus station",
    "BCE": "Bus station",
    "TXR": "Taxi rank",
    # Tram/Metro/Underground: TMU is an entrance node, MET the station access
    # area. Both start as "Tram & Metro stop"; merged stations whose ATCO codes
    # mark them as London Underground (ZZLU) are reclassified to "Tube station"
    # after dedup (see _deduplicate_station_areas). Heritage railways (RHDR,
    # Severn Valley, ...) are TMU/MET in NaPTAN with no machine-readable
    # "heritage" flag, so they remain in "Tram & Metro stop".
    "TMU": TRAM_METRO_CATEGORY,
    "MET": TRAM_METRO_CATEGORY,
}

# Stop types that are access/entrance nodes rather than the primary station or
# terminal node. During dedup the primary node (e.g. RLY/FER/MET) wins so a
# station with both a station node and entrances yields one POI at the station
# node. download_naptan turns membership of this set into the boolean
# "entrance" column consumed by station_name_score.
ENTRANCE_STOP_TYPES = {"RSE", "FTD", "TMU", "BCE"}

# Categories whose entrances/variants are merged into a single station-level POI
# by normalized name + area (like Tube stations), so an RLY node and its RSE
# entrances collapse to one POI at the station node. Every other category is
# deduplicated by exact name+category+locality instead (see deduplicate_naptan).
STATION_MERGE_CATEGORIES = {
    TRAM_METRO_CATEGORY,
    TUBE_STATION_CATEGORY,
    "Rail station",
    "Ferry",
    "Bus station",
}

# Columns, in order, of every frame the dedup helpers return.
OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"]
# Trailing entrance designators ("North East Ent", "Main Entrance No 2",
# "West Station Entrance", ...) are stripped from canonical names so a
# station's individually-named entrance nodes collapse into the station.
# A trailing run of filler words is only stripped when it contains at least
# one entrance word, so "Maze Hill North" or "Platform 1" are untouched.
#
# The word sets below are the single source of truth: the tokenized (Python)
# path uses them directly and the regex fragments further down are derived
# from them, so the two implementations cannot drift apart when a word is
# added or removed.
_ENTRANCE_NAME_WORDS = {"ent", "entrance", "entrances", "access"}
_ENTRANCE_FILLER_WORDS = {
    "north",
    "south",
    "east",
    "west",
    "ne",
    "nw",
    "se",
    "sw",
    "n",
    "s",
    "e",
    "w",
    "wt",
    "main",
    "side",
    "no",
    "station",
    "stop",
    "platform",
}


def _word_alternation(words: set[str], *extra_patterns: str) -> str:
    """Return a non-capturing regex group matching any of *words*.

    Words are regex-escaped and ordered longest-first (ties alphabetical) so
    the generated pattern is deterministic across runs despite set ordering,
    and a short word can never shadow a longer one that it prefixes.
    *extra_patterns* are raw regex fragments appended verbatim as further
    alternatives.
    """
    ordered = sorted(words, key=lambda word: (-len(word), word))
    alternatives = [re.escape(word) for word in ordered]
    alternatives.extend(extra_patterns)
    return "(?:" + "|".join(alternatives) + ")"


_ENTRANCE_WORDS_RE = _word_alternation(_ENTRANCE_NAME_WORDS)
# Purely numeric tokens ("No 2", "Entrance 3") also count as filler.
_ENTRANCE_FILLER_RE = _word_alternation(_ENTRANCE_FILLER_WORDS, r"\d+")
# <filler/entrance words>* <entrance word> <filler/entrance words>* at the end
# of an already-normalized (lowercase, single-spaced) name. Every stripped
# token must be preceded by whitespace, so the first word is never removed.
_ENTRANCE_SUFFIX_RE = (
    rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*"
    rf"\s+{_ENTRANCE_WORDS_RE}"
    rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*$"
)

# Bus-station bay/stand designators ("Stand A3", "Bay 2", "Stance 5") are
# stripped so every bay of one station shares a canonical name. The designator
# word must be followed by a short alphanumeric token, so place names ending in
# a bare "Bay" (Colwyn Bay, Herne Bay) are untouched.
_BAY_WORDS = {"stand", "stance", "bay", "gate"}
_BAY_SUFFIX_RE = rf"\s+{_word_alternation(_BAY_WORDS)}\s+[a-z0-9]{{1,3}}$"
def _strip_entrance_suffix(words: list[str]) -> list[str]:
    """Remove a trailing entrance designator from a tokenized stop name.

    Walks the tokens from the end while they are entrance words, filler words
    or pure digits. The walked-over tail is dropped only if it contained at
    least one entrance word; otherwise the input list is returned unchanged
    (the same object, not a copy).
    """
    keep = len(words)
    found_entrance_word = False
    for token in reversed(words):
        if token in _ENTRANCE_NAME_WORDS:
            found_entrance_word = True
        elif not (token.isdigit() or token in _ENTRANCE_FILLER_WORDS):
            # First token that is neither entrance word nor filler ends the run.
            break
        keep -= 1
    if not found_entrance_word:
        return words
    return words[:keep]
def canonical_station_name(name: str | None) -> str:
"""Normalize station names so entrances/transport-mode variants collapse."""
if not name:
return ""
normalized = name.lower()
normalized = re.sub(r"\([^)]*\)", " ", normalized)
normalized = re.sub(r"['`]", "", normalized)
normalized = normalized.replace("&", " and ")
normalized = re.sub(r"[^a-z0-9]+", " ", normalized)
words = _strip_entrance_suffix(normalized.split())
if len(words) >= 3 and words[-2] in _BAY_WORDS and len(words[-1]) <= 3:
del words[-2:]
suffixes = (
("underground", "station"),
("tube", "station"),
("dlr", "station"),
("metro", "station"),
("metrolink", "station"),
("metrolink", "stop"),
("tram", "stop"),
("rail", "station"),
("railway", "station"),
("station",),
("stop",),
("metrolink",),
)
while True:
suffix = next(
(suffix for suffix in suffixes if words[-len(suffix) :] == list(suffix)),
None,
)
if suffix is None:
break
del words[-len(suffix) :]
return " ".join(words)
_QUALIFIER_RE = re.compile(r"\(([^)]*)\)")
def station_name_qualifier(name: str | None) -> str:
"""The canonicalized parenthetical of a station name, e.g. "Edgware Road
(Bakerloo)" -> "bakerloo".
Genuinely distinct same-named stations (the two Edgware Roads ~150m apart,
Hammersmith's two stations) differ ONLY by this parenthetical, which
`canonical_station_name` strips; it must block their merge while still
letting unqualified entrance/variant rows collapse into either.
"""
if not name:
return ""
parts = _QUALIFIER_RE.findall(name)
if not parts:
return ""
text = " ".join(parts).lower().replace("&", " and ")
return re.sub(r"[^a-z0-9]+", " ", text).strip()
def canonical_station_name_expr(name_col: str = "name") -> pl.Expr:
    """Build a polars expression that canonicalizes the names in *name_col*.

    The expression lowercases the column, applies the character clean-up
    substitutions, then removes the entrance, bay and transport-mode suffixes.
    Each suffix pattern is applied once, in the fixed order listed below.
    """
    cleanup_steps = (
        (r"\([^)]*\)", " "),
        (r"['`]", ""),
        (r"&", " and "),
        (r"[^a-z0-9]+", " "),
        (r"\s+", " "),
    )
    suffix_patterns = (
        _ENTRANCE_SUFFIX_RE,
        _BAY_SUFFIX_RE,
        r"\s+(underground|tube|dlr|metro|metrolink|rail|railway)\s+station$",
        r"\s+(metrolink|tram)\s+stop$",
        r"\s+(station|stop)$",
        r"\s+metrolink$",
    )

    result = pl.col(name_col).str.to_lowercase()
    for pattern, replacement in cleanup_steps:
        result = result.str.replace_all(pattern, replacement)
    # Trim before suffix matching so the "$"-anchored patterns see the last word.
    result = result.str.strip_chars()
    for pattern in suffix_patterns:
        result = result.str.replace_all(pattern, "")
    return result.str.strip_chars()
def _has_locality() -> pl.Expr:
    """Expression that is true for rows whose "locality" is neither null nor ""."""
    locality = pl.col("locality")
    return locality.is_not_null() & (locality != "")
def _empty_output_frame() -> pl.DataFrame:
    """Return a zero-row frame with the typed id/name/category/lat/lng columns."""
    column_dtypes = {
        "id": pl.String,
        "name": pl.String,
        "category": pl.String,
        "lat": pl.Float64,
        "lng": pl.Float64,
    }
    return pl.DataFrame(
        {
            column: pl.Series([], dtype=dtype)
            for column, dtype in column_dtypes.items()
        }
    )
def station_name_score(name: str, entrance: bool = False) -> tuple[int, int, int]:
    """Return a sort key for choosing a merged station's display name.

    Lower tuples win. The key ranks, in order: primary station/terminal nodes
    ahead of entrance nodes, names without a transport-mode suffix ahead of
    names with one, and shorter names ahead of longer ones.
    """
    mode_suffixes = (
        " underground station",
        " tube station",
        " dlr station",
        " metro station",
        " tram stop",
        " station",
        " stop",
    )
    # Each suffix starts with a space, so a bare "Station" is not penalized.
    has_mode_suffix = name.lower().endswith(mode_suffixes)
    return (int(entrance), int(has_mode_suffix), len(name))
@dataclass
class StationAccumulator:
    """Running aggregate of the NaPTAN rows merged into one station POI.

    Coordinates are kept as sums plus a count so the centroid can be updated
    incrementally; `id`/`name`/`entrance` always describe the row currently
    winning the display-name contest.
    """

    id: str
    name: str
    category: str
    lat_sum: float
    lng_sum: float
    # True when the row that supplied id/name is an entrance node.
    entrance: bool = False
    # True once any merged row carried a London Underground ATCO code.
    is_lu: bool = False
    # Number of rows folded into lat_sum / lng_sum.
    count: int = 1
    # Normalized parenthetical that identifies this group ("" when none yet).
    qualifier: str = ""

    @property
    def lat(self) -> float:
        """Mean latitude of the merged rows."""
        return self.lat_sum / self.count

    @property
    def lng(self) -> float:
        """Mean longitude of the merged rows."""
        return self.lng_sum / self.count

    def same_area(self, lat: float, lng: float) -> bool:
        """Whether (lat, lng) lies within the merge radius of the centroid."""
        lat_delta = self.lat - lat
        # Scale longitude by cos(latitude) so both deltas are comparable.
        lng_delta = (self.lng - lng) * math.cos(math.radians(self.lat))
        squared_distance = lat_delta * lat_delta + lng_delta * lng_delta
        return squared_distance <= TUBE_STATION_MERGE_RADIUS_DEGREES**2

    def qualifier_compatible(self, qualifier: str) -> bool:
        """Whether a row with this qualifier may join the group.

        Conflicting parentheticals mark distinct same-named stations; an
        unqualified row can join either group.
        """
        if not qualifier or not self.qualifier:
            return True
        return qualifier == self.qualifier

    def merge(self, row: dict[str, object]) -> None:
        """Fold *row* into the group and re-elect the display name if it wins.

        Reads the row's "lat", "lng", "name" and "id" keys, plus the optional
        "is_lu" and "entrance" flags.
        """
        self.lat_sum += float(row["lat"])
        self.lng_sum += float(row["lng"])
        self.count += 1
        if not self.is_lu:
            self.is_lu = bool(row.get("is_lu"))

        row_name = str(row["name"] or "")
        row_qualifier = station_name_qualifier(row_name)
        # The first qualified row fixes the group's qualifier.
        if not self.qualifier:
            self.qualifier = row_qualifier
        row_is_entrance = bool(row.get("entrance"))

        # Prefer a display name carrying the group's disambiguating
        # parenthetical: without it the two Edgware Roads would both render as
        # the bare "Edgware Road Underground Station".
        challenger = (
            self._qualifier_penalty(row_qualifier),
            *station_name_score(row_name, row_is_entrance),
        )
        incumbent = (
            self._qualifier_penalty(station_name_qualifier(self.name)),
            *station_name_score(self.name, self.entrance),
        )
        if challenger < incumbent:
            self.id = str(row["id"] or "")
            self.name = row_name
            self.entrance = row_is_entrance

    def _qualifier_penalty(self, name_qualifier: str) -> int:
        """1 when the group has a qualifier that this name lacks, else 0."""
        if not self.qualifier:
            return 0
        return int(name_qualifier != self.qualifier)

    @property
    def output_category(self) -> str:
        """Category to emit for the merged station."""
        # A merged tram/metro station is a genuine Tube station when ANY of its
        # constituent nodes carries a London Underground ATCO code. Checking
        # the whole group (not just the winning node) matters because LU
        # entrance nodes often carry non-ZZLU codes (e.g. 4900VICT...).
        if self.is_lu and self.category == TRAM_METRO_CATEGORY:
            return TUBE_STATION_CATEGORY
        return self.category
def _station_from_row(row: dict[str, object]) -> StationAccumulator:
    """Start a new station group from a single row.

    Null id/name/category values become "", and the optional "entrance" and
    "is_lu" flags default to False when absent.
    """
    station_id = str(row["id"] or "")
    display_name = str(row["name"] or "")
    category = str(row["category"] or "")
    latitude = float(row["lat"])
    longitude = float(row["lng"])
    return StationAccumulator(
        id=station_id,
        name=display_name,
        category=category,
        lat_sum=latitude,
        lng_sum=longitude,
        entrance=bool(row.get("entrance")),
        is_lu=bool(row.get("is_lu")),
        qualifier=station_name_qualifier(display_name),
    )
def _deduplicate_station_areas(df: pl.DataFrame) -> pl.DataFrame:
    """Merge rows that share category, canonical name and area into one POI.

    Rows are processed in frame order. A row joins the first existing group
    with the same (category, canonical name) key whose centroid is within the
    merge radius and whose qualifier is compatible; otherwise it starts a new
    group. Rows whose canonical name is empty are never merged.

    Returns a frame with OUTPUT_COLUMNS, one row per group.
    """
    if df.is_empty():
        return _empty_output_frame()

    stations: list[StationAccumulator] = []
    # (category, canonical name) -> positions in `stations` sharing that key.
    positions_by_key: dict[tuple[str, str], list[int]] = {}

    for row in df.iter_rows(named=True):
        # Key by category so different modes sharing a name/area (e.g. a rail
        # station and a ferry terminal) are not merged into one POI.
        category = str(row["category"] or "")
        raw_name = str(row["name"] or "")
        canonical = canonical_station_name(raw_name)
        if not canonical:
            stations.append(_station_from_row(row))
            continue

        key = (category, canonical)
        qualifier = station_name_qualifier(raw_name)
        target = None
        for position in positions_by_key.get(key, []):
            candidate = stations[position]
            if candidate.same_area(
                float(row["lat"]), float(row["lng"])
            ) and candidate.qualifier_compatible(qualifier):
                target = candidate
                break

        if target is not None:
            target.merge(row)
            continue

        new_position = len(stations)
        stations.append(_station_from_row(row))
        positions_by_key.setdefault(key, []).append(new_position)

    columns = {
        "id": [entry.id for entry in stations],
        "name": [entry.name for entry in stations],
        "category": [entry.output_category for entry in stations],
        "lat": [entry.lat for entry in stations],
        "lng": [entry.lng for entry in stations],
    }
    return pl.DataFrame(columns).select(OUTPUT_COLUMNS)
def _deduplicate_local_stops(df: pl.DataFrame) -> pl.DataFrame:
    """Deduplicate non-station stops, by locality where one is available.

    Rows with a locality collapse to one record per exact
    name/category/locality, keeping the first id and the mean coordinates.
    Rows without a locality go through the station-area merge instead.
    NOTE(review): the group_by does not request a maintained order, so the
    row order of the locality part is presumably not guaranteed.
    """
    if df.is_empty():
        return _empty_output_frame()

    with_locality = df.filter(_has_locality())
    without_locality = df.filter(~_has_locality())

    parts = []
    if not with_locality.is_empty():
        # First pass: one record per exact stop name/category/locality.
        grouped = with_locality.group_by("name", "category", "locality").agg(
            pl.col("id").first(),
            pl.col("lat").mean(),
            pl.col("lng").mean(),
        )
        parts.append(grouped.select(OUTPUT_COLUMNS))
    if not without_locality.is_empty():
        # Stops with no locality can't be deduped by locality, so merge genuine
        # co-located duplicates (same name+category within the same small area)
        # via the station-area logic, while keeping distinct far-apart stops.
        parts.append(_deduplicate_station_areas(without_locality))

    if parts:
        return pl.concat(parts).select(OUTPUT_COLUMNS)
    return _empty_output_frame()
def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
    """Deduplicate NaPTAN stops, merging station/terminal entrances by area.

    Rows whose category is in STATION_MERGE_CATEGORIES (tram/metro, rail,
    ferry, bus station) are merged to one record per station by normalized
    name + area, with the primary station/terminal node (e.g. RLY, FER, MET,
    BST) winning over an entrance node (RSE, FTD, TMU, BCE). Merged tram/metro
    stations with a London Underground ATCO code in the group become
    "Tube station". All other stops are deduplicated by exact
    name+category+locality.

    The result lists the non-station stops first, then the merged stations.
    """
    is_station = pl.col("category").is_in(list(STATION_MERGE_CATEGORIES))
    local_part = _deduplicate_local_stops(df.filter(~is_station))
    station_part = _deduplicate_station_areas(df.filter(is_station))
    return pl.concat([local_part, station_part]).select(OUTPUT_COLUMNS)
def filter_active_stops(df: pl.DataFrame) -> pl.DataFrame:
    """Drop stops whose Status is anything other than active.

    The NaPTAN export's Status column marks stops as active/inactive/pending;
    without this filter closed stations ("(closed)", "not in use") ship as
    live POIs. A status counts as active when, trimmed and lowercased, it is
    "active" or "act". Rows with a null Status are kept (benefit of the
    doubt). When the column is missing the frame is returned untouched with a
    warning, so older extracts still load.
    """
    if "Status" not in df.columns:
        print("WARNING: NaPTAN data has no Status column; keeping all stops")
        return df

    status = pl.col("Status")
    normalized = status.str.strip_chars().str.to_lowercase()
    kept = df.filter(status.is_null() | normalized.is_in(["active", "act"]))

    dropped = len(df) - len(kept)
    if dropped:
        print(f"Dropped {dropped:,} non-active stops (Status != active)")
    return kept
def download_naptan(output: Path) -> None:
    """Download the NaPTAN CSV, reduce it to deduplicated POIs, write parquet.

    Args:
        output: Destination parquet path; missing parent directories are
            created.

    The whole CSV is read into memory, rows without a parseable
    latitude/longitude or with a stop type outside STOP_TYPES are dropped,
    non-active stops are removed, and the remainder is deduplicated before
    being written. Progress and per-category counts are printed to stdout.
    Network and I/O errors propagate to the caller.
    """
    output.parent.mkdir(parents=True, exist_ok=True)
    print(f"Downloading NaPTAN data from {NAPTAN_CSV_URL}")
    with urllib.request.urlopen(NAPTAN_CSV_URL) as resp:
        raw = resp.read()
    print(f"Downloaded {len(raw) / (1024 * 1024):.1f} MB")

    df = (
        # infer_schema_length=0 reads every column as a string; the
        # coordinates are then cast leniently so unparseable values become
        # null and are dropped instead of aborting the load.
        pl.read_csv(io.BytesIO(raw), infer_schema_length=0)
        .with_columns(
            pl.col("Latitude").cast(pl.Float64, strict=False),
            pl.col("Longitude").cast(pl.Float64, strict=False),
        )
        .drop_nulls(subset=["Latitude", "Longitude"])
        .filter(pl.col("StopType").is_in(list(STOP_TYPES.keys())))
    )
    df = filter_active_stops(df).select(
        pl.col("ATCOCode").alias("id"),
        pl.col("CommonName").alias("name"),
        pl.col("StopType").replace(STOP_TYPES).alias("category"),
        pl.col("Latitude").alias("lat"),
        pl.col("Longitude").alias("lng"),
        pl.col("NptgLocalityCode").alias("locality"),
        # Helper columns consumed by the station merge, not written out.
        pl.col("StopType").is_in(list(ENTRANCE_STOP_TYPES)).alias("entrance"),
        pl.col("ATCOCode")
        .str.contains(LONDON_UNDERGROUND_ATCO_PATTERN)
        .fill_null(False)
        .alias("is_lu"),
    )

    before = len(df)
    df = deduplicate_naptan(df)
    # The two counts need a separator, otherwise they print as one fused
    # number. Every station category (not only Tube) is merged by name+area.
    print(
        f"Deduplicated {before:,} -> {len(df):,} stops "
        "(by name+category+locality; stations by normalized name+area)"
    )

    df.write_parquet(output)
    size_mb = output.stat().st_size / (1024 * 1024)
    print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} stations)")

    counts = df.group_by("category").len().sort("len", descending=True)
    for row in counts.iter_rows(named=True):
        print(f" {row['category']}: {row['len']:,}")
def main() -> None:
    """Command-line entry point: parse --output and run the download."""
    cli = argparse.ArgumentParser(description="Download NaPTAN station data")
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    options = cli.parse_args()
    download_naptan(options.output)


if __name__ == "__main__":
    main()