522 lines
18 KiB
Python
522 lines
18 KiB
Python
"""Download NaPTAN data and extract railway/metro station POIs."""
|
||
|
||
import argparse
|
||
import io
|
||
import math
|
||
import re
|
||
import urllib.request
|
||
from dataclasses import dataclass
|
||
from pathlib import Path
|
||
|
||
import polars as pl
|
||
|
||
# NaPTAN access-nodes endpoint; the dataFormat=csv query asks for a CSV export.
NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
# Category labels written to the output "category" column. "Tube station" is
# never assigned directly from a stop type; it is produced after merging (see
# StationAccumulator.output_category).
TUBE_STATION_CATEGORY = "Tube station"
TRAM_METRO_CATEGORY = "Tram & Metro stop"
# Merge radius in degrees, compared in StationAccumulator.same_area against a
# cos(latitude)-corrected distance from a group's running centroid. Despite the
# name it is applied to every row that goes through the station-area dedup,
# not only Tube stations.
TUBE_STATION_MERGE_RADIUS_DEGREES = 0.01

# London Underground ATCO codes are "<area><kind>ZZLU<station>": a 3-digit
# AdministrativeAreaCode (940 national, 490 London, plus 150/210/040/... for
# LU stations outside Greater London such as Epping or Amersham), then "0"
# (platform/entrance node) or "G" (station group node), then the system code.
# "ZZLU" is unique to London Underground, which cleanly separates genuine Tube
# stations from every other TMU/MET network (Metrolink, Supertram, T&W Metro,
# WM Metro, Blackpool Tramway, heritage railways, ...).
# The leading (?i) makes the match case-insensitive; ^ anchors it to the start
# of the code.
LONDON_UNDERGROUND_ATCO_PATTERN = r"(?i)^\d{3}[0G]ZZLU"
|
||
|
||
|
||
# Maps NaPTAN StopType codes to the POI category written to the output.
# download_naptan drops every row whose StopType is not a key of this mapping.
STOP_TYPES = {
    "AIR": "Airport",
    # Ferry: FER/FBT are the terminal/berth nodes; FTD is a docking entrance.
    "FER": "Ferry",
    "FBT": "Ferry",
    "FTD": "Ferry",
    # Rail: RLY is the station node; RSE is a station entrance.
    "RLY": "Rail station",
    "RSE": "Rail station",
    "BCT": "Bus stop",
    # Bus/coach stations: BST is the station access-area node, BCS/BCQ are
    # bays/stands within the station and BCE is a station entrance. NaPTAN maps
    # very few BCE nodes (~80 GB-wide), so without BST/BCS/BCQ the category was
    # so sparse that 20% of England showed the nearest bus station >100km away.
    # Bays and entrances collapse to one POI per station via
    # STATION_MERGE_CATEGORIES below.
    "BST": "Bus station",
    "BCS": "Bus station",
    "BCQ": "Bus station",
    "BCE": "Bus station",
    "TXR": "Taxi rank",
    # Tram/Metro/Underground: TMU is an entrance node, MET the station access
    # area. Both start as "Tram & Metro stop"; merged stations whose ATCO codes
    # mark them as London Underground (ZZLU) are reclassified to "Tube station"
    # after dedup (see _deduplicate_station_areas). Heritage railways (RHDR,
    # Severn Valley, ...) are TMU/MET in NaPTAN with no machine-readable
    # "heritage" flag, so they remain in "Tram & Metro stop".
    "TMU": TRAM_METRO_CATEGORY,
    "MET": TRAM_METRO_CATEGORY,
}
|
||
|
||
# Stop types that are access/entrance nodes rather than the primary station or
# terminal node. During dedup the primary node (e.g. RLY/FER/MET) wins so a
# station with both a station node and entrances yields one POI at the station
# node. download_naptan turns membership of this set into the boolean
# "entrance" column.
ENTRANCE_STOP_TYPES = {"RSE", "FTD", "TMU", "BCE"}

# Categories whose entrances/variants are merged into a single station-level POI
# by normalized name + area (like Tube stations), so an RLY node and its RSE
# entrances collapse to one POI at the station node.
# No STOP_TYPES value maps to TUBE_STATION_CATEGORY (it is only assigned after
# merging), so its presence here only matters for frames that already carry
# that label.
STATION_MERGE_CATEGORIES = {
    TRAM_METRO_CATEGORY,
    TUBE_STATION_CATEGORY,
    "Rail station",
    "Ferry",
    "Bus station",
}


# Columns, in order, of every frame returned by the dedup helpers.
OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"]
|
||
|
||
# Trailing entrance designators ("North East Ent", "Main Entrance No 2",
|
||
# "West Station Entrance", ...) are stripped from canonical names so a
|
||
# station's individually-named entrance nodes collapse into the station.
|
||
# A trailing run of filler words is only stripped when it contains at least
|
||
# one entrance word, so "Maze Hill North" or "Platform 1" are untouched.
|
||
_ENTRANCE_NAME_WORDS = {"ent", "entrance", "entrances", "access"}
|
||
_ENTRANCE_FILLER_WORDS = {
|
||
"north",
|
||
"south",
|
||
"east",
|
||
"west",
|
||
"ne",
|
||
"nw",
|
||
"se",
|
||
"sw",
|
||
"n",
|
||
"s",
|
||
"e",
|
||
"w",
|
||
"wt",
|
||
"main",
|
||
"side",
|
||
"no",
|
||
"station",
|
||
"stop",
|
||
"platform",
|
||
}
|
||
|
||
_ENTRANCE_WORDS_RE = "(?:ent|entrance|entrances|access)"
|
||
_ENTRANCE_FILLER_RE = (
|
||
r"(?:north|south|east|west|ne|nw|se|sw|n|s|e|w|wt|main|side|no|station|stop"
|
||
r"|platform|\d+)"
|
||
)
|
||
_ENTRANCE_SUFFIX_RE = (
|
||
rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*"
|
||
rf"\s+{_ENTRANCE_WORDS_RE}"
|
||
rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*$"
|
||
)
|
||
|
||
# Bus-station bay/stand designators ("Stand A3", "Bay 2", "Stance 5") are
|
||
# stripped so every bay of one station shares a canonical name. The designator
|
||
# word must be followed by a short alphanumeric token, so place names ending in
|
||
# a bare "Bay" (Colwyn Bay, Herne Bay) are untouched.
|
||
_BAY_WORDS = {"stand", "stance", "bay", "gate"}
|
||
_BAY_SUFFIX_RE = r"\s+(?:stand|stance|bay|gate)\s+[a-z0-9]{1,3}$"
|
||
|
||
|
||
def _strip_entrance_suffix(words: list[str]) -> list[str]:
|
||
"""Drop a trailing entrance designator (direction/number filler around an
|
||
entrance word) from a tokenized stop name; no-op when no entrance word."""
|
||
idx = len(words)
|
||
saw_entrance = False
|
||
while idx > 0:
|
||
word = words[idx - 1]
|
||
if word in _ENTRANCE_NAME_WORDS:
|
||
saw_entrance = True
|
||
elif word.isdigit() or word in _ENTRANCE_FILLER_WORDS:
|
||
pass
|
||
else:
|
||
break
|
||
idx -= 1
|
||
return words[:idx] if saw_entrance else words
|
||
|
||
|
||
def canonical_station_name(name: str | None) -> str:
|
||
"""Normalize station names so entrances/transport-mode variants collapse."""
|
||
if not name:
|
||
return ""
|
||
|
||
normalized = name.lower()
|
||
normalized = re.sub(r"\([^)]*\)", " ", normalized)
|
||
normalized = re.sub(r"['’`]", "", normalized)
|
||
normalized = normalized.replace("&", " and ")
|
||
normalized = re.sub(r"[^a-z0-9]+", " ", normalized)
|
||
words = _strip_entrance_suffix(normalized.split())
|
||
|
||
if len(words) >= 3 and words[-2] in _BAY_WORDS and len(words[-1]) <= 3:
|
||
del words[-2:]
|
||
|
||
suffixes = (
|
||
("underground", "station"),
|
||
("tube", "station"),
|
||
("dlr", "station"),
|
||
("metro", "station"),
|
||
("metrolink", "station"),
|
||
("metrolink", "stop"),
|
||
("tram", "stop"),
|
||
("rail", "station"),
|
||
("railway", "station"),
|
||
("station",),
|
||
("stop",),
|
||
("metrolink",),
|
||
)
|
||
while True:
|
||
suffix = next(
|
||
(suffix for suffix in suffixes if words[-len(suffix) :] == list(suffix)),
|
||
None,
|
||
)
|
||
if suffix is None:
|
||
break
|
||
del words[-len(suffix) :]
|
||
|
||
return " ".join(words)
|
||
|
||
|
||
_QUALIFIER_RE = re.compile(r"\(([^)]*)\)")
|
||
|
||
|
||
def station_name_qualifier(name: str | None) -> str:
|
||
"""The canonicalized parenthetical of a station name, e.g. "Edgware Road
|
||
(Bakerloo)" -> "bakerloo".
|
||
|
||
Genuinely distinct same-named stations (the two Edgware Roads ~150m apart,
|
||
Hammersmith's two stations) differ ONLY by this parenthetical, which
|
||
`canonical_station_name` strips; it must block their merge while still
|
||
letting unqualified entrance/variant rows collapse into either.
|
||
"""
|
||
if not name:
|
||
return ""
|
||
parts = _QUALIFIER_RE.findall(name)
|
||
if not parts:
|
||
return ""
|
||
text = " ".join(parts).lower().replace("&", " and ")
|
||
return re.sub(r"[^a-z0-9]+", " ", text).strip()
|
||
|
||
|
||
def canonical_station_name_expr(name_col: str = "name") -> pl.Expr:
    """Build a polars expression that normalizes the station names in *name_col*.

    Expression counterpart of `canonical_station_name`: lower-case, drop
    parentheticals and apostrophes, spell "&" as "and", collapse everything
    else non-alphanumeric to single spaces, then strip a trailing entrance
    designator, a bay designator and transport-mode suffixes.

    NOTE(review): each suffix pattern is applied once, in the order listed, and
    every pattern requires leading whitespace, so at least one word always
    survives. `canonical_station_name` instead loops until no suffix is left
    and can return "". The two may disagree on unusual names; confirm whether
    callers rely on them matching.
    """
    cleanup_steps = (
        (r"\([^)]*\)", " "),
        (r"['’`]", ""),
        (r"&", " and "),
        (r"[^a-z0-9]+", " "),
        (r"\s+", " "),
    )
    suffix_patterns = (
        _ENTRANCE_SUFFIX_RE,
        _BAY_SUFFIX_RE,
        r"\s+(underground|tube|dlr|metro|metrolink|rail|railway)\s+station$",
        r"\s+(metrolink|tram)\s+stop$",
        r"\s+(station|stop)$",
        r"\s+metrolink$",
    )

    normalized = pl.col(name_col).str.to_lowercase()
    for pattern, replacement in cleanup_steps:
        normalized = normalized.str.replace_all(pattern, replacement)
    normalized = normalized.str.strip_chars()
    for pattern in suffix_patterns:
        normalized = normalized.str.replace_all(pattern, "")
    return normalized.str.strip_chars()
|
||
|
||
|
||
def _has_locality() -> pl.Expr:
    """Expression that is true for rows whose "locality" is neither null nor ""."""
    locality = pl.col("locality")
    return locality.is_not_null() & (locality != "")
|
||
|
||
|
||
def _empty_output_frame() -> pl.DataFrame:
    """Return a zero-row frame with the output columns and their dtypes."""
    column_dtypes = {
        "id": pl.String,
        "name": pl.String,
        "category": pl.String,
        "lat": pl.Float64,
        "lng": pl.Float64,
    }
    return pl.DataFrame(
        {
            column: pl.Series([], dtype=dtype)
            for column, dtype in column_dtypes.items()
        }
    )
|
||
|
||
|
||
def station_name_score(name: str, entrance: bool = False) -> tuple[int, int, int]:
    """Rank a candidate display name; the lowest tuple wins.

    The tuple orders candidates by, in turn: primary station/terminal node
    before entrance node, name without a transport-mode suffix before one with
    it, and shorter name before longer.
    """
    mode_suffixes = (
        " underground station",
        " tube station",
        " dlr station",
        " metro station",
        " tram stop",
        " station",
        " stop",
    )
    has_mode_suffix = name.lower().endswith(mode_suffixes)
    return (int(entrance), 1 if has_mode_suffix else 0, len(name))
|
||
|
||
|
||
@dataclass
class StationAccumulator:
    """Running merge state for one station-level POI.

    Holds the identity (id/name/entrance flag) of the best display row seen so
    far, plus coordinate sums and a row count from which the centroid is
    derived.
    """

    id: str
    name: str
    category: str
    lat_sum: float
    lng_sum: float
    entrance: bool = False
    is_lu: bool = False
    count: int = 1
    qualifier: str = ""

    @property
    def lat(self) -> float:
        """Mean latitude of the rows merged so far."""
        return self.lat_sum / self.count

    @property
    def lng(self) -> float:
        """Mean longitude of the rows merged so far."""
        return self.lng_sum / self.count

    def same_area(self, lat: float, lng: float) -> bool:
        """Whether (lat, lng) lies within the merge radius of the centroid.

        The longitude difference is scaled by cos(centroid latitude) before
        the squared distance is compared with the squared radius.
        """
        centre_lat = self.lat
        north_south = centre_lat - lat
        east_west = (self.lng - lng) * math.cos(math.radians(centre_lat))
        distance_sq = north_south * north_south + east_west * east_west
        return distance_sq <= TUBE_STATION_MERGE_RADIUS_DEGREES**2

    def qualifier_compatible(self, qualifier: str) -> bool:
        """Whether a row with this parenthetical qualifier may join the group.

        Two different non-empty qualifiers mark distinct same-named stations;
        if either side has no qualifier the row is allowed to join.
        """
        if not qualifier or not self.qualifier:
            return True
        return qualifier == self.qualifier

    def merge(self, row: dict[str, object]) -> None:
        """Fold *row* into this group and possibly adopt it as the display row.

        Reads the row's "lat", "lng", "name" and "id" keys and the optional
        "is_lu" and "entrance" keys.
        """
        self.lat_sum += float(row["lat"])
        self.lng_sum += float(row["lng"])
        self.count += 1
        if not self.is_lu:
            self.is_lu = bool(row.get("is_lu"))

        name = str(row["name"] or "")
        row_qualifier = station_name_qualifier(name)
        # The first qualifier seen becomes the group's qualifier.
        if not self.qualifier:
            self.qualifier = row_qualifier
        entrance = bool(row.get("entrance"))

        # A display name that carries the group's disambiguating parenthetical
        # ranks first: otherwise both Edgware Roads would be shown as the bare
        # "Edgware Road Underground Station".
        incoming_rank = (
            self._qualifier_penalty(row_qualifier),
            *station_name_score(name, entrance),
        )
        held_rank = (
            self._qualifier_penalty(station_name_qualifier(self.name)),
            *station_name_score(self.name, self.entrance),
        )
        if incoming_rank < held_rank:
            self.id = str(row["id"] or "")
            self.name = name
            self.entrance = entrance

    def _qualifier_penalty(self, name_qualifier: str) -> int:
        """1 when the group has a qualifier that the given name lacks, else 0."""
        if not self.qualifier:
            return 0
        return int(name_qualifier != self.qualifier)

    @property
    def output_category(self) -> str:
        """Category to emit: tram/metro groups flagged is_lu become Tube stations.

        is_lu is true when ANY merged row carried a London Underground ATCO
        code, not only the winning display row. That matters because LU
        entrance nodes often carry non-ZZLU codes (e.g. 4900VICT...).
        """
        is_tube = self.category == TRAM_METRO_CATEGORY and self.is_lu
        return TUBE_STATION_CATEGORY if is_tube else self.category
|
||
|
||
|
||
def _station_from_row(row: dict[str, object]) -> StationAccumulator:
    """Start a new single-row accumulator from a named input row.

    Null id/name/category values become ""; the optional "entrance" and
    "is_lu" keys default to False when absent.
    """
    station_id = str(row["id"] or "")
    name = str(row["name"] or "")
    return StationAccumulator(
        id=station_id,
        name=name,
        category=str(row["category"] or ""),
        lat_sum=float(row["lat"]),
        lng_sum=float(row["lng"]),
        entrance=bool(row.get("entrance")),
        is_lu=bool(row.get("is_lu")),
        qualifier=station_name_qualifier(name),
    )
|
||
|
||
|
||
def _deduplicate_station_areas(df: pl.DataFrame) -> pl.DataFrame:
    """Merge rows that share category, canonical name and area into one POI.

    Rows are processed in frame order. Each row joins the first existing group
    with the same (category, canonical name) key that is within the merge
    radius and has a compatible qualifier; otherwise it starts a new group.
    Rows whose canonical name is empty are never merged. The result has the
    OUTPUT_COLUMNS, with lat/lng set to each group's centroid.
    """
    if df.height == 0:
        return _empty_output_frame()

    stations: list[StationAccumulator] = []
    indices_by_key: dict[tuple[str, str], list[int]] = {}

    for row in df.iter_rows(named=True):
        raw_name = str(row["name"] or "")
        canonical = canonical_station_name(raw_name)
        if not canonical:
            # Nothing to key on: keep the row as its own POI.
            stations.append(_station_from_row(row))
            continue

        # The category is part of the key so that different modes sharing a
        # name and area (e.g. a rail station and a ferry terminal) stay apart.
        key = (str(row["category"] or ""), canonical)
        qualifier = station_name_qualifier(raw_name)

        match = None
        for candidate_index in indices_by_key.get(key, ()):
            group = stations[candidate_index]
            if group.same_area(
                float(row["lat"]), float(row["lng"])
            ) and group.qualifier_compatible(qualifier):
                match = group
                break

        if match is not None:
            match.merge(row)
            continue

        indices_by_key.setdefault(key, []).append(len(stations))
        stations.append(_station_from_row(row))

    columns = {
        "id": [station.id for station in stations],
        "name": [station.name for station in stations],
        "category": [station.output_category for station in stations],
        "lat": [station.lat for station in stations],
        "lng": [station.lng for station in stations],
    }
    return pl.DataFrame(columns).select(OUTPUT_COLUMNS)
|
||
|
||
|
||
def _deduplicate_local_stops(df: pl.DataFrame) -> pl.DataFrame:
    """Deduplicate non-station stops.

    Rows with a locality collapse to one record per exact
    (name, category, locality), keeping the first id and the mean coordinates.
    Rows without a locality go through the station-area merge instead. The
    result has the OUTPUT_COLUMNS; groups appear in the order their first row
    appears in *df*, so repeated runs on the same input produce the same row
    order.
    """
    if df.is_empty():
        return _empty_output_frame()

    with_locality = df.filter(_has_locality())
    without_locality = df.filter(~_has_locality())

    frames = []
    if not with_locality.is_empty():
        frames.append(
            # maintain_order=True: a plain group_by returns groups in an
            # unspecified order, which made the written output differ from run
            # to run for identical input.
            with_locality.group_by(
                "name", "category", "locality", maintain_order=True
            )
            .agg(
                pl.col("id").first(),
                pl.col("lat").mean(),
                pl.col("lng").mean(),
            )
            .select(OUTPUT_COLUMNS)
        )
    if not without_locality.is_empty():
        # Stops with no locality can't be deduped by locality, so merge genuine
        # co-located duplicates (same name+category within the same small area)
        # via the station-area logic, while keeping distinct far-apart stops.
        frames.append(_deduplicate_station_areas(without_locality))

    if not frames:
        # Defensive: every row of a non-empty frame lands in one of the two
        # partitions above, so this is not expected to trigger.
        return _empty_output_frame()

    return pl.concat(frames).select(OUTPUT_COLUMNS)
|
||
|
||
|
||
def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
    """Deduplicate NaPTAN stops, merging station/terminal entrances by area.

    Tram/metro, rail, ferry and bus-station POIs are merged to one record per
    station by normalized name + area, with the primary station/terminal node
    (e.g. RLY, FER, MET, BST) winning over an entrance node (RSE, FTD, TMU,
    BCE). Merged tram/metro stations with a London Underground ATCO code in
    the group become "Tube station". Other stops are deduplicated by exact
    name+category+locality.

    Rows with a null category are treated as "other" stops. The result has the
    OUTPUT_COLUMNS, with the "other" stops first and the merged stations after.
    """
    # is_in() yields null for a null category, and filter() drops null
    # predicates, so without fill_null such rows would vanish from BOTH
    # partitions. Building the mask once also keeps the two filters in sync.
    is_station = (
        pl.col("category").is_in(list(STATION_MERGE_CATEGORIES)).fill_null(False)
    )

    return pl.concat(
        [
            _deduplicate_local_stops(df.filter(~is_station)),
            _deduplicate_station_areas(df.filter(is_station)),
        ]
    ).select(OUTPUT_COLUMNS)
|
||
|
||
|
||
def filter_active_stops(df: pl.DataFrame) -> pl.DataFrame:
    """Return only the rows of *df* that are active NaPTAN stops.

    The Status column of the NaPTAN export marks stops as
    active/inactive/pending. Without this filter, closed stations ("(closed)",
    "not in use") would ship as live POIs. A row is kept when its Status is
    null (benefit of the doubt) or equals "active"/"act" after trimming and
    lower-casing. If the frame has no Status column at all, a warning is
    printed and the frame is returned unchanged so older extracts still load.
    """
    if "Status" not in df.columns:
        print("WARNING: NaPTAN data has no Status column; keeping all stops")
        return df

    status = pl.col("Status")
    is_active = status.str.strip_chars().str.to_lowercase().is_in(["active", "act"])
    kept = df.filter(status.is_null() | is_active)

    removed = len(df) - len(kept)
    if removed:
        print(f"Dropped {removed:,} non-active stops (Status != active)")
    return kept
|
||
|
||
|
||
def download_naptan(output: Path, *, timeout: float = 120.0) -> None:
    """Download the NaPTAN CSV, reduce it to deduplicated POIs, write parquet.

    Steps: fetch NAPTAN_CSV_URL, keep rows with parseable coordinates and a
    StopType listed in STOP_TYPES, drop non-active stops, map columns to
    id/name/category/lat/lng (plus the helper columns locality, entrance and
    is_lu), deduplicate, then write the result to *output* and print a
    per-category count.

    Args:
        output: Destination parquet path; parent directories are created.
        timeout: Seconds to wait on any single blocking network operation
            (connect or read) before giving up. It is not a limit on the total
            download time. Without it a stalled connection would block forever.

    Raises:
        urllib.error.URLError: The request failed (includes HTTP error
            statuses and connection timeouts).
        TimeoutError: A read stalled for longer than *timeout*.
    """
    output.parent.mkdir(parents=True, exist_ok=True)

    print(f"Downloading NaPTAN data from {NAPTAN_CSV_URL}")
    with urllib.request.urlopen(NAPTAN_CSV_URL, timeout=timeout) as resp:
        raw = resp.read()

    print(f"Downloaded {len(raw) / (1024 * 1024):.1f} MB")

    df = (
        # infer_schema_length=0 reads every column as a string; coordinates are
        # then cast explicitly, with unparseable values becoming null.
        pl.read_csv(io.BytesIO(raw), infer_schema_length=0)
        .with_columns(
            pl.col("Latitude").cast(pl.Float64, strict=False),
            pl.col("Longitude").cast(pl.Float64, strict=False),
        )
        .drop_nulls(subset=["Latitude", "Longitude"])
        .filter(pl.col("StopType").is_in(list(STOP_TYPES.keys())))
    )
    df = filter_active_stops(df).select(
        pl.col("ATCOCode").alias("id"),
        pl.col("CommonName").alias("name"),
        pl.col("StopType").replace(STOP_TYPES).alias("category"),
        pl.col("Latitude").alias("lat"),
        pl.col("Longitude").alias("lng"),
        pl.col("NptgLocalityCode").alias("locality"),
        pl.col("StopType").is_in(list(ENTRANCE_STOP_TYPES)).alias("entrance"),
        pl.col("ATCOCode")
        .str.contains(LONDON_UNDERGROUND_ATCO_PATTERN)
        .fill_null(False)
        .alias("is_lu"),
    )

    before = len(df)
    df = deduplicate_naptan(df)

    print(
        f"Deduplicated {before:,} → {len(df):,} stops "
        "(by name+category+locality; tube stations by normalized name+area)"
    )

    df.write_parquet(output)
    size_mb = output.stat().st_size / (1024 * 1024)
    print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} stations)")

    counts = df.group_by("category").len().sort("len", descending=True)
    for row in counts.iter_rows(named=True):
        print(f"  {row['category']}: {row['len']:,}")
|
||
|
||
|
||
def main() -> None:
    """Command-line entry point: parse --output and run the download."""
    cli = argparse.ArgumentParser(description="Download NaPTAN station data")
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    download_naptan(cli.parse_args().output)
|
||
|
||
|
||
# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|