perfect-postcode/pipeline/download/naptan.py
2026-06-02 13:46:18 +01:00

332 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Download NaPTAN data and extract railway/metro station POIs."""
import argparse
import io
import math
import re
import urllib.request
from dataclasses import dataclass
from pathlib import Path
import polars as pl
NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
TUBE_STATION_CATEGORY = "Tube station"
TUBE_STATION_MERGE_RADIUS_DEGREES = 0.01
STOP_TYPES = {
"AIR": "Airport",
# Ferry: FER/FBT are the terminal/berth nodes; FTD is a docking entrance.
"FER": "Ferry",
"FBT": "Ferry",
"FTD": "Ferry",
# Rail: RLY is the station node; RSE is a station entrance.
"RLY": "Rail station",
"RSE": "Rail station",
"BCT": "Bus stop",
"BCE": "Bus station",
"TXR": "Taxi rank",
"TMU": "Tube station",
"MET": "Tube station",
}
# Stop types that are access/entrance nodes rather than the primary station or
# terminal node. During dedup the primary node (e.g. RLY/FER) wins so a station
# with both a station node and entrances yields one POI at the station node.
ENTRANCE_STOP_TYPES = {"RSE", "FTD"}
# Categories whose entrances/variants are merged into a single station-level POI
# by normalized name + area (like Tube stations), so an RLY node and its RSE
# entrances collapse to one POI at the station node.
STATION_MERGE_CATEGORIES = {TUBE_STATION_CATEGORY, "Rail station", "Ferry"}
OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"]
def canonical_station_name(name: str | None) -> str:
"""Normalize station names so entrances/transport-mode variants collapse."""
if not name:
return ""
normalized = name.lower()
normalized = re.sub(r"\([^)]*\)", " ", normalized)
normalized = re.sub(r"['`]", "", normalized)
normalized = normalized.replace("&", " and ")
normalized = re.sub(r"[^a-z0-9]+", " ", normalized)
words = normalized.split()
suffixes = (
("underground", "station"),
("tube", "station"),
("dlr", "station"),
("metro", "station"),
("tram", "stop"),
("rail", "station"),
("railway", "station"),
("station",),
("stop",),
)
while True:
suffix = next(
(suffix for suffix in suffixes if words[-len(suffix) :] == list(suffix)),
None,
)
if suffix is None:
break
del words[-len(suffix) :]
return " ".join(words)
def canonical_station_name_expr(name_col: str = "name") -> pl.Expr:
"""Normalize station names so entrances/transport-mode variants collapse."""
expr = pl.col(name_col).str.to_lowercase()
expr = expr.str.replace_all(r"\([^)]*\)", " ")
expr = expr.str.replace_all(r"['`]", "")
expr = expr.str.replace_all(r"&", " and ")
expr = expr.str.replace_all(r"[^a-z0-9]+", " ")
expr = expr.str.replace_all(r"\s+", " ").str.strip_chars()
expr = expr.str.replace_all(
r"\s+(underground|tube|dlr|metro|rail|railway)\s+station$", ""
)
expr = expr.str.replace_all(r"\s+tram\s+stop$", "")
expr = expr.str.replace_all(r"\s+(station|stop)$", "")
return expr.str.strip_chars()
def _has_locality() -> pl.Expr:
return pl.col("locality").is_not_null() & (pl.col("locality") != "")
def _empty_output_frame() -> pl.DataFrame:
return pl.DataFrame(
{
"id": pl.Series([], dtype=pl.String),
"name": pl.Series([], dtype=pl.String),
"category": pl.Series([], dtype=pl.String),
"lat": pl.Series([], dtype=pl.Float64),
"lng": pl.Series([], dtype=pl.Float64),
}
)
def station_name_score(name: str, entrance: bool = False) -> tuple[int, int, int]:
# Prefer the primary station/terminal node over an entrance, then a name
# without a transport-mode suffix, then the shorter name.
lower = name.lower()
suffix_penalty = int(
lower.endswith(
(
" underground station",
" tube station",
" dlr station",
" metro station",
" tram stop",
" station",
" stop",
)
)
)
return (int(entrance), suffix_penalty, len(name))
@dataclass
class StationAccumulator:
id: str
name: str
category: str
lat_sum: float
lng_sum: float
entrance: bool = False
count: int = 1
@property
def lat(self) -> float:
return self.lat_sum / self.count
@property
def lng(self) -> float:
return self.lng_sum / self.count
def same_area(self, lat: float, lng: float) -> bool:
dlat = self.lat - lat
dlng = (self.lng - lng) * math.cos(math.radians(self.lat))
return (dlat * dlat + dlng * dlng) <= TUBE_STATION_MERGE_RADIUS_DEGREES**2
def merge(self, row: dict[str, object]) -> None:
self.lat_sum += float(row["lat"])
self.lng_sum += float(row["lng"])
self.count += 1
name = str(row["name"] or "")
entrance = bool(row.get("entrance"))
if station_name_score(name, entrance) < station_name_score(
self.name, self.entrance
):
self.id = str(row["id"] or "")
self.name = name
self.entrance = entrance
def _station_from_row(row: dict[str, object]) -> StationAccumulator:
return StationAccumulator(
id=str(row["id"] or ""),
name=str(row["name"] or ""),
category=str(row["category"] or ""),
lat_sum=float(row["lat"]),
lng_sum=float(row["lng"]),
entrance=bool(row.get("entrance")),
)
def _deduplicate_station_areas(df: pl.DataFrame) -> pl.DataFrame:
if len(df) == 0:
return _empty_output_frame()
selected: list[StationAccumulator] = []
groups: dict[tuple[str, str], list[int]] = {}
for row in df.iter_rows(named=True):
# Key by category so different modes sharing a name/area (e.g. a rail
# station and a ferry terminal) are not merged into one POI.
category = str(row["category"] or "")
station_key = (category, canonical_station_name(str(row["name"] or "")))
if not station_key[1]:
selected.append(_station_from_row(row))
continue
existing = next(
(
index
for index in groups.get(station_key, [])
if selected[index].same_area(float(row["lat"]), float(row["lng"]))
),
None,
)
if existing is not None:
selected[existing].merge(row)
continue
index = len(selected)
selected.append(_station_from_row(row))
groups.setdefault(station_key, []).append(index)
return pl.DataFrame(
{
"id": [station.id for station in selected],
"name": [station.name for station in selected],
"category": [station.category for station in selected],
"lat": [station.lat for station in selected],
"lng": [station.lng for station in selected],
}
).select(OUTPUT_COLUMNS)
def _deduplicate_local_stops(df: pl.DataFrame) -> pl.DataFrame:
if len(df) == 0:
return _empty_output_frame()
has_loc = df.filter(_has_locality())
no_loc = df.filter(~_has_locality())
# First pass: one record per exact stop name/category/locality.
frames = []
if len(has_loc) > 0:
frames.append(
has_loc.group_by("name", "category", "locality")
.agg(
pl.col("id").first(),
pl.col("lat").mean(),
pl.col("lng").mean(),
)
.select(OUTPUT_COLUMNS)
)
if len(no_loc) > 0:
# Stops with no locality can't be deduped by locality, so merge genuine
# co-located duplicates (same name+category within the same small area)
# via the station-area logic, while keeping distinct far-apart stops.
frames.append(_deduplicate_station_areas(no_loc))
if not frames:
return _empty_output_frame()
return pl.concat(frames).select(OUTPUT_COLUMNS)
def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
"""Deduplicate NaPTAN stops, merging station/terminal entrances by area.
Tube, rail and ferry POIs are merged to one record per station by
normalized name + area, with the primary station/terminal node (e.g. RLY,
FER) winning over an entrance node (RSE, FTD). Other stops are deduplicated
by exact name+category+locality.
"""
station = df.filter(pl.col("category").is_in(list(STATION_MERGE_CATEGORIES)))
other = df.filter(~pl.col("category").is_in(list(STATION_MERGE_CATEGORIES)))
return pl.concat(
[
_deduplicate_local_stops(other),
_deduplicate_station_areas(station),
]
).select(OUTPUT_COLUMNS)
def download_naptan(output: Path) -> None:
output.parent.mkdir(parents=True, exist_ok=True)
print(f"Downloading NaPTAN data from {NAPTAN_CSV_URL}")
with urllib.request.urlopen(NAPTAN_CSV_URL) as resp:
raw = resp.read()
print(f"Downloaded {len(raw) / (1024 * 1024):.1f} MB")
df = (
pl.read_csv(io.BytesIO(raw), infer_schema_length=0)
.with_columns(
pl.col("Latitude").cast(pl.Float64, strict=False),
pl.col("Longitude").cast(pl.Float64, strict=False),
)
.drop_nulls(subset=["Latitude", "Longitude"])
.filter(pl.col("StopType").is_in(list(STOP_TYPES.keys())))
.select(
pl.col("ATCOCode").alias("id"),
pl.col("CommonName").alias("name"),
pl.col("StopType").replace(STOP_TYPES).alias("category"),
pl.col("Latitude").alias("lat"),
pl.col("Longitude").alias("lng"),
pl.col("NptgLocalityCode").alias("locality"),
pl.col("StopType").is_in(list(ENTRANCE_STOP_TYPES)).alias("entrance"),
)
)
before = len(df)
df = deduplicate_naptan(df)
print(
f"Deduplicated {before:,}{len(df):,} stops "
"(by name+category+locality; tube stations by normalized name+area)"
)
df.write_parquet(output)
size_mb = output.stat().st_size / (1024 * 1024)
print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} stations)")
counts = df.group_by("category").len().sort("len", descending=True)
for row in counts.iter_rows(named=True):
print(f" {row['category']}: {row['len']:,}")
def main() -> None:
parser = argparse.ArgumentParser(description="Download NaPTAN station data")
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
args = parser.parse_args()
download_naptan(args.output)
if __name__ == "__main__":
main()