"""Download NaPTAN data and extract railway/metro station POIs.""" import argparse import io import math import re import urllib.request from dataclasses import dataclass from pathlib import Path import polars as pl NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv" TUBE_STATION_CATEGORY = "Tube station" TRAM_METRO_CATEGORY = "Tram & Metro stop" TUBE_STATION_MERGE_RADIUS_DEGREES = 0.01 # London Underground ATCO codes are "ZZLU": a 3-digit # AdministrativeAreaCode (940 national, 490 London, plus 150/210/040/... for # LU stations outside Greater London such as Epping or Amersham), then "0" # (platform/entrance node) or "G" (station group node), then the system code. # "ZZLU" is unique to London Underground, which cleanly separates genuine Tube # stations from every other TMU/MET network (Metrolink, Supertram, T&W Metro, # WM Metro, Blackpool Tramway, heritage railways, ...). LONDON_UNDERGROUND_ATCO_PATTERN = r"(?i)^\d{3}[0G]ZZLU" STOP_TYPES = { "AIR": "Airport", # Ferry: FER/FBT are the terminal/berth nodes; FTD is a docking entrance. "FER": "Ferry", "FBT": "Ferry", "FTD": "Ferry", # Rail: RLY is the station node; RSE is a station entrance. "RLY": "Rail station", "RSE": "Rail station", "BCT": "Bus stop", # Bus/coach stations: BST is the station access-area node, BCS/BCQ are # bays/stands within the station and BCE is a station entrance. NaPTAN maps # very few BCE nodes (~80 GB-wide), so without BST/BCS/BCQ the category was # so sparse that 20% of England showed the nearest bus station >100km away. # Bays and entrances collapse to one POI per station via # STATION_MERGE_CATEGORIES below. "BST": "Bus station", "BCS": "Bus station", "BCQ": "Bus station", "BCE": "Bus station", "TXR": "Taxi rank", # Tram/Metro/Underground: TMU is an entrance node, MET the station access # area. 
Both start as "Tram & Metro stop"; merged stations whose ATCO codes # mark them as London Underground (ZZLU) are reclassified to "Tube station" # after dedup (see _deduplicate_station_areas). Heritage railways (RHDR, # Severn Valley, ...) are TMU/MET in NaPTAN with no machine-readable # "heritage" flag, so they remain in "Tram & Metro stop". "TMU": TRAM_METRO_CATEGORY, "MET": TRAM_METRO_CATEGORY, } # Stop types that are access/entrance nodes rather than the primary station or # terminal node. During dedup the primary node (e.g. RLY/FER/MET) wins so a # station with both a station node and entrances yields one POI at the station # node. ENTRANCE_STOP_TYPES = {"RSE", "FTD", "TMU", "BCE"} # Categories whose entrances/variants are merged into a single station-level POI # by normalized name + area (like Tube stations), so an RLY node and its RSE # entrances collapse to one POI at the station node. STATION_MERGE_CATEGORIES = { TRAM_METRO_CATEGORY, TUBE_STATION_CATEGORY, "Rail station", "Ferry", "Bus station", } OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"] # Trailing entrance designators ("North East Ent", "Main Entrance No 2", # "West Station Entrance", ...) are stripped from canonical names so a # station's individually-named entrance nodes collapse into the station. # A trailing run of filler words is only stripped when it contains at least # one entrance word, so "Maze Hill North" or "Platform 1" are untouched. 
# Tokens that mark a trailing entrance designator, and the direction/number
# "filler" tokens allowed around them. The two sets drive the pure-Python
# normalizer; the regex fragments below spell out the same vocabulary for the
# polars expression and must be kept in sync with the sets.
_ENTRANCE_NAME_WORDS = {"ent", "entrance", "entrances", "access"}
_ENTRANCE_FILLER_WORDS = {
    "north",
    "south",
    "east",
    "west",
    "ne",
    "nw",
    "se",
    "sw",
    "n",
    "s",
    "e",
    "w",
    "wt",
    "main",
    "side",
    "no",
    "station",
    "stop",
    "platform",
}
_ENTRANCE_WORDS_RE = "(?:ent|entrance|entrances|access)"
_ENTRANCE_FILLER_RE = (
    r"(?:north|south|east|west|ne|nw|se|sw|n|s|e|w|wt|main|side|no|station|stop"
    r"|platform|\d+)"
)
# Optional filler/entrance tokens, one mandatory entrance token, then optional
# filler/entrance tokens up to the end of the string.
_ENTRANCE_SUFFIX_RE = (
    rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*"
    rf"\s+{_ENTRANCE_WORDS_RE}"
    rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*$"
)

# Bus-station bay/stand designators ("Stand A3", "Bay 2", "Stance 5") are
# stripped so every bay of one station shares a canonical name. The designator
# word must be followed by a short alphanumeric token, so place names ending in
# a bare "Bay" (Colwyn Bay, Herne Bay) are untouched.
_BAY_WORDS = {"stand", "stance", "bay", "gate"}
_BAY_SUFFIX_RE = r"\s+(?:stand|stance|bay|gate)\s+[a-z0-9]{1,3}$"

# Trailing transport-mode suffixes removed (repeatedly) from canonical names.
# Two-word suffixes are listed before their one-word tails so the longest
# match is removed first.
_MODE_SUFFIXES = (
    ("underground", "station"),
    ("tube", "station"),
    ("dlr", "station"),
    ("metro", "station"),
    ("metrolink", "station"),
    ("metrolink", "stop"),
    ("tram", "stop"),
    ("rail", "station"),
    ("railway", "station"),
    ("station",),
    ("stop",),
    ("metrolink",),
)


def _strip_entrance_suffix(words: list[str]) -> list[str]:
    """Drop a trailing entrance designator from a tokenized stop name.

    Walks backwards over the trailing run of entrance words, filler words and
    pure-digit tokens. The run is removed only if it contained at least one
    entrance word; otherwise ``words`` is returned unchanged.
    """
    idx = len(words)
    saw_entrance = False
    while idx > 0:
        word = words[idx - 1]
        if word in _ENTRANCE_NAME_WORDS:
            saw_entrance = True
        elif not (word.isdigit() or word in _ENTRANCE_FILLER_WORDS):
            # First token that is neither entrance word nor filler ends the run.
            break
        idx -= 1
    return words[:idx] if saw_entrance else words


def canonical_station_name(name: str | None) -> str:
    """Normalize a station name so entrances/transport-mode variants collapse.

    Lowercases, drops parentheticals and apostrophes, spells out "&", reduces
    everything else to space-separated ``[a-z0-9]`` tokens, then strips a
    trailing entrance designator, a trailing bay/stand designator, and any
    stack of trailing transport-mode suffixes.

    Returns "" for an empty/None name, and also when stripping consumes every
    token (e.g. a stop literally named "Station").
    """
    if not name:
        return ""
    normalized = name.lower()
    normalized = re.sub(r"\([^)]*\)", " ", normalized)
    normalized = re.sub(r"['’`]", "", normalized)
    normalized = normalized.replace("&", " and ")
    normalized = re.sub(r"[^a-z0-9]+", " ", normalized)
    words = _strip_entrance_suffix(normalized.split())
    # A bay designator needs at least one name token in front of it.
    if len(words) >= 3 and words[-2] in _BAY_WORDS and len(words[-1]) <= 3:
        del words[-2:]
    # Peel mode suffixes until none matches, so "X Rail Station Stop" -> "x".
    while True:
        suffix = next(
            (
                candidate
                for candidate in _MODE_SUFFIXES
                if words[-len(candidate) :] == list(candidate)
            ),
            None,
        )
        if suffix is None:
            break
        del words[-len(suffix) :]
    return " ".join(words)


_QUALIFIER_RE = re.compile(r"\(([^)]*)\)")


def station_name_qualifier(name: str | None) -> str:
    """The canonicalized parenthetical of a station name, e.g.
    "Edgware Road (Bakerloo)" -> "bakerloo".

    Genuinely distinct same-named stations (the two Edgware Roads ~150m
    apart, Hammersmith's two stations) differ ONLY by this parenthetical,
    which `canonical_station_name` strips; it must block their merge while
    still letting unqualified entrance/variant rows collapse into either.

    Multiple parentheticals are joined with a space. Returns "" when the name
    is empty/None or has no parenthetical.
    """
    if not name:
        return ""
    parts = _QUALIFIER_RE.findall(name)
    if not parts:
        return ""
    text = " ".join(parts).lower().replace("&", " and ")
    return re.sub(r"[^a-z0-9]+", " ", text).strip()


def canonical_station_name_expr(name_col: str = "name") -> pl.Expr:
    """Polars-expression counterpart of `canonical_station_name`.

    Applies the same normalization steps as regex replacements on column
    ``name_col``. Unlike the Python function, each trailing suffix pattern is
    applied once, in a fixed order, and every pattern requires preceding
    whitespace, so results can differ for unusual names (stacked suffixes, or
    a name consisting only of a suffix word).
    """
    expr = pl.col(name_col).str.to_lowercase()
    expr = expr.str.replace_all(r"\([^)]*\)", " ")
    expr = expr.str.replace_all(r"['’`]", "")
    expr = expr.str.replace_all(r"&", " and ")
    expr = expr.str.replace_all(r"[^a-z0-9]+", " ")
    expr = expr.str.replace_all(r"\s+", " ").str.strip_chars()
    expr = expr.str.replace_all(_ENTRANCE_SUFFIX_RE, "")
    expr = expr.str.replace_all(_BAY_SUFFIX_RE, "")
    expr = expr.str.replace_all(
        r"\s+(underground|tube|dlr|metro|metrolink|rail|railway)\s+station$", ""
    )
    expr = expr.str.replace_all(r"\s+(metrolink|tram)\s+stop$", "")
    expr = expr.str.replace_all(r"\s+(station|stop)$", "")
    expr = expr.str.replace_all(r"\s+metrolink$", "")
    return expr.str.strip_chars()


def _has_locality() -> pl.Expr:
    """Boolean expression: the ``locality`` column is neither null nor ""."""
    return pl.col("locality").is_not_null() & (pl.col("locality") != "")


def _empty_output_frame() -> pl.DataFrame:
    """Zero-row frame with the output columns and their expected dtypes."""
    return pl.DataFrame(
        {
            "id": pl.Series([], dtype=pl.String),
            "name": pl.Series([], dtype=pl.String),
            "category": pl.Series([], dtype=pl.String),
            "lat": pl.Series([], dtype=pl.Float64),
            "lng": pl.Series([], dtype=pl.Float64),
        }
    )


def station_name_score(name: str, entrance: bool = False) -> tuple[int, int, int]:
    """Sort key for choosing a merged station's display node; lower is better.

    Prefers the primary station/terminal node over an entrance, then a name
    without a transport-mode suffix, then the shorter name.
    """
    lower = name.lower()
    suffix_penalty = int(
        lower.endswith(
            (
                " underground station",
                " tube station",
                " dlr station",
                " metro station",
                " tram stop",
                " station",
                " stop",
            )
        )
    )
    return (int(entrance), suffix_penalty, len(name))


@dataclass
class StationAccumulator:
    """Running state of one merged station during area deduplication.

    ``id``/``name``/``entrance`` describe the currently preferred display node;
    ``lat_sum``/``lng_sum``/``count`` accumulate every merged node so the
    exposed ``lat``/``lng`` are the centroid of the group. ``is_lu`` records
    whether any merged node carried a London Underground ATCO code and
    ``qualifier`` holds the group's disambiguating parenthetical (if any).
    """

    id: str
    name: str
    category: str
    lat_sum: float
    lng_sum: float
    entrance: bool = False
    is_lu: bool = False
    count: int = 1
    qualifier: str = ""

    @property
    def lat(self) -> float:
        """Mean latitude of all merged nodes."""
        return self.lat_sum / self.count

    @property
    def lng(self) -> float:
        """Mean longitude of all merged nodes."""
        return self.lng_sum / self.count

    def same_area(self, lat: float, lng: float) -> bool:
        """Whether (lat, lng) lies within the merge radius of the centroid.

        Longitude differences are scaled by cos(latitude) so the radius is
        approximately circular on the ground rather than in raw degrees.
        """
        dlat = self.lat - lat
        dlng = (self.lng - lng) * math.cos(math.radians(self.lat))
        return (dlat * dlat + dlng * dlng) <= TUBE_STATION_MERGE_RADIUS_DEGREES**2

    def qualifier_compatible(self, qualifier: str) -> bool:
        """Whether a row with this parenthetical qualifier may join the group."""
        # Conflicting parentheticals mark distinct same-named stations; an
        # unqualified row can join either group.
        return not qualifier or not self.qualifier or qualifier == self.qualifier

    def merge(self, row: dict[str, object]) -> None:
        """Fold ``row`` into the group.

        Updates the centroid sums, the LU flag and (if not yet set) the group
        qualifier, then switches the display id/name to the row when the row
        scores strictly better than the current display node.
        """
        self.lat_sum += float(row["lat"])
        self.lng_sum += float(row["lng"])
        self.count += 1
        self.is_lu = self.is_lu or bool(row.get("is_lu"))
        name = str(row["name"] or "")
        row_qualifier = station_name_qualifier(name)
        # The first qualified row fixes the group's qualifier.
        self.qualifier = self.qualifier or row_qualifier
        entrance = bool(row.get("entrance"))
        # Prefer a display name carrying the group's disambiguating
        # parenthetical: without it the two Edgware Roads would both render as
        # the bare "Edgware Road Underground Station".
        candidate = (
            self._qualifier_penalty(row_qualifier),
            *station_name_score(name, entrance),
        )
        current = (
            self._qualifier_penalty(station_name_qualifier(self.name)),
            *station_name_score(self.name, self.entrance),
        )
        if candidate < current:
            self.id = str(row["id"] or "")
            self.name = name
            self.entrance = entrance

    def _qualifier_penalty(self, name_qualifier: str) -> int:
        """1 when the group has a qualifier that this name does not carry."""
        return int(bool(self.qualifier) and name_qualifier != self.qualifier)

    @property
    def output_category(self) -> str:
        """Category to emit: tram/metro groups with an LU node become Tube."""
        # A merged tram/metro station is a genuine Tube station when ANY of its
        # constituent nodes carries a London Underground ATCO code. Checking
        # the whole group (not just the winning node) matters because LU
        # entrance nodes often carry non-ZZLU codes (e.g. 4900VICT...).
        if self.category == TRAM_METRO_CATEGORY and self.is_lu:
            return TUBE_STATION_CATEGORY
        return self.category


def _station_from_row(row: dict[str, object]) -> StationAccumulator:
    """Start a new single-node group from a named row dict."""
    name = str(row["name"] or "")
    return StationAccumulator(
        id=str(row["id"] or ""),
        name=name,
        category=str(row["category"] or ""),
        lat_sum=float(row["lat"]),
        lng_sum=float(row["lng"]),
        entrance=bool(row.get("entrance")),
        is_lu=bool(row.get("is_lu")),
        qualifier=station_name_qualifier(name),
    )


def _deduplicate_station_areas(df: pl.DataFrame) -> pl.DataFrame:
    """Merge rows sharing category + canonical name within the merge radius.

    Rows are processed in input order. Each row joins the first existing group
    with the same (category, canonical name) key whose centroid is in range
    and whose qualifier is compatible; otherwise it starts a new group. Rows
    whose canonical name is empty are never merged. Returns one row per group
    with the columns in ``OUTPUT_COLUMNS``.
    """
    if len(df) == 0:
        return _empty_output_frame()

    selected: list[StationAccumulator] = []
    # (category, canonical name) -> indices into `selected`.
    groups: dict[tuple[str, str], list[int]] = {}
    for row in df.iter_rows(named=True):
        name = str(row["name"] or "")
        canonical = canonical_station_name(name)
        if not canonical:
            selected.append(_station_from_row(row))
            continue

        # Key by category so different modes sharing a name/area (e.g. a rail
        # station and a ferry terminal) are not merged into one POI.
        station_key = (str(row["category"] or ""), canonical)
        row_qualifier = station_name_qualifier(name)
        lat = float(row["lat"])
        lng = float(row["lng"])
        candidates = groups.setdefault(station_key, [])
        existing = next(
            (
                index
                for index in candidates
                if selected[index].same_area(lat, lng)
                and selected[index].qualifier_compatible(row_qualifier)
            ),
            None,
        )
        if existing is not None:
            selected[existing].merge(row)
            continue

        candidates.append(len(selected))
        selected.append(_station_from_row(row))

    return pl.DataFrame(
        {
            "id": [station.id for station in selected],
            "name": [station.name for station in selected],
            "category": [station.output_category for station in selected],
            "lat": [station.lat for station in selected],
            "lng": [station.lng for station in selected],
        }
    ).select(OUTPUT_COLUMNS)


def _deduplicate_local_stops(df: pl.DataFrame) -> pl.DataFrame:
    """Deduplicate non-station stops.

    Stops with a locality collapse to one record per exact
    name/category/locality (first id, mean coordinates). Stops without a
    locality go through the station-area merge instead.
    """
    if len(df) == 0:
        return _empty_output_frame()

    has_loc = df.filter(_has_locality())
    no_loc = df.filter(~_has_locality())

    # First pass: one record per exact stop name/category/locality.
    frames = []
    if len(has_loc) > 0:
        frames.append(
            # maintain_order keeps group order tied to input order so repeated
            # runs over the same extract produce identically ordered output.
            has_loc.group_by("name", "category", "locality", maintain_order=True)
            .agg(
                pl.col("id").first(),
                pl.col("lat").mean(),
                pl.col("lng").mean(),
            )
            .select(OUTPUT_COLUMNS)
        )
    if len(no_loc) > 0:
        # Stops with no locality can't be deduped by locality, so merge genuine
        # co-located duplicates (same name+category within the same small area)
        # via the station-area logic, while keeping distinct far-apart stops.
        frames.append(_deduplicate_station_areas(no_loc))

    if not frames:
        return _empty_output_frame()
    return pl.concat(frames).select(OUTPUT_COLUMNS)


def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
    """Deduplicate NaPTAN stops, merging station/terminal entrances by area.

    Tram/metro, rail, ferry and bus-station POIs are merged to one record per
    station by normalized name + area, with the primary station/terminal node
    (e.g. RLY, FER, MET, BST) winning over an entrance node (RSE, FTD, TMU,
    BCE). Merged tram/metro stations with a London Underground ATCO code in
    the group become "Tube station". Other stops are deduplicated by exact
    name+category+locality.

    Rows with a null category are treated as non-station stops rather than
    being dropped.
    """
    # is_in yields null for a null category; without fill_null such rows would
    # match neither `mask` nor `~mask` and silently vanish from the output.
    is_station = (
        pl.col("category").is_in(list(STATION_MERGE_CATEGORIES)).fill_null(False)
    )
    station = df.filter(is_station)
    other = df.filter(~is_station)
    return pl.concat(
        [
            _deduplicate_local_stops(other),
            _deduplicate_station_areas(station),
        ]
    ).select(OUTPUT_COLUMNS)


def filter_active_stops(df: pl.DataFrame) -> pl.DataFrame:
    """Keep only active NaPTAN stops.

    The NaPTAN export's Status column marks stops as active/inactive/pending;
    without this filter closed stations ("(closed)", "not in use") ship as
    live POIs. Rows with a null Status are kept (benefit of the doubt); a
    missing column is tolerated so older extracts still load.
    """
    if "Status" not in df.columns:
        print("WARNING: NaPTAN data has no Status column; keeping all stops")
        return df
    before = len(df)
    status = pl.col("Status")
    df = df.filter(
        status.is_null()
        | status.str.strip_chars().str.to_lowercase().is_in(["active", "act"])
    )
    dropped = before - len(df)
    if dropped:
        print(f"Dropped {dropped:,} non-active stops (Status != active)")
    return df


# Socket-level timeout for the NaPTAN download. It bounds each blocking network
# operation (connect / read), not the total transfer time, so a large but
# steadily progressing download is unaffected.
_DOWNLOAD_TIMEOUT_SECONDS = 120.0


def download_naptan(
    output: Path, *, timeout: float = _DOWNLOAD_TIMEOUT_SECONDS
) -> None:
    """Download the NaPTAN CSV, deduplicate it and write ``output`` as parquet.

    Steps: fetch ``NAPTAN_CSV_URL``; read every column as text; keep rows with
    parseable coordinates and a stop type in ``STOP_TYPES``; drop non-active
    stops; map to the internal columns; deduplicate; write parquet and print a
    per-category summary.

    Args:
        output: Destination parquet path; parent directories are created.
        timeout: Seconds to wait on any single blocking network operation
            before giving up, so a stalled connection cannot hang forever.

    Network and parsing errors are not caught and propagate to the caller.
    """
    output.parent.mkdir(parents=True, exist_ok=True)

    print(f"Downloading NaPTAN data from {NAPTAN_CSV_URL}")
    with urllib.request.urlopen(NAPTAN_CSV_URL, timeout=timeout) as resp:
        raw = resp.read()
    print(f"Downloaded {len(raw) / (1024 * 1024):.1f} MB")

    df = (
        # infer_schema_length=0 reads every column as a string; coordinates are
        # cast explicitly so unparseable values become null and are dropped.
        pl.read_csv(io.BytesIO(raw), infer_schema_length=0)
        .with_columns(
            pl.col("Latitude").cast(pl.Float64, strict=False),
            pl.col("Longitude").cast(pl.Float64, strict=False),
        )
        .drop_nulls(subset=["Latitude", "Longitude"])
        .filter(pl.col("StopType").is_in(list(STOP_TYPES)))
    )
    df = filter_active_stops(df).select(
        pl.col("ATCOCode").alias("id"),
        pl.col("CommonName").alias("name"),
        pl.col("StopType").replace(STOP_TYPES).alias("category"),
        pl.col("Latitude").alias("lat"),
        pl.col("Longitude").alias("lng"),
        pl.col("NptgLocalityCode").alias("locality"),
        pl.col("StopType").is_in(list(ENTRANCE_STOP_TYPES)).alias("entrance"),
        pl.col("ATCOCode")
        .str.contains(LONDON_UNDERGROUND_ATCO_PATTERN)
        .fill_null(False)
        .alias("is_lu"),
    )

    before = len(df)
    df = deduplicate_naptan(df)
    print(
        f"Deduplicated {before:,} → {len(df):,} stops "
        "(by name+category+locality; stations by normalized name+area)"
    )

    df.write_parquet(output)
    size_mb = output.stat().st_size / (1024 * 1024)
    print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} stations)")

    counts = df.group_by("category").len().sort("len", descending=True)
    for row in counts.iter_rows(named=True):
        print(f"  {row['category']}: {row['len']:,}")


def main() -> None:
    """Command-line entry point: parse ``--output`` and run the download."""
    parser = argparse.ArgumentParser(description="Download NaPTAN station data")
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    args = parser.parse_args()
    download_naptan(args.output)


if __name__ == "__main__":
    main()