"""Extract place=* nodes and railway stations from OSM PBF → data/places.parquet. Extracts named place nodes and railway stations (tube, national rail, DLR, etc.) for typeahead search. Reuses the same england-latest.osm.pbf as pois.py. """ import argparse import re from pathlib import Path import osmium import polars as pl from shapely.geometry import Point from tqdm import tqdm from pipeline.utils.england_geometry import ( ENGLAND_BBOX_EAST, ENGLAND_BBOX_NORTH, ENGLAND_BBOX_SOUTH, ENGLAND_BBOX_WEST, load_england_polygon, ) # Search can use a wider set of OSM place nodes, but travel-time destinations # must remain restricted to the historical city/station origin set. SEARCH_PLACE_TYPES = { "city", "town", "village", "suburb", "neighbourhood", "quarter", "borough", "locality", "hamlet", "isolated_dwelling", "island", } TRAVEL_DESTINATION_PLACE_TYPES = {"city"} # Suffixes to strip from raw station names before appending the typed suffix. _STATION_STRIP = ( " tube station", " underground station", " railway station", " dlr station", " station dlr", " dlr", " overground station", " tram stop", " station", ) _DLR_CODE_RE = re.compile(r"ZZDL([A-Z0-9]{3})") def _is_dlr_station(tags: dict[str, str]) -> bool: name = tags.get("name", "").lower() network = tags.get("network", "").lower() operator = tags.get("operator", "").lower() return ( "docklands" in network or "dlr" in network or "docklands" in operator or "dlr" in operator or name.endswith(" dlr") or " dlr " in name ) def _is_tram_station(tags: dict[str, str]) -> bool: if _is_dlr_station(tags): return False station_tag = tags.get("station", "") network = tags.get("network", "").lower() return station_tag == "light_rail" or "tramlink" in network or "tram" in network def _station_display_name(name: str, tags: dict[str, str]) -> str: """Build a descriptive station name like 'Bank tube station'.""" station_tag = tags.get("station", "") network = tags.get("network", "").lower() if station_tag == "subway" or "underground" in network: suffix = "tube station" elif "docklands" in network or "dlr" in network: suffix = "DLR station" elif "overground" in network: suffix = "overground station" elif "elizabeth" in network: suffix = "Elizabeth line station" elif station_tag == "light_rail" or "tramlink" in network or "tram" in network: suffix = "tram stop" else: suffix = "railway station" # Strip any existing station suffix from the raw name lower = name.lower() for s in _STATION_STRIP: if lower.endswith(s): name = name[: len(name) - len(s)].rstrip() break return f"{name} {suffix}" def _station_name_score(name: str) -> tuple[int, int]: lower = name.lower() suffix_penalty = int( lower.endswith( ( " underground station", " tube station", " dlr station", " railway station", " rail station", " station dlr", " station", ) ) or lower.endswith(" dlr") ) return (suffix_penalty, len(name)) def _naptan_dlr_stations(naptan_path: Path) -> list[dict]: """Extract station-level DLR destinations from NaPTAN access nodes.""" df = pl.read_parquet(naptan_path) required = {"id", "name", "category", "lat", "lng"} missing = required - set(df.columns) if missing: raise ValueError(f"NaPTAN file is missing columns: {sorted(missing)}") rows: dict[str, dict] = {} for row in df.iter_rows(named=True): atco_id = str(row["id"] or "") match = _DLR_CODE_RE.search(atco_id) if not match: continue if row["category"] not in {"Tube station", "Rail station"}: continue code = match.group(1) raw_name = str(row["name"] or "") if not raw_name: continue lat = float(row["lat"]) lon = float(row["lng"]) current = rows.get(code) if current is None: rows[code] = { "raw_name": raw_name, "lat_sum": lat, "lon_sum": lon, "count": 1, } continue current["lat_sum"] += lat current["lon_sum"] += lon current["count"] += 1 if _station_name_score(raw_name) < _station_name_score(current["raw_name"]): current["raw_name"] = raw_name stations = [] for station in rows.values(): count = station["count"] display_name = _station_display_name(station["raw_name"], {"network": "DLR"}) stations.append( { "name": display_name, "place_type": "station", "lat": station["lat_sum"] / count, "lon": station["lon_sum"] / count, "population": 0, "travel_destination": True, } ) return sorted(stations, key=lambda station: station["name"]) def _append_naptan_dlr_stations(places: list[dict], naptan_path: Path) -> int: existing_names = {str(place["name"]).casefold() for place in places} added = 0 for station in _naptan_dlr_stations(naptan_path): key = station["name"].casefold() if key in existing_names: continue places.append(station) existing_names.add(key) added += 1 return added class PlaceHandler(osmium.SimpleHandler): def __init__(self, progress: tqdm, england_polygon) -> None: super().__init__() self._progress = progress self.places: list[dict] = [] self._england = england_polygon def _add( self, name: str, place_type: str, lat: float, lon: float, population: int, travel_destination: bool, ) -> None: self.places.append( { "name": name, "place_type": place_type, "lat": lat, "lon": lon, "population": population, "travel_destination": travel_destination, } ) self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False) def node(self, n: osmium.osm.Node) -> None: self._progress.update(1) if not n.location.valid: return lat, lon = n.location.lat, n.location.lon if not ( ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST ): return if not self._england.contains(Point(lon, lat)): return name = n.tags.get("name:en", n.tags.get("name", "")) if not name: return pop_str = n.tags.get("population", "") try: population = int(pop_str) except ValueError: population = 0 # place=* nodes place_type = n.tags.get("place") if place_type in SEARCH_PLACE_TYPES: self._add( name, place_type, lat, lon, population, travel_destination=place_type in TRAVEL_DESTINATION_PLACE_TYPES, ) return # Railway stations (tube, national rail, DLR, overground, Elizabeth line) if n.tags.get("railway") == "station": tags = dict(n.tags) if _is_tram_station(tags): return display_name = _station_display_name(name, tags) self._add( display_name, "station", lat, lon, population, travel_destination=True, ) return def main() -> None: parser = argparse.ArgumentParser(description="Extract place names from OSM PBF") parser.add_argument( "--output", type=Path, required=True, help="Output parquet file path" ) parser.add_argument("--pbf", type=Path, required=True, help="Path to OSM PBF file") parser.add_argument( "--boundary", type=Path, required=True, help="England boundary GeoJSON file", ) parser.add_argument( "--naptan", type=Path, help="Optional NaPTAN parquet file used to add DLR station destinations", ) args = parser.parse_args() pbf_file = args.pbf england_polygon = load_england_polygon(args.boundary) print("Extracting search place nodes + railway stations") with tqdm( unit=" elements", unit_scale=True, desc="Streaming", smoothing=0.05, mininterval=1.0, ) as progress: handler = PlaceHandler(progress, england_polygon) handler.apply_file(str(pbf_file), locations=True) print(f"Extracted {len(handler.places):,} place nodes") if args.naptan: added = _append_naptan_dlr_stations(handler.places, args.naptan) print(f"Added {added:,} DLR station destinations from NaPTAN") if handler.places: df = pl.DataFrame(handler.places) args.output.parent.mkdir(parents=True, exist_ok=True) df.write_parquet(args.output) print(f"Saved to {args.output}") else: print("No places found — skipping output") if __name__ == "__main__": main()