# perfect-postcode/pipeline/download/places.py

"""Extract place=* nodes and railway stations from OSM PBF → data/places.parquet.

Extracts named place nodes and railway stations (tube, national rail, DLR,
etc.) for typeahead search.
Reuses the same england-latest.osm.pbf as pois.py.
"""

import argparse
import re
from pathlib import Path

import osmium
import polars as pl
from shapely.geometry import Point
from tqdm import tqdm

from pipeline.utils.england_geometry import (
    ENGLAND_BBOX_EAST,
    ENGLAND_BBOX_NORTH,
    ENGLAND_BBOX_SOUTH,
    ENGLAND_BBOX_WEST,
    load_england_polygon,
)

# Search can use a wider set of OSM place nodes, but travel-time destinations
# must remain restricted to the historical city/station origin set.
#
# SEARCH_PLACE_TYPES: values of the OSM ``place`` tag that are kept as
# searchable entries; nodes with any other ``place`` value are not stored as
# places.
SEARCH_PLACE_TYPES = {
    "city",
    "town",
    "village",
    "suburb",
    "neighbourhood",
    "quarter",
    "borough",
    "locality",
    "hamlet",
    "isolated_dwelling",
    "island",
}
# Subset of place types flagged ``travel_destination=True`` when stored
# (railway stations are flagged separately, regardless of this set).
TRAVEL_DESTINATION_PLACE_TYPES = {"city"}

# Suffixes to strip from raw station names before appending the typed suffix.
_STATION_STRIP = (
    " tube station",
    " underground station",
    " railway station",
    " dlr station",
    " station dlr",
    " dlr",
    " overground station",
    " tram stop",
    " station",
)

_DLR_CODE_RE = re.compile(r"ZZDL([A-Z0-9]{3})")


def _is_dlr_station(tags: dict[str, str]) -> bool:
    name = tags.get("name", "").lower()
    network = tags.get("network", "").lower()
    operator = tags.get("operator", "").lower()
    return (
        "docklands" in network
        or "dlr" in network
        or "docklands" in operator
        or "dlr" in operator
        or name.endswith(" dlr")
        or " dlr " in name
    )


def _is_tram_station(tags: dict[str, str]) -> bool:
    """Return True when the tags describe a tram / light-rail stop.

    DLR stations are never treated as trams, even when tagged
    ``station=light_rail``. Otherwise a stop is a tram when ``station`` is
    exactly "light_rail" or its ``network`` (case-insensitive) mentions
    "tramlink" or "tram".
    """
    # DLR takes precedence over the generic light-rail check.
    if _is_dlr_station(tags):
        return False
    if tags.get("station", "") == "light_rail":
        return True
    network_name = tags.get("network", "").lower()
    return "tramlink" in network_name or "tram" in network_name


def _station_display_name(name: str, tags: dict[str, str]) -> str:
    """Build a descriptive station name like 'Bank tube station'.

    Args:
        name: Raw station name; an existing station-like suffix listed in
            ``_STATION_STRIP`` is removed (case-insensitively) first.
        tags: Tags of the station, used to pick the typed suffix.

    Returns:
        ``"<base name> <suffix>"`` where the suffix is chosen, in priority
        order, as tube / DLR / overground / Elizabeth line / tram stop, and
        "railway station" when nothing more specific matches.
    """
    station_tag = tags.get("station", "")
    network = tags.get("network", "").lower()

    if station_tag == "subway" or "underground" in network:
        suffix = "tube station"
    elif _is_dlr_station(tags):
        # Use the same DLR test as the tram filter. Checking only `network`
        # here let DLR stations recognised by operator or name (and tagged
        # station=light_rail) fall through to the "tram stop" label.
        suffix = "DLR station"
    elif "overground" in network:
        suffix = "overground station"
    elif "elizabeth" in network:
        suffix = "Elizabeth line station"
    elif station_tag == "light_rail" or "tramlink" in network or "tram" in network:
        suffix = "tram stop"
    else:
        suffix = "railway station"

    # Strip any existing station suffix from the raw name. Only the first
    # matching entry is removed, so the order of _STATION_STRIP matters.
    lower = name.lower()
    for s in _STATION_STRIP:
        if lower.endswith(s):
            name = name[: len(name) - len(s)].rstrip()
            break

    return f"{name} {suffix}"


def _station_name_score(name: str) -> tuple[int, int]:
    lower = name.lower()
    suffix_penalty = int(
        lower.endswith(
            (
                " underground station",
                " tube station",
                " dlr station",
                " railway station",
                " rail station",
                " station dlr",
                " station",
            )
        )
        or lower.endswith(" dlr")
    )
    return (suffix_penalty, len(name))


def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
    """Extract station-level DLR destinations from NaPTAN access nodes.

    Rows are kept when their ``id`` matches ``_DLR_CODE_RE`` and their
    ``category`` is "Tube station" or "Rail station". Rows sharing the same
    three-character code are merged into one station: coordinates are
    averaged and the best raw name (per ``_station_name_score``) is kept.

    Args:
        naptan_path: Parquet file with at least the columns ``id``, ``name``,
            ``category``, ``lat`` and ``lng``.

    Returns:
        Place dicts (``name``, ``place_type``, ``lat``, ``lon``,
        ``population``, ``travel_destination``) sorted by display name.

    Raises:
        ValueError: If any required column is missing from the file.
    """
    df = pl.read_parquet(naptan_path)
    required = {"id", "name", "category", "lat", "lng"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"NaPTAN file is missing columns: {sorted(missing)}")

    rows: dict[str, dict] = {}
    for row in df.iter_rows(named=True):
        atco_id = str(row["id"] or "")
        match = _DLR_CODE_RE.search(atco_id)
        if not match:
            continue
        if row["category"] not in {"Tube station", "Rail station"}:
            continue

        code = match.group(1)
        raw_name = str(row["name"] or "")
        if not raw_name:
            continue

        # A null coordinate would make float() raise TypeError and abort the
        # whole extraction; skip such rows so they do not affect the average.
        if row["lat"] is None or row["lng"] is None:
            continue
        lat = float(row["lat"])
        lon = float(row["lng"])

        current = rows.get(code)
        if current is None:
            rows[code] = {
                "raw_name": raw_name,
                "lat_sum": lat,
                "lon_sum": lon,
                "count": 1,
            }
            continue

        current["lat_sum"] += lat
        current["lon_sum"] += lon
        current["count"] += 1
        # Keep the cleanest name seen so far for this station code.
        if _station_name_score(raw_name) < _station_name_score(current["raw_name"]):
            current["raw_name"] = raw_name

    stations = []
    for station in rows.values():
        count = station["count"]
        # Force the DLR suffix regardless of how the raw name is written.
        display_name = _station_display_name(station["raw_name"], {"network": "DLR"})
        stations.append(
            {
                "name": display_name,
                "place_type": "station",
                "lat": station["lat_sum"] / count,
                "lon": station["lon_sum"] / count,
                "population": 0,
                "travel_destination": True,
            }
        )

    return sorted(stations, key=lambda station: station["name"])


def _append_naptan_dlr_stations(places: list[dict], naptan_path: Path) -> int:
    """Append NaPTAN DLR stations that are not already present in *places*.

    Presence is decided by case-folded display name. *places* is extended in
    place, and the number of stations appended is returned.
    """
    seen = {str(place["name"]).casefold() for place in places}
    size_before = len(places)
    for candidate in _naptan_dlr_stations(naptan_path):
        folded = candidate["name"].casefold()
        if folded not in seen:
            seen.add(folded)
            places.append(candidate)
    return len(places) - size_before


class PlaceHandler(osmium.SimpleHandler):
    """Collect named place nodes and railway stations located inside England.

    Matching nodes are accumulated in ``self.places`` as dicts with the keys
    ``name``, ``place_type``, ``lat``, ``lon``, ``population`` and
    ``travel_destination``.
    """

    def __init__(self, progress: tqdm, england_polygon) -> None:
        """Store the progress bar and the polygon used for containment tests.

        Args:
            progress: Progress bar advanced once per visited node.
            england_polygon: Geometry whose ``contains`` method decides
                whether a node lies inside England.
        """
        super().__init__()
        self._progress = progress
        self.places: list[dict] = []
        self._england = england_polygon

    def _add(
        self,
        name: str,
        place_type: str,
        lat: float,
        lon: float,
        population: int,
        travel_destination: bool,
    ) -> None:
        """Record one place and refresh the running count on the progress bar."""
        self.places.append(
            {
                "name": name,
                "place_type": place_type,
                "lat": lat,
                "lon": lon,
                "population": population,
                "travel_destination": travel_destination,
            }
        )
        self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False)

    def node(self, n: osmium.osm.Node) -> None:
        """Handle one OSM node, storing it if it is a wanted place or station.

        A node is stored when it has a name, lies inside the England bounding
        box and polygon, and is either a ``place`` of a searchable type or a
        non-tram ``railway=station``.
        """
        self._progress.update(1)

        # Cheap tag checks first: the vast majority of nodes are neither a
        # place nor a station, so they never reach the point-in-polygon test.
        tags = n.tags
        place_type = tags.get("place")
        is_place = place_type in SEARCH_PLACE_TYPES
        is_station = tags.get("railway") == "station"
        if not (is_place or is_station):
            return

        name = tags.get("name:en", tags.get("name", ""))
        if not name:
            return

        # `valid` is a method on pyosmium locations; without the call the
        # bound method is always truthy and the guard can never trigger.
        if not n.location.valid():
            return
        lat, lon = n.location.lat, n.location.lon
        if not (
            ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH
            and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST
        ):
            return
        if not self._england.contains(Point(lon, lat)):
            return

        # Non-numeric or missing population values fall back to 0.
        pop_str = tags.get("population", "")
        try:
            population = int(pop_str)
        except ValueError:
            population = 0

        # place=* nodes (take priority over a railway=station tag)
        if is_place:
            self._add(
                name,
                place_type,
                lat,
                lon,
                population,
                travel_destination=place_type in TRAVEL_DESTINATION_PLACE_TYPES,
            )
            return

        # Railway stations (tube, national rail, DLR, overground, Elizabeth line)
        station_tags = dict(tags)
        if _is_tram_station(station_tags):
            return
        display_name = _station_display_name(name, station_tags)
        self._add(
            display_name,
            "station",
            lat,
            lon,
            population,
            travel_destination=True,
        )


def main() -> None:
    """Command-line entry point: stream the PBF and write the places parquet.

    Parses ``--output``, ``--pbf``, ``--boundary`` (all required) and the
    optional ``--naptan``, runs ``PlaceHandler`` over the PBF, optionally
    appends NaPTAN DLR stations, and writes the result to ``--output``.
    Nothing is written when no places were found.
    """
    cli = argparse.ArgumentParser(description="Extract place names from OSM PBF")
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    cli.add_argument("--pbf", type=Path, required=True, help="Path to OSM PBF file")
    cli.add_argument(
        "--boundary",
        type=Path,
        required=True,
        help="England boundary GeoJSON file",
    )
    cli.add_argument(
        "--naptan",
        type=Path,
        help="Optional NaPTAN parquet file used to add DLR station destinations",
    )
    opts = cli.parse_args()

    boundary = load_england_polygon(opts.boundary)

    print("Extracting search place nodes + railway stations")
    bar_options = {
        "unit": " elements",
        "unit_scale": True,
        "desc": "Streaming",
        "smoothing": 0.05,
        "mininterval": 1.0,
    }
    with tqdm(**bar_options) as bar:
        handler = PlaceHandler(bar, boundary)
        handler.apply_file(str(opts.pbf), locations=True)

    found = handler.places
    print(f"Extracted {len(found):,} place nodes")
    if opts.naptan:
        n_added = _append_naptan_dlr_stations(found, opts.naptan)
        print(f"Added {n_added:,} DLR station destinations from NaPTAN")

    # Guard clause: nothing to write, so leave the output path untouched.
    if not found:
        print("No places found — skipping output")
        return

    frame = pl.DataFrame(found)
    opts.output.parent.mkdir(parents=True, exist_ok=True)
    frame.write_parquet(opts.output)
    print(f"Saved to {opts.output}")


# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()