# perfect-postcode/pipeline/download/places.py

"""Extract places, stations, and universities → data/places.parquet.

Extracts named place nodes and railway stations (tube, national rail, DLR,
etc.) for typeahead search. Official English university providers from the
Office for Students register can also be added as travel-time destinations,
and named streets (``--include-streets``) and high-value named POIs
(``--pois``) can optionally be added as search-only results.
Reuses the same england-latest.osm.pbf as pois.py.
"""

import argparse
import re
from pathlib import Path

import numpy as np
import osmium
import polars as pl
from scipy.spatial import cKDTree
from shapely.geometry import Point
from pyproj import Transformer
from tqdm import tqdm

from pipeline.utils.england_geometry import (
    ENGLAND_BBOX_EAST,
    ENGLAND_BBOX_NORTH,
    ENGLAND_BBOX_SOUTH,
    ENGLAND_BBOX_WEST,
    load_england_polygon,
)

# Search can use a wider set of OSM place nodes, but travel-time destinations
# must remain restricted to the historical city/station origin set.
#
# SEARCH_PLACE_TYPES holds the OSM `place=*` values that PlaceHandler.node keeps
# as searchable gazetteer rows; any other `place` value is not treated as a place.
SEARCH_PLACE_TYPES = {
    "city",
    "town",
    "village",
    "suburb",
    "neighbourhood",
    "quarter",
    "borough",
    "locality",
    "hamlet",
    "isolated_dwelling",
    "island",
}
# Place types whose rows are written with travel_destination=True. (Stations are
# flagged as destinations separately, where they are added.)
TRAVEL_DESTINATION_PLACE_TYPES = {"city"}

# Named OSM highways worth surfacing as searchable streets (N). Service roads, footways,
# cycleways and motorways are deliberately excluded.
# PlaceHandler.way compares a way's `highway=*` value against this set; ways with any
# other value are skipped before their geometry is inspected.
SEARCHABLE_HIGHWAY_TYPES = {
    "residential",
    "unclassified",
    "tertiary",
    "tertiary_link",
    "secondary",
    "secondary_link",
    "primary",
    "primary_link",
    "trunk",
    "living_street",
    "pedestrian",
}

# High-value named POIs (M) lifted from uk_pois.parquet into the gazetteer, mapped from the
# OSM "key/value" category onto a search place_type. Everyday shops/amenities are excluded.
# Keys are matched against the POI `category` column; values become the row's `place_type`.
# Several categories intentionally share one place_type (e.g. parks, gardens and commons
# all surface as "park").
HIGH_VALUE_POI_CATEGORIES = {
    "leisure/park": "park",
    "leisure/garden": "park",
    "leisure/nature_reserve": "park",
    "leisure/common": "park",
    "tourism/attraction": "attraction",
    "tourism/theme_park": "attraction",
    "tourism/zoo": "attraction",
    "tourism/museum": "attraction",
    "tourism/gallery": "attraction",
    "amenity/hospital": "hospital",
    "healthcare/hospital": "hospital",
    "shop/mall": "retail",
    "shop/department_store": "retail",
}

# Administrative codes compared against postcode-table columns:
#   ENGLAND_COUNTRY_CODE -> `ctry25cd`, LONDON_REGION_CODE -> `rgn25cd`,
#   LONDON_LAD_PREFIX    -> prefix of `lad25cd`, LONDON_COUNTY_CODES -> `cty25cd`.
ENGLAND_COUNTRY_CODE = "E92000001"
LONDON_REGION_CODE = "E12000007"
LONDON_LAD_PREFIX = "E09"
LONDON_COUNTY_CODES = {"E13000001", "E13000002"}
# Default maximum distance (metres) between a place and its nearest active postcode for
# the postcode's London flag to be applied to the place.
DISPLAY_CITY_NEAREST_POSTCODE_MAX_M = 3_000
# always_xy=True: inputs are (longitude, latitude), outputs are (easting, northing).
WGS84_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)

# England British National Grid (EPSG:27700) bounding box, with margin. ONS NSPL stores
# postcodes that have no grid reference at an out-of-range sentinel position
# (lat=99.999999, long=0.000000), whose paired easting/northing collapse to the grid
# origin (0, 0) (or inf). Requiring coordinates inside this box drops the sentinel from
# every index, so an active postcode lacking a grid ref can never become a false nearest
# neighbour. Note it is the easting lower bound that rejects the (0, 0) origin: the
# northing lower bound is 0.0.
ENGLAND_BNG_MIN_EAST = 50_000.0
ENGLAND_BNG_MAX_EAST = 660_000.0
ENGLAND_BNG_MIN_NORTH = 0.0
ENGLAND_BNG_MAX_NORTH = 660_000.0


def _valid_wgs84_expr() -> pl.Expr:
    """Polars predicate selecting rows whose `lat`/`long` are present and inside
    the England bounding box.

    This rejects null coordinates and the ONS no-grid-reference sentinel
    (lat=99.999999, long=0.0), so such rows never enter a coordinate index.
    """
    lat = pl.col("lat")
    lon = pl.col("long")
    has_coordinates = lat.is_not_null() & lon.is_not_null()
    inside_bbox = lat.is_between(
        ENGLAND_BBOX_SOUTH, ENGLAND_BBOX_NORTH
    ) & lon.is_between(ENGLAND_BBOX_WEST, ENGLAND_BBOX_EAST)
    return has_coordinates & inside_bbox


def _valid_bng_expr() -> pl.Expr:
    """Polars predicate selecting rows whose `east1m`/`north1m` are present and
    inside the England BNG bounding box.

    This rejects nulls and the (0, 0) grid-origin / inf values paired with the
    ONS no-grid-reference sentinel.
    """
    easting = pl.col("east1m")
    northing = pl.col("north1m")
    has_grid_ref = easting.is_not_null() & northing.is_not_null()
    inside_bbox = easting.is_between(
        ENGLAND_BNG_MIN_EAST, ENGLAND_BNG_MAX_EAST
    ) & northing.is_between(ENGLAND_BNG_MIN_NORTH, ENGLAND_BNG_MAX_NORTH)
    return has_grid_ref & inside_bbox

# Suffixes to strip from raw station names before appending the typed suffix.
# Compared against the lower-cased name; only the FIRST matching entry is removed, so
# the generic " station" comes last and the more specific suffixes are tried before it.
_STATION_STRIP = (
    " tube station",
    " underground station",
    " railway station",
    " dlr station",
    " station dlr",
    " dlr",
    " overground station",
    " tram stop",
    " station",
)

# "ZZDL" followed by a three-character code, searched for inside NaPTAN ids; the
# captured code is used to group access nodes into one station.
_DLR_CODE_RE = re.compile(r"ZZDL([A-Z0-9]{3})")
# UK postcode shape (case-insensitive), with optional whitespace between the outward
# and inward parts.
_POSTCODE_RE = re.compile(r"\b([A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2})\b", re.I)
# "london" as a standalone token: not directly preceded or followed by a letter.
_LONDON_TOKEN_RE = re.compile(r"(^|[^a-z])london([^a-z]|$)", re.I)

# Lower-case legal-form suffixes removed from provider names; their presence on a
# legal name also triggers the preference for a trading name.
_NOISY_PROVIDER_SUFFIXES = (
    " higher education corporation",
    " limited",
    " ltd",
)

# Lower-case substrings marking a legal name as a formal charter-style title, for
# which a trading name is preferred when one is available.
_LEGAL_NAME_FALLBACK_MARKERS = (
    "the chancellor",
    "chancellor, masters",
    "chancellor masters",
)


def _is_dlr_station(tags: dict[str, str]) -> bool:
    name = tags.get("name", "").lower()
    network = tags.get("network", "").lower()
    operator = tags.get("operator", "").lower()
    return (
        "docklands" in network
        or "dlr" in network
        or "docklands" in operator
        or "dlr" in operator
        or name.endswith(" dlr")
        or " dlr " in name
    )


def _is_tram_station(tags: dict[str, str]) -> bool:
    """Return True for light-rail / tram stations that are not DLR.

    DLR stations are never treated as trams. Otherwise a station is a tram stop
    when `station=light_rail` or its `network` mentions "tramlink" / "tram".
    """
    if _is_dlr_station(tags):
        return False
    network = tags.get("network", "").lower()
    if tags.get("station", "") == "light_rail":
        return True
    return any(marker in network for marker in ("tramlink", "tram"))


def _station_display_name(name: str, tags: dict[str, str]) -> str:
    """Build a descriptive station name like 'Bank tube station'.

    The type suffix is chosen from the tags, in priority order: tube
    (`station=subway` or an "underground" network), DLR, overground,
    Elizabeth line, tram stop, and finally plain "railway station".

    DLR detection is delegated to `_is_dlr_station` so that this function and
    the tram filter agree: a station recognised as DLR only through its
    `operator` or `name` (and therefore not filtered out as a tram) is labelled
    "DLR station" rather than "tram stop" / "railway station".

    Args:
        name: Raw station name; at most one existing station-type suffix from
            `_STATION_STRIP` is removed before the new suffix is appended.
        tags: Station tags; `station`, `network`, `operator` and `name` are read.

    Returns:
        The cleaned name followed by a single space and the type suffix.
    """
    station_tag = tags.get("station", "")
    network = tags.get("network", "").lower()

    if station_tag == "subway" or "underground" in network:
        suffix = "tube station"
    elif _is_dlr_station(tags):
        suffix = "DLR station"
    elif "overground" in network:
        suffix = "overground station"
    elif "elizabeth" in network:
        suffix = "Elizabeth line station"
    elif station_tag == "light_rail" or "tram" in network:
        # "tram" also covers networks such as "Tramlink".
        suffix = "tram stop"
    else:
        suffix = "railway station"

    # Strip any existing station suffix from the raw name (first match only).
    lower = name.lower()
    for stale_suffix in _STATION_STRIP:
        if lower.endswith(stale_suffix):
            name = name[: len(name) - len(stale_suffix)].rstrip()
            break

    return f"{name} {suffix}"


def _station_name_score(name: str) -> tuple[int, int]:
    lower = name.lower()
    suffix_penalty = int(
        lower.endswith(
            (
                " underground station",
                " tube station",
                " dlr station",
                " railway station",
                " rail station",
                " station dlr",
                " station",
            )
        )
        or lower.endswith(" dlr")
    )
    return (suffix_penalty, len(name))


def _cell_text(value: object) -> str:
    if value is None:
        return ""
    return str(value).strip()


def _header_key(value: object) -> str:
    """Normalise a header cell for matching: lower-case text in which every run
    of characters other than a-z / 0-9 is collapsed to a single space."""
    lowered = _cell_text(value).lower()
    collapsed = re.sub(r"[^a-z0-9]+", " ", lowered)
    return collapsed.strip()


def _find_header_row(rows: list[tuple]) -> int:
    """Return the index of the first row that looks like the OfS register header.

    A header row has one cell mentioning "provider", "legal" and "name", and
    one cell mentioning "right", "use" and "university".

    Raises:
        ValueError: If no row satisfies both conditions.
    """
    legal_name_tokens = ("provider", "legal", "name")
    university_title_tokens = ("right", "use", "university")

    def mentions(keys: list[str], tokens: tuple[str, ...]) -> bool:
        # True when a single cell key contains every token.
        return any(all(token in key for token in tokens) for key in keys)

    for position, row in enumerate(rows):
        keys = [_header_key(cell) for cell in row]
        if mentions(keys, legal_name_tokens) and mentions(
            keys, university_title_tokens
        ):
            return position
    raise ValueError("Could not find the OfS register header row")


def _find_column(headers: list[object], *tokens: str) -> int:
    """Return the index of the first header whose normalised text contains all
    of `tokens`.

    Raises:
        ValueError: If no header contains every token.
    """
    keys = [_header_key(header) for header in headers]
    position = next(
        (idx for idx, key in enumerate(keys) if all(token in key for token in tokens)),
        None,
    )
    if position is None:
        raise ValueError(f"Could not find OfS register column containing {tokens}")
    return position


def _normalize_postcode(postcode: str) -> str:
    return re.sub(r"[^A-Z0-9]", "", postcode.upper())


def _extract_postcode(address: str) -> str | None:
    """Return the first postcode-shaped token in `address`, normalised, or None
    when the address contains none."""
    found = _POSTCODE_RE.search(address)
    return _normalize_postcode(found.group(1)) if found else None


def _clean_provider_name(name: str) -> str:
    """Tidy a provider name for display.

    Steps, in order: collapse whitespace and trim spaces/commas; turn a trailing
    ", the" into a leading "The "; remove at most one noisy legal-form suffix;
    finally drop a leading "The " unless the name is exactly "The Open University".
    """
    cleaned = re.sub(r"\s+", " ", name).strip(" ,")

    if cleaned.lower().endswith(", the"):
        cleaned = "The " + cleaned[:-5].strip(" ,")

    lowered = cleaned.lower()
    noisy_suffix = next(
        (suffix for suffix in _NOISY_PROVIDER_SUFFIXES if lowered.endswith(suffix)),
        None,
    )
    if noisy_suffix is not None:
        cleaned = cleaned[: -len(noisy_suffix)].strip(" ,")

    keeps_article = cleaned == "The Open University"
    if cleaned.startswith("The ") and not keeps_article:
        cleaned = cleaned[4:].strip()
    return cleaned


def _split_trading_names(trading_names: str) -> list[str]:
    """Split a multi-line trading-names cell into cleaned names.

    An empty cell or the literal "not applicable" (any case) yields no names;
    lines that clean down to an empty string are dropped.
    """
    if not trading_names or trading_names.casefold() == "not applicable":
        return []
    cleaned_lines = (_clean_provider_name(line) for line in trading_names.splitlines())
    return [cleaned for cleaned in cleaned_lines if cleaned]


def _needs_trading_name(legal_name: str) -> bool:
    """Return True when the legal name is a poor display name: it contains a
    charter-style marker or ends with a noisy legal-form suffix."""
    lowered = legal_name.lower()
    if any(marker in lowered for marker in _LEGAL_NAME_FALLBACK_MARKERS):
        return True
    return lowered.endswith(_NOISY_PROVIDER_SUFFIXES)


def _select_university_name(legal_name: str, trading_names: str) -> str:
    """Pick the display name for a provider.

    The cleaned legal name is used unless it needs a trading-name fallback and
    at least one trading name exists. In that case the first trading name that
    mentions "university" or "imperial college" wins, otherwise the first
    trading name.
    """
    legal = _clean_provider_name(legal_name)
    candidates = _split_trading_names(trading_names)
    if not _needs_trading_name(legal_name) or not candidates:
        return legal

    preferred = next(
        (
            candidate
            for candidate in candidates
            if "university" in candidate.lower()
            or "imperial college" in candidate.lower()
        ),
        None,
    )
    return candidates[0] if preferred is None else preferred


def _slugify_name(name: str) -> str:
    slug = name.lower()
    slug = re.sub(r"[^a-z0-9 -]", "", slug)
    return re.sub(r"\s+", "-", slug).strip("-")


def _street_centroid(coords: list[tuple[float, float]]) -> tuple[float, float] | None:
    """Average (lat, lon) of a way's vertices."""
    if not coords:
        return None
    count = len(coords)
    lat = sum(lat for lat, _ in coords) / count
    lon = sum(lon for _, lon in coords) / count
    return lat, lon


def _normalize_street_name(name: str) -> str:
    """Grouping key for a street name: collapse whitespace, lowercase."""
    return re.sub(r"\s+", " ", name).strip().lower()


def _outcode_of_postcode(postcode: str) -> str:
    """Outward code (everything before the space) of a postcode, e.g. 'NW1' from 'NW1 6XE'."""
    return postcode.split(" ", 1)[0] if postcode else ""


def _outcode_tree(postcodes_path: Path) -> tuple[cKDTree, list[str]]:
    """Build a nearest-neighbour index from postcode coordinates to their outcode, so each
    street can be tagged with the outcode it sits in (used to disambiguate same-named roads).

    The tree lives in BNG metres (like `_london_postcode_tree`): in raw degrees
    1° of longitude is only ~0.6° of latitude at UK latitudes, which biases
    nearest-postcode picks E-W near outcode boundaries.

    Only active (`doterm` null) England postcodes with an in-range grid reference
    are indexed.

    Returns:
        The KD-tree over (easting, northing) and a list of outcodes aligned with
        the tree's point indices.

    Raises:
        ValueError: If no postcode survives the filters. Without this guard an
            empty tree would only surface later as an opaque indexing error when
            a street looks up its nearest postcode.
    """
    df = (
        pl.read_parquet(
            postcodes_path,
            columns=["pcds", "east1m", "north1m", "ctry25cd", "doterm"],
        )
        .filter((pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col("doterm").is_null())
        .filter(_valid_bng_expr())
    )
    if df.is_empty():
        raise ValueError(f"No active England postcodes in {postcodes_path}")

    coords = np.column_stack(
        [
            df["east1m"].to_numpy().astype(np.float64),
            df["north1m"].to_numpy().astype(np.float64),
        ]
    )
    outcodes = [_outcode_of_postcode(pc) for pc in df["pcds"].to_list()]
    return cKDTree(coords), outcodes


def _build_street_places(
    streets: list[dict],
    tree: cKDTree,
    outcodes: list[str],
) -> list[dict]:
    """Merge raw street segments into one gazetteer row per (street name, outcode).

    Each segment is assigned the outcode of its nearest indexed postcode, then
    segments sharing a normalised name and outcode are merged: the row keeps the
    first-seen spelling and the unweighted mean of the segment centroids. Rows
    are returned sorted case-insensitively by name.
    """
    if not streets:
        return []

    segment_lons = np.array([segment["lon"] for segment in streets], dtype=np.float64)
    segment_lats = np.array([segment["lat"] for segment in streets], dtype=np.float64)
    # The tree is in BNG metres, so project the segment centroids before querying.
    eastings, northings = WGS84_TO_BNG.transform(segment_lons, segment_lats)
    nearest_postcodes = tree.query(np.column_stack([eastings, northings]))[1]

    buckets: dict[tuple[str, str], dict] = {}
    for segment, nearest_idx in zip(streets, nearest_postcodes):
        bucket_key = (_normalize_street_name(segment["name"]), outcodes[nearest_idx])
        bucket = buckets.get(bucket_key)
        if bucket is not None:
            bucket["lat_sum"] += segment["lat"]
            bucket["lon_sum"] += segment["lon"]
            bucket["count"] += 1
            continue
        buckets[bucket_key] = {
            "name": segment["name"],
            "lat_sum": segment["lat"],
            "lon_sum": segment["lon"],
            "count": 1,
        }

    places = [
        {
            "name": bucket["name"],
            "place_type": "street",
            "lat": bucket["lat_sum"] / bucket["count"],
            "lon": bucket["lon_sum"] / bucket["count"],
            "population": 0,
            "travel_destination": False,
            "display_city": None,
        }
        for bucket in buckets.values()
    ]
    places.sort(key=lambda place: place["name"].lower())
    return places


def _poi_dedup_key(name: str, place_type: str, lat: float, lon: float) -> tuple:
    """Geographic de-dup key: round(.,2) is ~1.1km lat / ~0.7km UK lon.

    Coarse enough to collapse the SAME physical POI mapped twice a few metres
    apart, fine enough to keep genuinely distinct same-named POIs in different
    towns (e.g. "Victoria Park" in London vs Bristol).
    """
    return (name.lower(), place_type, round(lat, 2), round(lon, 2))


def _pois_to_places(pois: pl.DataFrame) -> list[dict]:
    """Convert high-value named POIs into gazetteer rows (M).

    Rows are kept only when their category maps to a search place type and the
    stripped name has at least three characters. Later rows whose
    (name, type, rounded coordinates) key has already been seen are dropped.
    """
    if pois.is_empty():
        return []

    seen_keys: set[tuple] = set()
    places: list[dict] = []
    for row in pois.iter_rows(named=True):
        category = str(row.get("category", ""))
        if category not in HIGH_VALUE_POI_CATEGORIES:
            continue
        place_type = HIGH_VALUE_POI_CATEGORIES[category]

        name = str(row.get("name") or "").strip()
        if len(name) < 3:
            continue

        lat, lon = float(row["lat"]), float(row["lng"])
        dedup_key = _poi_dedup_key(name, place_type, lat, lon)
        if dedup_key in seen_keys:
            continue
        seen_keys.add(dedup_key)

        places.append(
            {
                "name": name,
                "place_type": place_type,
                "lat": lat,
                "lon": lon,
                "population": 0,
                "travel_destination": False,
                "display_city": None,
            }
        )
    return places


def _append_high_value_pois(places: list[dict], pois_path: Path) -> int:
    """Append high-value POIs from `pois_path` to `places` in place.

    A POI is skipped when an existing place (or an earlier appended POI) has the
    same de-dup key. Returns the number of rows appended.
    """
    pois = pl.read_parquet(pois_path, columns=["name", "category", "lat", "lng"])
    candidates = _pois_to_places(pois)

    def key_of(place: dict) -> tuple:
        return _poi_dedup_key(
            str(place["name"]), place["place_type"], place["lat"], place["lon"]
        )

    known_keys = {key_of(place) for place in places}
    count_before = len(places)
    for candidate in candidates:
        candidate_key = key_of(candidate)
        if candidate_key not in known_keys:
            known_keys.add(candidate_key)
            places.append(candidate)
    return len(places) - count_before


def _postcode_lookup(postcodes_path: Path) -> dict[str, tuple[float, float]]:
    """Map normalised postcode -> (lat, lon) for active England postcodes that
    have an in-range WGS84 position."""
    active_in_england = (pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col(
        "doterm"
    ).is_null()
    frame = (
        pl.read_parquet(
            postcodes_path,
            columns=["pcds", "lat", "long", "ctry25cd", "doterm"],
        )
        .filter(active_in_england)
        .filter(_valid_wgs84_expr())
    )

    lookup: dict[str, tuple[float, float]] = {}
    for pcds, lat, lon in frame.select(["pcds", "lat", "long"]).iter_rows():
        lookup[_normalize_postcode(pcds)] = (float(lat), float(lon))
    return lookup


def _display_city_from_tags(tags: dict[str, str]) -> str | None:
    """Return "London" when any explicit OSM context tag mentions London as a
    standalone word, otherwise None (admin data is used as a later fallback)."""
    context_keys = (
        "is_in",
        "is_in:city",
        "is_in:town",
        "is_in:county",
        "addr:city",
    )
    context_values = (tags.get(key) for key in context_keys)
    if any(value and _LONDON_TOKEN_RE.search(value) for value in context_values):
        return "London"
    return None


def _parse_population(pop_str: str) -> int:
    """Robustly parse OSM population tags that may carry grouping separators,
    decimals, or surrounding text ("12,345", "5 000", "12345.0", "approx 5000").
    """
    # Take the integer part before any decimal point, then the first run of
    # digits ignoring grouping separators (commas/spaces) and other annotations.
    match = re.search(r"\d[\d,\s]*", pop_str.split(".", 1)[0])
    if match is None:
        return 0
    digits = re.sub(r"\D", "", match.group(0))
    return int(digits) if digits else 0


def _is_london_admin_expr() -> pl.Expr:
    """Polars predicate: True when a postcode row's admin codes place it in London.

    A row matches when its region code equals the London region, its local
    authority code starts with the London borough prefix, or its county code is
    one of the London county codes.

    Nulls are filled on the combined expression rather than on a single term, so
    a missing region or county code can never leave the flag null: a row with no
    positive match is always False.
    """
    in_london_region = pl.col("rgn25cd") == LONDON_REGION_CODE
    in_london_borough = pl.col("lad25cd").str.starts_with(LONDON_LAD_PREFIX)
    in_london_county = pl.col("cty25cd").is_in(LONDON_COUNTY_CODES)
    # With three-valued OR a True term still wins over a null one, so filling
    # the remaining nulls with False only affects rows that matched nothing.
    return (in_london_region | in_london_borough | in_london_county).fill_null(False)


def _london_postcode_tree(postcodes_path: Path) -> tuple[cKDTree, np.ndarray]:
    """Build a KD-tree over active England postcodes in BNG metres, plus a
    boolean array (aligned with the tree's points) flagging London postcodes.

    Raises:
        ValueError: If no postcode survives the filters.
    """
    needed_columns = [
        "doterm",
        "ctry25cd",
        "east1m",
        "north1m",
        "rgn25cd",
        "lad25cd",
        "cty25cd",
    ]
    active_in_england = (pl.col("ctry25cd") == ENGLAND_COUNTRY_CODE) & pl.col(
        "doterm"
    ).is_null()
    frame = (
        pl.read_parquet(postcodes_path, columns=needed_columns)
        .filter(active_in_england)
        .filter(_valid_bng_expr())
        .select("east1m", "north1m", _is_london_admin_expr().alias("is_london"))
    )
    if frame.is_empty():
        raise ValueError(f"No active England postcodes in {postcodes_path}")

    eastings = frame["east1m"].to_numpy().astype(np.float64)
    northings = frame["north1m"].to_numpy().astype(np.float64)
    is_london = frame["is_london"].to_numpy().astype(bool)
    return cKDTree(np.column_stack([eastings, northings])), is_london


def _assign_london_display_city(
    places: list[dict],
    postcodes_path: Path,
    max_distance_m: float = DISPLAY_CITY_NEAREST_POSTCODE_MAX_M,
) -> int:
    """Set display_city="London" on places whose nearest active postcode is a
    London postcode no further than `max_distance_m` away.

    Places that already have a display city, and places of type "city", are left
    untouched. `places` is modified in place; the number of newly labelled
    places is returned.
    """
    if not places:
        return 0

    tree, london_flags = _london_postcode_tree(postcodes_path)
    place_lons = np.array([float(place["lon"]) for place in places], dtype=np.float64)
    place_lats = np.array([float(place["lat"]) for place in places], dtype=np.float64)
    eastings, northings = WGS84_TO_BNG.transform(place_lons, place_lats)
    distances, nearest = tree.query(np.column_stack([eastings, northings]))

    labelled = 0
    for place, distance, nearest_idx in zip(places, distances, nearest):
        if place.get("display_city") or place.get("place_type") == "city":
            continue
        near_london_postcode = distance <= max_distance_m and london_flags[nearest_idx]
        if near_london_postcode:
            place["display_city"] = "London"
            labelled += 1
    return labelled


def _ofs_universities(
    raw: pl.DataFrame, postcode_coords: dict[str, tuple[float, float]]
) -> tuple[list[dict], int]:
    """Turn the raw OfS register sheet into university gazetteer rows.

    The header row is located by content, then every following row marked "yes"
    (any case) in the university-title column is geocoded through the postcode
    found in its contact address.

    Returns:
        (universities, skipped) where `skipped` counts "yes" rows that had no
        usable name or no coordinates for their postcode.
    """
    rows = raw.rows()
    header_idx = _find_header_row(rows)
    headers = list(rows[header_idx])
    legal_col = _find_column(headers, "provider", "legal", "name")
    trading_col = _find_column(headers, "trading", "name")
    address_col = _find_column(headers, "contact", "address")
    title_col = _find_column(headers, "right", "use", "university")

    universities: list[dict] = []
    skipped = 0
    for row in rows[header_idx + 1 :]:
        holds_university_title = _cell_text(row[title_col]).casefold() == "yes"
        if not holds_university_title:
            continue

        name = _select_university_name(
            _cell_text(row[legal_col]), _cell_text(row[trading_col])
        )
        postcode = _extract_postcode(_cell_text(row[address_col]))
        coords = postcode_coords.get(postcode or "")
        if name and coords is not None:
            lat, lon = coords
            universities.append(
                {
                    "name": name,
                    "place_type": "university",
                    "lat": lat,
                    "lon": lon,
                    "population": 0,
                    "travel_destination": True,
                    "display_city": None,
                }
            )
        else:
            skipped += 1

    return universities, skipped


def _append_ofs_universities(
    places: list[dict], register_path: Path, postcodes_path: Path
) -> tuple[int, int]:
    """Append OfS universities to `places` in place, skipping any whose name slug
    matches an existing place or an earlier appended university.

    Returns:
        (added, skipped): rows appended, and register rows that could not be
        geocoded or named.
    """
    postcode_coords = _postcode_lookup(postcodes_path)
    raw = pl.read_excel(register_path, has_header=False)
    universities, skipped = _ofs_universities(raw, postcode_coords)

    known_slugs = {_slugify_name(str(place["name"])) for place in places}
    count_before = len(places)
    for university in universities:
        slug = _slugify_name(university["name"])
        if slug not in known_slugs:
            known_slugs.add(slug)
            places.append(university)
    return len(places) - count_before, skipped


def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
    """Build one station-level DLR destination per station code from NaPTAN
    access nodes.

    Access nodes are grouped by the three-character code found in their id. Each
    station's position is the mean of its nodes, and its name is taken from the
    node whose raw name scores best (no generic suffix, then shortest). Rows are
    returned sorted by display name.

    Raises:
        ValueError: If the parquet file lacks any of the required columns.
    """
    df = pl.read_parquet(naptan_path)
    missing = {"id", "name", "category", "lat", "lng"} - set(df.columns)
    if missing:
        raise ValueError(f"NaPTAN file is missing columns: {sorted(missing)}")

    accepted_categories = {"Tube station", "Tram & Metro stop", "Rail station"}
    by_code: dict[str, dict] = {}
    for row in df.iter_rows(named=True):
        code_match = _DLR_CODE_RE.search(str(row["id"] or ""))
        if not code_match or row["category"] not in accepted_categories:
            continue

        raw_name = str(row["name"] or "")
        if not raw_name:
            continue

        lat = float(row["lat"])
        lon = float(row["lng"])
        code = code_match.group(1)
        station = by_code.get(code)
        if station is None:
            by_code[code] = {
                "raw_name": raw_name,
                "lat_sum": lat,
                "lon_sum": lon,
                "count": 1,
            }
        else:
            station["lat_sum"] += lat
            station["lon_sum"] += lon
            station["count"] += 1
            # Prefer the cleanest raw name seen for this station.
            if _station_name_score(raw_name) < _station_name_score(station["raw_name"]):
                station["raw_name"] = raw_name

    stations = [
        {
            "name": _station_display_name(station["raw_name"], {"network": "DLR"}),
            "place_type": "station",
            "lat": station["lat_sum"] / station["count"],
            "lon": station["lon_sum"] / station["count"],
            "population": 0,
            "travel_destination": True,
            "display_city": None,
        }
        for station in by_code.values()
    ]
    stations.sort(key=lambda station: station["name"])
    return stations


def _append_naptan_dlr_stations(places: list[dict], naptan_path: Path) -> int:
    """Append NaPTAN DLR stations to `places` in place, skipping names that
    already exist (compared case-insensitively). Returns the number appended."""
    known_names = {str(place["name"]).casefold() for place in places}
    count_before = len(places)
    for station in _naptan_dlr_stations(naptan_path):
        folded_name = station["name"].casefold()
        if folded_name not in known_names:
            known_names.add(folded_name)
            places.append(station)
    return len(places) - count_before


class PlaceHandler(osmium.SimpleHandler):
    """Osmium handler collecting searchable place nodes, railway stations and,
    optionally, named street segments that lie inside England.

    Results accumulate on `places` (gazetteer rows) and `streets` (raw
    `{"name", "lat", "lon"}` segments that are grouped into streets later).
    """

    def __init__(
        self, progress: tqdm, england_polygon, *, collect_streets: bool = False
    ) -> None:
        """
        Args:
            progress: Progress bar advanced once per processed element.
            england_polygon: Geometry whose `contains(Point)` clips results to
                England.
            collect_streets: When False, ways are ignored entirely.
        """
        super().__init__()
        self._progress = progress
        # Gazetteer rows for place nodes and stations.
        self.places: list[dict] = []
        # Raw named highway segments; only filled when collect_streets is True.
        self.streets: list[dict] = []
        self._england = england_polygon
        self._collect_streets = collect_streets

    def _add(
        self,
        name: str,
        place_type: str,
        lat: float,
        lon: float,
        population: int,
        travel_destination: bool,
        display_city: str | None = None,
    ) -> None:
        """Append one gazetteer row and refresh the progress-bar place counter."""
        self.places.append(
            {
                "name": name,
                "place_type": place_type,
                "lat": lat,
                "lon": lon,
                "population": population,
                "travel_destination": travel_destination,
                "display_city": display_city,
            }
        )
        self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False)

    def _in_england(self, lat: float, lon: float) -> bool:
        """Return True when the point is inside England: a cheap bounding-box
        test first, then the exact polygon containment test."""
        if not (
            ENGLAND_BBOX_SOUTH <= lat <= ENGLAND_BBOX_NORTH
            and ENGLAND_BBOX_WEST <= lon <= ENGLAND_BBOX_EAST
        ):
            return False
        return self._england.contains(Point(lon, lat))

    def node(self, n: osmium.osm.Node) -> None:
        """Record named `place=*` nodes and non-tram railway stations in England."""
        self._progress.update(1)

        # Cheap tag filters first: almost every node in the extract is an
        # untagged geometry vertex, so rejecting it here avoids running the
        # point-in-polygon test on nodes that could never be recorded.
        raw_tags = n.tags
        place_type = raw_tags.get("place")
        is_place = place_type in SEARCH_PLACE_TYPES
        if not is_place and raw_tags.get("railway") != "station":
            return
        name = raw_tags.get("name:en", raw_tags.get("name", ""))
        if not name:
            return

        # `valid` is a method; referencing it without calling it is always truthy.
        location = n.location
        if not location.valid():
            return
        lat, lon = location.lat, location.lon
        if not self._in_england(lat, lon):
            return

        tags = dict(raw_tags)
        population = _parse_population(tags.get("population", ""))

        # place=* nodes
        if is_place:
            self._add(
                name,
                place_type,
                lat,
                lon,
                population,
                travel_destination=place_type in TRAVEL_DESTINATION_PLACE_TYPES,
                # Cities are never given a parent display city.
                display_city=None
                if place_type == "city"
                else _display_city_from_tags(tags),
            )
            return

        # Railway stations (tube, national rail, DLR, overground, Elizabeth line)
        if _is_tram_station(tags):
            return
        self._add(
            _station_display_name(name, tags),
            "station",
            lat,
            lon,
            population,
            travel_destination=True,
            display_city=_display_city_from_tags(tags),
        )

    def way(self, w: osmium.osm.Way) -> None:
        """Collect named, searchable highways as raw segments (grouped into streets later)."""
        if not self._collect_streets:
            return
        self._progress.update(1)
        if w.tags.get("highway") not in SEARCHABLE_HIGHWAY_TYPES:
            return
        name = w.tags.get("name:en", w.tags.get("name", ""))
        if not name:
            return

        # Way node refs expose resolved .lat/.lon directly (locations=True); accessing them
        # raises InvalidLocationError when a node's location is missing from the index.
        coords: list[tuple[float, float]] = []
        for node in w.nodes:
            try:
                coords.append((node.lat, node.lon))
            except osmium.InvalidLocationError:
                continue

        centroid = _street_centroid(coords)
        if centroid is None:
            return
        lat, lon = centroid
        if not self._in_england(lat, lon):
            return
        self.streets.append({"name": name, "lat": lat, "lon": lon})


def main() -> None:
    """Command-line entry point: extract places from the PBF, add the optional
    extra sources, label London places and write the result as parquet.

    Option combinations are validated before any heavy work starts, so a missing
    `--postcodes` is reported immediately instead of after the full PBF pass.

    Raises:
        ValueError: If `--include-streets` or `--university-register` is given
            without `--postcodes`.
    """
    parser = argparse.ArgumentParser(description="Extract place names from OSM PBF")
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    parser.add_argument("--pbf", type=Path, required=True, help="Path to OSM PBF file")
    parser.add_argument(
        "--boundary",
        type=Path,
        required=True,
        help="England boundary GeoJSON file",
    )
    parser.add_argument(
        "--naptan",
        type=Path,
        help="Optional NaPTAN parquet file used to add DLR station destinations",
    )
    parser.add_argument(
        "--university-register",
        type=Path,
        help="Optional OfS register spreadsheet used to add university destinations",
    )
    parser.add_argument(
        "--postcodes",
        type=Path,
        help=(
            "Postcode parquet used to geocode OfS university contact postcodes, assign "
            "Greater London display labels, and tag streets with their outcode"
        ),
    )
    parser.add_argument(
        "--pois",
        type=Path,
        help="Optional uk_pois.parquet; high-value named POIs are added to the gazetteer",
    )
    parser.add_argument(
        "--include-streets",
        action="store_true",
        help="Extract named highways as searchable streets (requires --postcodes)",
    )
    args = parser.parse_args()

    # Fail fast: both options need the postcode table, and discovering that only
    # after streaming the whole PBF would waste the entire extraction run.
    if args.include_streets and not args.postcodes:
        raise ValueError("--postcodes is required with --include-streets")
    if args.university_register and not args.postcodes:
        raise ValueError("--postcodes is required with --university-register")

    pbf_file = args.pbf
    england_polygon = load_england_polygon(args.boundary)

    print("Extracting search place nodes + railway stations")
    with tqdm(
        unit=" elements",
        unit_scale=True,
        desc="Streaming",
        smoothing=0.05,
        mininterval=1.0,
    ) as progress:
        handler = PlaceHandler(
            progress, england_polygon, collect_streets=args.include_streets
        )
        # locations=True so way nodes carry resolved coordinates.
        handler.apply_file(str(pbf_file), locations=True)

    print(f"Extracted {len(handler.places):,} place nodes")
    if args.include_streets:
        print(f"Collected {len(handler.streets):,} named street segments")
        tree, outcodes = _outcode_tree(args.postcodes)
        street_places = _build_street_places(handler.streets, tree, outcodes)
        handler.places.extend(street_places)
        print(f"Added {len(street_places):,} grouped streets")
    if args.pois:
        added = _append_high_value_pois(handler.places, args.pois)
        print(f"Added {added:,} high-value POIs from {args.pois}")
    if args.naptan:
        added = _append_naptan_dlr_stations(handler.places, args.naptan)
        print(f"Added {added:,} DLR station destinations from NaPTAN")
    if args.university_register:
        added, skipped = _append_ofs_universities(
            handler.places, args.university_register, args.postcodes
        )
        print(f"Added {added:,} university travel destinations from the OfS register")
        if skipped:
            print(f"Skipped {skipped:,} OfS university rows without usable coordinates")

    if handler.places:
        if args.postcodes:
            assigned = _assign_london_display_city(handler.places, args.postcodes)
            print(f"Assigned London display labels to {assigned:,} places")
        for place in handler.places:
            place.setdefault("display_city", None)
        df = pl.DataFrame(handler.places)
        # Pin the column to a string type even when every value is None.
        df = df.with_columns(pl.col("display_city").cast(pl.Utf8))
        args.output.parent.mkdir(parents=True, exist_ok=True)
        df.write_parquet(args.output)
        print(f"Saved to {args.output}")
    else:
        print("No places found — skipping output")


# Run the extraction only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()