perfect-postcode/pipeline/download/places.py
2026-03-14 21:36:00 +00:00

153 lines
4.6 KiB
Python

"""Extract place=* nodes and railway stations from OSM PBF → data/places.parquet.
Extracts named place nodes (currently only place=city, see PLACE_TYPES) and
railway stations (tube, national rail, DLR, etc.) for typeahead search.
Reuses the same great-britain-latest.osm.pbf as pois.py.
"""
import argparse
from pathlib import Path
import osmium
import polars as pl
from tqdm import tqdm
from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST
# Values of the OSM `place` tag that are extracted as places; nodes whose
# `place` value is not in this set are ignored by the place branch.
PLACE_TYPES = {"city"}
# Suffixes to strip from raw station names before appending the typed suffix.
_STATION_STRIP = (
" tube station",
" underground station",
" railway station",
" dlr station",
" overground station",
" tram stop",
" station",
)
def _station_display_name(name: str, tags: dict[str, str]) -> str:
"""Build a descriptive station name like 'Bank tube station'."""
station_tag = tags.get("station", "")
network = tags.get("network", "").lower()
if station_tag == "subway" or "underground" in network:
suffix = "tube station"
elif "docklands" in network or "dlr" in network:
suffix = "DLR station"
elif "overground" in network:
suffix = "overground station"
elif "elizabeth" in network:
suffix = "Elizabeth line station"
elif station_tag == "light_rail" or "tramlink" in network or "tram" in network:
suffix = "tram stop"
else:
suffix = "railway station"
# Strip any existing station suffix from the raw name
lower = name.lower()
for s in _STATION_STRIP:
if lower.endswith(s):
name = name[: len(name) - len(s)].rstrip()
break
return f"{name} {suffix}"
class PlaceHandler(osmium.SimpleHandler):
    """Collect named place nodes and railway stations from an OSM stream.

    Every match is appended to ``self.places`` as a dict with the keys
    ``name``, ``place_type``, ``lat``, ``lon`` and ``population``.
    """

    def __init__(self, progress: tqdm) -> None:
        """Create a handler that reports to ``progress``.

        Args:
            progress: Progress bar advanced by one for every node visited and
                annotated with the running number of collected places.
        """
        super().__init__()
        self._progress = progress
        self.places: list[dict] = []

    def _add(
        self, name: str, place_type: str, lat: float, lon: float, population: int
    ) -> None:
        """Record one place and update the progress bar's running count."""
        self.places.append(
            {
                "name": name,
                "place_type": place_type,
                "lat": lat,
                "lon": lon,
                "population": population,
            }
        )
        self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False)

    @staticmethod
    def _is_tram_stop(station_tag: str, network: str) -> bool:
        """Return True when a station would be labelled as a tram stop.

        Follows the same precedence as ``_station_display_name``: tube, DLR,
        overground and Elizabeth line stations are never tram stops, even
        when tagged ``station=light_rail``. Without this precedence DLR
        stations would be dropped together with the trams.

        Args:
            station_tag: Value of the ``station`` tag ("" when absent).
            network: Lower-cased value of the ``network`` tag ("" when absent).
        """
        if station_tag == "subway":
            return False
        rail_networks = ("underground", "docklands", "dlr", "overground", "elizabeth")
        if any(keyword in network for keyword in rail_networks):
            return False
        return station_tag == "light_rail" or "tram" in network

    def node(self, n: osmium.osm.Node) -> None:
        """Inspect one OSM node and record it if it is a place or a station.

        Nodes without a valid location, outside the UK bounding box, or
        without a name are ignored.
        """
        self._progress.update(1)
        if not n.location.valid:
            return

        lat, lon = n.location.lat, n.location.lon
        if not (
            UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
            and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
        ):
            return

        # Prefer the English name, fall back to the default name.
        name = n.tags.get("name:en", n.tags.get("name", ""))
        if not name:
            return

        # A missing or non-integer population tag is stored as 0.
        pop_str = n.tags.get("population", "")
        try:
            population = int(pop_str)
        except ValueError:
            population = 0

        # place=* nodes whose value is listed in PLACE_TYPES
        place_type = n.tags.get("place")
        if place_type in PLACE_TYPES:
            self._add(name, place_type, lat, lon, population)
            return

        # Railway stations (tube, national rail, DLR, overground, Elizabeth line)
        if n.tags.get("railway") != "station":
            return

        tags = dict(n.tags)
        station_tag = tags.get("station", "")
        network = tags.get("network", "").lower()

        # Skip tram stops, but keep DLR and other rail networks that may also
        # be tagged station=light_rail.
        if self._is_tram_stop(station_tag, network):
            return

        display_name = _station_display_name(name, tags)
        self._add(display_name, "station", lat, lon, population)
def main() -> None:
    """Command-line entry point: stream a PBF file and write places to parquet.

    Reads the required ``--output`` and ``--pbf`` arguments, runs a
    ``PlaceHandler`` over the PBF file with node locations enabled, and writes
    the collected places to the output path. Nothing is written when no
    places were collected.
    """
    cli = argparse.ArgumentParser(description="Extract place names from OSM PBF")
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    cli.add_argument("--pbf", type=Path, required=True, help="Path to OSM PBF file")
    opts = cli.parse_args()

    print("Extracting place nodes: cities + railway stations")

    bar_options = {
        "unit": " elements",
        "unit_scale": True,
        "desc": "Streaming",
        "smoothing": 0.05,
        "mininterval": 1.0,
    }
    with tqdm(**bar_options) as progress:
        handler = PlaceHandler(progress)
        handler.apply_file(str(opts.pbf), locations=True)

    found = handler.places
    print(f"Extracted {len(found):,} place nodes")

    # Guard clause: with nothing collected there is no file to write.
    if not found:
        print("No places found — skipping output")
        return

    destination = opts.output
    destination.parent.mkdir(parents=True, exist_ok=True)
    pl.DataFrame(found).write_parquet(destination)
    print(f"Saved to {destination}")


if __name__ == "__main__":
    main()