99 lines
2.8 KiB
Python
99 lines
2.8 KiB
Python
"""Extract place=* nodes from OSM PBF → data/places.parquet.
|
|
|
|
Extracts named place nodes (cities, towns, suburbs, etc.) for typeahead search.
|
|
Reuses the same great-britain-latest.osm.pbf as pois.py.
|
|
"""
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
import osmium
|
|
import polars as pl
|
|
from tqdm import tqdm
|
|
|
|
from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST, download_pbf
|
|
|
|
# Values of the OSM ``place`` tag that are extracted; a node whose ``place``
# tag is missing or holds any other value is ignored by PlaceHandler.
PLACE_TYPES = {
    "city", "borough", "town",
    "suburb", "neighbourhood",
    "village", "hamlet",
    "locality", "isolated_dwelling",
}
|
|
|
|
|
|
class PlaceHandler(osmium.SimpleHandler):
    """Collect named ``place=*`` nodes that fall inside the UK bounding box.

    Each accepted node is appended to ``self.places`` as a dict with the keys
    ``name``, ``place_type``, ``lat`` and ``lon``.
    """

    def __init__(self, progress: tqdm) -> None:
        """Create the handler.

        Args:
            progress: tqdm bar advanced by one for every node visited; its
                postfix shows the running count of accepted places.
        """
        super().__init__()
        self._progress = progress
        self.places: list[dict] = []

    def node(self, n: osmium.osm.Node) -> None:
        """Record ``n`` if it is a named place of a wanted type inside the bbox.

        Args:
            n: Node handed over by osmium while streaming the file.
        """
        self._progress.update(1)

        location = n.location
        # ``valid`` is a method on osmium's Location; referencing it without
        # calling it is always truthy, which would disable this guard and let
        # the coordinate access below raise on an invalid location.
        if not location.valid():
            return

        lat, lon = location.lat, location.lon
        inside_bbox = (
            UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
            and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
        )
        if not inside_bbox:
            return

        place_type = n.tags.get("place")
        if place_type not in PLACE_TYPES:
            return

        # Prefer the English name, but fall back to ``name`` when ``name:en``
        # is absent *or* present-but-empty, so a blank translation does not
        # discard an otherwise usable place.
        name = n.tags.get("name:en") or n.tags.get("name")
        if not name:
            return

        self.places.append(
            {"name": name, "place_type": place_type, "lat": lat, "lon": lon}
        )
        # refresh=False: update the postfix text without forcing a redraw.
        self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False)
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: extract place nodes from a PBF into parquet.

    Command-line flags:
        --output: Required. Parquet file to write; missing parent directories
            are created.
        --pbf: Optional path to an existing PBF file. If it is omitted or does
            not exist, ``data/great-britain-latest.osm.pbf`` is used and
            downloaded via ``download_pbf`` when it is not already on disk.

    Nothing is written when no place nodes were found.
    """
    parser = argparse.ArgumentParser(
        description="Extract place names from OSM PBF"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    parser.add_argument(
        "--pbf", type=Path, default=None, help="Path to existing PBF file (skips download)"
    )
    args = parser.parse_args()

    if args.pbf is not None and args.pbf.exists():
        pbf_file = args.pbf
        print(f"Using existing PBF: {pbf_file}")
    else:
        if args.pbf is not None:
            # Make the fallback visible instead of silently ignoring the
            # path the user asked for.
            print(f"PBF not found: {args.pbf} — falling back to default PBF")
        pbf_file = Path("data/great-britain-latest.osm.pbf")
        if not pbf_file.exists():
            download_pbf(pbf_file)
        else:
            print(f"Using cached PBF: {pbf_file}")

    print(f"Extracting place nodes: {sorted(PLACE_TYPES)}")
    with tqdm(
        unit=" elements",
        unit_scale=True,
        desc="Streaming",
        smoothing=0.05,
        mininterval=1.0,
    ) as progress:
        handler = PlaceHandler(progress)
        # Nodes carry their own coordinates. ``locations=True`` only adds a
        # node-location cache used to give way nodes coordinates, which this
        # node-only handler never needs.
        handler.apply_file(str(pbf_file), locations=False)

    print(f"Extracted {len(handler.places):,} place nodes")

    if handler.places:
        df = pl.DataFrame(handler.places)
        args.output.parent.mkdir(parents=True, exist_ok=True)
        df.write_parquet(args.output)
        print(f"Saved to {args.output}")
    else:
        print("No places found — skipping output")
|
|
|
|
|
|
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|