Can't even keep track anymore
This commit is contained in:
parent
dccc1e439d
commit
3a3f899ea2
50 changed files with 1144 additions and 560 deletions
99
pipeline/download/places.py
Normal file
99
pipeline/download/places.py
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
"""Extract place=* nodes from OSM PBF → data/places.parquet.
|
||||
|
||||
Extracts named place nodes (cities, towns, suburbs, etc.) for typeahead search.
|
||||
Reuses the same great-britain-latest.osm.pbf as pois.py.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import osmium
|
||||
import polars as pl
|
||||
from tqdm import tqdm
|
||||
|
||||
from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST, download_pbf
|
||||
|
||||
# Values of the OSM ``place`` tag that are kept for typeahead search,
# listed roughly from largest to smallest settlement.
PLACE_TYPES = {
    # Larger settlements
    "city", "borough", "town",
    # Subdivisions of settlements
    "suburb", "neighbourhood",
    # Small settlements and named spots
    "village", "hamlet", "locality", "isolated_dwelling",
}
|
||||
|
||||
|
||||
class PlaceHandler(osmium.SimpleHandler):
    """Collect named ``place=*`` nodes while streaming an OSM file.

    Every node passed to :meth:`node` advances the supplied progress bar.
    A node is kept only when its ``place`` tag is in ``PLACE_TYPES``, its
    location is valid and inside the ``UK_BBOX_*`` bounds, and it has a
    non-empty name (``name:en`` preferred, ``name`` as fallback).

    Kept nodes are appended to ``self.places`` as dicts with the keys
    ``name``, ``place_type``, ``lat`` and ``lon``.
    """

    def __init__(self, progress: tqdm) -> None:
        """Store the progress bar to update and start with no places."""
        super().__init__()
        self._progress = progress
        self.places: list[dict] = []

    def node(self, n: osmium.osm.Node) -> None:
        """Record *n* if it is a named place node inside the UK bounding box."""
        self._progress.update(1)

        # Tag test first: it needs no coordinate access, and every filter
        # here is side-effect free, so the order does not change what is kept.
        place_type = n.tags.get("place")
        if place_type not in PLACE_TYPES:
            return

        # ``valid`` is a method on osmium's Location. Referencing it without
        # calling it yields an always-truthy bound method, so the guard would
        # never fire and reading ``lat``/``lon`` below could raise instead.
        location = n.location
        if not location.valid():
            return

        lat, lon = location.lat, location.lon
        if not (UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH and UK_BBOX_WEST <= lon <= UK_BBOX_EAST):
            return

        # Fall back to ``name`` when ``name:en`` is missing *or* empty.
        name = n.tags.get("name:en") or n.tags.get("name")
        if not name:
            return

        self.places.append(
            {"name": name, "place_type": place_type, "lat": lat, "lon": lon}
        )
        self._progress.set_postfix(places=f"{len(self.places):,}", refresh=False)
|
||||
|
||||
|
||||
def main() -> None:
    """Extract place nodes from an OSM PBF and write them to a parquet file.

    Command-line arguments:
        --output  Path of the parquet file to write (required).
        --pbf     Path to an existing PBF file. When omitted, or when the
                  path does not exist, ``data/great-britain-latest.osm.pbf``
                  is used and downloaded via ``download_pbf`` if absent.

    Nothing is written when no place nodes are found.
    """
    parser = argparse.ArgumentParser(
        description="Extract place names from OSM PBF"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    parser.add_argument(
        "--pbf", type=Path, default=None, help="Path to existing PBF file (skips download)"
    )
    args = parser.parse_args()

    default_pbf = Path("data/great-britain-latest.osm.pbf")
    if args.pbf is not None and args.pbf.exists():
        pbf_file = args.pbf
        print(f"Using existing PBF: {pbf_file}")
    else:
        if args.pbf is not None:
            # Keep the fallback, but do not silently ignore an explicit
            # path that turned out to be wrong.
            print(f"Warning: PBF not found at {args.pbf}; falling back to {default_pbf}")
        pbf_file = default_pbf
        if not pbf_file.exists():
            download_pbf(pbf_file)
        else:
            print(f"Using cached PBF: {pbf_file}")

    print(f"Extracting place nodes: {sorted(PLACE_TYPES)}")
    with tqdm(
        unit=" elements",
        unit_scale=True,
        desc="Streaming",
        smoothing=0.05,
        mininterval=1.0,
    ) as progress:
        handler = PlaceHandler(progress)
        # Nodes carry their own coordinates. The location index enabled by
        # ``locations=True`` only serves way/area callbacks, which this
        # handler does not define, so skip building it.
        handler.apply_file(str(pbf_file), locations=False)

    print(f"Extracted {len(handler.places):,} place nodes")

    if handler.places:
        df = pl.DataFrame(handler.places)
        args.output.parent.mkdir(parents=True, exist_ok=True)
        df.write_parquet(args.output)
        print(f"Saved to {args.output}")
    else:
        print("No places found — skipping output")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the CLI only when executed as a script, never on import.
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue