Add POI pipeline

This commit is contained in:
Andras Schmelczer 2026-01-26 21:31:00 +00:00
parent 8a8df2ebfa
commit 4d8b626150
4 changed files with 124 additions and 0 deletions

54
download_pois.py Normal file
View file

@ -0,0 +1,54 @@
"""Download POI data for the UK from Overture Maps."""
from pathlib import Path
import overturemaps
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
# Bounding box used to select UK records, ordered (west, south, east, north).
# It is passed unchanged as the `bbox` argument of the Overture Maps reader.
UK_BBOX = (-8.65, 49.86, 1.77, 60.86)

# Directory that holds downloaded data, and the Parquet file written into it.
OUTPUT_DIR = Path("data_sources")
OUTPUT_FILE = OUTPUT_DIR.joinpath("uk_pois.parquet")
def main():
    """Download the "place" records inside UK_BBOX and save them as Parquet.

    Does nothing (beyond printing a notice) when OUTPUT_FILE already exists,
    so a re-download requires deleting that file first. All record batches
    are collected in memory, combined into a single table, and written to
    OUTPUT_FILE. If the reader yields no batches, no file is created.

    The table is first written to a temporary sibling file and then moved
    into place, so OUTPUT_FILE only ever appears once it is complete. A
    failed or interrupted write therefore cannot leave a truncated file that
    later runs would mistake for a finished download.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    if OUTPUT_FILE.exists():
        print(f"POI file already exists: {OUTPUT_FILE}")
        print("Delete it manually to re-download.")
        return

    print("Downloading UK POI data from Overture Maps...")
    print(f"Bounding box: {UK_BBOX}")
    print("This may take several minutes...")

    reader = overturemaps.record_batch_reader("place", bbox=UK_BBOX)

    # Read all batches, keeping a running row count so the progress display
    # does not have to re-sum every batch seen so far on each iteration.
    batches = []
    total_rows = 0
    with tqdm(desc="Downloading batches", unit=" batches") as pbar:
        for batch in reader:
            batches.append(batch)
            total_rows += batch.num_rows
            pbar.update(1)
            pbar.set_postfix(rows=total_rows)

    if not batches:
        print("No data found in bounding box!")
        return

    # Combine batches into a table and write
    table = pa.Table.from_batches(batches, schema=reader.schema)
    print(f"\nWriting {table.num_rows:,} POIs to {OUTPUT_FILE}...")

    # Write next to the destination (same directory, hence same filesystem)
    # and rename afterwards: the existence check above treats any file at
    # OUTPUT_FILE as a finished download, so a partial file must never be
    # visible under that name.
    tmp_file = OUTPUT_FILE.with_name(OUTPUT_FILE.name + ".tmp")
    try:
        pq.write_table(table, tmp_file)
        tmp_file.replace(OUTPUT_FILE)
    finally:
        # After a successful replace() the temporary file is gone; this only
        # removes leftovers when the write or the rename failed.
        if tmp_file.exists():
            tmp_file.unlink()

    print(f"Download complete: {OUTPUT_FILE}")
    print(f"File size: {OUTPUT_FILE.stat().st_size / 1024 / 1024:.1f} MB")
# Run the download only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()