Add POIs and journey times to map
This commit is contained in:
parent
7bfb1729bf
commit
500b9ef2aa
11 changed files with 914 additions and 177 deletions
0
pipeline/pois/__init__.py
Normal file
0
pipeline/pois/__init__.py
Normal file
181
pipeline/pois/__main__.py
Normal file
181
pipeline/pois/__main__.py
Normal file
|
|
@ -0,0 +1,181 @@
|
|||
"""Single-pass POI extraction from OSM PBF file using pyosmium."""
|
||||
|
||||
import json
|
||||
import urllib.request
|
||||
|
||||
import osmium
|
||||
import polars as pl
|
||||
from tqdm import tqdm
|
||||
|
||||
from .config import (
|
||||
GB_PBF_FILE,
|
||||
GEOFABRIK_GB_URL,
|
||||
OSM_TAG_MAPPING,
|
||||
OUTPUT_FILE,
|
||||
TAG_KEYS_TO_CHECK,
|
||||
UK_BBOX_EAST,
|
||||
UK_BBOX_NORTH,
|
||||
UK_BBOX_SOUTH,
|
||||
UK_BBOX_WEST,
|
||||
)
|
||||
|
||||
# Approximate element count for the GB PBF extract (for progress estimation).
# Only used as the progress bar's `total` in main(); it does not limit parsing.
ESTIMATED_ELEMENTS = 500_000_000
|
||||
|
||||
|
||||
def download_pbf() -> None:
    """Download the Great Britain PBF extract from Geofabrik.

    The response is streamed in 1 MiB chunks into a sibling temporary file,
    which is moved onto ``GB_PBF_FILE`` only after the transfer completed and
    its size was checked. The temporary file is removed if anything fails, so
    a broken transfer can never be mistaken for a finished download.

    Raises:
        OSError: If the server announced a ``Content-Length`` and a different
            number of bytes was received. Errors raised by
            ``urllib.request.urlopen`` or by file I/O propagate unchanged.
    """
    GB_PBF_FILE.parent.mkdir(parents=True, exist_ok=True)
    tmp = GB_PBF_FILE.with_suffix(".pbf.tmp")
    print(f"Downloading {GEOFABRIK_GB_URL}")

    try:
        with urllib.request.urlopen(GEOFABRIK_GB_URL) as resp:
            length = resp.headers.get("Content-Length")
            expected = int(length) if length else None
            received = 0
            with (
                tqdm(
                    total=expected,
                    unit="B",
                    unit_scale=True,
                    desc="Downloading",
                ) as bar,
                open(tmp, "wb") as f,
            ):
                while chunk := resp.read(1 << 20):
                    f.write(chunk)
                    received += len(chunk)
                    bar.update(len(chunk))

        # A dropped connection can look like a normal end of stream when
        # reading in fixed-size chunks, so verify the size explicitly.
        if expected is not None and received != expected:
            raise OSError(
                f"Incomplete download: expected {expected} bytes, got {received}"
            )
    except BaseException:
        # Also covers Ctrl-C: never leave a partial temp file behind.
        tmp.unlink(missing_ok=True)
        raise

    # replace() overwrites an existing target on every platform.
    tmp.replace(GB_PBF_FILE)
    print(f"Saved to {GB_PBF_FILE}")
|
||||
|
||||
|
||||
class POIHandler(osmium.SimpleHandler):
    """Streams OSM data, filters to UK bbox, extracts matching POIs.

    Nodes and ways whose tags match ``TAG_KEYS_TO_CHECK`` are appended to
    ``self.pois`` as plain dicts with the keys ``id``, ``name``, ``category``,
    ``lat``, ``lng`` and ``osm_tags`` (a JSON string of all tags). Ways are
    reduced to the arithmetic mean of their node coordinates, so the file
    must be applied with ``locations=True``.
    """

    def __init__(self, progress: tqdm) -> None:
        """Create a handler that reports progress on *progress*."""
        super().__init__()
        self.pois: list[dict] = []
        self._poi_count = 0
        self._progress = progress

    def _in_uk(self, lat: float, lon: float) -> bool:
        """Return True if the coordinate lies inside the configured bbox."""
        return (
            UK_BBOX_SOUTH <= lat <= UK_BBOX_NORTH
            and UK_BBOX_WEST <= lon <= UK_BBOX_EAST
        )

    def _match_tags(self, tags: osmium.osm.TagList) -> str | None:
        """Return the category for the first matching tag key, else None.

        Keys are tried in ``TAG_KEYS_TO_CHECK`` iteration order; the first
        key whose value is one we care about decides the category.
        """
        for key, wanted_values in TAG_KEYS_TO_CHECK.items():
            if key in tags:
                value = tags[key]
                if value in wanted_values:
                    return OSM_TAG_MAPPING[(key, value)]
        return None

    def _get_name(self, tags: osmium.osm.TagList) -> str:
        """Return ``name:en`` if present, else ``name``, else an empty string."""
        return tags.get("name:en", tags.get("name", ""))

    def _tags_to_json(self, tags: osmium.osm.TagList) -> str:
        """Serialise every tag of the element into a JSON object string."""
        return json.dumps({tag.k: tag.v for tag in tags})

    def _add_poi(
        self,
        osm_id: str,
        tags: osmium.osm.TagList,
        category: str,
        lat: float,
        lng: float,
    ) -> None:
        """Record one POI and update the POI counter on the progress bar."""
        self.pois.append(
            {
                "id": osm_id,
                "name": self._get_name(tags),
                "category": category,
                "lat": lat,
                "lng": lng,
                "osm_tags": self._tags_to_json(tags),
            }
        )
        self._poi_count += 1
        # refresh=False: the bar redraws on its own schedule.
        self._progress.set_postfix(pois=f"{self._poi_count:,}", refresh=False)

    def _tick(self) -> None:
        """Advance the progress bar by one processed element."""
        self._progress.update(1)

    def node(self, n: osmium.osm.Node) -> None:
        """Record the node as a POI if its tags match and it is inside the bbox."""
        self._tick()

        # Tag check first, mirroring way().
        category = self._match_tags(n.tags)
        if not category:
            return

        # `location.valid` is a method in pyosmium; referencing it without
        # calling it is always truthy, so the old guard never fired. Handle
        # invalid locations the same way way() does instead.
        try:
            lat, lon = n.location.lat, n.location.lon
        except osmium.InvalidLocationError:
            return

        if not self._in_uk(lat, lon):
            return

        self._add_poi(f"n{n.id}", n.tags, category, lat, lon)

    def way(self, w: osmium.osm.Way) -> None:
        """Record the way as a POI located at the mean of its node coordinates."""
        self._tick()

        category = self._match_tags(w.tags)
        if not category:
            return

        lats = []
        lons = []
        for node in w.nodes:
            # Read both coordinates before appending so the two lists can
            # never get out of step if one of the reads raises.
            try:
                lat, lon = node.location.lat, node.location.lon
            except osmium.InvalidLocationError:
                continue
            lats.append(lat)
            lons.append(lon)

        if not lats:
            return

        centroid_lat = sum(lats) / len(lats)
        centroid_lng = sum(lons) / len(lons)

        if not self._in_uk(centroid_lat, centroid_lng):
            return

        self._add_poi(f"w{w.id}", w.tags, category, centroid_lat, centroid_lng)
|
||||
|
||||
|
||||
def main() -> None:
    """Extract POIs from the GB PBF extract and write them to Parquet.

    Downloads the extract first when ``GB_PBF_FILE`` does not exist, streams
    it through ``POIHandler`` behind a progress bar, writes the collected
    rows to ``OUTPUT_FILE`` and prints a per-category summary. Nothing is
    written when no POI matched.
    """
    if not GB_PBF_FILE.exists():
        download_pbf()

    print(f"=== POI Extraction from {GB_PBF_FILE} ===")
    print(
        f"UK bbox: ({UK_BBOX_WEST}, {UK_BBOX_SOUTH}, {UK_BBOX_EAST}, {UK_BBOX_NORTH})"
    )
    # Several tag pairs share one category, so count distinct category names
    # rather than mapping entries.
    print(f"Categories: {len(set(OSM_TAG_MAPPING.values()))}")
    print()

    with tqdm(
        total=ESTIMATED_ELEMENTS,
        unit=" elements",
        unit_scale=True,
        desc="Streaming",
        smoothing=0.05,
        mininterval=1.0,
    ) as progress:
        handler = POIHandler(progress)
        # locations=True makes node coordinates available on ways.
        handler.apply_file(str(GB_PBF_FILE), locations=True)

    print(f"Extracted {len(handler.pois):,} POIs")

    if not handler.pois:
        print("No POIs found.")
        return

    df = pl.DataFrame(handler.pois)

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    df.write_parquet(OUTPUT_FILE)
    print(f"Saved to {OUTPUT_FILE}")

    print("\n=== Summary ===")
    print(f"Total POIs: {len(df):,}")
    print("\nPOIs by category:")
    category_counts = (
        df.group_by("category")
        .agg(pl.len().alias("count"))
        .sort("count", descending=True)
    )
    for row in category_counts.iter_rows(named=True):
        print(f" {row['category']}: {row['count']:,}")


if __name__ == "__main__":
    main()
|
||||
147
pipeline/pois/config.py
Normal file
147
pipeline/pois/config.py
Normal file
|
|
@ -0,0 +1,147 @@
|
|||
"""Configuration for POI extraction from OpenStreetMap."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
# File paths. DATA_DIR is the "data_sources" directory three levels above
# this file; the downloaded extract and the output Parquet both live there.
DATA_DIR = Path(__file__).parent.parent.parent / "data_sources"
GB_PBF_FILE = DATA_DIR / "great-britain-latest.osm.pbf"
OUTPUT_FILE = DATA_DIR / "uk_pois.parquet"

# Geofabrik download URL for Great Britain (~1.1GB PBF, much faster than planet XML)
GEOFABRIK_GB_URL = (
    "https://download.geofabrik.de/europe/great-britain-latest.osm.pbf"
)

# UK bounding box (west, south, east, north). West/east are longitude bounds,
# south/north are latitude bounds. The extractor keeps a node, or the centroid
# of a way, only if it falls inside this box.
UK_BBOX_WEST = -7.57
UK_BBOX_SOUTH = 49.96
UK_BBOX_EAST = 1.68
UK_BBOX_NORTH = 58.64
|
||||
|
||||
# OSM tag mapping to categories
# Maps (tag_key, tag_value) -> category name
# Several tag pairs may share one category (e.g. both railway=station and
# railway=halt map to "train_station").
OSM_TAG_MAPPING: dict[tuple[str, str], str] = {
    # Education
    ("amenity", "school"): "school",
    ("amenity", "kindergarten"): "preschool",
    ("amenity", "college"): "college_university",
    ("amenity", "university"): "college_university",
    ("amenity", "library"): "library",
    ("amenity", "language_school"): "school",
    ("amenity", "music_school"): "school",
    ("amenity", "driving_school"): "school",
    # Healthcare
    ("amenity", "hospital"): "hospital",
    ("amenity", "clinic"): "public_health_clinic",
    ("amenity", "doctors"): "doctor",
    ("amenity", "dentist"): "dentist",
    ("amenity", "pharmacy"): "pharmacy",
    ("amenity", "veterinary"): "veterinary",
    ("amenity", "nursing_home"): "nursing_home",
    ("amenity", "social_facility"): "social_facility",
    # Transport
    ("railway", "station"): "train_station",
    ("railway", "halt"): "train_station",
    ("railway", "tram_stop"): "tram_stop",
    ("amenity", "bus_station"): "bus_station",
    ("amenity", "ferry_terminal"): "ferry_terminal",
    ("public_transport", "station"): "train_station",
    ("public_transport", "stop_position"): "bus_stop",
    ("station", "subway"): "metro_station",
    ("station", "light_rail"): "light_rail_station",
    ("aeroway", "aerodrome"): "airport",
    ("highway", "bus_stop"): "bus_stop",
    # Parks & Leisure
    ("leisure", "park"): "park",
    ("leisure", "nature_reserve"): "nature_reserve",
    ("leisure", "dog_park"): "dog_park",
    ("leisure", "playground"): "playground",
    ("leisure", "sports_centre"): "sports_centre",
    ("leisure", "swimming_pool"): "swimming_pool",
    ("leisure", "fitness_centre"): "gym",
    ("leisure", "golf_course"): "golf_course",
    ("leisure", "garden"): "garden",
    ("leisure", "marina"): "marina",
    ("boundary", "national_park"): "national_park",
    # Emergency
    ("amenity", "police"): "police_department",
    ("amenity", "fire_station"): "fire_department",
    # Shopping
    ("shop", "supermarket"): "supermarket",
    ("shop", "convenience"): "convenience_store",
    ("shop", "grocery"): "grocery_store",
    ("shop", "bakery"): "bakery",
    ("shop", "butcher"): "butcher",
    ("shop", "greengrocer"): "greengrocer",
    ("shop", "deli"): "deli",
    ("shop", "department_store"): "department_store",
    ("shop", "clothes"): "clothing_store",
    ("shop", "shoes"): "shoe_store",
    ("shop", "electronics"): "electronics_store",
    ("shop", "hardware"): "hardware_store",
    ("shop", "furniture"): "furniture_store",
    ("shop", "car"): "car_dealer",
    ("shop", "car_repair"): "car_repair",
    ("shop", "hairdresser"): "hairdresser",
    ("shop", "beauty"): "beauty_salon",
    ("shop", "optician"): "optician",
    ("shop", "newsagent"): "newsagent",
    ("shop", "books"): "bookshop",
    ("shop", "charity"): "charity_shop",
    ("shop", "alcohol"): "off_licence",
    ("shop", "laundry"): "laundry",
    ("shop", "dry_cleaning"): "dry_cleaning",
    ("shop", "mall"): "shopping_centre",
    # Food & Drink
    ("amenity", "restaurant"): "restaurant",
    ("amenity", "cafe"): "cafe",
    ("amenity", "pub"): "pub",
    ("amenity", "bar"): "bar",
    ("amenity", "fast_food"): "fast_food",
    ("amenity", "food_court"): "food_court",
    ("amenity", "ice_cream"): "ice_cream",
    ("amenity", "biergarten"): "beer_garden",
    # Finance
    ("amenity", "bank"): "bank",
    ("amenity", "atm"): "atm",
    ("amenity", "bureau_de_change"): "bureau_de_change",
    # Entertainment & Culture
    ("amenity", "cinema"): "cinema",
    ("amenity", "theatre"): "theatre",
    ("amenity", "nightclub"): "nightclub",
    ("amenity", "community_centre"): "community_centre",
    ("amenity", "arts_centre"): "arts_centre",
    ("tourism", "museum"): "museum",
    ("tourism", "gallery"): "gallery",
    ("tourism", "attraction"): "attraction",
    ("tourism", "zoo"): "zoo",
    ("tourism", "theme_park"): "theme_park",
    ("tourism", "viewpoint"): "viewpoint",
    # Accommodation
    ("tourism", "hotel"): "hotel",
    ("tourism", "hostel"): "hostel",
    ("tourism", "guest_house"): "guest_house",
    ("tourism", "camp_site"): "campsite",
    ("tourism", "caravan_site"): "caravan_site",
    # Religion
    ("amenity", "place_of_worship"): "place_of_worship",
    # Government & Public
    ("amenity", "townhall"): "town_hall",
    ("amenity", "courthouse"): "courthouse",
    ("amenity", "post_office"): "post_office",
    ("amenity", "prison"): "prison",
    ("amenity", "recycling"): "recycling",
    ("amenity", "waste_disposal"): "waste_disposal",
    ("amenity", "toilets"): "public_toilets",
    # Fuel
    ("amenity", "fuel"): "petrol_station",
    ("amenity", "charging_station"): "ev_charging",
    # Parking
    ("amenity", "parking"): "parking",
    ("amenity", "bicycle_parking"): "bicycle_parking",
}

# Build reverse lookup: tag_key -> set of tag_values we care about.
# Keys keep the order in which they first appear in OSM_TAG_MAPPING. The
# extractor walks this dict in order and stops at the first key that matches,
# so that order is also the match priority when an element carries several
# relevant tags.
TAG_KEYS_TO_CHECK: dict[str, set[str]] = {}
for key, value in OSM_TAG_MAPPING:
    TAG_KEYS_TO_CHECK.setdefault(key, set()).add(value)
|
||||
|
|
@ -6,31 +6,47 @@ import polars as pl
|
|||
|
||||
from pipeline.config import AGGREGATES_DIR, H3_RESOLUTIONS, PROCESSED_DIR
|
||||
|
||||
# Per-postcode journey time columns read from the journey times parquet.
# A row is kept if at least one of these is non-null.
JOURNEY_COLS = [
    "public_transport_easy_minutes",
    "public_transport_quick_minutes",
    "cycling_minutes",
]

# Per-H3-cell columns this module writes into the res{N}.parquet aggregates.
# Any of these already present in a file are dropped before the fresh values
# are joined in.
AGGREGATE_COLS = [
    "median_pt_easy_minutes",
    "median_pt_quick_minutes",
    "median_cycling_minutes",
    "median_journey_minutes",
]
|
||||
|
||||
|
||||
def aggregate_journey_times(
|
||||
journey_times_path: Path | None = None,
|
||||
postcodes_h3_path: Path | None = None,
|
||||
output_dir: Path | None = None,
|
||||
aggregates_dir: Path | None = None,
|
||||
) -> list[Path]:
|
||||
"""
|
||||
Aggregate journey times by H3 cells at all resolutions.
|
||||
Add journey times to existing H3 aggregate parquet files.
|
||||
|
||||
Joins journey_times_bank.parquet with postcodes_h3.parquet on postcode,
|
||||
then groups by H3 cell to compute median journey time.
|
||||
Joins journey_times_bank_checkpoint.parquet with postcodes_h3.parquet on postcode,
|
||||
aggregates by H3 cell, then merges into existing res{N}.parquet files.
|
||||
"""
|
||||
journey_times_path = journey_times_path or PROCESSED_DIR / "journey_times_bank.parquet"
|
||||
journey_times_path = (
|
||||
journey_times_path
|
||||
or PROCESSED_DIR / "journey_times_bank_checkpoint.parquet"
|
||||
)
|
||||
postcodes_h3_path = postcodes_h3_path or PROCESSED_DIR / "postcodes_h3.parquet"
|
||||
output_dir = output_dir or AGGREGATES_DIR
|
||||
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
aggregates_dir = aggregates_dir or AGGREGATES_DIR
|
||||
|
||||
# Load journey times data
|
||||
journey_df = pl.read_parquet(journey_times_path).select(
|
||||
["postcode", "public_transport_minutes"]
|
||||
["postcode"] + JOURNEY_COLS
|
||||
)
|
||||
|
||||
# Filter out null journey times
|
||||
journey_df = journey_df.filter(pl.col("public_transport_minutes").is_not_null())
|
||||
# Filter out rows where all journey time columns are null
|
||||
journey_df = journey_df.filter(
|
||||
pl.any_horizontal(pl.col(c).is_not_null() for c in JOURNEY_COLS)
|
||||
)
|
||||
|
||||
if journey_df.height == 0:
|
||||
print("No valid journey times found")
|
||||
|
|
@ -48,31 +64,63 @@ def aggregate_journey_times(
|
|||
|
||||
print(f"Joined {joined_df.height} postcodes with journey times")
|
||||
|
||||
saved_paths = []
|
||||
updated_paths = []
|
||||
|
||||
for resolution in H3_RESOLUTIONS:
|
||||
h3_col = f"h3_res{resolution}"
|
||||
parquet_path = aggregates_dir / f"res{resolution}.parquet"
|
||||
|
||||
if not parquet_path.exists():
|
||||
print(f"Skipping resolution {resolution} - {parquet_path} not found")
|
||||
continue
|
||||
|
||||
if h3_col not in joined_df.columns:
|
||||
print(f"Skipping resolution {resolution} - column {h3_col} not found")
|
||||
continue
|
||||
|
||||
# Aggregate by H3 cell - compute median journey time
|
||||
agg_df = (
|
||||
# Aggregate journey times by H3 cell
|
||||
journey_agg = (
|
||||
joined_df.group_by(h3_col)
|
||||
.agg(
|
||||
pl.col("public_transport_minutes").median().alias("median_journey_minutes"),
|
||||
pl.col("public_transport_minutes").count().alias("journey_count"),
|
||||
pl.col("public_transport_easy_minutes")
|
||||
.median()
|
||||
.alias("median_pt_easy_minutes"),
|
||||
pl.col("public_transport_quick_minutes")
|
||||
.median()
|
||||
.alias("median_pt_quick_minutes"),
|
||||
pl.col("cycling_minutes")
|
||||
.median()
|
||||
.alias("median_cycling_minutes"),
|
||||
pl.col("public_transport_quick_minutes")
|
||||
.median()
|
||||
.alias("median_journey_minutes"),
|
||||
)
|
||||
.rename({h3_col: "h3"})
|
||||
)
|
||||
|
||||
output_path = output_dir / f"journey_times_res{resolution}.parquet"
|
||||
agg_df.write_parquet(output_path)
|
||||
saved_paths.append(output_path)
|
||||
print(f"Saved {agg_df.height} cells to {output_path}")
|
||||
# Load existing parquet
|
||||
existing_df = pl.read_parquet(parquet_path)
|
||||
|
||||
return saved_paths
|
||||
# Drop existing journey time columns if present
|
||||
existing_df = existing_df.drop(
|
||||
[c for c in AGGREGATE_COLS if c in existing_df.columns]
|
||||
)
|
||||
|
||||
# Left join journey times onto existing data
|
||||
updated_df = existing_df.join(journey_agg, on="h3", how="left")
|
||||
|
||||
# Save back to parquet
|
||||
updated_df.write_parquet(parquet_path)
|
||||
updated_paths.append(parquet_path)
|
||||
matched = updated_df.filter(
|
||||
pl.col("median_journey_minutes").is_not_null()
|
||||
).height
|
||||
print(
|
||||
f"Updated {parquet_path.name}: {matched} rows with journey times "
|
||||
f"(out of {updated_df.height} total)"
|
||||
)
|
||||
|
||||
return updated_paths
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import polars as pl
|
|||
from pipeline.sources.postcodes import save_postcodes
|
||||
from pipeline.sources.property_prices import PropertyPricesSource
|
||||
from pipeline.processors.h3_aggregator import save_aggregates
|
||||
from pipeline.processors.journey_times_aggregator import aggregate_journey_times
|
||||
|
||||
|
||||
def run_pipeline():
|
||||
|
|
@ -14,22 +15,31 @@ def run_pipeline():
|
|||
print("=" * 60)
|
||||
|
||||
# Step 1: Process postcodes with H3 indices
|
||||
print("\n[1/3] Processing postcodes with H3 indices...")
|
||||
print("\n[1/4] Processing postcodes with H3 indices...")
|
||||
postcodes_path = save_postcodes()
|
||||
print(f" Saved: {postcodes_path}")
|
||||
|
||||
print("\n[2/3] Processing property prices...")
|
||||
print("\n[2/4] Processing property prices...")
|
||||
postcodes = pl.scan_parquet(postcodes_path)
|
||||
property_source = PropertyPricesSource()
|
||||
properties = property_source.process(postcodes)
|
||||
print(" Joined property prices with postcodes")
|
||||
|
||||
print("\n[3/3] Aggregating at H3 resolutions...")
|
||||
print("\n[3/4] Aggregating at H3 resolutions...")
|
||||
saved_paths = save_aggregates(properties)
|
||||
for path in saved_paths:
|
||||
size_mb = path.stat().st_size / (1024 * 1024)
|
||||
print(f" Saved: {path.name} ({size_mb:.1f} MB)")
|
||||
|
||||
print("\n[4/4] Adding journey times to aggregates...")
|
||||
updated_paths = aggregate_journey_times()
|
||||
if updated_paths:
|
||||
for path in updated_paths:
|
||||
size_mb = path.stat().st_size / (1024 * 1024)
|
||||
print(f" Updated: {path.name} ({size_mb:.1f} MB)")
|
||||
else:
|
||||
print(" Skipped (no journey time data found)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_pipeline()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue