"""Download and prepare transit network data for R5 routing. Downloads: - England OSM PBF from Geofabrik (~1.5GB) - BODS GTFS from Bus Open Data Service (~1.5GB, all England bus/tram/ferry) Then processes for R5 compatibility: - Cleans GTFS (fixes stop_times >72h, feed_info year >2100) - Crops OSM PBF to London bounding box via osmium - Crops GTFS to London bounding box (keeps only London-touching trips) Requires: osmium-tool (apt install osmium-tool) Output directory: property-data/transit/ Final files: london.osm.pbf + bods_gtfs.zip (London-only, R5-ready) """ import argparse import csv import io import os import subprocess import urllib.request import zipfile from pathlib import Path from tqdm import tqdm ENGLAND_PBF_URL = ( "https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf" ) # Bus Open Data Service — pre-converted GTFS covering all England bus/tram/ferry BODS_GTFS_URL = "https://data.bus-data.dft.gov.uk/timetable/download/gtfs-file/all/" USER_AGENT = "property-map-pipeline/1.0 (https://github.com)" # London + Home Counties bounding box (~50km buffer around Greater London) LONDON_BBOX = {"min_lat": 51.2, "max_lat": 51.85, "min_lon": -0.65, "max_lon": 0.35} def _download_http(url: str, dest: Path, *, desc: str) -> None: """Stream-download a URL to a file with progress bar.""" dest.parent.mkdir(parents=True, exist_ok=True) tmp = dest.with_suffix(dest.suffix + ".tmp") req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT}) with ( tqdm(unit="B", unit_scale=True, desc=desc) as bar, urllib.request.urlopen(req) as resp, open(tmp, "wb") as f, ): length = resp.headers.get("Content-Length") if length: bar.total = int(length) while chunk := resp.read(1 << 20): f.write(chunk) bar.update(len(chunk)) tmp.rename(dest) print(f" Saved to {dest}") def download_osm_pbf(output_dir: Path) -> Path: """Download England OSM PBF extract from Geofabrik.""" dest = output_dir / "england.osm.pbf" if dest.exists(): print(f"OSM PBF already exists: {dest}") return dest print("Downloading England OSM PBF (~1.5 GB)...") _download_http(ENGLAND_PBF_URL, dest, desc="england.osm.pbf") return dest def download_bods_gtfs(output_dir: Path) -> Path: """Download BODS GTFS (all England bus/tram/ferry timetables).""" dest = output_dir / "bods_gtfs_raw.zip" if dest.exists(): print(f"BODS GTFS already exists: {dest}") return dest print("Downloading BODS GTFS (~1.5 GB)...") _download_http(BODS_GTFS_URL, dest, desc="bods_gtfs_raw.zip") return dest def clean_gtfs(src: Path, dst: Path) -> None: """Fix R5-incompatible entries in GTFS. - Removes stop_times with arrival/departure hour > 72 - Caps feed_info end_date year to 2099 """ if dst.exists(): print(f"Cleaned GTFS already exists: {dst}") return print("Cleaning GTFS for R5 compatibility...") with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile( dst, "w", zipfile.ZIP_DEFLATED ) as zout: for info in zin.infolist(): if info.filename == "stop_times.txt": dropped = 0 with zin.open(info) as f: header = f.readline() header_str = header.decode("utf-8").strip() cols = header_str.split(",") arr_idx = cols.index("arrival_time") if "arrival_time" in cols else -1 dep_idx = ( cols.index("departure_time") if "departure_time" in cols else -1 ) import tempfile tmp = tempfile.NamedTemporaryFile( mode="wb", delete=False, suffix=".txt" ) tmp.write(header) for line in f: line_str = line.decode("utf-8", errors="replace").strip() if not line_str: continue parts = line_str.split(",") skip = False for idx in [arr_idx, dep_idx]: if 0 <= idx < len(parts): time_val = parts[idx].strip('"') if ":" in time_val: try: hour = int(time_val.split(":")[0]) if hour > 72: skip = True break except ValueError: pass if skip: dropped += 1 else: tmp.write(line) tmp.close() print(f" stop_times: dropped {dropped} rows with hours > 72") zout.write(tmp.name, "stop_times.txt") os.unlink(tmp.name) elif info.filename == "feed_info.txt": data = zin.read(info).decode("utf-8") lines = data.strip().split("\n") header_line = lines[0] feed_cols = header_line.split(",") fixed_lines = [header_line] for line in lines[1:]: parts = line.split(",") for i, col_name in enumerate(feed_cols): if "end_date" in col_name.lower() and i < len(parts): date_val = parts[i].strip('"') if len(date_val) == 8: year = int(date_val[:4]) if year > 2100: parts[i] = "20991231" print(f" feed_info: capped end_date {date_val} → 20991231") fixed_lines.append(",".join(parts)) zout.writestr("feed_info.txt", "\n".join(fixed_lines) + "\n") else: zout.writestr(info, zin.read(info)) print(f" Saved to {dst}") def crop_osm_to_london(src: Path, dst: Path) -> None: """Extract London bounding box from England OSM PBF using osmium.""" if dst.exists(): print(f"London OSM PBF already exists: {dst}") return bbox = LONDON_BBOX bbox_str = f"{bbox['min_lon']},{bbox['min_lat']},{bbox['max_lon']},{bbox['max_lat']}" print(f"Cropping OSM PBF to London bbox ({bbox_str})...") subprocess.run( ["osmium", "extract", f"--bbox={bbox_str}", str(src), "-o", str(dst), "--overwrite"], check=True, ) size_mb = dst.stat().st_size / (1024 * 1024) print(f" Saved to {dst} ({size_mb:.0f} MB)") def crop_gtfs_to_london(src: Path, dst: Path) -> None: """Crop GTFS to trips touching the London bounding box.""" if dst.exists(): print(f"London GTFS already exists: {dst}") return bbox = LONDON_BBOX print("Cropping GTFS to London area...") with zipfile.ZipFile(src, "r") as zin: # Step 1: Find stops in bbox print(" Finding stops in bbox...") with zin.open("stops.txt") as f: reader = csv.DictReader(io.TextIOWrapper(f)) stops_in_bbox = set() all_stops = list(reader) for row in all_stops: lat = float(row["stop_lat"]) lon = float(row["stop_lon"]) if bbox["min_lat"] <= lat <= bbox["max_lat"] and bbox["min_lon"] <= lon <= bbox["max_lon"]: stops_in_bbox.add(row["stop_id"]) print(f" {len(stops_in_bbox):,} / {len(all_stops):,} stops in bbox") # Step 2: Find trips touching these stops print(" Finding trips touching London stops...") with zin.open("stop_times.txt") as f: reader = csv.DictReader(io.TextIOWrapper(f)) st_fieldnames = reader.fieldnames trips_in_bbox = set() for row in reader: if row["stop_id"] in stops_in_bbox: trips_in_bbox.add(row["trip_id"]) print(f" {len(trips_in_bbox):,} trips touch London") # Step 3: Collect all stop_times for those trips print(" Collecting stop_times for London trips...") stop_times_kept = [] with zin.open("stop_times.txt") as f: reader = csv.DictReader(io.TextIOWrapper(f)) for row in reader: if row["trip_id"] in trips_in_bbox: stop_times_kept.append(row) stops_needed = {row["stop_id"] for row in stop_times_kept} print(f" {len(stop_times_kept):,} stop_times kept") # Step 4: Read trips and find needed routes/services/shapes print(" Reading trips...") with zin.open("trips.txt") as f: reader = csv.DictReader(io.TextIOWrapper(f)) trips_fieldnames = reader.fieldnames all_trips = list(reader) trips_kept = [t for t in all_trips if t["trip_id"] in trips_in_bbox] routes_needed = {t["route_id"] for t in trips_kept} services_needed = {t["service_id"] for t in trips_kept} shapes_needed = {t.get("shape_id", "") for t in trips_kept} - {""} # Step 5: Write cropped GTFS print(" Writing cropped GTFS...") with zipfile.ZipFile(dst, "w", zipfile.ZIP_DEFLATED) as zout: # stops stops_kept = [s for s in all_stops if s["stop_id"] in stops_needed] _write_csv(zout, "stops.txt", list(all_stops[0].keys()), stops_kept) # stop_times _write_csv(zout, "stop_times.txt", st_fieldnames, stop_times_kept) # trips _write_csv(zout, "trips.txt", trips_fieldnames, trips_kept) # routes with zin.open("routes.txt") as f: reader = csv.DictReader(io.TextIOWrapper(f)) routes_fn = reader.fieldnames routes_kept = [r for r in reader if r["route_id"] in routes_needed] _write_csv(zout, "routes.txt", routes_fn, routes_kept) # agency (copy all) zout.writestr("agency.txt", zin.read("agency.txt")) # calendar with zin.open("calendar.txt") as f: reader = csv.DictReader(io.TextIOWrapper(f)) cal_fn = reader.fieldnames cal_kept = [r for r in reader if r["service_id"] in services_needed] _write_csv(zout, "calendar.txt", cal_fn, cal_kept) # calendar_dates with zin.open("calendar_dates.txt") as f: reader = csv.DictReader(io.TextIOWrapper(f)) cd_fn = reader.fieldnames cd_kept = [r for r in reader if r["service_id"] in services_needed] _write_csv(zout, "calendar_dates.txt", cd_fn, cd_kept) # shapes (stream — can be very large) print(" Streaming shapes.txt...") with zin.open("shapes.txt") as f: reader = csv.DictReader(io.TextIOWrapper(f)) shapes_fn = reader.fieldnames shapes_rows = [r for r in reader if r["shape_id"] in shapes_needed] _write_csv(zout, "shapes.txt", shapes_fn, shapes_rows) # feed_info + frequencies (copy) zout.writestr("feed_info.txt", zin.read("feed_info.txt")) zout.writestr("frequencies.txt", zin.read("frequencies.txt")) size_mb = dst.stat().st_size / (1024 * 1024) print(f" Saved to {dst} ({size_mb:.0f} MB)") def _write_csv( zout: zipfile.ZipFile, name: str, fieldnames: list[str], rows: list[dict] ) -> None: buf = io.StringIO() w = csv.DictWriter(buf, fieldnames=fieldnames) w.writeheader() w.writerows(rows) zout.writestr(name, buf.getvalue()) print(f" {name}: {len(rows):,} rows") def main() -> None: parser = argparse.ArgumentParser( description="Download and prepare transit network data for R5 routing engine" ) parser.add_argument( "--output", type=Path, required=True, help="Output directory for transit data", ) args = parser.parse_args() output_dir: Path = args.output raw_dir = output_dir / "raw" raw_dir.mkdir(parents=True, exist_ok=True) # Download raw data england_pbf = download_osm_pbf(raw_dir) bods_raw = download_bods_gtfs(raw_dir) # Clean GTFS (fix R5 incompatibilities) bods_clean = raw_dir / "bods_gtfs_clean.zip" clean_gtfs(bods_raw, bods_clean) # Crop to London area for R5 (full England requires >30GB RAM) london_pbf = output_dir / "london.osm.pbf" crop_osm_to_london(england_pbf, london_pbf) london_gtfs = output_dir / "bods_gtfs.zip" crop_gtfs_to_london(bods_clean, london_gtfs) # Summary print() print("Transit data ready for R5:") for f in sorted(output_dir.iterdir()): if f.is_dir() or f.name.startswith("."): continue size_mb = f.stat().st_size / (1024 * 1024) print(f" {f.name}: {size_mb:.1f} MB") if __name__ == "__main__": main()