"""Download and prepare transit network data for R5 routing. Downloads: - England OSM PBF from Geofabrik (~1.5GB) - BODS GTFS from Bus Open Data Service (~1.5GB, all England bus/tram/ferry) - TfL TransXChange timetables → converted to GTFS - National Rail CIF timetable → converted to GTFS (requires credentials) Then processes for R5 compatibility: - Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100) - Converts TfL TransXChange to GTFS via transxchange2gtfs - Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker) Requires: osmium-tool, Node.js (npx), Docker (for national rail) Output directory: property-data/transit/ raw/england.osm.pbf + bods_gtfs.zip + tfl_gtfs.zip + national_rail_gtfs.zip """ import argparse import json import os import subprocess import tempfile import time import urllib.parse import urllib.request import zipfile from pathlib import Path from tqdm import tqdm ENGLAND_PBF_URL = ( "https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf" ) # Bus Open Data Service — pre-converted GTFS covering all England bus/tram/ferry BODS_GTFS_URL = "https://data.bus-data.dft.gov.uk/timetable/download/gtfs-file/all/" # TfL TransXChange timetables (tube, DLR, tram, buses, river bus, cable car) TFL_TRANSXCHANGE_URL = ( "https://tfl.gov.uk/cdn/static/cms/documents/journey-planner-timetables.zip" ) # NaPTAN stops data — needed by transxchange2gtfs (its built-in URL is broken) NAPTAN_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv" # National Rail Open Data API NR_AUTH_URL = "https://opendata.nationalrail.co.uk/authenticate" NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/timetable" USER_AGENT = "property-map-pipeline/1.0 (https://github.com)" def _download_http(url: str, dest: Path, *, desc: str, headers: dict | None = None) -> None: """Stream-download a URL to a file with progress bar.""" dest.parent.mkdir(parents=True, exist_ok=True) tmp = dest.with_suffix(dest.suffix + ".tmp") req_headers = {"User-Agent": USER_AGENT} if headers: req_headers.update(headers) req = urllib.request.Request(url, headers=req_headers) with ( tqdm(unit="B", unit_scale=True, desc=desc) as bar, urllib.request.urlopen(req) as resp, open(tmp, "wb") as f, ): length = resp.headers.get("Content-Length") if length: bar.total = int(length) while chunk := resp.read(1 << 20): f.write(chunk) bar.update(len(chunk)) tmp.rename(dest) print(f" Saved to {dest}") def download_osm_pbf(output_dir: Path) -> Path: """Download England OSM PBF extract from Geofabrik.""" dest = output_dir / "england.osm.pbf" if dest.exists(): print(f"OSM PBF already exists: {dest}") return dest print("Downloading England OSM PBF (~1.5 GB)...") _download_http(ENGLAND_PBF_URL, dest, desc="england.osm.pbf") return dest def download_bods_gtfs(output_dir: Path) -> Path: """Download BODS GTFS (all England bus/tram/ferry timetables).""" dest = output_dir / "bods_gtfs_raw.zip" if dest.exists(): print(f"BODS GTFS already exists: {dest}") return dest print("Downloading BODS GTFS (~1.5 GB)...") _download_http(BODS_GTFS_URL, dest, desc="bods_gtfs_raw.zip") return dest def clean_gtfs(src: Path, dst: Path) -> None: """Fix R5-incompatible entries in GTFS. - Removes stop_times with arrival/departure hour > 72 - Caps feed_info end_date year to 2099 """ if dst.exists(): print(f"Cleaned GTFS already exists: {dst}") return print("Cleaning GTFS for R5 compatibility...") with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile( dst, "w", zipfile.ZIP_DEFLATED ) as zout: for info in zin.infolist(): if info.filename == "stop_times.txt": dropped = 0 with zin.open(info) as f: header = f.readline() header_str = header.decode("utf-8").strip() cols = header_str.split(",") arr_idx = cols.index("arrival_time") if "arrival_time" in cols else -1 dep_idx = ( cols.index("departure_time") if "departure_time" in cols else -1 ) tmp = tempfile.NamedTemporaryFile( mode="wb", delete=False, suffix=".txt" ) tmp.write(header) for line in f: line_str = line.decode("utf-8", errors="replace").strip() if not line_str: continue parts = line_str.split(",") skip = False for idx in [arr_idx, dep_idx]: if 0 <= idx < len(parts): time_val = parts[idx].strip('"') if ":" in time_val: try: hour = int(time_val.split(":")[0]) if hour > 72: skip = True break except ValueError: pass if skip: dropped += 1 else: tmp.write(line) tmp.close() print(f" stop_times: dropped {dropped} rows with hours > 72") zout.write(tmp.name, "stop_times.txt") os.unlink(tmp.name) elif info.filename == "feed_info.txt": data = zin.read(info).decode("utf-8") lines = data.strip().split("\n") header_line = lines[0] feed_cols = header_line.split(",") fixed_lines = [header_line] for line in lines[1:]: parts = line.split(",") for i, col_name in enumerate(feed_cols): if "end_date" in col_name.lower() and i < len(parts): date_val = parts[i].strip('"') if len(date_val) == 8: year = int(date_val[:4]) if year > 2100: parts[i] = "20991231" print(f" feed_info: capped end_date {date_val} → 20991231") fixed_lines.append(",".join(parts)) zout.writestr("feed_info.txt", "\n".join(fixed_lines) + "\n") else: zout.writestr(info, zin.read(info)) print(f" Saved to {dst}") def download_tfl_transxchange(raw_dir: Path) -> Path: """Download TfL TransXChange timetable bundle.""" dest = raw_dir / "tfl_transxchange.zip" if dest.exists(): print(f"TfL TransXChange already exists: {dest}") return dest print("Downloading TfL TransXChange timetables...") _download_http(TFL_TRANSXCHANGE_URL, dest, desc="tfl_transxchange.zip") return dest def download_naptan() -> None: """Download NaPTAN stops to /tmp/Stops.csv (needed by transxchange2gtfs).""" dest = Path("/tmp/Stops.csv") if dest.exists(): print(f"NaPTAN Stops.csv already exists: {dest}") return print("Downloading NaPTAN stops data...") _download_http(NAPTAN_URL, dest, desc="Stops.csv") def convert_tfl_to_gtfs(raw_dir: Path, output_dir: Path) -> Path: """Convert TfL TransXChange to GTFS using transxchange2gtfs.""" dest = output_dir / "tfl_gtfs.zip" if dest.exists(): print(f"TfL GTFS already exists: {dest}") return dest txc_path = raw_dir / "tfl_transxchange.zip" # Ensure NaPTAN is available (transxchange2gtfs has a broken download URL) download_naptan() print("Converting TfL TransXChange → GTFS...") subprocess.run( ["npx", "--yes", "transxchange2gtfs", str(txc_path), str(dest)], check=True, ) size_mb = dest.stat().st_size / (1024 * 1024) print(f" Saved to {dest} ({size_mb:.1f} MB)") return dest def download_national_rail_cif(raw_dir: Path) -> Path | None: """Download National Rail CIF timetable (requires credentials).""" dest = raw_dir / "national_rail_cif.zip" if dest.exists(): print(f"National Rail CIF already exists: {dest}") return dest email = os.environ.get("NATIONAL_RAIL_EMAIL") password = os.environ.get("NATIONAL_RAIL_PASSWORD") if not email or not password: print("Warning: NATIONAL_RAIL_EMAIL/NATIONAL_RAIL_PASSWORD not set, skipping national rail") return None print("Authenticating with National Rail Open Data...") auth_data = urllib.parse.urlencode({"username": email, "password": password}).encode() auth_req = urllib.request.Request( NR_AUTH_URL, data=auth_data, headers={"User-Agent": USER_AGENT, "Content-Type": "application/x-www-form-urlencoded"}, ) with urllib.request.urlopen(auth_req) as resp: token_data = json.loads(resp.read()) token = token_data["token"] print(" Authenticated successfully") print("Downloading National Rail CIF timetable...") _download_http( NR_TIMETABLE_URL, dest, desc="national_rail_cif.zip", headers={"X-Auth-Token": token}, ) return dest def clean_national_rail_gtfs(src: Path, dst: Path) -> None: """Fix R5-incompatible entries in dtd2mysql-generated National Rail GTFS. Fixes: - Interior pass-through stops (pickup_type=1, drop_off_type=1) → normal stops. R5 builds TripPatterns from the full stop sequence but may build shorter TripSchedules when stops are non-boarding, causing ArrayIndexOutOfBoundsException. - Removes stop_times referencing stops not in stops.txt. - Removes trips with backwards travel times. - Converts route_type=714 (rail replacement bus) to 3 (bus) for R5 compatibility. - Removes non-standard links.txt file. - Renumbers stop_sequence to 0-based (R5/BODS convention). - Fixes bogus coordinates (lat < 0) on Irish CIE stations. """ if dst.exists(): print(f"Cleaned National Rail GTFS already exists: {dst}") return print("Cleaning National Rail GTFS for R5 compatibility...") # First pass: collect valid stop IDs and find bad trips stop_ids: set[str] = set() bad_trip_ids: set[str] = set() with zipfile.ZipFile(src, "r") as zin: # Load valid stop IDs with zin.open("stops.txt") as f: header = f.readline().decode("utf-8").strip() stop_id_idx = header.split(",").index("stop_id") lat_idx = header.split(",").index("stop_lat") for line in f: parts = line.decode("utf-8", errors="replace").strip().split(",") if parts: stop_ids.add(parts[stop_id_idx]) # Find trips with backwards travel times with zin.open("stop_times.txt") as f: st_header = f.readline().decode("utf-8").strip() st_cols = st_header.split(",") trip_id_idx = st_cols.index("trip_id") dep_idx = st_cols.index("departure_time") prev_trip = "" prev_dep_secs = -1 for line in f: parts = line.decode("utf-8", errors="replace").strip().split(",") if not parts: continue trip_id = parts[trip_id_idx].strip('"') if trip_id != prev_trip: prev_trip = trip_id prev_dep_secs = -1 dep_str = parts[dep_idx].strip('"') if ":" in dep_str: try: h, m, s = dep_str.split(":") dep_secs = int(h) * 3600 + int(m) * 60 + int(s) if dep_secs < prev_dep_secs: bad_trip_ids.add(trip_id) prev_dep_secs = dep_secs except ValueError: pass print(f" Found {len(bad_trip_ids)} trips with backwards travel times") # Second pass: write cleaned zip passthrough_fixed = 0 orphan_stops_removed = 0 bad_trips_removed = 0 seqs_renumbered = 0 coords_fixed = 0 route_types_fixed = 0 with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile( dst, "w", zipfile.ZIP_DEFLATED ) as zout: for info in zin.infolist(): # Skip non-standard links.txt if info.filename == "links.txt": continue if info.filename == "stop_times.txt": with zin.open(info) as f: header = f.readline() header_str = header.decode("utf-8").strip() cols = header_str.split(",") trip_id_idx = cols.index("trip_id") stop_id_idx = cols.index("stop_id") seq_idx = cols.index("stop_sequence") pickup_idx = cols.index("pickup_type") if "pickup_type" in cols else -1 dropoff_idx = cols.index("drop_off_type") if "drop_off_type" in cols else -1 tmp = tempfile.NamedTemporaryFile( mode="wb", delete=False, suffix=".txt" ) tmp.write(header) prev_trip = "" seq_counter = 0 for line in f: line_str = line.decode("utf-8", errors="replace").strip() if not line_str: continue parts = line_str.split(",") trip_id = parts[trip_id_idx].strip('"') stop_id = parts[stop_id_idx].strip('"') # Skip trips with backwards times if trip_id in bad_trip_ids: bad_trips_removed += 1 continue # Skip stop_times referencing missing stops if stop_id not in stop_ids: orphan_stops_removed += 1 continue # Fix pass-through stops: set pickup/dropoff to 0 (normal) if pickup_idx >= 0 and dropoff_idx >= 0: pickup = parts[pickup_idx].strip('"') dropoff = parts[dropoff_idx].strip('"') if pickup == "1" and dropoff == "1": parts[pickup_idx] = "0" parts[dropoff_idx] = "0" passthrough_fixed += 1 # Renumber stop_sequence to 0-based if trip_id != prev_trip: prev_trip = trip_id seq_counter = 0 else: seq_counter += 1 old_seq = parts[seq_idx].strip('"') parts[seq_idx] = str(seq_counter) if old_seq != str(seq_counter): seqs_renumbered += 1 tmp.write((",".join(parts) + "\n").encode("utf-8")) tmp.close() zout.write(tmp.name, "stop_times.txt") os.unlink(tmp.name) elif info.filename == "stops.txt": with zin.open(info) as f: header = f.readline() header_str = header.decode("utf-8").strip() cols = header_str.split(",") lat_idx = cols.index("stop_lat") lon_idx = cols.index("stop_lon") tmp = tempfile.NamedTemporaryFile( mode="wb", delete=False, suffix=".txt" ) tmp.write(header) for line in f: line_str = line.decode("utf-8", errors="replace").strip() if not line_str: continue parts = line_str.split(",") try: lat = float(parts[lat_idx]) # Fix bogus Irish CIE coordinates (South Atlantic) if lat < 0: # Set to a neutral UK coordinate that won't be routed to parts[lat_idx] = "54.0" parts[lon_idx] = "-2.0" coords_fixed += 1 except ValueError: pass tmp.write((",".join(parts) + "\n").encode("utf-8")) tmp.close() zout.write(tmp.name, "stops.txt") os.unlink(tmp.name) elif info.filename == "routes.txt": with zin.open(info) as f: header = f.readline() header_str = header.decode("utf-8").strip() cols = header_str.split(",") rt_idx = cols.index("route_type") tmp = tempfile.NamedTemporaryFile( mode="wb", delete=False, suffix=".txt" ) tmp.write(header) for line in f: line_str = line.decode("utf-8", errors="replace").strip() if not line_str: continue parts = line_str.split(",") if parts[rt_idx].strip('"') == "714": parts[rt_idx] = "3" route_types_fixed += 1 tmp.write((",".join(parts) + "\n").encode("utf-8")) tmp.close() zout.write(tmp.name, "routes.txt") os.unlink(tmp.name) elif info.filename == "trips.txt": # Remove trips that have backwards travel times with zin.open(info) as f: header = f.readline() header_str = header.decode("utf-8").strip() cols = header_str.split(",") trip_id_idx = cols.index("trip_id") tmp = tempfile.NamedTemporaryFile( mode="wb", delete=False, suffix=".txt" ) tmp.write(header) for line in f: line_str = line.decode("utf-8", errors="replace").strip() if not line_str: continue parts = line_str.split(",") if parts[trip_id_idx].strip('"') not in bad_trip_ids: tmp.write(line) tmp.close() zout.write(tmp.name, "trips.txt") os.unlink(tmp.name) elif info.filename == "calendar.txt": # Cap end_date year to 2099 with zin.open(info) as f: header = f.readline() header_str = header.decode("utf-8").strip() cols = header_str.split(",") end_idx = cols.index("end_date") tmp = tempfile.NamedTemporaryFile( mode="wb", delete=False, suffix=".txt" ) tmp.write(header) for line in f: line_str = line.decode("utf-8", errors="replace").strip() if not line_str: continue parts = line_str.split(",") date_val = parts[end_idx].strip('"') if len(date_val) == 8: try: year = int(date_val[:4]) if year > 2099: parts[end_idx] = "20991231" except ValueError: pass tmp.write((",".join(parts) + "\n").encode("utf-8")) tmp.close() zout.write(tmp.name, "calendar.txt") os.unlink(tmp.name) else: zout.writestr(info, zin.read(info)) print(f" Pass-through stops fixed: {passthrough_fixed}") print(f" Orphan stop references removed: {orphan_stops_removed}") print(f" Bad trip stop_times removed: {bad_trips_removed}") print(f" Stop sequences renumbered: {seqs_renumbered}") print(f" Bogus coordinates fixed: {coords_fixed}") print(f" Route types 714→3 fixed: {route_types_fixed}") print(f" Saved to {dst}") def _docker_run_dtd2mysql( network: str, db_container: str, volumes: list[str], args: list[str] ) -> None: """Run dtd2mysql in a Node.js container on the same Docker network as MariaDB.""" cmd = [ "docker", "run", "--rm", "--network", network, "-e", f"DATABASE_HOSTNAME={db_container}", "-e", "DATABASE_USERNAME=root", "-e", "DATABASE_PASSWORD=root", "-e", "DATABASE_NAME=dtd", ] for v in volumes: cmd.extend(["-v", v]) # Install zip (needed for --gtfs-zip) then run dtd2mysql inner = "apt-get update -qq && apt-get install -y -qq zip > /dev/null 2>&1 && npx --yes dtd2mysql " + " ".join(args) cmd.extend(["node:20", "bash", "-c", inner]) subprocess.run(cmd, check=True) def convert_national_rail_to_gtfs(raw_dir: Path, output_dir: Path) -> Path: """Convert National Rail CIF to GTFS using dtd2mysql + MariaDB Docker. Runs both MariaDB and dtd2mysql as Docker containers on a shared network, since Docker port forwarding is not available in all environments. Then cleans the output for R5 compatibility. """ dest = output_dir / "national_rail_gtfs.zip" if dest.exists(): print(f"National Rail GTFS already exists: {dest}") return dest raw_dest = raw_dir / "national_rail_gtfs_raw.zip" if not raw_dest.exists(): db_container = "propertymap-mariadb-temp" network = "propertymap-dtd-net" print("Creating Docker network and starting MariaDB...") subprocess.run(["docker", "network", "create", network], capture_output=True) subprocess.run( [ "docker", "run", "-d", "--name", db_container, "--network", network, "-e", "MARIADB_ROOT_PASSWORD=root", "-e", "MARIADB_DATABASE=dtd", "mariadb:latest", ], check=True, ) try: # Wait for MariaDB to be ready print(" Waiting for MariaDB to be ready...") for attempt in range(30): result = subprocess.run( ["docker", "exec", db_container, "mariadb", "-uroot", "-proot", "-e", "SELECT 1"], capture_output=True, ) if result.returncode == 0: break time.sleep(2) else: raise RuntimeError("MariaDB did not become ready in time") raw_abs = str(raw_dir.resolve()) print("Importing CIF timetable into MariaDB...") _docker_run_dtd2mysql( network, db_container, volumes=[f"{raw_abs}:/data:ro"], args=["--timetable", "/data/national_rail_cif.zip"], ) print("Exporting GTFS from MariaDB...") _docker_run_dtd2mysql( network, db_container, volumes=[f"{raw_abs}:/output"], args=["--gtfs-zip", "/output/national_rail_gtfs_raw.zip"], ) finally: print("Cleaning up Docker resources...") subprocess.run(["docker", "stop", db_container], capture_output=True) subprocess.run(["docker", "rm", db_container], capture_output=True) subprocess.run(["docker", "network", "rm", network], capture_output=True) # Clean the raw GTFS for R5 compatibility clean_national_rail_gtfs(raw_dest, dest) return dest def main() -> None: parser = argparse.ArgumentParser( description="Download and prepare transit network data for R5 routing engine" ) parser.add_argument( "--output", type=Path, required=True, help="Output directory for transit data", ) parser.add_argument( "--skip-tfl", action="store_true", help="Skip TfL TransXChange download and conversion", ) parser.add_argument( "--skip-national-rail", action="store_true", help="Skip National Rail CIF download and conversion", ) args = parser.parse_args() output_dir: Path = args.output raw_dir = output_dir / "raw" raw_dir.mkdir(parents=True, exist_ok=True) # 1. Download and clean BODS GTFS download_osm_pbf(raw_dir) bods_raw = download_bods_gtfs(raw_dir) bods_clean = output_dir / "bods_gtfs.zip" clean_gtfs(bods_raw, bods_clean) # 2. TfL TransXChange → GTFS if args.skip_tfl: print("Skipping TfL (--skip-tfl)") else: download_tfl_transxchange(raw_dir) convert_tfl_to_gtfs(raw_dir, output_dir) # 3. National Rail CIF → GTFS if args.skip_national_rail: print("Skipping National Rail (--skip-national-rail)") else: cif = download_national_rail_cif(raw_dir) if cif is not None: convert_national_rail_to_gtfs(raw_dir, output_dir) # Summary print() print("Transit data ready for R5:") for f in sorted(output_dir.iterdir()): if f.is_dir() or f.name.startswith("."): continue size_mb = f.stat().st_size / (1024 * 1024) print(f" {f.name}: {size_mb:.1f} MB") print() print("IMPORTANT: If you previously built a network from London-only data,") print("delete the stale cache before running R5:") print(" rm -f property-data/r5-network/network.dat") if __name__ == "__main__": main()