"""Download and prepare transit network data for R5 routing. Downloads: - England OSM PBF from Geofabrik (~1.5GB) - BODS GTFS from Bus Open Data Service (~1.5GB, all England bus/tram/ferry) - TfL TransXChange timetables → converted to GTFS - National Rail CIF timetable → converted to GTFS (requires credentials) Then processes for R5 compatibility: - Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100) - Converts high-frequency metro/tram services to frequency-based GTFS - Converts TfL TransXChange to GTFS via transxchange2gtfs - Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker) Requires: osmium-tool, Node.js (npx), Docker (for national rail) Output directory: property-data/transit/ raw/england.osm.pbf + bods_gtfs.zip + tfl_gtfs.zip + national_rail_gtfs.zip """ import argparse import csv import io import json import os import shutil import statistics import subprocess import tempfile import time import urllib.parse import urllib.request import zipfile from collections import defaultdict from pathlib import Path from tqdm import tqdm from pipeline.local_temp import local_tmp_dir ENGLAND_PBF_URL = ( "https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf" ) # Bus Open Data Service — pre-converted GTFS covering all England bus/tram/ferry BODS_GTFS_URL = "https://data.bus-data.dft.gov.uk/timetable/download/gtfs-file/all/" # TfL TransXChange timetables (tube, DLR, tram, buses, river bus, cable car) TFL_TRANSXCHANGE_URL = ( "https://tfl.gov.uk/cdn/static/cms/documents/journey-planner-timetables.zip" ) # NaPTAN stops data — needed by transxchange2gtfs (its built-in URL is broken) NAPTAN_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv" # National Rail Open Data API NR_AUTH_URL = "https://opendata.nationalrail.co.uk/authenticate" NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/timetable" USER_AGENT = "property-map-pipeline/1.0 (https://github.com)" TRANSXCHANGE2GTFS_PACKAGE = "transxchange2gtfs@1.12.0" def _download_http( url: str, dest: Path, *, desc: str, headers: dict | None = None ) -> None: """Stream-download a URL to a file with progress bar.""" dest.parent.mkdir(parents=True, exist_ok=True) tmp = dest.with_suffix(dest.suffix + ".tmp") req_headers = {"User-Agent": USER_AGENT} if headers: req_headers.update(headers) req = urllib.request.Request(url, headers=req_headers) with ( tqdm(unit="B", unit_scale=True, desc=desc) as bar, urllib.request.urlopen(req) as resp, open(tmp, "wb") as f, ): length = resp.headers.get("Content-Length") if length: bar.total = int(length) while chunk := resp.read(1 << 20): f.write(chunk) bar.update(len(chunk)) tmp.rename(dest) print(f" Saved to {dest}") def download_osm_pbf(output_dir: Path) -> Path: """Download England OSM PBF extract from Geofabrik.""" dest = output_dir / "england.osm.pbf" if dest.exists(): print(f"OSM PBF already exists: {dest}") return dest print("Downloading England OSM PBF (~1.5 GB)...") _download_http(ENGLAND_PBF_URL, dest, desc="england.osm.pbf") return dest def download_bods_gtfs(output_dir: Path) -> Path: """Download BODS GTFS (all England bus/tram/ferry timetables).""" dest = output_dir / "bods_gtfs_raw.zip" if dest.exists(): print(f"BODS GTFS already exists: {dest}") return dest print("Downloading BODS GTFS (~1.5 GB)...") _download_http(BODS_GTFS_URL, dest, desc="bods_gtfs_raw.zip") return dest def _parse_csv_line(line: bytes | str) -> list[str]: """Parse a single GTFS CSV record.""" if isinstance(line, bytes): line = line.decode("utf-8", errors="replace") line = line.rstrip("\r\n") if not line: return [] return next(csv.reader([line])) def _format_csv_row(parts: list[str]) -> bytes: """Serialize one GTFS CSV row with stable LF line endings.""" output = io.StringIO() csv.writer(output, lineterminator="\n").writerow(parts) return output.getvalue().encode("utf-8") def _format_csv_rows(rows: list[list[str]]) -> str: output = io.StringIO() writer = csv.writer(output, lineterminator="\n") writer.writerows(rows) return output.getvalue() def clean_gtfs(src: Path, dst: Path) -> None: """Fix R5-incompatible entries in GTFS. - Removes stop_times with arrival/departure hour > 72 - Caps feed_info end_date year to 2099 """ if dst.exists(): print(f"Cleaned GTFS already exists: {dst}") return print("Cleaning GTFS for R5 compatibility...") with ( zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(dst, "w", zipfile.ZIP_DEFLATED) as zout, ): for info in zin.infolist(): if info.filename == "stop_times.txt": dropped = 0 with zin.open(info) as f: header = f.readline() cols = _parse_csv_line(header) arr_idx = ( cols.index("arrival_time") if "arrival_time" in cols else -1 ) dep_idx = ( cols.index("departure_time") if "departure_time" in cols else -1 ) tmp = tempfile.NamedTemporaryFile( mode="wb", delete=False, suffix=".txt", dir=local_tmp_dir(), ) tmp.write(header) for line in f: parts = _parse_csv_line(line) if not parts: continue skip = False for idx in [arr_idx, dep_idx]: if 0 <= idx < len(parts): time_val = parts[idx].strip('"') if ":" in time_val: try: hour = int(time_val.split(":")[0]) if hour > 72: skip = True break except ValueError: pass if skip: dropped += 1 else: tmp.write(line) tmp.close() print(f" stop_times: dropped {dropped} rows with hours > 72") zout.write(tmp.name, "stop_times.txt") os.unlink(tmp.name) elif info.filename == "feed_info.txt": data = zin.read(info).decode("utf-8") rows = list(csv.reader(io.StringIO(data))) if not rows: zout.writestr("feed_info.txt", data) continue feed_cols = rows[0] fixed_rows = [feed_cols] for parts in rows[1:]: for i, col_name in enumerate(feed_cols): if "end_date" in col_name.lower() and i < len(parts): date_val = parts[i].strip('"') if len(date_val) == 8: year = int(date_val[:4]) if year > 2100: parts[i] = "20991231" print( f" feed_info: capped end_date {date_val} → 20991231" ) fixed_rows.append(parts) zout.writestr("feed_info.txt", _format_csv_rows(fixed_rows)) else: zout.writestr(info, zin.read(info)) print(f" Saved to {dst}") def _parse_gtfs_time(time_str: str) -> int | None: """Parse HH:MM:SS to seconds since midnight. Returns None on failure.""" time_str = time_str.strip('"') if ":" not in time_str: return None try: h, m, s = time_str.split(":") return int(h) * 3600 + int(m) * 60 + int(s) except ValueError: return None def _secs_to_gtfs_time(s: int) -> str: """Convert seconds since midnight to HH:MM:SS.""" h = s // 3600 m = (s % 3600) // 60 sec = s % 60 return f"{h:02d}:{m:02d}:{sec:02d}" def convert_high_freq_to_frequency_based( src: Path, dst: Path, *, max_headway_minutes: int = 15 ) -> None: """Convert high-frequency scheduled services to frequency-based GTFS entries. Identifies metro (route_type=1) and tram (route_type=0) routes with regular headways under max_headway_minutes, then creates frequencies.txt entries and removes redundant trips. R5's RAPTOR produces smoother percentile results for frequency-based services, matching the "just turn up" reality of high-frequency metro/tram services. """ if dst.exists(): print(f"Frequency-converted GTFS already exists: {dst}") return print("Converting high-frequency services to frequency-based...") max_headway_secs = max_headway_minutes * 60 with zipfile.ZipFile(src, "r") as zin: # Step 1: Find metro/tram route IDs target_route_ids: set[str] = set() with zin.open("routes.txt") as f: cols = _parse_csv_line(f.readline()) route_id_idx = cols.index("route_id") rt_idx = cols.index("route_type") for line in f: parts = _parse_csv_line(line) if not parts: continue route_type = parts[rt_idx].strip('"') if route_type in ("0", "1"): # tram, metro/subway target_route_ids.add(parts[route_id_idx].strip('"')) if not target_route_ids: print(" No metro/tram routes found, copying unchanged") shutil.copy2(src, dst) return print(f" Found {len(target_route_ids)} metro/tram routes") # Step 2: Map target trips to grouping keys trip_group_key: dict[str, tuple[str, str, str]] = {} with zin.open("trips.txt") as f: cols = _parse_csv_line(f.readline()) trip_id_idx = cols.index("trip_id") route_id_idx = cols.index("route_id") dir_idx = cols.index("direction_id") if "direction_id" in cols else -1 service_idx = cols.index("service_id") for line in f: parts = _parse_csv_line(line) if not parts: continue route_id = parts[route_id_idx].strip('"') if route_id in target_route_ids: trip_id = parts[trip_id_idx].strip('"') direction = parts[dir_idx].strip('"') if dir_idx >= 0 else "0" service_id = parts[service_idx].strip('"') trip_group_key[trip_id] = (route_id, direction, service_id) print(f" Found {len(trip_group_key)} trips on target routes") # Step 3: Get first departure time and first stop for each target trip trip_first_dep: dict[str, int] = {} trip_first_stop: dict[str, str] = {} with zin.open("stop_times.txt") as f: cols = _parse_csv_line(f.readline()) trip_id_idx = cols.index("trip_id") dep_idx = cols.index("departure_time") seq_idx = cols.index("stop_sequence") stop_id_idx = cols.index("stop_id") for line in f: parts = _parse_csv_line(line) if not parts: continue trip_id = parts[trip_id_idx].strip('"') if trip_id not in trip_group_key: continue if parts[seq_idx].strip('"') == "0": dep_secs = _parse_gtfs_time(parts[dep_idx]) if dep_secs is not None: trip_first_dep[trip_id] = dep_secs trip_first_stop[trip_id] = parts[stop_id_idx].strip('"') # Step 4: Group trips by (route, direction, service, first_stop) and compute headways groups: dict[tuple[str, ...], list[tuple[str, int]]] = defaultdict(list) for trip_id, dep_secs in trip_first_dep.items(): route_id, direction, service_id = trip_group_key[trip_id] first_stop = trip_first_stop.get(trip_id, "") key = (route_id, direction, service_id, first_stop) groups[key].append((trip_id, dep_secs)) trips_to_remove: set[str] = set() frequency_entries: list[tuple[str, int, int, int]] = [] groups_converted = 0 for _key, trips in groups.items(): if len(trips) < 4: continue trips.sort(key=lambda x: x[1]) headways = [trips[i + 1][1] - trips[i][1] for i in range(len(trips) - 1)] headways = [h for h in headways if h > 0] if len(headways) < 3: continue median_hw = statistics.median(headways) if median_hw > max_headway_secs or median_hw < 30: continue mean_hw = statistics.mean(headways) if mean_hw == 0: continue stdev_hw = statistics.stdev(headways) if len(headways) > 1 else 0 if stdev_hw / mean_hw > 0.5: continue # Convert: keep first trip as template, remove the rest template_trip_id = trips[0][0] start_secs = trips[0][1] end_secs = trips[-1][1] + int(median_hw) headway_rounded = max(60, round(median_hw / 60) * 60) frequency_entries.append( (template_trip_id, start_secs, end_secs, headway_rounded) ) for trip_id, _ in trips[1:]: trips_to_remove.add(trip_id) groups_converted += 1 print(f" Converted {groups_converted} trip groups to frequency-based") print(f" Removing {len(trips_to_remove)} redundant trips") print(f" Created {len(frequency_entries)} frequency entries") # Step 5: Write modified GTFS with ( zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(dst, "w", zipfile.ZIP_DEFLATED) as zout, ): for info in zin.infolist(): if info.filename == "trips.txt": with zin.open(info) as f: header = f.readline() cols = _parse_csv_line(header) trip_id_idx = cols.index("trip_id") tmp = tempfile.NamedTemporaryFile( mode="wb", delete=False, suffix=".txt", dir=local_tmp_dir(), ) tmp.write(header) for line in f: parts = _parse_csv_line(line) if not parts: continue if parts[trip_id_idx].strip('"') not in trips_to_remove: tmp.write(line) tmp.close() zout.write(tmp.name, "trips.txt") os.unlink(tmp.name) elif info.filename == "stop_times.txt": with zin.open(info) as f: header = f.readline() cols = _parse_csv_line(header) trip_id_idx = cols.index("trip_id") tmp = tempfile.NamedTemporaryFile( mode="wb", delete=False, suffix=".txt", dir=local_tmp_dir(), ) tmp.write(header) for line in f: parts = _parse_csv_line(line) if not parts: continue if parts[trip_id_idx].strip('"') not in trips_to_remove: tmp.write(line) tmp.close() zout.write(tmp.name, "stop_times.txt") os.unlink(tmp.name) elif info.filename == "frequencies.txt": pass # we'll write our own below else: zout.writestr(info, zin.read(info)) # Write frequencies.txt freq_lines = ["trip_id,start_time,end_time,headway_secs,exact_times\n"] for trip_id, start, end, headway in frequency_entries: freq_lines.append( f"{trip_id},{_secs_to_gtfs_time(start)},{_secs_to_gtfs_time(end)},{headway},0\n" ) zout.writestr("frequencies.txt", "".join(freq_lines)) print(f" Saved to {dst}") def download_tfl_transxchange(raw_dir: Path) -> Path: """Download TfL TransXChange timetable bundle.""" dest = raw_dir / "tfl_transxchange.zip" if dest.exists(): print(f"TfL TransXChange already exists: {dest}") return dest print("Downloading TfL TransXChange timetables...") _download_http(TFL_TRANSXCHANGE_URL, dest, desc="tfl_transxchange.zip") return dest def download_naptan() -> None: """Download NaPTAN stops to the local temp dir for transxchange2gtfs.""" dest = local_tmp_dir() / "Stops.csv" if dest.exists(): print(f"NaPTAN Stops.csv already exists: {dest}") return print("Downloading NaPTAN stops data...") _download_http(NAPTAN_URL, dest, desc="Stops.csv") def convert_tfl_to_gtfs(raw_dir: Path, output_dir: Path) -> Path: """Convert TfL TransXChange to GTFS using transxchange2gtfs.""" dest = output_dir / "tfl_gtfs.zip" if dest.exists(): print(f"TfL GTFS already exists: {dest}") return dest txc_path = raw_dir / "tfl_transxchange.zip" # Ensure NaPTAN is available (transxchange2gtfs has a broken download URL) download_naptan() print("Converting TfL TransXChange → GTFS...") # The shim patches known packaging/runtime issues in the pinned npm package # before loading its CLI from npx's temporary install. shim_path = Path(__file__).with_name("transxchange2gtfs_shim.js") subprocess.run( [ "npx", "--yes", "--package", TRANSXCHANGE2GTFS_PACKAGE, "sh", "-c", "\n".join( [ 'bin="$(command -v transxchange2gtfs)"', 'script="$(readlink -f "$bin")"', 'pkg_dir="$(dirname "$(dirname "$script")")"', 'shim="$1"', "shift", 'exec node "$shim" "$pkg_dir" "$@"', ] ), "transxchange2gtfs", str(shim_path.resolve()), str(txc_path.resolve()), str(dest.resolve()), ], check=True, ) required_files = { "agency.txt", "calendar.txt", "calendar_dates.txt", "routes.txt", "stop_times.txt", "stops.txt", "trips.txt", } if not dest.exists() or not zipfile.is_zipfile(dest): raise RuntimeError(f"transxchange2gtfs did not create a valid GTFS zip: {dest}") with zipfile.ZipFile(dest) as z: missing = required_files - set(z.namelist()) if missing: missing_str = ", ".join(sorted(missing)) raise RuntimeError(f"TfL GTFS zip is missing required files: {missing_str}") size_mb = dest.stat().st_size / (1024 * 1024) print(f" Saved to {dest} ({size_mb:.1f} MB)") return dest def download_national_rail_cif(raw_dir: Path) -> Path | None: """Download National Rail CIF timetable (requires credentials).""" dest = raw_dir / "national_rail_cif.zip" if dest.exists(): print(f"National Rail CIF already exists: {dest}") return dest email = os.environ.get("NATIONAL_RAIL_EMAIL") password = os.environ.get("NATIONAL_RAIL_PASSWORD") if not email or not password: print( "Warning: NATIONAL_RAIL_EMAIL/NATIONAL_RAIL_PASSWORD not set, skipping national rail" ) return None print("Authenticating with National Rail Open Data...") auth_data = urllib.parse.urlencode( {"username": email, "password": password} ).encode() auth_req = urllib.request.Request( NR_AUTH_URL, data=auth_data, headers={ "User-Agent": USER_AGENT, "Content-Type": "application/x-www-form-urlencoded", }, ) with urllib.request.urlopen(auth_req) as resp: token_data = json.loads(resp.read()) token = token_data["token"] print(" Authenticated successfully") print("Downloading National Rail CIF timetable...") _download_http( NR_TIMETABLE_URL, dest, desc="national_rail_cif.zip", headers={"X-Auth-Token": token}, ) return dest def clean_national_rail_gtfs(src: Path, dst: Path) -> None: """Fix R5-incompatible entries in dtd2mysql-generated National Rail GTFS. Fixes: - Interior pass-through stops (pickup_type=1, drop_off_type=1) → normal stops. R5 builds TripPatterns from the full stop sequence but may build shorter TripSchedules when stops are non-boarding, causing ArrayIndexOutOfBoundsException. - Removes stop_times referencing stops not in stops.txt. - Removes trips with backwards travel times. - Converts route_type=714 (rail replacement bus) to 3 (bus) for R5 compatibility. - Removes non-standard links.txt file. - Renumbers stop_sequence to 0-based (R5/BODS convention). - Fixes bogus coordinates (lat < 0) on Irish CIE stations. """ if dst.exists(): print(f"Cleaned National Rail GTFS already exists: {dst}") return print("Cleaning National Rail GTFS for R5 compatibility...") # First pass: collect valid stop IDs and find bad trips stop_ids: set[str] = set() bad_trip_ids: set[str] = set() with zipfile.ZipFile(src, "r") as zin: # Load valid stop IDs with zin.open("stops.txt") as f: cols = _parse_csv_line(f.readline()) stop_id_idx = cols.index("stop_id") for line in f: parts = _parse_csv_line(line) if parts: stop_ids.add(parts[stop_id_idx]) # Find trips with backwards travel times with zin.open("stop_times.txt") as f: st_cols = _parse_csv_line(f.readline()) trip_id_idx = st_cols.index("trip_id") dep_idx = st_cols.index("departure_time") prev_trip = "" prev_dep_secs = -1 for line in f: parts = _parse_csv_line(line) if not parts: continue trip_id = parts[trip_id_idx].strip('"') if trip_id != prev_trip: prev_trip = trip_id prev_dep_secs = -1 dep_str = parts[dep_idx].strip('"') if ":" in dep_str: try: h, m, s = dep_str.split(":") dep_secs = int(h) * 3600 + int(m) * 60 + int(s) if dep_secs < prev_dep_secs: bad_trip_ids.add(trip_id) prev_dep_secs = dep_secs except ValueError: pass print(f" Found {len(bad_trip_ids)} trips with backwards travel times") # Second pass: write cleaned zip passthrough_fixed = 0 orphan_stops_removed = 0 bad_trips_removed = 0 seqs_renumbered = 0 coords_fixed = 0 route_types_fixed = 0 with ( zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(dst, "w", zipfile.ZIP_DEFLATED) as zout, ): for info in zin.infolist(): # Skip non-standard links.txt if info.filename == "links.txt": continue if info.filename == "stop_times.txt": with zin.open(info) as f: header = f.readline() cols = _parse_csv_line(header) trip_id_idx = cols.index("trip_id") stop_id_idx = cols.index("stop_id") seq_idx = cols.index("stop_sequence") pickup_idx = ( cols.index("pickup_type") if "pickup_type" in cols else -1 ) dropoff_idx = ( cols.index("drop_off_type") if "drop_off_type" in cols else -1 ) tmp = tempfile.NamedTemporaryFile( mode="wb", delete=False, suffix=".txt", dir=local_tmp_dir(), ) tmp.write(header) prev_trip = "" seq_counter = 0 for line in f: parts = _parse_csv_line(line) if not parts: continue trip_id = parts[trip_id_idx].strip('"') stop_id = parts[stop_id_idx].strip('"') # Skip trips with backwards times if trip_id in bad_trip_ids: bad_trips_removed += 1 continue # Skip stop_times referencing missing stops if stop_id not in stop_ids: orphan_stops_removed += 1 continue # Fix pass-through stops: set pickup/dropoff to 0 (normal) if pickup_idx >= 0 and dropoff_idx >= 0: pickup = parts[pickup_idx].strip('"') dropoff = parts[dropoff_idx].strip('"') if pickup == "1" and dropoff == "1": parts[pickup_idx] = "0" parts[dropoff_idx] = "0" passthrough_fixed += 1 # Renumber stop_sequence to 0-based if trip_id != prev_trip: prev_trip = trip_id seq_counter = 0 else: seq_counter += 1 old_seq = parts[seq_idx].strip('"') parts[seq_idx] = str(seq_counter) if old_seq != str(seq_counter): seqs_renumbered += 1 tmp.write(_format_csv_row(parts)) tmp.close() zout.write(tmp.name, "stop_times.txt") os.unlink(tmp.name) elif info.filename == "stops.txt": with zin.open(info) as f: header = f.readline() cols = _parse_csv_line(header) lat_idx = cols.index("stop_lat") lon_idx = cols.index("stop_lon") tmp = tempfile.NamedTemporaryFile( mode="wb", delete=False, suffix=".txt", dir=local_tmp_dir(), ) tmp.write(header) for line in f: parts = _parse_csv_line(line) if not parts: continue try: lat = float(parts[lat_idx]) # Fix bogus Irish CIE coordinates (South Atlantic) if lat < 0: # Set to a neutral UK coordinate that won't be routed to parts[lat_idx] = "54.0" parts[lon_idx] = "-2.0" coords_fixed += 1 except ValueError: pass tmp.write(_format_csv_row(parts)) tmp.close() zout.write(tmp.name, "stops.txt") os.unlink(tmp.name) elif info.filename == "routes.txt": with zin.open(info) as f: header = f.readline() cols = _parse_csv_line(header) rt_idx = cols.index("route_type") tmp = tempfile.NamedTemporaryFile( mode="wb", delete=False, suffix=".txt", dir=local_tmp_dir(), ) tmp.write(header) for line in f: parts = _parse_csv_line(line) if not parts: continue if parts[rt_idx].strip('"') == "714": parts[rt_idx] = "3" route_types_fixed += 1 tmp.write(_format_csv_row(parts)) tmp.close() zout.write(tmp.name, "routes.txt") os.unlink(tmp.name) elif info.filename == "trips.txt": # Remove trips that have backwards travel times with zin.open(info) as f: header = f.readline() cols = _parse_csv_line(header) trip_id_idx = cols.index("trip_id") tmp = tempfile.NamedTemporaryFile( mode="wb", delete=False, suffix=".txt", dir=local_tmp_dir(), ) tmp.write(header) for line in f: parts = _parse_csv_line(line) if not parts: continue if parts[trip_id_idx].strip('"') not in bad_trip_ids: tmp.write(line) tmp.close() zout.write(tmp.name, "trips.txt") os.unlink(tmp.name) elif info.filename == "calendar.txt": # Cap end_date year to 2099 with zin.open(info) as f: header = f.readline() cols = _parse_csv_line(header) end_idx = cols.index("end_date") tmp = tempfile.NamedTemporaryFile( mode="wb", delete=False, suffix=".txt", dir=local_tmp_dir(), ) tmp.write(header) for line in f: parts = _parse_csv_line(line) if not parts: continue date_val = parts[end_idx].strip('"') if len(date_val) == 8: try: year = int(date_val[:4]) if year > 2099: parts[end_idx] = "20991231" except ValueError: pass tmp.write(_format_csv_row(parts)) tmp.close() zout.write(tmp.name, "calendar.txt") os.unlink(tmp.name) else: zout.writestr(info, zin.read(info)) print(f" Pass-through stops fixed: {passthrough_fixed}") print(f" Orphan stop references removed: {orphan_stops_removed}") print(f" Bad trip stop_times removed: {bad_trips_removed}") print(f" Stop sequences renumbered: {seqs_renumbered}") print(f" Bogus coordinates fixed: {coords_fixed}") print(f" Route types 714→3 fixed: {route_types_fixed}") print(f" Saved to {dst}") def _docker_run_dtd2mysql( network: str, db_container: str, volumes: list[str], args: list[str] ) -> None: """Run dtd2mysql in a Node.js container on the same Docker network as MariaDB.""" cmd = [ "docker", "run", "--rm", "--network", network, "-e", f"DATABASE_HOSTNAME={db_container}", "-e", "DATABASE_USERNAME=root", "-e", "DATABASE_PASSWORD=root", "-e", "DATABASE_NAME=dtd", ] for v in volumes: cmd.extend(["-v", v]) # Install zip (needed for --gtfs-zip) then run dtd2mysql inner = ( "apt-get update -qq && apt-get install -y -qq zip > /dev/null 2>&1 && npx --yes dtd2mysql " + " ".join(args) ) cmd.extend(["node:20", "bash", "-c", inner]) subprocess.run(cmd, check=True) def convert_national_rail_to_gtfs(raw_dir: Path, output_dir: Path) -> Path: """Convert National Rail CIF to GTFS using dtd2mysql + MariaDB Docker. Runs both MariaDB and dtd2mysql as Docker containers on a shared network, since Docker port forwarding is not available in all environments. Then cleans the output for R5 compatibility. """ dest = output_dir / "national_rail_gtfs.zip" if dest.exists(): print(f"National Rail GTFS already exists: {dest}") return dest raw_dest = raw_dir / "national_rail_gtfs_raw.zip" if not raw_dest.exists(): db_container = "propertymap-mariadb-temp" network = "propertymap-dtd-net" print("Creating Docker network and starting MariaDB...") subprocess.run(["docker", "network", "create", network], capture_output=True) subprocess.run( [ "docker", "run", "-d", "--name", db_container, "--network", network, "-e", "MARIADB_ROOT_PASSWORD=root", "-e", "MARIADB_DATABASE=dtd", "mariadb:latest", ], check=True, ) try: # Wait for MariaDB to be ready print(" Waiting for MariaDB to be ready...") for attempt in range(30): result = subprocess.run( [ "docker", "exec", db_container, "mariadb", "-uroot", "-proot", "-e", "SELECT 1", ], capture_output=True, ) if result.returncode == 0: break time.sleep(2) else: raise RuntimeError("MariaDB did not become ready in time") raw_abs = str(raw_dir.resolve()) print("Importing CIF timetable into MariaDB...") _docker_run_dtd2mysql( network, db_container, volumes=[f"{raw_abs}:/data:ro"], args=["--timetable", "/data/national_rail_cif.zip"], ) print("Exporting GTFS from MariaDB...") _docker_run_dtd2mysql( network, db_container, volumes=[f"{raw_abs}:/output"], args=["--gtfs-zip", "/output/national_rail_gtfs_raw.zip"], ) finally: print("Cleaning up Docker resources...") subprocess.run(["docker", "stop", db_container], capture_output=True) subprocess.run(["docker", "rm", db_container], capture_output=True) subprocess.run(["docker", "network", "rm", network], capture_output=True) # Clean the raw GTFS for R5 compatibility clean_national_rail_gtfs(raw_dest, dest) return dest def main() -> None: parser = argparse.ArgumentParser( description="Download and prepare transit network data for R5 routing engine" ) parser.add_argument( "--output", type=Path, required=True, help="Output directory for transit data", ) parser.add_argument( "--skip-tfl", action="store_true", help="Skip TfL TransXChange download and conversion", ) parser.add_argument( "--skip-national-rail", action="store_true", help="Skip National Rail CIF download and conversion", ) args = parser.parse_args() output_dir: Path = args.output raw_dir = output_dir / "raw" raw_dir.mkdir(parents=True, exist_ok=True) # 1. Download, clean, and frequency-convert BODS GTFS download_osm_pbf(raw_dir) bods_raw = download_bods_gtfs(raw_dir) bods_cleaned = raw_dir / "bods_gtfs_cleaned.zip" clean_gtfs(bods_raw, bods_cleaned) bods_final = output_dir / "bods_gtfs.zip" convert_high_freq_to_frequency_based(bods_cleaned, bods_final) # 2. TfL TransXChange → GTFS if args.skip_tfl: print("Skipping TfL (--skip-tfl)") else: download_tfl_transxchange(raw_dir) convert_tfl_to_gtfs(raw_dir, output_dir) # 3. National Rail CIF → GTFS if args.skip_national_rail: print("Skipping National Rail (--skip-national-rail)") else: cif = download_national_rail_cif(raw_dir) if cif is not None: convert_national_rail_to_gtfs(raw_dir, output_dir) # Summary print() print("Transit data ready for R5:") for f in sorted(output_dir.iterdir()): if f.is_dir() or f.name.startswith("."): continue size_mb = f.stat().st_size / (1024 * 1024) print(f" {f.name}: {size_mb:.1f} MB") print() print("IMPORTANT: If you previously built a network from London-only data,") print("delete the stale cache before running R5:") print(" rm -f property-data/r5-network/network.dat") if __name__ == "__main__": main()