"""Download and prepare transit network data for R5 routing.

Downloads:
- England OSM PBF from Geofabrik (~1.5GB)
- BODS GTFS from Bus Open Data Service (~1.5GB, all England bus/tram/ferry)

Then processes for R5 compatibility:
- Cleans GTFS (drops stop_times rows with hours >72, caps feed_info end dates with year >2100)
- Crops OSM PBF to London bounding box via osmium
- Crops GTFS to London bounding box (keeps only London-touching trips)

Requires: osmium-tool (apt install osmium-tool)

Output directory: property-data/transit/
Final files: london.osm.pbf + bods_gtfs.zip (London-only, R5-ready)
"""
|
|
|
|
import argparse
|
|
import csv
|
|
import io
|
|
import os
|
|
import subprocess
|
|
import urllib.request
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
from tqdm import tqdm
|
|
|
|
# Geofabrik extract of England in OSM PBF format.
ENGLAND_PBF_URL = "https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf"

# Bus Open Data Service — pre-converted GTFS covering all England bus/tram/ferry
BODS_GTFS_URL = "https://data.bus-data.dft.gov.uk/timetable/download/gtfs-file/all/"

# Sent as the User-Agent header on every download request.
USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"

# London + Home Counties bounding box (~50km buffer around Greater London)
LONDON_BBOX = {
    "min_lat": 51.2,
    "max_lat": 51.85,
    "min_lon": -0.65,
    "max_lon": 0.35,
}
|
|
|
|
|
|
def _download_http(url: str, dest: Path, *, desc: str) -> None:
    """Stream-download a URL to a file with a progress bar.

    The body is written to ``<dest>.tmp`` and only moved onto ``dest`` once
    the transfer has completed, so callers that skip work when ``dest``
    exists never pick up a partial file.

    Args:
        url: Address to fetch; the request carries the ``USER_AGENT`` header.
        dest: Final location of the downloaded file (parents are created).
        desc: Label shown next to the progress bar.

    Raises:
        OSError: If the server announced a Content-Length and a different
            number of bytes was received (truncated transfer).
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_suffix(dest.suffix + ".tmp")

    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})

    try:
        with urllib.request.urlopen(req) as resp:
            length = resp.headers.get("Content-Length")
            expected = int(length) if length else None
            received = 0
            with (
                tqdm(total=expected, unit="B", unit_scale=True, desc=desc) as bar,
                open(tmp, "wb") as f,
            ):
                while chunk := resp.read(1 << 20):
                    f.write(chunk)
                    received += len(chunk)
                    bar.update(len(chunk))

        # A dropped connection can surface as a plain short read, so compare
        # against the announced size before the file is treated as complete.
        if expected is not None and received != expected:
            raise OSError(
                f"Incomplete download of {url}: got {received} of {expected} bytes"
            )
    except BaseException:
        # Do not leave a partial temp file behind (also on Ctrl-C).
        tmp.unlink(missing_ok=True)
        raise

    # replace() overwrites an existing destination on every platform.
    tmp.replace(dest)
    print(f"  Saved to {dest}")
|
|
|
|
|
|
def download_osm_pbf(output_dir: Path) -> Path:
    """Fetch the England OSM PBF extract from Geofabrik unless already present.

    Returns the path ``output_dir / "england.osm.pbf"`` in either case.
    """
    target = output_dir / "england.osm.pbf"
    if not target.exists():
        print("Downloading England OSM PBF (~1.5 GB)...")
        _download_http(ENGLAND_PBF_URL, target, desc="england.osm.pbf")
    else:
        # Reuse the cached file from an earlier run.
        print(f"OSM PBF already exists: {target}")
    return target
|
|
|
|
|
|
def download_bods_gtfs(output_dir: Path) -> Path:
    """Fetch the BODS GTFS zip (all England bus/tram/ferry) unless already present.

    Returns the path ``output_dir / "bods_gtfs_raw.zip"`` in either case.
    """
    target = output_dir / "bods_gtfs_raw.zip"
    if not target.exists():
        print("Downloading BODS GTFS (~1.5 GB)...")
        _download_http(BODS_GTFS_URL, target, desc="bods_gtfs_raw.zip")
    else:
        # Reuse the cached file from an earlier run.
        print(f"BODS GTFS already exists: {target}")
    return target
|
|
|
|
|
|
def clean_gtfs(src: Path, dst: Path) -> None:
    """Fix R5-incompatible entries in a GTFS zip.

    - Removes stop_times rows with an arrival/departure hour > 72
    - Caps feed_info end dates whose year is > 2100 to 20991231

    Every other member is copied unchanged. The result is written to
    ``<dst>.tmp`` and moved onto ``dst`` only after the whole archive has been
    produced, so an interrupted run cannot leave a truncated ``dst`` that the
    exists-check below would accept on the next run.

    Args:
        src: Raw GTFS zip to read.
        dst: Cleaned GTFS zip to create; nothing is done if it already exists.
    """
    if dst.exists():
        print(f"Cleaned GTFS already exists: {dst}")
        return

    print("Cleaning GTFS for R5 compatibility...")
    tmp = dst.with_suffix(dst.suffix + ".tmp")
    try:
        with (
            zipfile.ZipFile(src, "r") as zin,
            zipfile.ZipFile(tmp, "w", zipfile.ZIP_DEFLATED) as zout,
        ):
            for info in zin.infolist():
                if info.filename == "stop_times.txt":
                    dropped = _copy_stop_times_within_72h(zin, zout, info)
                    print(f"  stop_times: dropped {dropped} rows with hours > 72")
                elif info.filename == "feed_info.txt":
                    text = zin.read(info).decode("utf-8")
                    zout.writestr("feed_info.txt", _cap_feed_end_dates(text))
                else:
                    zout.writestr(info, zin.read(info))
    except BaseException:
        # Never leave a half-written archive behind (also on Ctrl-C).
        tmp.unlink(missing_ok=True)
        raise

    tmp.replace(dst)
    print(f"  Saved to {dst}")


def _gtfs_hour(value: str) -> int:
    """Return the hour part of an ``H:MM:SS`` field, or -1 if it cannot be parsed."""
    hour_text, sep, _ = value.strip().strip('"').partition(":")
    if not sep:
        return -1
    try:
        return int(hour_text)
    except ValueError:
        return -1


def _copy_stop_times_within_72h(
    zin: zipfile.ZipFile, zout: zipfile.ZipFile, info: zipfile.ZipInfo
) -> int:
    """Stream stop_times.txt into ``zout`` without rows whose hour is > 72.

    Kept lines are copied byte-for-byte and blank lines are skipped. Returns
    the number of dropped rows.
    """
    dropped = 0
    # force_zip64: the uncompressed size is unknown up front and may exceed 2 GiB.
    with (
        zin.open(info) as fin,
        zout.open("stop_times.txt", "w", force_zip64=True) as fout,
    ):
        header = fin.readline()
        fout.write(header)
        # utf-8-sig + quote stripping so a BOM or quoted header still matches.
        cols = [c.strip().strip('"') for c in header.decode("utf-8-sig").split(",")]
        time_idxs = [
            cols.index(name)
            for name in ("arrival_time", "departure_time")
            if name in cols
        ]

        for line in fin:
            text = line.decode("utf-8", errors="replace").strip()
            if not text:
                continue
            # Plain split keeps this pass cheap on very large files; it assumes
            # no quoted comma appears before the time columns.
            parts = text.split(",")
            if any(_gtfs_hour(parts[i]) > 72 for i in time_idxs if i < len(parts)):
                dropped += 1
            else:
                fout.write(line)
    return dropped


def _cap_feed_end_dates(text: str) -> str:
    """Return feed_info.txt content with far-future end dates capped.

    Any ``*end_date*`` column holding an 8-character date whose year is
    greater than 2100 is rewritten to ``20991231``. Lines are re-joined with
    ``\\n`` and a trailing newline.
    """
    # splitlines() also strips the "\r" of CRLF files, which would otherwise
    # stay glued to the last column and defeat the length check below.
    lines = text.strip().splitlines()
    if not lines:
        return "\n"

    header = lines[0]
    end_cols = [
        i for i, name in enumerate(header.split(",")) if "end_date" in name.lower()
    ]

    fixed = [header]
    for line in lines[1:]:
        parts = line.split(",")
        for i in end_cols:
            if i >= len(parts):
                continue
            date_val = parts[i].strip().strip('"')
            if (
                len(date_val) == 8
                and date_val[:4].isdecimal()
                and int(date_val[:4]) > 2100
            ):
                parts[i] = "20991231"
                print(f"  feed_info: capped end_date {date_val} → 20991231")
        fixed.append(",".join(parts))
    return "\n".join(fixed) + "\n"
|
|
|
|
|
|
def crop_osm_to_london(src: Path, dst: Path) -> None:
    """Cut the London bounding box out of the England OSM PBF with osmium.

    Does nothing when ``dst`` already exists. Otherwise runs
    ``osmium extract`` on ``src`` and raises ``CalledProcessError`` if the
    command exits with a non-zero status.
    """
    if dst.exists():
        print(f"London OSM PBF already exists: {dst}")
        return

    # osmium expects the box as west,south,east,north.
    corners = ("min_lon", "min_lat", "max_lon", "max_lat")
    bbox_str = ",".join(str(LONDON_BBOX[key]) for key in corners)

    print(f"Cropping OSM PBF to London bbox ({bbox_str})...")
    command = [
        "osmium",
        "extract",
        f"--bbox={bbox_str}",
        str(src),
        "-o",
        str(dst),
        "--overwrite",
    ]
    subprocess.run(command, check=True)

    size_mb = dst.stat().st_size / (1024 * 1024)
    print(f"  Saved to {dst} ({size_mb:.0f} MB)")
|
|
|
|
|
|
def crop_gtfs_to_london(src: Path, dst: Path) -> None:
|
|
"""Crop GTFS to trips touching the London bounding box."""
|
|
if dst.exists():
|
|
print(f"London GTFS already exists: {dst}")
|
|
return
|
|
|
|
bbox = LONDON_BBOX
|
|
|
|
print("Cropping GTFS to London area...")
|
|
|
|
with zipfile.ZipFile(src, "r") as zin:
|
|
# Step 1: Find stops in bbox
|
|
print(" Finding stops in bbox...")
|
|
with zin.open("stops.txt") as f:
|
|
reader = csv.DictReader(io.TextIOWrapper(f))
|
|
stops_in_bbox = set()
|
|
all_stops = list(reader)
|
|
for row in all_stops:
|
|
lat = float(row["stop_lat"])
|
|
lon = float(row["stop_lon"])
|
|
if bbox["min_lat"] <= lat <= bbox["max_lat"] and bbox["min_lon"] <= lon <= bbox["max_lon"]:
|
|
stops_in_bbox.add(row["stop_id"])
|
|
print(f" {len(stops_in_bbox):,} / {len(all_stops):,} stops in bbox")
|
|
|
|
# Step 2: Find trips touching these stops
|
|
print(" Finding trips touching London stops...")
|
|
with zin.open("stop_times.txt") as f:
|
|
reader = csv.DictReader(io.TextIOWrapper(f))
|
|
st_fieldnames = reader.fieldnames
|
|
trips_in_bbox = set()
|
|
for row in reader:
|
|
if row["stop_id"] in stops_in_bbox:
|
|
trips_in_bbox.add(row["trip_id"])
|
|
print(f" {len(trips_in_bbox):,} trips touch London")
|
|
|
|
# Step 3: Collect all stop_times for those trips
|
|
print(" Collecting stop_times for London trips...")
|
|
stop_times_kept = []
|
|
with zin.open("stop_times.txt") as f:
|
|
reader = csv.DictReader(io.TextIOWrapper(f))
|
|
for row in reader:
|
|
if row["trip_id"] in trips_in_bbox:
|
|
stop_times_kept.append(row)
|
|
stops_needed = {row["stop_id"] for row in stop_times_kept}
|
|
print(f" {len(stop_times_kept):,} stop_times kept")
|
|
|
|
# Step 4: Read trips and find needed routes/services/shapes
|
|
print(" Reading trips...")
|
|
with zin.open("trips.txt") as f:
|
|
reader = csv.DictReader(io.TextIOWrapper(f))
|
|
trips_fieldnames = reader.fieldnames
|
|
all_trips = list(reader)
|
|
trips_kept = [t for t in all_trips if t["trip_id"] in trips_in_bbox]
|
|
routes_needed = {t["route_id"] for t in trips_kept}
|
|
services_needed = {t["service_id"] for t in trips_kept}
|
|
shapes_needed = {t.get("shape_id", "") for t in trips_kept} - {""}
|
|
|
|
# Step 5: Write cropped GTFS
|
|
print(" Writing cropped GTFS...")
|
|
with zipfile.ZipFile(dst, "w", zipfile.ZIP_DEFLATED) as zout:
|
|
# stops
|
|
stops_kept = [s for s in all_stops if s["stop_id"] in stops_needed]
|
|
_write_csv(zout, "stops.txt", list(all_stops[0].keys()), stops_kept)
|
|
|
|
# stop_times
|
|
_write_csv(zout, "stop_times.txt", st_fieldnames, stop_times_kept)
|
|
|
|
# trips
|
|
_write_csv(zout, "trips.txt", trips_fieldnames, trips_kept)
|
|
|
|
# routes
|
|
with zin.open("routes.txt") as f:
|
|
reader = csv.DictReader(io.TextIOWrapper(f))
|
|
routes_fn = reader.fieldnames
|
|
routes_kept = [r for r in reader if r["route_id"] in routes_needed]
|
|
_write_csv(zout, "routes.txt", routes_fn, routes_kept)
|
|
|
|
# agency (copy all)
|
|
zout.writestr("agency.txt", zin.read("agency.txt"))
|
|
|
|
# calendar
|
|
with zin.open("calendar.txt") as f:
|
|
reader = csv.DictReader(io.TextIOWrapper(f))
|
|
cal_fn = reader.fieldnames
|
|
cal_kept = [r for r in reader if r["service_id"] in services_needed]
|
|
_write_csv(zout, "calendar.txt", cal_fn, cal_kept)
|
|
|
|
# calendar_dates
|
|
with zin.open("calendar_dates.txt") as f:
|
|
reader = csv.DictReader(io.TextIOWrapper(f))
|
|
cd_fn = reader.fieldnames
|
|
cd_kept = [r for r in reader if r["service_id"] in services_needed]
|
|
_write_csv(zout, "calendar_dates.txt", cd_fn, cd_kept)
|
|
|
|
# shapes (stream — can be very large)
|
|
print(" Streaming shapes.txt...")
|
|
with zin.open("shapes.txt") as f:
|
|
reader = csv.DictReader(io.TextIOWrapper(f))
|
|
shapes_fn = reader.fieldnames
|
|
shapes_rows = [r for r in reader if r["shape_id"] in shapes_needed]
|
|
_write_csv(zout, "shapes.txt", shapes_fn, shapes_rows)
|
|
|
|
# feed_info + frequencies (copy)
|
|
zout.writestr("feed_info.txt", zin.read("feed_info.txt"))
|
|
zout.writestr("frequencies.txt", zin.read("frequencies.txt"))
|
|
|
|
size_mb = dst.stat().st_size / (1024 * 1024)
|
|
print(f" Saved to {dst} ({size_mb:.0f} MB)")
|
|
|
|
|
|
def _write_csv(
|
|
zout: zipfile.ZipFile, name: str, fieldnames: list[str], rows: list[dict]
|
|
) -> None:
|
|
buf = io.StringIO()
|
|
w = csv.DictWriter(buf, fieldnames=fieldnames)
|
|
w.writeheader()
|
|
w.writerows(rows)
|
|
zout.writestr(name, buf.getvalue())
|
|
print(f" {name}: {len(rows):,} rows")
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: download, clean and crop the transit data.

    Raw downloads and the cleaned GTFS go to ``<output>/raw``; the cropped
    London files are written directly into ``<output>``.
    """
    cli = argparse.ArgumentParser(
        description="Download and prepare transit network data for R5 routing engine"
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output directory for transit data",
    )
    output_dir: Path = cli.parse_args().output

    raw_dir = output_dir / "raw"
    raw_dir.mkdir(parents=True, exist_ok=True)

    # Download raw data
    england_pbf = download_osm_pbf(raw_dir)
    bods_raw = download_bods_gtfs(raw_dir)

    # Clean GTFS (fix R5 incompatibilities)
    bods_clean = raw_dir / "bods_gtfs_clean.zip"
    clean_gtfs(bods_raw, bods_clean)

    # Crop to London area for R5 (full England requires >30GB RAM)
    crop_osm_to_london(england_pbf, output_dir / "london.osm.pbf")
    crop_gtfs_to_london(bods_clean, output_dir / "bods_gtfs.zip")

    # Summary: list the non-hidden, non-directory entries of the output dir
    print()
    print("Transit data ready for R5:")
    for entry in sorted(output_dir.iterdir()):
        if not entry.is_dir() and not entry.name.startswith("."):
            size_mb = entry.stat().st_size / (1024 * 1024)
            print(f"  {entry.name}: {size_mb:.1f} MB")
|
|
|
|
|
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|