More

2026-02-10 22:21:15 +00:00 · 2026-02-10 22:21:15 +00:00 · 3599803589
commit 3599803589
parent 1f68ca0512
43 changed files with 3578 additions and 262 deletions
--- a/pipeline/download/init.py
+++ b/pipeline/download/init.py
--- a/pipeline/download/pois.py
+++ b/pipeline/download/pois.py
@ -129,15 +129,21 @@ def main() -> None:
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
+    parser.add_argument(
+        "--pbf", type=Path, default=None, help="Path to existing PBF file (skips download)"
+    )
    args = parser.parse_args()

    with tempfile.TemporaryDirectory() as cache_dir:
-        pbf_file = Path(cache_dir) / "great-britain-latest.osm.pbf"
-
-        if not pbf_file.exists():
-            download_pbf(pbf_file)
+        if args.pbf and args.pbf.exists():
+            pbf_file = args.pbf
+            print(f"Using provided PBF file at {pbf_file}")
        else:
-            print(f"Using cached PBF file at {pbf_file}")
+            pbf_file = Path(cache_dir) / "great-britain-latest.osm.pbf"
+            if not pbf_file.exists():
+                download_pbf(pbf_file)
+            else:
+                print(f"Using cached PBF file at {pbf_file}")

        print(f"Tag keys: {POI_TAG_KEYS}")

--- a/pipeline/download/transit_network.py
+++ b/pipeline/download/transit_network.py
@ -0,0 +1,354 @@
+"""Download and prepare transit network data for R5 routing.
+
+Downloads:
+  - England OSM PBF from Geofabrik (~1.5GB)
+  - BODS GTFS from Bus Open Data Service (~1.5GB, all England bus/tram/ferry)
+
+Then processes for R5 compatibility:
+  - Cleans GTFS (drops stop_times rows whose hour exceeds 72, caps feed_info end_date years above 2100 to 20991231)
+  - Crops OSM PBF to London bounding box via osmium
+  - Crops GTFS to London bounding box (keeps only London-touching trips)
+
+Requires: osmium-tool (apt install osmium-tool)
+
+Output directory: property-data/transit/
+  Final files: london.osm.pbf + bods_gtfs.zip (London-only, R5-ready)
+"""
+
+import argparse
+import csv
+import io
+import os
+import subprocess
+import urllib.request
+import zipfile
+from pathlib import Path
+
+from tqdm import tqdm
+
+# Geofabrik extract of England in OSM PBF format; fetched by download_osm_pbf().
+ENGLAND_PBF_URL = (
+    "https://download.geofabrik.de/europe/united-kingdom/england-latest.osm.pbf"
+)
+
+# Bus Open Data Service — pre-converted GTFS covering all England bus/tram/ferry
+# (fetched by download_bods_gtfs()).
+BODS_GTFS_URL = "https://data.bus-data.dft.gov.uk/timetable/download/gtfs-file/all/"
+
+# Sent as the User-Agent request header by _download_http().
+USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"
+
+# London + Home Counties bounding box (~50km buffer around Greater London)
+# Values are compared against GTFS stop_lat / stop_lon and handed to
+# `osmium extract --bbox` in min_lon,min_lat,max_lon,max_lat order.
+# NOTE(review): the "~50km buffer" figure cannot be checked from this file —
+# confirm against the intended study area.
+LONDON_BBOX = {"min_lat": 51.2, "max_lat": 51.85, "min_lon": -0.65, "max_lon": 0.35}
+
+
+def _download_http(url: str, dest: Path, *, desc: str) -> None:
+    """Stream-download a URL to a file, showing a progress bar.
+
+    The payload is written to a sibling ``<dest>.tmp`` file and only moved onto
+    ``dest`` once the whole response has been read, so ``dest`` never holds a
+    partial download. If anything fails, the temporary file is removed and the
+    exception propagates to the caller.
+
+    Args:
+        url: Address to fetch; ``USER_AGENT`` is sent as the User-Agent header.
+        dest: Final file path. Missing parent directories are created.
+        desc: Label shown next to the progress bar.
+    """
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    tmp = dest.with_suffix(dest.suffix + ".tmp")
+
+    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
+
+    try:
+        # Connect first so a failed request does not leave an empty bar behind.
+        with urllib.request.urlopen(req) as resp:
+            length = resp.headers.get("Content-Length")
+            # Without a usable Content-Length the bar runs with an unknown total.
+            total = int(length) if length and length.isdecimal() else None
+            with (
+                tqdm(total=total, unit="B", unit_scale=True, desc=desc) as bar,
+                open(tmp, "wb") as f,
+            ):
+                while chunk := resp.read(1 << 20):
+                    f.write(chunk)
+                    bar.update(len(chunk))
+        # replace() overwrites an existing dest, unlike rename() on Windows.
+        tmp.replace(dest)
+    except BaseException:
+        tmp.unlink(missing_ok=True)
+        raise
+    print(f"  Saved to {dest}")
+
+
+def download_osm_pbf(output_dir: Path) -> Path:
+    """Fetch the England OSM PBF extract unless it is already on disk.
+
+    Args:
+        output_dir: Directory that holds (or will hold) ``england.osm.pbf``.
+
+    Returns:
+        Path of the PBF file inside ``output_dir``.
+    """
+    target = output_dir / "england.osm.pbf"
+    if not target.exists():
+        print("Downloading England OSM PBF (~1.5 GB)...")
+        _download_http(ENGLAND_PBF_URL, target, desc="england.osm.pbf")
+    else:
+        print(f"OSM PBF already exists: {target}")
+    return target
+
+
+def download_bods_gtfs(output_dir: Path) -> Path:
+    """Fetch the raw BODS GTFS archive unless it is already on disk.
+
+    Args:
+        output_dir: Directory that holds (or will hold) ``bods_gtfs_raw.zip``.
+
+    Returns:
+        Path of the archive inside ``output_dir``.
+    """
+    target = output_dir / "bods_gtfs_raw.zip"
+    if not target.exists():
+        print("Downloading BODS GTFS (~1.5 GB)...")
+        _download_http(BODS_GTFS_URL, target, desc="bods_gtfs_raw.zip")
+    else:
+        print(f"BODS GTFS already exists: {target}")
+    return target
+
+
+def _gtfs_hour(value: str) -> int | None:
+    """Return the hour part of an ``H:MM:SS`` GTFS time, or None if unparsable."""
+    hour, sep, _ = value.strip().partition(":")
+    if not sep:
+        return None
+    try:
+        return int(hour)
+    except ValueError:
+        return None
+
+
+def _filter_stop_times(fin, fout, *, max_hour: int = 72) -> int:
+    """Copy stop_times CSV rows from *fin* to *fout*, dropping over-long hours.
+
+    A row is dropped when its arrival_time or departure_time has an hour
+    greater than *max_hour*. Blank lines are skipped. Returns the number of
+    dropped rows.
+    """
+    reader = csv.reader(fin)
+    writer = csv.writer(fout, lineterminator="\n")
+    header = next(reader, None)
+    if header is None:
+        return 0
+    writer.writerow(header)
+
+    names = [name.strip() for name in header]
+    time_cols = [
+        names.index(col) for col in ("arrival_time", "departure_time") if col in names
+    ]
+
+    dropped = 0
+    for row in reader:
+        if not row:
+            continue
+        too_late = any(
+            idx < len(row) and (_gtfs_hour(row[idx]) or 0) > max_hour
+            for idx in time_cols
+        )
+        if too_late:
+            dropped += 1
+        else:
+            writer.writerow(row)
+    return dropped
+
+
+def _cap_feed_end_dates(text: str) -> str:
+    """Return feed_info CSV *text* with end dates beyond year 2100 set to 20991231.
+
+    Every column whose name contains ``end_date`` is checked; values that are
+    not eight decimal digits are left untouched.
+    """
+    rows = [row for row in csv.reader(io.StringIO(text, newline="")) if row]
+    if not rows:
+        return text
+
+    date_cols = [i for i, name in enumerate(rows[0]) if "end_date" in name.lower()]
+    for row in rows[1:]:
+        for i in date_cols:
+            if i >= len(row):
+                continue
+            date_val = row[i].strip()
+            if len(date_val) == 8 and date_val[:4].isdecimal() and int(date_val[:4]) > 2100:
+                row[i] = "20991231"
+                print(f"  feed_info: capped end_date {date_val} → 20991231")
+
+    out = io.StringIO()
+    csv.writer(out, lineterminator="\n").writerows(rows)
+    return out.getvalue()
+
+
+def clean_gtfs(src: Path, dst: Path) -> None:
+    """Fix R5-incompatible entries in a GTFS archive.
+
+    - Removes stop_times rows whose arrival or departure hour is above 72.
+    - Rewrites feed_info end dates with a year above 2100 to 20991231.
+    - Copies every other archive member unchanged.
+
+    Both tables are parsed with the csv module so quoted fields that contain
+    commas keep their column alignment. The result is built in ``<dst>.tmp``
+    and moved onto ``dst`` only on success, so an interrupted run cannot leave
+    a truncated archive that later runs would mistake for a finished one.
+
+    Args:
+        src: Raw GTFS zip to read.
+        dst: Cleaned GTFS zip to create; nothing is done if it already exists.
+    """
+    if dst.exists():
+        print(f"Cleaned GTFS already exists: {dst}")
+        return
+
+    print("Cleaning GTFS for R5 compatibility...")
+    tmp = dst.with_suffix(dst.suffix + ".tmp")
+    try:
+        with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
+            tmp, "w", zipfile.ZIP_DEFLATED
+        ) as zout:
+            for info in zin.infolist():
+                if info.filename == "stop_times.txt":
+                    # Stream straight into the output archive; force_zip64 is
+                    # needed because the final size is unknown up front.
+                    with io.TextIOWrapper(
+                        zin.open(info),
+                        encoding="utf-8-sig",
+                        errors="replace",
+                        newline="",
+                    ) as fin:
+                        with io.TextIOWrapper(
+                            zout.open("stop_times.txt", "w", force_zip64=True),
+                            encoding="utf-8",
+                            newline="",
+                        ) as fout:
+                            dropped = _filter_stop_times(fin, fout)
+                    print(f"  stop_times: dropped {dropped} rows with hours > 72")
+                elif info.filename == "feed_info.txt":
+                    text = zin.read(info).decode("utf-8-sig")
+                    zout.writestr("feed_info.txt", _cap_feed_end_dates(text))
+                else:
+                    zout.writestr(info, zin.read(info))
+        tmp.replace(dst)
+    except BaseException:
+        tmp.unlink(missing_ok=True)
+        raise
+
+    print(f"  Saved to {dst}")
+
+
+def crop_osm_to_london(src: Path, dst: Path) -> None:
+    """Cut the ``LONDON_BBOX`` area out of an OSM PBF by running ``osmium extract``.
+
+    Args:
+        src: Input PBF handed to osmium.
+        dst: Output PBF; nothing is done if it already exists.
+
+    Raises:
+        subprocess.CalledProcessError: If osmium exits with a non-zero status.
+    """
+    if dst.exists():
+        print(f"London OSM PBF already exists: {dst}")
+        return
+
+    # osmium expects the box as min_lon,min_lat,max_lon,max_lat.
+    corner_keys = ("min_lon", "min_lat", "max_lon", "max_lat")
+    bbox_str = ",".join(str(LONDON_BBOX[key]) for key in corner_keys)
+
+    print(f"Cropping OSM PBF to London bbox ({bbox_str})...")
+    command = [
+        "osmium",
+        "extract",
+        f"--bbox={bbox_str}",
+        str(src),
+        "-o",
+        str(dst),
+        "--overwrite",
+    ]
+    subprocess.run(command, check=True)
+
+    size_mb = dst.stat().st_size / (1024 * 1024)
+    print(f"  Saved to {dst} ({size_mb:.0f} MB)")
+
+
+def _gtfs_text(zin: zipfile.ZipFile, name: str) -> io.TextIOWrapper:
+    """Open archive member *name* as UTF-8 text ready for the csv module."""
+    # utf-8-sig drops a leading BOM; newline="" leaves line handling to csv.
+    return io.TextIOWrapper(zin.open(name), encoding="utf-8-sig", newline="")
+
+
+def _copy_matching_rows(
+    zin: zipfile.ZipFile,
+    zout: zipfile.ZipFile,
+    name: str,
+    key: str,
+    wanted: set[str],
+    collect: tuple[str, ...] = (),
+) -> dict[str, set[str]]:
+    """Stream table *name* from *zin* to *zout*, keeping rows with ``row[key]`` in *wanted*.
+
+    Rows are written as they are read, so the table is never held in memory.
+
+    Returns:
+        For each column in *collect*, the set of non-empty values seen in the
+        kept rows.
+    """
+    seen: dict[str, set[str]] = {col: set() for col in collect}
+    kept = 0
+    with _gtfs_text(zin, name) as fin:
+        reader = csv.DictReader(fin)
+        fieldnames = reader.fieldnames or []
+        # force_zip64: the size of the filtered table is unknown up front.
+        with io.TextIOWrapper(
+            zout.open(name, "w", force_zip64=True), encoding="utf-8", newline=""
+        ) as fout:
+            # extrasaction="ignore" tolerates rows with more cells than the header.
+            writer = csv.DictWriter(fout, fieldnames=fieldnames, extrasaction="ignore")
+            writer.writeheader()
+            for row in reader:
+                if row[key] not in wanted:
+                    continue
+                writer.writerow(row)
+                kept += 1
+                for col, values in seen.items():
+                    if row.get(col):
+                        values.add(row[col])
+    print(f"    {name}: {kept:,} rows")
+    return seen
+
+
+def crop_gtfs_to_london(
+    src: Path, dst: Path, bbox: dict[str, float] | None = None
+) -> None:
+    """Crop a GTFS archive to the trips that touch a bounding box.
+
+    A trip is kept when at least one of its stop_times refers to a stop inside
+    the box; kept trips retain all of their stop_times. Stops, routes,
+    calendar, calendar_dates, shapes and frequencies are reduced to the rows
+    those trips reference, and parent stations of kept stops are retained.
+    agency.txt and feed_info.txt are copied as-is. Optional tables missing
+    from the source are skipped instead of aborting the run.
+
+    The archive is built in ``<dst>.tmp`` and moved onto ``dst`` only on
+    success, so a failed run never leaves a partial file behind.
+
+    Args:
+        src: GTFS zip to read.
+        dst: Cropped GTFS zip to create; nothing is done if it already exists.
+        bbox: Mapping with ``min_lat``, ``max_lat``, ``min_lon`` and
+            ``max_lon``. Defaults to ``LONDON_BBOX``.
+
+    Raises:
+        KeyError: If stops.txt, stop_times.txt, trips.txt, routes.txt or
+            agency.txt is missing from the source archive.
+    """
+    if dst.exists():
+        print(f"London GTFS already exists: {dst}")
+        return
+
+    bbox = LONDON_BBOX if bbox is None else bbox
+
+    print("Cropping GTFS to London area...")
+    tmp = dst.with_suffix(dst.suffix + ".tmp")
+    try:
+        with zipfile.ZipFile(src, "r") as zin:
+            members = set(zin.namelist())
+
+            # Step 1: Find stops in bbox (and remember each stop's parent station)
+            print("  Finding stops in bbox...")
+            stops_in_bbox: set[str] = set()
+            parent_of: dict[str, str] = {}
+            total_stops = 0
+            with _gtfs_text(zin, "stops.txt") as f:
+                for row in csv.DictReader(f):
+                    total_stops += 1
+                    if row.get("parent_station"):
+                        parent_of[row["stop_id"]] = row["parent_station"]
+                    try:
+                        lat = float(row["stop_lat"])
+                        lon = float(row["stop_lon"])
+                    except (TypeError, ValueError):
+                        # Rows without usable coordinates cannot be placed.
+                        continue
+                    if (
+                        bbox["min_lat"] <= lat <= bbox["max_lat"]
+                        and bbox["min_lon"] <= lon <= bbox["max_lon"]
+                    ):
+                        stops_in_bbox.add(row["stop_id"])
+            print(f"    {len(stops_in_bbox):,} / {total_stops:,} stops in bbox")
+
+            # Step 2: Find trips touching these stops
+            print("  Finding trips touching London stops...")
+            with _gtfs_text(zin, "stop_times.txt") as f:
+                trips_in_bbox = {
+                    row["trip_id"]
+                    for row in csv.DictReader(f)
+                    if row["stop_id"] in stops_in_bbox
+                }
+            print(f"    {len(trips_in_bbox):,} trips touch London")
+
+            # Step 3: Write cropped GTFS, streaming each table
+            print("  Writing cropped GTFS...")
+            with zipfile.ZipFile(tmp, "w", zipfile.ZIP_DEFLATED) as zout:
+                stops_needed = _copy_matching_rows(
+                    zin, zout, "stop_times.txt", "trip_id", trips_in_bbox,
+                    collect=("stop_id",),
+                )["stop_id"]
+
+                # Pull in parent stations (transitively) so parent_station
+                # references in the kept stops still resolve.
+                frontier = set(stops_needed)
+                while frontier:
+                    frontier = {
+                        parent_of[s] for s in frontier if s in parent_of
+                    } - stops_needed
+                    stops_needed |= frontier
+                _copy_matching_rows(zin, zout, "stops.txt", "stop_id", stops_needed)
+
+                refs = _copy_matching_rows(
+                    zin, zout, "trips.txt", "trip_id", trips_in_bbox,
+                    collect=("route_id", "service_id", "shape_id"),
+                )
+                _copy_matching_rows(
+                    zin, zout, "routes.txt", "route_id", refs["route_id"]
+                )
+
+                # agency (copy all)
+                zout.writestr("agency.txt", zin.read("agency.txt"))
+
+                optional_tables = (
+                    ("calendar.txt", "service_id", refs["service_id"]),
+                    ("calendar_dates.txt", "service_id", refs["service_id"]),
+                    ("shapes.txt", "shape_id", refs["shape_id"]),
+                    ("frequencies.txt", "trip_id", trips_in_bbox),
+                )
+                for name, key, wanted in optional_tables:
+                    if name in members:
+                        _copy_matching_rows(zin, zout, name, key, wanted)
+                    else:
+                        print(f"    {name}: not in source feed, skipped")
+
+                if "feed_info.txt" in members:
+                    zout.writestr("feed_info.txt", zin.read("feed_info.txt"))
+        tmp.replace(dst)
+    except BaseException:
+        tmp.unlink(missing_ok=True)
+        raise
+
+    size_mb = dst.stat().st_size / (1024 * 1024)
+    print(f"  Saved to {dst} ({size_mb:.0f} MB)")
+
+
+def _write_csv(
+    zout: zipfile.ZipFile, name: str, fieldnames: list[str], rows: list[dict]
+) -> None:
+    """Serialise *rows* as CSV and store the result as member *name* of *zout*.
+
+    A header row built from *fieldnames* is written first. The number of data
+    rows is printed once the member has been added.
+    """
+    with io.StringIO() as text:
+        writer = csv.DictWriter(text, fieldnames=fieldnames)
+        writer.writeheader()
+        for record in rows:
+            writer.writerow(record)
+        payload = text.getvalue()
+    zout.writestr(name, payload)
+    print(f"    {name}: {len(rows):,} rows")
+
+
+def main() -> None:
+    """Command-line entry point.
+
+    Downloads the raw OSM and GTFS data into ``<output>/raw``, cleans the GTFS,
+    crops both datasets to London into ``<output>``, then lists the files in
+    the output directory with their sizes.
+    """
+    cli = argparse.ArgumentParser(
+        description="Download and prepare transit network data for R5 routing engine"
+    )
+    cli.add_argument(
+        "--output",
+        type=Path,
+        required=True,
+        help="Output directory for transit data",
+    )
+    output_dir: Path = cli.parse_args().output
+
+    raw_dir = output_dir / "raw"
+    raw_dir.mkdir(parents=True, exist_ok=True)
+
+    # Raw inputs: skipped when the files are already present
+    england_pbf = download_osm_pbf(raw_dir)
+    bods_raw = download_bods_gtfs(raw_dir)
+
+    # GTFS clean-up for R5 compatibility
+    bods_clean = raw_dir / "bods_gtfs_clean.zip"
+    clean_gtfs(bods_raw, bods_clean)
+
+    # London crops (full England requires >30GB RAM in R5)
+    crop_osm_to_london(england_pbf, output_dir / "london.osm.pbf")
+    crop_gtfs_to_london(bods_clean, output_dir / "bods_gtfs.zip")
+
+    # Summary of the final, non-hidden files in the output directory
+    print()
+    print("Transit data ready for R5:")
+    final_files = [
+        entry
+        for entry in sorted(output_dir.iterdir())
+        if not (entry.is_dir() or entry.name.startswith("."))
+    ]
+    for entry in final_files:
+        size_mb = entry.stat().st_size / (1024 * 1024)
+        print(f"  {entry.name}: {size_mb:.1f} MB")
+
+
+if __name__ == "__main__":
+    main()