Lots of improvements

2026-03-10 22:05:51 +00:00 · 2026-03-10 22:05:51 +00:00 · 80a5a2a774
commit 80a5a2a774
parent ef921361ec
21 changed files with 489 additions and 337 deletions
--- a/pipeline/download/transit_network.py
+++ b/pipeline/download/transit_network.py
@ -8,6 +8,7 @@ Downloads:

 Then processes for R5 compatibility:
  - Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100)
+  - Converts high-frequency metro/tram services to frequency-based GTFS
  - Converts TfL TransXChange to GTFS via transxchange2gtfs
  - Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker)

@ -20,12 +21,15 @@ Output directory: property-data/transit/
 import argparse
 import json
 import os
+import shutil
+import statistics
 import subprocess
 import tempfile
 import time
 import urllib.parse
 import urllib.request
 import zipfile
+from collections import defaultdict
 from pathlib import Path

 from tqdm import tqdm
@ -184,6 +188,229 @@ def clean_gtfs(src: Path, dst: Path) -> None:
    print(f"  Saved to {dst}")


+def _parse_gtfs_time(time_str: str) -> int | None:
+    """Turn a GTFS ``HH:MM:SS`` value into seconds past midnight.
+
+    Surrounding double quotes are ignored. Any value that is not exactly three
+    colon-separated integers yields ``None``. Hours are not capped at 24.
+    """
+    fields = time_str.strip('"').split(":")
+    if len(fields) < 2:
+        # No colon at all: not a time value.
+        return None
+    try:
+        # Unpacking fails with ValueError on a wrong field count, as does int()
+        # on a non-numeric field, so both cases land in the same handler.
+        hours, minutes, seconds = (int(field) for field in fields)
+    except ValueError:
+        return None
+    return hours * 3600 + minutes * 60 + seconds
+
+
+def _secs_to_gtfs_time(s: int) -> str:
+    """Format a count of seconds past midnight as a GTFS ``HH:MM:SS`` string.
+
+    Hours are not wrapped at 24, so values past midnight (e.g. 90000 seconds)
+    render as ``25:00:00``.
+    """
+    hours, remainder = divmod(s, 3600)
+    minutes, seconds = divmod(remainder, 60)
+    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
+
+
+def _gtfs_rows(zf: zipfile.ZipFile, name: str):
+    """Stream GTFS table *name* from *zf*: yield the header row, then data rows.
+
+    Rows are parsed with the csv module so quoted fields that contain commas
+    (common in route and stop names) keep their column positions. A leading
+    UTF-8 BOM is dropped, header names are stripped of surrounding whitespace,
+    and blank lines are skipped. Undecodable bytes are carried through with
+    ``surrogateescape`` so they can be written back unchanged. An empty file
+    yields a single empty header list.
+
+    Raises:
+        KeyError: if *name* is not a member of the archive.
+    """
+    import csv
+    import io
+
+    with zf.open(name) as raw, io.TextIOWrapper(
+        raw, encoding="utf-8-sig", errors="surrogateescape", newline=""
+    ) as text:
+        reader = csv.reader(text)
+        yield [col.strip() for col in next(reader, [])]
+        for row in reader:
+            if row:  # csv.reader returns [] for blank lines
+                yield row
+
+
+def _gtfs_copy_without_trips(
+    zin: zipfile.ZipFile,
+    zout: zipfile.ZipFile,
+    name: str,
+    trips_to_remove: set[str],
+) -> None:
+    """Copy table *name* from *zin* into *zout*, dropping rows of removed trips.
+
+    Rows are streamed straight into the output archive, so no temporary file
+    is needed and memory use stays flat even for a very large stop_times.txt.
+
+    Raises:
+        ValueError: if the table has no ``trip_id`` column.
+    """
+    import csv
+    import io
+
+    rows = _gtfs_rows(zin, name)
+    header = next(rows)
+    trip_id_idx = header.index("trip_id")
+
+    # force_zip64: the member size is unknown up front and may exceed 2 GiB.
+    with zout.open(name, "w", force_zip64=True) as raw, io.TextIOWrapper(
+        raw, encoding="utf-8", errors="surrogateescape", newline=""
+    ) as text:
+        writer = csv.writer(text, lineterminator="\n")
+        writer.writerow(header)
+        writer.writerows(
+            row
+            for row in rows
+            if not (len(row) > trip_id_idx and row[trip_id_idx] in trips_to_remove)
+        )
+
+
+def convert_high_freq_to_frequency_based(
+    src: Path, dst: Path, *, max_headway_minutes: int = 15
+) -> None:
+    """Convert high-frequency scheduled services to frequency-based GTFS entries.
+
+    Identifies metro (route_type=1) and tram (route_type=0) routes with regular
+    headways under max_headway_minutes, then creates frequencies.txt entries and
+    removes redundant trips. R5's RAPTOR produces smoother percentile results for
+    frequency-based services, matching the "just turn up" reality of high-frequency
+    metro/tram services.
+
+    Trips are grouped by (route, direction, service, first stop). A group is
+    converted when it has at least 4 trips, at least 3 positive headways, a
+    median headway between 30 seconds and ``max_headway_minutes``, and a
+    coefficient of variation (stdev / mean) of at most 0.5. The earliest trip of
+    a converted group is kept as the template; the others are removed from
+    trips.txt and stop_times.txt.
+
+    The first stop of a trip is the row with the lowest ``stop_sequence``; GTFS
+    only requires the values to increase, not to start at 0. Rows already in the
+    source's frequencies.txt are preserved, and their trips are never regrouped.
+
+    Args:
+        src: Path to the input GTFS zip.
+        dst: Path of the output GTFS zip. If it already exists nothing is done.
+        max_headway_minutes: Upper bound on the median headway of a group for
+            it to be converted.
+
+    Raises:
+        KeyError: if routes.txt, trips.txt or stop_times.txt is missing.
+        ValueError: if a required column is missing from one of those tables.
+    """
+    import csv
+    import io
+
+    if dst.exists():
+        print(f"Frequency-converted GTFS already exists: {dst}")
+        return
+
+    print("Converting high-frequency services to frequency-based...")
+    max_headway_secs = max_headway_minutes * 60
+    freq_cols = ["trip_id", "start_time", "end_time", "headway_secs", "exact_times"]
+
+    with zipfile.ZipFile(src, "r") as zin:
+        # Step 1: Find metro/tram route IDs
+        rows = _gtfs_rows(zin, "routes.txt")
+        cols = next(rows)
+        route_id_idx = cols.index("route_id")
+        rt_idx = cols.index("route_type")
+        last_idx = max(route_id_idx, rt_idx)
+        target_route_ids: set[str] = {
+            row[route_id_idx]
+            for row in rows
+            if len(row) > last_idx and row[rt_idx].strip() in ("0", "1")
+        }
+
+        if not target_route_ids:
+            print("  No metro/tram routes found, copying unchanged")
+            shutil.copy2(src, dst)
+            return
+
+        print(f"  Found {len(target_route_ids)} metro/tram routes")
+
+        # Keep whatever frequency entries the feed already has; their trips are
+        # templates already and must not be treated as scheduled trips.
+        existing_freq_rows: list[list[str]] = []
+        if "frequencies.txt" in zin.namelist():
+            rows = _gtfs_rows(zin, "frequencies.txt")
+            cols = next(rows)
+            for row in rows:
+                record = dict(zip(cols, row))
+                if record.get("trip_id"):
+                    existing_freq_rows.append([record.get(c, "") for c in freq_cols])
+        already_frequency_based = {row[0] for row in existing_freq_rows}
+
+        # Step 2: Map target trips to grouping keys
+        trip_group_key: dict[str, tuple[str, str, str]] = {}
+        rows = _gtfs_rows(zin, "trips.txt")
+        cols = next(rows)
+        trip_id_idx = cols.index("trip_id")
+        route_id_idx = cols.index("route_id")
+        service_idx = cols.index("service_id")
+        dir_idx = cols.index("direction_id") if "direction_id" in cols else -1
+        last_idx = max(trip_id_idx, route_id_idx, service_idx, dir_idx)
+        for row in rows:
+            if len(row) <= last_idx or row[route_id_idx] not in target_route_ids:
+                continue
+            trip_id = row[trip_id_idx]
+            if trip_id in already_frequency_based:
+                continue
+            direction = row[dir_idx] if dir_idx >= 0 else "0"
+            trip_group_key[trip_id] = (row[route_id_idx], direction, row[service_idx])
+
+        print(f"  Found {len(trip_group_key)} trips on target routes")
+
+        # Step 3: Get first departure time and first stop for each target trip.
+        # Value is (lowest stop_sequence seen, departure seconds or None, stop_id).
+        first_stop_time: dict[str, tuple[int, int | None, str]] = {}
+        rows = _gtfs_rows(zin, "stop_times.txt")
+        cols = next(rows)
+        trip_id_idx = cols.index("trip_id")
+        dep_idx = cols.index("departure_time")
+        seq_idx = cols.index("stop_sequence")
+        stop_id_idx = cols.index("stop_id")
+        last_idx = max(trip_id_idx, dep_idx, seq_idx, stop_id_idx)
+        for row in rows:
+            if len(row) <= last_idx:
+                continue
+            trip_id = row[trip_id_idx]
+            if trip_id not in trip_group_key:
+                continue
+            try:
+                seq = int(row[seq_idx])
+            except ValueError:
+                continue  # malformed stop_sequence: ignore this row
+            best = first_stop_time.get(trip_id)
+            if best is None or seq < best[0]:
+                first_stop_time[trip_id] = (
+                    seq,
+                    _parse_gtfs_time(row[dep_idx]),
+                    row[stop_id_idx],
+                )
+
+    # Step 4: Group trips by (route, direction, service, first_stop) and compute headways
+    groups: dict[tuple[str, ...], list[tuple[str, int]]] = defaultdict(list)
+    for trip_id, (_seq, dep_secs, first_stop) in first_stop_time.items():
+        if dep_secs is None:
+            continue  # first stop has no usable departure time
+        route_id, direction, service_id = trip_group_key[trip_id]
+        groups[(route_id, direction, service_id, first_stop)].append(
+            (trip_id, dep_secs)
+        )
+
+    trips_to_remove: set[str] = set()
+    frequency_entries: list[tuple[str, int, int, int]] = []
+    groups_converted = 0
+
+    for trips in groups.values():
+        if len(trips) < 4:
+            continue
+
+        trips.sort(key=lambda x: x[1])
+        departures = [dep for _, dep in trips]
+        # Zero gaps (duplicate departures) carry no headway information.
+        headways = [b - a for a, b in zip(departures, departures[1:]) if b > a]
+
+        if len(headways) < 3:
+            continue
+
+        median_hw = statistics.median(headways)
+        if median_hw > max_headway_secs or median_hw < 30:
+            continue
+
+        # Reject irregular timetables: coefficient of variation above 0.5.
+        # All headways are positive here, so the mean is never zero.
+        if statistics.stdev(headways) / statistics.mean(headways) > 0.5:
+            continue
+
+        # Convert: keep first trip as template, remove the rest
+        template_trip_id = trips[0][0]
+        start_secs = departures[0]
+        # Extend the window one headway past the last departure so it is covered.
+        end_secs = departures[-1] + int(median_hw)
+        headway_rounded = max(60, round(median_hw / 60) * 60)
+
+        frequency_entries.append((template_trip_id, start_secs, end_secs, headway_rounded))
+        trips_to_remove.update(trip_id for trip_id, _ in trips[1:])
+        groups_converted += 1
+
+    print(f"  Converted {groups_converted} trip groups to frequency-based")
+    print(f"  Removing {len(trips_to_remove)} redundant trips")
+    print(f"  Created {len(frequency_entries)} frequency entries")
+
+    # Step 5: Write modified GTFS. Build it under a temporary name and rename at
+    # the end, so an interrupted run never leaves a truncated zip at dst that a
+    # later run would mistake for a finished conversion.
+    partial = dst.with_name(dst.name + ".partial")
+    try:
+        with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
+            partial, "w", zipfile.ZIP_DEFLATED
+        ) as zout:
+            for info in zin.infolist():
+                if info.filename in ("trips.txt", "stop_times.txt"):
+                    _gtfs_copy_without_trips(zin, zout, info.filename, trips_to_remove)
+                elif info.filename == "frequencies.txt":
+                    continue  # rewritten below, merged with the new entries
+                else:
+                    zout.writestr(info, zin.read(info))
+
+            # Write frequencies.txt: pre-existing rows first, then the new ones
+            buf = io.StringIO(newline="")
+            writer = csv.writer(buf, lineterminator="\n")
+            writer.writerow(freq_cols)
+            writer.writerows(existing_freq_rows)
+            for trip_id, start, end, headway in frequency_entries:
+                writer.writerow(
+                    [
+                        trip_id,
+                        _secs_to_gtfs_time(start),
+                        _secs_to_gtfs_time(end),
+                        headway,
+                        0,
+                    ]
+                )
+            zout.writestr(
+                "frequencies.txt",
+                buf.getvalue().encode("utf-8", "surrogateescape"),
+            )
+        os.replace(partial, dst)
+    finally:
+        partial.unlink(missing_ok=True)
+
+    print(f"  Saved to {dst}")
+
+
 def download_tfl_transxchange(raw_dir: Path) -> Path:
    """Download TfL TransXChange timetable bundle."""
    dest = raw_dir / "tfl_transxchange.zip"
@ -655,12 +882,15 @@ def main() -> None:
    raw_dir = output_dir / "raw"
    raw_dir.mkdir(parents=True, exist_ok=True)

-    # 1. Download and clean BODS GTFS
+    # 1. Download, clean, and frequency-convert BODS GTFS
    download_osm_pbf(raw_dir)
    bods_raw = download_bods_gtfs(raw_dir)

-    bods_clean = output_dir / "bods_gtfs.zip"
-    clean_gtfs(bods_raw, bods_clean)
+    bods_cleaned = raw_dir / "bods_gtfs_cleaned.zip"
+    clean_gtfs(bods_raw, bods_cleaned)
+
+    bods_final = output_dir / "bods_gtfs.zip"
+    convert_high_freq_to_frequency_based(bods_cleaned, bods_final)

    # 2. TfL TransXChange → GTFS
    if args.skip_tfl: