Fix data pipelines once and for all

2026-06-10 21:27:32 +01:00 · 2026-06-10 21:27:32 +01:00 · 4012e4e047
commit 4012e4e047
parent 08560476c5
46 changed files with 4508 additions and 855 deletions
--- a/pipeline/download/transit_network.py
+++ b/pipeline/download/transit_network.py
@@ -2,24 +2,32 @@

 Downloads:
  - England OSM PBF from Geofabrik (~1.5GB)
-  - BODS GTFS from Bus Open Data Service (~1.5GB, all England bus/tram/ferry)
-  - TfL TransXChange timetables → converted to GTFS
-  - National Rail CIF timetable → converted to GTFS (requires credentials)
+  - BODS GTFS from Bus Open Data Service (~1.5GB; all England bus/tram/ferry,
+    plus London Underground, DLR, London Tramlink and the IFS Cloud Cable Car)
+  - National Rail CIF timetable → converted to GTFS (requires credentials;
+    includes the Elizabeth line, TOC "XR")

 Then processes for R5 compatibility:
  - Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100)
  - Converts high-frequency metro/tram services to frequency-based GTFS
-  - Converts TfL TransXChange to GTFS via transxchange2gtfs
  - Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker)
+  - Validates every produced GTFS zip (active calendar window, plausible UK
+    stop coordinates, non-empty routes/trips/stop_times)

-Requires: osmium-tool, Node.js (npx), Docker (for national rail)
+Note: the legacy TfL TransXChange feed (tfl.gov.uk journey-planner-timetables)
+was removed: that URL serves a 2010-10-28 snapshot whose calendars all expired
+in 2010 and whose stops have empty/0,0 coordinates, so it contributed zero
+service. BODS covers every TfL mode that the removed feed nominally provided.
+
+Requires: osmium-tool, Docker (for national rail)

 Output directory: property-data/transit/
-  raw/england.osm.pbf + bods_gtfs.zip + tfl_gtfs.zip + national_rail_gtfs.zip
+  raw/england.osm.pbf + bods_gtfs.zip + national_rail_gtfs.zip
 """

 import argparse
 import csv
+import datetime as dt
 import io
 import json
 import os
@@ -45,20 +53,18 @@ ENGLAND_PBF_URL = (
 # Bus Open Data Service — pre-converted GTFS covering all England bus/tram/ferry
 BODS_GTFS_URL = "https://data.bus-data.dft.gov.uk/timetable/download/gtfs-file/all/"

-# TfL TransXChange timetables (tube, DLR, tram, buses, river bus, cable car)
-TFL_TRANSXCHANGE_URL = (
-    "https://tfl.gov.uk/cdn/static/cms/documents/journey-planner-timetables.zip"
-)
-
-# NaPTAN stops data — needed by transxchange2gtfs (its built-in URL is broken)
-NAPTAN_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
-
 # National Rail Open Data API
 NR_AUTH_URL = "https://opendata.nationalrail.co.uk/authenticate"
 NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/timetable"

 USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"
-TRANSXCHANGE2GTFS_PACKAGE = "transxchange2gtfs@1.12.0"
+
+# GTFS validation: a feed must have service within this many days of the build
+# date, and at least this fraction of stops must have plausible UK coordinates.
+GTFS_CALENDAR_LOOKAHEAD_DAYS = 60
+GTFS_MIN_VALID_STOP_FRACTION = 0.95
+UK_LAT_RANGE = (49.0, 61.0)
+UK_LON_RANGE = (-9.0, 2.5)


 def _download_http(
@@ -468,89 +474,175 @@ def convert_high_freq_to_frequency_based(
    print(f"  Saved to {dst}")


-def download_tfl_transxchange(raw_dir: Path) -> Path:
-    """Download TfL TransXChange timetable bundle."""
-    dest = raw_dir / "tfl_transxchange.zip"
-    if dest.exists():
-        print(f"TfL TransXChange already exists: {dest}")
-        return dest
-
-    print("Downloading TfL TransXChange timetables...")
-    _download_http(TFL_TRANSXCHANGE_URL, dest, desc="tfl_transxchange.zip")
-    return dest
+def _gtfs_has_data_row(z: zipfile.ZipFile, filename: str) -> bool:
+    """True if a GTFS file has at least one non-empty data row after the header."""
+    with z.open(filename) as f:
+        f.readline()  # header
+        for line in f:
+            if _parse_csv_line(line):
+                return True
+    return False


-def download_naptan() -> None:
-    """Download NaPTAN stops to the local temp dir for transxchange2gtfs."""
-    dest = local_tmp_dir() / "Stops.csv"
-    if dest.exists():
-        print(f"NaPTAN Stops.csv already exists: {dest}")
-        return
+def _calendar_active_in_window(
+    z: zipfile.ZipFile, names: set[str], window_start: int, window_end: int
+) -> bool:
+    """True if calendar.txt/calendar_dates.txt have service in [start, end].

-    print("Downloading NaPTAN stops data...")
-    _download_http(NAPTAN_URL, dest, desc="Stops.csv")
-
-
-def convert_tfl_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
-    """Convert TfL TransXChange to GTFS using transxchange2gtfs."""
-    dest = output_dir / "tfl_gtfs.zip"
-    if dest.exists():
-        print(f"TfL GTFS already exists: {dest}")
-        return dest
-
-    txc_path = raw_dir / "tfl_transxchange.zip"
-
-    # Ensure NaPTAN is available (transxchange2gtfs has a broken download URL)
-    download_naptan()
-
-    print("Converting TfL TransXChange → GTFS...")
-    # The shim patches known packaging/runtime issues in the pinned npm package
-    # before loading its CLI from npx's temporary install.
-    shim_path = Path(__file__).with_name("transxchange2gtfs_shim.js")
-    subprocess.run(
-        [
-            "npx",
-            "--yes",
-            "--package",
-            TRANSXCHANGE2GTFS_PACKAGE,
-            "sh",
-            "-c",
-            "\n".join(
-                [
-                    'bin="$(command -v transxchange2gtfs)"',
-                    'script="$(readlink -f "$bin")"',
-                    'pkg_dir="$(dirname "$(dirname "$script")")"',
-                    'shim="$1"',
-                    "shift",
-                    'exec node "$shim" "$pkg_dir" "$@"',
-                ]
-            ),
-            "transxchange2gtfs",
-            str(shim_path.resolve()),
-            str(txc_path.resolve()),
-            str(dest.resolve()),
-        ],
-        check=True,
+    Dates are compared as YYYYMMDD integers. A calendar.txt row counts when its
+    date range overlaps the window AND at least one weekday flag is set; a
+    calendar_dates.txt row counts when it adds service (exception_type=1) on a
+    date inside the window.
+    """
+    weekdays = (
+        "monday",
+        "tuesday",
+        "wednesday",
+        "thursday",
+        "friday",
+        "saturday",
+        "sunday",
+    )
+    if "calendar.txt" in names:
+        with z.open("calendar.txt") as f:
+            cols = _parse_csv_line(f.readline())
+            try:
+                start_idx = cols.index("start_date")
+                end_idx = cols.index("end_date")
+            except ValueError:
+                return False
+            day_idxs = [cols.index(d) for d in weekdays if d in cols]
+            for line in f:
+                parts = _parse_csv_line(line)
+                if not parts:
+                    continue
+                try:
+                    start = int(parts[start_idx].strip('"'))
+                    end = int(parts[end_idx].strip('"'))
+                except (ValueError, IndexError):
+                    continue
+                if start > window_end or end < window_start:
+                    continue
+                if day_idxs and not any(
+                    parts[i].strip('"') == "1" for i in day_idxs if i < len(parts)
+                ):
+                    continue
+                return True
+
+    if "calendar_dates.txt" in names:
+        with z.open("calendar_dates.txt") as f:
+            cols = _parse_csv_line(f.readline())
+            try:
+                date_idx = cols.index("date")
+                exc_idx = cols.index("exception_type")
+            except ValueError:
+                return False
+            for line in f:
+                parts = _parse_csv_line(line)
+                if not parts:
+                    continue
+                try:
+                    date = int(parts[date_idx].strip('"'))
+                except (ValueError, IndexError):
+                    continue
+                if exc_idx < len(parts) and parts[exc_idx].strip('"') != "1":
+                    continue
+                if window_start <= date <= window_end:
+                    return True
+
+    return False
+
+
+def validate_gtfs_feed(path: Path, feed_name: str, *, today: dt.date | None = None) -> None:
+    """Sanity-check a produced/downloaded GTFS zip; raise RuntimeError if dead.
+
+    Guards against silently shipping a feed that contributes zero service (as
+    the old TfL dump did: 2010 calendars, empty/0,0 stop coordinates). Checks:
+      (a) calendar.txt/calendar_dates.txt have at least one service active
+          within [today, today + GTFS_CALENDAR_LOOKAHEAD_DAYS];
+      (b) stops.txt is non-empty and >= GTFS_MIN_VALID_STOP_FRACTION of stops
+          have plausible UK coordinates (lat 49-61, lon -9..2.5, not 0,0);
+      (c) routes.txt, trips.txt and stop_times.txt each have data rows.
+    """
+    if today is None:
+        today = dt.date.today()
+    window_start = int(today.strftime("%Y%m%d"))
+    window_end = int(
+        (today + dt.timedelta(days=GTFS_CALENDAR_LOOKAHEAD_DAYS)).strftime("%Y%m%d")
+    )
+
+    def fail(reason: str) -> None:
+        raise RuntimeError(
+            f"GTFS validation failed for feed '{feed_name}' ({path}): {reason}"
+        )
+
+    print(f"Validating GTFS feed '{feed_name}'...")
+    if not path.exists() or not zipfile.is_zipfile(path):
+        fail("not a valid zip file")
+
+    with zipfile.ZipFile(path) as z:
+        names = set(z.namelist())
+
+        # (c) core files present and non-empty
+        for required in ("routes.txt", "trips.txt", "stop_times.txt", "stops.txt"):
+            if required not in names:
+                fail(f"missing {required}")
+            if not _gtfs_has_data_row(z, required):
+                fail(f"{required} has no data rows")
+
+        # (a) at least one service active in the routing window
+        if "calendar.txt" not in names and "calendar_dates.txt" not in names:
+            fail("has neither calendar.txt nor calendar_dates.txt")
+        if not _calendar_active_in_window(z, names, window_start, window_end):
+            fail(
+                f"no service active between {window_start} and {window_end} — "
+                "the feed's calendars are stale/expired and it would contribute "
+                "zero service to routing"
+            )
+
+        # (b) stops have plausible UK coordinates
+        total_stops = 0
+        valid_stops = 0
+        with z.open("stops.txt") as f:
+            cols = _parse_csv_line(f.readline())
+            try:
+                lat_idx = cols.index("stop_lat")
+                lon_idx = cols.index("stop_lon")
+            except ValueError:
+                fail("stops.txt is missing stop_lat/stop_lon columns")
+            for line in f:
+                parts = _parse_csv_line(line)
+                if not parts:
+                    continue
+                total_stops += 1
+                try:
+                    lat = float(parts[lat_idx].strip('"'))
+                    lon = float(parts[lon_idx].strip('"'))
+                except (ValueError, IndexError):
+                    continue  # empty/garbage coordinate → invalid
+                if lat == 0.0 and lon == 0.0:
+                    continue
+                if (
+                    UK_LAT_RANGE[0] <= lat <= UK_LAT_RANGE[1]
+                    and UK_LON_RANGE[0] <= lon <= UK_LON_RANGE[1]
+                ):
+                    valid_stops += 1
+        if total_stops == 0:
+            fail("stops.txt has no stops")
+        fraction = valid_stops / total_stops
+        if fraction < GTFS_MIN_VALID_STOP_FRACTION:
+            fail(
+                f"only {valid_stops}/{total_stops} stops "
+                f"({fraction:.1%}) have plausible UK coordinates "
+                f"(lat {UK_LAT_RANGE[0]}-{UK_LAT_RANGE[1]}, "
+                f"lon {UK_LON_RANGE[0]}..{UK_LON_RANGE[1]}, non-null, not 0,0); "
+                f"need >= {GTFS_MIN_VALID_STOP_FRACTION:.0%}"
+            )
+
+    print(
+        f"  OK: service active in window, {valid_stops}/{total_stops} stops "
+        f"({fraction:.1%}) with plausible UK coordinates"
    )
-    required_files = {
-        "agency.txt",
-        "calendar.txt",
-        "calendar_dates.txt",
-        "routes.txt",
-        "stop_times.txt",
-        "stops.txt",
-        "trips.txt",
-    }
-    if not dest.exists() or not zipfile.is_zipfile(dest):
-        raise RuntimeError(f"transxchange2gtfs did not create a valid GTFS zip: {dest}")
-    with zipfile.ZipFile(dest) as z:
-        missing = required_files - set(z.namelist())
-    if missing:
-        missing_str = ", ".join(sorted(missing))
-        raise RuntimeError(f"TfL GTFS zip is missing required files: {missing_str}")
-    size_mb = dest.stat().st_size / (1024 * 1024)
-    print(f"  Saved to {dest} ({size_mb:.1f} MB)")
-    return dest


 def download_national_rail_cif(raw_dir: Path) -> Path | None:
@@ -1007,18 +1099,15 @@ def main() -> None:
        required=True,
        help="Output directory for transit data",
    )
-    parser.add_argument(
-        "--skip-tfl",
-        action="store_true",
-        help="Skip TfL TransXChange download and conversion",
-    )
    args = parser.parse_args()

    output_dir: Path = args.output
    raw_dir = output_dir / "raw"
    raw_dir.mkdir(parents=True, exist_ok=True)

-    # 1. Download, clean, and frequency-convert BODS GTFS
+    # 1. Download, clean, and frequency-convert BODS GTFS. BODS covers all
+    # England bus/tram/ferry plus London Underground, DLR, London Tramlink and
+    # the IFS Cloud Cable Car, so no separate TfL feed is needed.
    download_osm_pbf(raw_dir)
    bods_raw = download_bods_gtfs(raw_dir)

@@ -1027,16 +1116,10 @@ def main() -> None:

    bods_final = output_dir / "bods_gtfs.zip"
    convert_high_freq_to_frequency_based(bods_cleaned, bods_final)
+    validate_gtfs_feed(bods_final, "BODS GTFS")

-    # 2. TfL TransXChange → GTFS
-    if args.skip_tfl:
-        print("Skipping TfL (--skip-tfl)")
-    else:
-        download_tfl_transxchange(raw_dir)
-        convert_tfl_to_gtfs(raw_dir, output_dir)
-
-    # 3. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
-    # reach the ~2,725 railway-station destinations, so a bus/TfL-only network
+    # 2. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
+    # reach the ~2,725 railway-station destinations, so a bus/metro-only network
    # silently overstates every train commute. Missing credentials are a HARD
    # error, so a rail-less network can never ship.
    cif = download_national_rail_cif(raw_dir)
@@ -1048,7 +1131,8 @@ def main() -> None:
            "required; without it the transit network models every train journey "
            "as bus-only and overstates commute times."
        )
-    convert_national_rail_to_gtfs(raw_dir, output_dir)
+    nr_final = convert_national_rail_to_gtfs(raw_dir, output_dir)
+    validate_gtfs_feed(nr_final, "National Rail GTFS")

    # Summary
    print()