Good stuff

2026-02-22 22:36:40 +00:00 · 2026-02-22 22:36:40 +00:00 · 8032011708
commit 8032011708
parent 9da2db707f
32 changed files with 1052 additions and 374 deletions
--- a/pipeline/download/places.py
+++ b/pipeline/download/places.py
@@ -16,16 +16,16 @@ from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST

 PLACE_TYPES = {
    "city",
-    "borough",
-    "town",
-    "suburb",
-    "quarter",
-    "neighbourhood",
-    "village",
-    "hamlet",
-    "locality",
-    "island",
-    "isolated_dwelling",
+    # "borough",
+    # "town",
+    # "suburb",
+    # "quarter",
+    # "neighbourhood",
+    # "village",
+    # "hamlet",
+    # "locality",
+    # "island",
+    # "isolated_dwelling",
 }

 # Suffixes to strip from raw station names before appending the typed suffix.
@@ -115,11 +115,15 @@ class PlaceHandler(osmium.SimpleHandler):
            self._add(name, place_type, lat, lon, population)
            return

-        # railway=station nodes (tube, national rail, DLR, tram, etc.)
+        # Tube stations only (London Underground)
        if n.tags.get("railway") == "station":
-            display_name = _station_display_name(name, dict(n.tags))
-            self._add(display_name, "station", lat, lon, population)
-            return
+            tags = dict(n.tags)
+            station_tag = tags.get("station", "")
+            network = tags.get("network", "").lower()
+            if station_tag == "subway" or "underground" in network:
+                display_name = _station_display_name(name, tags)
+                self._add(display_name, "station", lat, lon, population)
+                return


 def main() -> None:
@@ -133,7 +137,7 @@ def main() -> None:
    args = parser.parse_args()

    pbf_file = args.pbf
-    print(f"Extracting place nodes: {sorted(PLACE_TYPES)} + railway=station")
+    print("Extracting place nodes: cities + tube stations")
    with tqdm(
        unit=" elements",
        unit_scale=True,
--- a/pipeline/download/transit_network.py
+++ b/pipeline/download/transit_network.py
@@ -3,23 +3,27 @@
 Downloads:
  - England OSM PBF from Geofabrik (~1.5GB)
  - BODS GTFS from Bus Open Data Service (~1.5GB, all England bus/tram/ferry)
+  - TfL TransXChange timetables → converted to GTFS
+  - National Rail CIF timetable → converted to GTFS (requires credentials)

 Then processes for R5 compatibility:
-  - Cleans GTFS (fixes stop_times >72h, feed_info year >2100)
-  - Crops OSM PBF to London bounding box via osmium
-  - Crops GTFS to London bounding box (keeps only London-touching trips)
+  - Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100)
+  - Converts TfL TransXChange to GTFS via transxchange2gtfs
+  - Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker)

-Requires: osmium-tool (apt install osmium-tool)
+Requires: osmium-tool, Node.js (npx), Docker (for national rail)

 Output directory: property-data/transit/
-  Final files: london.osm.pbf + bods_gtfs.zip (London-only, R5-ready)
+  raw/england.osm.pbf + bods_gtfs.zip + tfl_gtfs.zip + national_rail_gtfs.zip
 """

 import argparse
-import csv
-import io
+import json
 import os
 import subprocess
+import tempfile
+import time
+import urllib.parse
 import urllib.request
 import zipfile
 from pathlib import Path
@@ -33,18 +37,30 @@ ENGLAND_PBF_URL = (
 # Bus Open Data Service — pre-converted GTFS covering all England bus/tram/ferry
 BODS_GTFS_URL = "https://data.bus-data.dft.gov.uk/timetable/download/gtfs-file/all/"

+# TfL TransXChange timetables (tube, DLR, tram, buses, river bus, cable car)
+TFL_TRANSXCHANGE_URL = (
+    "https://tfl.gov.uk/cdn/static/cms/documents/journey-planner-timetables.zip"
+)
+
+# NaPTAN stops data — needed by transxchange2gtfs (its built-in URL is broken)
+NAPTAN_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
+
+# National Rail Open Data API
+NR_AUTH_URL = "https://opendata.nationalrail.co.uk/authenticate"
+NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/timetable"
+
 USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"

-# London + Home Counties bounding box (~50km buffer around Greater London)
-LONDON_BBOX = {"min_lat": 51.2, "max_lat": 51.85, "min_lon": -0.65, "max_lon": 0.35}

-
-def _download_http(url: str, dest: Path, *, desc: str) -> None:
+def _download_http(url: str, dest: Path, *, desc: str, headers: dict | None = None) -> None:
    """Stream-download a URL to a file with progress bar."""
    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_suffix(dest.suffix + ".tmp")

-    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
+    req_headers = {"User-Agent": USER_AGENT}
+    if headers:
+        req_headers.update(headers)
+    req = urllib.request.Request(url, headers=req_headers)

    with (
        tqdm(unit="B", unit_scale=True, desc=desc) as bar,
@@ -112,8 +128,6 @@ def clean_gtfs(src: Path, dst: Path) -> None:
                        cols.index("departure_time") if "departure_time" in cols else -1
                    )

-                    import tempfile
-
                    tmp = tempfile.NamedTemporaryFile(
                        mode="wb", delete=False, suffix=".txt"
                    )
@@ -170,143 +184,449 @@ def clean_gtfs(src: Path, dst: Path) -> None:
    print(f"  Saved to {dst}")


-def crop_osm_to_london(src: Path, dst: Path) -> None:
-    """Extract London bounding box from England OSM PBF using osmium."""
-    if dst.exists():
-        print(f"London OSM PBF already exists: {dst}")
+def download_tfl_transxchange(raw_dir: Path) -> Path:
+    """Download TfL TransXChange timetable bundle."""
+    dest = raw_dir / "tfl_transxchange.zip"
+    if dest.exists():
+        print(f"TfL TransXChange already exists: {dest}")
+        return dest
+
+    print("Downloading TfL TransXChange timetables...")
+    _download_http(TFL_TRANSXCHANGE_URL, dest, desc="tfl_transxchange.zip")
+    return dest
+
+
+def download_naptan() -> None:
+    """Download NaPTAN stops to /tmp/Stops.csv (needed by transxchange2gtfs)."""
+    dest = Path("/tmp/Stops.csv")
+    if dest.exists():
+        print(f"NaPTAN Stops.csv already exists: {dest}")
        return

-    bbox = LONDON_BBOX
-    bbox_str = f"{bbox['min_lon']},{bbox['min_lat']},{bbox['max_lon']},{bbox['max_lat']}"
+    print("Downloading NaPTAN stops data...")
+    _download_http(NAPTAN_URL, dest, desc="Stops.csv")

-    print(f"Cropping OSM PBF to London bbox ({bbox_str})...")
+
+def convert_tfl_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
+    """Convert TfL TransXChange to GTFS using transxchange2gtfs."""
+    dest = output_dir / "tfl_gtfs.zip"
+    if dest.exists():
+        print(f"TfL GTFS already exists: {dest}")
+        return dest
+
+    txc_path = raw_dir / "tfl_transxchange.zip"
+
+    # Ensure NaPTAN is available (transxchange2gtfs has a broken download URL)
+    download_naptan()
+
+    print("Converting TfL TransXChange → GTFS...")
    subprocess.run(
-        ["osmium", "extract", f"--bbox={bbox_str}", str(src), "-o", str(dst), "--overwrite"],
+        ["npx", "--yes", "transxchange2gtfs", str(txc_path), str(dest)],
        check=True,
    )
-    size_mb = dst.stat().st_size / (1024 * 1024)
-    print(f"  Saved to {dst} ({size_mb:.0f} MB)")
+    size_mb = dest.stat().st_size / (1024 * 1024)
+    print(f"  Saved to {dest} ({size_mb:.1f} MB)")
+    return dest


-def crop_gtfs_to_london(src: Path, dst: Path) -> None:
-    """Crop GTFS to trips touching the London bounding box."""
+def download_national_rail_cif(raw_dir: Path) -> Path | None:
+    """Download National Rail CIF timetable (requires credentials)."""
+    dest = raw_dir / "national_rail_cif.zip"
+    if dest.exists():
+        print(f"National Rail CIF already exists: {dest}")
+        return dest
+
+    email = os.environ.get("NATIONAL_RAIL_EMAIL")
+    password = os.environ.get("NATIONAL_RAIL_PASSWORD")
+    if not email or not password:
+        print("Warning: NATIONAL_RAIL_EMAIL/NATIONAL_RAIL_PASSWORD not set, skipping national rail")
+        return None
+
+    print("Authenticating with National Rail Open Data...")
+    auth_data = urllib.parse.urlencode({"username": email, "password": password}).encode()
+    auth_req = urllib.request.Request(
+        NR_AUTH_URL,
+        data=auth_data,
+        headers={"User-Agent": USER_AGENT, "Content-Type": "application/x-www-form-urlencoded"},
+    )
+    with urllib.request.urlopen(auth_req) as resp:
+        token_data = json.loads(resp.read())
+    token = token_data["token"]
+    print("  Authenticated successfully")
+
+    print("Downloading National Rail CIF timetable...")
+    _download_http(
+        NR_TIMETABLE_URL,
+        dest,
+        desc="national_rail_cif.zip",
+        headers={"X-Auth-Token": token},
+    )
+    return dest
+
+
+def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
+    """Fix R5-incompatible entries in dtd2mysql-generated National Rail GTFS.
+
+    Fixes:
+    - Interior pass-through stops (pickup_type=1, drop_off_type=1) → normal stops.
+      R5 builds TripPatterns from the full stop sequence but may build shorter
+      TripSchedules when stops are non-boarding, causing ArrayIndexOutOfBoundsException.
+    - Removes stop_times referencing stops not in stops.txt.
+    - Removes trips with backwards travel times.
+    - Converts route_type=714 (rail replacement bus) to 3 (bus) for R5 compatibility.
+    - Removes non-standard links.txt file.
+    - Renumbers stop_sequence to 0-based (R5/BODS convention).
+    - Fixes bogus coordinates (lat < 0) on Irish CIE stations.
+    """
    if dst.exists():
-        print(f"London GTFS already exists: {dst}")
+        print(f"Cleaned National Rail GTFS already exists: {dst}")
        return

-    bbox = LONDON_BBOX
+    print("Cleaning National Rail GTFS for R5 compatibility...")

-    print("Cropping GTFS to London area...")
+    # First pass: collect valid stop IDs and find bad trips
+    stop_ids: set[str] = set()
+    bad_trip_ids: set[str] = set()

    with zipfile.ZipFile(src, "r") as zin:
-        # Step 1: Find stops in bbox
-        print("  Finding stops in bbox...")
+        # Load valid stop IDs
        with zin.open("stops.txt") as f:
-            reader = csv.DictReader(io.TextIOWrapper(f))
-            stops_in_bbox = set()
-            all_stops = list(reader)
-            for row in all_stops:
-                lat = float(row["stop_lat"])
-                lon = float(row["stop_lon"])
-                if bbox["min_lat"] <= lat <= bbox["max_lat"] and bbox["min_lon"] <= lon <= bbox["max_lon"]:
-                    stops_in_bbox.add(row["stop_id"])
-        print(f"    {len(stops_in_bbox):,} / {len(all_stops):,} stops in bbox")
+            header = f.readline().decode("utf-8").strip()
+            stop_id_idx = header.split(",").index("stop_id")
+            lat_idx = header.split(",").index("stop_lat")
+            for line in f:
+                parts = line.decode("utf-8", errors="replace").strip().split(",")
+                if parts:
+                    stop_ids.add(parts[stop_id_idx])

-        # Step 2: Find trips touching these stops
-        print("  Finding trips touching London stops...")
+        # Find trips with backwards travel times
        with zin.open("stop_times.txt") as f:
-            reader = csv.DictReader(io.TextIOWrapper(f))
-            st_fieldnames = reader.fieldnames
-            trips_in_bbox = set()
-            for row in reader:
-                if row["stop_id"] in stops_in_bbox:
-                    trips_in_bbox.add(row["trip_id"])
-        print(f"    {len(trips_in_bbox):,} trips touch London")
+            st_header = f.readline().decode("utf-8").strip()
+            st_cols = st_header.split(",")
+            trip_id_idx = st_cols.index("trip_id")
+            dep_idx = st_cols.index("departure_time")

-        # Step 3: Collect all stop_times for those trips
-        print("  Collecting stop_times for London trips...")
-        stop_times_kept = []
-        with zin.open("stop_times.txt") as f:
-            reader = csv.DictReader(io.TextIOWrapper(f))
-            for row in reader:
-                if row["trip_id"] in trips_in_bbox:
-                    stop_times_kept.append(row)
-        stops_needed = {row["stop_id"] for row in stop_times_kept}
-        print(f"    {len(stop_times_kept):,} stop_times kept")
+            prev_trip = ""
+            prev_dep_secs = -1
+            for line in f:
+                parts = line.decode("utf-8", errors="replace").strip().split(",")
+                if not parts:
+                    continue
+                trip_id = parts[trip_id_idx].strip('"')
+                if trip_id != prev_trip:
+                    prev_trip = trip_id
+                    prev_dep_secs = -1

-        # Step 4: Read trips and find needed routes/services/shapes
-        print("  Reading trips...")
-        with zin.open("trips.txt") as f:
-            reader = csv.DictReader(io.TextIOWrapper(f))
-            trips_fieldnames = reader.fieldnames
-            all_trips = list(reader)
-        trips_kept = [t for t in all_trips if t["trip_id"] in trips_in_bbox]
-        routes_needed = {t["route_id"] for t in trips_kept}
-        services_needed = {t["service_id"] for t in trips_kept}
-        shapes_needed = {t.get("shape_id", "") for t in trips_kept} - {""}
+                dep_str = parts[dep_idx].strip('"')
+                if ":" in dep_str:
+                    try:
+                        h, m, s = dep_str.split(":")
+                        dep_secs = int(h) * 3600 + int(m) * 60 + int(s)
+                        if dep_secs < prev_dep_secs:
+                            bad_trip_ids.add(trip_id)
+                        prev_dep_secs = dep_secs
+                    except ValueError:
+                        pass

-        # Step 5: Write cropped GTFS
-        print("  Writing cropped GTFS...")
-        with zipfile.ZipFile(dst, "w", zipfile.ZIP_DEFLATED) as zout:
-            # stops
-            stops_kept = [s for s in all_stops if s["stop_id"] in stops_needed]
-            _write_csv(zout, "stops.txt", list(all_stops[0].keys()), stops_kept)
+    print(f"  Found {len(bad_trip_ids)} trips with backwards travel times")

-            # stop_times
-            _write_csv(zout, "stop_times.txt", st_fieldnames, stop_times_kept)
+    # Second pass: write cleaned zip
+    passthrough_fixed = 0
+    orphan_stops_removed = 0
+    bad_trips_removed = 0
+    seqs_renumbered = 0
+    coords_fixed = 0
+    route_types_fixed = 0

-            # trips
-            _write_csv(zout, "trips.txt", trips_fieldnames, trips_kept)
+    with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
+        dst, "w", zipfile.ZIP_DEFLATED
+    ) as zout:
+        for info in zin.infolist():
+            # Skip non-standard links.txt
+            if info.filename == "links.txt":
+                continue

-            # routes
-            with zin.open("routes.txt") as f:
-                reader = csv.DictReader(io.TextIOWrapper(f))
-                routes_fn = reader.fieldnames
-                routes_kept = [r for r in reader if r["route_id"] in routes_needed]
-            _write_csv(zout, "routes.txt", routes_fn, routes_kept)
+            if info.filename == "stop_times.txt":
+                with zin.open(info) as f:
+                    header = f.readline()
+                    header_str = header.decode("utf-8").strip()
+                    cols = header_str.split(",")
+                    trip_id_idx = cols.index("trip_id")
+                    stop_id_idx = cols.index("stop_id")
+                    seq_idx = cols.index("stop_sequence")
+                    pickup_idx = cols.index("pickup_type") if "pickup_type" in cols else -1
+                    dropoff_idx = cols.index("drop_off_type") if "drop_off_type" in cols else -1

-            # agency (copy all)
-            zout.writestr("agency.txt", zin.read("agency.txt"))
+                    tmp = tempfile.NamedTemporaryFile(
+                        mode="wb", delete=False, suffix=".txt"
+                    )
+                    tmp.write(header)

-            # calendar
-            with zin.open("calendar.txt") as f:
-                reader = csv.DictReader(io.TextIOWrapper(f))
-                cal_fn = reader.fieldnames
-                cal_kept = [r for r in reader if r["service_id"] in services_needed]
-            _write_csv(zout, "calendar.txt", cal_fn, cal_kept)
+                    prev_trip = ""
+                    seq_counter = 0
+                    for line in f:
+                        line_str = line.decode("utf-8", errors="replace").strip()
+                        if not line_str:
+                            continue
+                        parts = line_str.split(",")
+                        trip_id = parts[trip_id_idx].strip('"')
+                        stop_id = parts[stop_id_idx].strip('"')

-            # calendar_dates
-            with zin.open("calendar_dates.txt") as f:
-                reader = csv.DictReader(io.TextIOWrapper(f))
-                cd_fn = reader.fieldnames
-                cd_kept = [r for r in reader if r["service_id"] in services_needed]
-            _write_csv(zout, "calendar_dates.txt", cd_fn, cd_kept)
+                        # Skip trips with backwards times
+                        if trip_id in bad_trip_ids:
+                            bad_trips_removed += 1
+                            continue

-            # shapes (stream — can be very large)
-            print("    Streaming shapes.txt...")
-            with zin.open("shapes.txt") as f:
-                reader = csv.DictReader(io.TextIOWrapper(f))
-                shapes_fn = reader.fieldnames
-                shapes_rows = [r for r in reader if r["shape_id"] in shapes_needed]
-            _write_csv(zout, "shapes.txt", shapes_fn, shapes_rows)
+                        # Skip stop_times referencing missing stops
+                        if stop_id not in stop_ids:
+                            orphan_stops_removed += 1
+                            continue

-            # feed_info + frequencies (copy)
-            zout.writestr("feed_info.txt", zin.read("feed_info.txt"))
-            zout.writestr("frequencies.txt", zin.read("frequencies.txt"))
+                        # Fix pass-through stops: set pickup/dropoff to 0 (normal)
+                        if pickup_idx >= 0 and dropoff_idx >= 0:
+                            pickup = parts[pickup_idx].strip('"')
+                            dropoff = parts[dropoff_idx].strip('"')
+                            if pickup == "1" and dropoff == "1":
+                                parts[pickup_idx] = "0"
+                                parts[dropoff_idx] = "0"
+                                passthrough_fixed += 1

-    size_mb = dst.stat().st_size / (1024 * 1024)
-    print(f"  Saved to {dst} ({size_mb:.0f} MB)")
+                        # Renumber stop_sequence to 0-based
+                        if trip_id != prev_trip:
+                            prev_trip = trip_id
+                            seq_counter = 0
+                        else:
+                            seq_counter += 1
+                        old_seq = parts[seq_idx].strip('"')
+                        parts[seq_idx] = str(seq_counter)
+                        if old_seq != str(seq_counter):
+                            seqs_renumbered += 1
+
+                        tmp.write((",".join(parts) + "\n").encode("utf-8"))
+
+                    tmp.close()
+                    zout.write(tmp.name, "stop_times.txt")
+                    os.unlink(tmp.name)
+
+            elif info.filename == "stops.txt":
+                with zin.open(info) as f:
+                    header = f.readline()
+                    header_str = header.decode("utf-8").strip()
+                    cols = header_str.split(",")
+                    lat_idx = cols.index("stop_lat")
+                    lon_idx = cols.index("stop_lon")
+
+                    tmp = tempfile.NamedTemporaryFile(
+                        mode="wb", delete=False, suffix=".txt"
+                    )
+                    tmp.write(header)
+
+                    for line in f:
+                        line_str = line.decode("utf-8", errors="replace").strip()
+                        if not line_str:
+                            continue
+                        parts = line_str.split(",")
+                        try:
+                            lat = float(parts[lat_idx])
+                            # Fix bogus Irish CIE coordinates (South Atlantic)
+                            if lat < 0:
+                                # Set to a neutral UK coordinate that won't be routed to
+                                parts[lat_idx] = "54.0"
+                                parts[lon_idx] = "-2.0"
+                                coords_fixed += 1
+                        except ValueError:
+                            pass
+                        tmp.write((",".join(parts) + "\n").encode("utf-8"))
+
+                    tmp.close()
+                    zout.write(tmp.name, "stops.txt")
+                    os.unlink(tmp.name)
+
+            elif info.filename == "routes.txt":
+                with zin.open(info) as f:
+                    header = f.readline()
+                    header_str = header.decode("utf-8").strip()
+                    cols = header_str.split(",")
+                    rt_idx = cols.index("route_type")
+
+                    tmp = tempfile.NamedTemporaryFile(
+                        mode="wb", delete=False, suffix=".txt"
+                    )
+                    tmp.write(header)
+
+                    for line in f:
+                        line_str = line.decode("utf-8", errors="replace").strip()
+                        if not line_str:
+                            continue
+                        parts = line_str.split(",")
+                        if parts[rt_idx].strip('"') == "714":
+                            parts[rt_idx] = "3"
+                            route_types_fixed += 1
+                        tmp.write((",".join(parts) + "\n").encode("utf-8"))
+
+                    tmp.close()
+                    zout.write(tmp.name, "routes.txt")
+                    os.unlink(tmp.name)
+
+            elif info.filename == "trips.txt":
+                # Remove trips that have backwards travel times
+                with zin.open(info) as f:
+                    header = f.readline()
+                    header_str = header.decode("utf-8").strip()
+                    cols = header_str.split(",")
+                    trip_id_idx = cols.index("trip_id")
+
+                    tmp = tempfile.NamedTemporaryFile(
+                        mode="wb", delete=False, suffix=".txt"
+                    )
+                    tmp.write(header)
+
+                    for line in f:
+                        line_str = line.decode("utf-8", errors="replace").strip()
+                        if not line_str:
+                            continue
+                        parts = line_str.split(",")
+                        if parts[trip_id_idx].strip('"') not in bad_trip_ids:
+                            tmp.write(line)
+
+                    tmp.close()
+                    zout.write(tmp.name, "trips.txt")
+                    os.unlink(tmp.name)
+
+            elif info.filename == "calendar.txt":
+                # Cap end_date year to 2099
+                with zin.open(info) as f:
+                    header = f.readline()
+                    header_str = header.decode("utf-8").strip()
+                    cols = header_str.split(",")
+                    end_idx = cols.index("end_date")
+
+                    tmp = tempfile.NamedTemporaryFile(
+                        mode="wb", delete=False, suffix=".txt"
+                    )
+                    tmp.write(header)
+
+                    for line in f:
+                        line_str = line.decode("utf-8", errors="replace").strip()
+                        if not line_str:
+                            continue
+                        parts = line_str.split(",")
+                        date_val = parts[end_idx].strip('"')
+                        if len(date_val) == 8:
+                            try:
+                                year = int(date_val[:4])
+                                if year > 2099:
+                                    parts[end_idx] = "20991231"
+                            except ValueError:
+                                pass
+                        tmp.write((",".join(parts) + "\n").encode("utf-8"))
+
+                    tmp.close()
+                    zout.write(tmp.name, "calendar.txt")
+                    os.unlink(tmp.name)
+
+            else:
+                zout.writestr(info, zin.read(info))
+
+    print(f"  Pass-through stops fixed: {passthrough_fixed}")
+    print(f"  Orphan stop references removed: {orphan_stops_removed}")
+    print(f"  Bad trip stop_times removed: {bad_trips_removed}")
+    print(f"  Stop sequences renumbered: {seqs_renumbered}")
+    print(f"  Bogus coordinates fixed: {coords_fixed}")
+    print(f"  Route types 714→3 fixed: {route_types_fixed}")
+    print(f"  Saved to {dst}")


-def _write_csv(
-    zout: zipfile.ZipFile, name: str, fieldnames: list[str], rows: list[dict]
+def _docker_run_dtd2mysql(
+    network: str, db_container: str, volumes: list[str], args: list[str]
 ) -> None:
-    buf = io.StringIO()
-    w = csv.DictWriter(buf, fieldnames=fieldnames)
-    w.writeheader()
-    w.writerows(rows)
-    zout.writestr(name, buf.getvalue())
-    print(f"    {name}: {len(rows):,} rows")
+    """Run dtd2mysql in a Node.js container on the same Docker network as MariaDB."""
+    cmd = [
+        "docker", "run", "--rm", "--network", network,
+        "-e", f"DATABASE_HOSTNAME={db_container}",
+        "-e", "DATABASE_USERNAME=root",
+        "-e", "DATABASE_PASSWORD=root",
+        "-e", "DATABASE_NAME=dtd",
+    ]
+    for v in volumes:
+        cmd.extend(["-v", v])
+    # Install zip (needed for --gtfs-zip) then run dtd2mysql
+    inner = "apt-get update -qq && apt-get install -y -qq zip > /dev/null 2>&1 && npx --yes dtd2mysql " + " ".join(args)
+    cmd.extend(["node:20", "bash", "-c", inner])
+    subprocess.run(cmd, check=True)
+
+
+def convert_national_rail_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
+    """Convert National Rail CIF to GTFS using dtd2mysql + MariaDB Docker.
+
+    Runs both MariaDB and dtd2mysql as Docker containers on a shared network,
+    since Docker port forwarding is not available in all environments.
+    Then cleans the output for R5 compatibility.
+    """
+    dest = output_dir / "national_rail_gtfs.zip"
+    if dest.exists():
+        print(f"National Rail GTFS already exists: {dest}")
+        return dest
+
+    raw_dest = raw_dir / "national_rail_gtfs_raw.zip"
+
+    if not raw_dest.exists():
+        db_container = "propertymap-mariadb-temp"
+        network = "propertymap-dtd-net"
+
+        print("Creating Docker network and starting MariaDB...")
+        subprocess.run(["docker", "network", "create", network], capture_output=True)
+        subprocess.run(
+            [
+                "docker", "run", "-d",
+                "--name", db_container,
+                "--network", network,
+                "-e", "MARIADB_ROOT_PASSWORD=root",
+                "-e", "MARIADB_DATABASE=dtd",
+                "mariadb:latest",
+            ],
+            check=True,
+        )
+
+        try:
+            # Wait for MariaDB to be ready
+            print("  Waiting for MariaDB to be ready...")
+            for attempt in range(30):
+                result = subprocess.run(
+                    ["docker", "exec", db_container, "mariadb", "-uroot", "-proot", "-e", "SELECT 1"],
+                    capture_output=True,
+                )
+                if result.returncode == 0:
+                    break
+                time.sleep(2)
+            else:
+                raise RuntimeError("MariaDB did not become ready in time")
+
+            raw_abs = str(raw_dir.resolve())
+
+            print("Importing CIF timetable into MariaDB...")
+            _docker_run_dtd2mysql(
+                network, db_container,
+                volumes=[f"{raw_abs}:/data:ro"],
+                args=["--timetable", "/data/national_rail_cif.zip"],
+            )
+
+            print("Exporting GTFS from MariaDB...")
+            _docker_run_dtd2mysql(
+                network, db_container,
+                volumes=[f"{raw_abs}:/output"],
+                args=["--gtfs-zip", "/output/national_rail_gtfs_raw.zip"],
+            )
+
+        finally:
+            print("Cleaning up Docker resources...")
+            subprocess.run(["docker", "stop", db_container], capture_output=True)
+            subprocess.run(["docker", "rm", db_container], capture_output=True)
+            subprocess.run(["docker", "network", "rm", network], capture_output=True)
+
+    # Clean the raw GTFS for R5 compatibility
+    clean_national_rail_gtfs(raw_dest, dest)
+    return dest


 def main() -> None:
@@ -319,26 +639,43 @@ def main() -> None:
        required=True,
        help="Output directory for transit data",
    )
+    parser.add_argument(
+        "--skip-tfl",
+        action="store_true",
+        help="Skip TfL TransXChange download and conversion",
+    )
+    parser.add_argument(
+        "--skip-national-rail",
+        action="store_true",
+        help="Skip National Rail CIF download and conversion",
+    )
    args = parser.parse_args()

    output_dir: Path = args.output
    raw_dir = output_dir / "raw"
    raw_dir.mkdir(parents=True, exist_ok=True)

-    # Download raw data
-    england_pbf = download_osm_pbf(raw_dir)
+    # 1. Download and clean BODS GTFS
+    download_osm_pbf(raw_dir)
    bods_raw = download_bods_gtfs(raw_dir)

-    # Clean GTFS (fix R5 incompatibilities)
-    bods_clean = raw_dir / "bods_gtfs_clean.zip"
+    bods_clean = output_dir / "bods_gtfs.zip"
    clean_gtfs(bods_raw, bods_clean)

-    # Crop to London area for R5 (full England requires >30GB RAM)
-    london_pbf = output_dir / "london.osm.pbf"
-    crop_osm_to_london(england_pbf, london_pbf)
+    # 2. TfL TransXChange → GTFS
+    if args.skip_tfl:
+        print("Skipping TfL (--skip-tfl)")
+    else:
+        download_tfl_transxchange(raw_dir)
+        convert_tfl_to_gtfs(raw_dir, output_dir)

-    london_gtfs = output_dir / "bods_gtfs.zip"
-    crop_gtfs_to_london(bods_clean, london_gtfs)
+    # 3. National Rail CIF → GTFS
+    if args.skip_national_rail:
+        print("Skipping National Rail (--skip-national-rail)")
+    else:
+        cif = download_national_rail_cif(raw_dir)
+        if cif is not None:
+            convert_national_rail_to_gtfs(raw_dir, output_dir)

    # Summary
    print()
@@ -349,6 +686,11 @@ def main() -> None:
        size_mb = f.stat().st_size / (1024 * 1024)
        print(f"  {f.name}: {size_mb:.1f} MB")

+    print()
+    print("IMPORTANT: If you previously built a network from London-only data,")
+    print("delete the stale cache before running R5:")
+    print("  rm -f property-data/r5-network/network.dat")
+

 if __name__ == "__main__":
    main()
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@@ -8,37 +8,10 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
 MIN_FLOOR_AREA_M2 = 10


-def _join_journey_times(
-    wide: pl.LazyFrame,
-    journey_times_path: Path,
-    destination_name: str,
-) -> pl.LazyFrame:
-    """Join journey times for a single destination, renaming columns appropriately."""
-    journey_times = (
-        pl.scan_parquet(journey_times_path)
-        .select(
-            "postcode",
-            pl.col("public_transport_quick_minutes").alias(
-                f"Public transport to {destination_name} (mins)"
-            ),
-            pl.col("cycling_minutes").alias(f"Cycling to {destination_name} (mins)"),
-        )
-        .sort(f"Public transport to {destination_name} (mins)", nulls_last=True)
-        .group_by("postcode")
-        .first()
-    )
-    return wide.join(journey_times, on="postcode", how="left")
-
-
 _AREA_COLUMNS = [
    "Postcode",
    "lat",
    "lon",
-    # Transport
-    "Public transport to Bank (mins)",
-    "Cycling to Bank (mins)",
-    "Public transport to Fitzrovia (mins)",
-    "Cycling to Fitzrovia (mins)",
    # Deprivation
    "Income Score (rate)",
    "Employment Score (rate)",
@@ -97,8 +70,6 @@ def _build(
    arcgis_path: Path,
    iod_path: Path,
    poi_proximity_path: Path,
-    journey_times_bank_path: Path,
-    journey_times_fitzrovia_path: Path,
    ethnicity_path: Path,
    crime_path: Path,
    noise_path: Path,
@@ -138,9 +109,6 @@ def _build(
    )
    wide = wide.join(arcgis, on="postcode", how="left")

-    wide = _join_journey_times(wide, journey_times_bank_path, "Bank")
-    wide = _join_journey_times(wide, journey_times_fitzrovia_path, "Fitzrovia")
-
    iod = pl.scan_parquet(iod_path)
    wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")

@@ -382,18 +350,6 @@ def main():
        type=Path,
        help="POI proximity counts parquet file (optional)",
    )
-    parser.add_argument(
-        "--journey-times-bank",
-        type=Path,
-        default=None,
-        help="Journey times to Bank parquet file",
-    )
-    parser.add_argument(
-        "--journey-times-fitzrovia",
-        type=Path,
-        default=None,
-        help="Journey times to Fitzrovia parquet file",
-    )
    parser.add_argument(
        "--ethnicity",
        type=Path,
@@ -446,8 +402,6 @@ def main():
        arcgis_path=args.arcgis,
        iod_path=args.iod,
        poi_proximity_path=args.poi_proximity,
-        journey_times_bank_path=args.journey_times_bank,
-        journey_times_fitzrovia_path=args.journey_times_fitzrovia,
        ethnicity_path=args.ethnicity,
        crime_path=args.crime,
        noise_path=args.noise,