Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@ -2,24 +2,32 @@
Downloads:
- England OSM PBF from Geofabrik (~1.5GB)
- BODS GTFS from Bus Open Data Service (~1.5GB, all England bus/tram/ferry)
- TfL TransXChange timetables converted to GTFS
- National Rail CIF timetable converted to GTFS (requires credentials)
- BODS GTFS from Bus Open Data Service (~1.5GB; all England bus/tram/ferry,
plus London Underground, DLR, London Tramlink and the IFS Cloud Cable Car)
- National Rail CIF timetable converted to GTFS (requires credentials;
includes the Elizabeth line, TOC "XR")
Then processes for R5 compatibility:
- Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100)
- Converts high-frequency metro/tram services to frequency-based GTFS
- Converts TfL TransXChange to GTFS via transxchange2gtfs
- Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker)
- Validates every produced GTFS zip (active calendar window, plausible UK
stop coordinates, non-empty routes/trips/stop_times)
Requires: osmium-tool, Node.js (npx), Docker (for national rail)
Note: the legacy TfL TransXChange feed (tfl.gov.uk journey-planner-timetables)
was removed: that URL serves a 2010-10-28 snapshot whose calendars all expired
in 2010 and whose stops have empty/0,0 coordinates, so it contributed zero
service. BODS covers every TfL mode that the legacy feed nominally provided.
Requires: osmium-tool, Docker (for national rail)
Output directory: property-data/transit/
raw/england.osm.pbf + bods_gtfs.zip + tfl_gtfs.zip + national_rail_gtfs.zip
raw/england.osm.pbf + bods_gtfs.zip + national_rail_gtfs.zip
"""
import argparse
import csv
import datetime as dt
import io
import json
import os
@ -45,20 +53,18 @@ ENGLAND_PBF_URL = (
# Bus Open Data Service — pre-converted GTFS covering all England bus/tram/ferry
BODS_GTFS_URL = "https://data.bus-data.dft.gov.uk/timetable/download/gtfs-file/all/"
# TfL TransXChange timetables (tube, DLR, tram, buses, river bus, cable car)
TFL_TRANSXCHANGE_URL = (
"https://tfl.gov.uk/cdn/static/cms/documents/journey-planner-timetables.zip"
)
# NaPTAN stops data — needed by transxchange2gtfs (its built-in URL is broken)
NAPTAN_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
# National Rail Open Data API
NR_AUTH_URL = "https://opendata.nationalrail.co.uk/authenticate"
NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/timetable"
USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"
TRANSXCHANGE2GTFS_PACKAGE = "transxchange2gtfs@1.12.0"
# GTFS validation: a feed must have service within this many days of the build
# date, and at least this fraction of stops must have plausible UK coordinates.
GTFS_CALENDAR_LOOKAHEAD_DAYS = 60
GTFS_MIN_VALID_STOP_FRACTION = 0.95
# Inclusive (min, max) bounds that a stop's stop_lat / stop_lon must fall within
# to count as a plausible UK coordinate during GTFS validation.
UK_LAT_RANGE = (49.0, 61.0)
UK_LON_RANGE = (-9.0, 2.5)
def _download_http(
@ -468,89 +474,175 @@ def convert_high_freq_to_frequency_based(
print(f" Saved to {dst}")
def download_tfl_transxchange(raw_dir: Path) -> Path:
    """Fetch the TfL TransXChange timetable archive into *raw_dir*.

    The archive is stored as ``tfl_transxchange.zip``. When that file is
    already present nothing is downloaded. Returns the archive path either way.
    """
    archive = raw_dir / "tfl_transxchange.zip"
    if not archive.exists():
        print("Downloading TfL TransXChange timetables...")
        _download_http(TFL_TRANSXCHANGE_URL, archive, desc="tfl_transxchange.zip")
    else:
        # Cached copy from an earlier run: reuse it instead of re-downloading.
        print(f"TfL TransXChange already exists: {archive}")
    return archive
def _gtfs_has_data_row(z: zipfile.ZipFile, filename: str) -> bool:
    """Report whether *filename* inside the GTFS zip holds data beyond its header.

    The first line is treated as the header and discarded; each following line
    is passed through ``_parse_csv_line`` and the first one that yields a truthy
    result makes the answer True. A header-only or empty member gives False.
    """
    with z.open(filename) as member:
        member.readline()  # discard the header row
        return any(_parse_csv_line(raw) for raw in member)
def download_naptan() -> None:
"""Download NaPTAN stops to the local temp dir for transxchange2gtfs."""
dest = local_tmp_dir() / "Stops.csv"
if dest.exists():
print(f"NaPTAN Stops.csv already exists: {dest}")
return
def _calendar_active_in_window(
z: zipfile.ZipFile, names: set[str], window_start: int, window_end: int
) -> bool:
"""True if calendar.txt/calendar_dates.txt have service in [start, end].
print("Downloading NaPTAN stops data...")
_download_http(NAPTAN_URL, dest, desc="Stops.csv")
def convert_tfl_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
"""Convert TfL TransXChange to GTFS using transxchange2gtfs."""
dest = output_dir / "tfl_gtfs.zip"
if dest.exists():
print(f"TfL GTFS already exists: {dest}")
return dest
txc_path = raw_dir / "tfl_transxchange.zip"
# Ensure NaPTAN is available (transxchange2gtfs has a broken download URL)
download_naptan()
print("Converting TfL TransXChange → GTFS...")
# The shim patches known packaging/runtime issues in the pinned npm package
# before loading its CLI from npx's temporary install.
shim_path = Path(__file__).with_name("transxchange2gtfs_shim.js")
subprocess.run(
[
"npx",
"--yes",
"--package",
TRANSXCHANGE2GTFS_PACKAGE,
"sh",
"-c",
"\n".join(
[
'bin="$(command -v transxchange2gtfs)"',
'script="$(readlink -f "$bin")"',
'pkg_dir="$(dirname "$(dirname "$script")")"',
'shim="$1"',
"shift",
'exec node "$shim" "$pkg_dir" "$@"',
]
),
"transxchange2gtfs",
str(shim_path.resolve()),
str(txc_path.resolve()),
str(dest.resolve()),
],
check=True,
Dates are compared as YYYYMMDD integers. A calendar.txt row counts when its
date range overlaps the window AND at least one weekday flag is set; a
calendar_dates.txt row counts when it adds service (exception_type=1) on a
date inside the window.
"""
weekdays = (
"monday",
"tuesday",
"wednesday",
"thursday",
"friday",
"saturday",
"sunday",
)
if "calendar.txt" in names:
with z.open("calendar.txt") as f:
cols = _parse_csv_line(f.readline())
try:
start_idx = cols.index("start_date")
end_idx = cols.index("end_date")
except ValueError:
return False
day_idxs = [cols.index(d) for d in weekdays if d in cols]
for line in f:
parts = _parse_csv_line(line)
if not parts:
continue
try:
start = int(parts[start_idx].strip('"'))
end = int(parts[end_idx].strip('"'))
except (ValueError, IndexError):
continue
if start > window_end or end < window_start:
continue
if day_idxs and not any(
parts[i].strip('"') == "1" for i in day_idxs if i < len(parts)
):
continue
return True
if "calendar_dates.txt" in names:
with z.open("calendar_dates.txt") as f:
cols = _parse_csv_line(f.readline())
try:
date_idx = cols.index("date")
exc_idx = cols.index("exception_type")
except ValueError:
return False
for line in f:
parts = _parse_csv_line(line)
if not parts:
continue
try:
date = int(parts[date_idx].strip('"'))
except (ValueError, IndexError):
continue
if exc_idx < len(parts) and parts[exc_idx].strip('"') != "1":
continue
if window_start <= date <= window_end:
return True
return False
def validate_gtfs_feed(path: Path, feed_name: str, *, today: dt.date | None = None) -> None:
    """Sanity-check a produced/downloaded GTFS zip; raise RuntimeError if dead.

    Guards against silently shipping a feed that contributes zero service (as
    the old TfL dump did: 2010 calendars, empty/0,0 stop coordinates). Checks,
    in the order they are performed:

    (c) routes.txt, trips.txt, stop_times.txt and stops.txt are each present
        and have data rows;
    (a) calendar.txt/calendar_dates.txt have at least one service active
        within [today, today + GTFS_CALENDAR_LOOKAHEAD_DAYS];
    (b) >= GTFS_MIN_VALID_STOP_FRACTION of the stops in stops.txt have
        plausible UK coordinates (inside UK_LAT_RANGE / UK_LON_RANGE, not 0,0).

    Args:
        path: Location of the GTFS zip to check.
        feed_name: Human-readable feed label used in log and error messages.
        today: First day of the service window; defaults to the current local
            date. Keyword-only so callers can pin the window explicitly.

    Raises:
        RuntimeError: If *path* is not a zip file or any check above fails.
    """
    # Function-scope import: only needed to annotate fail() as never returning,
    # which lets type checkers see that code after a fail() call is unreachable.
    from typing import NoReturn

    if today is None:
        today = dt.date.today()
    # The window is expressed as YYYYMMDD integers, the form in which it is
    # handed to _calendar_active_in_window.
    window_start = int(today.strftime("%Y%m%d"))
    window_end = int(
        (today + dt.timedelta(days=GTFS_CALENDAR_LOOKAHEAD_DAYS)).strftime("%Y%m%d")
    )

    def fail(reason: str) -> NoReturn:
        raise RuntimeError(
            f"GTFS validation failed for feed '{feed_name}' ({path}): {reason}"
        )

    print(f"Validating GTFS feed '{feed_name}'...")
    if not path.exists() or not zipfile.is_zipfile(path):
        fail("not a valid zip file")

    with zipfile.ZipFile(path) as z:
        names = set(z.namelist())

        # (c) core files present and non-empty
        for required in ("routes.txt", "trips.txt", "stop_times.txt", "stops.txt"):
            if required not in names:
                fail(f"missing {required}")
            if not _gtfs_has_data_row(z, required):
                fail(f"{required} has no data rows")

        # (a) at least one service active in the routing window
        if "calendar.txt" not in names and "calendar_dates.txt" not in names:
            fail("has neither calendar.txt nor calendar_dates.txt")
        if not _calendar_active_in_window(z, names, window_start, window_end):
            # The "; " separator matters: these adjacent literals are joined
            # verbatim, so without it the end date runs into the next word.
            fail(
                f"no service active between {window_start} and {window_end}; "
                "the feed's calendars are stale/expired and it would contribute "
                "zero service to routing"
            )

        # (b) stops have plausible UK coordinates
        total_stops = 0
        valid_stops = 0
        with z.open("stops.txt") as f:
            cols = _parse_csv_line(f.readline())
            try:
                lat_idx = cols.index("stop_lat")
                lon_idx = cols.index("stop_lon")
            except ValueError:
                fail("stops.txt is missing stop_lat/stop_lon columns")
            for line in f:
                parts = _parse_csv_line(line)
                if not parts:
                    continue
                # Every non-empty row counts towards the total, so rows with
                # unusable coordinates pull the valid fraction down.
                total_stops += 1
                try:
                    lat = float(parts[lat_idx].strip('"'))
                    lon = float(parts[lon_idx].strip('"'))
                except (ValueError, IndexError):
                    continue  # empty/garbage coordinate → invalid
                if lat == 0.0 and lon == 0.0:
                    continue
                if (
                    UK_LAT_RANGE[0] <= lat <= UK_LAT_RANGE[1]
                    and UK_LON_RANGE[0] <= lon <= UK_LON_RANGE[1]
                ):
                    valid_stops += 1

        # Guard the division below; a stops.txt with no countable rows is dead.
        if total_stops == 0:
            fail("stops.txt has no stops")
        fraction = valid_stops / total_stops
        if fraction < GTFS_MIN_VALID_STOP_FRACTION:
            fail(
                f"only {valid_stops}/{total_stops} stops "
                f"({fraction:.1%}) have plausible UK coordinates "
                f"(lat {UK_LAT_RANGE[0]}-{UK_LAT_RANGE[1]}, "
                f"lon {UK_LON_RANGE[0]}..{UK_LON_RANGE[1]}, non-null, not 0,0); "
                f"need >= {GTFS_MIN_VALID_STOP_FRACTION:.0%}"
            )

    print(
        f"  OK: service active in window, {valid_stops}/{total_stops} stops "
        f"({fraction:.1%}) with plausible UK coordinates"
    )
required_files = {
"agency.txt",
"calendar.txt",
"calendar_dates.txt",
"routes.txt",
"stop_times.txt",
"stops.txt",
"trips.txt",
}
if not dest.exists() or not zipfile.is_zipfile(dest):
raise RuntimeError(f"transxchange2gtfs did not create a valid GTFS zip: {dest}")
with zipfile.ZipFile(dest) as z:
missing = required_files - set(z.namelist())
if missing:
missing_str = ", ".join(sorted(missing))
raise RuntimeError(f"TfL GTFS zip is missing required files: {missing_str}")
size_mb = dest.stat().st_size / (1024 * 1024)
print(f" Saved to {dest} ({size_mb:.1f} MB)")
return dest
def download_national_rail_cif(raw_dir: Path) -> Path | None:
@ -1007,18 +1099,15 @@ def main() -> None:
required=True,
help="Output directory for transit data",
)
parser.add_argument(
"--skip-tfl",
action="store_true",
help="Skip TfL TransXChange download and conversion",
)
args = parser.parse_args()
output_dir: Path = args.output
raw_dir = output_dir / "raw"
raw_dir.mkdir(parents=True, exist_ok=True)
# 1. Download, clean, and frequency-convert BODS GTFS
# 1. Download, clean, and frequency-convert BODS GTFS. BODS covers all
# England bus/tram/ferry plus London Underground, DLR, London Tramlink and
# the IFS Cloud Cable Car, so no separate TfL feed is needed.
download_osm_pbf(raw_dir)
bods_raw = download_bods_gtfs(raw_dir)
@ -1027,16 +1116,10 @@ def main() -> None:
bods_final = output_dir / "bods_gtfs.zip"
convert_high_freq_to_frequency_based(bods_cleaned, bods_final)
validate_gtfs_feed(bods_final, "BODS GTFS")
# 2. TfL TransXChange → GTFS
if args.skip_tfl:
print("Skipping TfL (--skip-tfl)")
else:
download_tfl_transxchange(raw_dir)
convert_tfl_to_gtfs(raw_dir, output_dir)
# 3. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
# reach the ~2,725 railway-station destinations, so a bus/TfL-only network
# 2. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
# reach the ~2,725 railway-station destinations, so a bus/metro-only network
# silently overstates every train commute. Missing credentials are a HARD
# error, so a rail-less network can never ship.
cif = download_national_rail_cif(raw_dir)
@ -1048,7 +1131,8 @@ def main() -> None:
"required; without it the transit network models every train journey "
"as bus-only and overstates commute times."
)
convert_national_rail_to_gtfs(raw_dir, output_dir)
nr_final = convert_national_rail_to_gtfs(raw_dir, output_dir)
validate_gtfs_feed(nr_final, "National Rail GTFS")
# Summary
print()