Fix data pipelines once and for all
This commit is contained in:
parent
08560476c5
commit
4012e4e047
46 changed files with 4508 additions and 855 deletions
|
|
@ -2,24 +2,32 @@
|
|||
|
||||
Downloads:
|
||||
- England OSM PBF from Geofabrik (~1.5GB)
|
||||
- BODS GTFS from Bus Open Data Service (~1.5GB, all England bus/tram/ferry)
|
||||
- TfL TransXChange timetables → converted to GTFS
|
||||
- National Rail CIF timetable → converted to GTFS (requires credentials)
|
||||
- BODS GTFS from Bus Open Data Service (~1.5GB; all England bus/tram/ferry,
|
||||
plus London Underground, DLR, London Tramlink and the IFS Cloud Cable Car)
|
||||
- National Rail CIF timetable → converted to GTFS (requires credentials;
|
||||
includes the Elizabeth line, TOC "XR")
|
||||
|
||||
Then processes for R5 compatibility:
|
||||
- Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100)
|
||||
- Converts high-frequency metro/tram services to frequency-based GTFS
|
||||
- Converts TfL TransXChange to GTFS via transxchange2gtfs
|
||||
- Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker)
|
||||
- Validates every produced GTFS zip (active calendar window, plausible UK
|
||||
stop coordinates, non-empty routes/trips/stop_times)
|
||||
|
||||
Requires: osmium-tool, Node.js (npx), Docker (for national rail)
|
||||
Note: the legacy TfL TransXChange feed (tfl.gov.uk journey-planner-timetables)
|
||||
was removed: that URL serves a 2010-10-28 snapshot whose calendars all expired
|
||||
in 2010 and whose stops have empty/0,0 coordinates, so it contributed zero
|
||||
service. BODS covers every TfL mode that the old feed nominally provided.
|
||||
|
||||
Requires: osmium-tool, Docker (for national rail)
|
||||
|
||||
Output directory: property-data/transit/
|
||||
raw/england.osm.pbf + bods_gtfs.zip + tfl_gtfs.zip + national_rail_gtfs.zip
|
||||
raw/england.osm.pbf + bods_gtfs.zip + national_rail_gtfs.zip
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import datetime as dt
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
|
|
@ -45,20 +53,18 @@ ENGLAND_PBF_URL = (
|
|||
# Bus Open Data Service — pre-converted GTFS covering all England bus/tram/ferry,
# plus London Underground, DLR, London Tramlink and the IFS Cloud Cable Car
|
||||
BODS_GTFS_URL = "https://data.bus-data.dft.gov.uk/timetable/download/gtfs-file/all/"
|
||||
|
||||
# TfL TransXChange timetables (tube, DLR, tram, buses, river bus, cable car)
|
||||
TFL_TRANSXCHANGE_URL = (
|
||||
"https://tfl.gov.uk/cdn/static/cms/documents/journey-planner-timetables.zip"
|
||||
)
|
||||
|
||||
# NaPTAN stops data — needed by transxchange2gtfs (its built-in URL is broken)
|
||||
NAPTAN_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
|
||||
|
||||
# National Rail Open Data API
|
||||
NR_AUTH_URL = "https://opendata.nationalrail.co.uk/authenticate"
|
||||
NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/timetable"
|
||||
|
||||
USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"
|
||||
TRANSXCHANGE2GTFS_PACKAGE = "transxchange2gtfs@1.12.0"
|
||||
|
||||
# GTFS validation: a feed must have service within this many days of the build
|
||||
# date, and at least this fraction of stops must have plausible UK coordinates.
|
||||
GTFS_CALENDAR_LOOKAHEAD_DAYS = 60
|
||||
GTFS_MIN_VALID_STOP_FRACTION = 0.95
|
||||
UK_LAT_RANGE = (49.0, 61.0)
|
||||
UK_LON_RANGE = (-9.0, 2.5)
|
||||
|
||||
|
||||
def _download_http(
|
||||
|
|
@ -468,89 +474,175 @@ def convert_high_freq_to_frequency_based(
|
|||
print(f" Saved to {dst}")
|
||||
|
||||
|
||||
def download_tfl_transxchange(raw_dir: Path) -> Path:
    """Fetch the TfL TransXChange timetable archive into ``raw_dir``.

    The download is skipped when ``tfl_transxchange.zip`` is already present
    in ``raw_dir``, so repeated runs reuse the existing copy. The archive path
    is returned in both cases.
    """
    archive = raw_dir / "tfl_transxchange.zip"

    if not archive.exists():
        print("Downloading TfL TransXChange timetables...")
        _download_http(TFL_TRANSXCHANGE_URL, archive, desc="tfl_transxchange.zip")
        return archive

    # Cached copy from an earlier run: report it and skip the network fetch.
    print(f"TfL TransXChange already exists: {archive}")
    return archive
|
||||
def _gtfs_has_data_row(z: zipfile.ZipFile, filename: str) -> bool:
    """Report whether ``filename`` inside ``z`` holds any data beyond its header.

    The first line is consumed as the header. Each following line is passed
    through ``_parse_csv_line``; the first one that yields a truthy result
    makes the answer True, and scanning stops there.
    """
    with z.open(filename) as member:
        member.readline()  # skip the header row
        return any(_parse_csv_line(raw_line) for raw_line in member)
|
||||
|
||||
|
||||
def download_naptan() -> None:
|
||||
"""Download NaPTAN stops to the local temp dir for transxchange2gtfs."""
|
||||
dest = local_tmp_dir() / "Stops.csv"
|
||||
if dest.exists():
|
||||
print(f"NaPTAN Stops.csv already exists: {dest}")
|
||||
return
|
||||
def _calendar_active_in_window(
|
||||
z: zipfile.ZipFile, names: set[str], window_start: int, window_end: int
|
||||
) -> bool:
|
||||
"""True if calendar.txt/calendar_dates.txt have service in [start, end].
|
||||
|
||||
print("Downloading NaPTAN stops data...")
|
||||
_download_http(NAPTAN_URL, dest, desc="Stops.csv")
|
||||
|
||||
|
||||
def convert_tfl_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
|
||||
"""Convert TfL TransXChange to GTFS using transxchange2gtfs."""
|
||||
dest = output_dir / "tfl_gtfs.zip"
|
||||
if dest.exists():
|
||||
print(f"TfL GTFS already exists: {dest}")
|
||||
return dest
|
||||
|
||||
txc_path = raw_dir / "tfl_transxchange.zip"
|
||||
|
||||
# Ensure NaPTAN is available (transxchange2gtfs has a broken download URL)
|
||||
download_naptan()
|
||||
|
||||
print("Converting TfL TransXChange → GTFS...")
|
||||
# The shim patches known packaging/runtime issues in the pinned npm package
|
||||
# before loading its CLI from npx's temporary install.
|
||||
shim_path = Path(__file__).with_name("transxchange2gtfs_shim.js")
|
||||
subprocess.run(
|
||||
[
|
||||
"npx",
|
||||
"--yes",
|
||||
"--package",
|
||||
TRANSXCHANGE2GTFS_PACKAGE,
|
||||
"sh",
|
||||
"-c",
|
||||
"\n".join(
|
||||
[
|
||||
'bin="$(command -v transxchange2gtfs)"',
|
||||
'script="$(readlink -f "$bin")"',
|
||||
'pkg_dir="$(dirname "$(dirname "$script")")"',
|
||||
'shim="$1"',
|
||||
"shift",
|
||||
'exec node "$shim" "$pkg_dir" "$@"',
|
||||
]
|
||||
),
|
||||
"transxchange2gtfs",
|
||||
str(shim_path.resolve()),
|
||||
str(txc_path.resolve()),
|
||||
str(dest.resolve()),
|
||||
],
|
||||
check=True,
|
||||
Dates are compared as YYYYMMDD integers. A calendar.txt row counts when its
|
||||
date range overlaps the window AND at least one weekday flag is set; a
|
||||
calendar_dates.txt row counts when it adds service (exception_type=1) on a
|
||||
date inside the window.
|
||||
"""
|
||||
weekdays = (
|
||||
"monday",
|
||||
"tuesday",
|
||||
"wednesday",
|
||||
"thursday",
|
||||
"friday",
|
||||
"saturday",
|
||||
"sunday",
|
||||
)
|
||||
if "calendar.txt" in names:
|
||||
with z.open("calendar.txt") as f:
|
||||
cols = _parse_csv_line(f.readline())
|
||||
try:
|
||||
start_idx = cols.index("start_date")
|
||||
end_idx = cols.index("end_date")
|
||||
except ValueError:
|
||||
return False
|
||||
day_idxs = [cols.index(d) for d in weekdays if d in cols]
|
||||
for line in f:
|
||||
parts = _parse_csv_line(line)
|
||||
if not parts:
|
||||
continue
|
||||
try:
|
||||
start = int(parts[start_idx].strip('"'))
|
||||
end = int(parts[end_idx].strip('"'))
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
if start > window_end or end < window_start:
|
||||
continue
|
||||
if day_idxs and not any(
|
||||
parts[i].strip('"') == "1" for i in day_idxs if i < len(parts)
|
||||
):
|
||||
continue
|
||||
return True
|
||||
|
||||
if "calendar_dates.txt" in names:
|
||||
with z.open("calendar_dates.txt") as f:
|
||||
cols = _parse_csv_line(f.readline())
|
||||
try:
|
||||
date_idx = cols.index("date")
|
||||
exc_idx = cols.index("exception_type")
|
||||
except ValueError:
|
||||
return False
|
||||
for line in f:
|
||||
parts = _parse_csv_line(line)
|
||||
if not parts:
|
||||
continue
|
||||
try:
|
||||
date = int(parts[date_idx].strip('"'))
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
if exc_idx < len(parts) and parts[exc_idx].strip('"') != "1":
|
||||
continue
|
||||
if window_start <= date <= window_end:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def validate_gtfs_feed(path: Path, feed_name: str, *, today: dt.date | None = None) -> None:
    """Sanity-check a produced/downloaded GTFS zip; raise RuntimeError if dead.

    Guards against silently shipping a feed that contributes zero service (as
    the old TfL dump did: 2010 calendars, empty/0,0 stop coordinates). Checks:
      (a) calendar.txt/calendar_dates.txt have at least one service active
          within [today, today + GTFS_CALENDAR_LOOKAHEAD_DAYS];
      (b) stops.txt is non-empty and >= GTFS_MIN_VALID_STOP_FRACTION of stops
          have plausible UK coordinates (lat 49-61, lon -9..2.5, not 0,0);
      (c) routes.txt, trips.txt and stop_times.txt each have data rows.

    Args:
        path: GTFS zip archive to inspect.
        feed_name: Label used in progress output and error messages.
        today: First day of the service window; defaults to the current
            local date.

    Raises:
        RuntimeError: If the archive is missing or unreadable as a zip, or if
            any of the checks above fails.
    """
    # Local import: only needed to annotate the always-raising helper below.
    from typing import NoReturn

    if today is None:
        today = dt.date.today()
    # Dates are handled as YYYYMMDD integers, matching the calendar helper.
    window_start = int(today.strftime("%Y%m%d"))
    window_end = int(
        (today + dt.timedelta(days=GTFS_CALENDAR_LOOKAHEAD_DAYS)).strftime("%Y%m%d")
    )

    def fail(reason: str) -> NoReturn:
        """Abort validation with a uniformly formatted RuntimeError."""
        raise RuntimeError(
            f"GTFS validation failed for feed '{feed_name}' ({path}): {reason}"
        )

    print(f"Validating GTFS feed '{feed_name}'...")
    if not path.exists() or not zipfile.is_zipfile(path):
        fail("not a valid zip file")

    # is_zipfile() only looks for the end-of-central-directory record, so a
    # corrupt directory can still make the constructor raise BadZipFile.
    # Report that through fail() so callers only ever see RuntimeError.
    try:
        archive = zipfile.ZipFile(path)
    except zipfile.BadZipFile as err:
        fail(f"not a valid zip file ({err})")

    with archive as z:
        names = set(z.namelist())

        # (c) core files present and non-empty
        for required in ("routes.txt", "trips.txt", "stop_times.txt", "stops.txt"):
            if required not in names:
                fail(f"missing {required}")
            if not _gtfs_has_data_row(z, required):
                fail(f"{required} has no data rows")

        # (a) at least one service active in the routing window
        if "calendar.txt" not in names and "calendar_dates.txt" not in names:
            fail("has neither calendar.txt nor calendar_dates.txt")
        if not _calendar_active_in_window(z, names, window_start, window_end):
            fail(
                f"no service active between {window_start} and {window_end} — "
                "the feed's calendars are stale/expired and it would contribute "
                "zero service to routing"
            )

        # (b) stops have plausible UK coordinates
        total_stops = 0
        valid_stops = 0
        with z.open("stops.txt") as f:
            cols = _parse_csv_line(f.readline())
            try:
                lat_idx = cols.index("stop_lat")
                lon_idx = cols.index("stop_lon")
            except ValueError:
                fail("stops.txt is missing stop_lat/stop_lon columns")
            for line in f:
                parts = _parse_csv_line(line)
                if not parts:
                    continue
                # Every non-empty row counts towards the denominator, even
                # when its coordinates turn out to be unusable.
                total_stops += 1
                try:
                    lat = float(parts[lat_idx].strip('"'))
                    lon = float(parts[lon_idx].strip('"'))
                except (ValueError, IndexError):
                    continue  # empty/garbage coordinate → invalid
                if lat == 0.0 and lon == 0.0:
                    continue
                if (
                    UK_LAT_RANGE[0] <= lat <= UK_LAT_RANGE[1]
                    and UK_LON_RANGE[0] <= lon <= UK_LON_RANGE[1]
                ):
                    valid_stops += 1
        # Defensive guard for the division below.
        if total_stops == 0:
            fail("stops.txt has no stops")
        fraction = valid_stops / total_stops
        if fraction < GTFS_MIN_VALID_STOP_FRACTION:
            fail(
                f"only {valid_stops}/{total_stops} stops "
                f"({fraction:.1%}) have plausible UK coordinates "
                f"(lat {UK_LAT_RANGE[0]}-{UK_LAT_RANGE[1]}, "
                f"lon {UK_LON_RANGE[0]}..{UK_LON_RANGE[1]}, non-null, not 0,0); "
                f"need >= {GTFS_MIN_VALID_STOP_FRACTION:.0%}"
            )

    print(
        f" OK: service active in window, {valid_stops}/{total_stops} stops "
        f"({fraction:.1%}) with plausible UK coordinates"
    )
|
||||
required_files = {
|
||||
"agency.txt",
|
||||
"calendar.txt",
|
||||
"calendar_dates.txt",
|
||||
"routes.txt",
|
||||
"stop_times.txt",
|
||||
"stops.txt",
|
||||
"trips.txt",
|
||||
}
|
||||
if not dest.exists() or not zipfile.is_zipfile(dest):
|
||||
raise RuntimeError(f"transxchange2gtfs did not create a valid GTFS zip: {dest}")
|
||||
with zipfile.ZipFile(dest) as z:
|
||||
missing = required_files - set(z.namelist())
|
||||
if missing:
|
||||
missing_str = ", ".join(sorted(missing))
|
||||
raise RuntimeError(f"TfL GTFS zip is missing required files: {missing_str}")
|
||||
size_mb = dest.stat().st_size / (1024 * 1024)
|
||||
print(f" Saved to {dest} ({size_mb:.1f} MB)")
|
||||
return dest
|
||||
|
||||
|
||||
def download_national_rail_cif(raw_dir: Path) -> Path | None:
|
||||
|
|
@ -1007,18 +1099,15 @@ def main() -> None:
|
|||
required=True,
|
||||
help="Output directory for transit data",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-tfl",
|
||||
action="store_true",
|
||||
help="Skip TfL TransXChange download and conversion",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
output_dir: Path = args.output
|
||||
raw_dir = output_dir / "raw"
|
||||
raw_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# 1. Download, clean, and frequency-convert BODS GTFS
|
||||
# 1. Download, clean, and frequency-convert BODS GTFS. BODS covers all
|
||||
# England bus/tram/ferry plus London Underground, DLR, London Tramlink and
|
||||
# the IFS Cloud Cable Car, so no separate TfL feed is needed.
|
||||
download_osm_pbf(raw_dir)
|
||||
bods_raw = download_bods_gtfs(raw_dir)
|
||||
|
||||
|
|
@ -1027,16 +1116,10 @@ def main() -> None:
|
|||
|
||||
bods_final = output_dir / "bods_gtfs.zip"
|
||||
convert_high_freq_to_frequency_based(bods_cleaned, bods_final)
|
||||
validate_gtfs_feed(bods_final, "BODS GTFS")
|
||||
|
||||
# 2. TfL TransXChange → GTFS
|
||||
if args.skip_tfl:
|
||||
print("Skipping TfL (--skip-tfl)")
|
||||
else:
|
||||
download_tfl_transxchange(raw_dir)
|
||||
convert_tfl_to_gtfs(raw_dir, output_dir)
|
||||
|
||||
# 3. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
|
||||
# reach the ~2,725 railway-station destinations, so a bus/TfL-only network
|
||||
# 2. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
|
||||
# reach the ~2,725 railway-station destinations, so a bus/metro-only network
|
||||
# silently overstates every train commute. Missing credentials are a HARD
|
||||
# error, so a rail-less network can never ship.
|
||||
cif = download_national_rail_cif(raw_dir)
|
||||
|
|
@ -1048,7 +1131,8 @@ def main() -> None:
|
|||
"required; without it the transit network models every train journey "
|
||||
"as bus-only and overstates commute times."
|
||||
)
|
||||
convert_national_rail_to_gtfs(raw_dir, output_dir)
|
||||
nr_final = convert_national_rail_to_gtfs(raw_dir, output_dir)
|
||||
validate_gtfs_feed(nr_final, "National Rail GTFS")
|
||||
|
||||
# Summary
|
||||
print()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue