Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@ -1,11 +1,15 @@
"""Tests for transit_network GTFS processing."""
import datetime as dt
import zipfile
from pathlib import Path
import pytest
from pipeline.download.transit_network import convert_high_freq_to_frequency_based
from pipeline.download.transit_network import (
convert_high_freq_to_frequency_based,
validate_gtfs_feed,
)
def _write_gtfs(path: Path, *, stop_times: str) -> None:
@ -77,3 +81,162 @@ def test_raises_when_no_first_stops_found(tmp_path: Path) -> None:
with pytest.raises(RuntimeError, match="no first stops"):
convert_high_freq_to_frequency_based(src, dst)
# ── validate_gtfs_feed ────────────────────────────────────────────────────────
# Fixed reference date handed to validate_gtfs_feed via ``today=`` in every
# test below, so the service-calendar checks are evaluated against a pinned
# date rather than whatever day the suite happens to run on.
TODAY = dt.date(2026, 6, 10)
def _make_gtfs(
path: Path,
*,
calendar: str | None = (
"service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
"start_date,end_date\n"
"S1,1,1,1,1,1,0,0,20260101,20271231\n"
),
calendar_dates: str | None = None,
stops: str = (
"stop_id,stop_name,stop_lat,stop_lon\n"
"STOP_A,Bank,51.5133,-0.0886\n"
"STOP_B,Liverpool Street,51.5178,-0.0823\n"
),
routes: str = "route_id,agency_id,route_short_name,route_type\nR1,OP1,Central,1\n",
trips: str = "trip_id,route_id,service_id\nT1,R1,S1\n",
stop_times: str = (
"trip_id,stop_sequence,departure_time,stop_id\n"
"T1,0,06:00:00,STOP_A\n"
"T1,1,06:02:00,STOP_B\n"
),
) -> Path:
"""Write a tiny synthetic GTFS zip; defaults form a valid current feed."""
with zipfile.ZipFile(path, "w") as z:
if calendar is not None:
z.writestr("calendar.txt", calendar)
if calendar_dates is not None:
z.writestr("calendar_dates.txt", calendar_dates)
z.writestr("stops.txt", stops)
z.writestr("routes.txt", routes)
z.writestr("trips.txt", trips)
z.writestr("stop_times.txt", stop_times)
return path
def test_validate_gtfs_feed_happy_path(tmp_path: Path) -> None:
    """The all-defaults synthetic feed validates without raising."""
    archive = _make_gtfs(tmp_path / "feed.zip")
    # No assertion needed: the test passes as long as no exception escapes.
    validate_gtfs_feed(archive, "test feed", today=TODAY)  # must not raise
def test_validate_gtfs_feed_expired_calendar(tmp_path: Path) -> None:
    """A feed whose only calendar ended in 2010 is rejected.

    Mirrors the 2010 TfL snapshot failure mode: every calendar ended years
    before TODAY. The error must name the feed label and report that no
    service is active.
    """
    expired_calendar = (
        "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
        "start_date,end_date\n"
        "S1,1,1,1,1,1,0,0,20091201,20101224\n"
    )
    archive = _make_gtfs(tmp_path / "feed.zip", calendar=expired_calendar)

    expected_message = r"'stale tfl'.*no service active"
    with pytest.raises(RuntimeError, match=expected_message):
        validate_gtfs_feed(archive, "stale tfl", today=TODAY)
def test_validate_gtfs_feed_calendar_starting_after_window_fails(
    tmp_path: Path,
) -> None:
    """A calendar that only begins on 2027-01-01 is rejected.

    The service starts more than six months after TODAY (2026-06-10), and
    the validator is expected to report that no service is active.
    """
    future_calendar = (
        "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
        "start_date,end_date\n"
        "S1,1,1,1,1,1,0,0,20270101,20271231\n"
    )
    archive = _make_gtfs(tmp_path / "feed.zip", calendar=future_calendar)

    with pytest.raises(RuntimeError, match="no service active"):
        validate_gtfs_feed(archive, "future feed", today=TODAY)
def test_validate_gtfs_feed_calendar_dates_rescues_expired_calendar(
    tmp_path: Path,
) -> None:
    """An expired calendar.txt passes if calendar_dates.txt adds service now.

    calendar.txt ends in 2010, but calendar_dates.txt carries an
    ``exception_type`` 1 row for 2026-06-15, five days after TODAY.
    """
    expired_calendar = (
        "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
        "start_date,end_date\n"
        "S1,1,1,1,1,1,0,0,20091201,20101224\n"
    )
    added_service = "service_id,date,exception_type\nS1,20260615,1\n"
    archive = _make_gtfs(
        tmp_path / "feed.zip",
        calendar=expired_calendar,
        calendar_dates=added_service,
    )

    validate_gtfs_feed(archive, "rescued feed", today=TODAY)  # must not raise
def test_validate_gtfs_feed_removed_service_exception_does_not_count(
    tmp_path: Path,
) -> None:
    """An ``exception_type`` 2 row alone does not make a feed active.

    The archive has no calendar.txt, and its only calendar_dates.txt row
    (2026-06-15) has ``exception_type`` 2, so the validator is expected to
    report that no service is active.
    """
    removal_only = "service_id,date,exception_type\nS1,20260615,2\n"
    archive = _make_gtfs(
        tmp_path / "feed.zip",
        calendar=None,
        calendar_dates=removal_only,
    )

    with pytest.raises(RuntimeError, match="no service active"):
        validate_gtfs_feed(archive, "removed-only feed", today=TODAY)
def test_validate_gtfs_feed_zero_and_empty_coords(tmp_path: Path) -> None:
    """Stops with 0,0 or blank coordinates make the feed fail validation.

    This is the 2010 TfL snapshot's other failure mode. Both stops in the
    archive are unusable: one sits at 0,0 and one has empty lat/lon fields.
    """
    unusable_stops = (
        "stop_id,stop_name,stop_lat,stop_lon\n"
        "STOP_A,Nowhere,0,0\n"
        "STOP_B,Blank,,\n"
    )
    archive = _make_gtfs(tmp_path / "feed.zip", stops=unusable_stops)

    with pytest.raises(RuntimeError, match=r"plausible UK coordinates"):
        validate_gtfs_feed(archive, "coordless feed", today=TODAY)
def test_validate_gtfs_feed_non_uk_coords_fail(tmp_path: Path) -> None:
    """Well-formed coordinates far outside the UK are rejected.

    Both stops carry real coordinates (New York and Sydney), so the failure
    comes from location, not from missing or zero values.
    """
    overseas_stops = (
        "stop_id,stop_name,stop_lat,stop_lon\n"
        "STOP_A,New York,40.71,-74.0\n"
        "STOP_B,Sydney,-33.87,151.21\n"
    )
    archive = _make_gtfs(tmp_path / "feed.zip", stops=overseas_stops)

    with pytest.raises(RuntimeError, match="plausible UK coordinates"):
        validate_gtfs_feed(archive, "abroad feed", today=TODAY)
def test_validate_gtfs_feed_minority_bad_coords_pass(tmp_path: Path) -> None:
    """One bad stop out of 30 (3.3%) stays under the 5% tolerance."""
    header = "stop_id,stop_name,stop_lat,stop_lon\n"
    # 29 stops on latitude 51.5, stepping east by 0.001 degrees from -0.1.
    good_rows = "".join(
        f"STOP_{n},Stop {n},51.5,{-0.1 + n * 0.001}\n" for n in range(29)
    )
    # The single 0,0 outlier goes last, making 30 stops in total.
    bad_row = "STOP_BAD,Broken,0,0\n"
    archive = _make_gtfs(
        tmp_path / "feed.zip",
        stops=header + good_rows + bad_row,
    )

    validate_gtfs_feed(archive, "mostly good feed", today=TODAY)  # must not raise
def test_validate_gtfs_feed_empty_trips(tmp_path: Path) -> None:
    """A trips.txt holding only its header row makes validation fail."""
    feed = _make_gtfs(tmp_path / "feed.zip", trips="trip_id,route_id,service_id\n")
    # pytest applies ``match`` with re.search, so the dot in the file name is
    # escaped; unescaped it would match any character (e.g. "trips_txt").
    with pytest.raises(RuntimeError, match=r"trips\.txt has no data rows"):
        validate_gtfs_feed(feed, "tripless feed", today=TODAY)
def test_validate_gtfs_feed_missing_calendar_files(tmp_path: Path) -> None:
    """A feed with neither calendar.txt nor calendar_dates.txt is rejected."""
    # calendar=None drops calendar.txt; calendar_dates.txt is absent by default.
    feed = _make_gtfs(tmp_path / "feed.zip", calendar=None)
    # pytest applies ``match`` with re.search, so the dot in the file name is
    # escaped; unescaped it would match any character.
    with pytest.raises(
        RuntimeError, match=r"neither calendar\.txt nor calendar_dates"
    ):
        validate_gtfs_feed(feed, "calendarless feed", today=TODAY)
def test_validate_gtfs_feed_not_a_zip(tmp_path: Path) -> None:
    """A plain-text file named feed.zip is reported as an invalid zip."""
    fake_archive = tmp_path.joinpath("feed.zip")
    fake_archive.write_text("not a zip")

    expected_message = "not a valid zip"
    with pytest.raises(RuntimeError, match=expected_message):
        validate_gtfs_feed(fake_archive, "bogus feed", today=TODAY)