perfect-postcode/pipeline/download/test_transit_network.py

242 lines
8.8 KiB
Python

"""Tests for transit_network GTFS processing."""
import datetime as dt
import zipfile
from pathlib import Path
import pytest
from pipeline.download.transit_network import (
convert_high_freq_to_frequency_based,
validate_gtfs_feed,
)
def _write_gtfs(path: Path, *, stop_times: str) -> None:
    """Create a minimal GTFS archive at ``path`` for the conversion tests.

    The archive contains three members, written in this order:

    * ``routes.txt`` - a single route ``R1`` with ``route_type`` 1 (metro).
    * ``trips.txt`` - six trips ``T1``..``T6`` on ``R1``, all with
      ``direction_id`` 0 and ``service_id`` ``S1``.
    * ``stop_times.txt`` - the caller-supplied ``stop_times`` text, verbatim.
    """
    trip_lines = ["trip_id,route_id,direction_id,service_id\n"]
    for trip_number in range(1, 7):
        trip_lines.append(f"T{trip_number},R1,0,S1\n")

    members = (
        ("routes.txt", "route_id,route_type\nR1,1\n"),
        ("trips.txt", "".join(trip_lines)),
        ("stop_times.txt", stop_times),
    )
    with zipfile.ZipFile(path, "w") as archive:
        for member_name, content in members:
            archive.writestr(member_name, content)
def _one_based_stop_times() -> str:
    """Return stop_times.txt text for six two-stop trips numbered from 1.

    Trips ``T1``..``T6`` leave ``STOP_A`` at 06:00, 06:05, ... (a 300 s
    headway) and reach ``STOP_B`` 120 s later. The stop_sequence values are
    1 and 2 - deliberately never 0.
    """

    def _clock(total_seconds: int) -> str:
        # Render seconds-since-midnight as zero-padded HH:MM:SS.
        hours, leftover = divmod(total_seconds, 3600)
        minutes, seconds = divmod(leftover, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

    lines = ["trip_id,stop_sequence,departure_time,stop_id\n"]
    for index in range(6):
        trip_id = f"T{index + 1}"
        departure = 6 * 3600 + index * 300
        # First stop carries stop_sequence 1 (NOT 0); the second carries 2.
        lines.append(f"{trip_id},1,{_clock(departure)},STOP_A\n")
        lines.append(f"{trip_id},2,{_clock(departure + 120)},STOP_B\n")
    return "".join(lines)
def test_one_based_stop_sequence_is_converted(tmp_path: Path) -> None:
    """First-stop detection must use the minimum stop_sequence, not literal "0".

    With 1-based stop_sequence values, code keyed on ``stop_sequence == "0"``
    found no first stops and produced an empty frequencies.txt. Selecting the
    minimum stop_sequence per trip lets the high-frequency group be converted.
    """
    source_zip = tmp_path / "in.zip"
    output_zip = tmp_path / "out.zip"
    _write_gtfs(source_zip, stop_times=_one_based_stop_times())

    convert_high_freq_to_frequency_based(source_zip, output_zip)

    with zipfile.ZipFile(output_zip, "r") as archive:
        frequencies = archive.read("frequencies.txt").decode("utf-8")

    # Skip the header line and ignore blank lines.
    data_rows = [line for line in frequencies.splitlines()[1:] if line.strip()]

    # The single high-frequency group must yield exactly one frequency entry.
    assert len(data_rows) == 1, frequencies

    # The row is expected to have exactly five comma-separated fields; only
    # the start time and the headway are checked.
    _trip_id, start_time, _end_time, headway_secs, _exact = data_rows[0].split(",")

    # The template trip is the earliest departure (T1 at 06:00) at its first stop.
    assert start_time == "06:00:00"
    # A median headway of 300 s is emitted as a 300 s headway entry.
    assert headway_secs == "300"
def test_raises_when_no_first_stops_found(tmp_path: Path) -> None:
    """An unparseable stop_sequence must fail loudly rather than silently.

    The feed still has target trips, but its only stop_times row has a
    non-numeric stop_sequence, so the conversion is expected to raise a
    ``RuntimeError`` mentioning "no first stops".
    """
    source_zip = tmp_path / "in.zip"
    output_zip = tmp_path / "out.zip"
    header = "trip_id,stop_sequence,departure_time,stop_id\n"
    broken_row = "T1,not_a_number,06:00:00,STOP_A\n"
    _write_gtfs(source_zip, stop_times=header + broken_row)

    with pytest.raises(RuntimeError, match="no first stops"):
        convert_high_freq_to_frequency_based(source_zip, output_zip)
# ── validate_gtfs_feed ────────────────────────────────────────────────────────
# Fixed reference date handed to validate_gtfs_feed via ``today=`` so the
# calendar checks below do not depend on the real clock.
TODAY = dt.date(year=2026, month=6, day=10)
def _make_gtfs(
path: Path,
*,
calendar: str | None = (
"service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
"start_date,end_date\n"
"S1,1,1,1,1,1,0,0,20260101,20271231\n"
),
calendar_dates: str | None = None,
stops: str = (
"stop_id,stop_name,stop_lat,stop_lon\n"
"STOP_A,Bank,51.5133,-0.0886\n"
"STOP_B,Liverpool Street,51.5178,-0.0823\n"
),
routes: str = "route_id,agency_id,route_short_name,route_type\nR1,OP1,Central,1\n",
trips: str = "trip_id,route_id,service_id\nT1,R1,S1\n",
stop_times: str = (
"trip_id,stop_sequence,departure_time,stop_id\n"
"T1,0,06:00:00,STOP_A\n"
"T1,1,06:02:00,STOP_B\n"
),
) -> Path:
"""Write a tiny synthetic GTFS zip; defaults form a valid current feed."""
with zipfile.ZipFile(path, "w") as z:
if calendar is not None:
z.writestr("calendar.txt", calendar)
if calendar_dates is not None:
z.writestr("calendar_dates.txt", calendar_dates)
z.writestr("stops.txt", stops)
z.writestr("routes.txt", routes)
z.writestr("trips.txt", trips)
z.writestr("stop_times.txt", stop_times)
return path
def test_validate_gtfs_feed_happy_path(tmp_path: Path) -> None:
    """The default synthetic feed passes validation without raising."""
    archive_path = _make_gtfs(tmp_path / "feed.zip")
    # No assertion needed: the test passes as long as this call returns.
    validate_gtfs_feed(archive_path, "test feed", today=TODAY)
def test_validate_gtfs_feed_expired_calendar(tmp_path: Path) -> None:
    """The 2010 TfL snapshot failure mode: all calendars ended years ago.

    The error must name the feed label and report that no service is active.
    """
    expired_calendar = (
        "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
        "start_date,end_date\n"
        "S1,1,1,1,1,1,0,0,20091201,20101224\n"
    )
    archive_path = _make_gtfs(tmp_path / "feed.zip", calendar=expired_calendar)

    with pytest.raises(RuntimeError, match=r"'stale tfl'.*no service active"):
        validate_gtfs_feed(archive_path, "stale tfl", today=TODAY)
def test_validate_gtfs_feed_calendar_starting_after_window_fails(
    tmp_path: Path,
) -> None:
    """A calendar that only starts on 2027-01-01 is rejected as inactive.

    NOTE(review): the length of the validator's look-ahead window is not
    visible from this file; the start date is simply months after TODAY.
    """
    future_calendar = (
        "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
        "start_date,end_date\n"
        "S1,1,1,1,1,1,0,0,20270101,20271231\n"
    )
    archive_path = _make_gtfs(tmp_path / "feed.zip", calendar=future_calendar)

    with pytest.raises(RuntimeError, match="no service active"):
        validate_gtfs_feed(archive_path, "future feed", today=TODAY)
def test_validate_gtfs_feed_calendar_dates_rescues_expired_calendar(
    tmp_path: Path,
) -> None:
    """An expired calendar.txt passes when calendar_dates.txt adds service now.

    The calendar ended in 2010, but calendar_dates.txt carries an
    ``exception_type`` 1 entry for 2026-06-15, a few days after TODAY.
    """
    expired_calendar = (
        "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
        "start_date,end_date\n"
        "S1,1,1,1,1,1,0,0,20091201,20101224\n"
    )
    added_service = "service_id,date,exception_type\nS1,20260615,1\n"
    archive_path = _make_gtfs(
        tmp_path / "feed.zip",
        calendar=expired_calendar,
        calendar_dates=added_service,
    )
    # No assertion needed: the test passes as long as this call returns.
    validate_gtfs_feed(archive_path, "rescued feed", today=TODAY)
def test_validate_gtfs_feed_removed_service_exception_does_not_count(
    tmp_path: Path,
) -> None:
    """An ``exception_type`` 2 entry alone does not count as active service.

    The feed has no calendar.txt, and its only calendar_dates.txt row is a
    type-2 exception on 2026-06-15, so validation must report no service.
    """
    removed_only = "service_id,date,exception_type\nS1,20260615,2\n"
    archive_path = _make_gtfs(
        tmp_path / "feed.zip",
        calendar=None,
        calendar_dates=removed_only,
    )

    with pytest.raises(RuntimeError, match="no service active"):
        validate_gtfs_feed(archive_path, "removed-only feed", today=TODAY)
def test_validate_gtfs_feed_zero_and_empty_coords(tmp_path: Path) -> None:
    """The 2010 TfL snapshot's other failure mode: empty or 0,0 stop coords.

    One stop sits at 0,0 and the other has blank latitude and longitude, so
    no stop has plausible UK coordinates.
    """
    unusable_stops = (
        "stop_id,stop_name,stop_lat,stop_lon\n"
        "STOP_A,Nowhere,0,0\n"
        "STOP_B,Blank,,\n"
    )
    archive_path = _make_gtfs(tmp_path / "feed.zip", stops=unusable_stops)

    with pytest.raises(RuntimeError, match=r"plausible UK coordinates"):
        validate_gtfs_feed(archive_path, "coordless feed", today=TODAY)
def test_validate_gtfs_feed_non_uk_coords_fail(tmp_path: Path) -> None:
    """Stops with valid coordinates outside the UK are rejected.

    Both stops (New York and Sydney) have well-formed coordinates, so this
    exercises the UK plausibility check rather than the empty/zero case.
    """
    overseas_stops = (
        "stop_id,stop_name,stop_lat,stop_lon\n"
        "STOP_A,New York,40.71,-74.0\n"
        "STOP_B,Sydney,-33.87,151.21\n"
    )
    archive_path = _make_gtfs(tmp_path / "feed.zip", stops=overseas_stops)

    with pytest.raises(RuntimeError, match="plausible UK coordinates"):
        validate_gtfs_feed(archive_path, "abroad feed", today=TODAY)
def test_validate_gtfs_feed_minority_bad_coords_pass(tmp_path: Path) -> None:
    """One bad stop out of 30 (3.3%) stays under the 5% tolerance."""
    stop_lines = ["stop_id,stop_name,stop_lat,stop_lon\n"]
    # 29 stops at latitude 51.5, with longitude stepping east from -0.1.
    for i in range(29):
        stop_lines.append(f"STOP_{i},Stop {i},51.5,{-0.1 + i * 0.001}\n")
    # A single stop at 0,0 is the only bad row.
    stop_lines.append("STOP_BAD,Broken,0,0\n")

    archive_path = _make_gtfs(tmp_path / "feed.zip", stops="".join(stop_lines))
    # No assertion needed: the test passes as long as this call returns.
    validate_gtfs_feed(archive_path, "mostly good feed", today=TODAY)
def test_validate_gtfs_feed_empty_trips(tmp_path: Path) -> None:
    """A trips.txt holding only its header row is rejected.

    ``match`` is applied as a regular expression, so the dot in the filename
    is escaped to pin the literal text ``trips.txt`` rather than any character.
    """
    feed = _make_gtfs(tmp_path / "feed.zip", trips="trip_id,route_id,service_id\n")
    with pytest.raises(RuntimeError, match=r"trips\.txt has no data rows"):
        validate_gtfs_feed(feed, "tripless feed", today=TODAY)
def test_validate_gtfs_feed_missing_calendar_files(tmp_path: Path) -> None:
    """A feed with neither calendar.txt nor calendar_dates.txt is rejected.

    ``calendar=None`` omits calendar.txt, and calendar_dates.txt is absent by
    default. ``match`` is applied as a regular expression, so the dot in the
    filename is escaped to pin the literal text ``calendar.txt``.
    """
    feed = _make_gtfs(tmp_path / "feed.zip", calendar=None)
    with pytest.raises(
        RuntimeError, match=r"neither calendar\.txt nor calendar_dates"
    ):
        validate_gtfs_feed(feed, "calendarless feed", today=TODAY)
def test_validate_gtfs_feed_not_a_zip(tmp_path: Path) -> None:
    """A file that is not a zip archive is rejected with a clear error."""
    fake_archive = tmp_path / "feed.zip"
    # Plain text behind a .zip name: the extension alone must not be trusted.
    fake_archive.write_text("not a zip")

    with pytest.raises(RuntimeError, match="not a valid zip"):
        validate_gtfs_feed(fake_archive, "bogus feed", today=TODAY)