"""Tests for transit_network GTFS processing."""

import datetime as dt
import zipfile
from pathlib import Path

import pytest

from pipeline.download.transit_network import (
    convert_high_freq_to_frequency_based,
    validate_gtfs_feed,
)


def _format_gtfs_time(total_seconds: int) -> str:
    """Render a seconds-since-midnight offset as a zero-padded ``HH:MM:SS`` string."""
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


def _write_gtfs(path: Path, *, stop_times: str) -> None:
    """Write a minimal GTFS zip with one metro route and several trips.

    The archive contains ``routes.txt`` (route R1, route_type 1), ``trips.txt``
    (trips T1..T6 on R1, direction 0, service S1) and the caller-supplied
    ``stop_times.txt`` content.
    """
    routes = "route_id,route_type\nR1,1\n"
    trips = "trip_id,route_id,direction_id,service_id\n" + "".join(
        f"T{i},R1,0,S1\n" for i in range(1, 7)
    )
    with zipfile.ZipFile(path, "w") as z:
        z.writestr("routes.txt", routes)
        z.writestr("trips.txt", trips)
        z.writestr("stop_times.txt", stop_times)


def _one_based_stop_times() -> str:
    """Six trips, 1-based stop_sequence (1,2,...), 5-minute headway."""
    header = "trip_id,stop_sequence,departure_time,stop_id\n"
    rows = []
    # First departures 06:00, 06:05, ... (300s = 5 min headway, well under 15 min)
    for i in range(6):
        trip = f"T{i + 1}"
        first_dep = 6 * 3600 + i * 300
        # First stop has stop_sequence 1 (NOT 0); second stop sequence 2,
        # departing two minutes after the first.
        rows.append(f"{trip},1,{_format_gtfs_time(first_dep)},STOP_A\n")
        rows.append(f"{trip},2,{_format_gtfs_time(first_dep + 120)},STOP_B\n")
    return header + "".join(rows)


def test_one_based_stop_sequence_is_converted(tmp_path: Path) -> None:
    """First stop selection must use the minimum stop_sequence, not literal "0".

    With 1-based stop_sequence the old code (keyed on stop_sequence == "0")
    found zero first stops and produced an empty frequencies.txt. The fix
    selects the minimum stop_sequence per trip, so the high-frequency group
    is converted.
    """
    src = tmp_path / "in.zip"
    dst = tmp_path / "out.zip"
    _write_gtfs(src, stop_times=_one_based_stop_times())

    convert_high_freq_to_frequency_based(src, dst)

    with zipfile.ZipFile(dst, "r") as z:
        freq = z.read("frequencies.txt").decode("utf-8")
    freq_rows = [r for r in freq.splitlines()[1:] if r.strip()]

    # The single high-frequency group must produce exactly one frequency entry.
    assert len(freq_rows) == 1, freq

    # Unpacking into five names also guards the row shape: a different column
    # count raises ValueError here. Only two of the columns are asserted on.
    _trip_id, start_time, _end_time, headway_secs, _exact = freq_rows[0].split(",")
    # Template trip is the earliest departure (T1 at 06:00) starting at first stop.
    assert start_time == "06:00:00"
    # Median headway of 300s rounds to a 300s headway entry.
    assert headway_secs == "300"


def test_raises_when_no_first_stops_found(tmp_path: Path) -> None:
    """A non-empty target trip set with unparseable stop_sequence is loud, not silent."""
    src = tmp_path / "in.zip"
    dst = tmp_path / "out.zip"
    bad = (
        "trip_id,stop_sequence,departure_time,stop_id\n"
        "T1,not_a_number,06:00:00,STOP_A\n"
    )
    _write_gtfs(src, stop_times=bad)

    with pytest.raises(RuntimeError, match="no first stops"):
        convert_high_freq_to_frequency_based(src, dst)


# ── validate_gtfs_feed ────────────────────────────────────────────────────────

# Reference "today" passed to validate_gtfs_feed so results do not depend on
# the wall clock.
TODAY = dt.date(2026, 6, 10)

# Column header shared by every synthetic calendar.txt in this module.
_CALENDAR_HEADER = (
    "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
    "start_date,end_date\n"
)


def _weekday_calendar(start_date: str, end_date: str) -> str:
    """Return calendar.txt content with one Mon-Fri service S1 over the given range.

    Dates are ``YYYYMMDD`` strings and are inserted verbatim.
    """
    return f"{_CALENDAR_HEADER}S1,1,1,1,1,1,0,0,{start_date},{end_date}\n"


def _make_gtfs(
    path: Path,
    *,
    calendar: str | None = _weekday_calendar("20260101", "20271231"),
    calendar_dates: str | None = None,
    stops: str = (
        "stop_id,stop_name,stop_lat,stop_lon\n"
        "STOP_A,Bank,51.5133,-0.0886\n"
        "STOP_B,Liverpool Street,51.5178,-0.0823\n"
    ),
    routes: str = "route_id,agency_id,route_short_name,route_type\nR1,OP1,Central,1\n",
    trips: str = "trip_id,route_id,service_id\nT1,R1,S1\n",
    stop_times: str = (
        "trip_id,stop_sequence,departure_time,stop_id\n"
        "T1,0,06:00:00,STOP_A\n"
        "T1,1,06:02:00,STOP_B\n"
    ),
) -> Path:
    """Write a tiny synthetic GTFS zip; defaults form a valid current feed.

    ``calendar`` and ``calendar_dates`` are omitted from the archive entirely
    when ``None``; the other four files are always written. Returns ``path``
    so the call can be used inline.
    """
    with zipfile.ZipFile(path, "w") as z:
        if calendar is not None:
            z.writestr("calendar.txt", calendar)
        if calendar_dates is not None:
            z.writestr("calendar_dates.txt", calendar_dates)
        z.writestr("stops.txt", stops)
        z.writestr("routes.txt", routes)
        z.writestr("trips.txt", trips)
        z.writestr("stop_times.txt", stop_times)
    return path


def test_validate_gtfs_feed_happy_path(tmp_path: Path) -> None:
    """The default synthetic feed validates without raising."""
    feed = _make_gtfs(tmp_path / "feed.zip")
    validate_gtfs_feed(feed, "test feed", today=TODAY)  # must not raise


def test_validate_gtfs_feed_expired_calendar(tmp_path: Path) -> None:
    """The 2010 TfL snapshot failure mode: all calendars ended years ago."""
    feed = _make_gtfs(
        tmp_path / "feed.zip",
        calendar=_weekday_calendar("20091201", "20101224"),
    )
    # The error must name the feed label as well as the failure reason.
    with pytest.raises(RuntimeError, match=r"'stale tfl'.*no service active"):
        validate_gtfs_feed(feed, "stale tfl", today=TODAY)


def test_validate_gtfs_feed_calendar_starting_after_window_fails(
    tmp_path: Path,
) -> None:
    """A calendar whose service only begins on 2027-01-01 is rejected.

    That start date is more than six months after TODAY. The size of the
    accepted window is defined by validate_gtfs_feed and is not visible here.
    """
    feed = _make_gtfs(
        tmp_path / "feed.zip",
        calendar=_weekday_calendar("20270101", "20271231"),
    )
    with pytest.raises(RuntimeError, match="no service active"):
        validate_gtfs_feed(feed, "future feed", today=TODAY)


def test_validate_gtfs_feed_calendar_dates_rescues_expired_calendar(
    tmp_path: Path,
) -> None:
    """An expired calendar.txt passes if calendar_dates.txt adds service now."""
    feed = _make_gtfs(
        tmp_path / "feed.zip",
        calendar=_weekday_calendar("20091201", "20101224"),
        calendar_dates="service_id,date,exception_type\nS1,20260615,1\n",
    )
    validate_gtfs_feed(feed, "rescued feed", today=TODAY)  # must not raise


def test_validate_gtfs_feed_removed_service_exception_does_not_count(
    tmp_path: Path,
) -> None:
    """A calendar_dates.txt holding only an exception_type 2 row is not active service."""
    feed = _make_gtfs(
        tmp_path / "feed.zip",
        calendar=None,
        calendar_dates="service_id,date,exception_type\nS1,20260615,2\n",
    )
    with pytest.raises(RuntimeError, match="no service active"):
        validate_gtfs_feed(feed, "removed-only feed", today=TODAY)


def test_validate_gtfs_feed_zero_and_empty_coords(tmp_path: Path) -> None:
    """The 2010 TfL snapshot's other failure mode: empty or 0,0 stop coords."""
    feed = _make_gtfs(
        tmp_path / "feed.zip",
        stops=(
            "stop_id,stop_name,stop_lat,stop_lon\n"
            "STOP_A,Nowhere,0,0\n"
            "STOP_B,Blank,,\n"
        ),
    )
    with pytest.raises(RuntimeError, match=r"plausible UK coordinates"):
        validate_gtfs_feed(feed, "coordless feed", today=TODAY)


def test_validate_gtfs_feed_non_uk_coords_fail(tmp_path: Path) -> None:
    """Well-formed coordinates outside the UK (New York, Sydney) are rejected."""
    feed = _make_gtfs(
        tmp_path / "feed.zip",
        stops=(
            "stop_id,stop_name,stop_lat,stop_lon\n"
            "STOP_A,New York,40.71,-74.0\n"
            "STOP_B,Sydney,-33.87,151.21\n"
        ),
    )
    with pytest.raises(RuntimeError, match="plausible UK coordinates"):
        validate_gtfs_feed(feed, "abroad feed", today=TODAY)


def test_validate_gtfs_feed_minority_bad_coords_pass(tmp_path: Path) -> None:
    """One bad stop out of 30 (3.3%) stays under the 5% tolerance."""
    rows = [f"STOP_{i},Stop {i},51.5,{-0.1 + i * 0.001}\n" for i in range(29)]
    rows.append("STOP_BAD,Broken,0,0\n")
    feed = _make_gtfs(
        tmp_path / "feed.zip",
        stops="stop_id,stop_name,stop_lat,stop_lon\n" + "".join(rows),
    )
    validate_gtfs_feed(feed, "mostly good feed", today=TODAY)  # must not raise


def test_validate_gtfs_feed_empty_trips(tmp_path: Path) -> None:
    """A trips.txt containing only its header row is rejected."""
    feed = _make_gtfs(tmp_path / "feed.zip", trips="trip_id,route_id,service_id\n")
    # pytest's match= is a regex search, so the dot is escaped to require a
    # literal "trips.txt" in the message.
    with pytest.raises(RuntimeError, match=r"trips\.txt has no data rows"):
        validate_gtfs_feed(feed, "tripless feed", today=TODAY)


def test_validate_gtfs_feed_missing_calendar_files(tmp_path: Path) -> None:
    """A feed with neither calendar.txt nor calendar_dates.txt is rejected."""
    feed = _make_gtfs(tmp_path / "feed.zip", calendar=None)
    # Escaped dot: match the literal filename, not any character.
    with pytest.raises(
        RuntimeError, match=r"neither calendar\.txt nor calendar_dates"
    ):
        validate_gtfs_feed(feed, "calendarless feed", today=TODAY)


def test_validate_gtfs_feed_not_a_zip(tmp_path: Path) -> None:
    """A plain-text file with a .zip name is reported as an invalid zip."""
    bogus = tmp_path / "feed.zip"
    bogus.write_text("not a zip", encoding="utf-8")
    with pytest.raises(RuntimeError, match="not a valid zip"):
        validate_gtfs_feed(bogus, "bogus feed", today=TODAY)