Lots of improvements

This commit is contained in:
Andras Schmelczer 2026-03-10 22:05:51 +00:00
parent ef921361ec
commit 80a5a2a774
21 changed files with 489 additions and 337 deletions

View file

@ -8,6 +8,7 @@ Downloads:
Then processes for R5 compatibility:
- Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100)
- Converts high-frequency metro/tram services to frequency-based GTFS
- Converts TfL TransXChange to GTFS via transxchange2gtfs
- Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker)
@ -20,12 +21,15 @@ Output directory: property-data/transit/
import argparse
import json
import os
import shutil
import statistics
import subprocess
import tempfile
import time
import urllib.parse
import urllib.request
import zipfile
from collections import defaultdict
from pathlib import Path
from tqdm import tqdm
@ -184,6 +188,229 @@ def clean_gtfs(src: Path, dst: Path) -> None:
print(f" Saved to {dst}")
def _parse_gtfs_time(time_str: str) -> int | None:
"""Parse HH:MM:SS to seconds since midnight. Returns None on failure."""
time_str = time_str.strip('"')
if ":" not in time_str:
return None
try:
h, m, s = time_str.split(":")
return int(h) * 3600 + int(m) * 60 + int(s)
except ValueError:
return None
def _secs_to_gtfs_time(s: int) -> str:
"""Convert seconds since midnight to HH:MM:SS."""
h = s // 3600
m = (s % 3600) // 60
sec = s % 60
return f"{h:02d}:{m:02d}:{sec:02d}"
def convert_high_freq_to_frequency_based(
src: Path, dst: Path, *, max_headway_minutes: int = 15
) -> None:
"""Convert high-frequency scheduled services to frequency-based GTFS entries.
Identifies metro (route_type=1) and tram (route_type=0) routes with regular
headways under max_headway_minutes, then creates frequencies.txt entries and
removes redundant trips. R5's RAPTOR produces smoother percentile results for
frequency-based services, matching the "just turn up" reality of high-frequency
metro/tram services.
"""
if dst.exists():
print(f"Frequency-converted GTFS already exists: {dst}")
return
print("Converting high-frequency services to frequency-based...")
max_headway_secs = max_headway_minutes * 60
with zipfile.ZipFile(src, "r") as zin:
# Step 1: Find metro/tram route IDs
target_route_ids: set[str] = set()
with zin.open("routes.txt") as f:
header = f.readline().decode("utf-8").strip()
cols = header.split(",")
route_id_idx = cols.index("route_id")
rt_idx = cols.index("route_type")
for line in f:
parts = line.decode("utf-8", errors="replace").strip().split(",")
if not parts:
continue
route_type = parts[rt_idx].strip('"')
if route_type in ("0", "1"): # tram, metro/subway
target_route_ids.add(parts[route_id_idx].strip('"'))
if not target_route_ids:
print(" No metro/tram routes found, copying unchanged")
shutil.copy2(src, dst)
return
print(f" Found {len(target_route_ids)} metro/tram routes")
# Step 2: Map target trips to grouping keys
trip_group_key: dict[str, tuple[str, str, str]] = {}
with zin.open("trips.txt") as f:
header = f.readline().decode("utf-8").strip()
cols = header.split(",")
trip_id_idx = cols.index("trip_id")
route_id_idx = cols.index("route_id")
dir_idx = cols.index("direction_id") if "direction_id" in cols else -1
service_idx = cols.index("service_id")
for line in f:
parts = line.decode("utf-8", errors="replace").strip().split(",")
if not parts:
continue
route_id = parts[route_id_idx].strip('"')
if route_id in target_route_ids:
trip_id = parts[trip_id_idx].strip('"')
direction = parts[dir_idx].strip('"') if dir_idx >= 0 else "0"
service_id = parts[service_idx].strip('"')
trip_group_key[trip_id] = (route_id, direction, service_id)
print(f" Found {len(trip_group_key)} trips on target routes")
# Step 3: Get first departure time and first stop for each target trip
trip_first_dep: dict[str, int] = {}
trip_first_stop: dict[str, str] = {}
with zin.open("stop_times.txt") as f:
header = f.readline().decode("utf-8").strip()
cols = header.split(",")
trip_id_idx = cols.index("trip_id")
dep_idx = cols.index("departure_time")
seq_idx = cols.index("stop_sequence")
stop_id_idx = cols.index("stop_id")
for line in f:
parts = line.decode("utf-8", errors="replace").strip().split(",")
if not parts:
continue
trip_id = parts[trip_id_idx].strip('"')
if trip_id not in trip_group_key:
continue
if parts[seq_idx].strip('"') == "0":
dep_secs = _parse_gtfs_time(parts[dep_idx])
if dep_secs is not None:
trip_first_dep[trip_id] = dep_secs
trip_first_stop[trip_id] = parts[stop_id_idx].strip('"')
# Step 4: Group trips by (route, direction, service, first_stop) and compute headways
groups: dict[tuple[str, ...], list[tuple[str, int]]] = defaultdict(list)
for trip_id, dep_secs in trip_first_dep.items():
route_id, direction, service_id = trip_group_key[trip_id]
first_stop = trip_first_stop.get(trip_id, "")
key = (route_id, direction, service_id, first_stop)
groups[key].append((trip_id, dep_secs))
trips_to_remove: set[str] = set()
frequency_entries: list[tuple[str, int, int, int]] = []
groups_converted = 0
for _key, trips in groups.items():
if len(trips) < 4:
continue
trips.sort(key=lambda x: x[1])
headways = [trips[i + 1][1] - trips[i][1] for i in range(len(trips) - 1)]
headways = [h for h in headways if h > 0]
if len(headways) < 3:
continue
median_hw = statistics.median(headways)
if median_hw > max_headway_secs or median_hw < 30:
continue
mean_hw = statistics.mean(headways)
if mean_hw == 0:
continue
stdev_hw = statistics.stdev(headways) if len(headways) > 1 else 0
if stdev_hw / mean_hw > 0.5:
continue
# Convert: keep first trip as template, remove the rest
template_trip_id = trips[0][0]
start_secs = trips[0][1]
end_secs = trips[-1][1] + int(median_hw)
headway_rounded = max(60, round(median_hw / 60) * 60)
frequency_entries.append((template_trip_id, start_secs, end_secs, headway_rounded))
for trip_id, _ in trips[1:]:
trips_to_remove.add(trip_id)
groups_converted += 1
print(f" Converted {groups_converted} trip groups to frequency-based")
print(f" Removing {len(trips_to_remove)} redundant trips")
print(f" Created {len(frequency_entries)} frequency entries")
# Step 5: Write modified GTFS
with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
dst, "w", zipfile.ZIP_DEFLATED
) as zout:
for info in zin.infolist():
if info.filename == "trips.txt":
with zin.open(info) as f:
header = f.readline()
header_str = header.decode("utf-8").strip()
cols = header_str.split(",")
trip_id_idx = cols.index("trip_id")
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
)
tmp.write(header)
for line in f:
parts = (
line.decode("utf-8", errors="replace").strip().split(",")
)
if not parts:
continue
if parts[trip_id_idx].strip('"') not in trips_to_remove:
tmp.write(line)
tmp.close()
zout.write(tmp.name, "trips.txt")
os.unlink(tmp.name)
elif info.filename == "stop_times.txt":
with zin.open(info) as f:
header = f.readline()
header_str = header.decode("utf-8").strip()
cols = header_str.split(",")
trip_id_idx = cols.index("trip_id")
tmp = tempfile.NamedTemporaryFile(
mode="wb", delete=False, suffix=".txt"
)
tmp.write(header)
for line in f:
parts = (
line.decode("utf-8", errors="replace").strip().split(",")
)
if not parts:
continue
if parts[trip_id_idx].strip('"') not in trips_to_remove:
tmp.write(line)
tmp.close()
zout.write(tmp.name, "stop_times.txt")
os.unlink(tmp.name)
elif info.filename == "frequencies.txt":
pass # we'll write our own below
else:
zout.writestr(info, zin.read(info))
# Write frequencies.txt
freq_lines = ["trip_id,start_time,end_time,headway_secs,exact_times\n"]
for trip_id, start, end, headway in frequency_entries:
freq_lines.append(
f"{trip_id},{_secs_to_gtfs_time(start)},{_secs_to_gtfs_time(end)},{headway},0\n"
)
zout.writestr("frequencies.txt", "".join(freq_lines))
print(f" Saved to {dst}")
def download_tfl_transxchange(raw_dir: Path) -> Path:
"""Download TfL TransXChange timetable bundle."""
dest = raw_dir / "tfl_transxchange.zip"
@ -655,12 +882,15 @@ def main() -> None:
raw_dir = output_dir / "raw"
raw_dir.mkdir(parents=True, exist_ok=True)
# 1. Download and clean BODS GTFS
# 1. Download, clean, and frequency-convert BODS GTFS
download_osm_pbf(raw_dir)
bods_raw = download_bods_gtfs(raw_dir)
bods_clean = output_dir / "bods_gtfs.zip"
clean_gtfs(bods_raw, bods_clean)
bods_cleaned = raw_dir / "bods_gtfs_cleaned.zip"
clean_gtfs(bods_raw, bods_cleaned)
bods_final = output_dir / "bods_gtfs.zip"
convert_high_freq_to_frequency_based(bods_cleaned, bods_final)
# 2. TfL TransXChange → GTFS
if args.skip_tfl: