Lots of improvements
This commit is contained in:
parent
ef921361ec
commit
80a5a2a774
21 changed files with 489 additions and 337 deletions
|
|
@ -8,6 +8,7 @@ Downloads:
|
|||
|
||||
Then processes for R5 compatibility:
|
||||
- Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100)
|
||||
- Converts high-frequency metro/tram services to frequency-based GTFS
|
||||
- Converts TfL TransXChange to GTFS via transxchange2gtfs
|
||||
- Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker)
|
||||
|
||||
|
|
@ -20,12 +21,15 @@ Output directory: property-data/transit/
|
|||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import statistics
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import zipfile
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
from tqdm import tqdm
|
||||
|
|
@ -184,6 +188,229 @@ def clean_gtfs(src: Path, dst: Path) -> None:
|
|||
print(f" Saved to {dst}")
|
||||
|
||||
|
||||
def _parse_gtfs_time(time_str: str) -> int | None:
|
||||
"""Parse HH:MM:SS to seconds since midnight. Returns None on failure."""
|
||||
time_str = time_str.strip('"')
|
||||
if ":" not in time_str:
|
||||
return None
|
||||
try:
|
||||
h, m, s = time_str.split(":")
|
||||
return int(h) * 3600 + int(m) * 60 + int(s)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _secs_to_gtfs_time(s: int) -> str:
|
||||
"""Convert seconds since midnight to HH:MM:SS."""
|
||||
h = s // 3600
|
||||
m = (s % 3600) // 60
|
||||
sec = s % 60
|
||||
return f"{h:02d}:{m:02d}:{sec:02d}"
|
||||
|
||||
|
||||
def convert_high_freq_to_frequency_based(
|
||||
src: Path, dst: Path, *, max_headway_minutes: int = 15
|
||||
) -> None:
|
||||
"""Convert high-frequency scheduled services to frequency-based GTFS entries.
|
||||
|
||||
Identifies metro (route_type=1) and tram (route_type=0) routes with regular
|
||||
headways under max_headway_minutes, then creates frequencies.txt entries and
|
||||
removes redundant trips. R5's RAPTOR produces smoother percentile results for
|
||||
frequency-based services, matching the "just turn up" reality of high-frequency
|
||||
metro/tram services.
|
||||
"""
|
||||
if dst.exists():
|
||||
print(f"Frequency-converted GTFS already exists: {dst}")
|
||||
return
|
||||
|
||||
print("Converting high-frequency services to frequency-based...")
|
||||
max_headway_secs = max_headway_minutes * 60
|
||||
|
||||
with zipfile.ZipFile(src, "r") as zin:
|
||||
# Step 1: Find metro/tram route IDs
|
||||
target_route_ids: set[str] = set()
|
||||
with zin.open("routes.txt") as f:
|
||||
header = f.readline().decode("utf-8").strip()
|
||||
cols = header.split(",")
|
||||
route_id_idx = cols.index("route_id")
|
||||
rt_idx = cols.index("route_type")
|
||||
for line in f:
|
||||
parts = line.decode("utf-8", errors="replace").strip().split(",")
|
||||
if not parts:
|
||||
continue
|
||||
route_type = parts[rt_idx].strip('"')
|
||||
if route_type in ("0", "1"): # tram, metro/subway
|
||||
target_route_ids.add(parts[route_id_idx].strip('"'))
|
||||
|
||||
if not target_route_ids:
|
||||
print(" No metro/tram routes found, copying unchanged")
|
||||
shutil.copy2(src, dst)
|
||||
return
|
||||
|
||||
print(f" Found {len(target_route_ids)} metro/tram routes")
|
||||
|
||||
# Step 2: Map target trips to grouping keys
|
||||
trip_group_key: dict[str, tuple[str, str, str]] = {}
|
||||
with zin.open("trips.txt") as f:
|
||||
header = f.readline().decode("utf-8").strip()
|
||||
cols = header.split(",")
|
||||
trip_id_idx = cols.index("trip_id")
|
||||
route_id_idx = cols.index("route_id")
|
||||
dir_idx = cols.index("direction_id") if "direction_id" in cols else -1
|
||||
service_idx = cols.index("service_id")
|
||||
for line in f:
|
||||
parts = line.decode("utf-8", errors="replace").strip().split(",")
|
||||
if not parts:
|
||||
continue
|
||||
route_id = parts[route_id_idx].strip('"')
|
||||
if route_id in target_route_ids:
|
||||
trip_id = parts[trip_id_idx].strip('"')
|
||||
direction = parts[dir_idx].strip('"') if dir_idx >= 0 else "0"
|
||||
service_id = parts[service_idx].strip('"')
|
||||
trip_group_key[trip_id] = (route_id, direction, service_id)
|
||||
|
||||
print(f" Found {len(trip_group_key)} trips on target routes")
|
||||
|
||||
# Step 3: Get first departure time and first stop for each target trip
|
||||
trip_first_dep: dict[str, int] = {}
|
||||
trip_first_stop: dict[str, str] = {}
|
||||
with zin.open("stop_times.txt") as f:
|
||||
header = f.readline().decode("utf-8").strip()
|
||||
cols = header.split(",")
|
||||
trip_id_idx = cols.index("trip_id")
|
||||
dep_idx = cols.index("departure_time")
|
||||
seq_idx = cols.index("stop_sequence")
|
||||
stop_id_idx = cols.index("stop_id")
|
||||
for line in f:
|
||||
parts = line.decode("utf-8", errors="replace").strip().split(",")
|
||||
if not parts:
|
||||
continue
|
||||
trip_id = parts[trip_id_idx].strip('"')
|
||||
if trip_id not in trip_group_key:
|
||||
continue
|
||||
if parts[seq_idx].strip('"') == "0":
|
||||
dep_secs = _parse_gtfs_time(parts[dep_idx])
|
||||
if dep_secs is not None:
|
||||
trip_first_dep[trip_id] = dep_secs
|
||||
trip_first_stop[trip_id] = parts[stop_id_idx].strip('"')
|
||||
|
||||
# Step 4: Group trips by (route, direction, service, first_stop) and compute headways
|
||||
groups: dict[tuple[str, ...], list[tuple[str, int]]] = defaultdict(list)
|
||||
for trip_id, dep_secs in trip_first_dep.items():
|
||||
route_id, direction, service_id = trip_group_key[trip_id]
|
||||
first_stop = trip_first_stop.get(trip_id, "")
|
||||
key = (route_id, direction, service_id, first_stop)
|
||||
groups[key].append((trip_id, dep_secs))
|
||||
|
||||
trips_to_remove: set[str] = set()
|
||||
frequency_entries: list[tuple[str, int, int, int]] = []
|
||||
groups_converted = 0
|
||||
|
||||
for _key, trips in groups.items():
|
||||
if len(trips) < 4:
|
||||
continue
|
||||
|
||||
trips.sort(key=lambda x: x[1])
|
||||
headways = [trips[i + 1][1] - trips[i][1] for i in range(len(trips) - 1)]
|
||||
headways = [h for h in headways if h > 0]
|
||||
|
||||
if len(headways) < 3:
|
||||
continue
|
||||
|
||||
median_hw = statistics.median(headways)
|
||||
if median_hw > max_headway_secs or median_hw < 30:
|
||||
continue
|
||||
|
||||
mean_hw = statistics.mean(headways)
|
||||
if mean_hw == 0:
|
||||
continue
|
||||
stdev_hw = statistics.stdev(headways) if len(headways) > 1 else 0
|
||||
if stdev_hw / mean_hw > 0.5:
|
||||
continue
|
||||
|
||||
# Convert: keep first trip as template, remove the rest
|
||||
template_trip_id = trips[0][0]
|
||||
start_secs = trips[0][1]
|
||||
end_secs = trips[-1][1] + int(median_hw)
|
||||
headway_rounded = max(60, round(median_hw / 60) * 60)
|
||||
|
||||
frequency_entries.append((template_trip_id, start_secs, end_secs, headway_rounded))
|
||||
for trip_id, _ in trips[1:]:
|
||||
trips_to_remove.add(trip_id)
|
||||
groups_converted += 1
|
||||
|
||||
print(f" Converted {groups_converted} trip groups to frequency-based")
|
||||
print(f" Removing {len(trips_to_remove)} redundant trips")
|
||||
print(f" Created {len(frequency_entries)} frequency entries")
|
||||
|
||||
# Step 5: Write modified GTFS
|
||||
with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
|
||||
dst, "w", zipfile.ZIP_DEFLATED
|
||||
) as zout:
|
||||
for info in zin.infolist():
|
||||
if info.filename == "trips.txt":
|
||||
with zin.open(info) as f:
|
||||
header = f.readline()
|
||||
header_str = header.decode("utf-8").strip()
|
||||
cols = header_str.split(",")
|
||||
trip_id_idx = cols.index("trip_id")
|
||||
|
||||
tmp = tempfile.NamedTemporaryFile(
|
||||
mode="wb", delete=False, suffix=".txt"
|
||||
)
|
||||
tmp.write(header)
|
||||
for line in f:
|
||||
parts = (
|
||||
line.decode("utf-8", errors="replace").strip().split(",")
|
||||
)
|
||||
if not parts:
|
||||
continue
|
||||
if parts[trip_id_idx].strip('"') not in trips_to_remove:
|
||||
tmp.write(line)
|
||||
tmp.close()
|
||||
zout.write(tmp.name, "trips.txt")
|
||||
os.unlink(tmp.name)
|
||||
|
||||
elif info.filename == "stop_times.txt":
|
||||
with zin.open(info) as f:
|
||||
header = f.readline()
|
||||
header_str = header.decode("utf-8").strip()
|
||||
cols = header_str.split(",")
|
||||
trip_id_idx = cols.index("trip_id")
|
||||
|
||||
tmp = tempfile.NamedTemporaryFile(
|
||||
mode="wb", delete=False, suffix=".txt"
|
||||
)
|
||||
tmp.write(header)
|
||||
for line in f:
|
||||
parts = (
|
||||
line.decode("utf-8", errors="replace").strip().split(",")
|
||||
)
|
||||
if not parts:
|
||||
continue
|
||||
if parts[trip_id_idx].strip('"') not in trips_to_remove:
|
||||
tmp.write(line)
|
||||
tmp.close()
|
||||
zout.write(tmp.name, "stop_times.txt")
|
||||
os.unlink(tmp.name)
|
||||
|
||||
elif info.filename == "frequencies.txt":
|
||||
pass # we'll write our own below
|
||||
|
||||
else:
|
||||
zout.writestr(info, zin.read(info))
|
||||
|
||||
# Write frequencies.txt
|
||||
freq_lines = ["trip_id,start_time,end_time,headway_secs,exact_times\n"]
|
||||
for trip_id, start, end, headway in frequency_entries:
|
||||
freq_lines.append(
|
||||
f"{trip_id},{_secs_to_gtfs_time(start)},{_secs_to_gtfs_time(end)},{headway},0\n"
|
||||
)
|
||||
zout.writestr("frequencies.txt", "".join(freq_lines))
|
||||
|
||||
print(f" Saved to {dst}")
|
||||
|
||||
|
||||
def download_tfl_transxchange(raw_dir: Path) -> Path:
|
||||
"""Download TfL TransXChange timetable bundle."""
|
||||
dest = raw_dir / "tfl_transxchange.zip"
|
||||
|
|
@ -655,12 +882,15 @@ def main() -> None:
|
|||
raw_dir = output_dir / "raw"
|
||||
raw_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# 1. Download and clean BODS GTFS
|
||||
# 1. Download, clean, and frequency-convert BODS GTFS
|
||||
download_osm_pbf(raw_dir)
|
||||
bods_raw = download_bods_gtfs(raw_dir)
|
||||
|
||||
bods_clean = output_dir / "bods_gtfs.zip"
|
||||
clean_gtfs(bods_raw, bods_clean)
|
||||
bods_cleaned = raw_dir / "bods_gtfs_cleaned.zip"
|
||||
clean_gtfs(bods_raw, bods_cleaned)
|
||||
|
||||
bods_final = output_dir / "bods_gtfs.zip"
|
||||
convert_high_freq_to_frequency_based(bods_cleaned, bods_final)
|
||||
|
||||
# 2. TfL TransXChange → GTFS
|
||||
if args.skip_tfl:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue