Good stuff
This commit is contained in:
parent
9da2db707f
commit
8032011708
32 changed files with 1052 additions and 374 deletions
|
|
@ -16,16 +16,16 @@ from .pois import UK_BBOX_EAST, UK_BBOX_NORTH, UK_BBOX_SOUTH, UK_BBOX_WEST
|
|||
|
||||
PLACE_TYPES = {
|
||||
"city",
|
||||
"borough",
|
||||
"town",
|
||||
"suburb",
|
||||
"quarter",
|
||||
"neighbourhood",
|
||||
"village",
|
||||
"hamlet",
|
||||
"locality",
|
||||
"island",
|
||||
"isolated_dwelling",
|
||||
# "borough",
|
||||
# "town",
|
||||
# "suburb",
|
||||
# "quarter",
|
||||
# "neighbourhood",
|
||||
# "village",
|
||||
# "hamlet",
|
||||
# "locality",
|
||||
# "island",
|
||||
# "isolated_dwelling",
|
||||
}
|
||||
|
||||
# Suffixes to strip from raw station names before appending the typed suffix.
|
||||
|
|
@ -115,11 +115,15 @@ class PlaceHandler(osmium.SimpleHandler):
|
|||
self._add(name, place_type, lat, lon, population)
|
||||
return
|
||||
|
||||
# railway=station nodes (tube, national rail, DLR, tram, etc.)
|
||||
# Tube stations only (London Underground)
|
||||
if n.tags.get("railway") == "station":
|
||||
display_name = _station_display_name(name, dict(n.tags))
|
||||
self._add(display_name, "station", lat, lon, population)
|
||||
return
|
||||
tags = dict(n.tags)
|
||||
station_tag = tags.get("station", "")
|
||||
network = tags.get("network", "").lower()
|
||||
if station_tag == "subway" or "underground" in network:
|
||||
display_name = _station_display_name(name, tags)
|
||||
self._add(display_name, "station", lat, lon, population)
|
||||
return
|
||||
|
||||
|
||||
def main() -> None:
|
||||
|
|
@ -133,7 +137,7 @@ def main() -> None:
|
|||
args = parser.parse_args()
|
||||
|
||||
pbf_file = args.pbf
|
||||
print(f"Extracting place nodes: {sorted(PLACE_TYPES)} + railway=station")
|
||||
print("Extracting place nodes: cities + tube stations")
|
||||
with tqdm(
|
||||
unit=" elements",
|
||||
unit_scale=True,
|
||||
|
|
|
|||
|
|
@ -3,23 +3,27 @@
|
|||
Downloads:
|
||||
- England OSM PBF from Geofabrik (~1.5GB)
|
||||
- BODS GTFS from Bus Open Data Service (~1.5GB, all England bus/tram/ferry)
|
||||
- TfL TransXChange timetables → converted to GTFS
|
||||
- National Rail CIF timetable → converted to GTFS (requires credentials)
|
||||
|
||||
Then processes for R5 compatibility:
|
||||
- Cleans GTFS (fixes stop_times >72h, feed_info year >2100)
|
||||
- Crops OSM PBF to London bounding box via osmium
|
||||
- Crops GTFS to London bounding box (keeps only London-touching trips)
|
||||
- Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100)
|
||||
- Converts TfL TransXChange to GTFS via transxchange2gtfs
|
||||
- Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker)
|
||||
|
||||
Requires: osmium-tool (apt install osmium-tool)
|
||||
Requires: osmium-tool, Node.js (npx), Docker (for national rail)
|
||||
|
||||
Output directory: property-data/transit/
|
||||
Final files: london.osm.pbf + bods_gtfs.zip (London-only, R5-ready)
|
||||
raw/england.osm.pbf + bods_gtfs.zip + tfl_gtfs.zip + national_rail_gtfs.zip
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
|
@ -33,18 +37,30 @@ ENGLAND_PBF_URL = (
|
|||
# Bus Open Data Service — pre-converted GTFS covering all England bus/tram/ferry
|
||||
BODS_GTFS_URL = "https://data.bus-data.dft.gov.uk/timetable/download/gtfs-file/all/"
|
||||
|
||||
# TfL TransXChange timetables (tube, DLR, tram, buses, river bus, cable car)
|
||||
TFL_TRANSXCHANGE_URL = (
|
||||
"https://tfl.gov.uk/cdn/static/cms/documents/journey-planner-timetables.zip"
|
||||
)
|
||||
|
||||
# NaPTAN stops data — needed by transxchange2gtfs (its built-in URL is broken)
|
||||
NAPTAN_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
|
||||
|
||||
# National Rail Open Data API
|
||||
NR_AUTH_URL = "https://opendata.nationalrail.co.uk/authenticate"
|
||||
NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/timetable"
|
||||
|
||||
USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"
|
||||
|
||||
# London + Home Counties bounding box (~50km buffer around Greater London)
|
||||
LONDON_BBOX = {"min_lat": 51.2, "max_lat": 51.85, "min_lon": -0.65, "max_lon": 0.35}
|
||||
|
||||
|
||||
def _download_http(url: str, dest: Path, *, desc: str) -> None:
|
||||
def _download_http(url: str, dest: Path, *, desc: str, headers: dict | None = None) -> None:
|
||||
"""Stream-download a URL to a file with progress bar."""
|
||||
dest.parent.mkdir(parents=True, exist_ok=True)
|
||||
tmp = dest.with_suffix(dest.suffix + ".tmp")
|
||||
|
||||
req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
|
||||
req_headers = {"User-Agent": USER_AGENT}
|
||||
if headers:
|
||||
req_headers.update(headers)
|
||||
req = urllib.request.Request(url, headers=req_headers)
|
||||
|
||||
with (
|
||||
tqdm(unit="B", unit_scale=True, desc=desc) as bar,
|
||||
|
|
@ -112,8 +128,6 @@ def clean_gtfs(src: Path, dst: Path) -> None:
|
|||
cols.index("departure_time") if "departure_time" in cols else -1
|
||||
)
|
||||
|
||||
import tempfile
|
||||
|
||||
tmp = tempfile.NamedTemporaryFile(
|
||||
mode="wb", delete=False, suffix=".txt"
|
||||
)
|
||||
|
|
@ -170,143 +184,449 @@ def clean_gtfs(src: Path, dst: Path) -> None:
|
|||
print(f" Saved to {dst}")
|
||||
|
||||
|
||||
def crop_osm_to_london(src: Path, dst: Path) -> None:
|
||||
"""Extract London bounding box from England OSM PBF using osmium."""
|
||||
if dst.exists():
|
||||
print(f"London OSM PBF already exists: {dst}")
|
||||
def download_tfl_transxchange(raw_dir: Path) -> Path:
    """Fetch the TfL TransXChange timetable bundle into *raw_dir*.

    The archive is stored as ``tfl_transxchange.zip``. When that file is
    already present nothing is downloaded and the existing path is returned.

    Args:
        raw_dir: Directory that holds the raw downloads.

    Returns:
        Path of the TransXChange archive inside *raw_dir*.
    """
    archive = raw_dir / "tfl_transxchange.zip"
    if not archive.exists():
        print("Downloading TfL TransXChange timetables...")
        _download_http(TFL_TRANSXCHANGE_URL, archive, desc="tfl_transxchange.zip")
    else:
        # Reuse the earlier download instead of fetching the bundle again.
        print(f"TfL TransXChange already exists: {archive}")
    return archive
|
||||
|
||||
|
||||
def download_naptan() -> None:
|
||||
"""Download NaPTAN stops to /tmp/Stops.csv (needed by transxchange2gtfs)."""
|
||||
dest = Path("/tmp/Stops.csv")
|
||||
if dest.exists():
|
||||
print(f"NaPTAN Stops.csv already exists: {dest}")
|
||||
return
|
||||
|
||||
bbox = LONDON_BBOX
|
||||
bbox_str = f"{bbox['min_lon']},{bbox['min_lat']},{bbox['max_lon']},{bbox['max_lat']}"
|
||||
print("Downloading NaPTAN stops data...")
|
||||
_download_http(NAPTAN_URL, dest, desc="Stops.csv")
|
||||
|
||||
print(f"Cropping OSM PBF to London bbox ({bbox_str})...")
|
||||
|
||||
def convert_tfl_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
|
||||
"""Convert TfL TransXChange to GTFS using transxchange2gtfs."""
|
||||
dest = output_dir / "tfl_gtfs.zip"
|
||||
if dest.exists():
|
||||
print(f"TfL GTFS already exists: {dest}")
|
||||
return dest
|
||||
|
||||
txc_path = raw_dir / "tfl_transxchange.zip"
|
||||
|
||||
# Ensure NaPTAN is available (transxchange2gtfs has a broken download URL)
|
||||
download_naptan()
|
||||
|
||||
print("Converting TfL TransXChange → GTFS...")
|
||||
subprocess.run(
|
||||
["osmium", "extract", f"--bbox={bbox_str}", str(src), "-o", str(dst), "--overwrite"],
|
||||
["npx", "--yes", "transxchange2gtfs", str(txc_path), str(dest)],
|
||||
check=True,
|
||||
)
|
||||
size_mb = dst.stat().st_size / (1024 * 1024)
|
||||
print(f" Saved to {dst} ({size_mb:.0f} MB)")
|
||||
size_mb = dest.stat().st_size / (1024 * 1024)
|
||||
print(f" Saved to {dest} ({size_mb:.1f} MB)")
|
||||
return dest
|
||||
|
||||
|
||||
def crop_gtfs_to_london(src: Path, dst: Path) -> None:
|
||||
"""Crop GTFS to trips touching the London bounding box."""
|
||||
def download_national_rail_cif(raw_dir: Path) -> Path | None:
|
||||
"""Download National Rail CIF timetable (requires credentials)."""
|
||||
dest = raw_dir / "national_rail_cif.zip"
|
||||
if dest.exists():
|
||||
print(f"National Rail CIF already exists: {dest}")
|
||||
return dest
|
||||
|
||||
email = os.environ.get("NATIONAL_RAIL_EMAIL")
|
||||
password = os.environ.get("NATIONAL_RAIL_PASSWORD")
|
||||
if not email or not password:
|
||||
print("Warning: NATIONAL_RAIL_EMAIL/NATIONAL_RAIL_PASSWORD not set, skipping national rail")
|
||||
return None
|
||||
|
||||
print("Authenticating with National Rail Open Data...")
|
||||
auth_data = urllib.parse.urlencode({"username": email, "password": password}).encode()
|
||||
auth_req = urllib.request.Request(
|
||||
NR_AUTH_URL,
|
||||
data=auth_data,
|
||||
headers={"User-Agent": USER_AGENT, "Content-Type": "application/x-www-form-urlencoded"},
|
||||
)
|
||||
with urllib.request.urlopen(auth_req) as resp:
|
||||
token_data = json.loads(resp.read())
|
||||
token = token_data["token"]
|
||||
print(" Authenticated successfully")
|
||||
|
||||
print("Downloading National Rail CIF timetable...")
|
||||
_download_http(
|
||||
NR_TIMETABLE_URL,
|
||||
dest,
|
||||
desc="national_rail_cif.zip",
|
||||
headers={"X-Auth-Token": token},
|
||||
)
|
||||
return dest
|
||||
|
||||
|
||||
def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
|
||||
"""Fix R5-incompatible entries in dtd2mysql-generated National Rail GTFS.
|
||||
|
||||
Fixes:
|
||||
- Interior pass-through stops (pickup_type=1, drop_off_type=1) → normal stops.
|
||||
R5 builds TripPatterns from the full stop sequence but may build shorter
|
||||
TripSchedules when stops are non-boarding, causing ArrayIndexOutOfBoundsException.
|
||||
- Removes stop_times referencing stops not in stops.txt.
|
||||
- Removes trips with backwards travel times.
|
||||
- Converts route_type=714 (rail replacement bus) to 3 (bus) for R5 compatibility.
|
||||
- Removes non-standard links.txt file.
|
||||
- Renumbers stop_sequence to 0-based (R5/BODS convention).
|
||||
- Fixes bogus coordinates (lat < 0) on Irish CIE stations.
|
||||
"""
|
||||
if dst.exists():
|
||||
print(f"London GTFS already exists: {dst}")
|
||||
print(f"Cleaned National Rail GTFS already exists: {dst}")
|
||||
return
|
||||
|
||||
bbox = LONDON_BBOX
|
||||
print("Cleaning National Rail GTFS for R5 compatibility...")
|
||||
|
||||
print("Cropping GTFS to London area...")
|
||||
# First pass: collect valid stop IDs and find bad trips
|
||||
stop_ids: set[str] = set()
|
||||
bad_trip_ids: set[str] = set()
|
||||
|
||||
with zipfile.ZipFile(src, "r") as zin:
|
||||
# Step 1: Find stops in bbox
|
||||
print(" Finding stops in bbox...")
|
||||
# Load valid stop IDs
|
||||
with zin.open("stops.txt") as f:
|
||||
reader = csv.DictReader(io.TextIOWrapper(f))
|
||||
stops_in_bbox = set()
|
||||
all_stops = list(reader)
|
||||
for row in all_stops:
|
||||
lat = float(row["stop_lat"])
|
||||
lon = float(row["stop_lon"])
|
||||
if bbox["min_lat"] <= lat <= bbox["max_lat"] and bbox["min_lon"] <= lon <= bbox["max_lon"]:
|
||||
stops_in_bbox.add(row["stop_id"])
|
||||
print(f" {len(stops_in_bbox):,} / {len(all_stops):,} stops in bbox")
|
||||
header = f.readline().decode("utf-8").strip()
|
||||
stop_id_idx = header.split(",").index("stop_id")
|
||||
lat_idx = header.split(",").index("stop_lat")
|
||||
for line in f:
|
||||
parts = line.decode("utf-8", errors="replace").strip().split(",")
|
||||
if parts:
|
||||
stop_ids.add(parts[stop_id_idx])
|
||||
|
||||
# Step 2: Find trips touching these stops
|
||||
print(" Finding trips touching London stops...")
|
||||
# Find trips with backwards travel times
|
||||
with zin.open("stop_times.txt") as f:
|
||||
reader = csv.DictReader(io.TextIOWrapper(f))
|
||||
st_fieldnames = reader.fieldnames
|
||||
trips_in_bbox = set()
|
||||
for row in reader:
|
||||
if row["stop_id"] in stops_in_bbox:
|
||||
trips_in_bbox.add(row["trip_id"])
|
||||
print(f" {len(trips_in_bbox):,} trips touch London")
|
||||
st_header = f.readline().decode("utf-8").strip()
|
||||
st_cols = st_header.split(",")
|
||||
trip_id_idx = st_cols.index("trip_id")
|
||||
dep_idx = st_cols.index("departure_time")
|
||||
|
||||
# Step 3: Collect all stop_times for those trips
|
||||
print(" Collecting stop_times for London trips...")
|
||||
stop_times_kept = []
|
||||
with zin.open("stop_times.txt") as f:
|
||||
reader = csv.DictReader(io.TextIOWrapper(f))
|
||||
for row in reader:
|
||||
if row["trip_id"] in trips_in_bbox:
|
||||
stop_times_kept.append(row)
|
||||
stops_needed = {row["stop_id"] for row in stop_times_kept}
|
||||
print(f" {len(stop_times_kept):,} stop_times kept")
|
||||
prev_trip = ""
|
||||
prev_dep_secs = -1
|
||||
for line in f:
|
||||
parts = line.decode("utf-8", errors="replace").strip().split(",")
|
||||
if not parts:
|
||||
continue
|
||||
trip_id = parts[trip_id_idx].strip('"')
|
||||
if trip_id != prev_trip:
|
||||
prev_trip = trip_id
|
||||
prev_dep_secs = -1
|
||||
|
||||
# Step 4: Read trips and find needed routes/services/shapes
|
||||
print(" Reading trips...")
|
||||
with zin.open("trips.txt") as f:
|
||||
reader = csv.DictReader(io.TextIOWrapper(f))
|
||||
trips_fieldnames = reader.fieldnames
|
||||
all_trips = list(reader)
|
||||
trips_kept = [t for t in all_trips if t["trip_id"] in trips_in_bbox]
|
||||
routes_needed = {t["route_id"] for t in trips_kept}
|
||||
services_needed = {t["service_id"] for t in trips_kept}
|
||||
shapes_needed = {t.get("shape_id", "") for t in trips_kept} - {""}
|
||||
dep_str = parts[dep_idx].strip('"')
|
||||
if ":" in dep_str:
|
||||
try:
|
||||
h, m, s = dep_str.split(":")
|
||||
dep_secs = int(h) * 3600 + int(m) * 60 + int(s)
|
||||
if dep_secs < prev_dep_secs:
|
||||
bad_trip_ids.add(trip_id)
|
||||
prev_dep_secs = dep_secs
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Step 5: Write cropped GTFS
|
||||
print(" Writing cropped GTFS...")
|
||||
with zipfile.ZipFile(dst, "w", zipfile.ZIP_DEFLATED) as zout:
|
||||
# stops
|
||||
stops_kept = [s for s in all_stops if s["stop_id"] in stops_needed]
|
||||
_write_csv(zout, "stops.txt", list(all_stops[0].keys()), stops_kept)
|
||||
print(f" Found {len(bad_trip_ids)} trips with backwards travel times")
|
||||
|
||||
# stop_times
|
||||
_write_csv(zout, "stop_times.txt", st_fieldnames, stop_times_kept)
|
||||
# Second pass: write cleaned zip
|
||||
passthrough_fixed = 0
|
||||
orphan_stops_removed = 0
|
||||
bad_trips_removed = 0
|
||||
seqs_renumbered = 0
|
||||
coords_fixed = 0
|
||||
route_types_fixed = 0
|
||||
|
||||
# trips
|
||||
_write_csv(zout, "trips.txt", trips_fieldnames, trips_kept)
|
||||
with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
|
||||
dst, "w", zipfile.ZIP_DEFLATED
|
||||
) as zout:
|
||||
for info in zin.infolist():
|
||||
# Skip non-standard links.txt
|
||||
if info.filename == "links.txt":
|
||||
continue
|
||||
|
||||
# routes
|
||||
with zin.open("routes.txt") as f:
|
||||
reader = csv.DictReader(io.TextIOWrapper(f))
|
||||
routes_fn = reader.fieldnames
|
||||
routes_kept = [r for r in reader if r["route_id"] in routes_needed]
|
||||
_write_csv(zout, "routes.txt", routes_fn, routes_kept)
|
||||
if info.filename == "stop_times.txt":
|
||||
with zin.open(info) as f:
|
||||
header = f.readline()
|
||||
header_str = header.decode("utf-8").strip()
|
||||
cols = header_str.split(",")
|
||||
trip_id_idx = cols.index("trip_id")
|
||||
stop_id_idx = cols.index("stop_id")
|
||||
seq_idx = cols.index("stop_sequence")
|
||||
pickup_idx = cols.index("pickup_type") if "pickup_type" in cols else -1
|
||||
dropoff_idx = cols.index("drop_off_type") if "drop_off_type" in cols else -1
|
||||
|
||||
# agency (copy all)
|
||||
zout.writestr("agency.txt", zin.read("agency.txt"))
|
||||
tmp = tempfile.NamedTemporaryFile(
|
||||
mode="wb", delete=False, suffix=".txt"
|
||||
)
|
||||
tmp.write(header)
|
||||
|
||||
# calendar
|
||||
with zin.open("calendar.txt") as f:
|
||||
reader = csv.DictReader(io.TextIOWrapper(f))
|
||||
cal_fn = reader.fieldnames
|
||||
cal_kept = [r for r in reader if r["service_id"] in services_needed]
|
||||
_write_csv(zout, "calendar.txt", cal_fn, cal_kept)
|
||||
prev_trip = ""
|
||||
seq_counter = 0
|
||||
for line in f:
|
||||
line_str = line.decode("utf-8", errors="replace").strip()
|
||||
if not line_str:
|
||||
continue
|
||||
parts = line_str.split(",")
|
||||
trip_id = parts[trip_id_idx].strip('"')
|
||||
stop_id = parts[stop_id_idx].strip('"')
|
||||
|
||||
# calendar_dates
|
||||
with zin.open("calendar_dates.txt") as f:
|
||||
reader = csv.DictReader(io.TextIOWrapper(f))
|
||||
cd_fn = reader.fieldnames
|
||||
cd_kept = [r for r in reader if r["service_id"] in services_needed]
|
||||
_write_csv(zout, "calendar_dates.txt", cd_fn, cd_kept)
|
||||
# Skip trips with backwards times
|
||||
if trip_id in bad_trip_ids:
|
||||
bad_trips_removed += 1
|
||||
continue
|
||||
|
||||
# shapes (stream — can be very large)
|
||||
print(" Streaming shapes.txt...")
|
||||
with zin.open("shapes.txt") as f:
|
||||
reader = csv.DictReader(io.TextIOWrapper(f))
|
||||
shapes_fn = reader.fieldnames
|
||||
shapes_rows = [r for r in reader if r["shape_id"] in shapes_needed]
|
||||
_write_csv(zout, "shapes.txt", shapes_fn, shapes_rows)
|
||||
# Skip stop_times referencing missing stops
|
||||
if stop_id not in stop_ids:
|
||||
orphan_stops_removed += 1
|
||||
continue
|
||||
|
||||
# feed_info + frequencies (copy)
|
||||
zout.writestr("feed_info.txt", zin.read("feed_info.txt"))
|
||||
zout.writestr("frequencies.txt", zin.read("frequencies.txt"))
|
||||
# Fix pass-through stops: set pickup/dropoff to 0 (normal)
|
||||
if pickup_idx >= 0 and dropoff_idx >= 0:
|
||||
pickup = parts[pickup_idx].strip('"')
|
||||
dropoff = parts[dropoff_idx].strip('"')
|
||||
if pickup == "1" and dropoff == "1":
|
||||
parts[pickup_idx] = "0"
|
||||
parts[dropoff_idx] = "0"
|
||||
passthrough_fixed += 1
|
||||
|
||||
size_mb = dst.stat().st_size / (1024 * 1024)
|
||||
print(f" Saved to {dst} ({size_mb:.0f} MB)")
|
||||
# Renumber stop_sequence to 0-based
|
||||
if trip_id != prev_trip:
|
||||
prev_trip = trip_id
|
||||
seq_counter = 0
|
||||
else:
|
||||
seq_counter += 1
|
||||
old_seq = parts[seq_idx].strip('"')
|
||||
parts[seq_idx] = str(seq_counter)
|
||||
if old_seq != str(seq_counter):
|
||||
seqs_renumbered += 1
|
||||
|
||||
tmp.write((",".join(parts) + "\n").encode("utf-8"))
|
||||
|
||||
tmp.close()
|
||||
zout.write(tmp.name, "stop_times.txt")
|
||||
os.unlink(tmp.name)
|
||||
|
||||
elif info.filename == "stops.txt":
|
||||
with zin.open(info) as f:
|
||||
header = f.readline()
|
||||
header_str = header.decode("utf-8").strip()
|
||||
cols = header_str.split(",")
|
||||
lat_idx = cols.index("stop_lat")
|
||||
lon_idx = cols.index("stop_lon")
|
||||
|
||||
tmp = tempfile.NamedTemporaryFile(
|
||||
mode="wb", delete=False, suffix=".txt"
|
||||
)
|
||||
tmp.write(header)
|
||||
|
||||
for line in f:
|
||||
line_str = line.decode("utf-8", errors="replace").strip()
|
||||
if not line_str:
|
||||
continue
|
||||
parts = line_str.split(",")
|
||||
try:
|
||||
lat = float(parts[lat_idx])
|
||||
# Fix bogus Irish CIE coordinates (South Atlantic)
|
||||
if lat < 0:
|
||||
# Set to a neutral UK coordinate that won't be routed to
|
||||
parts[lat_idx] = "54.0"
|
||||
parts[lon_idx] = "-2.0"
|
||||
coords_fixed += 1
|
||||
except ValueError:
|
||||
pass
|
||||
tmp.write((",".join(parts) + "\n").encode("utf-8"))
|
||||
|
||||
tmp.close()
|
||||
zout.write(tmp.name, "stops.txt")
|
||||
os.unlink(tmp.name)
|
||||
|
||||
elif info.filename == "routes.txt":
|
||||
with zin.open(info) as f:
|
||||
header = f.readline()
|
||||
header_str = header.decode("utf-8").strip()
|
||||
cols = header_str.split(",")
|
||||
rt_idx = cols.index("route_type")
|
||||
|
||||
tmp = tempfile.NamedTemporaryFile(
|
||||
mode="wb", delete=False, suffix=".txt"
|
||||
)
|
||||
tmp.write(header)
|
||||
|
||||
for line in f:
|
||||
line_str = line.decode("utf-8", errors="replace").strip()
|
||||
if not line_str:
|
||||
continue
|
||||
parts = line_str.split(",")
|
||||
if parts[rt_idx].strip('"') == "714":
|
||||
parts[rt_idx] = "3"
|
||||
route_types_fixed += 1
|
||||
tmp.write((",".join(parts) + "\n").encode("utf-8"))
|
||||
|
||||
tmp.close()
|
||||
zout.write(tmp.name, "routes.txt")
|
||||
os.unlink(tmp.name)
|
||||
|
||||
elif info.filename == "trips.txt":
|
||||
# Remove trips that have backwards travel times
|
||||
with zin.open(info) as f:
|
||||
header = f.readline()
|
||||
header_str = header.decode("utf-8").strip()
|
||||
cols = header_str.split(",")
|
||||
trip_id_idx = cols.index("trip_id")
|
||||
|
||||
tmp = tempfile.NamedTemporaryFile(
|
||||
mode="wb", delete=False, suffix=".txt"
|
||||
)
|
||||
tmp.write(header)
|
||||
|
||||
for line in f:
|
||||
line_str = line.decode("utf-8", errors="replace").strip()
|
||||
if not line_str:
|
||||
continue
|
||||
parts = line_str.split(",")
|
||||
if parts[trip_id_idx].strip('"') not in bad_trip_ids:
|
||||
tmp.write(line)
|
||||
|
||||
tmp.close()
|
||||
zout.write(tmp.name, "trips.txt")
|
||||
os.unlink(tmp.name)
|
||||
|
||||
elif info.filename == "calendar.txt":
|
||||
# Cap end_date year to 2099
|
||||
with zin.open(info) as f:
|
||||
header = f.readline()
|
||||
header_str = header.decode("utf-8").strip()
|
||||
cols = header_str.split(",")
|
||||
end_idx = cols.index("end_date")
|
||||
|
||||
tmp = tempfile.NamedTemporaryFile(
|
||||
mode="wb", delete=False, suffix=".txt"
|
||||
)
|
||||
tmp.write(header)
|
||||
|
||||
for line in f:
|
||||
line_str = line.decode("utf-8", errors="replace").strip()
|
||||
if not line_str:
|
||||
continue
|
||||
parts = line_str.split(",")
|
||||
date_val = parts[end_idx].strip('"')
|
||||
if len(date_val) == 8:
|
||||
try:
|
||||
year = int(date_val[:4])
|
||||
if year > 2099:
|
||||
parts[end_idx] = "20991231"
|
||||
except ValueError:
|
||||
pass
|
||||
tmp.write((",".join(parts) + "\n").encode("utf-8"))
|
||||
|
||||
tmp.close()
|
||||
zout.write(tmp.name, "calendar.txt")
|
||||
os.unlink(tmp.name)
|
||||
|
||||
else:
|
||||
zout.writestr(info, zin.read(info))
|
||||
|
||||
print(f" Pass-through stops fixed: {passthrough_fixed}")
|
||||
print(f" Orphan stop references removed: {orphan_stops_removed}")
|
||||
print(f" Bad trip stop_times removed: {bad_trips_removed}")
|
||||
print(f" Stop sequences renumbered: {seqs_renumbered}")
|
||||
print(f" Bogus coordinates fixed: {coords_fixed}")
|
||||
print(f" Route types 714→3 fixed: {route_types_fixed}")
|
||||
print(f" Saved to {dst}")
|
||||
|
||||
|
||||
def _write_csv(
|
||||
zout: zipfile.ZipFile, name: str, fieldnames: list[str], rows: list[dict]
|
||||
def _docker_run_dtd2mysql(
|
||||
network: str, db_container: str, volumes: list[str], args: list[str]
|
||||
) -> None:
|
||||
buf = io.StringIO()
|
||||
w = csv.DictWriter(buf, fieldnames=fieldnames)
|
||||
w.writeheader()
|
||||
w.writerows(rows)
|
||||
zout.writestr(name, buf.getvalue())
|
||||
print(f" {name}: {len(rows):,} rows")
|
||||
"""Run dtd2mysql in a Node.js container on the same Docker network as MariaDB."""
|
||||
cmd = [
|
||||
"docker", "run", "--rm", "--network", network,
|
||||
"-e", f"DATABASE_HOSTNAME={db_container}",
|
||||
"-e", "DATABASE_USERNAME=root",
|
||||
"-e", "DATABASE_PASSWORD=root",
|
||||
"-e", "DATABASE_NAME=dtd",
|
||||
]
|
||||
for v in volumes:
|
||||
cmd.extend(["-v", v])
|
||||
# Install zip (needed for --gtfs-zip) then run dtd2mysql
|
||||
inner = "apt-get update -qq && apt-get install -y -qq zip > /dev/null 2>&1 && npx --yes dtd2mysql " + " ".join(args)
|
||||
cmd.extend(["node:20", "bash", "-c", inner])
|
||||
subprocess.run(cmd, check=True)
|
||||
|
||||
|
||||
def convert_national_rail_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
    """Convert National Rail CIF to GTFS using dtd2mysql + MariaDB Docker.

    Runs both MariaDB and dtd2mysql as Docker containers on a shared network,
    since Docker port forwarding is not available in all environments.
    Then cleans the output for R5 compatibility.

    The intermediate ``national_rail_gtfs_raw.zip`` in *raw_dir* is reused
    when present, in which case no Docker resources are created.

    Args:
        raw_dir: Directory containing ``national_rail_cif.zip``; also receives
            the raw GTFS export.
        output_dir: Directory that receives ``national_rail_gtfs.zip``.

    Returns:
        Path of the cleaned GTFS archive in *output_dir*.

    Raises:
        subprocess.CalledProcessError: If starting MariaDB or a dtd2mysql run
            exits with a non-zero status.
        RuntimeError: If MariaDB does not accept connections after 30 polls.
    """
    dest = output_dir / "national_rail_gtfs.zip"
    if dest.exists():
        print(f"National Rail GTFS already exists: {dest}")
        return dest

    raw_dest = raw_dir / "national_rail_gtfs_raw.zip"

    if not raw_dest.exists():
        db_container = "propertymap-mariadb-temp"
        network = "propertymap-dtd-net"

        # Startup happens inside the guarded region so that a failed
        # `docker run` (e.g. a name clash with a container left behind by an
        # aborted run) still removes the network and the stale container
        # instead of leaving them to block every later attempt.
        try:
            print("Creating Docker network and starting MariaDB...")
            # Best effort: the network may already exist, so the exit status
            # is deliberately not checked.
            subprocess.run(["docker", "network", "create", network], capture_output=True)
            subprocess.run(
                [
                    "docker", "run", "-d",
                    "--name", db_container,
                    "--network", network,
                    "-e", "MARIADB_ROOT_PASSWORD=root",
                    "-e", "MARIADB_DATABASE=dtd",
                    "mariadb:latest",
                ],
                check=True,
            )

            # Wait for MariaDB to be ready
            print("  Waiting for MariaDB to be ready...")
            for _ in range(30):
                result = subprocess.run(
                    ["docker", "exec", db_container, "mariadb", "-uroot", "-proot", "-e", "SELECT 1"],
                    capture_output=True,
                )
                if result.returncode == 0:
                    break
                time.sleep(2)
            else:
                # Loop finished without a successful probe.
                raise RuntimeError("MariaDB did not become ready in time")

            raw_abs = str(raw_dir.resolve())

            print("Importing CIF timetable into MariaDB...")
            _docker_run_dtd2mysql(
                network, db_container,
                volumes=[f"{raw_abs}:/data:ro"],
                args=["--timetable", "/data/national_rail_cif.zip"],
            )

            print("Exporting GTFS from MariaDB...")
            _docker_run_dtd2mysql(
                network, db_container,
                volumes=[f"{raw_abs}:/output"],
                args=["--gtfs-zip", "/output/national_rail_gtfs_raw.zip"],
            )

        finally:
            # Cleanup is best effort: each command may fail harmlessly when
            # the resource was never created.
            print("Cleaning up Docker resources...")
            subprocess.run(["docker", "stop", db_container], capture_output=True)
            subprocess.run(["docker", "rm", db_container], capture_output=True)
            subprocess.run(["docker", "network", "rm", network], capture_output=True)

    # Clean the raw GTFS for R5 compatibility
    clean_national_rail_gtfs(raw_dest, dest)
    return dest
|
||||
|
||||
|
||||
def main() -> None:
|
||||
|
|
@ -319,26 +639,43 @@ def main() -> None:
|
|||
required=True,
|
||||
help="Output directory for transit data",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-tfl",
|
||||
action="store_true",
|
||||
help="Skip TfL TransXChange download and conversion",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-national-rail",
|
||||
action="store_true",
|
||||
help="Skip National Rail CIF download and conversion",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
output_dir: Path = args.output
|
||||
raw_dir = output_dir / "raw"
|
||||
raw_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Download raw data
|
||||
england_pbf = download_osm_pbf(raw_dir)
|
||||
# 1. Download and clean BODS GTFS
|
||||
download_osm_pbf(raw_dir)
|
||||
bods_raw = download_bods_gtfs(raw_dir)
|
||||
|
||||
# Clean GTFS (fix R5 incompatibilities)
|
||||
bods_clean = raw_dir / "bods_gtfs_clean.zip"
|
||||
bods_clean = output_dir / "bods_gtfs.zip"
|
||||
clean_gtfs(bods_raw, bods_clean)
|
||||
|
||||
# Crop to London area for R5 (full England requires >30GB RAM)
|
||||
london_pbf = output_dir / "london.osm.pbf"
|
||||
crop_osm_to_london(england_pbf, london_pbf)
|
||||
# 2. TfL TransXChange → GTFS
|
||||
if args.skip_tfl:
|
||||
print("Skipping TfL (--skip-tfl)")
|
||||
else:
|
||||
download_tfl_transxchange(raw_dir)
|
||||
convert_tfl_to_gtfs(raw_dir, output_dir)
|
||||
|
||||
london_gtfs = output_dir / "bods_gtfs.zip"
|
||||
crop_gtfs_to_london(bods_clean, london_gtfs)
|
||||
# 3. National Rail CIF → GTFS
|
||||
if args.skip_national_rail:
|
||||
print("Skipping National Rail (--skip-national-rail)")
|
||||
else:
|
||||
cif = download_national_rail_cif(raw_dir)
|
||||
if cif is not None:
|
||||
convert_national_rail_to_gtfs(raw_dir, output_dir)
|
||||
|
||||
# Summary
|
||||
print()
|
||||
|
|
@ -349,6 +686,11 @@ def main() -> None:
|
|||
size_mb = f.stat().st_size / (1024 * 1024)
|
||||
print(f" {f.name}: {size_mb:.1f} MB")
|
||||
|
||||
print()
|
||||
print("IMPORTANT: If you previously built a network from London-only data,")
|
||||
print("delete the stale cache before running R5:")
|
||||
print(" rm -f property-data/r5-network/network.dat")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
|
|
@ -8,37 +8,10 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
|
|||
MIN_FLOOR_AREA_M2 = 10
|
||||
|
||||
|
||||
def _join_journey_times(
    wide: pl.LazyFrame,
    journey_times_path: Path,
    destination_name: str,
) -> pl.LazyFrame:
    """Left-join one destination's journey times onto *wide* by postcode.

    The parquet file's ``public_transport_quick_minutes`` and
    ``cycling_minutes`` columns are renamed to
    ``"Public transport to <destination> (mins)"`` and
    ``"Cycling to <destination> (mins)"``. Rows are sorted by the public
    transport column (nulls last) and the first row of each postcode group
    is kept before joining.

    Args:
        wide: Frame to extend; joined on its ``postcode`` column.
        journey_times_path: Parquet file with the journey times.
        destination_name: Name inserted into the output column names.

    Returns:
        *wide* with the two renamed journey-time columns joined on.
    """
    transit_label = f"Public transport to {destination_name} (mins)"
    cycling_label = f"Cycling to {destination_name} (mins)"

    renamed = pl.scan_parquet(journey_times_path).select(
        "postcode",
        pl.col("public_transport_quick_minutes").alias(transit_label),
        pl.col("cycling_minutes").alias(cycling_label),
    )
    # Collapse to a single row per postcode before joining.
    per_postcode = (
        renamed.sort(transit_label, nulls_last=True).group_by("postcode").first()
    )
    return wide.join(per_postcode, on="postcode", how="left")
|
||||
|
||||
|
||||
_AREA_COLUMNS = [
|
||||
"Postcode",
|
||||
"lat",
|
||||
"lon",
|
||||
# Transport
|
||||
"Public transport to Bank (mins)",
|
||||
"Cycling to Bank (mins)",
|
||||
"Public transport to Fitzrovia (mins)",
|
||||
"Cycling to Fitzrovia (mins)",
|
||||
# Deprivation
|
||||
"Income Score (rate)",
|
||||
"Employment Score (rate)",
|
||||
|
|
@ -97,8 +70,6 @@ def _build(
|
|||
arcgis_path: Path,
|
||||
iod_path: Path,
|
||||
poi_proximity_path: Path,
|
||||
journey_times_bank_path: Path,
|
||||
journey_times_fitzrovia_path: Path,
|
||||
ethnicity_path: Path,
|
||||
crime_path: Path,
|
||||
noise_path: Path,
|
||||
|
|
@ -138,9 +109,6 @@ def _build(
|
|||
)
|
||||
wide = wide.join(arcgis, on="postcode", how="left")
|
||||
|
||||
wide = _join_journey_times(wide, journey_times_bank_path, "Bank")
|
||||
wide = _join_journey_times(wide, journey_times_fitzrovia_path, "Fitzrovia")
|
||||
|
||||
iod = pl.scan_parquet(iod_path)
|
||||
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
|
||||
|
||||
|
|
@ -382,18 +350,6 @@ def main():
|
|||
type=Path,
|
||||
help="POI proximity counts parquet file (optional)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--journey-times-bank",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Journey times to Bank parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--journey-times-fitzrovia",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Journey times to Fitzrovia parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ethnicity",
|
||||
type=Path,
|
||||
|
|
@ -446,8 +402,6 @@ def main():
|
|||
arcgis_path=args.arcgis,
|
||||
iod_path=args.iod,
|
||||
poi_proximity_path=args.poi_proximity,
|
||||
journey_times_bank_path=args.journey_times_bank,
|
||||
journey_times_fitzrovia_path=args.journey_times_fitzrovia,
|
||||
ethnicity_path=args.ethnicity,
|
||||
crime_path=args.crime,
|
||||
noise_path=args.noise,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue