Lots of improvements
This commit is contained in:
parent
ef921361ec
commit
80a5a2a774
21 changed files with 489 additions and 337 deletions
|
|
@ -115,15 +115,17 @@ class PlaceHandler(osmium.SimpleHandler):
|
|||
self._add(name, place_type, lat, lon, population)
|
||||
return
|
||||
|
||||
# Tube stations only (London Underground)
|
||||
# Railway stations (tube, national rail, DLR, overground, Elizabeth line)
|
||||
if n.tags.get("railway") == "station":
|
||||
tags = dict(n.tags)
|
||||
station_tag = tags.get("station", "")
|
||||
network = tags.get("network", "").lower()
|
||||
if station_tag == "subway" or "underground" in network:
|
||||
display_name = _station_display_name(name, tags)
|
||||
self._add(display_name, "station", lat, lon, population)
|
||||
# Skip tram stops
|
||||
if station_tag == "light_rail" or "tramlink" in network or "tram" in network:
|
||||
return
|
||||
display_name = _station_display_name(name, tags)
|
||||
self._add(display_name, "station", lat, lon, population)
|
||||
return
|
||||
|
||||
|
||||
def main() -> None:
|
||||
|
|
@ -137,7 +139,7 @@ def main() -> None:
|
|||
args = parser.parse_args()
|
||||
|
||||
pbf_file = args.pbf
|
||||
print("Extracting place nodes: cities + tube stations")
|
||||
print("Extracting place nodes: cities + railway stations")
|
||||
with tqdm(
|
||||
unit=" elements",
|
||||
unit_scale=True,
|
||||
|
|
|
|||
74
pipeline/download/rightmove_outcodes.py
Normal file
74
pipeline/download/rightmove_outcodes.py
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
"""Fetch Rightmove outcode→ID mapping for all outcodes in postcode.parquet."""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import polars as pl
|
||||
|
||||
|
||||
TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
|
||||
|
||||
|
||||
def fetch_outcode_ids(postcodes_path: Path, output: Path) -> None:
|
||||
df = pl.read_parquet(postcodes_path, columns=["Postcode"])
|
||||
outcodes = sorted(
|
||||
set(df["Postcode"].str.split(" ").list.first().to_list()) - {""}
|
||||
)
|
||||
print(f"Querying Rightmove typeahead for {len(outcodes)} outcodes...")
|
||||
|
||||
mapping: dict[str, str] = {}
|
||||
missed: list[str] = []
|
||||
client = httpx.Client(timeout=10)
|
||||
|
||||
for i, oc in enumerate(outcodes):
|
||||
try:
|
||||
resp = client.get(TYPEAHEAD_URL, params={"query": oc, "limit": "5"})
|
||||
data = resp.json()
|
||||
found = False
|
||||
for m in data.get("matches", []):
|
||||
if (
|
||||
m["type"] == "OUTCODE"
|
||||
and m["displayName"].upper().replace(" ", "")
|
||||
== oc.upper().replace(" ", "")
|
||||
):
|
||||
mapping[oc] = str(m["id"])
|
||||
found = True
|
||||
break
|
||||
if not found:
|
||||
missed.append(oc)
|
||||
except Exception as e:
|
||||
missed.append(oc)
|
||||
print(f" Error for {oc}: {e}")
|
||||
|
||||
if (i + 1) % 200 == 0:
|
||||
print(f" {i + 1}/{len(outcodes)} done ({len(mapping)} found)")
|
||||
|
||||
client.close()
|
||||
|
||||
output.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(output, "w") as f:
|
||||
json.dump(mapping, f, sort_keys=True)
|
||||
|
||||
print(f"Wrote {output} ({len(mapping)} outcodes, {len(missed)} missed)")
|
||||
if missed:
|
||||
print(f"Missed: {missed}")
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Fetch Rightmove outcode ID mapping"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--postcodes", type=Path, required=True, help="postcode.parquet path"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output JSON file path"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
fetch_outcode_ids(args.postcodes, args.output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -8,6 +8,7 @@ Downloads:
|
|||
|
||||
Then processes for R5 compatibility:
|
||||
- Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100)
|
||||
- Converts high-frequency metro/tram services to frequency-based GTFS
|
||||
- Converts TfL TransXChange to GTFS via transxchange2gtfs
|
||||
- Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker)
|
||||
|
||||
|
|
@ -20,12 +21,15 @@ Output directory: property-data/transit/
|
|||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import statistics
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import zipfile
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
from tqdm import tqdm
|
||||
|
|
@ -184,6 +188,229 @@ def clean_gtfs(src: Path, dst: Path) -> None:
|
|||
print(f" Saved to {dst}")
|
||||
|
||||
|
||||
def _parse_gtfs_time(time_str: str) -> int | None:
|
||||
"""Parse HH:MM:SS to seconds since midnight. Returns None on failure."""
|
||||
time_str = time_str.strip('"')
|
||||
if ":" not in time_str:
|
||||
return None
|
||||
try:
|
||||
h, m, s = time_str.split(":")
|
||||
return int(h) * 3600 + int(m) * 60 + int(s)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _secs_to_gtfs_time(s: int) -> str:
|
||||
"""Convert seconds since midnight to HH:MM:SS."""
|
||||
h = s // 3600
|
||||
m = (s % 3600) // 60
|
||||
sec = s % 60
|
||||
return f"{h:02d}:{m:02d}:{sec:02d}"
|
||||
|
||||
|
||||
def convert_high_freq_to_frequency_based(
|
||||
src: Path, dst: Path, *, max_headway_minutes: int = 15
|
||||
) -> None:
|
||||
"""Convert high-frequency scheduled services to frequency-based GTFS entries.
|
||||
|
||||
Identifies metro (route_type=1) and tram (route_type=0) routes with regular
|
||||
headways under max_headway_minutes, then creates frequencies.txt entries and
|
||||
removes redundant trips. R5's RAPTOR produces smoother percentile results for
|
||||
frequency-based services, matching the "just turn up" reality of high-frequency
|
||||
metro/tram services.
|
||||
"""
|
||||
if dst.exists():
|
||||
print(f"Frequency-converted GTFS already exists: {dst}")
|
||||
return
|
||||
|
||||
print("Converting high-frequency services to frequency-based...")
|
||||
max_headway_secs = max_headway_minutes * 60
|
||||
|
||||
with zipfile.ZipFile(src, "r") as zin:
|
||||
# Step 1: Find metro/tram route IDs
|
||||
target_route_ids: set[str] = set()
|
||||
with zin.open("routes.txt") as f:
|
||||
header = f.readline().decode("utf-8").strip()
|
||||
cols = header.split(",")
|
||||
route_id_idx = cols.index("route_id")
|
||||
rt_idx = cols.index("route_type")
|
||||
for line in f:
|
||||
parts = line.decode("utf-8", errors="replace").strip().split(",")
|
||||
if not parts:
|
||||
continue
|
||||
route_type = parts[rt_idx].strip('"')
|
||||
if route_type in ("0", "1"): # tram, metro/subway
|
||||
target_route_ids.add(parts[route_id_idx].strip('"'))
|
||||
|
||||
if not target_route_ids:
|
||||
print(" No metro/tram routes found, copying unchanged")
|
||||
shutil.copy2(src, dst)
|
||||
return
|
||||
|
||||
print(f" Found {len(target_route_ids)} metro/tram routes")
|
||||
|
||||
# Step 2: Map target trips to grouping keys
|
||||
trip_group_key: dict[str, tuple[str, str, str]] = {}
|
||||
with zin.open("trips.txt") as f:
|
||||
header = f.readline().decode("utf-8").strip()
|
||||
cols = header.split(",")
|
||||
trip_id_idx = cols.index("trip_id")
|
||||
route_id_idx = cols.index("route_id")
|
||||
dir_idx = cols.index("direction_id") if "direction_id" in cols else -1
|
||||
service_idx = cols.index("service_id")
|
||||
for line in f:
|
||||
parts = line.decode("utf-8", errors="replace").strip().split(",")
|
||||
if not parts:
|
||||
continue
|
||||
route_id = parts[route_id_idx].strip('"')
|
||||
if route_id in target_route_ids:
|
||||
trip_id = parts[trip_id_idx].strip('"')
|
||||
direction = parts[dir_idx].strip('"') if dir_idx >= 0 else "0"
|
||||
service_id = parts[service_idx].strip('"')
|
||||
trip_group_key[trip_id] = (route_id, direction, service_id)
|
||||
|
||||
print(f" Found {len(trip_group_key)} trips on target routes")
|
||||
|
||||
# Step 3: Get first departure time and first stop for each target trip
|
||||
trip_first_dep: dict[str, int] = {}
|
||||
trip_first_stop: dict[str, str] = {}
|
||||
with zin.open("stop_times.txt") as f:
|
||||
header = f.readline().decode("utf-8").strip()
|
||||
cols = header.split(",")
|
||||
trip_id_idx = cols.index("trip_id")
|
||||
dep_idx = cols.index("departure_time")
|
||||
seq_idx = cols.index("stop_sequence")
|
||||
stop_id_idx = cols.index("stop_id")
|
||||
for line in f:
|
||||
parts = line.decode("utf-8", errors="replace").strip().split(",")
|
||||
if not parts:
|
||||
continue
|
||||
trip_id = parts[trip_id_idx].strip('"')
|
||||
if trip_id not in trip_group_key:
|
||||
continue
|
||||
if parts[seq_idx].strip('"') == "0":
|
||||
dep_secs = _parse_gtfs_time(parts[dep_idx])
|
||||
if dep_secs is not None:
|
||||
trip_first_dep[trip_id] = dep_secs
|
||||
trip_first_stop[trip_id] = parts[stop_id_idx].strip('"')
|
||||
|
||||
# Step 4: Group trips by (route, direction, service, first_stop) and compute headways
|
||||
groups: dict[tuple[str, ...], list[tuple[str, int]]] = defaultdict(list)
|
||||
for trip_id, dep_secs in trip_first_dep.items():
|
||||
route_id, direction, service_id = trip_group_key[trip_id]
|
||||
first_stop = trip_first_stop.get(trip_id, "")
|
||||
key = (route_id, direction, service_id, first_stop)
|
||||
groups[key].append((trip_id, dep_secs))
|
||||
|
||||
trips_to_remove: set[str] = set()
|
||||
frequency_entries: list[tuple[str, int, int, int]] = []
|
||||
groups_converted = 0
|
||||
|
||||
for _key, trips in groups.items():
|
||||
if len(trips) < 4:
|
||||
continue
|
||||
|
||||
trips.sort(key=lambda x: x[1])
|
||||
headways = [trips[i + 1][1] - trips[i][1] for i in range(len(trips) - 1)]
|
||||
headways = [h for h in headways if h > 0]
|
||||
|
||||
if len(headways) < 3:
|
||||
continue
|
||||
|
||||
median_hw = statistics.median(headways)
|
||||
if median_hw > max_headway_secs or median_hw < 30:
|
||||
continue
|
||||
|
||||
mean_hw = statistics.mean(headways)
|
||||
if mean_hw == 0:
|
||||
continue
|
||||
stdev_hw = statistics.stdev(headways) if len(headways) > 1 else 0
|
||||
if stdev_hw / mean_hw > 0.5:
|
||||
continue
|
||||
|
||||
# Convert: keep first trip as template, remove the rest
|
||||
template_trip_id = trips[0][0]
|
||||
start_secs = trips[0][1]
|
||||
end_secs = trips[-1][1] + int(median_hw)
|
||||
headway_rounded = max(60, round(median_hw / 60) * 60)
|
||||
|
||||
frequency_entries.append((template_trip_id, start_secs, end_secs, headway_rounded))
|
||||
for trip_id, _ in trips[1:]:
|
||||
trips_to_remove.add(trip_id)
|
||||
groups_converted += 1
|
||||
|
||||
print(f" Converted {groups_converted} trip groups to frequency-based")
|
||||
print(f" Removing {len(trips_to_remove)} redundant trips")
|
||||
print(f" Created {len(frequency_entries)} frequency entries")
|
||||
|
||||
# Step 5: Write modified GTFS
|
||||
with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
|
||||
dst, "w", zipfile.ZIP_DEFLATED
|
||||
) as zout:
|
||||
for info in zin.infolist():
|
||||
if info.filename == "trips.txt":
|
||||
with zin.open(info) as f:
|
||||
header = f.readline()
|
||||
header_str = header.decode("utf-8").strip()
|
||||
cols = header_str.split(",")
|
||||
trip_id_idx = cols.index("trip_id")
|
||||
|
||||
tmp = tempfile.NamedTemporaryFile(
|
||||
mode="wb", delete=False, suffix=".txt"
|
||||
)
|
||||
tmp.write(header)
|
||||
for line in f:
|
||||
parts = (
|
||||
line.decode("utf-8", errors="replace").strip().split(",")
|
||||
)
|
||||
if not parts:
|
||||
continue
|
||||
if parts[trip_id_idx].strip('"') not in trips_to_remove:
|
||||
tmp.write(line)
|
||||
tmp.close()
|
||||
zout.write(tmp.name, "trips.txt")
|
||||
os.unlink(tmp.name)
|
||||
|
||||
elif info.filename == "stop_times.txt":
|
||||
with zin.open(info) as f:
|
||||
header = f.readline()
|
||||
header_str = header.decode("utf-8").strip()
|
||||
cols = header_str.split(",")
|
||||
trip_id_idx = cols.index("trip_id")
|
||||
|
||||
tmp = tempfile.NamedTemporaryFile(
|
||||
mode="wb", delete=False, suffix=".txt"
|
||||
)
|
||||
tmp.write(header)
|
||||
for line in f:
|
||||
parts = (
|
||||
line.decode("utf-8", errors="replace").strip().split(",")
|
||||
)
|
||||
if not parts:
|
||||
continue
|
||||
if parts[trip_id_idx].strip('"') not in trips_to_remove:
|
||||
tmp.write(line)
|
||||
tmp.close()
|
||||
zout.write(tmp.name, "stop_times.txt")
|
||||
os.unlink(tmp.name)
|
||||
|
||||
elif info.filename == "frequencies.txt":
|
||||
pass # we'll write our own below
|
||||
|
||||
else:
|
||||
zout.writestr(info, zin.read(info))
|
||||
|
||||
# Write frequencies.txt
|
||||
freq_lines = ["trip_id,start_time,end_time,headway_secs,exact_times\n"]
|
||||
for trip_id, start, end, headway in frequency_entries:
|
||||
freq_lines.append(
|
||||
f"{trip_id},{_secs_to_gtfs_time(start)},{_secs_to_gtfs_time(end)},{headway},0\n"
|
||||
)
|
||||
zout.writestr("frequencies.txt", "".join(freq_lines))
|
||||
|
||||
print(f" Saved to {dst}")
|
||||
|
||||
|
||||
def download_tfl_transxchange(raw_dir: Path) -> Path:
|
||||
"""Download TfL TransXChange timetable bundle."""
|
||||
dest = raw_dir / "tfl_transxchange.zip"
|
||||
|
|
@ -655,12 +882,15 @@ def main() -> None:
|
|||
raw_dir = output_dir / "raw"
|
||||
raw_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# 1. Download and clean BODS GTFS
|
||||
# 1. Download, clean, and frequency-convert BODS GTFS
|
||||
download_osm_pbf(raw_dir)
|
||||
bods_raw = download_bods_gtfs(raw_dir)
|
||||
|
||||
bods_clean = output_dir / "bods_gtfs.zip"
|
||||
clean_gtfs(bods_raw, bods_clean)
|
||||
bods_cleaned = raw_dir / "bods_gtfs_cleaned.zip"
|
||||
clean_gtfs(bods_raw, bods_cleaned)
|
||||
|
||||
bods_final = output_dir / "bods_gtfs.zip"
|
||||
convert_high_freq_to_frequency_based(bods_cleaned, bods_final)
|
||||
|
||||
# 2. TfL TransXChange → GTFS
|
||||
if args.skip_tfl:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue