Lots of improvements

2026-03-10 22:05:51 +00:00 · 2026-03-10 22:05:51 +00:00 · 80a5a2a774
commit 80a5a2a774
parent ef921361ec
21 changed files with 489 additions and 337 deletions
--- a/pipeline/download/places.py
+++ b/pipeline/download/places.py
@ -115,15 +115,17 @@ class PlaceHandler(osmium.SimpleHandler):
            self._add(name, place_type, lat, lon, population)
            return

-        # Tube stations only (London Underground)
+        # Railway stations (tube, national rail, DLR, overground, Elizabeth line)
        if n.tags.get("railway") == "station":
            tags = dict(n.tags)
            station_tag = tags.get("station", "")
            network = tags.get("network", "").lower()
-            if station_tag == "subway" or "underground" in network:
-                display_name = _station_display_name(name, tags)
-                self._add(display_name, "station", lat, lon, population)
+            # Skip tram stops
+            if station_tag == "light_rail" or "tramlink" in network or "tram" in network:
                return
+            display_name = _station_display_name(name, tags)
+            self._add(display_name, "station", lat, lon, population)
+            return


 def main() -> None:
@ -137,7 +139,7 @@ def main() -> None:
    args = parser.parse_args()

    pbf_file = args.pbf
-    print("Extracting place nodes: cities + tube stations")
+    print("Extracting place nodes: cities + railway stations")
    with tqdm(
        unit=" elements",
        unit_scale=True,
--- a/pipeline/download/rightmove_outcodes.py
+++ b/pipeline/download/rightmove_outcodes.py
@ -0,0 +1,74 @@
+"""Fetch Rightmove outcode→ID mapping for all outcodes in postcode.parquet."""
+
+import argparse
+import json
+from pathlib import Path
+
+import httpx
+import polars as pl
+
+
+TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"
+
+
+def fetch_outcode_ids(postcodes_path: Path, output: Path) -> None:
+    df = pl.read_parquet(postcodes_path, columns=["Postcode"])
+    outcodes = sorted(
+        set(df["Postcode"].str.split(" ").list.first().to_list()) - {""}
+    )
+    print(f"Querying Rightmove typeahead for {len(outcodes)} outcodes...")
+
+    mapping: dict[str, str] = {}
+    missed: list[str] = []
+    client = httpx.Client(timeout=10)
+
+    for i, oc in enumerate(outcodes):
+        try:
+            resp = client.get(TYPEAHEAD_URL, params={"query": oc, "limit": "5"})
+            data = resp.json()
+            found = False
+            for m in data.get("matches", []):
+                if (
+                    m["type"] == "OUTCODE"
+                    and m["displayName"].upper().replace(" ", "")
+                    == oc.upper().replace(" ", "")
+                ):
+                    mapping[oc] = str(m["id"])
+                    found = True
+                    break
+            if not found:
+                missed.append(oc)
+        except Exception as e:
+            missed.append(oc)
+            print(f"  Error for {oc}: {e}")
+
+        if (i + 1) % 200 == 0:
+            print(f"  {i + 1}/{len(outcodes)} done ({len(mapping)} found)")
+
+    client.close()
+
+    output.parent.mkdir(parents=True, exist_ok=True)
+    with open(output, "w") as f:
+        json.dump(mapping, f, sort_keys=True)
+
+    print(f"Wrote {output} ({len(mapping)} outcodes, {len(missed)} missed)")
+    if missed:
+        print(f"Missed: {missed}")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Fetch Rightmove outcode ID mapping"
+    )
+    parser.add_argument(
+        "--postcodes", type=Path, required=True, help="postcode.parquet path"
+    )
+    parser.add_argument(
+        "--output", type=Path, required=True, help="Output JSON file path"
+    )
+    args = parser.parse_args()
+    fetch_outcode_ids(args.postcodes, args.output)
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/download/transit_network.py
+++ b/pipeline/download/transit_network.py
@ -8,6 +8,7 @@ Downloads:

 Then processes for R5 compatibility:
  - Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100)
+  - Converts high-frequency metro/tram services to frequency-based GTFS
  - Converts TfL TransXChange to GTFS via transxchange2gtfs
  - Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker)

@ -20,12 +21,15 @@ Output directory: property-data/transit/
 import argparse
 import json
 import os
+import shutil
+import statistics
 import subprocess
 import tempfile
 import time
 import urllib.parse
 import urllib.request
 import zipfile
+from collections import defaultdict
 from pathlib import Path

 from tqdm import tqdm
@ -184,6 +188,229 @@ def clean_gtfs(src: Path, dst: Path) -> None:
    print(f"  Saved to {dst}")


+def _parse_gtfs_time(time_str: str) -> int | None:
+    """Parse HH:MM:SS to seconds since midnight. Returns None on failure."""
+    time_str = time_str.strip('"')
+    if ":" not in time_str:
+        return None
+    try:
+        h, m, s = time_str.split(":")
+        return int(h) * 3600 + int(m) * 60 + int(s)
+    except ValueError:
+        return None
+
+
+def _secs_to_gtfs_time(s: int) -> str:
+    """Convert seconds since midnight to HH:MM:SS."""
+    h = s // 3600
+    m = (s % 3600) // 60
+    sec = s % 60
+    return f"{h:02d}:{m:02d}:{sec:02d}"
+
+
+def convert_high_freq_to_frequency_based(
+    src: Path, dst: Path, *, max_headway_minutes: int = 15
+) -> None:
+    """Convert high-frequency scheduled services to frequency-based GTFS entries.
+
+    Identifies metro (route_type=1) and tram (route_type=0) routes with regular
+    headways under max_headway_minutes, then creates frequencies.txt entries and
+    removes redundant trips. R5's RAPTOR produces smoother percentile results for
+    frequency-based services, matching the "just turn up" reality of high-frequency
+    metro/tram services.
+    """
+    if dst.exists():
+        print(f"Frequency-converted GTFS already exists: {dst}")
+        return
+
+    print("Converting high-frequency services to frequency-based...")
+    max_headway_secs = max_headway_minutes * 60
+
+    with zipfile.ZipFile(src, "r") as zin:
+        # Step 1: Find metro/tram route IDs
+        target_route_ids: set[str] = set()
+        with zin.open("routes.txt") as f:
+            header = f.readline().decode("utf-8").strip()
+            cols = header.split(",")
+            route_id_idx = cols.index("route_id")
+            rt_idx = cols.index("route_type")
+            for line in f:
+                parts = line.decode("utf-8", errors="replace").strip().split(",")
+                if not parts:
+                    continue
+                route_type = parts[rt_idx].strip('"')
+                if route_type in ("0", "1"):  # tram, metro/subway
+                    target_route_ids.add(parts[route_id_idx].strip('"'))
+
+        if not target_route_ids:
+            print("  No metro/tram routes found, copying unchanged")
+            shutil.copy2(src, dst)
+            return
+
+        print(f"  Found {len(target_route_ids)} metro/tram routes")
+
+        # Step 2: Map target trips to grouping keys
+        trip_group_key: dict[str, tuple[str, str, str]] = {}
+        with zin.open("trips.txt") as f:
+            header = f.readline().decode("utf-8").strip()
+            cols = header.split(",")
+            trip_id_idx = cols.index("trip_id")
+            route_id_idx = cols.index("route_id")
+            dir_idx = cols.index("direction_id") if "direction_id" in cols else -1
+            service_idx = cols.index("service_id")
+            for line in f:
+                parts = line.decode("utf-8", errors="replace").strip().split(",")
+                if not parts:
+                    continue
+                route_id = parts[route_id_idx].strip('"')
+                if route_id in target_route_ids:
+                    trip_id = parts[trip_id_idx].strip('"')
+                    direction = parts[dir_idx].strip('"') if dir_idx >= 0 else "0"
+                    service_id = parts[service_idx].strip('"')
+                    trip_group_key[trip_id] = (route_id, direction, service_id)
+
+        print(f"  Found {len(trip_group_key)} trips on target routes")
+
+        # Step 3: Get first departure time and first stop for each target trip
+        trip_first_dep: dict[str, int] = {}
+        trip_first_stop: dict[str, str] = {}
+        with zin.open("stop_times.txt") as f:
+            header = f.readline().decode("utf-8").strip()
+            cols = header.split(",")
+            trip_id_idx = cols.index("trip_id")
+            dep_idx = cols.index("departure_time")
+            seq_idx = cols.index("stop_sequence")
+            stop_id_idx = cols.index("stop_id")
+            for line in f:
+                parts = line.decode("utf-8", errors="replace").strip().split(",")
+                if not parts:
+                    continue
+                trip_id = parts[trip_id_idx].strip('"')
+                if trip_id not in trip_group_key:
+                    continue
+                if parts[seq_idx].strip('"') == "0":
+                    dep_secs = _parse_gtfs_time(parts[dep_idx])
+                    if dep_secs is not None:
+                        trip_first_dep[trip_id] = dep_secs
+                        trip_first_stop[trip_id] = parts[stop_id_idx].strip('"')
+
+    # Step 4: Group trips by (route, direction, service, first_stop) and compute headways
+    groups: dict[tuple[str, ...], list[tuple[str, int]]] = defaultdict(list)
+    for trip_id, dep_secs in trip_first_dep.items():
+        route_id, direction, service_id = trip_group_key[trip_id]
+        first_stop = trip_first_stop.get(trip_id, "")
+        key = (route_id, direction, service_id, first_stop)
+        groups[key].append((trip_id, dep_secs))
+
+    trips_to_remove: set[str] = set()
+    frequency_entries: list[tuple[str, int, int, int]] = []
+    groups_converted = 0
+
+    for _key, trips in groups.items():
+        if len(trips) < 4:
+            continue
+
+        trips.sort(key=lambda x: x[1])
+        headways = [trips[i + 1][1] - trips[i][1] for i in range(len(trips) - 1)]
+        headways = [h for h in headways if h > 0]
+
+        if len(headways) < 3:
+            continue
+
+        median_hw = statistics.median(headways)
+        if median_hw > max_headway_secs or median_hw < 30:
+            continue
+
+        mean_hw = statistics.mean(headways)
+        if mean_hw == 0:
+            continue
+        stdev_hw = statistics.stdev(headways) if len(headways) > 1 else 0
+        if stdev_hw / mean_hw > 0.5:
+            continue
+
+        # Convert: keep first trip as template, remove the rest
+        template_trip_id = trips[0][0]
+        start_secs = trips[0][1]
+        end_secs = trips[-1][1] + int(median_hw)
+        headway_rounded = max(60, round(median_hw / 60) * 60)
+
+        frequency_entries.append((template_trip_id, start_secs, end_secs, headway_rounded))
+        for trip_id, _ in trips[1:]:
+            trips_to_remove.add(trip_id)
+        groups_converted += 1
+
+    print(f"  Converted {groups_converted} trip groups to frequency-based")
+    print(f"  Removing {len(trips_to_remove)} redundant trips")
+    print(f"  Created {len(frequency_entries)} frequency entries")
+
+    # Step 5: Write modified GTFS
+    with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
+        dst, "w", zipfile.ZIP_DEFLATED
+    ) as zout:
+        for info in zin.infolist():
+            if info.filename == "trips.txt":
+                with zin.open(info) as f:
+                    header = f.readline()
+                    header_str = header.decode("utf-8").strip()
+                    cols = header_str.split(",")
+                    trip_id_idx = cols.index("trip_id")
+
+                    tmp = tempfile.NamedTemporaryFile(
+                        mode="wb", delete=False, suffix=".txt"
+                    )
+                    tmp.write(header)
+                    for line in f:
+                        parts = (
+                            line.decode("utf-8", errors="replace").strip().split(",")
+                        )
+                        if not parts:
+                            continue
+                        if parts[trip_id_idx].strip('"') not in trips_to_remove:
+                            tmp.write(line)
+                    tmp.close()
+                    zout.write(tmp.name, "trips.txt")
+                    os.unlink(tmp.name)
+
+            elif info.filename == "stop_times.txt":
+                with zin.open(info) as f:
+                    header = f.readline()
+                    header_str = header.decode("utf-8").strip()
+                    cols = header_str.split(",")
+                    trip_id_idx = cols.index("trip_id")
+
+                    tmp = tempfile.NamedTemporaryFile(
+                        mode="wb", delete=False, suffix=".txt"
+                    )
+                    tmp.write(header)
+                    for line in f:
+                        parts = (
+                            line.decode("utf-8", errors="replace").strip().split(",")
+                        )
+                        if not parts:
+                            continue
+                        if parts[trip_id_idx].strip('"') not in trips_to_remove:
+                            tmp.write(line)
+                    tmp.close()
+                    zout.write(tmp.name, "stop_times.txt")
+                    os.unlink(tmp.name)
+
+            elif info.filename == "frequencies.txt":
+                pass  # we'll write our own below
+
+            else:
+                zout.writestr(info, zin.read(info))
+
+        # Write frequencies.txt
+        freq_lines = ["trip_id,start_time,end_time,headway_secs,exact_times\n"]
+        for trip_id, start, end, headway in frequency_entries:
+            freq_lines.append(
+                f"{trip_id},{_secs_to_gtfs_time(start)},{_secs_to_gtfs_time(end)},{headway},0\n"
+            )
+        zout.writestr("frequencies.txt", "".join(freq_lines))
+
+    print(f"  Saved to {dst}")
+
+
 def download_tfl_transxchange(raw_dir: Path) -> Path:
    """Download TfL TransXChange timetable bundle."""
    dest = raw_dir / "tfl_transxchange.zip"
@ -655,12 +882,15 @@ def main() -> None:
    raw_dir = output_dir / "raw"
    raw_dir.mkdir(parents=True, exist_ok=True)

-    # 1. Download and clean BODS GTFS
+    # 1. Download, clean, and frequency-convert BODS GTFS
    download_osm_pbf(raw_dir)
    bods_raw = download_bods_gtfs(raw_dir)

-    bods_clean = output_dir / "bods_gtfs.zip"
-    clean_gtfs(bods_raw, bods_clean)
+    bods_cleaned = raw_dir / "bods_gtfs_cleaned.zip"
+    clean_gtfs(bods_raw, bods_cleaned)
+
+    bods_final = output_dir / "bods_gtfs.zip"
+    convert_high_freq_to_frequency_based(bods_cleaned, bods_final)

    # 2. TfL TransXChange → GTFS
    if args.skip_tfl: