Improve journey time fetching

2026-01-27 22:06:45 +00:00 · 2026-01-27 22:06:45 +00:00 · bd0dd34b6e
commit bd0dd34b6e
parent 31bfc76b58
7 changed files with 165 additions and 54 deletions
--- a/pipeline/journey_times/init.py
+++ b/pipeline/journey_times/init.py
@ -14,7 +14,6 @@ from .results import results_to_dataframe, save_results
 from .tfl_client import fetch_journey_times

 __all__ = [
-    # Config
    "DATA_DIR",
    "OUTPUT_DIR",
    "MAX_DELAY",
@ -24,7 +23,6 @@ __all__ = [
    "DESTINATIONS",
    "Destination",
    "JourneyResult",
-    "load_all_postcodes",
    "fetch_journey_times",
    "results_to_dataframe",
    "save_results",
--- a/pipeline/journey_times/main.py
+++ b/pipeline/journey_times/main.py
@ -10,7 +10,6 @@ from .results import results_to_dataframe, save_results
 from .tfl_client import fetch_journey_times


-
 def main():
    destination = DESTINATIONS["bank"]

@ -18,7 +17,7 @@ def main():
    today = date.today()
    days_until_monday = (7 - today.weekday()) % 7 or 7
    journey_date = today + timedelta(days=days_until_monday)
-    journey_time = "0800"
+    journey_time = "0845"

    print(f"Destination: {destination.name}")
    print(
@ -26,7 +25,7 @@ def main():
        f"at {journey_time[:2]}:{journey_time[2:]}"
    )

-    postcodes_df =  pl.read_parquet(OUTPUT_DIR / "postcodes_h3.parquet")
+    postcodes_df = pl.read_parquet(OUTPUT_DIR / "postcodes_h3.parquet")
    print(f"Loaded {postcodes_df.height:,} postcodes")

    postcode_data = list(
@ -67,7 +66,7 @@ def main():
        pl.lit(f"{journey_time[:2]}:{journey_time[2:]}").alias("journey_time"),
    )

-    successful = results_df.filter(pl.col("fastest_minutes").is_not_null()).height
+    successful = results_df.filter(pl.col("cycling").is_not_null()).height
    print(f"Completed: {successful}/{len(results)} successful")

    parquet_path = save_results(results_df, destination.name)
--- a/pipeline/journey_times/models.py
+++ b/pipeline/journey_times/models.py
@ -24,9 +24,7 @@ class JourneyResult:
    """Result of a journey time calculation for a postcode."""

    postcode: str
-    walking_minutes: int | None = None
+    public_transport_easy_minutes: int | None = None
    cycling_minutes: int | None = None
-    public_transport_minutes: int | None = None
-    fastest_minutes: int | None = None
-    fastest_mode: str | None = None
+    public_transport_quick_minutes: int | None = None
    error: str | None = None
--- a/pipeline/journey_times/rate_limiter.py
+++ b/pipeline/journey_times/rate_limiter.py
@ -17,10 +17,12 @@ class RateLimiter:
        """Wait until we can make a request within rate limits."""
        async with self._lock:
            now = asyncio.get_event_loop().time()
-            cutoff = now - 60.0
+            cutoff = now - 10.0  # 10 seconds
            self.request_times = [t for t in self.request_times if t > cutoff]

-            if len(self.request_times) >= REQUESTS_PER_MIN:
+            if (
+                len(self.request_times) >= REQUESTS_PER_MIN // 6
+            ):  # we look at it every 10 seconds instead of minutes
                wait_time = self.request_times[0] - cutoff
                if wait_time > 0:
                    warnings.warn(
--- a/pipeline/journey_times/results.py
+++ b/pipeline/journey_times/results.py
@ -11,11 +11,9 @@ def results_to_dataframe(results: list[JourneyResult]) -> pl.DataFrame:
        [
            {
                "postcode": r.postcode,
-                "walking_minutes": r.walking_minutes,
+                "public_transport_easy_minutes": r.public_transport_easy_minutes,
+                "public_transport_quick_minutes": r.public_transport_quick_minutes,
                "cycling_minutes": r.cycling_minutes,
-                "public_transport_minutes": r.public_transport_minutes,
-                "fastest_minutes": r.fastest_minutes,
-                "fastest_mode": r.fastest_mode,
                "error": r.error,
            }
            for r in results
--- a/pipeline/journey_times/tfl_client.py
+++ b/pipeline/journey_times/tfl_client.py
@ -1,10 +1,10 @@
-"""TfL API client for fetching journey times."""
-
 import asyncio
+from typing import Literal
 import warnings
 from collections.abc import Callable
 from http import HTTPStatus

+from httpx import Timeout
 from journey_client import Client
 from journey_client.api.journey import (
    journey_journey_results_by_path_from_path_to_query_via_query_national_search_query_date_qu as journey_api,
@ -12,7 +12,16 @@ from journey_client.api.journey import (
 from journey_client.models import (
    JourneyJourneyResultsByPathFromPathToQueryViaQueryNationalSearchQueryDateQuTimeIs as TimeIs,
 )
-from journey_client.types import UNSET, Unset
+from journey_client.models import (
+    JourneyJourneyResultsByPathFromPathToQueryViaQueryNationalSearchQueryDateQuJourneyPreference as JourneyPreference,
+)
+from journey_client.models import (
+    JourneyJourneyResultsByPathFromPathToQueryViaQueryNationalSearchQueryDateQuCyclePreference as CyclePreference,
+)
+from journey_client.models import (
+    JourneyJourneyResultsByPathFromPathToQueryViaQueryNationalSearchQueryDateQuBikeProficiency as BikeProficiency,
+)
+from journey_client.types import Unset

 from .config import MAX_DELAY
 from .models import Destination, JourneyResult
@ -24,25 +33,66 @@ async def fetch_journey_for_mode(
    rate_limiter: RateLimiter,
    from_location: str,
    to_location: str,
-    mode: list[str] | Unset,
    journey_date: str,
    journey_time: str,
+    journey_type: Literal["quick"] | Literal["easy"] | Literal["cycle"],
    retry_count: int = 5,
 ) -> int | None:
    """Fetch journey time for a specific mode with rate limiting."""
-    mode_name = ",".join(mode) if not isinstance(mode, Unset) else "public_transport"
    backoff = 1.0
    for attempt in range(retry_count):
        try:
            await rate_limiter.acquire()

+            cycle_preference = {
+                "quick": CyclePreference.TAKEONTRANSPORT,
+                "easy": CyclePreference.NONE,
+                "cycle": CyclePreference.ALLTHEWAY,
+            }[journey_type]
+
+            # options: public-bus,overground,train,tube,coach,dlr,cablecar,tram,river,walking,cycle
+            mode = {
+                "quick": [
+                    "public-bus",
+                    "overground",
+                    "train",
+                    "tube",
+                    "coach",
+                    "dlr",
+                    "cablecar",
+                    "tram",
+                    "river",
+                    "walking",
+                    "cycle",
+                ],
+                "easy": [
+                    "public-bus",
+                    "overground",
+                    "train",
+                    "tube",
+                    "coach",
+                    "dlr",
+                    "cablecar",
+                    "tram",
+                    "river",
+                ],
+                "cycle": ["cycle"],
+            }[journey_type]
+
            response = await journey_api.asyncio_detailed(
                from_=from_location,
                to=to_location,
                client=client,
                date=journey_date,
                time=journey_time,
-                time_is=TimeIs.DEPARTING,
+                national_search=True,
+                time_is=TimeIs.ARRIVING,
+                journey_preference=JourneyPreference.LEASTINTERCHANGE
+                if journey_type == "easy"
+                else JourneyPreference.LEASTINTERCHANGE,
+                cycle_preference=cycle_preference,
+                bike_proficiency=BikeProficiency.FAST,
+                walking_optimization=journey_type == "quick",
                mode=mode,
            )

@ -65,7 +115,7 @@ async def fetch_journey_for_mode(
                HTTPStatus.GATEWAY_TIMEOUT,
            ):
                warnings.warn(
-                    f"HTTP {response.status_code.value} for {mode_name} from {from_location}, "
+                    f"HTTP {response.status_code.value} for {journey_type} from {from_location}, "
                    f"retrying in {backoff:.1f}s (attempt {attempt + 1}/{retry_count})",
                    stacklevel=2,
                )
@ -76,7 +126,7 @@ async def fetch_journey_for_mode(
                return None
        except Exception as e:
            warnings.warn(
-                f"Network error for {mode_name} from {from_location}: {e}, "
+                f"Network error for {journey_type} from {from_location}: {e}, "
                f"retrying in {backoff:.1f}s (attempt {attempt + 1}/{retry_count})",
                stacklevel=2,
            )
@ -84,7 +134,7 @@ async def fetch_journey_for_mode(
            backoff = min(backoff * 2, MAX_DELAY)
            continue
    warnings.warn(
-        f"Failed to fetch {mode_name} from {from_location} after {retry_count} attempts",
+        f"Failed to fetch {journey_type} from {from_location} after {retry_count} attempts",
        stacklevel=2,
    )
    return None
@ -106,55 +156,42 @@ async def fetch_all_modes(
        try:
            from_location = f"{lat},{lon}"

-            walking = await fetch_journey_for_mode(
+            easy = await fetch_journey_for_mode(
                client,
                rate_limiter,
                from_location,
                to_location,
-                ["walking"],
                journey_date,
                journey_time,
+                journey_type="easy",
+            )
+            quick = await fetch_journey_for_mode(
+                client,
+                rate_limiter,
+                from_location,
+                to_location,
+                journey_date,
+                journey_time,
+                journey_type="quick",
            )
            cycling = await fetch_journey_for_mode(
                client,
                rate_limiter,
                from_location,
                to_location,
-                ["cycle"],
                journey_date,
                journey_time,
+                journey_type="cycle",
            )
-            public = await fetch_journey_for_mode(
-                client,
-                rate_limiter,
-                from_location,
-                to_location,
-                UNSET,
-                journey_date,
-                journey_time,
-            )
-
-            options = [
-                ("walking", walking),
-                ("cycling", cycling),
-                ("public_transport", public),
-            ]
-            valid_options = [(mode, time) for mode, time in options if time is not None]
-
-            if valid_options:
-                fastest_mode, fastest_time = min(valid_options, key=lambda x: x[1])
-            else:
-                fastest_mode, fastest_time = None, None

            return JourneyResult(
                postcode=postcode,
-                walking_minutes=walking,
+                public_transport_easy_minutes=easy,
+                public_transport_quick_minutes=quick,
                cycling_minutes=cycling,
-                public_transport_minutes=public,
-                fastest_minutes=fastest_time,
-                fastest_mode=fastest_mode,
            )
        except Exception as e:
+            print(f"Error: {e}")
            return JourneyResult(postcode=postcode, error=str(e))


@ -183,7 +220,7 @@ async def fetch_journey_times(
    to_location = dest.to_tfl_location()
    rate_limiter = RateLimiter()

-    client = Client(base_url="https://api.tfl.gov.uk", timeout=30.0)
+    client = Client(base_url="https://api.tfl.gov.uk").with_timeout(Timeout(30))
    async with client as client:
        tasks = [
            fetch_all_modes(
--- a/pipeline/processors/journey_times_aggregator.py
+++ b/pipeline/processors/journey_times_aggregator.py
@ -0,0 +1,79 @@
+"""Aggregate journey times data by H3 hexagonal cells."""
+
+from pathlib import Path
+
+import polars as pl
+
+from pipeline.config import AGGREGATES_DIR, H3_RESOLUTIONS, PROCESSED_DIR
+
+
+def aggregate_journey_times(
+    journey_times_path: Path | None = None,
+    postcodes_h3_path: Path | None = None,
+    output_dir: Path | None = None,
+) -> list[Path]:
+    """
+    Aggregate journey times by H3 cells at all resolutions.
+
+    Joins journey_times_bank.parquet with postcodes_h3.parquet on postcode,
+    then groups by H3 cell to compute median journey time.
+    """
+    journey_times_path = journey_times_path or PROCESSED_DIR / "journey_times_bank.parquet"
+    postcodes_h3_path = postcodes_h3_path or PROCESSED_DIR / "postcodes_h3.parquet"
+    output_dir = output_dir or AGGREGATES_DIR
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Load journey times data
+    journey_df = pl.read_parquet(journey_times_path).select(
+        ["postcode", "public_transport_minutes"]
+    )
+
+    # Filter out null journey times
+    journey_df = journey_df.filter(pl.col("public_transport_minutes").is_not_null())
+
+    if journey_df.height == 0:
+        print("No valid journey times found")
+        return []
+
+    # Load postcodes with H3 indices
+    postcodes_df = pl.read_parquet(postcodes_h3_path)
+
+    # Join on postcode to get H3 indices
+    joined_df = journey_df.join(postcodes_df, on="postcode", how="inner")
+
+    if joined_df.height == 0:
+        print("No matching postcodes found")
+        return []
+
+    print(f"Joined {joined_df.height} postcodes with journey times")
+
+    saved_paths = []
+
+    for resolution in H3_RESOLUTIONS:
+        h3_col = f"h3_res{resolution}"
+
+        if h3_col not in joined_df.columns:
+            print(f"Skipping resolution {resolution} - column {h3_col} not found")
+            continue
+
+        # Aggregate by H3 cell - compute median journey time
+        agg_df = (
+            joined_df.group_by(h3_col)
+            .agg(
+                pl.col("public_transport_minutes").median().alias("median_journey_minutes"),
+                pl.col("public_transport_minutes").count().alias("journey_count"),
+            )
+            .rename({h3_col: "h3"})
+        )
+
+        output_path = output_dir / f"journey_times_res{resolution}.parquet"
+        agg_df.write_parquet(output_path)
+        saved_paths.append(output_path)
+        print(f"Saved {agg_df.height} cells to {output_path}")
+
+    return saved_paths
+
+
+if __name__ == "__main__":
+    aggregate_journey_times()