Remove journey times
This commit is contained in:
parent
7a12e6c09a
commit
9da2db707f
7 changed files with 0 additions and 616 deletions
|
|
@ -1,25 +0,0 @@
|
|||
"""Journey times calculation module for TfL transit data."""
|
||||
|
||||
from .config import (
|
||||
DESTINATIONS,
|
||||
MAX_CONCURRENT,
|
||||
MAX_DELAY,
|
||||
MAX_POSTCODES,
|
||||
REQUESTS_PER_MIN,
|
||||
)
|
||||
from .models import Destination, JourneyResult
|
||||
from .results import results_to_dataframe, save_results
|
||||
from .tfl_client import fetch_journey_times
|
||||
|
||||
__all__ = [
|
||||
"MAX_DELAY",
|
||||
"REQUESTS_PER_MIN",
|
||||
"MAX_POSTCODES",
|
||||
"MAX_CONCURRENT",
|
||||
"DESTINATIONS",
|
||||
"Destination",
|
||||
"JourneyResult",
|
||||
"fetch_journey_times",
|
||||
"results_to_dataframe",
|
||||
"save_results",
|
||||
]
|
||||
|
|
@ -1,167 +0,0 @@
|
|||
import argparse
|
||||
import asyncio
|
||||
import random
|
||||
from datetime import date, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
from tqdm import tqdm
|
||||
|
||||
from .config import (
|
||||
DESTINATIONS,
|
||||
MAX_CONCURRENT,
|
||||
MAX_POSTCODES,
|
||||
MAX_DISTANCE_KM,
|
||||
)
|
||||
from .models import JourneyResult
|
||||
from .results import CheckpointSaver, results_to_dataframe, save_results
|
||||
from .tfl_client import fetch_journey_times
|
||||
from pipeline.utils import haversine_km_expr
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Fetch TfL journey times")
|
||||
parser.add_argument(
|
||||
"--destination",
|
||||
required=True,
|
||||
choices=list(DESTINATIONS.keys()),
|
||||
help="Destination key",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
required=True,
|
||||
type=Path,
|
||||
help="Directory for output and checkpoint files",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--postcodes",
|
||||
required=True,
|
||||
type=Path,
|
||||
help="ArcGIS postcode parquet file",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
destination = DESTINATIONS[args.destination]
|
||||
output_dir = args.output_dir
|
||||
|
||||
# Calculate next Monday at 8am
|
||||
today = date.today()
|
||||
days_until_monday = (7 - today.weekday()) % 7 or 7
|
||||
journey_date = today + timedelta(days=days_until_monday)
|
||||
journey_time = "0845"
|
||||
|
||||
print(f"Destination: {destination.name}")
|
||||
print(
|
||||
f"Journey: {journey_date.strftime('%A %Y-%m-%d')} "
|
||||
f"at {journey_time[:2]}:{journey_time[2:]}"
|
||||
)
|
||||
|
||||
postcodes_df = pl.read_parquet(args.postcodes).select(
|
||||
pl.col("pcds").alias("postcode"),
|
||||
"lat",
|
||||
"long",
|
||||
)
|
||||
print(f"Loaded {postcodes_df.height:,} postcodes")
|
||||
|
||||
# Filter to postcodes within range of destination
|
||||
postcodes_df = postcodes_df.with_columns(
|
||||
haversine_km_expr("lat", "long", destination.lat, destination.lon).alias(
|
||||
"distance_km"
|
||||
)
|
||||
).filter(pl.col("distance_km") <= MAX_DISTANCE_KM)
|
||||
|
||||
print(f"Filtered to {postcodes_df.height:,} postcodes within {MAX_DISTANCE_KM}km")
|
||||
|
||||
postcode_data = list(
|
||||
zip(
|
||||
postcodes_df["postcode"].to_list(),
|
||||
postcodes_df["lat"].to_list(),
|
||||
postcodes_df["long"].to_list(),
|
||||
)
|
||||
)
|
||||
|
||||
if MAX_POSTCODES is not None and len(postcode_data) > MAX_POSTCODES:
|
||||
postcode_data = random.sample(postcode_data, MAX_POSTCODES)
|
||||
print(f"Randomly sampled {MAX_POSTCODES} postcodes")
|
||||
|
||||
checkpoint_saver = CheckpointSaver(
|
||||
destination_name=destination.name,
|
||||
output_dir=output_dir,
|
||||
on_save=lambda path, count: print(
|
||||
f"Checkpoint saved: {count:,} results to {path}"
|
||||
),
|
||||
)
|
||||
|
||||
# Resume from checkpoint if one exists
|
||||
checkpoint_path = checkpoint_saver._checkpoint_path()
|
||||
prior_results: list[JourneyResult] = []
|
||||
if checkpoint_path.exists():
|
||||
checkpoint_df = pl.read_parquet(checkpoint_path)
|
||||
# Deduplicate checkpoint rows per postcode, preferring rows with data
|
||||
checkpoint_df = checkpoint_df.sort(
|
||||
"public_transport_quick_minutes", nulls_last=True
|
||||
).unique(subset=["postcode"], keep="first")
|
||||
completed_postcodes = set(checkpoint_df["postcode"].to_list())
|
||||
prior_results = [
|
||||
JourneyResult(
|
||||
postcode=row["postcode"],
|
||||
public_transport_easy_minutes=row["public_transport_easy_minutes"],
|
||||
public_transport_quick_minutes=row["public_transport_quick_minutes"],
|
||||
cycling_minutes=row["cycling_minutes"],
|
||||
error=row["error"],
|
||||
)
|
||||
for row in checkpoint_df.iter_rows(named=True)
|
||||
]
|
||||
checkpoint_saver.results = prior_results
|
||||
checkpoint_saver._last_save_count = len(prior_results)
|
||||
postcode_data = [
|
||||
(pc, lat, lon)
|
||||
for pc, lat, lon in postcode_data
|
||||
if pc not in completed_postcodes
|
||||
]
|
||||
print(
|
||||
f"Resumed from checkpoint: {len(prior_results):,} already done, "
|
||||
f"{len(postcode_data):,} remaining"
|
||||
)
|
||||
|
||||
def on_result(result):
|
||||
pbar.update(1)
|
||||
checkpoint_saver.add_result(result)
|
||||
|
||||
with tqdm(total=len(postcode_data), desc="Fetching journeys") as pbar:
|
||||
new_results = asyncio.run(
|
||||
fetch_journey_times(
|
||||
postcode_data,
|
||||
destination,
|
||||
journey_date.strftime("%Y%m%d"),
|
||||
journey_time,
|
||||
MAX_CONCURRENT,
|
||||
progress_callback=on_result,
|
||||
)
|
||||
)
|
||||
|
||||
all_results = prior_results + new_results
|
||||
results_df = results_to_dataframe(all_results)
|
||||
|
||||
all_postcodes = {r.postcode for r in all_results}
|
||||
coords_df = postcodes_df.filter(pl.col("postcode").is_in(all_postcodes)).select(
|
||||
["postcode", "lat", "long"]
|
||||
)
|
||||
results_df = coords_df.join(results_df, on="postcode", how="left")
|
||||
|
||||
results_df = results_df.with_columns(
|
||||
pl.lit(destination.name).alias("destination"),
|
||||
pl.lit(journey_date.strftime("%Y-%m-%d")).alias("journey_date"),
|
||||
pl.lit(f"{journey_time[:2]}:{journey_time[2:]}").alias("journey_time"),
|
||||
)
|
||||
|
||||
successful = results_df.filter(pl.col("cycling_minutes").is_not_null()).height
|
||||
print(f"Completed: {successful}/{len(all_results)} successful")
|
||||
|
||||
parquet_path = save_results(results_df, destination.name, output_dir)
|
||||
checkpoint_saver.cleanup_checkpoint()
|
||||
print(f"Saved to {parquet_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,23 +0,0 @@
|
|||
"""Configuration constants for journey times processing."""
|
||||
|
||||
from .models import Destination
|
||||
|
||||
MAX_DELAY = 10
|
||||
REQUESTS_PER_MIN = 500
|
||||
MAX_POSTCODES = None
|
||||
MAX_CONCURRENT = 80
|
||||
MAX_DISTANCE_KM = 110
|
||||
CHECKPOINT_INTERVAL = 10000
|
||||
|
||||
|
||||
DESTINATIONS = {
|
||||
"bank": Destination(51.5133, -0.0886, "Bank", "940GZZLUBNK"),
|
||||
"waterloo": Destination(51.5031, -0.1132, "Waterloo", "940GZZLUWLO"),
|
||||
"kings-cross": Destination(51.5308, -0.1238, "King's Cross", "940GZZLUKSX"),
|
||||
"liverpool-street": Destination(
|
||||
51.5178, -0.0823, "Liverpool Street", "940GZZLULVS"
|
||||
),
|
||||
"paddington": Destination(51.5154, -0.1755, "Paddington", "940GZZLUPAC"),
|
||||
"victoria": Destination(51.4965, -0.1447, "Victoria", "940GZZLUVIC"),
|
||||
"fitzrovia": Destination(51.5165, -0.1310, "Fitzrovia", "940GZZLUTCR"),
|
||||
}
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
"""Data models for journey times processing."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class Destination:
|
||||
"""A destination point for journey planning."""
|
||||
|
||||
lat: float
|
||||
lon: float
|
||||
name: str
|
||||
naptan_id: str | None = None
|
||||
|
||||
def to_tfl_location(self) -> str:
|
||||
"""Convert to TfL API location string."""
|
||||
if self.naptan_id:
|
||||
return self.naptan_id
|
||||
return f"{self.lat},{self.lon}"
|
||||
|
||||
|
||||
@dataclass
|
||||
class JourneyResult:
|
||||
"""Result of a journey time calculation for a postcode."""
|
||||
|
||||
postcode: str
|
||||
public_transport_easy_minutes: int | None = None
|
||||
cycling_minutes: int | None = None
|
||||
public_transport_quick_minutes: int | None = None
|
||||
error: str | None = None
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
"""Rate limiting for TfL API requests."""
|
||||
|
||||
import asyncio
|
||||
import warnings
|
||||
|
||||
from .config import REQUESTS_PER_MIN
|
||||
|
||||
|
||||
class RateLimiter:
|
||||
"""Rate limiter enforcing max requests per minute."""
|
||||
|
||||
def __init__(self):
|
||||
self.request_times: list[float] = []
|
||||
self._lock = asyncio.Lock()
|
||||
|
||||
async def acquire(self):
|
||||
"""Wait until we can make a request within rate limits."""
|
||||
async with self._lock:
|
||||
now = asyncio.get_event_loop().time()
|
||||
cutoff = now - 10.0 # 10 seconds
|
||||
self.request_times = [t for t in self.request_times if t > cutoff]
|
||||
|
||||
if (
|
||||
len(self.request_times) >= REQUESTS_PER_MIN // 6
|
||||
): # we look at it every 10 seconds instead of minutes
|
||||
wait_time = self.request_times[0] - cutoff
|
||||
if wait_time > 0:
|
||||
warnings.warn(
|
||||
f"Rate limit reached ({REQUESTS_PER_MIN}/min), "
|
||||
f"waiting {wait_time:.1f}s",
|
||||
stacklevel=1,
|
||||
)
|
||||
await asyncio.sleep(wait_time)
|
||||
|
||||
self.request_times.append(asyncio.get_event_loop().time())
|
||||
|
|
@ -1,82 +0,0 @@
|
|||
from pathlib import Path
|
||||
from typing import Callable
|
||||
|
||||
import polars as pl
|
||||
|
||||
from .config import CHECKPOINT_INTERVAL
|
||||
from .models import JourneyResult
|
||||
|
||||
|
||||
def results_to_dataframe(results: list[JourneyResult]) -> pl.DataFrame:
|
||||
return pl.DataFrame(
|
||||
[
|
||||
{
|
||||
"postcode": r.postcode,
|
||||
"public_transport_easy_minutes": r.public_transport_easy_minutes,
|
||||
"public_transport_quick_minutes": r.public_transport_quick_minutes,
|
||||
"cycling_minutes": r.cycling_minutes,
|
||||
"error": r.error,
|
||||
}
|
||||
for r in results
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
class CheckpointSaver:
|
||||
"""Collects results and saves checkpoints at regular intervals."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
destination_name: str,
|
||||
output_dir: Path,
|
||||
interval: int = CHECKPOINT_INTERVAL,
|
||||
on_save: Callable[[Path, int], None] | None = None,
|
||||
):
|
||||
self.destination_name = destination_name
|
||||
self.output_dir = output_dir
|
||||
self.interval = interval
|
||||
self.on_save = on_save
|
||||
self.results: list[JourneyResult] = []
|
||||
self._last_save_count = 0
|
||||
|
||||
def add_result(self, result: JourneyResult) -> None:
|
||||
"""Add a result and save checkpoint if interval is reached."""
|
||||
self.results.append(result)
|
||||
if len(self.results) - self._last_save_count >= self.interval:
|
||||
self.save_checkpoint()
|
||||
|
||||
def save_checkpoint(self) -> Path:
|
||||
"""Save current results to checkpoint file."""
|
||||
df = results_to_dataframe(self.results)
|
||||
path = self._checkpoint_path()
|
||||
df.write_parquet(path)
|
||||
self._last_save_count = len(self.results)
|
||||
if self.on_save:
|
||||
self.on_save(path, len(self.results))
|
||||
return path
|
||||
|
||||
def _checkpoint_path(self) -> Path:
|
||||
safe_name = self.destination_name.lower().replace(" ", "-")
|
||||
return self.output_dir / f"journey_times_{safe_name}_checkpoint.parquet"
|
||||
|
||||
def get_results(self) -> list[JourneyResult]:
|
||||
"""Return all collected results."""
|
||||
return self.results
|
||||
|
||||
def cleanup_checkpoint(self) -> None:
|
||||
"""Remove the checkpoint file after successful completion."""
|
||||
path = self._checkpoint_path()
|
||||
if path.exists():
|
||||
path.unlink()
|
||||
|
||||
|
||||
def save_results(
|
||||
results: pl.DataFrame,
|
||||
destination_name: str,
|
||||
output_dir: Path,
|
||||
) -> Path:
|
||||
safe_name = destination_name.lower().replace(" ", "-")
|
||||
parquet_path = output_dir / f"journey_times_{safe_name}.parquet"
|
||||
results.write_parquet(parquet_path)
|
||||
|
||||
return parquet_path
|
||||
|
|
@ -1,254 +0,0 @@
|
|||
import asyncio
|
||||
import os
|
||||
from typing import Literal
|
||||
import warnings
|
||||
from collections.abc import Callable
|
||||
from http import HTTPStatus
|
||||
|
||||
import httpx
|
||||
|
||||
from .config import MAX_DELAY
|
||||
from .models import Destination, JourneyResult
|
||||
from .rate_limiter import RateLimiter
|
||||
|
||||
|
||||
BASE_URL = "https://api.tfl.gov.uk"
|
||||
|
||||
|
||||
async def fetch_journey_for_mode(
|
||||
client: httpx.AsyncClient,
|
||||
rate_limiter: RateLimiter,
|
||||
from_location: str,
|
||||
to_location: str,
|
||||
journey_date: str,
|
||||
journey_time: str,
|
||||
journey_type: Literal["quick"] | Literal["easy"] | Literal["cycle"],
|
||||
retry_count: int = 5,
|
||||
) -> int | None:
|
||||
"""Fetch journey time for a specific mode with rate limiting."""
|
||||
backoff = 1.0
|
||||
for attempt in range(retry_count):
|
||||
try:
|
||||
await rate_limiter.acquire()
|
||||
|
||||
journey_preference = {
|
||||
"quick": "LeastTime",
|
||||
"easy": "LeastInterchange",
|
||||
"cycle": None,
|
||||
}[journey_type]
|
||||
|
||||
cycle_preference = {
|
||||
"quick": None,
|
||||
"easy": None,
|
||||
"cycle": "AllTheWay",
|
||||
}[journey_type]
|
||||
|
||||
# curl -s "https://api.tfl.gov.uk/Journey/Meta/Modes" | jq '.[].modeName'
|
||||
mode = {
|
||||
"quick": [
|
||||
"bus",
|
||||
"overground",
|
||||
"national-rail",
|
||||
"international-rail",
|
||||
"elizabeth-line",
|
||||
"tube",
|
||||
"coach",
|
||||
"dlr",
|
||||
"cable-car",
|
||||
"replacement-bus",
|
||||
"tram",
|
||||
"river-bus",
|
||||
"walking",
|
||||
"cycle",
|
||||
],
|
||||
"easy": [
|
||||
"bus",
|
||||
"overground",
|
||||
"national-rail",
|
||||
"international-rail",
|
||||
"elizabeth-line",
|
||||
"replacement-bus",
|
||||
"tube",
|
||||
"coach",
|
||||
"dlr",
|
||||
"cable-car",
|
||||
"tram",
|
||||
"river-bus",
|
||||
],
|
||||
"cycle": ["cycle"],
|
||||
}[journey_type]
|
||||
|
||||
params: dict = {
|
||||
"date": journey_date,
|
||||
"time": journey_time,
|
||||
"nationalSearch": "true",
|
||||
"timeIs": "Arriving",
|
||||
"cyclePreference": cycle_preference,
|
||||
"bikeProficiency": "Fast",
|
||||
"walkingOptimization": str(journey_type == "quick").lower(),
|
||||
"mode": ",".join(mode),
|
||||
}
|
||||
if journey_preference:
|
||||
params["journeyPreference"] = journey_preference
|
||||
|
||||
url = f"/Journey/JourneyResults/{from_location}/to/{to_location}"
|
||||
response = await client.get(url, params=params)
|
||||
|
||||
if response.status_code == HTTPStatus.OK:
|
||||
data = response.json()
|
||||
journeys = data.get("journeys", [])
|
||||
if journeys:
|
||||
durations = [
|
||||
j["duration"] for j in journeys if j.get("duration") is not None
|
||||
]
|
||||
if durations:
|
||||
return min(durations)
|
||||
return None
|
||||
elif response.status_code in (
|
||||
HTTPStatus.TOO_MANY_REQUESTS,
|
||||
HTTPStatus.INTERNAL_SERVER_ERROR,
|
||||
HTTPStatus.BAD_GATEWAY,
|
||||
HTTPStatus.SERVICE_UNAVAILABLE,
|
||||
HTTPStatus.GATEWAY_TIMEOUT,
|
||||
):
|
||||
warnings.warn(
|
||||
f"HTTP {response.status_code} for {journey_type} from {from_location}, "
|
||||
f"retrying in {backoff:.1f}s (attempt {attempt + 1}/{retry_count})",
|
||||
stacklevel=2,
|
||||
)
|
||||
await asyncio.sleep(backoff)
|
||||
backoff = min(backoff * 2, MAX_DELAY)
|
||||
continue
|
||||
else:
|
||||
return None
|
||||
except Exception as e:
|
||||
warnings.warn(
|
||||
f"Network error for {journey_type} from {from_location}: {e}, "
|
||||
f"retrying in {backoff:.1f}s (attempt {attempt + 1}/{retry_count})",
|
||||
stacklevel=2,
|
||||
)
|
||||
await asyncio.sleep(backoff)
|
||||
backoff = min(backoff * 2, MAX_DELAY)
|
||||
continue
|
||||
warnings.warn(
|
||||
f"Failed to fetch {journey_type} from {from_location} after {retry_count} attempts",
|
||||
stacklevel=2,
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
async def fetch_all_modes(
|
||||
client: httpx.AsyncClient,
|
||||
rate_limiter: RateLimiter,
|
||||
postcode: str,
|
||||
lat: float,
|
||||
lon: float,
|
||||
to_location: str,
|
||||
journey_date: str,
|
||||
journey_time: str,
|
||||
semaphore: asyncio.Semaphore,
|
||||
) -> JourneyResult:
|
||||
"""Fetch journey times for all transport modes using coordinates."""
|
||||
async with semaphore:
|
||||
try:
|
||||
from_location = f"{lat},{lon}"
|
||||
|
||||
easy = await fetch_journey_for_mode(
|
||||
client,
|
||||
rate_limiter,
|
||||
from_location,
|
||||
to_location,
|
||||
journey_date,
|
||||
journey_time,
|
||||
journey_type="easy",
|
||||
)
|
||||
quick = await fetch_journey_for_mode(
|
||||
client,
|
||||
rate_limiter,
|
||||
from_location,
|
||||
to_location,
|
||||
journey_date,
|
||||
journey_time,
|
||||
journey_type="quick",
|
||||
)
|
||||
cycling = await fetch_journey_for_mode(
|
||||
client,
|
||||
rate_limiter,
|
||||
from_location,
|
||||
to_location,
|
||||
journey_date,
|
||||
journey_time,
|
||||
journey_type="cycle",
|
||||
)
|
||||
|
||||
return JourneyResult(
|
||||
postcode=postcode,
|
||||
public_transport_easy_minutes=easy,
|
||||
public_transport_quick_minutes=quick,
|
||||
cycling_minutes=cycling,
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
return JourneyResult(postcode=postcode, error=str(e))
|
||||
|
||||
|
||||
async def fetch_journey_times(
|
||||
postcode_data: list[tuple[str, float, float]],
|
||||
dest: Destination,
|
||||
journey_date: str,
|
||||
journey_time: str,
|
||||
max_concurrent: int = 2,
|
||||
progress_callback: Callable[[JourneyResult], None] | None = None,
|
||||
) -> list[JourneyResult]:
|
||||
"""Fetch journey times for all postcodes with rate limiting.
|
||||
|
||||
Args:
|
||||
postcode_data: List of (postcode, lat, lon) tuples
|
||||
dest: Destination for journey planning
|
||||
journey_date: Date in YYYYMMDD format
|
||||
journey_time: Time in HHMM format
|
||||
max_concurrent: Maximum concurrent API requests
|
||||
progress_callback: Optional callback called with each result
|
||||
|
||||
Returns:
|
||||
List of JourneyResult objects in the same order as postcode_data
|
||||
"""
|
||||
semaphore = asyncio.Semaphore(max_concurrent)
|
||||
to_location = dest.to_tfl_location()
|
||||
rate_limiter = RateLimiter()
|
||||
|
||||
# TFL API authentication via app_key query parameter
|
||||
tfl_token = os.environ.get("TFL_TOKEN")
|
||||
if not tfl_token:
|
||||
raise RuntimeError("TFL_TOKEN environment variable not set")
|
||||
params = {"app_key": tfl_token}
|
||||
|
||||
async with httpx.AsyncClient(
|
||||
base_url=BASE_URL,
|
||||
params=params,
|
||||
timeout=httpx.Timeout(30),
|
||||
) as client:
|
||||
tasks = [
|
||||
fetch_all_modes(
|
||||
client,
|
||||
rate_limiter,
|
||||
pc,
|
||||
lat,
|
||||
lon,
|
||||
to_location,
|
||||
journey_date,
|
||||
journey_time,
|
||||
semaphore,
|
||||
)
|
||||
for pc, lat, lon in postcode_data
|
||||
]
|
||||
|
||||
results = []
|
||||
for coro in asyncio.as_completed(tasks):
|
||||
result = await coro
|
||||
results.append(result)
|
||||
if progress_callback:
|
||||
progress_callback(result)
|
||||
|
||||
postcode_to_result = {r.postcode: r for r in results}
|
||||
return [postcode_to_result[pc] for pc, _, _ in postcode_data]
|
||||
Loading…
Add table
Add a link
Reference in a new issue