Fix TfL scraping: resume from checkpoint and require TFL_TOKEN

This commit is contained in:
Andras Schmelczer 2026-02-01 08:49:40 +00:00
parent ac45af8514
commit cf7449e38b
2 changed files with 42 additions and 6 deletions

View file

@ -12,6 +12,7 @@ from .config import (
OUTPUT_DIR,
MAX_DISTANCE_KM,
)
from .models import JourneyResult
from .results import CheckpointSaver, results_to_dataframe, save_results
from .tfl_client import fetch_journey_times
from pipeline.utils import haversine_km_expr
@ -63,12 +64,44 @@ def main():
),
)
# NOTE(review): 25556/76273 — presumably the progress count of an earlier interrupted run; confirm or remove
# Resume from checkpoint if one exists
checkpoint_path = checkpoint_saver._checkpoint_path()
prior_results: list[JourneyResult] = []
if checkpoint_path.exists():
checkpoint_df = pl.read_parquet(checkpoint_path)
completed_postcodes = set(
checkpoint_df.filter(pl.col("public_transport_easy_minutes").is_not_null())["postcode"].to_list()
)
prior_results = [
JourneyResult(
postcode=row["postcode"],
public_transport_easy_minutes=row["public_transport_easy_minutes"],
public_transport_quick_minutes=row["public_transport_quick_minutes"],
cycling_minutes=row["cycling_minutes"],
error=row["error"],
)
for row in checkpoint_df.iter_rows(named=True)
]
checkpoint_saver.results = prior_results
checkpoint_saver._last_save_count = len(prior_results)
postcode_data = [
(pc, lat, lon)
for pc, lat, lon in postcode_data
if pc not in completed_postcodes
]
print(
f"Resumed from checkpoint: {len(prior_results):,} already done, "
f"{len(postcode_data):,} remaining"
)
def on_result(result):
pbar.update(1)
checkpoint_saver.add_result(result)
with tqdm(total=len(postcode_data), desc="Fetching journeys") as pbar:
results = asyncio.run(
new_results = asyncio.run(
fetch_journey_times(
postcode_data,
destination,
@ -79,11 +112,12 @@ def main():
)
)
results_df = results_to_dataframe(results)
all_results = prior_results + new_results
results_df = results_to_dataframe(all_results)
postcodes_processed = [pc for pc, _, _ in postcode_data]
all_postcodes = {r.postcode for r in all_results}
coords_df = postcodes_df.filter(
pl.col("postcode").is_in(postcodes_processed)
pl.col("postcode").is_in(all_postcodes)
).select(["postcode", "lat", "long"])
results_df = coords_df.join(results_df, on="postcode", how="left")
@ -94,7 +128,7 @@ def main():
)
successful = results_df.filter(pl.col("cycling_minutes").is_not_null()).height
print(f"Completed: {successful}/{len(results)} successful")
print(f"Completed: {successful}/{len(all_results)} successful")
parquet_path = save_results(results_df, destination.name)
checkpoint_saver.cleanup_checkpoint()

View file

@ -219,7 +219,9 @@ async def fetch_journey_times(
# TFL API authentication via app_key query parameter
tfl_token = os.environ.get("TFL_TOKEN")
params = {"app_key": tfl_token} if tfl_token else {}
if not tfl_token:
raise RuntimeError("TFL_TOKEN environment variable not set")
params = {"app_key": tfl_token}
async with httpx.AsyncClient(
base_url=BASE_URL,