import asyncio
import random
from datetime import date, timedelta

import polars as pl
from tqdm import tqdm

from .config import (
    DESTINATIONS,
    MAX_CONCURRENT,
    MAX_POSTCODES,
    OUTPUT_DIR,
    MAX_DISTANCE_KM,
)
from .results import CheckpointSaver, results_to_dataframe, save_results
from .tfl_client import fetch_journey_times
from pipeline.utils import haversine_km_expr


def _next_monday(today: date) -> date:
    """Return the first Monday strictly after *today*.

    If *today* is itself a Monday the result is one week later, so the
    returned date is always in the future.
    """
    # weekday() is 0 for Monday; ``or 7`` turns a zero offset into a full week.
    days_until_monday = (7 - today.weekday()) % 7 or 7
    return today + timedelta(days=days_until_monday)


def main():
    """Fetch journey results for in-range postcodes and save them.

    Steps:
      1. Pick the "bank" destination and a journey slot of next Monday, 08:45.
      2. Load postcodes from ``postcodes_h3.parquet`` under ``OUTPUT_DIR`` and
         keep those within ``MAX_DISTANCE_KM`` of the destination.
      3. Optionally down-sample to ``MAX_POSTCODES`` random postcodes.
      4. Run ``fetch_journey_times`` with ``MAX_CONCURRENT``, feeding each
         result to the progress bar and the checkpoint saver.
      5. Join the results back onto the postcode coordinates, tag them with
         the destination and journey slot, save, and remove the checkpoint.
    """
    destination = DESTINATIONS["bank"]

    # Journey slot: next Monday at 08:45.
    journey_date = _next_monday(date.today())
    journey_time = "0845"
    # HHMM -> HH:MM, used for both console output and the output column.
    journey_time_display = f"{journey_time[:2]}:{journey_time[2:]}"

    print(f"Destination: {destination.name}")
    print(
        f"Journey: {journey_date.strftime('%A %Y-%m-%d')} "
        f"at {journey_time_display}"
    )

    postcodes_df = pl.read_parquet(OUTPUT_DIR / "postcodes_h3.parquet")
    print(f"Loaded {postcodes_df.height:,} postcodes")

    # Filter to postcodes within range of destination
    postcodes_df = postcodes_df.with_columns(
        haversine_km_expr("lat", "long", destination.lat, destination.lon).alias(
            "distance_km"
        )
    ).filter(pl.col("distance_km") <= MAX_DISTANCE_KM)
    print(f"Filtered to {postcodes_df.height:,} postcodes within {MAX_DISTANCE_KM}km")

    # One (postcode, lat, long) tuple per remaining row.
    postcode_data = list(
        zip(
            postcodes_df["postcode"].to_list(),
            postcodes_df["lat"].to_list(),
            postcodes_df["long"].to_list(),
        )
    )

    if MAX_POSTCODES is not None and len(postcode_data) > MAX_POSTCODES:
        postcode_data = random.sample(postcode_data, MAX_POSTCODES)
        print(f"Randomly sampled {MAX_POSTCODES} postcodes")

    checkpoint_saver = CheckpointSaver(
        destination_name=destination.name,
        on_save=lambda path, count: print(
            f"Checkpoint saved: {count:,} results to {path}"
        ),
    )

    with tqdm(total=len(postcode_data), desc="Fetching journeys") as pbar:

        # Defined inside the ``with`` so ``pbar`` is already bound when the
        # callback is created, not merely by the time it is first called.
        def on_result(result):
            pbar.update(1)
            checkpoint_saver.add_result(result)

        results = asyncio.run(
            fetch_journey_times(
                postcode_data,
                destination,
                journey_date.strftime("%Y%m%d"),
                journey_time,
                MAX_CONCURRENT,
                progress_callback=on_result,
            )
        )

    results_df = results_to_dataframe(results)

    # Left-join from the coordinates so every processed postcode keeps a row,
    # with nulls where results_df has no matching postcode.
    postcodes_processed = [pc for pc, _, _ in postcode_data]
    coords_df = postcodes_df.filter(
        pl.col("postcode").is_in(postcodes_processed)
    ).select(["postcode", "lat", "long"])

    results_df = coords_df.join(results_df, on="postcode", how="left")
    results_df = results_df.with_columns(
        pl.lit(destination.name).alias("destination"),
        pl.lit(journey_date.strftime("%Y-%m-%d")).alias("journey_date"),
        pl.lit(journey_time_display).alias("journey_time"),
    )

    # A row counts as successful when its "cycling_minutes" value is not null.
    successful = results_df.filter(pl.col("cycling_minutes").is_not_null()).height
    print(f"Completed: {successful}/{len(results)} successful")

    parquet_path = save_results(results_df, destination.name)
    # Remove the checkpoint only after the final results have been written.
    checkpoint_saver.cleanup_checkpoint()
    print(f"Saved to {parquet_path}")


if __name__ == "__main__":
    main()