"""Fetch journey times from postcodes to a configured destination.

Loads postcodes from ``postcodes_h3.parquet``, keeps those within
``MAX_DISTANCE_KM`` of the destination, queries journey times concurrently
and writes the joined results via ``save_results``.
"""

import asyncio
import math
import random
from datetime import date, timedelta

import polars as pl
from tqdm import tqdm

from .config import DESTINATIONS, MAX_CONCURRENT, MAX_POSTCODES, OUTPUT_DIR, MAX_DISTANCE_KM
from .results import CheckpointSaver, results_to_dataframe, save_results
from .tfl_client import fetch_journey_times

# Mean Earth radius used by the Haversine great-circle distance.
_EARTH_RADIUS_KM = 6371
_DEG_TO_RAD = math.pi / 180


def _next_monday(today: date) -> date:
    """Return the first Monday strictly after *today*.

    When *today* is itself a Monday the result is one week later, never
    *today*.
    """
    days_ahead = (7 - today.weekday()) % 7 or 7
    return today + timedelta(days=days_ahead)


def _haversine_km(dest_lat: float, dest_lon: float) -> pl.Expr:
    """Build an expression for the great-circle distance to a fixed point.

    Args:
        dest_lat: Latitude of the fixed point, in degrees.
        dest_lon: Longitude of the fixed point, in degrees.

    Returns:
        A polars expression that reads the ``lat`` and ``long`` columns
        (degrees) and yields the Haversine distance in kilometres.
    """
    dest_lat_rad = math.radians(dest_lat)
    dest_lon_rad = math.radians(dest_lon)
    lat_rad = pl.col("lat") * _DEG_TO_RAD
    lon_rad = pl.col("long") * _DEG_TO_RAD

    # Haversine "a" term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
    half_chord_sq = (
        ((pl.lit(dest_lat_rad) - lat_rad) / 2).sin() ** 2
        + pl.lit(math.cos(dest_lat_rad))
        * lat_rad.cos()
        * ((pl.lit(dest_lon_rad) - lon_rad) / 2).sin() ** 2
    )
    return 2 * _EARTH_RADIUS_KM * half_chord_sq.sqrt().arcsin()


def main() -> None:
    """Run the journey-time fetch for the ``"bank"`` destination.

    Steps: pick the journey date/time, load and distance-filter postcodes,
    optionally down-sample to ``MAX_POSTCODES``, fetch journey times with a
    progress bar and checkpointing, then join the results back onto the
    postcode coordinates and save them.
    """
    destination = DESTINATIONS["bank"]

    # Journey is queried for next Monday with the time string "0845" (HHMM).
    journey_date = _next_monday(date.today())
    journey_time = "0845"
    journey_time_label = f"{journey_time[:2]}:{journey_time[2:]}"

    print(f"Destination: {destination.name}")
    print(f"Journey: {journey_date.strftime('%A %Y-%m-%d')} at {journey_time_label}")

    postcodes_df = pl.read_parquet(OUTPUT_DIR / "postcodes_h3.parquet")
    print(f"Loaded {postcodes_df.height:,} postcodes")

    # Keep only postcodes within MAX_DISTANCE_KM of the destination.
    postcodes_df = postcodes_df.with_columns(
        _haversine_km(destination.lat, destination.lon).alias("distance_km")
    ).filter(pl.col("distance_km") <= MAX_DISTANCE_KM)
    print(f"Filtered to {postcodes_df.height:,} postcodes within {MAX_DISTANCE_KM}km")

    postcode_data = list(
        zip(
            postcodes_df["postcode"].to_list(),
            postcodes_df["lat"].to_list(),
            postcodes_df["long"].to_list(),
        )
    )

    if MAX_POSTCODES is not None and len(postcode_data) > MAX_POSTCODES:
        postcode_data = random.sample(postcode_data, MAX_POSTCODES)
        print(f"Randomly sampled {MAX_POSTCODES} postcodes")

    checkpoint_saver = CheckpointSaver(
        destination_name=destination.name,
        on_save=lambda path, count: print(
            f"Checkpoint saved: {count:,} results to {path}"
        ),
    )

    with tqdm(total=len(postcode_data), desc="Fetching journeys") as pbar:

        # Defined inside the ``with`` so ``pbar`` is already bound when the
        # callback is created, rather than relying on late binding.
        def on_result(result):
            pbar.update(1)
            checkpoint_saver.add_result(result)

        results = asyncio.run(
            fetch_journey_times(
                postcode_data,
                destination,
                journey_date.strftime("%Y%m%d"),
                journey_time,
                MAX_CONCURRENT,
                progress_callback=on_result,
            )
        )

    results_df = results_to_dataframe(results)

    # Left-join onto the processed postcodes so every queried postcode has a
    # row, including any that the results frame does not contain.
    postcodes_processed = [pc for pc, _, _ in postcode_data]
    coords_df = postcodes_df.filter(
        pl.col("postcode").is_in(postcodes_processed)
    ).select(["postcode", "lat", "long"])
    results_df = coords_df.join(results_df, on="postcode", how="left")

    results_df = results_df.with_columns(
        pl.lit(destination.name).alias("destination"),
        pl.lit(journey_date.strftime("%Y-%m-%d")).alias("journey_date"),
        pl.lit(journey_time_label).alias("journey_time"),
    )

    successful = results_df.filter(pl.col("cycling_minutes").is_not_null()).height
    print(f"Completed: {successful}/{len(results)} successful")

    parquet_path = save_results(results_df, destination.name)
    # The checkpoint is removed only after the final results are saved.
    checkpoint_saver.cleanup_checkpoint()
    print(f"Saved to {parquet_path}")


if __name__ == "__main__":
    main()