perfect-postcode/pipeline/journey_times/__main__.py

77 lines
2.4 KiB
Python

import asyncio
import random
from datetime import date, timedelta
import polars as pl
from tqdm import tqdm
from .config import DESTINATIONS, MAX_CONCURRENT, MAX_POSTCODES, OUTPUT_DIR
from .results import results_to_dataframe, save_results
from .tfl_client import fetch_journey_times
def main() -> None:
    """Fetch journey times from postcodes to the "bank" destination and save them.

    Steps:
      1. Pick the journey slot: the next Monday (strictly after today) at 08:45.
      2. Load postcode coordinates from ``postcodes_h3.parquet`` in ``OUTPUT_DIR``,
         randomly down-sampling to ``MAX_POSTCODES`` when that limit is set.
      3. Fetch journey times via ``fetch_journey_times`` (at most
         ``MAX_CONCURRENT`` is passed through as the concurrency setting),
         advancing a progress bar from the fetcher's progress callback.
      4. Left-join the results onto the processed postcodes' coordinates, tag
         each row with destination/date/time, print a summary and save.
    """
    destination = DESTINATIONS["bank"]

    # Journey slot: next Monday at 08:45.
    journey_date = _next_monday(date.today())
    journey_time = "0845"  # compact form handed to the fetcher
    journey_time_display = f"{journey_time[:2]}:{journey_time[2:]}"

    print(f"Destination: {destination.name}")
    print(
        f"Journey: {journey_date.strftime('%A %Y-%m-%d')} "
        f"at {journey_time_display}"
    )

    postcodes_df = pl.read_parquet(OUTPUT_DIR / "postcodes_h3.parquet")
    print(f"Loaded {postcodes_df.height:,} postcodes")

    # One (postcode, lat, long) tuple per input row.
    postcode_data = list(
        zip(
            postcodes_df["postcode"].to_list(),
            postcodes_df["lat"].to_list(),
            postcodes_df["long"].to_list(),
        )
    )

    if MAX_POSTCODES is not None and len(postcode_data) > MAX_POSTCODES:
        postcode_data = random.sample(postcode_data, MAX_POSTCODES)
        print(f"Randomly sampled {MAX_POSTCODES} postcodes")

    with tqdm(total=len(postcode_data), desc="Fetching journeys") as pbar:
        results = asyncio.run(
            fetch_journey_times(
                postcode_data,
                destination,
                journey_date.strftime("%Y%m%d"),
                journey_time,
                MAX_CONCURRENT,
                # The callback's argument is ignored; each call is one tick.
                progress_callback=lambda _: pbar.update(1),
            )
        )

    results_df = results_to_dataframe(results)

    # Start from the coordinates of every processed postcode and left-join the
    # results, so postcodes without a result row are kept (with null values).
    postcodes_processed = [postcode for postcode, _, _ in postcode_data]
    coords_df = postcodes_df.filter(
        pl.col("postcode").is_in(postcodes_processed)
    ).select(["postcode", "lat", "long"])
    results_df = coords_df.join(results_df, on="postcode", how="left")

    results_df = results_df.with_columns(
        pl.lit(destination.name).alias("destination"),
        pl.lit(journey_date.strftime("%Y-%m-%d")).alias("journey_date"),
        pl.lit(journey_time_display).alias("journey_time"),
    )

    # A row counts as successful when its "cycling" value is present. The
    # total is taken from the same joined frame so that postcodes with no
    # result row still count as attempted.
    successful = results_df.filter(pl.col("cycling").is_not_null()).height
    print(f"Completed: {successful}/{results_df.height} successful")

    parquet_path = save_results(results_df, destination.name)
    print(f"Saved to {parquet_path}")


def _next_monday(today: date) -> date:
    """Return the first Monday strictly after *today*."""
    # weekday() is 0 on Monday; the modulo yields 0 in that case, and
    # ``or 7`` pushes a Monday on to the following week's Monday.
    days_until_monday = (7 - today.weekday()) % 7 or 7
    return today + timedelta(days=days_until_monday)
# Run the pipeline only when this module is executed directly, not on import.
if __name__ == "__main__":
    main()