diff --git a/pipeline/journey_times/__main__.py b/pipeline/journey_times/__main__.py index cde9480..6297f09 100644 --- a/pipeline/journey_times/__main__.py +++ b/pipeline/journey_times/__main__.py @@ -71,9 +71,12 @@ def main(): prior_results: list[JourneyResult] = [] if checkpoint_path.exists(): checkpoint_df = pl.read_parquet(checkpoint_path) - completed_postcodes = set( - checkpoint_df.filter(pl.col("public_transport_easy_minutes").is_not_null())["postcode"].to_list() + # Deduplicate checkpoint rows per postcode, preferring rows with data + checkpoint_df = ( + checkpoint_df.sort("public_transport_quick_minutes", nulls_last=True) + .unique(subset=["postcode"], keep="first") ) + completed_postcodes = set(checkpoint_df["postcode"].to_list()) prior_results = [ JourneyResult( postcode=row["postcode"],