Fix checkpoint postcode duplication

This commit is contained in:
Andras Schmelczer 2026-02-01 13:10:56 +00:00
parent 627ba5496a
commit ccb7c8fbd7

View file

@ -71,9 +71,12 @@ def main():
prior_results: list[JourneyResult] = []
if checkpoint_path.exists():
checkpoint_df = pl.read_parquet(checkpoint_path)
completed_postcodes = set(
checkpoint_df.filter(pl.col("public_transport_easy_minutes").is_not_null())["postcode"].to_list()
# Deduplicate checkpoint rows per postcode, preferring rows with data
checkpoint_df = (
checkpoint_df.sort("public_transport_quick_minutes", nulls_last=True)
.unique(subset=["postcode"], keep="first")
)
completed_postcodes = set(checkpoint_df["postcode"].to_list())
prior_results = [
JourneyResult(
postcode=row["postcode"],