Update H3 pipeline

This commit is contained in:
Andras Schmelczer 2026-01-30 18:33:48 +00:00
parent 68b6dcf65e
commit 6122ee44da
13 changed files with 291 additions and 420 deletions

View file

@@ -9,7 +9,6 @@ pp = (
pl.scan_parquet("data_sources/pp-complete.parquet")
.filter(pl.col("postcode") == POSTCODE)
.select("paon", "saon", "street", "postcode")
.collect()
.unique()
.sort("saon")
.with_columns(
@@ -27,14 +26,10 @@ epc = (
.select("ADDRESS", "POSTCODE", "INSPECTION_DATE")
.filter(pl.col("POSTCODE").str.strip_chars() == POSTCODE)
.sort("INSPECTION_DATE", descending=True)
.collect()
.unique("ADDRESS")
.sort("ADDRESS")
)
print(f"Price paid: {len(pp)} unique addresses")
print(f"EPC: {len(epc)} unique addresses")
result = fuzzy_join_on_postcode(
left=pp,
right=epc,
@@ -42,9 +37,7 @@ result = fuzzy_join_on_postcode(
right_address_col="ADDRESS",
left_postcode_col="postcode",
right_postcode_col="POSTCODE",
score_threshold=80,
)
).collect()
snapshot = result.select("pp_address", "ADDRESS").sort("pp_address")