Update H3 pipeline
This commit is contained in:
parent
68b6dcf65e
commit
6122ee44da
13 changed files with 291 additions and 420 deletions
|
|
@ -9,7 +9,6 @@ pp = (
|
|||
pl.scan_parquet("data_sources/pp-complete.parquet")
|
||||
.filter(pl.col("postcode") == POSTCODE)
|
||||
.select("paon", "saon", "street", "postcode")
|
||||
.collect()
|
||||
.unique()
|
||||
.sort("saon")
|
||||
.with_columns(
|
||||
|
|
@ -27,14 +26,10 @@ epc = (
|
|||
.select("ADDRESS", "POSTCODE", "INSPECTION_DATE")
|
||||
.filter(pl.col("POSTCODE").str.strip_chars() == POSTCODE)
|
||||
.sort("INSPECTION_DATE", descending=True)
|
||||
.collect()
|
||||
.unique("ADDRESS")
|
||||
.sort("ADDRESS")
|
||||
)
|
||||
|
||||
print(f"Price paid: {len(pp)} unique addresses")
|
||||
print(f"EPC: {len(epc)} unique addresses")
|
||||
|
||||
result = fuzzy_join_on_postcode(
|
||||
left=pp,
|
||||
right=epc,
|
||||
|
|
@ -42,9 +37,7 @@ result = fuzzy_join_on_postcode(
|
|||
right_address_col="ADDRESS",
|
||||
left_postcode_col="postcode",
|
||||
right_postcode_col="POSTCODE",
|
||||
score_threshold=80,
|
||||
|
||||
)
|
||||
).collect()
|
||||
|
||||
snapshot = result.select("pp_address", "ADDRESS").sort("pp_address")
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue