perfect-postcode/pipeline/test_fuzzy_join.py
2026-01-30 14:44:48 +00:00

52 lines
1.3 KiB
Python

import polars as pl
from fuzzy_join import fuzzy_join_on_postcode
POSTCODE = "E14 2DG"

# Expression that joins the address parts into one space-separated string;
# null parts are skipped rather than nulling the whole result.
_pp_address = pl.concat_str(
    [pl.col(part) for part in ("saon", "paon", "street")],
    separator=" ",
    ignore_nulls=True,
).alias("pp_address")

# Price paid: the distinct (paon, saon, street, postcode) rows recorded for
# POSTCODE, ordered by "saon", with the combined "pp_address" column added.
pp = (
    pl.scan_parquet("data_sources/pp-complete.parquet")
    .filter(pl.col("postcode") == POSTCODE)
    .select(["paon", "saon", "street", "postcode"])
    .collect()
    # De-duplication and ordering happen eagerly, after materialising.
    .unique()
    .sort(by="saon")
    .with_columns(_pp_address)
)
# EPC: one row per address for this postcode, keeping the most recent
# inspection when an address has several certificates.
# NOTE(review): assumes INSPECTION_DATE sorts chronologically as read from the
# CSV (e.g. ISO "YYYY-MM-DD" strings) -- confirm against the data.
epc = (
    pl.scan_csv("data_sources/epc/certificates.csv")
    .select("ADDRESS", "POSTCODE", "INSPECTION_DATE")
    # Strip surrounding whitespace from the postcode before comparing.
    .filter(pl.col("POSTCODE").str.strip_chars() == POSTCODE)
    # Newest first, so the first row seen for each address is its latest.
    .sort("INSPECTION_DATE", descending=True)
    .collect()
    # unique() defaults to keep="any" without preserving order, which may
    # retain an arbitrary certificate; keep the first (newest) row explicitly.
    .unique(subset="ADDRESS", keep="first", maintain_order=True)
    .sort("ADDRESS")
)
# Report how many distinct addresses each side contributes to the join.
print(f"Price paid: {len(pp)} unique addresses")
print(f"EPC: {len(epc)} unique addresses")

# Names of the address / postcode columns on each side of the join.
_join_columns = {
    "left_address_col": "pp_address",
    "right_address_col": "ADDRESS",
    "left_postcode_col": "postcode",
    "right_postcode_col": "POSTCODE",
}

# NOTE(review): score_threshold is presumably the minimum match score a pair
# needs to be kept -- confirm against fuzzy_join.fuzzy_join_on_postcode.
result = fuzzy_join_on_postcode(
    left=pp,
    right=epc,
    score_threshold=80,
    **_join_columns,
)

# Only the matched address pair is shown, ordered by the price-paid address.
snapshot = result.select(["pp_address", "ADDRESS"]).sort(by="pp_address")

# Temporarily lift the row/column display limits (-1) and widen string cells
# to 80 characters so the whole snapshot is printed.
with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=80):
    print(snapshot)