perfect-postcode/pipeline/utils/test_fuzzy_join.py
2026-01-31 13:07:09 +00:00

46 lines
1.2 KiB
Python

import polars as pl
from pipeline.utils import fuzzy_join_on_postcode
POSTCODE = "E14 2DG"

# Price paid: the distinct (paon, saon, street) addresses recorded for this
# postcode, plus one space-joined string per address to fuzzy-match on.
_pp_address = pl.concat_str(
    [pl.col(part) for part in ("saon", "paon", "street")],
    separator=" ",
    # Skip missing components instead of letting one null blank the address.
    ignore_nulls=True,
)
_pp_unique = (
    pl.scan_parquet("data/price-paid-complete.parquet")
    .filter(pl.col("postcode") == POSTCODE)
    .select("paon", "saon", "street", "postcode")
    .unique()
)
# Still lazy: nothing is read until the joined frame is collected.
pp = _pp_unique.sort("saon").with_columns(pp_address=_pp_address)
# EPC: latest inspection per address for this postcode.
# NOTE(review): the filter compares a stripped POSTCODE, but the column itself
# is passed on unstripped -- confirm fuzzy_join_on_postcode tolerates padding.
epc = (
    pl.scan_csv("data/epc/certificates.csv")
    .select("ADDRESS", "POSTCODE", "INSPECTION_DATE")
    .filter(pl.col("POSTCODE").str.strip_chars() == POSTCODE)
    # Newest first; rows with a missing date go last so they never win.
    .sort("INSPECTION_DATE", descending=True, nulls_last=True)
    # The default keep="any" gives no guarantee about which duplicate
    # survives; keep="first" on the date-sorted frame retains the latest
    # inspection for each address.
    .unique("ADDRESS", keep="first", maintain_order=True)
    .sort("ADDRESS")
)
# Match each price-paid address against the EPC addresses for the postcode.
# The helper's return value is collected below, so it is presumably lazy.
_matches = fuzzy_join_on_postcode(
    left=pp,
    right=epc,
    left_address_col="pp_address",
    right_address_col="ADDRESS",
    left_postcode_col="postcode",
    right_postcode_col="POSTCODE",
)
result = _matches.collect()

# Keep only the two address columns, ordered so the printed pairs are easy
# to compare between runs.
_address_pairs = result.select("pp_address", "ADDRESS")
snapshot = _address_pairs.sort("pp_address")

print("Testing the matching between EPC and PP addresses")
# Negative limits print every row and column; long addresses get 80 chars.
with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=80):
    print(snapshot)