This commit is contained in:
Andras Schmelczer 2026-05-12 22:30:36 +01:00
parent 81a16f543c
commit 63713c3a2b
15 changed files with 492 additions and 159 deletions

View file

@ -2,45 +2,72 @@ import polars as pl
from pipeline.utils import fuzzy_join_on_postcode
POSTCODE = "E14 2DG"
# Price paid: unique addresses for this postcode
pp = (
pl.scan_parquet("data/price-paid-complete.parquet")
.filter(pl.col("postcode") == POSTCODE)
.select("paon", "saon", "street", "postcode")
.unique()
.sort("saon")
.with_columns(
pl.concat_str(
[pl.col("saon"), pl.col("paon"), pl.col("street")],
separator=" ",
ignore_nulls=True,
).alias("pp_address"),
def test_fuzzy_join_on_postcode_matches_addresses_within_postcode():
left = pl.LazyFrame(
{
"left_id": ["flat", "house", "unmatched"],
"left_address": [
"Flat 2, 10 High Street",
"12 High Street",
"99 Other Road",
],
"left_postcode": ["AB1 2CD", "AB1 2CD", "AB1 2CD"],
}
)
right = pl.LazyFrame(
{
"right_id": ["flat_epc", "house_epc", "other_postcode"],
"right_address": [
"10 HIGH STREET FLAT 2",
"12 High-Street",
"99 Other Road",
],
"right_postcode": [" AB1 2CD ", "AB1 2CD", "ZZ9 9ZZ"],
}
)
)
# EPC: latest inspection per address for this postcode
epc = (
pl.scan_csv("data/epc/certificates.csv")
.select("ADDRESS", "POSTCODE", "INSPECTION_DATE")
.filter(pl.col("POSTCODE").str.strip_chars() == POSTCODE)
.sort("INSPECTION_DATE", descending=True)
.unique("ADDRESS")
.sort("ADDRESS")
)
result = (
fuzzy_join_on_postcode(
left=left,
right=right,
left_address_col="left_address",
right_address_col="right_address",
left_postcode_col="left_postcode",
right_postcode_col="right_postcode",
)
.sort("left_id")
.collect()
)
result = fuzzy_join_on_postcode(
left=pp,
right=epc,
left_address_col="pp_address",
right_address_col="ADDRESS",
left_postcode_col="postcode",
right_postcode_col="POSTCODE",
).collect()
assert result.select("left_id", "right_id").to_dicts() == [
{"left_id": "flat", "right_id": "flat_epc"},
{"left_id": "house", "right_id": "house_epc"},
{"left_id": "unmatched", "right_id": None},
]
snapshot = result.select("pp_address", "ADDRESS").sort("pp_address")
print("Testing the matching between EPC and PP addresses")
with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=80):
print(snapshot)
def test_fuzzy_join_on_postcode_requires_matching_numbers():
    """Check that differing house numbers block a match within one postcode.

    Both frames hold a single "High Street" address in postcode "AB1 2CD",
    but the left one is number 10 and the right one is number 11. The join
    is expected to yield one row whose ``right_address`` is null.
    """
    shared_postcode = "AB1 2CD"

    number_ten = pl.LazyFrame(
        {
            "left_address": ["10 High Street"],
            "left_postcode": [shared_postcode],
        }
    )
    number_eleven = pl.LazyFrame(
        {
            "right_address": ["11 High Street"],
            "right_postcode": [shared_postcode],
        }
    )

    joined = fuzzy_join_on_postcode(
        left=number_ten,
        right=number_eleven,
        left_address_col="left_address",
        right_address_col="right_address",
        left_postcode_col="left_postcode",
        right_postcode_col="right_postcode",
    )
    matched_addresses = joined.collect()["right_address"].to_list()

    # A single null entry means exactly one output row with no right-hand
    # match.
    assert matched_addresses == [None]