good
This commit is contained in:
parent
81a16f543c
commit
63713c3a2b
15 changed files with 492 additions and 159 deletions
|
|
@ -2,45 +2,72 @@ import polars as pl
|
|||
|
||||
from pipeline.utils import fuzzy_join_on_postcode
|
||||
|
||||
POSTCODE = "E14 2DG"
|
||||
|
||||
# Price paid: unique addresses for this postcode
|
||||
pp = (
|
||||
pl.scan_parquet("data/price-paid-complete.parquet")
|
||||
.filter(pl.col("postcode") == POSTCODE)
|
||||
.select("paon", "saon", "street", "postcode")
|
||||
.unique()
|
||||
.sort("saon")
|
||||
.with_columns(
|
||||
pl.concat_str(
|
||||
[pl.col("saon"), pl.col("paon"), pl.col("street")],
|
||||
separator=" ",
|
||||
ignore_nulls=True,
|
||||
).alias("pp_address"),
|
||||
def test_fuzzy_join_on_postcode_matches_addresses_within_postcode():
|
||||
left = pl.LazyFrame(
|
||||
{
|
||||
"left_id": ["flat", "house", "unmatched"],
|
||||
"left_address": [
|
||||
"Flat 2, 10 High Street",
|
||||
"12 High Street",
|
||||
"99 Other Road",
|
||||
],
|
||||
"left_postcode": ["AB1 2CD", "AB1 2CD", "AB1 2CD"],
|
||||
}
|
||||
)
|
||||
right = pl.LazyFrame(
|
||||
{
|
||||
"right_id": ["flat_epc", "house_epc", "other_postcode"],
|
||||
"right_address": [
|
||||
"10 HIGH STREET FLAT 2",
|
||||
"12 High-Street",
|
||||
"99 Other Road",
|
||||
],
|
||||
"right_postcode": [" AB1 2CD ", "AB1 2CD", "ZZ9 9ZZ"],
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
# EPC: latest inspection per address for this postcode
|
||||
epc = (
|
||||
pl.scan_csv("data/epc/certificates.csv")
|
||||
.select("ADDRESS", "POSTCODE", "INSPECTION_DATE")
|
||||
.filter(pl.col("POSTCODE").str.strip_chars() == POSTCODE)
|
||||
.sort("INSPECTION_DATE", descending=True)
|
||||
.unique("ADDRESS")
|
||||
.sort("ADDRESS")
|
||||
)
|
||||
result = (
|
||||
fuzzy_join_on_postcode(
|
||||
left=left,
|
||||
right=right,
|
||||
left_address_col="left_address",
|
||||
right_address_col="right_address",
|
||||
left_postcode_col="left_postcode",
|
||||
right_postcode_col="right_postcode",
|
||||
)
|
||||
.sort("left_id")
|
||||
.collect()
|
||||
)
|
||||
|
||||
result = fuzzy_join_on_postcode(
|
||||
left=pp,
|
||||
right=epc,
|
||||
left_address_col="pp_address",
|
||||
right_address_col="ADDRESS",
|
||||
left_postcode_col="postcode",
|
||||
right_postcode_col="POSTCODE",
|
||||
).collect()
|
||||
assert result.select("left_id", "right_id").to_dicts() == [
|
||||
{"left_id": "flat", "right_id": "flat_epc"},
|
||||
{"left_id": "house", "right_id": "house_epc"},
|
||||
{"left_id": "unmatched", "right_id": None},
|
||||
]
|
||||
|
||||
snapshot = result.select("pp_address", "ADDRESS").sort("pp_address")
|
||||
|
||||
print("Testing the matching between EPC and PP addresses")
|
||||
with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=80):
|
||||
print(snapshot)
|
||||
def test_fuzzy_join_on_postcode_requires_matching_numbers():
    """Check that addresses differing only in their number are not paired.

    Both sides share the same postcode and street, but the left row is
    number 10 and the right row is number 11, so the joined row is expected
    to carry no right-hand address.
    """
    shared_postcode = "AB1 2CD"

    left_frame = pl.LazyFrame(
        {
            "left_address": ["10 High Street"],
            "left_postcode": [shared_postcode],
        }
    )
    right_frame = pl.LazyFrame(
        {
            "right_address": ["11 High Street"],
            "right_postcode": [shared_postcode],
        }
    )

    # Column-name wiring kept in one place so the call below stays short.
    column_names = {
        "left_address_col": "left_address",
        "right_address_col": "right_address",
        "left_postcode_col": "left_postcode",
        "right_postcode_col": "right_postcode",
    }

    joined = fuzzy_join_on_postcode(
        left=left_frame,
        right=right_frame,
        **column_names,
    )
    result = joined.collect()

    matched_right_addresses = result["right_address"].to_list()
    assert matched_right_addresses == [None]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue