46 lines
1.2 KiB
Python
46 lines
1.2 KiB
Python
import polars as pl
|
|
|
|
from pipeline.utils import fuzzy_join_on_postcode
|
|
|
|
# Postcode used to restrict both datasets before they are matched.
POSTCODE = "E14 2DG"

# Price paid: one row per distinct (paon, saon, street, postcode) combination
# recorded for POSTCODE, ordered by "saon".
pp = (
    pl.scan_parquet("data/price-paid-complete.parquet")
    .filter(pl.col("postcode").eq(POSTCODE))
    .select(["paon", "saon", "street", "postcode"])
    .unique()
    .sort("saon")
    .with_columns(
        # Single free-text address used for matching: the non-null parts
        # joined by spaces, in the order saon, paon, street.
        pp_address=pl.concat_str(
            "saon",
            "paon",
            "street",
            separator=" ",
            ignore_nulls=True,
        ),
    )
)
|
|
|
|
# EPC: latest inspection per address for this postcode.
epc = (
    pl.scan_csv("data/epc/certificates.csv")
    .select("ADDRESS", "POSTCODE", "INSPECTION_DATE")
    # Compare against the postcode with surrounding whitespace stripped.
    .filter(pl.col("POSTCODE").str.strip_chars() == POSTCODE)
    # Newest inspection first, so the first row seen per ADDRESS is the latest.
    # NOTE(review): assumes INSPECTION_DATE sorts chronologically (e.g. ISO
    # "YYYY-MM-DD" strings or parsed dates) -- confirm against the CSV.
    .sort("INSPECTION_DATE", descending=True)
    # keep="first" is required: the default keep="any" gives no guarantee
    # about which duplicate row survives, which would make the sort above
    # meaningless and could retain an older certificate.
    .unique(subset="ADDRESS", keep="first")
    .sort("ADDRESS")
)
|
|
|
|
# Match price-paid addresses (left) against EPC addresses (right) using the
# project helper, then materialise the outcome with .collect().
result = fuzzy_join_on_postcode(
    # Left side: price-paid query and the columns to use from it.
    left=pp,
    left_address_col="pp_address",
    left_postcode_col="postcode",
    # Right side: EPC query and the columns to use from it.
    right=epc,
    right_address_col="ADDRESS",
    right_postcode_col="POSTCODE",
).collect()
|
|
|
|
# Keep only the two address columns, ordered by the price-paid address, so the
# matched pairs can be inspected side by side.
snapshot = result.select(["pp_address", "ADDRESS"]).sort("pp_address")

print("Testing the matching between EPC and PP addresses")
# Temporarily lift polars' display limits (all rows, all columns, strings up
# to 80 characters) so the whole snapshot is printed.
with pl.Config(
    tbl_rows=-1,
    tbl_cols=-1,
    fmt_str_lengths=80,
):
    print(snapshot)
|