"""Snapshot check of the address matching between Price Paid and EPC data.

For a single postcode this script builds the distinct Price Paid addresses
and one EPC certificate row per address (the most recent inspection), passes
both lazy frames to ``fuzzy_join_on_postcode`` and prints the resulting
``pp_address`` / ``ADDRESS`` pairs in full for manual review.
"""

import polars as pl

from pipeline.utils import fuzzy_join_on_postcode

POSTCODE = "E14 2DG"

# Price paid: unique addresses for this postcode
pp = (
    pl.scan_parquet("data_sources/pp-complete.parquet")
    .filter(pl.col("postcode") == POSTCODE)
    .select("paon", "saon", "street", "postcode")
    .unique()
    .sort("saon")
    .with_columns(
        # Build a single free-text address; null components are skipped so
        # that a missing SAON does not turn the whole address into null.
        pl.concat_str(
            [pl.col("saon"), pl.col("paon"), pl.col("street")],
            separator=" ",
            ignore_nulls=True,
        ).alias("pp_address"),
    )
)

# EPC: latest inspection per address for this postcode
epc = (
    pl.scan_csv("data_sources/epc/certificates.csv")
    .select("ADDRESS", "POSTCODE", "INSPECTION_DATE")
    .filter(pl.col("POSTCODE").str.strip_chars() == POSTCODE)
    # Newest inspection first; nulls_last keeps rows with a missing date from
    # being ranked ahead of dated ones.
    # NOTE(review): assumes INSPECTION_DATE sorts chronologically as read
    # from the CSV (e.g. ISO-8601 strings) - confirm.
    .sort("INSPECTION_DATE", descending=True, nulls_last=True)
    # keep="first" is required here: the default keep="any" gives no
    # guarantee about which duplicate survives, so the row kept per ADDRESS
    # would not necessarily be the most recent inspection.
    .unique("ADDRESS", keep="first", maintain_order=True)
    .sort("ADDRESS")
)

# Both inputs are still lazy; the query only executes at .collect().
result = fuzzy_join_on_postcode(
    left=pp,
    right=epc,
    left_address_col="pp_address",
    right_address_col="ADDRESS",
    left_postcode_col="postcode",
    right_postcode_col="POSTCODE",
).collect()

snapshot = result.select("pp_address", "ADDRESS").sort("pp_address")

print("Testing the matching between EPC and PP addresses")
# Negative row/column limits make polars print the full table without
# truncation; long address strings are shown up to 80 characters.
with pl.Config(tbl_rows=-1, tbl_cols=-1, fmt_str_lengths=80):
    print(snapshot)