52 lines
1.3 KiB
Python
52 lines
1.3 KiB
Python
import polars as pl
|
|
|
|
from fuzzy_join import fuzzy_join_on_postcode
|
|
|
|
POSTCODE = "E14 2DG"

# Price paid: the distinct (paon, saon, street, postcode) rows recorded for
# POSTCODE, with the address parts joined into one string for matching.
pp = (
    pl.scan_parquet("data_sources/pp-complete.parquet")
    .filter(pl.col("postcode").eq(POSTCODE))
    .select(["paon", "saon", "street", "postcode"])
    .collect()
    .unique()
    .sort(by="saon")
    .with_columns(
        # ignore_nulls=True skips missing parts instead of making the whole
        # address null when any one component is absent.
        pp_address=pl.concat_str(
            "saon",
            "paon",
            "street",
            separator=" ",
            ignore_nulls=True,
        ),
    )
)
|
|
|
|
# EPC: latest inspection per address for this postcode.
#
# The frame is sorted newest-first and then de-duplicated on ADDRESS.
# `unique` defaults to keep="any" with no ordering guarantee, which would
# let an arbitrary (possibly stale) certificate survive for each address.
# keep="first" together with maintain_order=True makes the de-duplication
# honor the preceding sort, so the surviving row is the most recent one.
#
# NOTE(review): INSPECTION_DATE is read from CSV without date parsing, so
# the sort is on the raw column values — assumes an ISO-8601 (YYYY-MM-DD)
# layout where lexical order equals chronological order; confirm.
epc = (
    pl.scan_csv("data_sources/epc/certificates.csv")
    .select("ADDRESS", "POSTCODE", "INSPECTION_DATE")
    # Strip surrounding whitespace before comparing against POSTCODE.
    .filter(pl.col("POSTCODE").str.strip_chars() == POSTCODE)
    .sort("INSPECTION_DATE", descending=True)
    .collect()
    .unique("ADDRESS", keep="first", maintain_order=True)
    .sort("ADDRESS")
)
|
|
|
|
# Report how many distinct addresses each source contributed.
print(f"Price paid: {pp.height} unique addresses")
print(f"EPC: {epc.height} unique addresses")

# Match price-paid addresses against EPC addresses. The column names and
# score_threshold are passed straight to fuzzy_join_on_postcode; see the
# fuzzy_join module for how the threshold is interpreted.
result = fuzzy_join_on_postcode(
    left=pp,
    right=epc,
    left_address_col="pp_address",
    right_address_col="ADDRESS",
    left_postcode_col="postcode",
    right_postcode_col="POSTCODE",
    score_threshold=80,
)

# Keep only the two address columns, ordered by the price-paid address.
snapshot = result.select(["pp_address", "ADDRESS"]).sort(by="pp_address")

# Temporarily lift the row/column display limits and widen string cells so
# the whole table is printed; the settings are restored on exiting the block.
with pl.Config() as cfg:
    cfg.set_tbl_rows(-1)
    cfg.set_tbl_cols(-1)
    cfg.set_fmt_str_lengths(80)
    print(snapshot)
|