Update H3 pipeline

This commit is contained in:
Andras Schmelczer 2026-01-30 18:33:48 +00:00
parent 68b6dcf65e
commit 6122ee44da
13 changed files with 291 additions and 420 deletions

View file

@@ -18,7 +18,7 @@ epc = pl.scan_csv('data_sources/epc/certificates.csv').select(
'NUMBER_HABITABLE_ROOMS',
'FLOOR_HEIGHT',
'CONSTRUCTION_AGE_BAND'
-).sort('INSPECTION_DATE', descending=True).group_by('epc_address').first()
+).filter(pl.col('epc_address').is_not_null()).sort('INSPECTION_DATE', descending=True).group_by('epc_address', 'POSTCODE').first()
print("EPC dataset")
@@ -39,7 +39,8 @@ price_paid = (pl.scan_parquet('data_sources/pp-complete.parquet').select(
'locality',
'town_city',
pl.col('duration').replace(duration_map)
-).filter(pl.col('pp_property_type') != 'Other').with_columns(
+)
+.filter(pl.col('pp_property_type') != 'Other').with_columns(
pl.concat_str(
[pl.col('saon'), pl.col('paon'), pl.col('street')],
separator=' ',
@@ -58,30 +59,27 @@ price_paid = (pl.scan_parquet('data_sources/pp-complete.parquet').select(
pl.col('price').last().alias('latest_price'),
pl.col('date_of_transfer').last(),
)
-)
+).filter(pl.col('pp_address').is_not_null())
print("Price paid dataset")
print(price_paid.head().collect())
-price_paid_df = price_paid.collect()
-epc_df = epc.collect()
joined = fuzzy_join_on_postcode(
-left=price_paid_df,
-right=epc_df,
+left=price_paid,
+right=epc,
left_address_col='pp_address',
right_address_col='epc_address',
left_postcode_col='postcode',
right_postcode_col='POSTCODE',
score_threshold=80,
-).drop('POSTCODE')
+).drop('POSTCODE').collect()
-matched_count = joined.filter(pl.col('epc_address').is_not_null() & pl.col('pp_address').is_not_null()).height
-print(f"Unique properties: {price_paid_df.height}")
-print(f"Matched: {matched_count} ({100 * matched_count / price_paid_df.height:.1f}%)")
-print(f"Unmatched: {price_paid_df.height - matched_count}")
+matched = joined.filter(pl.col('epc_address').is_not_null() & pl.col('pp_address').is_not_null())
+total = joined.height
+print(f"Unique properties: {total}")
+print(f"Matched: {matched.height} ({100 * matched.height / total:.1f}%)")
+print(f"Unmatched: {total - matched.height}")
-joined = joined.rename({col: col.lower() for col in joined.columns})
+matched = matched.rename({col: col.lower() for col in joined.columns})
-print(joined.head())
-joined.write_parquet('data_sources/processed/epc_pp.parquet')
+print(matched.head())
+matched.write_parquet('data_sources/processed/epc_pp.parquet')