Update H3 pipeline

2026-01-30 18:33:48 +00:00 · 2026-01-30 18:33:48 +00:00 · 6122ee44da
commit 6122ee44da
parent 68b6dcf65e
13 changed files with 291 additions and 420 deletions
--- a/pipeline/epc_pp.py
+++ b/pipeline/epc_pp.py
@@ -18,7 +18,7 @@ epc = pl.scan_csv('data_sources/epc/certificates.csv').select(
    'NUMBER_HABITABLE_ROOMS',
    'FLOOR_HEIGHT',
    'CONSTRUCTION_AGE_BAND'
-).sort('INSPECTION_DATE', descending=True).group_by('epc_address').first()
+).filter(pl.col('epc_address').is_not_null()).sort('INSPECTION_DATE', descending=True).group_by('epc_address', 'POSTCODE').first()


 print("EPC dataset")
@@ -39,7 +39,8 @@ price_paid = (pl.scan_parquet('data_sources/pp-complete.parquet').select(
    'locality',
    'town_city',
    pl.col('duration').replace(duration_map)
-).filter(pl.col('pp_property_type') != 'Other').with_columns(
+)
+.filter(pl.col('pp_property_type') != 'Other').with_columns(
        pl.concat_str(
            [pl.col('saon'), pl.col('paon'), pl.col('street')],
            separator=' ',
@@ -58,30 +59,27 @@ price_paid = (pl.scan_parquet('data_sources/pp-complete.parquet').select(
        pl.col('price').last().alias('latest_price'),
        pl.col('date_of_transfer').last(),
    )   
-)
+).filter(pl.col('pp_address').is_not_null())

 print("Price paid dataset")
 print(price_paid.head().collect())

-price_paid_df = price_paid.collect()
-epc_df = epc.collect()
-
 joined = fuzzy_join_on_postcode(
-    left=price_paid_df,
-    right=epc_df,
+    left=price_paid,
+    right=epc,
    left_address_col='pp_address',
    right_address_col='epc_address',
    left_postcode_col='postcode',
    right_postcode_col='POSTCODE',
-    score_threshold=80,
-).drop('POSTCODE')
+).drop('POSTCODE').collect()

-matched_count = joined.filter(pl.col('epc_address').is_not_null() & pl.col('pp_address').is_not_null()).height
-print(f"Unique properties: {price_paid_df.height}")
-print(f"Matched: {matched_count} ({100 * matched_count / price_paid_df.height:.1f}%)")
-print(f"Unmatched: {price_paid_df.height - matched_count}")
+matched = joined.filter(pl.col('epc_address').is_not_null() & pl.col('pp_address').is_not_null())
+total = joined.height
+print(f"Unique properties: {total}")
+print(f"Matched: {matched.height} ({100 * matched.height / total:.1f}%)")
+print(f"Unmatched: {total - matched.height}")

-joined = joined.rename({col: col.lower() for col in joined.columns})
+matched = matched.rename({col: col.lower() for col in joined.columns})

-print(joined.head())
-joined.write_parquet('data_sources/processed/epc_pp.parquet')
+print(matched.head())
+matched.write_parquet('data_sources/processed/epc_pp.parquet')