Update H3 pipeline
This commit is contained in:
parent
68b6dcf65e
commit
6122ee44da
13 changed files with 291 additions and 420 deletions
|
|
@ -18,7 +18,7 @@ epc = pl.scan_csv('data_sources/epc/certificates.csv').select(
|
|||
'NUMBER_HABITABLE_ROOMS',
|
||||
'FLOOR_HEIGHT',
|
||||
'CONSTRUCTION_AGE_BAND'
|
||||
).sort('INSPECTION_DATE', descending=True).group_by('epc_address').first()
|
||||
).filter(pl.col('epc_address').is_not_null()).sort('INSPECTION_DATE', descending=True).group_by('epc_address', 'POSTCODE').first()
|
||||
|
||||
|
||||
print("EPC dataset")
|
||||
|
|
@ -39,7 +39,8 @@ price_paid = (pl.scan_parquet('data_sources/pp-complete.parquet').select(
|
|||
'locality',
|
||||
'town_city',
|
||||
pl.col('duration').replace(duration_map)
|
||||
).filter(pl.col('pp_property_type') != 'Other').with_columns(
|
||||
)
|
||||
.filter(pl.col('pp_property_type') != 'Other').with_columns(
|
||||
pl.concat_str(
|
||||
[pl.col('saon'), pl.col('paon'), pl.col('street')],
|
||||
separator=' ',
|
||||
|
|
@ -58,30 +59,27 @@ price_paid = (pl.scan_parquet('data_sources/pp-complete.parquet').select(
|
|||
pl.col('price').last().alias('latest_price'),
|
||||
pl.col('date_of_transfer').last(),
|
||||
)
|
||||
)
|
||||
).filter(pl.col('pp_address').is_not_null())
|
||||
|
||||
print("Price paid dataset")
|
||||
print(price_paid.head().collect())
|
||||
|
||||
price_paid_df = price_paid.collect()
|
||||
epc_df = epc.collect()
|
||||
|
||||
joined = fuzzy_join_on_postcode(
|
||||
left=price_paid_df,
|
||||
right=epc_df,
|
||||
left=price_paid,
|
||||
right=epc,
|
||||
left_address_col='pp_address',
|
||||
right_address_col='epc_address',
|
||||
left_postcode_col='postcode',
|
||||
right_postcode_col='POSTCODE',
|
||||
score_threshold=80,
|
||||
).drop('POSTCODE')
|
||||
).drop('POSTCODE').collect()
|
||||
|
||||
matched_count = joined.filter(pl.col('epc_address').is_not_null() & pl.col('pp_address').is_not_null()).height
|
||||
print(f"Unique properties: {price_paid_df.height}")
|
||||
print(f"Matched: {matched_count} ({100 * matched_count / price_paid_df.height:.1f}%)")
|
||||
print(f"Unmatched: {price_paid_df.height - matched_count}")
|
||||
matched = joined.filter(pl.col('epc_address').is_not_null() & pl.col('pp_address').is_not_null())
|
||||
total = joined.height
|
||||
print(f"Unique properties: {total}")
|
||||
print(f"Matched: {matched.height} ({100 * matched.height / total:.1f}%)")
|
||||
print(f"Unmatched: {total - matched.height}")
|
||||
|
||||
joined = joined.rename({col: col.lower() for col in joined.columns})
|
||||
matched = matched.rename({col: col.lower() for col in joined.columns})
|
||||
|
||||
print(joined.head())
|
||||
joined.write_parquet('data_sources/processed/epc_pp.parquet')
|
||||
print(matched.head())
|
||||
matched.write_parquet('data_sources/processed/epc_pp.parquet')
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue