"""Build the combined price-paid / EPC property dataset.

Reads the EPC certificates CSV and the price-paid parquet file, reduces each
to one row per property, joins the two with ``fuzzy_join_on_postcode`` and
writes the result to ``data_sources/processed/epc_pp.parquet``.
"""
import polars as pl

from .fuzzy_join import fuzzy_join_on_postcode

# A negative value tells polars to print every column of a frame.
pl.Config.set_tbl_cols(-1)

# --- EPC certificates -------------------------------------------------------
# Keep only the most recent certificate per property. A property is keyed by
# address *and* postcode: the address string carries no postcode, so grouping
# on it alone would merge same-named addresses from different postcodes and
# drop all but one of their certificates before the postcode-based join.
# Rows keep their sorted order inside each group, so first() returns the row
# with the greatest INSPECTION_DATE.
# NOTE(review): INSPECTION_DATE is not parsed as a date here, so the sort
# presumably relies on ISO-formatted (YYYY-MM-DD) strings -- confirm.
epc = (
    pl.scan_csv('data_sources/epc/certificates.csv')
    .select(
        pl.col('ADDRESS').alias('epc_address'),
        'POSTCODE',
        'CURRENT_ENERGY_RATING',
        'POTENTIAL_ENERGY_RATING',
        pl.col('PROPERTY_TYPE').alias('epc_property_type'),
        'BUILT_FORM',
        'INSPECTION_DATE',
        'TOTAL_FLOOR_AREA',
        'NUMBER_HABITABLE_ROOMS',
        'FLOOR_HEIGHT',
        'CONSTRUCTION_AGE_BAND',
    )
    .sort('INSPECTION_DATE', descending=True)
    .group_by('epc_address', 'POSTCODE')
    .first()
)

print("EPC dataset")
# Materialise once and preview the collected frame; collecting the lazy query
# separately for the preview would scan, sort and group the whole CSV twice.
epc_df = epc.collect()
print(epc_df.head())

# --- Price paid data --------------------------------------------------------
# Code tables from https://www.gov.uk/guidance/about-the-price-paid-data
property_type_map = {
    "D": "Detached",
    "S": "Semi-Detached",
    "T": "Terraced",
    "F": "Flats/Maisonettes",
    "O": "Other",
}
duration_map = {"F": "Freehold", "L": "Leasehold"}

price_paid = (
    pl.scan_parquet('data_sources/pp-complete.parquet')
    .select(
        "price",
        "date_of_transfer",
        pl.col('property_type').alias("pp_property_type").replace(property_type_map),
        "postcode",
        'paon',
        'saon',
        'street',
        'locality',
        'town_city',
        pl.col('duration').replace(duration_map),
    )
    .filter(pl.col('pp_property_type') != 'Other')
    .with_columns(
        # Single address string built from saon, paon and street; null parts
        # are skipped rather than nulling the whole address.
        pl.concat_str(
            [pl.col('saon'), pl.col('paon'), pl.col('street')],
            separator=' ',
            ignore_nulls=True,
        ).alias('pp_address'),
    )
    # Chronological order so that last() below is the most recent transfer.
    .sort('date_of_transfer')
    .group_by('pp_address', 'postcode', maintain_order=True)
    .agg(
        # Every transfer of the property as a list of {year, price} structs.
        pl.struct(
            pl.col('date_of_transfer').dt.year().alias('year'),
            'price',
        ).alias('historical_prices'),
        pl.col('pp_property_type').last(),
        pl.col('duration').last(),
        pl.col('price').last().alias('latest_price'),
        pl.col('date_of_transfer').last(),
    )
)

print("Price paid dataset")
# Same as above: collect once, preview the materialised frame.
price_paid_df = price_paid.collect()
print(price_paid_df.head())

# --- Join -------------------------------------------------------------------
joined = fuzzy_join_on_postcode(
    left=price_paid_df,
    right=epc_df,
    left_address_col='pp_address',
    right_address_col='epc_address',
    left_postcode_col='postcode',
    right_postcode_col='POSTCODE',
    score_threshold=80,
).drop('POSTCODE')

# A row counts as matched when it carries an address from both sources.
matched_count = joined.filter(
    pl.col('epc_address').is_not_null() & pl.col('pp_address').is_not_null()
).height
total_properties = price_paid_df.height
# Guard the percentage so an empty price-paid frame does not divide by zero.
matched_pct = 100 * matched_count / total_properties if total_properties else 0.0

print(f"Unique properties: {total_properties}")
print(f"Matched: {matched_count} ({matched_pct:.1f}%)")
print(f"Unmatched: {total_properties - matched_count}")

# Normalise every column name to lower case before writing.
joined = joined.rename({col: col.lower() for col in joined.columns})
print(joined.head())
joined.write_parquet('data_sources/processed/epc_pp.parquet')