perfect-postcode/pipeline/transform/merge.py

import argparse

import polars as pl
from pathlib import Path

from pipeline.utils.postcode_mapping import build_postcode_mapping

MIN_FLOOR_AREA_M2 = 10


_AREA_COLUMNS = [
    "Postcode",
    "lat",
    "lon",
    # Deprivation
    "Income Score (rate)",
    "Employment Score (rate)",
    "Education, Skills and Training Score",
    "Health Deprivation and Disability Score",
    "Living Environment Score",
    "Indoors Sub-domain Score",
    "Outdoors Sub-domain Score",
    # Ethnicity
    "% South Asian",
    "% East Asian",
    "% Black",
    "% Mixed",
    "% White",
    "% Other",
    # Crime
    "Anti-social behaviour (avg/yr)",
    "Violence and sexual offences (avg/yr)",
    "Criminal damage and arson (avg/yr)",
    "Burglary (avg/yr)",
    "Vehicle crime (avg/yr)",
    "Robbery (avg/yr)",
    "Other theft (avg/yr)",
    "Shoplifting (avg/yr)",
    "Drugs (avg/yr)",
    "Possession of weapons (avg/yr)",
    "Public order (avg/yr)",
    "Bicycle theft (avg/yr)",
    "Theft from the person (avg/yr)",
    "Other crime (avg/yr)",
    "Serious crime (avg/yr)",
    "Minor crime (avg/yr)",
    "Serious crime per 1k residents (avg/yr)",
    "Minor crime per 1k residents (avg/yr)",
    # Amenities
    "Number of restaurants within 2km",
    "Number of grocery shops and supermarkets within 2km",
    "Number of parks within 1km",
    "Distance to nearest train or tube station (km)",
    "Distance to nearest park (km)",
    # Environment
    "Noise (dB)",
    "Max available download speed (Mbps)",
    # Schools
    "Good+ primary schools within 5km",
    "Good+ secondary schools within 5km",
    "Good+ primary schools within 2km",
    "Good+ secondary schools within 2km",
    # Demographics
    "Median age",
    # Politics
    "Voter turnout (%)",
    "% Labour",
    "% Conservative",
    "% Liberal Democrat",
    "% Reform UK",
    "% Green",
    "% Other parties",
]


def _build(
    epc_pp_path: Path,
    arcgis_path: Path,
    iod_path: Path,
    poi_proximity_path: Path,
    ethnicity_path: Path,
    crime_path: Path,
    noise_path: Path,
    school_proximity_path: Path,
    broadband_path: Path,
    rental_prices_path: Path,
    lsoa_population_path: Path,
    median_age_path: Path,
    election_results_path: Path,
) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Build postcode and properties dataframes from epc_pp + auxiliary data.

    Returns (postcode_df, properties_df).
    """
    wide = pl.scan_parquet(epc_pp_path).filter(
        pl.col("total_floor_area").is_null()
        | (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
    )

    # Remap terminated postcodes to nearest active successor
    postcode_mapping = build_postcode_mapping(arcgis_path)
    wide = (
        wide.join(
            postcode_mapping.lazy(),
            left_on="postcode",
            right_on="old_postcode",
            how="left",
        )
        .with_columns(
            pl.coalesce("new_postcode", "postcode").alias("postcode"),
        )
        .drop("new_postcode")
    )

    arcgis = (
        pl.scan_parquet(arcgis_path)
        .filter(pl.col("ctry25cd") == "E92000001")  # England only
        .filter(pl.col("doterm").is_null())  # Active postcodes only
        # NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
        # Alias them back to the short canonical names used across the
        # pipeline so downstream joins don't need to know about NSPL's
        # versioning scheme.
        .select(
            pl.col("pcds").alias("postcode"),
            "lat",
            pl.col("long").alias("lon"),
            pl.col("lsoa21cd").alias("lsoa21"),
            pl.col("oa21cd").alias("oa21"),
            pl.col("pcon24cd").alias("pcon"),
        )
    )
    wide = wide.join(arcgis, on="postcode", how="left")

    iod = pl.scan_parquet(iod_path)
    wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")

    # Invert deprivation scores so that higher values = less deprived (better)
    iod_score_cols = [
        "Education, Skills and Training Score",
        "Income Score (rate)",
        "Employment Score (rate)",
        "Health Deprivation and Disability Score",
        "Living Environment Score",
        "Indoors Sub-domain Score",
        "Outdoors Sub-domain Score",
    ]
    wide = wide.with_columns(*(pl.col(c).max() - pl.col(c) for c in iod_score_cols))

    ethnicity = pl.scan_parquet(ethnicity_path)
    wide = wide.join(
        ethnicity,
        left_on="Local Authority District code (2024)",
        right_on="Geography_code",
        how="left",
    )

    # Derive bedroom count: habitable rooms - 1 (assuming 1 reception room), clipped to 0..4
    wide = wide.with_columns(
        (pl.col("number_habitable_rooms") - 1)
        .clip(0, 4)
        .cast(pl.UInt8)
        .alias("_bedrooms"),
    )
    rental = pl.scan_parquet(rental_prices_path)
    wide = wide.join(
        rental,
        left_on=["Local Authority District code (2024)", "_bedrooms"],
        right_on=["area_code", "bedrooms"],
        how="left",
    )

    crime = pl.scan_parquet(crime_path)
    wide = wide.join(crime, left_on="lsoa21", right_on="LSOA code", how="left")

    wide = wide.with_columns(
        pl.sum_horizontal(
            "Violence and sexual offences (avg/yr)",
            "Robbery (avg/yr)",
            "Burglary (avg/yr)",
            "Possession of weapons (avg/yr)",
        ).alias("serious_crime_avg_yr"),
        pl.sum_horizontal(
            "Anti-social behaviour (avg/yr)",
            "Criminal damage and arson (avg/yr)",
            "Shoplifting (avg/yr)",
            "Bicycle theft (avg/yr)",
            "Theft from the person (avg/yr)",
            "Other theft (avg/yr)",
            "Vehicle crime (avg/yr)",
            "Public order (avg/yr)",
            "Drugs (avg/yr)",
            "Other crime (avg/yr)",
        ).alias("minor_crime_avg_yr"),
    )

    lsoa_pop = pl.scan_parquet(lsoa_population_path)
    wide = wide.join(lsoa_pop, on="lsoa21", how="left")
    wide = wide.with_columns(
        pl.when(pl.col("population") > 0)
        .then((pl.col("serious_crime_avg_yr") / pl.col("population") * 1000).round(1))
        .alias("serious_crime_per_1k"),
        pl.when(pl.col("population") > 0)
        .then((pl.col("minor_crime_avg_yr") / pl.col("population") * 1000).round(1))
        .alias("minor_crime_per_1k"),
    ).drop("population")

    median_age = pl.scan_parquet(median_age_path)
    wide = wide.join(median_age, on="lsoa21", how="left")

    election = pl.scan_parquet(election_results_path)
    wide = wide.join(election, on="pcon", how="left")

    poi_counts = pl.scan_parquet(poi_proximity_path)
    wide = wide.join(poi_counts, on="postcode", how="left")

    noise_cols = ["road_noise_lden_db", "rail_noise_lden_db", "airport_noise_lden_db"]
    noise = (
        pl.scan_parquet(noise_path)
        .with_columns(
            # NaN → null so max_horizontal ignores missing instead of propagating NaN
            *[pl.col(c).fill_nan(None) for c in noise_cols],
        )
        .with_columns(
            pl.max_horizontal(*noise_cols).alias("noise_lden_db"),
        )
        .select("postcode", "noise_lden_db")
    )
    wide = wide.join(noise, on="postcode", how="left")

    school_proximity = pl.scan_parquet(school_proximity_path)
    wide = wide.join(school_proximity, on="postcode", how="left")

    # Broadband: derive max available download speed tier per postcode from
    # Ofcom availability percentages.  Tiers: Gigabit ≥1000, UFBB ≥300,
    # UFBB(100) ≥100, SFBB ≥30 Mbps.  Stored as string enum.
    broadband = (
        pl.scan_parquet(broadband_path)
        .select(
            pl.col("postcode_space").alias("bb_postcode"),
            pl.when(pl.col("Gigabit availability (% premises)") > 0)
            .then(1000)
            .when(pl.col("UFBB availability (% premises)") > 0)
            .then(300)
            .when(pl.col("UFBB (100Mbit/s) availability (% premises)") > 0)
            .then(100)
            .when(pl.col("SFBB availability (% premises)") > 0)
            .then(30)
            .otherwise(10)
            .cast(pl.UInt16)
            .alias("max_download_speed"),
        )
        .group_by("bb_postcode")
        .agg(pl.col("max_download_speed").max())
        .with_columns(pl.col("max_download_speed").cast(pl.Utf8))
    )
    wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")

    # Derive property_type: prefer EPC data, fall back to price-paid.
    # For Houses, use built_form (e.g. Semi-Detached, Mid-Terrace) for finer detail.
    bad_built_form = pl.col("built_form").is_null() | pl.col("built_form").is_in(
        ["NO DATA!", "Not Recorded"]
    )
    has_epc = pl.col("epc_property_type").is_not_null()
    is_house = pl.col("epc_property_type") == "House"
    wide = wide.with_columns(
        pl.when(has_epc & is_house & ~bad_built_form)
        .then(pl.col("built_form"))
        .when(has_epc & is_house)
        .then(pl.col("pp_property_type"))
        .when(has_epc)
        .then(pl.col("epc_property_type"))
        .otherwise(pl.col("pp_property_type"))
        # Unify EPC's "Flat"/"Maisonette" with price-paid's "Flats/Maisonettes",
        # collapse terrace sub-types, and fold rare types into "Other"
        .replace(
            {
                "Flat": "Flats/Maisonettes",
                "Maisonette": "Flats/Maisonettes",
                "End-Terrace": "Terraced",
                "Mid-Terrace": "Terraced",
                "Enclosed End-Terrace": "Terraced",
                "Enclosed Mid-Terrace": "Terraced",
                "Bungalow": "Other",
                "Park home": "Other",
            }
        )
        .alias("property_type")
    )

    wide = (
        wide.with_columns(
            pl.when(pl.col("duration") == "U")
            .then(None)
            .otherwise(pl.col("duration"))
            .alias("duration"),
            pl.when(pl.col("current_energy_rating") == "INVALID!")
            .then(None)
            .otherwise(pl.col("current_energy_rating"))
            .alias("current_energy_rating"),
        )
        .with_columns(
            (pl.col("latest_price") / pl.col("total_floor_area"))
            .round(0)
            .cast(pl.Int32)
            .alias("Price per sqm"),
        )
        .drop(
            "inspection_date",
            "_bedrooms",
            "LSOA name (2021)",
            "Local Authority District code (2024)",
            "Local Authority District name (2024)",
            "Wider Barriers Sub-domain Score",
            "Geographical Barriers Sub-domain Score",
            "Adult Skills Sub-domain Score",
            "Children and Young People Sub-domain Score",
            "Crime Score",
            "Index of Multiple Deprivation (IMD) Score",
            "Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
            "Income Deprivation Affecting Children Index (IDACI) Score (rate)",
            "Barriers to Housing and Services Score",
            "lsoa21",
            "oa21",
            "pcon",
            "epc_property_type",
            "pp_property_type",
            "built_form",
        )
        .rename(
            {
                "date_of_transfer": "Date of last transaction",
                "construction_age_band": "Construction year",
                "is_construction_date_approximate": "Is construction date approximate",
                "pp_address": "Address per Property Register",
                "epc_address": "Address per EPC",
                "postcode": "Postcode",
                "duration": "Leasehold/Freehold",
                "current_energy_rating": "Current energy rating",
                "potential_energy_rating": "Potential energy rating",
                "total_floor_area": "Total floor area (sqm)",
                "property_type": "Property type",
                "restaurants_2km": "Number of restaurants within 2km",
                "groceries_2km": "Number of grocery shops and supermarkets within 2km",
                "parks_1km": "Number of parks within 1km",
                "train_tube_nearest_km": "Distance to nearest train or tube station (km)",
                "parks_nearest_km": "Distance to nearest park (km)",
                "latest_price": "Last known price",
                "number_habitable_rooms": "Number of bedrooms & living rooms",
                "noise_lden_db": "Noise (dB)",
                "good_primary_5km": "Good+ primary schools within 5km",
                "good_secondary_5km": "Good+ secondary schools within 5km",
                "good_primary_2km": "Good+ primary schools within 2km",
                "good_secondary_2km": "Good+ secondary schools within 2km",
                "max_download_speed": "Max available download speed (Mbps)",
                "serious_crime_avg_yr": "Serious crime (avg/yr)",
                "minor_crime_avg_yr": "Minor crime (avg/yr)",
                "serious_crime_per_1k": "Serious crime per 1k residents (avg/yr)",
                "minor_crime_per_1k": "Minor crime per 1k residents (avg/yr)",
                "mean_monthly_rent": "Estimated monthly rent",
                "floor_height": "Interior height (m)",
                "was_council_house": "Former council house",
                "median_age": "Median age",
                "turnout_pct": "Voter turnout (%)",
            }
        )
    )

    print("Collecting with streaming engine...")
    df = wide.collect(engine="streaming")

    # Split into postcode-level and property-level dataframes
    area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
    postcode_df = df.select(area_cols).group_by("Postcode").first()
    print(f"Postcode rows: {postcode_df.height} (unique postcodes)")

    property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
    properties_df = df.select(property_cols)
    print(f"Property rows: {properties_df.height}")

    return postcode_df, properties_df


def main():
    parser = argparse.ArgumentParser(
        description="Build wide property dataframe with all joins"
    )
    parser.add_argument(
        "--epc-pp", type=Path, required=True, help="EPC-Price Paid joined parquet file"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode data parquet file"
    )
    parser.add_argument(
        "--iod",
        type=Path,
        required=True,
        help="Index of Deprivation parquet file (optional)",
    )
    parser.add_argument(
        "--poi-proximity",
        type=Path,
        help="POI proximity counts parquet file (optional)",
    )
    parser.add_argument(
        "--ethnicity",
        type=Path,
        required=True,
        help="Ethnicity by local authority parquet file (optional)",
    )
    parser.add_argument(
        "--crime",
        type=Path,
        required=True,
        help="Crime by LSOA parquet file (optional)",
    )
    parser.add_argument(
        "--noise", type=Path, required=True, help="Road noise by postcode parquet file"
    )
    parser.add_argument(
        "--school-proximity",
        type=Path,
        required=True,
        help="School proximity counts parquet file",
    )
    parser.add_argument(
        "--broadband",
        type=Path,
        required=True,
        help="Broadband performance by output area parquet file",
    )
    parser.add_argument(
        "--rental-prices",
        type=Path,
        required=True,
        help="ONS rental prices by LA and bedroom count parquet file",
    )
    parser.add_argument(
        "--lsoa-population",
        type=Path,
        required=True,
        help="Census 2021 population by LSOA parquet file",
    )
    parser.add_argument(
        "--median-age",
        type=Path,
        required=True,
        help="Census 2021 median age by LSOA parquet file",
    )
    parser.add_argument(
        "--election-results",
        type=Path,
        required=True,
        help="2024 General Election results by constituency parquet file",
    )
    parser.add_argument(
        "--output-postcodes",
        type=Path,
        required=True,
        help="Output postcode parquet file path",
    )
    parser.add_argument(
        "--output-properties",
        type=Path,
        required=True,
        help="Output properties parquet file path",
    )
    args = parser.parse_args()

    postcode_df, properties_df = _build(
        epc_pp_path=args.epc_pp,
        arcgis_path=args.arcgis,
        iod_path=args.iod,
        poi_proximity_path=args.poi_proximity,
        ethnicity_path=args.ethnicity,
        crime_path=args.crime,
        noise_path=args.noise,
        school_proximity_path=args.school_proximity,
        broadband_path=args.broadband,
        rental_prices_path=args.rental_prices,
        lsoa_population_path=args.lsoa_population,
        median_age_path=args.median_age,
        election_results_path=args.election_results,
    )

    print(f"\nPostcode columns: {postcode_df.columns}")
    print(f"Postcode rows: {postcode_df.height}")
    postcode_df.write_parquet(args.output_postcodes)
    size_mb = args.output_postcodes.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output_postcodes} ({size_mb:.1f} MB)")

    print(f"\nProperty columns: {properties_df.columns}")
    print(f"Property rows: {properties_df.height}")
    properties_df.write_parquet(args.output_properties)
    size_mb = args.output_properties.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output_properties} ({size_mb:.1f} MB)")


if __name__ == "__main__":
    main()