# perfect-postcode/pipeline/transform/join_epc_pp.py

import argparse
import csv
import io
import tempfile
import zipfile
from pathlib import Path

import polars as pl
import pyarrow as pa
import pyarrow.csv as pa_csv
import pyarrow.parquet as pq

from pipeline.local_temp import local_tmp_dir

from ..utils import (
    fuzzy_join_on_postcode,
    normalize_address_key,
    normalize_postcode_key,
)

# Do not truncate columns when a frame is printed (-1 = show all columns), so
# the debug prints of the EPC / price-paid / joined frames show every field.
pl.Config.set_tbl_cols(-1)

# Ordinal rank of each EPC energy rating letter (A is best = 1). Used to detect
# an improved rating between consecutive certificates of the same property.
RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
# Value-quality floor for price aggregations. A flat nominal floor is a blunt
# tool against a deflating threshold — £50k was completely normal for a 1990s
# house, so a 50k floor wrongly discarded ~a third of legitimate 1990s
# open-market sales (and deleted properties whose only sales were old/cheap),
# biasing early-year price history upward. 10k recovers the large [10k,50k)
# band of genuine cheaper sales while still excluding the nominal/junk transfers
# (£1 etc.). A small tail of real sub-10k sales is still dropped — a deliberate
# conservative tradeoff to keep clearly-implausible transfers out.
MIN_PRICE = 10_000

# Time-aware consecutive-sale jump guard. Price-paid contains keyed-in price
# errors that pass the MIN_PRICE/category filters — e.g. 13 QUICKSETTS HR2 7PP,
# a 93 m² terrace, sold £140,000 in 2016 then "£207,500,000" in 2026 (clearly
# £207,500 with extra digits, lodged as category A) — and would otherwise
# become latest_price. A quality sale is flagged when it exceeds its
# neighbouring sale by more than JUMP_TOLERANCE * JUMP_GROWTH_PER_YEAR ** years
# between the two sales. Calibration: genuine extreme appreciation (prime
# London 1995->2026 is roughly x50 over 31 years) stays comfortably under
# 12 * 1.10**31 ≈ 230, while the HR2 case (x1,482 over 10 years against a
# threshold of 12 * 1.10**10 ≈ 31) is caught. JUMP_MIN_PRICE is an absolute
# floor on the flagged price itself so right-to-buy resales and other
# legitimate x20-50 jumps on cheap properties are never flagged.
JUMP_TOLERANCE = 12.0
JUMP_GROWTH_PER_YEAR = 1.10
JUMP_MIN_PRICE = 2_000_000

# Plausible construction-year range; band-derived years outside it (e.g. OCR
# noise like 1012 or 2202) are nulled rather than published.
MIN_BUILD_YEAR = 1700
MAX_BUILD_YEAR = 2030

# Plausibility bounds for raw EPC dimensions. EPC lodgements contain data-entry
# errors (0 m storey heights, 116 m "interior height", 9,210 m² floor areas, 99
# habitable rooms) that otherwise propagate verbatim into the published per-
# property columns. Values outside these bands are nulled (treated as unknown)
# rather than shown. Bounds are deliberately wide so only clear errors are cut.
MIN_FLOOR_HEIGHT_M = 1.5  # below this a storey is not habitable
MAX_FLOOR_HEIGHT_M = 6.0  # above this is a data error, not a normal storey
MAX_TOTAL_FLOOR_AREA_M2 = 2000.0  # ~21,500 sqft; larger is a bulk/garbage record
MAX_HABITABLE_ROOMS = 20  # dwellings above this are data errors


def epc_band_to_year(band: pl.Expr) -> pl.Expr:
    """Collapse an EPC construction age band into one representative year.

    Rules, applied to the band text after the ``England and Wales: `` prefix
    and the `` onwards`` suffix are removed:

    * a two-year range (``1950-1966``) yields the rounded MIDPOINT (1958), so
      the result is not skewed towards the start of the band;
    * a band starting with ``before `` is open-ended and yields null;
    * a single year (including ``... onwards`` bands and a bare four-digit
      year, e.g. one produced by an earlier call) yields that year;
    * anything outside [MIN_BUILD_YEAR, MAX_BUILD_YEAR], or without a
      four-digit year at all, yields null.

    Returns a ``UInt16`` expression.
    """
    label = band.cast(pl.Utf8).str.replace("England and Wales: ", "")
    label = label.str.replace(" onwards", "")

    # First four-digit run, and the second one when the band is a range.
    first_year = label.str.extract(r"(\d{4})", 1).cast(pl.Int32, strict=False)
    second_year = label.str.extract(r"(\d{4})\D+(\d{4})", 2).cast(
        pl.Int32, strict=False
    )
    midpoint = ((first_year + second_year) / 2).round(0).cast(pl.Int32)

    is_open_ended = label.str.starts_with("before ")
    representative = (
        pl.when(is_open_ended)
        .then(None)
        .when(second_year.is_not_null())
        .then(midpoint)
        .otherwise(first_year)
    )

    is_plausible = (representative >= MIN_BUILD_YEAR) & (
        representative <= MAX_BUILD_YEAR
    )
    return (
        pl.when(is_plausible)
        .then(representative)
        .otherwise(None)
        .cast(pl.UInt16, strict=False)
    )


# EPC certificate columns this pipeline reads, by lower-cased header name.
# The ORDER is load-bearing: _zip_certificates_to_parquet builds its Parquet
# schema from this list and renames every CSV batch's columns positionally to
# it, so the list and the per-file source-column lookup must stay aligned.
EPC_SOURCE_COLUMNS = [
    "address",
    "postcode",
    "uprn",
    "current_energy_rating",
    "potential_energy_rating",
    "property_type",
    "built_form",
    "inspection_date",
    "total_floor_area",
    "number_habitable_rooms",
    "floor_height",
    "construction_age_band",
    "tenure",
]


def _normalise_csv_columns(columns: list[str]) -> list[str]:
    """Return the header names stripped of surrounding whitespace and lower-cased."""
    return list(map(lambda name: name.strip().lower(), columns))


def _clean_string(column: str) -> pl.Expr:
    """Return ``column`` as a trimmed string, with blank values turned into null."""
    trimmed = pl.col(column).cast(pl.String).str.strip_chars()
    is_blank = trimmed == ""
    return pl.when(is_blank).then(None).otherwise(trimmed)


def _clean_number(column: str, dtype: pl.DataType) -> pl.Expr:
    """Return ``column`` cleaned as text and cast to ``dtype``.

    The cast is non-strict, so values that cannot be converted become null
    instead of raising.
    """
    text = _clean_string(column)
    return text.cast(dtype, strict=False)


def _join_address_parts(*columns: str) -> pl.Expr:
    """Build one single-spaced display address from several component columns.

    Absent price-paid components (SAON/PAON/STREET) arrive as EMPTY STRINGS
    rather than nulls, and ``concat_str(..., ignore_nulls=True)`` only skips
    nulls — so an empty component would still contribute its separator and
    leave leading or doubled spaces. Each component is therefore cleaned to
    null-when-blank first; the joined text then has any remaining whitespace
    runs collapsed and its ends stripped. An address with no content at all
    comes out as null rather than as an empty/whitespace string.
    """
    parts = [_clean_string(name) for name in columns]
    address = (
        pl.concat_str(parts, separator=" ", ignore_nulls=True)
        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
    )
    return pl.when(address == "").then(None).otherwise(address)


def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
    """Project raw EPC certificate rows onto the cleaned columns used downstream.

    Every field is trimmed with blanks turned into null; the postcode and the
    two energy ratings are upper-cased, the inspection date is parsed to a
    ``Date`` and the numeric fields are cast non-strictly (unparseable ->
    null). Rows without an address are dropped, and implausible dimensions are
    nulled (treated as unknown) instead of being passed through.

    Args:
        raw: Lazy frame exposing the EPC_SOURCE_COLUMNS column names.

    Returns:
        Lazy frame with the columns epc_address, epc_postcode, uprn,
        current_energy_rating, potential_energy_rating, epc_property_type,
        built_form, inspection_date, total_floor_area,
        number_habitable_rooms, floor_height, construction_age_band, tenure.
    """
    return (
        raw.select(
            _clean_string("address").alias("epc_address"),
            _clean_string("postcode").str.to_uppercase().alias("epc_postcode"),
            # UPRN keys an exact listing->EPC join downstream (~99% populated).
            _clean_string("uprn").alias("uprn"),
            _clean_string("current_energy_rating")
            .str.to_uppercase()
            .alias("current_energy_rating"),
            _clean_string("potential_energy_rating")
            .str.to_uppercase()
            .alias("potential_energy_rating"),
            _clean_string("property_type").alias("epc_property_type"),
            _clean_string("built_form").alias("built_form"),
            # Parse to a real Date once (unparseable/blank -> null) so dedup can
            # sort newest-first with nulls_last and _event_year can use dt.year();
            # a lexicographic string sort would let a null/garbled date win under
            # Polars' default nulls-first descending order. EPC inspection dates
            # are ISO (YYYY-MM-DD).
            _clean_string("inspection_date")
            .str.to_date(format="%Y-%m-%d", strict=False)
            .alias("inspection_date"),
            _clean_number("total_floor_area", pl.Float64).alias("total_floor_area"),
            _clean_number("number_habitable_rooms", pl.Int16).alias(
                "number_habitable_rooms"
            ),
            _clean_number("floor_height", pl.Float64).alias("floor_height"),
            _clean_string("construction_age_band").alias("construction_age_band"),
            _clean_string("tenure").alias("tenure"),
        )
        .filter(pl.col("epc_address").is_not_null())
        .with_columns(
            # Null implausible EPC dimensions so data-entry errors don't reach
            # the published per-property columns (Interior height, Total floor
            # area, Number of bedrooms & living rooms). Treated as unknown.
            pl.when(
                (pl.col("number_habitable_rooms") >= 1)
                & (pl.col("number_habitable_rooms") <= MAX_HABITABLE_ROOMS)
            )
            .then(pl.col("number_habitable_rooms"))
            .otherwise(None)
            .alias("number_habitable_rooms"),
            pl.when(
                pl.col("floor_height").is_between(
                    MIN_FLOOR_HEIGHT_M, MAX_FLOOR_HEIGHT_M
                )
            )
            .then(pl.col("floor_height"))
            .otherwise(None)
            .alias("floor_height"),
            # A zero or negative area is as much a data-entry error as an
            # oversized one: it would make any per-m² figure derived from it
            # meaningless, and a later certificate carrying the real area
            # would register as a spurious floor-area increase.
            pl.when(
                (pl.col("total_floor_area") > 0)
                & (pl.col("total_floor_area") <= MAX_TOTAL_FLOOR_AREA_M2)
            )
            .then(pl.col("total_floor_area"))
            .otherwise(None)
            .alias("total_floor_area"),
        )
    )


def _certificate_member_names(zip_file: zipfile.ZipFile) -> list[str]:
    """List the archive's certificates CSV members in sorted order.

    A member qualifies when it is not a directory entry, its base name starts
    with ``certificates`` (case-insensitive) and its full name ends with
    ``.csv`` (case-insensitive).
    """

    def _is_certificates_csv(name: str) -> bool:
        if name.endswith("/"):
            return False
        base_name = Path(name).name.lower()
        return base_name.startswith("certificates") and name.lower().endswith(".csv")

    return sorted(filter(_is_certificates_csv, zip_file.namelist()))


def _read_zip_csv_header(zip_file: zipfile.ZipFile, member_name: str) -> list[str]:
    """Return the first CSV row (the header) of one member of the archive.

    The member is decoded as UTF-8 with any leading byte-order mark removed.

    Raises:
        ValueError: if the member contains no rows at all.
    """
    with zip_file.open(member_name) as raw_member:
        decoded = io.TextIOWrapper(raw_member, encoding="utf-8-sig", newline="")
        rows = csv.reader(decoded)
        try:
            header = next(rows)
        except StopIteration as exc:
            raise ValueError(f"EPC CSV member is empty: {member_name}") from exc
        return header


def _source_columns_for_header(header: list[str]) -> list[str]:
    """Map each EPC_SOURCE_COLUMNS name to its spelling in this CSV's header.

    Header names are matched after normalisation (trimmed, lower-cased); when
    two header names normalise to the same key the later one wins. A wanted
    column that the header does not contain is returned under its canonical
    name. The result has the same length and order as EPC_SOURCE_COLUMNS.
    """
    original_by_normalised = dict(zip(_normalise_csv_columns(header), header))
    return [
        original_by_normalised.get(wanted, wanted) for wanted in EPC_SOURCE_COLUMNS
    ]


def _zip_certificates_to_parquet(zip_path: Path, output_path: Path) -> None:
    """Stream every certificates CSV inside an EPC zip into one Parquet file.

    Only the EPC_SOURCE_COLUMNS fields are read, all as strings; a column
    missing from a given CSV is requested anyway (``include_missing_columns``)
    so every batch has the full column set. Batches are renamed positionally
    to the canonical EPC_SOURCE_COLUMNS names before being written, which is
    what lets differently-cased headers share one output schema.

    Args:
        zip_path: The EPC zip archive to read.
        output_path: Where the zstd-compressed Parquet file is written.

    Raises:
        ValueError: if ``zip_path`` is not a readable zip archive, contains no
            certificates CSV, or one of those CSVs is empty.
    """
    all_string_schema = pa.schema((name, pa.string()) for name in EPC_SOURCE_COLUMNS)

    # The writer is opened first and closed on every exit path, including the
    # error paths below.
    with pq.ParquetWriter(
        output_path, schema=all_string_schema, compression="zstd"
    ) as writer:
        try:
            archive = zipfile.ZipFile(zip_path)
        except zipfile.BadZipFile as exc:
            raise ValueError(
                f"{zip_path} is not a readable EPC zip archive; re-download "
                "domestic-csv.zip and try again"
            ) from exc

        with archive:
            csv_members = _certificate_member_names(archive)
            if not csv_members:
                raise ValueError(f"No certificate CSV files found in {zip_path}")

            # 64 MiB read blocks; the same options apply to every member.
            read_options = pa_csv.ReadOptions(block_size=64 * 1024 * 1024)

            for csv_member in csv_members:
                print(f"Reading EPC certificates from {csv_member}")
                header = _read_zip_csv_header(archive, csv_member)
                source_columns = _source_columns_for_header(header)
                convert_options = pa_csv.ConvertOptions(
                    include_columns=source_columns,
                    include_missing_columns=True,
                    column_types=dict.fromkeys(source_columns, pa.string()),
                    strings_can_be_null=True,
                )

                with archive.open(csv_member) as member:
                    batches = pa_csv.open_csv(
                        member,
                        read_options=read_options,
                        convert_options=convert_options,
                    )
                    for batch in batches:
                        if batch.num_rows > 0:
                            writer.write_batch(
                                batch.rename_columns(EPC_SOURCE_COLUMNS)
                            )


def _scan_epc_certificates(epc_path: Path, temp_dir: Path) -> pl.LazyFrame:
    """Open the EPC certificates lazily and return the cleaned column set.

    A ``.zip`` input (suffix compared case-insensitively) is first converted
    to ``epc-certificates.parquet`` inside ``temp_dir`` and scanned from
    there; any other input is scanned directly as a CSV with all columns read
    as strings and header names normalised.
    """
    is_zip = epc_path.suffix.lower() == ".zip"
    if not is_zip:
        csv_scan = pl.scan_csv(
            epc_path,
            infer_schema=False,
            with_column_names=_normalise_csv_columns,
        )
        return _select_epc_columns(csv_scan)

    parquet_path = temp_dir / "epc-certificates.parquet"
    _zip_certificates_to_parquet(epc_path, parquet_path)
    return _select_epc_columns(pl.scan_parquet(parquet_path))


def flag_price_outliers(slim: pl.DataFrame) -> pl.DataFrame:
    """Return the sales that sit on the high side of an implausible price jump.

    ``slim`` carries one row per quality sale with the columns
    ``_pp_group_address``, ``_pp_group_postcode``, ``date_of_transfer`` and
    ``price``. Within each property (the two group keys), sales are ordered by
    date and every sale is compared with its immediate neighbours. The allowed
    ratio between two sales is ``JUMP_TOLERANCE * JUMP_GROWTH_PER_YEAR **
    years``, where ``years`` is the gap between them, never taken as less than
    half a year. A sale is flagged when:

    - UP: it is more than the allowed ratio ABOVE the previous sale (a spike
      following a normal sale), or
    - DOWN: the next sale is less than 1/allowed-ratio of it (a spike
      preceding a normal sale),

    and, in both cases, its own price is at least JUMP_MIN_PRICE. Only the
    higher sale of such a pair is ever flagged.

    This is an eager pass on purpose: the ``.shift().over()`` windows may not
    run under the streaming sink used by fuzzy_join_on_postcode, so the flags
    are computed here and joined back into the lazy pipeline by the caller.

    Returns:
        The flagged rows only — group keys, ``date_of_transfer``, ``price`` —
        de-duplicated on those four columns (so a join back on them cannot
        multiply rows) plus a constant ``_price_outlier`` = True column.
    """
    keys = ["_pp_group_address", "_pp_group_postcode"]
    price = pl.col("price")
    sold_on = pl.col("date_of_transfer")

    def _allowed_ratio(later: pl.Expr, earlier: pl.Expr) -> pl.Expr:
        # Gap in years, floored at six months so near-simultaneous transfers
        # do not end up with an exponent close to zero.
        years = ((later - earlier).dt.total_days() / 365.25).clip(lower_bound=0.5)
        return JUMP_TOLERANCE * pl.lit(JUMP_GROWTH_PER_YEAR).pow(years)

    def _neighbour(column: str, offset: int, name: str) -> pl.Expr:
        return pl.col(column).shift(offset).over(keys).alias(name)

    spiked_up = (price / pl.col("_prev_price")) > _allowed_ratio(
        sold_on, pl.col("_prev_date")
    )
    collapsed_after = (pl.col("_next_price") / price) < 1 / _allowed_ratio(
        pl.col("_next_date"), sold_on
    )
    # The first/last sale of a property has no previous/next neighbour, which
    # makes that comparison null; null must count as "not flagged".
    implausible = spiked_up.fill_null(False) | collapsed_after.fill_null(False)

    with_neighbours = slim.sort([*keys, "date_of_transfer"]).with_columns(
        _neighbour("price", 1, "_prev_price"),
        _neighbour("date_of_transfer", 1, "_prev_date"),
        _neighbour("price", -1, "_next_price"),
        _neighbour("date_of_transfer", -1, "_next_date"),
    )
    flagged = with_neighbours.filter(implausible & (price >= JUMP_MIN_PRICE))
    return (
        flagged.select(*keys, "date_of_transfer", "price")
        .unique()
        .with_columns(pl.lit(True).alias("_price_outlier"))
    )


def main():
    """Parse the command-line arguments and run the join in a scratch directory.

    The scratch directory is created under ``local_tmp_dir()`` and removed
    when the run finishes.
    """
    parser = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data")
    required_paths = (
        ("--epc", "EPC certificates CSV file or zip"),
        ("--price-paid", "Price paid parquet file"),
        ("--output", "Output parquet file path"),
    )
    for flag, help_text in required_paths:
        parser.add_argument(flag, type=Path, required=True, help=help_text)
    options = parser.parse_args()

    scratch = tempfile.TemporaryDirectory(
        prefix="epc_certificates_", dir=local_tmp_dir()
    )
    with scratch as tmpdir:
        _run(options.epc, options.price_paid, options.output, Path(tmpdir))


def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Path):
    """Build the per-property EPC x price-paid dataset and write it to Parquet.

    Steps, in order:

    1. Scan the EPC certificates and derive normalised address/postcode keys.
    2. Keep the newest certificate per property; separately derive a
       renovation-event history and a social-tenure flag from the full
       certificate stream and left-join both back on.
    3. Load price-paid, build a display address, collapse repeated lodgements
       of the same sale and flag implausible consecutive-sale price jumps.
    4. Aggregate price-paid to one row per property (price aggregations use
       quality-passing sales only).
    5. Fuzzy-join the two sides, derive the construction year, lower-case the
       column names and write the result.

    Args:
        epc_path: EPC certificates CSV, or a zip containing certificates CSVs.
        price_paid_path: Price-paid Parquet file.
        output_path: Destination Parquet file.
        temp_dir: Scratch directory for the intermediate EPC Parquet file.
    """
    epc_base = _scan_epc_certificates(epc_path, temp_dir).with_columns(
        normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
        normalize_postcode_key(pl.col("epc_postcode")).alias("_epc_match_postcode"),
    )

    # Dedup fork: keep latest certificate per property. inspection_date is a typed
    # Date (see _select_epc_columns); nulls_last keeps a real-dated cert ahead of a
    # null/unparseable-dated one so the genuinely newest certificate is chosen.
    epc = (
        epc_base.sort("inspection_date", descending=True, nulls_last=True)
        .group_by("_epc_match_address", "_epc_match_postcode")
        .first()
        .drop("tenure")
    )

    # Events fork: detect renovation events between consecutive certificates
    # Collect eagerly because .over() window functions don't work in streaming
    # engine (fuzzy_join.py:50 uses sink_parquet which requires streaming).
    events = (
        # Undated certificates cannot be placed on a property's timeline: the
        # ascending sort below puts nulls first, so they would be treated as
        # the earliest inspection and could yield events with a null year.
        # They are excluded from event detection only; the dedup fork above
        # still considers them.
        epc_base.filter(pl.col("inspection_date").is_not_null())
        .sort("inspection_date")
        .with_columns(
            pl.col("current_energy_rating")
            .replace_strict(RATING_RANK, default=None, return_dtype=pl.Int32)
            .alias("_rating_rank"),
        )
        .with_columns(
            pl.col("number_habitable_rooms")
            .shift(1)
            .over("_epc_match_address", "_epc_match_postcode")
            .alias("_prev_rooms"),
            pl.col("total_floor_area")
            .shift(1)
            .over("_epc_match_address", "_epc_match_postcode")
            .alias("_prev_area"),
            pl.col("_rating_rank")
            .shift(1)
            .over("_epc_match_address", "_epc_match_postcode")
            .alias("_prev_rating_rank"),
        )
        .with_columns(
            # First matching rule wins: a changed room count, then a larger
            # floor area, then an improved (lower-rank) energy rating.
            pl.when(
                pl.col("number_habitable_rooms").is_not_null()
                & pl.col("_prev_rooms").is_not_null()
                & (pl.col("number_habitable_rooms") != pl.col("_prev_rooms"))
            )
            .then(pl.lit("Remodelling"))
            .when(
                pl.col("total_floor_area").is_not_null()
                & pl.col("_prev_area").is_not_null()
                & (pl.col("total_floor_area") > pl.col("_prev_area"))
            )
            .then(pl.lit("Extension"))
            .when(
                pl.col("_rating_rank").is_not_null()
                & pl.col("_prev_rating_rank").is_not_null()
                & (pl.col("_rating_rank") < pl.col("_prev_rating_rank"))
            )
            .then(pl.lit("Renovation"))
            .otherwise(pl.lit(None, dtype=pl.String))
            .alias("_event"),
        )
        .filter(pl.col("_event").is_not_null())
        .with_columns(
            pl.col("inspection_date").dt.year().cast(pl.Int32).alias("_event_year"),
        )
        .group_by("_epc_match_address", "_epc_match_postcode")
        .agg(
            pl.struct(
                pl.col("_event_year").alias("year"),
                pl.col("_event").alias("event"),
            ).alias("renovation_history"),
        )
        .collect()
    )

    event_counts = (
        events["renovation_history"].explode().struct.field("event").value_counts()
    )
    print(f"Renovation events: {events.height} properties with events")
    print(event_counts)

    # Social tenure fork: flag properties that were ever social housing
    social_tenure = (
        epc_base.filter(pl.col("tenure").str.to_lowercase().str.contains("social"))
        .select("_epc_match_address", "_epc_match_postcode")
        .unique()
        .with_columns(pl.lit("Yes").alias("was_council_house"))
        .collect()
    )
    print(f"Former council houses (EPC social tenure): {social_tenure.height}")

    # Left-join events and social tenure back onto dedup EPC
    epc = (
        epc.join(
            events.lazy(),
            on=["_epc_match_address", "_epc_match_postcode"],
            how="left",
        )
        .join(
            social_tenure.lazy(),
            on=["_epc_match_address", "_epc_match_postcode"],
            how="left",
        )
        .with_columns(
            pl.col("was_council_house").fill_null("No"),
        )
    )

    print("EPC dataset")
    print(epc.head().collect())

    # https://www.gov.uk/guidance/about-the-price-paid-data
    property_type_map = {
        "D": "Detached",
        "S": "Semi-Detached",
        "T": "Terraced",
        "F": "Flats/Maisonettes",
        "O": "Other",
    }
    duration_map = {"F": "Freehold", "L": "Leasehold"}

    # price >= MIN_PRICE and ppd_category == "A" (standard open-market sale) are
    # VALUE-QUALITY filters: they gate the price aggregations only. Category B
    # entries (repossessions, bulk/portfolio, power-of-sale transfers), sub-MIN
    # sales and jump-flagged outliers must not pollute latest_price /
    # historical_prices (and the downstream price-per-sqm feature), but they
    # MUST still count for first_transfer_date / old_new so a new-build's
    # genuine earliest transfer year is preserved.
    price_ok = pl.col("price") >= MIN_PRICE
    category_ok = pl.col("ppd_category") == "A"
    value_ok = price_ok & category_ok
    # quality_ok additionally excludes consecutive-sale jump outliers (see
    # flag_price_outliers); _price_outlier exists only after the join below.
    quality_ok = value_ok & pl.col("_price_outlier").is_null()

    price_paid_base = (
        pl.scan_parquet(price_paid_path)
        .select(
            "price",
            "date_of_transfer",
            pl.col("property_type")
            .alias("pp_property_type")
            .replace(property_type_map),
            pl.col("postcode").str.strip_chars(),
            "paon",
            "saon",
            "street",
            "locality",
            "town_city",
            pl.col("duration").replace(duration_map),
            "old_new",
            "ppd_category",
        )
        .filter(pl.col("pp_property_type") != "Other")
        .with_columns(
            _join_address_parts("saon", "paon", "street").alias("pp_address"),
        )
        .with_columns(
            normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
            normalize_postcode_key(pl.col("postcode")).alias("_pp_match_postcode"),
        )
        .filter(pl.col("_pp_match_postcode").is_not_null())
        .with_columns(
            pl.coalesce("_pp_match_address", "pp_address").alias("_pp_group_address"),
            pl.col("_pp_match_postcode").alias("_pp_group_postcode"),
        )
        .filter(pl.col("pp_address").is_not_null())
        # Price-paid carries ~72k duplicate (address, postcode, date, price)
        # transaction groups with DISTINCT transaction ids — the same completed
        # sale lodged twice — which double-counted sales in historical_prices.
        # Collapse each to one row. ppd_category stays in the subset so an
        # A/B-categorised pair of the same sale survives as two rows; only the
        # A row feeds the price aggregations (quality_ok), which is intentional.
        .unique(
            subset=[
                "_pp_group_address",
                "_pp_group_postcode",
                "date_of_transfer",
                "price",
                "ppd_category",
            ],
            keep="any",
        )
    )

    # Bounded eager pass over the quality sales only (~30M rows x 4 narrow
    # columns): the window functions inside flag_price_outliers may not run
    # under the streaming sink used by fuzzy_join_on_postcode, so the outlier
    # flags are computed here and joined back into the lazy stream.
    outliers = flag_price_outliers(
        price_paid_base.filter(value_ok)
        .select(
            "_pp_group_address", "_pp_group_postcode", "date_of_transfer", "price"
        )
        .collect(engine="streaming")
    )
    print(f"Implausible consecutive-sale price jumps flagged: {outliers.height}")

    price_paid = (
        # Outlier rows stay in the stream (they still count for
        # first_transfer_date / old_new, same as category-B sales); quality_ok
        # merely drops them from the price aggregations. _price_outlier is not
        # aggregated below, so the helper column dies with the group_by.
        price_paid_base.join(
            outliers.lazy(),
            on=[
                "_pp_group_address",
                "_pp_group_postcode",
                "date_of_transfer",
                "price",
            ],
            how="left",
        )
        .sort("date_of_transfer")
        .group_by("_pp_group_address", "_pp_group_postcode", maintain_order=True)
        .agg(
            pl.col("pp_address").last(),
            pl.col("postcode").last(),
            pl.col("_pp_match_address").last(),
            pl.col("_pp_match_postcode").last(),
            # Price aggregations are restricted to quality-passing sales.
            pl.struct(
                pl.col("date_of_transfer").dt.year().alias("year"),
                pl.col("date_of_transfer").dt.month().cast(pl.UInt8).alias("month"),
                "price",
            )
            .filter(quality_ok)
            .alias("historical_prices"),
            pl.col("pp_property_type").last(),
            pl.col("duration").last(),
            pl.col("price").filter(quality_ok).last().alias("latest_price"),
            pl.col("date_of_transfer").filter(quality_ok).last(),
            # first_transfer_date / old_new reflect the genuine earliest transfer
            # over the full per-group transaction stream (not value-filtered).
            pl.col("date_of_transfer").first().alias("first_transfer_date"),
            pl.col("old_new").first(),
        )
        # Preserve the property universe: previously a property needed >=1 sale
        # >=MIN_PRICE to form a group, so drop groups with no quality-passing sale.
        .filter(pl.col("latest_price").is_not_null())
    )

    print("Price paid dataset")
    print(price_paid.head().collect())

    joined = (
        fuzzy_join_on_postcode(
            left=price_paid,
            right=epc,
            left_address_col="pp_address",
            right_address_col="epc_address",
            left_postcode_col="postcode",
            right_postcode_col="epc_postcode",
        )
        .drop("epc_postcode")
        # Audit trail: keep the fuzzy-match confidence (100 = exact address
        # match) in the published output; null means no EPC match.
        .rename({"_match_score": "epc_match_score"})
        .collect(engine="streaming")
    )

    matched = joined.filter(
        pl.col("epc_address").is_not_null() & pl.col("pp_address").is_not_null()
    )
    total = joined.height
    # An empty join result must not crash the run on the percentage.
    matched_pct = 100 * matched.height / total if total else 0.0
    print(f"Unique properties: {total}")
    print(f"Matched: {matched.height} ({matched_pct:.1f}%)")
    print(f"Unmatched: {total - matched.height}")

    # For new-builds (old_new == "Y"), use the first transaction date year as
    # the exact construction date; otherwise fall back to the EPC age band.
    epc_band_year = epc_band_to_year(pl.col("construction_age_band"))
    transfer_year = (
        pl.col("first_transfer_date").dt.year().cast(pl.UInt16, strict=False)
    )
    is_new_build = pl.col("old_new") == "Y"

    joined = joined.with_columns(
        pl.when(is_new_build & transfer_year.is_not_null())
        .then(transfer_year)
        .otherwise(epc_band_year)
        .alias("construction_age_band"),
        # 0 = exact (new-build transfer year), 1 = approximate (band-derived),
        # null = no construction year available.
        pl.when(is_new_build & transfer_year.is_not_null())
        .then(pl.lit(0, dtype=pl.UInt8))
        .when(epc_band_year.is_not_null())
        .then(pl.lit(1, dtype=pl.UInt8))
        .otherwise(pl.lit(None, dtype=pl.UInt8))
        .alias("is_construction_date_approximate"),
    ).drop(
        [
            "old_new",
            "first_transfer_date",
            "_pp_match_address",
            "_pp_match_postcode",
            "_pp_group_address",
            "_pp_group_postcode",
            "_epc_match_address",
            "_epc_match_postcode",
        ],
        strict=False,
    )

    joined = joined.rename({col: col.lower() for col in joined.columns})

    print(joined.head())
    joined.write_parquet(output_path)
    print(f"Wrote {output_path}")


# Run the CLI only when this module is executed as the main program, not when
# it is imported.
if __name__ == "__main__":
    main()