# perfect-postcode/pipeline/transform/test_join_epc_pp.py

import csv
import io
import zipfile
from datetime import date
from pathlib import Path

import polars as pl

from pipeline.transform.join_epc_pp import (
    EPC_SOURCE_COLUMNS,
    _join_address_parts,
    _run,
    _scan_epc_certificates,
)


def _write_csv(path: Path, fieldnames: list[str], rows: list[dict[str, str]]) -> None:
    """Write ``rows`` to ``path`` as a CSV file preceded by a header line.

    Args:
        path: Destination file; it is created or truncated.
        fieldnames: Column names, written as the header and used to order
            the values of every row.
        rows: One mapping of column name to value per record.
    """
    # Pin the encoding so the fixture bytes do not depend on the platform's
    # locale default; newline="" leaves line endings to the csv module.
    with path.open("w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def _row(**overrides: str) -> dict[str, str]:
    """Return a baseline EPC certificate record with ``overrides`` applied.

    Every value is a string, as it would appear in a source CSV. The default
    postcode is lower-case and padded with spaces, and the default energy
    ratings are lower-case. Keyword arguments replace the matching default;
    unknown keys are appended after the defaults.
    """
    defaults = dict(
        address="1 Example Street",
        postcode=" aa1 1aa ",
        uprn="100012345678",
        current_energy_rating="c",
        potential_energy_rating="b",
        property_type="House",
        built_form="Mid-Terrace",
        inspection_date="2024-01-02",
        total_floor_area="84.5",
        number_habitable_rooms="5",
        floor_height="2.4",
        construction_age_band="England and Wales: 1950-1966",
        tenure="owner-occupied",
    )
    return {**defaults, **overrides}


def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path):
    """A plain CSV with upper-case headers is scanned into the normalised schema.

    The expected record has a trimmed, upper-cased postcode, upper-cased
    energy ratings, a parsed inspection date, and a habitable-room count of
    "0" read back as null.
    """
    legacy_csv = tmp_path / "certificates.csv"
    upper_header = [name.upper() for name in EPC_SOURCE_COLUMNS]
    record = {
        name.upper(): value
        for name, value in _row(number_habitable_rooms="0").items()
    }
    _write_csv(legacy_csv, upper_header, [record])

    scanned = _scan_epc_certificates(legacy_csv, tmp_path).collect()

    expected = {
        "epc_address": "1 Example Street",
        "epc_postcode": "AA1 1AA",
        "uprn": "100012345678",
        "current_energy_rating": "C",
        "potential_energy_rating": "B",
        "epc_property_type": "House",
        "built_form": "Mid-Terrace",
        "inspection_date": date(2024, 1, 2),
        "total_floor_area": 84.5,
        "number_habitable_rooms": None,
        "floor_height": 2.4,
        "construction_age_band": "England and Wales: 1950-1966",
        "tenure": "owner-occupied",
    }
    assert scanned.to_dicts() == [expected]


def test_scan_epc_certificates_supports_domestic_zip(tmp_path: Path):
    """Certificate CSVs inside a zip archive are scanned into one frame.

    The archive holds one certificates file at its root, one in a nested
    folder, and a recommendations file whose row is expected not to appear
    in the result.
    """
    # Archive member name -> certificate rows stored in that member.
    certificate_members = {
        "certificates-2023.csv": [
            _row(address="2 Example Street", inspection_date="2023-03-04")
        ],
        "nested/certificates-2024.csv": [
            _row(
                address="3 Example Street",
                postcode="BB2 2BB",
                inspection_date="2024-05-06",
                total_floor_area="",
                tenure="Rented (social)",
            )
        ],
    }

    epc_zip = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(epc_zip, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        for member_name, records in certificate_members.items():
            lines = [",".join(EPC_SOURCE_COLUMNS)]
            for record in records:
                lines.append(",".join(record[name] for name in EPC_SOURCE_COLUMNS))
            archive.writestr(member_name, "\n".join(lines) + "\n")
        archive.writestr("recommendations-2024.csv", "address,postcode\nignored,X\n")

    scanned = _scan_epc_certificates(epc_zip, tmp_path).sort("inspection_date").collect()

    expected_rows = [
        {
            "epc_address": "2 Example Street",
            "epc_postcode": "AA1 1AA",
            "total_floor_area": 84.5,
        },
        {
            "epc_address": "3 Example Street",
            "epc_postcode": "BB2 2BB",
            # The blank floor-area field is expected to come back as null.
            "total_floor_area": None,
        },
    ]
    selected = scanned.select("epc_address", "epc_postcode", "total_floor_area")
    assert selected.to_dicts() == expected_rows
    assert scanned.get_column("tenure").to_list() == [
        "owner-occupied",
        "Rented (social)",
    ]
    assert scanned.schema["number_habitable_rooms"] == pl.Int16


def test_join_address_parts_empty_string_components():
    """Empty, blank and null address parts must add nothing to the result."""
    # Price-paid SAON/PAON/STREET are empty strings (not null) when absent;
    # concat_str(ignore_nulls=True) alone leaked the separator into the
    # display address (' 10 PALACE GREEN') and doubled it for empty middle
    # components. Empty/whitespace-only parts must contribute nothing.
    cases = [
        # (saon, paon, street, expected address)
        # empty saon -> no leading space
        ("", "10", "PALACE GREEN", "10 PALACE GREEN"),
        # normal three-part address is unchanged
        ("FLAT 1", "10", "HIGH STREET", "FLAT 1 10 HIGH STREET"),
        # empty middle component -> no double space
        ("FLAT 1", "", "HIGH STREET", "FLAT 1 HIGH STREET"),
        # empty street -> no trailing space
        ("FLAT 21", "82", "", "FLAT 21 82"),
        # all-empty -> null, not whitespace junk
        ("", "", "", None),
        # all-null -> null
        (None, None, None, None),
        # whitespace-only component treated as empty
        ("  ", "10", "PALACE GREEN", "10 PALACE GREEN"),
        # per-component padding is stripped
        (" FLAT 2", "11 ", "STATION ROAD", "FLAT 2 11 STATION ROAD"),
    ]
    # Transpose the row-wise cases into one list per column.
    saon, paon, street, expected = (list(column) for column in zip(*cases))

    frame = pl.DataFrame({"saon": saon, "paon": paon, "street": street})
    joined = frame.select(
        _join_address_parts("saon", "paon", "street").alias("address")
    )
    addresses = joined.get_column("address")

    assert addresses.to_list() == expected

    # Invariant: every produced address is trimmed and single-spaced.
    non_null = addresses.drop_nulls()
    assert non_null.str.starts_with(" ").sum() == 0
    assert non_null.str.ends_with(" ").sum() == 0
    assert non_null.str.contains("  ", literal=True).sum() == 0


def test_run_builds_clean_pp_address_from_empty_string_saon(tmp_path: Path):
    """An empty-string SAON must not leave a leading space in pp_address."""
    # Real price-paid rows carry saon == "" (not null) on ~88% of rows; the
    # published pp_address must not inherit a leading separator from it.
    epc_csv = io.StringIO()
    epc_writer = csv.DictWriter(epc_csv, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    epc_writer.writerow(_row())

    epc_zip = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(epc_zip, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", epc_csv.getvalue())

    # A single sale whose SAON and locality are empty strings rather than null.
    sales = {
        "price": [250_000],
        "date_of_transfer": [date(2024, 2, 3)],
        "property_type": ["T"],
        "postcode": ["AA1 1AA"],
        "paon": ["1"],
        "saon": [""],
        "street": ["Example Street"],
        "locality": [""],
        "town_city": ["Exampletown"],
        "duration": ["F"],
        "old_new": ["N"],
        "ppd_category": ["A"],
    }
    sales_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(sales).write_parquet(sales_path)

    result_path = tmp_path / "epc-pp.parquet"
    _run(epc_zip, sales_path, result_path, tmp_path)
    result = pl.read_parquet(result_path)

    assert result.height == 1
    # No leading space, and the clean address still matches its EPC record.
    expected = {"pp_address": "1 Example Street", "epc_address": "1 Example Street"}
    assert result.select("pp_address", "epc_address").to_dicts() == [expected]


def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
    """Two certificates and two sales of one property yield one joined row.

    The expected row carries the rating and floor area of the 2024
    certificate and a two-entry price history.
    """
    certificates = [
        _row(
            current_energy_rating="d",
            inspection_date="2023-01-01",
            total_floor_area="80",
            tenure="Rented (social)",
        ),
        _row(
            current_energy_rating="c",
            inspection_date="2024-01-01",
            total_floor_area="85",
            tenure="owner-occupied",
        ),
    ]
    epc_csv = io.StringIO()
    epc_writer = csv.DictWriter(epc_csv, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    for certificate in certificates:
        epc_writer.writerow(certificate)

    epc_zip = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(epc_zip, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", epc_csv.getvalue())

    sale_count = 2
    sales = {
        "price": [200_000, 250_000],
        "date_of_transfer": [date(2020, 2, 3), date(2024, 2, 3)],
        "property_type": ["T"] * sale_count,
        "postcode": ["AA1 1AA"] * sale_count,
        "paon": ["1"] * sale_count,
        "saon": [None] * sale_count,
        # The older sale spells the street with a hyphen.
        "street": ["Example-Street", "Example Street"],
        "locality": [None] * sale_count,
        "town_city": ["Exampletown"] * sale_count,
        "duration": ["F"] * sale_count,
        "old_new": ["N"] * sale_count,
        "ppd_category": ["A"] * sale_count,
    }
    sales_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(sales).write_parquet(sales_path)

    result_path = tmp_path / "epc-pp.parquet"
    _run(epc_zip, sales_path, result_path, tmp_path)
    result = pl.read_parquet(result_path)

    assert result.height == 1

    expected = {
        "epc_address": "1 Example Street",
        "current_energy_rating": "C",
        "total_floor_area": 85.0,
        # Band midpoint of 1950-1966, not the lower bound.
        "construction_age_band": 1958,
        "was_council_house": "Yes",
    }
    checked = result.select(
        "epc_address",
        "current_energy_rating",
        "total_floor_area",
        "construction_age_band",
        "was_council_house",
    )
    assert checked.to_dicts() == [expected]

    renovation_lengths = result.get_column("renovation_history").list.len()
    price_lengths = result.get_column("historical_prices").list.len()
    assert renovation_lengths.to_list() == [1]
    assert price_lengths.to_list() == [2]


def test_run_dedup_prefers_valid_dated_cert_over_garbled_date(tmp_path: Path):
    """A certificate with an unparseable date must lose the dedup."""
    # Two certificates for the same property. The cert with the garbled,
    # unparseable inspection_date must NOT be chosen as "latest": a string sort
    # nulls-first would have picked it, attaching a stale rating/floor area. The
    # valid-dated cert wins, so its rating ("C") and floor area (85) survive.
    valid_cert = _row(
        current_energy_rating="c",
        inspection_date="2024-01-01",
        total_floor_area="85",
    )
    # Same property; an unparseable date (OCR/garbled). Under a raw string
    # descending sort "not-a-date" outranks the ISO date and wins the dedup,
    # but as a null Date it loses.
    garbled_cert = _row(
        current_energy_rating="g",
        inspection_date="not-a-date",
        total_floor_area="40",
    )

    epc_csv = io.StringIO()
    epc_writer = csv.DictWriter(epc_csv, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    epc_writer.writerows([valid_cert, garbled_cert])

    epc_zip = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(epc_zip, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", epc_csv.getvalue())

    sales = {
        "price": [250_000],
        "date_of_transfer": [date(2024, 2, 3)],
        "property_type": ["T"],
        "postcode": ["AA1 1AA"],
        "paon": ["1"],
        "saon": [None],
        "street": ["Example Street"],
        "locality": [None],
        "town_city": ["Exampletown"],
        "duration": ["F"],
        "old_new": ["N"],
        "ppd_category": ["A"],
    }
    sales_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(sales).write_parquet(sales_path)

    result_path = tmp_path / "epc-pp.parquet"
    _run(epc_zip, sales_path, result_path, tmp_path)
    result = pl.read_parquet(result_path)

    assert result.height == 1
    # The valid-dated cert's facts are kept; the garbled-date cert is NOT chosen.
    kept_facts = result.select("current_energy_rating", "total_floor_area")
    assert kept_facts.to_dicts() == [
        {"current_energy_rating": "C", "total_floor_area": 85.0}
    ]


def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
    """A sale whose postcode is an empty string must not reach the output."""
    epc_csv = io.StringIO()
    epc_writer = csv.DictWriter(epc_csv, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    epc_writer.writerow(_row())

    epc_zip = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(epc_zip, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", epc_csv.getvalue())

    sale_count = 2
    sales = {
        "price": [250_000, 300_000],
        "date_of_transfer": [date(2024, 2, 3), date(2024, 2, 4)],
        "property_type": ["T"] * sale_count,
        # The second sale has no postcode at all.
        "postcode": ["AA1 1AA", ""],
        "paon": ["1", "2"],
        "saon": [None] * sale_count,
        "street": ["Example Street"] * sale_count,
        "locality": [None] * sale_count,
        "town_city": ["Exampletown"] * sale_count,
        "duration": ["F"] * sale_count,
        "old_new": ["N"] * sale_count,
        "ppd_category": ["A"] * sale_count,
    }
    sales_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(sales).write_parquet(sales_path)

    result_path = tmp_path / "epc-pp.parquet"
    _run(epc_zip, sales_path, result_path, tmp_path)
    result = pl.read_parquet(result_path)

    surviving_postcodes = result.get_column("postcode").to_list()
    assert surviving_postcodes == ["AA1 1AA"]


def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path):
    """A certificate at a different street must not donate its EPC facts.

    The sale row is still expected in the output, with its EPC-derived
    columns left null.
    """
    epc_csv = io.StringIO()
    epc_writer = csv.DictWriter(epc_csv, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    # Same postcode as the sale, but a different street.
    epc_writer.writerow(_row(address="1 Totally Different Road"))

    epc_zip = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(epc_zip, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", epc_csv.getvalue())

    sales = {
        "price": [250_000],
        "date_of_transfer": [date(2024, 2, 3)],
        "property_type": ["T"],
        "postcode": ["AA1 1AA"],
        "paon": ["1"],
        "saon": [None],
        "street": ["Example Street"],
        "locality": [None],
        "town_city": ["Exampletown"],
        "duration": ["F"],
        "old_new": ["N"],
        "ppd_category": ["A"],
    }
    sales_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(sales).write_parquet(sales_path)

    result_path = tmp_path / "epc-pp.parquet"
    _run(epc_zip, sales_path, result_path, tmp_path)
    result = pl.read_parquet(result_path)

    assert result.height == 1

    expected = {
        "pp_address": "1 Example Street",
        "epc_address": None,
        "total_floor_area": None,
        "current_energy_rating": None,
    }
    checked = result.select(
        "pp_address",
        "epc_address",
        "total_floor_area",
        "current_energy_rating",
    )
    assert checked.to_dicts() == [expected]


def test_run_excludes_category_b_sales_from_price_aggregations(tmp_path: Path):
    """Category B sales must stay out of latest_price and historical_prices."""
    # Category B entries (repossessions, bulk/portfolio, power-of-sale) must not
    # pollute latest_price / historical_prices, but the property still survives
    # via its standard Category A sales.
    epc_csv = io.StringIO()
    epc_writer = csv.DictWriter(epc_csv, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    epc_writer.writerow(_row())

    epc_zip = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(epc_zip, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", epc_csv.getvalue())

    sale_count = 3
    sales = {
        "price": [200_000, 250_000, 5_000_000],
        "date_of_transfer": [date(2020, 2, 3), date(2022, 2, 3), date(2024, 2, 3)],
        "property_type": ["T"] * sale_count,
        "postcode": ["AA1 1AA"] * sale_count,
        "paon": ["1"] * sale_count,
        "saon": [None] * sale_count,
        "street": ["Example Street"] * sale_count,
        "locality": [None] * sale_count,
        "town_city": ["Exampletown"] * sale_count,
        "duration": ["F"] * sale_count,
        "old_new": ["N"] * sale_count,
        # The latest (5M) sale is a Category B bulk/portfolio transfer.
        "ppd_category": ["A", "A", "B"],
    }
    sales_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(sales).write_parquet(sales_path)

    result_path = tmp_path / "epc-pp.parquet"
    _run(epc_zip, sales_path, result_path, tmp_path)
    result = pl.read_parquet(result_path)

    assert result.height == 1
    # Only the two Category A sales survive; the 5M Category B transfer is dropped.
    latest_prices = result.get_column("latest_price").to_list()
    history_lengths = result.get_column("historical_prices").list.len().to_list()
    assert latest_prices == [250_000]
    assert history_lengths == [2]


def test_run_new_build_keeps_early_first_transfer_when_sub_min_price(tmp_path: Path):
    """A sub-MIN_PRICE first sale still dates a new-build's construction."""
    # A new-build whose earliest sale is below MIN_PRICE must still take that early
    # year as its EXACT construction date, while latest_price uses only the
    # quality-passing (>=MIN_PRICE) sale.
    epc_csv = io.StringIO()
    epc_writer = csv.DictWriter(epc_csv, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    epc_writer.writerow(_row())

    epc_zip = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(epc_zip, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", epc_csv.getvalue())

    sale_count = 2
    sales = {
        # 5_000 is below MIN_PRICE (10_000) — a nominal/junk transfer that
        # must still anchor the construction year but stay out of the price
        # aggregations.
        "price": [5_000, 300_000],
        "date_of_transfer": [date(2015, 2, 3), date(2022, 2, 3)],
        "property_type": ["T"] * sale_count,
        "postcode": ["AA1 1AA"] * sale_count,
        "paon": ["1"] * sale_count,
        "saon": [None] * sale_count,
        "street": ["Example Street"] * sale_count,
        "locality": [None] * sale_count,
        "town_city": ["Exampletown"] * sale_count,
        "duration": ["F"] * sale_count,
        "old_new": ["Y"] * sale_count,
        "ppd_category": ["A"] * sale_count,
    }
    sales_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(sales).write_parquet(sales_path)

    result_path = tmp_path / "epc-pp.parquet"
    _run(epc_zip, sales_path, result_path, tmp_path)
    result = pl.read_parquet(result_path)

    def column_values(name: str) -> list:
        """Return the named output column as a plain list."""
        return result.get_column(name).to_list()

    assert result.height == 1
    # Construction year is the genuine earliest transfer (2015), flagged EXACT,
    # even though that sale is below MIN_PRICE.
    assert column_values("construction_age_band") == [2015]
    assert column_values("is_construction_date_approximate") == [0]
    # latest_price uses only the >=MIN_PRICE sale; the sub-MIN sale is excluded.
    assert column_values("latest_price") == [300_000]
    assert result.get_column("historical_prices").list.len().to_list() == [1]


def test_run_keeps_sale_above_lowered_min_price(tmp_path: Path):
    """A 30_000 sale must be retained in the price aggregations."""
    # A genuine cheap sale of 30_000 sits between the OLD floor (50k) and the
    # NEW floor (10k): it must now be RETAINED in the price aggregations. This
    # pins the 50k->10k change — it fails on the pre-fix 50k floor (where 30k was
    # excluded, giving historical_prices length 1 / latest_price 250_000).
    epc_csv = io.StringIO()
    epc_writer = csv.DictWriter(epc_csv, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    epc_writer.writerow(_row())

    epc_zip = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(epc_zip, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", epc_csv.getvalue())

    sale_count = 2
    sales = {
        "price": [250_000, 30_000],
        "date_of_transfer": [date(2018, 2, 3), date(2022, 2, 3)],
        "property_type": ["T"] * sale_count,
        "postcode": ["AA1 1AA"] * sale_count,
        "paon": ["1"] * sale_count,
        "saon": [None] * sale_count,
        "street": ["Example Street"] * sale_count,
        "locality": [None] * sale_count,
        "town_city": ["Exampletown"] * sale_count,
        "duration": ["F"] * sale_count,
        "old_new": ["N"] * sale_count,
        "ppd_category": ["A"] * sale_count,
    }
    sales_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(sales).write_parquet(sales_path)

    result_path = tmp_path / "epc-pp.parquet"
    _run(epc_zip, sales_path, result_path, tmp_path)
    result = pl.read_parquet(result_path)

    assert result.height == 1
    # Both sales now survive the 10k floor; the 30_000 (2022) is the most recent.
    history_lengths = result.get_column("historical_prices").list.len().to_list()
    latest_prices = result.get_column("latest_price").to_list()
    assert history_lengths == [2]
    assert latest_prices == [30_000]


def test_epc_band_to_year_uses_midpoint_and_clamps():
    """Check the year that ``epc_band_to_year`` derives from each band label.

    Each case pairs a raw ``construction_age_band`` value with the year the
    expression is expected to produce, or ``None`` when no year is expected.
    """
    # `pl` comes from the module-level polars import; only the function under
    # test is imported locally.
    from pipeline.transform.join_epc_pp import epc_band_to_year

    cases = [
        ("England and Wales: 1950-1966", 1958),  # midpoint 1958
        ("1900-1929", 1914),  # midpoint 1914
        ("England and Wales: before 1900", None),  # too wide -> null
        ("2012 onwards", 2012),  # single year
        ("1012", None),  # implausible -> null
        ("2202", None),  # implausible -> null
        (None, None),  # null -> null
        ("1958", 1958),  # already-numeric-as-string -> pass through
    ]
    bands = [band for band, _ in cases]
    expected_years = [year for _, year in cases]

    df = pl.DataFrame({"b": bands})
    years = df.select(epc_band_to_year(pl.col("b")).alias("y"))["y"].to_list()
    assert years == expected_years