Fix data pipelines once and for all

2026-06-10 21:27:32 +01:00 · 2026-06-10 21:27:32 +01:00 · 4012e4e047
commit 4012e4e047
parent 08560476c5
46 changed files with 4508 additions and 855 deletions
--- a/pipeline/transform/test_join_epc_pp.py
+++ b/pipeline/transform/test_join_epc_pp.py
@@ -8,6 +8,7 @@ import polars as pl

 from pipeline.transform.join_epc_pp import (
    EPC_SOURCE_COLUMNS,
+    _join_address_parts,
    _run,
    _scan_epc_certificates,
 )
@@ -111,6 +112,89 @@ def test_scan_epc_certificates_supports_domestic_zip(tmp_path: Path):
    assert df.schema["number_habitable_rooms"] == pl.Int16


+def test_join_address_parts_empty_string_components():
+    # Price-paid SAON/PAON/STREET hold empty strings (not nulls) when absent.
+    # concat_str(ignore_nulls=True) alone therefore leaked the separator into
+    # the display address (' 10 PALACE GREEN') and doubled it around an empty
+    # middle component. Empty/whitespace-only parts must contribute nothing.
+    df = pl.DataFrame(
+        {
+            "saon": ["", "FLAT 1", "FLAT 1", "FLAT 21", "", None, "  ", " FLAT 2"],
+            "paon": ["10", "10", "", "82", "", None, "10", "11 "],
+            "street": [
+                "PALACE GREEN",
+                "HIGH STREET",
+                "HIGH STREET",
+                "",
+                "",
+                None,
+                "PALACE GREEN",
+                "STATION ROAD",
+            ],
+        }
+    )
+    out = df.select(
+        _join_address_parts("saon", "paon", "street").alias("address")
+    ).get_column("address")
+
+    assert out.to_list() == [
+        "10 PALACE GREEN",  # empty saon -> no leading space
+        "FLAT 1 10 HIGH STREET",  # normal three-part address is unchanged
+        "FLAT 1 HIGH STREET",  # empty middle component -> no double space
+        "FLAT 21 82",  # empty street -> no trailing space
+        None,  # all-empty -> null, not whitespace junk
+        None,  # all-null -> null
+        "10 PALACE GREEN",  # whitespace-only component treated as empty
+        "FLAT 2 11 STATION ROAD",  # per-component padding is stripped
+    ]
+    # Invariant: every produced address is trimmed and single-spaced.
+    produced = out.drop_nulls()  # the two null rows have no text to inspect
+    assert produced.str.starts_with(" ").sum() == 0
+    assert produced.str.ends_with(" ").sum() == 0
+    assert produced.str.contains("  ", literal=True).sum() == 0
+
+
+def test_run_builds_clean_pp_address_from_empty_string_saon(tmp_path: Path):
+    # Real price-paid rows carry saon == "" (not null) on ~88% of rows; the
+    # published pp_address must not inherit a leading separator from it.
+    zip_path = tmp_path / "domestic-csv.zip"
+    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
+        csv_buffer = io.StringIO()
+        writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
+        writer.writeheader()
+        writer.writerow(_row())  # _row() lives outside this hunk; presumably the matching EPC row - confirm
+        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())
+
+    price_paid_path = tmp_path / "price-paid.parquet"
+    pl.DataFrame(
+        {
+            "price": [250_000],
+            "date_of_transfer": [date(2024, 2, 3)],
+            "property_type": ["T"],
+            "postcode": ["AA1 1AA"],
+            "paon": ["1"],
+            "saon": [""],  # empty string, not null: the case under test
+            "street": ["Example Street"],
+            "locality": [""],
+            "town_city": ["Exampletown"],
+            "duration": ["F"],
+            "old_new": ["N"],
+            "ppd_category": ["A"],
+        }
+    ).write_parquet(price_paid_path)
+
+    output_path = tmp_path / "epc-pp.parquet"
+    _run(zip_path, price_paid_path, output_path, tmp_path)
+
+    df = pl.read_parquet(output_path)
+
+    assert df.height == 1
+    # No leading space, and the clean address still matches its EPC record.
+    assert df.select("pp_address", "epc_address").to_dicts() == [
+        {"pp_address": "1 Example Street", "epc_address": "1 Example Street"}
+    ]
+
+
 def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive: