LGTM

2026-05-14 08:09:19 +01:00 · 2026-05-14 08:09:19 +01:00 · a4103b0896
commit a4103b0896
parent a8165249a4
64 changed files with 5376 additions and 3832 deletions
--- a/pipeline/transform/test_join_epc_pp.py
+++ b/pipeline/transform/test_join_epc_pp.py
@@ -136,17 +136,17 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
    price_paid_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(
        {
-            "price": [250_000],
-            "date_of_transfer": [date(2024, 2, 3)],
-            "property_type": ["T"],
-            "postcode": ["AA1 1AA"],
-            "paon": ["1"],
-            "saon": [None],
-            "street": ["Example Street"],
-            "locality": [None],
-            "town_city": ["Exampletown"],
-            "duration": ["F"],
-            "old_new": ["N"],
+            "price": [200_000, 250_000],
+            "date_of_transfer": [date(2020, 2, 3), date(2024, 2, 3)],
+            "property_type": ["T", "T"],
+            "postcode": ["AA1 1AA", "AA1 1AA"],
+            "paon": ["1", "1"],
+            "saon": [None, None],
+            "street": ["Example-Street", "Example Street"],
+            "locality": [None, None],
+            "town_city": ["Exampletown", "Exampletown"],
+            "duration": ["F", "F"],
+            "old_new": ["N", "N"],
        }
    ).write_parquet(price_paid_path)

@@ -172,3 +172,85 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
        }
    ]
    assert df.get_column("renovation_history").list.len().to_list() == [1]
+    assert df.get_column("historical_prices").list.len().to_list() == [2]
+
+
+def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
+    zip_path = tmp_path / "domestic-csv.zip"
+    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
+        csv_buffer = io.StringIO()
+        writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
+        writer.writeheader()
+        writer.writerow(_row())
+        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())
+
+    price_paid_path = tmp_path / "price-paid.parquet"
+    pl.DataFrame(
+        {
+            "price": [250_000, 300_000],
+            "date_of_transfer": [date(2024, 2, 3), date(2024, 2, 4)],
+            "property_type": ["T", "T"],
+            "postcode": ["AA1 1AA", ""],
+            "paon": ["1", "2"],
+            "saon": [None, None],
+            "street": ["Example Street", "Example Street"],
+            "locality": [None, None],
+            "town_city": ["Exampletown", "Exampletown"],
+            "duration": ["F", "F"],
+            "old_new": ["N", "N"],
+        }
+    ).write_parquet(price_paid_path)
+
+    output_path = tmp_path / "epc-pp.parquet"
+    _run(zip_path, price_paid_path, output_path, tmp_path)
+
+    df = pl.read_parquet(output_path)
+
+    assert df["postcode"].to_list() == ["AA1 1AA"]
+
+
+def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path):
+    zip_path = tmp_path / "domestic-csv.zip"
+    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
+        csv_buffer = io.StringIO()
+        writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
+        writer.writeheader()
+        writer.writerow(_row(address="1 Totally Different Road"))
+        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())
+
+    price_paid_path = tmp_path / "price-paid.parquet"
+    pl.DataFrame(
+        {
+            "price": [250_000],
+            "date_of_transfer": [date(2024, 2, 3)],
+            "property_type": ["T"],
+            "postcode": ["AA1 1AA"],
+            "paon": ["1"],
+            "saon": [None],
+            "street": ["Example Street"],
+            "locality": [None],
+            "town_city": ["Exampletown"],
+            "duration": ["F"],
+            "old_new": ["N"],
+        }
+    ).write_parquet(price_paid_path)
+
+    output_path = tmp_path / "epc-pp.parquet"
+    _run(zip_path, price_paid_path, output_path, tmp_path)
+
+    df = pl.read_parquet(output_path)
+
+    assert df.height == 1
+    assert df.select(
+        "pp_address",
+        "epc_address",
+        "total_floor_area",
+        "current_energy_rating",
+    ).to_dicts() == [
+        {
+            "pp_address": "1 Example Street",
+            "epc_address": None,
+            "total_floor_area": None,
+            "current_energy_rating": None,
+        }
+    ]