Improve data pipeline

2026-06-01 20:10:03 +01:00 · 2026-06-01 20:10:03 +01:00 · f99bd4e5c9
commit f99bd4e5c9
parent e8345cbdc1
36 changed files with 966 additions and 129 deletions
--- a/pipeline/transform/test_join_epc_pp.py
+++ b/pipeline/transform/test_join_epc_pp.py
@@ -169,7 +169,8 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
            "epc_address": "1 Example Street",
            "current_energy_rating": "C",
            "total_floor_area": 85.0,
-            "construction_age_band": 1950,
+            # Band midpoint of 1950-1966, not the lower bound.
+            "construction_age_band": 1958,
            "was_council_house": "Yes",
        }
    ]
@@ -256,3 +257,26 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
            "current_energy_rating": None,
        }
    ]
+
+
+def test_epc_band_to_year_uses_midpoint_and_clamps():
+    import polars as pl
+
+    from pipeline.transform.join_epc_pp import epc_band_to_year
+
+    df = pl.DataFrame(
+        {
+            "b": [
+                "England and Wales: 1950-1966",  # midpoint 1958
+                "1900-1929",  # midpoint 1914 (1914.5 rounded down)
+                "England and Wales: before 1900",  # too wide -> null
+                "2012 onwards",  # open-ended band -> start year 2012
+                "1012",  # implausible -> null
+                "2202",  # implausible -> null
+                None,  # null -> null
+                "1958",  # bare year string -> integer 1958
+            ]
+        }
+    )
+    years = df.select(epc_band_to_year(pl.col("b")).alias("y"))["y"].to_list()
+    assert years == [1958, 1914, None, 2012, None, None, None, 1958]