Improve data pipeline

This commit is contained in:
Andras Schmelczer 2026-06-01 20:10:03 +01:00
parent e8345cbdc1
commit f99bd4e5c9
36 changed files with 966 additions and 129 deletions

View file

@ -169,7 +169,8 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
"epc_address": "1 Example Street",
"current_energy_rating": "C",
"total_floor_area": 85.0,
"construction_age_band": 1950,
# Band midpoint of 1950-1966, not the lower bound.
"construction_age_band": 1958,
"was_council_house": "Yes",
}
]
@ -256,3 +257,26 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
"current_energy_rating": None,
}
]
def test_epc_band_to_year_uses_midpoint_and_clamps():
    """Pin the year that ``epc_band_to_year`` yields for sample band strings.

    Each case pairs a raw band value with the year this test expects:
    ranges map to their midpoint, an "onwards" band maps to its single
    year, and open-ended, implausible, or missing values map to ``None``.
    """
    import polars as pl
    from pipeline.transform.join_epc_pp import epc_band_to_year

    # (raw band value, expected year) -- kept side by side so every
    # expectation is read next to the input that produces it.
    cases = [
        ("England and Wales: 1950-1966", 1958),  # midpoint of the range
        ("1900-1929", 1914),  # midpoint of the range
        ("England and Wales: before 1900", None),  # too wide -> null
        ("2012 onwards", 2012),  # single year
        ("1012", None),  # implausible -> null
        ("2202", None),  # implausible -> null
        (None, None),  # null -> null
        ("1958", 1958),  # already-numeric-as-string -> pass through
    ]
    bands = [band for band, _ in cases]
    expected_years = [year for _, year in cases]

    frame = pl.DataFrame({"b": bands})
    converted = frame.select(epc_band_to_year(pl.col("b")).alias("y"))

    assert converted["y"].to_list() == expected_years