Improve data pipeline
This commit is contained in:
parent
e8345cbdc1
commit
f99bd4e5c9
36 changed files with 966 additions and 129 deletions
|
|
@ -169,7 +169,8 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
|
|||
"epc_address": "1 Example Street",
|
||||
"current_energy_rating": "C",
|
||||
"total_floor_area": 85.0,
|
||||
"construction_age_band": 1950,
|
||||
# Band midpoint of 1950-1966, not the lower bound.
|
||||
"construction_age_band": 1958,
|
||||
"was_council_house": "Yes",
|
||||
}
|
||||
]
|
||||
|
|
@ -256,3 +257,26 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
|
|||
"current_energy_rating": None,
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def test_epc_band_to_year_uses_midpoint_and_clamps():
    """Check ``epc_band_to_year`` against one sample of each band shape.

    Each case pairs a raw band value with the year the expression is
    expected to produce. The cases cover a prefixed range, a bare range,
    an open-ended "before" band, an "onwards" band, two implausible
    years, a null, and a year already given as a numeric string.
    """
    import polars as pl

    from pipeline.transform.join_epc_pp import epc_band_to_year

    # (raw band, expected year) pairs, kept side by side so each input can
    # be read next to its expectation.
    cases = [
        ("England and Wales: 1950-1966", 1958),  # midpoint 1958
        ("1900-1929", 1914),  # midpoint 1914
        ("England and Wales: before 1900", None),  # too wide -> null
        ("2012 onwards", 2012),  # single year
        ("1012", None),  # implausible -> null
        ("2202", None),  # implausible -> null
        (None, None),  # null -> null
        ("1958", 1958),  # already-numeric-as-string -> pass through
    ]
    bands = [band for band, _ in cases]
    expected_years = [year for _, year in cases]

    frame = pl.DataFrame({"b": bands})
    converted = frame.select(epc_band_to_year(pl.col("b")).alias("y"))
    years = converted["y"].to_list()

    assert years == expected_years
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue