Improve data pipeline

This commit is contained in:
Andras Schmelczer 2026-06-01 20:10:03 +01:00
parent e8345cbdc1
commit f99bd4e5c9
36 changed files with 966 additions and 129 deletions

View file

@ -18,11 +18,49 @@ from ..utils import (
normalize_postcode_key,
)
# Show every column when Polars prints a table (a negative value lifts the cap).
pl.Config.set_tbl_cols(-1)

# Rating letter -> ordinal rank, "A" = 1 through "G" = 7.
RATING_RANK = {letter: rank for rank, letter in enumerate("ABCDEFG", start=1)}

# NOTE(review): presumably the lowest price treated as valid — confirm at the
# point of use.
MIN_PRICE = 50_000

# Inclusive bounds for a believable construction year. Band-derived years that
# fall outside them (OCR noise such as 1012 or 2202) are replaced with null
# instead of being published.
MIN_BUILD_YEAR = 1700
MAX_BUILD_YEAR = 2030
def epc_band_to_year(band: pl.Expr) -> pl.Expr:
    """Convert an EPC construction age band into one representative build year.

    The ``England and Wales: `` prefix and `` onwards`` suffix are stripped
    first; the remaining text is then resolved as follows:

    * text starting with ``before `` (an open-ended lower band such as
      ``before 1900``) is too wide to pin to a year and yields null;
    * two four-digit years separated by non-digits (``1950-1966``) yield the
      rounded band midpoint (1958) rather than the lower bound, so the result
      is not biased towards the start of the band;
    * a single four-digit year (``2012 onwards``, or an already-numeric year
      produced by an earlier call) yields that year;
    * text with no four-digit run yields null.

    Args:
        band: Expression holding the age band; it is cast to ``pl.Utf8``
            before parsing, so numeric inputs are accepted.

    Returns:
        A ``pl.UInt16`` expression with the representative year, or null when
        no year can be derived or the year lies outside
        [MIN_BUILD_YEAR, MAX_BUILD_YEAR].
    """
    # Reduce the raw band to its bare year text.
    cleaned = band.cast(pl.Utf8)
    cleaned = cleaned.str.replace("England and Wales: ", "")
    cleaned = cleaned.str.replace(" onwards", "")

    # First four-digit run, and the closing year of a "YYYY<non-digits>YYYY"
    # pair (null when the text holds only one year).
    first_year = cleaned.str.extract(r"(\d{4})", 1).cast(pl.Int32, strict=False)
    second_year = cleaned.str.extract(r"(\d{4})\D+(\d{4})", 2).cast(
        pl.Int32, strict=False
    )

    is_open_ended = cleaned.str.starts_with("before ")
    has_range = second_year.is_not_null()
    midpoint = ((first_year + second_year) / 2).round(0).cast(pl.Int32)

    candidate = (
        pl.when(is_open_ended)
        .then(None)
        .when(has_range)
        .then(midpoint)
        .otherwise(first_year)
    )

    # Drop implausible years before narrowing to the published dtype.
    plausible = (candidate >= MIN_BUILD_YEAR) & (candidate <= MAX_BUILD_YEAR)
    return (
        pl.when(plausible)
        .then(candidate)
        .otherwise(None)
        .cast(pl.UInt16, strict=False)
    )
EPC_SOURCE_COLUMNS = [
"address",
"postcode",
@ -410,13 +448,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
# For new-builds (old_new == "Y"), use the first transaction date year as
# the exact construction date; otherwise fall back to the EPC age band.
epc_band_year = (
pl.col("construction_age_band")
.str.replace("England and Wales: ", "")
.str.replace(" onwards", "")
.str.extract(r"(\d{4})", 1)
.cast(pl.UInt16, strict=False)
)
epc_band_year = epc_band_to_year(pl.col("construction_age_band"))
transfer_year = (
pl.col("first_transfer_date").dt.year().cast(pl.UInt16, strict=False)
)