Improve data pipeline
This commit is contained in:
parent
e8345cbdc1
commit
f99bd4e5c9
36 changed files with 966 additions and 129 deletions
|
|
@ -18,11 +18,49 @@ from ..utils import (
|
|||
normalize_postcode_key,
|
||||
)
|
||||
|
||||
|
||||
# Display setting for printed polars tables; a negative column count is
# understood to mean "do not truncate columns" (see polars Config docs).
pl.Config.set_tbl_cols(-1)

# EPC rating letters mapped to an ordinal rank, A = 1 through G = 7.
RATING_RANK = dict(zip("ABCDEFG", range(1, 8)))

MIN_PRICE = 50_000

# Inclusive bounds on a believable construction year. Band-derived years that
# fall outside them (e.g. OCR noise like 1012 or 2202) are nulled rather than
# published.
MIN_BUILD_YEAR, MAX_BUILD_YEAR = 1700, 2030
|
||||
|
||||
|
||||
def epc_band_to_year(band: pl.Expr) -> pl.Expr:
    """Turn an EPC construction age band into one representative build year.

    The input is cast to text, then the ``England and Wales: `` prefix and a
    `` onwards`` suffix are removed. The result is chosen as follows:

    * text starting with ``before `` (an open-ended lower band) -> null;
    * two four-digit years separated by non-digits (e.g. ``1950-1966``) ->
      the midpoint of the two, rounded to a whole year (1958), not the
      lower bound;
    * a single four-digit year (including ``... onwards`` bands and values
      that are already a plain year) -> that year;
    * anything else -> null.

    Finally, any year outside the inclusive range
    [MIN_BUILD_YEAR, MAX_BUILD_YEAR] is replaced by null.

    Args:
        band: Expression yielding the raw construction age band values.

    Returns:
        An expression of representative build years cast to ``UInt16``,
        null where no plausible year could be derived.
    """
    # Strip the regional prefix and the "onwards" suffix so only the year
    # information (or a "before ..." marker) is left.
    cleaned = band.cast(pl.Utf8).str.replace("England and Wales: ", "")
    cleaned = cleaned.str.replace(" onwards", "")

    # First four-digit run in the text; null when there is none.
    first_year = cleaned.str.extract(r"(\d{4})", 1).cast(pl.Int32, strict=False)
    # Second four-digit run, present only for closed ranges such as 1950-1966.
    second_year = cleaned.str.extract(r"(\d{4})\D+(\d{4})", 2).cast(
        pl.Int32, strict=False
    )

    is_open_ended = cleaned.str.starts_with("before ")
    has_upper_bound = second_year.is_not_null()
    midpoint = ((first_year + second_year) / 2).round(0).cast(pl.Int32)

    candidate = (
        pl.when(is_open_ended)
        .then(None)
        .when(has_upper_bound)
        .then(midpoint)
        .otherwise(first_year)
    )

    # Null out implausible years before narrowing to the output dtype.
    is_plausible = (candidate >= MIN_BUILD_YEAR) & (candidate <= MAX_BUILD_YEAR)
    plausible_year = pl.when(is_plausible).then(candidate).otherwise(None)
    return plausible_year.cast(pl.UInt16, strict=False)
|
||||
|
||||
|
||||
EPC_SOURCE_COLUMNS = [
|
||||
"address",
|
||||
"postcode",
|
||||
|
|
@ -410,13 +448,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
|
||||
# For new-builds (old_new == "Y"), use the first transaction date year as
|
||||
# the exact construction date; otherwise fall back to the EPC age band.
|
||||
epc_band_year = (
|
||||
pl.col("construction_age_band")
|
||||
.str.replace("England and Wales: ", "")
|
||||
.str.replace(" onwards", "")
|
||||
.str.extract(r"(\d{4})", 1)
|
||||
.cast(pl.UInt16, strict=False)
|
||||
)
|
||||
epc_band_year = epc_band_to_year(pl.col("construction_age_band"))
|
||||
transfer_year = (
|
||||
pl.col("first_transfer_date").dt.year().cast(pl.UInt16, strict=False)
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue