Good changes

This commit is contained in:
Andras Schmelczer 2026-02-10 22:09:46 +00:00
parent d39d1b15fd
commit 1f68ca0512
23 changed files with 670 additions and 289 deletions

View file

@ -74,6 +74,18 @@ def _build_wide(
iod = pl.scan_parquet(iod_path)
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
# Invert deprivation scores so that higher values = less deprived (better)
iod_score_cols = [
"Education, Skills and Training Score",
"Income Score (rate)",
"Employment Score (rate)",
"Health Deprivation and Disability Score",
"Living Environment Score",
"Indoors Sub-domain Score",
"Outdoors Sub-domain Score",
]
wide = wide.with_columns(*(pl.col(c).max() - pl.col(c) for c in iod_score_cols))
ethnicity = pl.scan_parquet(ethnicity_path)
wide = wide.join(
ethnicity,
@ -158,10 +170,20 @@ def _build_wide(
geosure = pl.scan_parquet(geosure_path)
wide = wide.join(geosure, on="postcode", how="left")
# Use built_form (Terraced, Semi-detached) when available, otherwise epc_property_type
# Derive property_type: prefer EPC data, fall back to price-paid.
# For Houses, use built_form (e.g. Semi-Detached, Mid-Terrace) for finer detail.
bad_built_form = pl.col("built_form").is_null() | pl.col("built_form").is_in(
["NO DATA!", "Not Recorded"]
)
has_epc = pl.col("epc_property_type").is_not_null()
is_house = pl.col("epc_property_type") == "House"
wide = wide.with_columns(
pl.when(pl.col("pp_property_type").is_in(["Terraced", "Semi-Detached"]))
pl.when(has_epc & is_house & ~bad_built_form)
.then(pl.col("built_form"))
.when(has_epc & is_house)
.then(pl.col("pp_property_type"))
.when(has_epc)
.then(pl.col("epc_property_type"))
.otherwise(pl.col("pp_property_type"))
.alias("property_type")
)