Improve data

This commit is contained in:
Andras Schmelczer 2026-06-10 07:54:25 +01:00
parent b4d66a28c1
commit 85da1941aa
31 changed files with 901 additions and 319 deletions

View file

@ -36,6 +36,16 @@ MIN_PRICE = 10_000
MIN_BUILD_YEAR = 1700
MAX_BUILD_YEAR = 2030
# Plausibility bounds for raw EPC dimensions. EPC lodgements contain data-entry
# errors (0 m storey heights, 116 m "interior height", 9,210 m² floor areas, 99
# habitable rooms) that otherwise propagate verbatim into the published per-
# property columns. Values outside these bands are nulled (treated as unknown)
# rather than shown. Bounds are deliberately wide so only clear errors are cut.
MIN_FLOOR_HEIGHT_M = 1.5 # below this a storey is not habitable
MAX_FLOOR_HEIGHT_M = 6.0 # above this is a data error, not a normal storey
MAX_TOTAL_FLOOR_AREA_M2 = 2000.0 # ~21,500 sqft; larger is a bulk/garbage record
MAX_HABITABLE_ROOMS = 20 # dwellings above this are data errors
def epc_band_to_year(band: pl.Expr) -> pl.Expr:
"""Map an EPC construction age band to a single representative build year.
@ -132,10 +142,28 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
)
.filter(pl.col("epc_address").is_not_null())
.with_columns(
pl.when(pl.col("number_habitable_rooms") == 0)
.then(None)
.otherwise(pl.col("number_habitable_rooms"))
# Null implausible EPC dimensions so data-entry errors don't reach
# the published per-property columns (Interior height, Total floor
# area, Number of bedrooms & living rooms). Treated as unknown.
pl.when(
(pl.col("number_habitable_rooms") >= 1)
& (pl.col("number_habitable_rooms") <= MAX_HABITABLE_ROOMS)
)
.then(pl.col("number_habitable_rooms"))
.otherwise(None)
.alias("number_habitable_rooms"),
pl.when(
pl.col("floor_height").is_between(
MIN_FLOOR_HEIGHT_M, MAX_FLOOR_HEIGHT_M
)
)
.then(pl.col("floor_height"))
.otherwise(None)
.alias("floor_height"),
pl.when(pl.col("total_floor_area") <= MAX_TOTAL_FLOOR_AREA_M2)
.then(pl.col("total_floor_area"))
.otherwise(None)
.alias("total_floor_area"),
)
)