Improve data
This commit is contained in:
parent
b4d66a28c1
commit
85da1941aa
31 changed files with 901 additions and 319 deletions
|
|
@ -36,6 +36,16 @@ MIN_PRICE = 10_000
|
|||
MIN_BUILD_YEAR = 1700
|
||||
MAX_BUILD_YEAR = 2030
|
||||
|
||||
# Plausibility bounds for raw EPC dimensions. EPC lodgements contain data-entry
|
||||
# errors (0 m storey heights, 116 m "interior height", 9,210 m² floor areas, 99
|
||||
# habitable rooms) that otherwise propagate verbatim into the published per-
|
||||
# property columns. Values outside these bands are nulled (treated as unknown)
|
||||
# rather than shown. Bounds are deliberately wide so only clear errors are cut.
|
||||
# Inclusive lower bound, in metres, for a plausible floor_height; smaller
# values are replaced with null when the EPC columns are selected.
MIN_FLOOR_HEIGHT_M = 1.5 # below this a storey is not habitable
|
||||
# Inclusive upper bound, in metres, for a plausible floor_height; larger
# values are replaced with null when the EPC columns are selected.
MAX_FLOOR_HEIGHT_M = 6.0 # above this is a data error, not a normal storey
|
||||
# Inclusive upper bound, in square metres, for total_floor_area; larger
# values are replaced with null when the EPC columns are selected.
# NOTE(review): the visible filter applies no lower bound, so zero or
# negative areas are kept as-is — confirm whether that is intended.
MAX_TOTAL_FLOOR_AREA_M2 = 2000.0 # ~21,500 sqft; larger is a bulk/garbage record
|
||||
# Inclusive upper bound for number_habitable_rooms; counts above it (or
# below the hard-coded minimum of 1) are replaced with null.
MAX_HABITABLE_ROOMS = 20 # dwellings above this are data errors
|
||||
|
||||
|
||||
def epc_band_to_year(band: pl.Expr) -> pl.Expr:
|
||||
"""Map an EPC construction age band to a single representative build year.
|
||||
|
|
@ -132,10 +142,28 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
|
|||
)
|
||||
.filter(pl.col("epc_address").is_not_null())
|
||||
.with_columns(
|
||||
pl.when(pl.col("number_habitable_rooms") == 0)
|
||||
.then(None)
|
||||
.otherwise(pl.col("number_habitable_rooms"))
|
||||
# Null implausible EPC dimensions so data-entry errors don't reach
|
||||
# the published per-property columns (Interior height, Total floor
|
||||
# area, Number of bedrooms & living rooms). Treated as unknown.
|
||||
pl.when(
|
||||
(pl.col("number_habitable_rooms") >= 1)
|
||||
& (pl.col("number_habitable_rooms") <= MAX_HABITABLE_ROOMS)
|
||||
)
|
||||
.then(pl.col("number_habitable_rooms"))
|
||||
.otherwise(None)
|
||||
.alias("number_habitable_rooms"),
|
||||
pl.when(
|
||||
pl.col("floor_height").is_between(
|
||||
MIN_FLOOR_HEIGHT_M, MAX_FLOOR_HEIGHT_M
|
||||
)
|
||||
)
|
||||
.then(pl.col("floor_height"))
|
||||
.otherwise(None)
|
||||
.alias("floor_height"),
|
||||
pl.when(pl.col("total_floor_area") <= MAX_TOTAL_FLOOR_AREA_M2)
|
||||
.then(pl.col("total_floor_area"))
|
||||
.otherwise(None)
|
||||
.alias("total_floor_area"),
|
||||
)
|
||||
)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue