This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@ -839,25 +839,36 @@ def _join_area_side_tables(
# Crime is counted spatially per postcode (incidents within 50m of the
# postcode boundary), so it joins on postcode rather than LSOA.
base = base.join(crime, on="postcode", how="left")
serious_crime_cols = [
"Violence and sexual offences (avg/yr)",
"Robbery (avg/yr)",
"Burglary (avg/yr)",
"Possession of weapons (avg/yr)",
]
minor_crime_cols = [
"Anti-social behaviour (avg/yr)",
"Criminal damage and arson (avg/yr)",
"Shoplifting (avg/yr)",
"Bicycle theft (avg/yr)",
"Theft from the person (avg/yr)",
"Other theft (avg/yr)",
"Vehicle crime (avg/yr)",
"Public order (avg/yr)",
"Drugs (avg/yr)",
"Other crime (avg/yr)",
]
# The LEFT join leaves every per-type column null for postcodes absent from
# the crime table; sum_horizontal alone would fabricate a "zero crime"
# rollup there, so keep the rollup null when ALL components are null.
base = base.with_columns(
pl.sum_horizontal(
"Violence and sexual offences (avg/yr)",
"Robbery (avg/yr)",
"Burglary (avg/yr)",
"Possession of weapons (avg/yr)",
).alias("serious_crime_avg_yr"),
pl.sum_horizontal(
"Anti-social behaviour (avg/yr)",
"Criminal damage and arson (avg/yr)",
"Shoplifting (avg/yr)",
"Bicycle theft (avg/yr)",
"Theft from the person (avg/yr)",
"Other theft (avg/yr)",
"Vehicle crime (avg/yr)",
"Public order (avg/yr)",
"Drugs (avg/yr)",
"Other crime (avg/yr)",
).alias("minor_crime_avg_yr"),
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in serious_crime_cols]))
.then(None)
.otherwise(pl.sum_horizontal(serious_crime_cols))
.alias("serious_crime_avg_yr"),
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in minor_crime_cols]))
.then(None)
.otherwise(pl.sum_horizontal(minor_crime_cols))
.alias("minor_crime_avg_yr"),
)
base = base.join(median_age, on="lsoa21", how="left")
@ -1179,7 +1190,22 @@ def _load_listings_for_merge(listings_path: Path, arcgis_path: Path) -> pl.DataF
# pages); tolerate its absence so older parquets and test fixtures still
# load. Digits-only so it compares equal to the EPC register's UPRN.
if "UPRN" in raw.collect_schema().names():
uprn_digits = pl.col("UPRN").cast(pl.Utf8).str.replace_all(r"\D", "")
# Mirror `_normalize_uprn` exactly so the listing key compares equal to
# the candidate-side key for every dtype. For a Float UPRN we must
# stringify via its integer form (100023336956.0 -> "100023336956"),
# otherwise stripping non-digits from "100023336956.0" yields a bogus
# trailing-zero key ("1000233369560") that never collides; and a
# non-integral float (e.g. 1.5) must be rejected rather than mangled.
uprn_col = pl.col("UPRN")
if raw.collect_schema()["UPRN"].is_float():
integral = uprn_col.cast(pl.Int64, strict=False)
uprn_digits = (
pl.when(integral == uprn_col)
.then(integral.cast(pl.Utf8).str.replace_all(r"\D", ""))
.otherwise(None)
)
else:
uprn_digits = uprn_col.cast(pl.Utf8).str.replace_all(r"\D", "")
listing_uprn_expr = (
pl.when(uprn_digits.str.len_chars() > 0)
.then(uprn_digits)
@ -1615,9 +1641,23 @@ def _enrich_listings_with_direct_epc(
def _coalesce_direct_epc_columns(wide: pl.LazyFrame) -> pl.LazyFrame:
    """Fold each directly-matched EPC column into its raw counterpart.

    For every ``raw -> direct`` pair in ``_DIRECT_EPC_RAW_COLUMN_MAP`` the raw
    column is overwritten with ``coalesce(raw, direct)``: the raw value wins
    whenever it is non-null, otherwise the direct value fills the gap.
    ``was_council_house`` is the one exception and resolves to "Yes" as soon
    as either side says "Yes".

    Args:
        wide: Frame holding both the raw and the direct column of every pair.

    Returns:
        ``wide`` with each raw column replaced by its merged expression.
    """

    def _coalesced(raw_column: str, direct_column: str) -> pl.Expr:
        """Build the merge expression for one pair, aliased to the raw name."""
        coalesce = pl.coalesce(pl.col(raw_column), pl.col(direct_column))
        # The raw property-level value is fill_null("No") upstream, so a plain
        # coalesce lets a non-null "No" override a directly-matched listing
        # "Yes". "Former council house" should fire if EITHER side says so.
        if raw_column == "was_council_house":
            return (
                pl.when((pl.col(raw_column) == "Yes") | (pl.col(direct_column) == "Yes"))
                .then(pl.lit("Yes"))
                # Neither side is "Yes": fall back to the ordinary coalesce so
                # a raw "No" (or a lone direct value) is still carried through.
                .otherwise(coalesce)
                .alias(raw_column)
            )
        return coalesce.alias(raw_column)

    # Every pair goes through `_coalesced`; a bare `pl.coalesce(...)` element
    # here would bypass the `was_council_house` special case above.
    return wide.with_columns(
        [
            _coalesced(raw_column, direct_column)
            for raw_column, direct_column in _DIRECT_EPC_RAW_COLUMN_MAP.items()
        ]
    )