idk
This commit is contained in:
parent
a04ac2d857
commit
d43da9708c
47 changed files with 4120 additions and 573 deletions
|
|
@ -839,25 +839,36 @@ def _join_area_side_tables(
|
|||
# Crime is counted spatially per postcode (incidents within 50m of the
|
||||
# postcode boundary), so it joins on postcode rather than LSOA.
|
||||
base = base.join(crime, on="postcode", how="left")
|
||||
serious_crime_cols = [
|
||||
"Violence and sexual offences (avg/yr)",
|
||||
"Robbery (avg/yr)",
|
||||
"Burglary (avg/yr)",
|
||||
"Possession of weapons (avg/yr)",
|
||||
]
|
||||
minor_crime_cols = [
|
||||
"Anti-social behaviour (avg/yr)",
|
||||
"Criminal damage and arson (avg/yr)",
|
||||
"Shoplifting (avg/yr)",
|
||||
"Bicycle theft (avg/yr)",
|
||||
"Theft from the person (avg/yr)",
|
||||
"Other theft (avg/yr)",
|
||||
"Vehicle crime (avg/yr)",
|
||||
"Public order (avg/yr)",
|
||||
"Drugs (avg/yr)",
|
||||
"Other crime (avg/yr)",
|
||||
]
|
||||
# The LEFT join leaves every per-type column null for postcodes absent from
|
||||
# the crime table; sum_horizontal alone would fabricate a "zero crime"
|
||||
# rollup there, so keep the rollup null when ALL components are null.
|
||||
base = base.with_columns(
|
||||
pl.sum_horizontal(
|
||||
"Violence and sexual offences (avg/yr)",
|
||||
"Robbery (avg/yr)",
|
||||
"Burglary (avg/yr)",
|
||||
"Possession of weapons (avg/yr)",
|
||||
).alias("serious_crime_avg_yr"),
|
||||
pl.sum_horizontal(
|
||||
"Anti-social behaviour (avg/yr)",
|
||||
"Criminal damage and arson (avg/yr)",
|
||||
"Shoplifting (avg/yr)",
|
||||
"Bicycle theft (avg/yr)",
|
||||
"Theft from the person (avg/yr)",
|
||||
"Other theft (avg/yr)",
|
||||
"Vehicle crime (avg/yr)",
|
||||
"Public order (avg/yr)",
|
||||
"Drugs (avg/yr)",
|
||||
"Other crime (avg/yr)",
|
||||
).alias("minor_crime_avg_yr"),
|
||||
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in serious_crime_cols]))
|
||||
.then(None)
|
||||
.otherwise(pl.sum_horizontal(serious_crime_cols))
|
||||
.alias("serious_crime_avg_yr"),
|
||||
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in minor_crime_cols]))
|
||||
.then(None)
|
||||
.otherwise(pl.sum_horizontal(minor_crime_cols))
|
||||
.alias("minor_crime_avg_yr"),
|
||||
)
|
||||
|
||||
base = base.join(median_age, on="lsoa21", how="left")
|
||||
|
|
@ -1179,7 +1190,22 @@ def _load_listings_for_merge(listings_path: Path, arcgis_path: Path) -> pl.DataF
|
|||
# pages); tolerate its absence so older parquets and test fixtures still
|
||||
# load. Digits-only so it compares equal to the EPC register's UPRN.
|
||||
if "UPRN" in raw.collect_schema().names():
|
||||
uprn_digits = pl.col("UPRN").cast(pl.Utf8).str.replace_all(r"\D", "")
|
||||
# Mirror `_normalize_uprn` exactly so the listing key compares equal to
|
||||
# the candidate-side key for every dtype. For a Float UPRN we must
|
||||
# stringify via its integer form (100023336956.0 -> "100023336956"),
|
||||
# otherwise stripping non-digits from "100023336956.0" yields a bogus
|
||||
# trailing-zero key ("1000233369560") that never collides; and a
|
||||
# non-integral float (e.g. 1.5) must be rejected rather than mangled.
|
||||
uprn_col = pl.col("UPRN")
|
||||
if raw.collect_schema()["UPRN"].is_float():
|
||||
integral = uprn_col.cast(pl.Int64, strict=False)
|
||||
uprn_digits = (
|
||||
pl.when(integral == uprn_col)
|
||||
.then(integral.cast(pl.Utf8).str.replace_all(r"\D", ""))
|
||||
.otherwise(None)
|
||||
)
|
||||
else:
|
||||
uprn_digits = uprn_col.cast(pl.Utf8).str.replace_all(r"\D", "")
|
||||
listing_uprn_expr = (
|
||||
pl.when(uprn_digits.str.len_chars() > 0)
|
||||
.then(uprn_digits)
|
||||
|
|
@ -1615,9 +1641,23 @@ def _enrich_listings_with_direct_epc(
|
|||
|
||||
|
||||
def _coalesce_direct_epc_columns(wide: pl.LazyFrame) -> pl.LazyFrame:
    """Fold each direct-match EPC column into its raw counterpart.

    For every ``raw_column -> direct_column`` pair in
    ``_DIRECT_EPC_RAW_COLUMN_MAP`` the raw column is overwritten with the
    first non-null of (raw, direct). ``was_council_house`` is the one
    exception: it resolves to "Yes" whenever either side is "Yes".

    Args:
        wide: Frame holding both the raw and the direct columns named in
            ``_DIRECT_EPC_RAW_COLUMN_MAP``.

    Returns:
        ``wide`` with each raw column replaced by its merged value; the
        direct columns are left in place.
    """

    def _coalesced(raw_column: str, direct_column: str) -> pl.Expr:
        """Build the merged expression for one raw/direct column pair."""
        raw = pl.col(raw_column)
        direct = pl.col(direct_column)
        fallback = pl.coalesce(raw, direct)
        if raw_column != "was_council_house":
            return fallback.alias(raw_column)
        # The raw property-level value is fill_null("No") upstream, so a plain
        # coalesce lets a non-null "No" override a directly-matched listing
        # "Yes". "Former council house" should fire if EITHER side says so.
        return (
            pl.when((raw == "Yes") | (direct == "Yes"))
            .then(pl.lit("Yes"))
            .otherwise(fallback)
            .alias(raw_column)
        )

    return wide.with_columns(
        [
            _coalesced(raw_column, direct_column)
            for raw_column, direct_column in _DIRECT_EPC_RAW_COLUMN_MAP.items()
        ]
    )
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue