idgf
This commit is contained in:
parent
fbfebc651c
commit
aab85fe32e
33 changed files with 2016 additions and 283 deletions
|
|
@ -807,6 +807,22 @@ def _remap_terminated_postcodes(
|
|||
)
|
||||
|
||||
|
||||
def _dedupe_collapsed_properties(wide: pl.LazyFrame) -> pl.LazyFrame:
    """Reduce each (postcode, pp_address) pair to its newest transaction row.

    Remapping terminated postcodes can send two different postcodes to the
    same active successor. When that happens, one physical address ends up
    with several rows sharing a (postcode, pp_address) key but carrying
    conflicting sale records. Only the row with the latest
    ``date_of_transfer`` is kept, so the headline price/date describe the most
    recent sale. Rows whose ``pp_address`` differs are separate addresses and
    are left alone. ``pp_address`` is non-null at this point (join_epc_pp
    filters it), so the key never merges unrelated rows.

    Args:
        wide: Frame with ``postcode``, ``pp_address`` and
            ``date_of_transfer`` columns.

    Returns:
        The frame with at most one row per (postcode, pp_address).
    """
    property_key = ["postcode", "pp_address"]
    # Newest sale first; rows with a null transfer date sink to the bottom so
    # they are kept only when a key has no dated row at all.
    newest_first = wide.sort("date_of_transfer", descending=True, nulls_last=True)
    # keep="first" with maintain_order=True retains the first row per key in
    # the sorted order, i.e. the latest transfer.
    return newest_first.unique(subset=property_key, keep="first", maintain_order=True)
|
||||
|
||||
|
||||
def _filter_to_active_english_postcodes(
|
||||
wide: pl.LazyFrame, active_postcodes: pl.LazyFrame
|
||||
) -> pl.LazyFrame:
|
||||
|
|
@ -837,38 +853,19 @@ def _join_area_side_tables(
|
|||
)
|
||||
|
||||
# Crime is counted spatially per postcode (incidents within 50m of the
|
||||
# postcode boundary), so it joins on postcode rather than LSOA.
|
||||
base = base.join(crime, on="postcode", how="left")
|
||||
serious_crime_cols = [
|
||||
"Violence and sexual offences (avg/yr)",
|
||||
"Robbery (avg/yr)",
|
||||
"Burglary (avg/yr)",
|
||||
"Possession of weapons (avg/yr)",
|
||||
]
|
||||
minor_crime_cols = [
|
||||
"Anti-social behaviour (avg/yr)",
|
||||
"Criminal damage and arson (avg/yr)",
|
||||
"Shoplifting (avg/yr)",
|
||||
"Bicycle theft (avg/yr)",
|
||||
"Theft from the person (avg/yr)",
|
||||
"Other theft (avg/yr)",
|
||||
"Vehicle crime (avg/yr)",
|
||||
"Public order (avg/yr)",
|
||||
"Drugs (avg/yr)",
|
||||
"Other crime (avg/yr)",
|
||||
]
|
||||
# The LEFT join leaves every per-type column null for postcodes absent from
|
||||
# the crime table; sum_horizontal alone would fabricate a "zero crime"
|
||||
# rollup there, so keep the rollup null when ALL components are null.
|
||||
base = base.with_columns(
|
||||
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in serious_crime_cols]))
|
||||
.then(None)
|
||||
.otherwise(pl.sum_horizontal(serious_crime_cols))
|
||||
.alias("serious_crime_avg_yr"),
|
||||
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in minor_crime_cols]))
|
||||
.then(None)
|
||||
.otherwise(pl.sum_horizontal(minor_crime_cols))
|
||||
.alias("minor_crime_avg_yr"),
|
||||
# postcode boundary), so it joins on postcode rather than LSOA. crime_spatial
|
||||
# precomputes the Serious/Minor headline rollups as the mean of the by-year
|
||||
# rollup bars; read those straight through (renamed to the internal columns
|
||||
# _finalize_merged_columns expects) rather than re-summing the per-type
|
||||
# avg/yr columns — summing divides each type by its OWN years-present and
|
||||
# overstates the rollup when types differ in coverage. A postcode absent from
|
||||
# the crime table keeps null rollups via the left join (no fabricated zero);
|
||||
# the per-type avg/yr columns pass through unchanged for display.
|
||||
base = base.join(crime, on="postcode", how="left").rename(
|
||||
{
|
||||
"Serious crime (avg/yr)": "serious_crime_avg_yr",
|
||||
"Minor crime (avg/yr)": "minor_crime_avg_yr",
|
||||
}
|
||||
)
|
||||
|
||||
base = base.join(median_age, on="lsoa21", how="left")
|
||||
|
|
@ -881,7 +878,37 @@ def _join_area_side_tables(
|
|||
)
|
||||
if tree_density is not None:
|
||||
base = base.join(tree_density, on="postcode", how="left")
|
||||
return base.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
|
||||
|
||||
# Broadband is the one side table sourced straight from a third-party CSV
|
||||
# (Ofcom `postcode_space`) rather than from a sibling pcds-keyed pipeline
|
||||
# step, so its postcode may drift in spacing/casing from the NSPL `pcds`
|
||||
# base key. Normalize BOTH sides to the same canonical pcds form (reusing
|
||||
# `_canonical_postcode_expr`, exactly as the listing/EPC re-anchor joins do)
|
||||
# before joining, otherwise a real postcode silently misses and its
|
||||
# `max_download_speed` reads as null "no data" downstream. Re-aggregate on
|
||||
# the canonical key so two raw spellings collapsing to one key can't fan out
|
||||
# the base; drop a null canonical key so an unparseable Ofcom row joins
|
||||
# nothing rather than matching a null-key base row.
|
||||
broadband_canonical = (
|
||||
broadband.with_columns(
|
||||
_canonical_postcode_expr("bb_postcode").alias("_bb_canonical_postcode")
|
||||
)
|
||||
.drop_nulls("_bb_canonical_postcode")
|
||||
.group_by("_bb_canonical_postcode")
|
||||
.agg(pl.col("max_download_speed").max())
|
||||
)
|
||||
return (
|
||||
base.with_columns(
|
||||
_canonical_postcode_expr("postcode").alias("_base_canonical_postcode")
|
||||
)
|
||||
.join(
|
||||
broadband_canonical,
|
||||
left_on="_base_canonical_postcode",
|
||||
right_on="_bb_canonical_postcode",
|
||||
how="left",
|
||||
)
|
||||
.drop("_base_canonical_postcode")
|
||||
)
|
||||
|
||||
|
||||
def _finalize_merged_columns(frame: pl.LazyFrame) -> pl.LazyFrame:
|
||||
|
|
@ -1328,7 +1355,7 @@ def _load_direct_epc_candidates(
|
|||
)
|
||||
|
||||
return (
|
||||
epc_base.sort("inspection_date", descending=True)
|
||||
epc_base.sort("inspection_date", descending=True, nulls_last=True)
|
||||
.group_by("_direct_epc_match_address", "_direct_epc_match_postcode")
|
||||
.first()
|
||||
.join(
|
||||
|
|
@ -1918,6 +1945,10 @@ def _build(
|
|||
# terminated English postcodes are retained under their successor postcode.
|
||||
postcode_mapping = build_postcode_mapping(arcgis_path)
|
||||
wide = _remap_terminated_postcodes(wide, postcode_mapping.lazy())
|
||||
# The remap can collapse two terminated postcodes onto one active successor,
|
||||
# duplicating a physical address's (postcode, pp_address) key; keep only the
|
||||
# most-recent transaction per address before the per-postcode joins.
|
||||
wide = _dedupe_collapsed_properties(wide)
|
||||
arcgis_raw = pl.scan_parquet(arcgis_path)
|
||||
arcgis = _active_english_postcode_area(arcgis_raw)
|
||||
active_postcodes = arcgis.select("postcode").unique()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue