Improve data

This commit is contained in:
Andras Schmelczer 2026-06-10 07:54:25 +01:00
parent b4d66a28c1
commit 85da1941aa
31 changed files with 901 additions and 319 deletions

View file

@ -2,6 +2,7 @@ import argparse
import re
import tempfile
from dataclasses import dataclass
from datetime import date
from typing import Literal
import numpy as np
@ -30,7 +31,10 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
# Smallest floor area accepted, in square metres (per the constant's name).
MIN_FLOOR_AREA_M2 = 10
CONSERVATION_AREA_FEATURE = "Within conservation area"
# Named "Tree canopy" (not "Street tree") because the underlying density unions
# Forest Research TOW lone-tree/group crowns AND NFI woodland canopy, so a
# woodland-edge postcode's score reflects forest canopy, not only street trees.
TREE_DENSITY_FEATURE = "Tree canopy density percentile"
LISTED_BUILDING_FEATURE = "Listed building"
# Match radius in metres (per the `_M` suffix) and how many nearest postcodes
# are considered when matching listed buildings.
LISTED_BUILDING_MATCH_RADIUS_M = 250.0
LISTED_BUILDING_NEAREST_POSTCODES = 3
@ -528,10 +532,22 @@ def _is_planning_conservation_area_record(dataset: object) -> bool:
def _is_current_planning_record(end_date: object) -> bool:
"""A planning record is current when it has no end-date OR its end-date is
still in the future. The planning.data.gov.uk `end-date` field marks when a
designation is RETIRED, so a future date (e.g. 2029-12-31) is a still-current
area and must NOT be dropped the previous "any non-empty date = ended"
logic wrongly excluded those (e.g. 22 current Gateshead conservation areas)."""
if end_date is None:
return True
if isinstance(end_date, str):
return end_date.strip() == ""
text = end_date.strip()
if text == "":
return True
try:
return date.fromisoformat(text[:10]) > date.today()
except ValueError:
# Unparseable end-date: keep the record rather than silently drop it.
return True
return False
@ -706,8 +722,32 @@ def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame
)
def _validate_lsoa_source_coverage(iod_path: Path, ethnicity_path: Path) -> None:
    """Raise if the LSOA-keyed ethnicity table does not cover every IoD LSOA.

    Ethnicity comes from Census 2021 TS021 at LSOA level and is joined on
    `lsoa21`, the same key used for median age and IoD. Because the IoD table
    defines the LSOA universe every postcode resolves into, an LSOA absent from
    the ethnicity table would silently null the ethnicity columns for those
    postcodes; full coverage is therefore required up front.

    Raises:
        ValueError: when at least one IoD LSOA has no ethnicity row. The
            message carries the count and up to ten example LSOAs.
    """
    lsoa_universe = pl.read_parquet(iod_path, columns=["LSOA code (2021)"])
    lsoa_universe = lsoa_universe.rename({"LSOA code (2021)": "lsoa21"})
    covered = pl.read_parquet(ethnicity_path, columns=["lsoa21"])

    # Anti-join keeps only the IoD LSOAs with no matching ethnicity row.
    uncovered = lsoa_universe.join(covered, on="lsoa21", how="anti")
    uncovered = uncovered.sort("lsoa21")

    if uncovered.height == 0:
        return

    examples = uncovered.head(10).to_dicts()
    raise ValueError(
        "Ethnicity data is missing LSOA coverage: "
        f"{uncovered.height} LSOAs, e.g. "
        f"{examples}"
    )
def _validate_lad_source_coverage(
iod_path: Path, ethnicity_path: Path, rental_prices_path: Path
iod_path: Path, rental_prices_path: Path
) -> None:
iod_lads = (
pl.read_parquet(
@ -726,16 +766,6 @@ def _validate_lad_source_coverage(
.unique(["lad"])
)
ethnicity_lads = pl.read_parquet(ethnicity_path, columns=["Geography_code"]).rename(
{"Geography_code": "lad"}
)
missing_ethnicity = iod_lads.join(ethnicity_lads, on="lad", how="anti").sort("lad")
if missing_ethnicity.height > 0:
raise ValueError(
"Ethnicity data is missing 2024 LAD coverage: "
f"{missing_ethnicity.to_dicts()}"
)
rental_lads = pl.read_parquet(rental_prices_path, columns=["area_code"]).rename(
{"area_code": "lad"}
)
@ -849,12 +879,10 @@ def _join_area_side_tables(
broadband: pl.LazyFrame,
) -> pl.LazyFrame:
base = base.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
base = base.join(
ethnicity,
left_on="Local Authority District code (2024)",
right_on="Geography_code",
how="left",
)
# Ethnicity is Census 2021 TS021 at LSOA (~33,755 areas), joined on the same
# `lsoa21` key as median age and IoD — a ~100x granularity gain over the old
# Local-Authority broadcast, with no change to the 6-bucket output schema.
base = base.join(ethnicity, on="lsoa21", how="left")
# Crime is counted spatially per postcode (incidents within 50m of the
# postcode boundary), so it joins on postcode rather than LSOA. crime_spatial
@ -1966,7 +1994,8 @@ def _build(
"""
if mode == "listings" and actual_listings_path is None:
raise ValueError("listings mode requires actual_listings_path")
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path)
_validate_lsoa_source_coverage(iod_path, ethnicity_path)
_validate_lad_source_coverage(iod_path, rental_prices_path)
wide = pl.scan_parquet(epc_pp_path).filter(
pl.col("total_floor_area").is_null()
@ -2225,7 +2254,7 @@ def main():
"--ethnicity",
type=Path,
required=True,
help="Ethnicity by local authority parquet file (optional)",
help="Census 2021 ethnic group (TS021) by LSOA parquet file",
)
parser.add_argument(
"--crime",