Improve data

2026-06-10 07:54:25 +01:00 · 2026-06-10 07:54:25 +01:00 · 85da1941aa
commit 85da1941aa
parent b4d66a28c1
31 changed files with 901 additions and 319 deletions
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -2,6 +2,7 @@ import argparse
 import re
 import tempfile
 from dataclasses import dataclass
+from datetime import date
 from typing import Literal

 import numpy as np
@ -30,7 +31,10 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping

 MIN_FLOOR_AREA_M2 = 10
 CONSERVATION_AREA_FEATURE = "Within conservation area"
-TREE_DENSITY_FEATURE = "Street tree density percentile"
+# Named "Tree canopy" (not "Street tree") because the underlying density unions
+# Forest Research TOW lone-tree/group crowns AND NFI woodland canopy, so a
+# woodland-edge postcode's score reflects forest canopy, not only street trees.
+TREE_DENSITY_FEATURE = "Tree canopy density percentile"
 LISTED_BUILDING_FEATURE = "Listed building"
 LISTED_BUILDING_MATCH_RADIUS_M = 250.0
 LISTED_BUILDING_NEAREST_POSTCODES = 3
@ -528,10 +532,22 @@ def _is_planning_conservation_area_record(dataset: object) -> bool:


 def _is_current_planning_record(end_date: object) -> bool:
+    """Return True for a record with no end-date, an end-date after today, or an
+    unparseable end-date. planning.data.gov.uk sets `end-date` when a designation
+    is RETIRED, so a future date (e.g. 2029-12-31) marks a still-current area that
+    must NOT be dropped — the previous "any non-empty date = ended" logic wrongly
+    excluded those (e.g. 22 current Gateshead conservation areas)."""
    if end_date is None:
        return True
    if isinstance(end_date, str):
-        return end_date.strip() == ""
+        text = end_date.strip()
+        if text == "":
+            return True
+        try:
+            return date.fromisoformat(text[:10]) > date.today()
+        except ValueError:
+            # Unparseable end-date: keep the record rather than silently drop it.
+            return True
    return False


@ -706,8 +722,32 @@ def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame
    )


+def _validate_lsoa_source_coverage(iod_path: Path, ethnicity_path: Path) -> None:
+    """Raise ValueError if the LSOA-keyed ethnicity table lacks any IoD LSOA.
+
+    Ethnicity is sourced from Census 2021 TS021 at LSOA, then joined on `lsoa21`
+    like median age and IoD. The IoD table defines the LSOA universe every
+    postcode resolves into, so a missing LSOA would silently null the ethnicity
+    columns for those postcodes; require full coverage instead.
+    """
+    iod_lsoas = pl.read_parquet(
+        iod_path, columns=["LSOA code (2021)"]
+    ).rename({"LSOA code (2021)": "lsoa21"})
+
+    ethnicity_lsoas = pl.read_parquet(ethnicity_path, columns=["lsoa21"])
+    missing_ethnicity = iod_lsoas.join(
+        ethnicity_lsoas, on="lsoa21", how="anti"
+    ).sort("lsoa21")
+    if missing_ethnicity.height > 0:
+        raise ValueError(
+            "Ethnicity data is missing LSOA coverage: "
+            f"{missing_ethnicity.height} LSOAs, e.g. "
+            f"{missing_ethnicity.head(10).to_dicts()}"
+        )
+
+
 def _validate_lad_source_coverage(
-    iod_path: Path, ethnicity_path: Path, rental_prices_path: Path
+    iod_path: Path, rental_prices_path: Path
 ) -> None:
    iod_lads = (
        pl.read_parquet(
@ -726,16 +766,6 @@ def _validate_lad_source_coverage(
        .unique(["lad"])
    )

-    ethnicity_lads = pl.read_parquet(ethnicity_path, columns=["Geography_code"]).rename(
-        {"Geography_code": "lad"}
-    )
-    missing_ethnicity = iod_lads.join(ethnicity_lads, on="lad", how="anti").sort("lad")
-    if missing_ethnicity.height > 0:
-        raise ValueError(
-            "Ethnicity data is missing 2024 LAD coverage: "
-            f"{missing_ethnicity.to_dicts()}"
-        )
-
    rental_lads = pl.read_parquet(rental_prices_path, columns=["area_code"]).rename(
        {"area_code": "lad"}
    )
@ -849,12 +879,10 @@ def _join_area_side_tables(
    broadband: pl.LazyFrame,
 ) -> pl.LazyFrame:
    base = base.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
-    base = base.join(
-        ethnicity,
-        left_on="Local Authority District code (2024)",
-        right_on="Geography_code",
-        how="left",
-    )
+    # Ethnicity is Census 2021 TS021 at LSOA (~33,755 areas), joined on the same
+    # `lsoa21` key as median age and IoD — a ~100x granularity gain over the old
+    # Local-Authority broadcast, with no change to the 6-bucket output schema.
+    base = base.join(ethnicity, on="lsoa21", how="left")

    # Crime is counted spatially per postcode (incidents within 50m of the
    # postcode boundary), so it joins on postcode rather than LSOA. crime_spatial
@ -1966,7 +1994,8 @@ def _build(
    """
    if mode == "listings" and actual_listings_path is None:
        raise ValueError("listings mode requires actual_listings_path")
-    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path)
+    _validate_lsoa_source_coverage(iod_path, ethnicity_path)
+    _validate_lad_source_coverage(iod_path, rental_prices_path)

    wide = pl.scan_parquet(epc_pp_path).filter(
        pl.col("total_floor_area").is_null()
@ -2225,7 +2254,7 @@ def main():
        "--ethnicity",
        type=Path,
        required=True,
-        help="Ethnicity by local authority parquet file (optional)",
+        help="Census 2021 ethnic group (TS021) by LSOA parquet file",
    )
    parser.add_argument(
        "--crime",