Update data

2026-05-14 08:17:10 +01:00 · 2026-05-14 08:17:10 +01:00 · 273d7a83ee
commit 273d7a83ee
parent a4103b0896
15 changed files with 716 additions and 316 deletions
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -22,6 +22,8 @@ _AREA_COLUMNS = [
    "Postcode",
    "lat",
    "lon",
+    # Runtime provenance for deciding whether missing coordinates are skippable.
+    "ctry25cd",
    # Deprivation
    "Income Score",
    "Employment Score",
@ -86,6 +88,15 @@ _AREA_COLUMNS = [
 _DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
 _DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")
 TREE_DENSITY_FEATURE = "Street tree density percentile"
+# Matches a radius-suffixed postcode tree-canopy percentile column name, e.g.
+# "Tree canopy density percentile within 50m". Used as the fallback lookup
+# when the parquet has no column named exactly TREE_DENSITY_FEATURE.
+_POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
+    r"^Tree canopy density percentile within \d+m$"
+)
+# LAD code -> LAD name for authorities that are allowed to have no rent data.
+_RENT_SOURCE_UNAVAILABLE_LADS = {
+    # ONS PIPR does not publish LAD-level private-rent estimates for these
+    # small authorities. Keep rent null there, but fail on any other LAD miss.
+    "E06000053": "Isles of Scilly",
+    "E09000001": "City of London",
+}


 def _is_dynamic_poi_metric_column(column: str) -> bool:
@ -112,6 +123,107 @@ def _less_deprived_percentile_expr(column: str) -> pl.Expr:
    )


+def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame:
+    """Return a lazy postcode -> tree density percentile lookup.
+
+    The percentile is read from the ``TREE_DENSITY_FEATURE`` column when the
+    parquet has one; otherwise from the single column whose name matches
+    ``_POSTCODE_TREE_DENSITY_PERCENTILE_RE``. Either way the output column is
+    named ``TREE_DENSITY_FEATURE`` and cast to Float32.
+
+    Rows with a null postcode are dropped and the result is deduplicated on
+    ``postcode``.
+
+    Raises:
+        ValueError: if the parquet has no ``postcode`` column, or if it has
+            no ``TREE_DENSITY_FEATURE`` column and the number of matching
+            fallback columns is not exactly one.
+    """
+    source = pl.scan_parquet(tree_density_postcodes_path)
+    available = set(source.collect_schema().names())
+    if "postcode" not in available:
+        raise ValueError(
+            f"{tree_density_postcodes_path} is missing required column: postcode"
+        )
+
+    percentile_column = TREE_DENSITY_FEATURE
+    if percentile_column not in available:
+        # Fall back to a radius-suffixed percentile column; it must be
+        # unambiguous, so anything other than exactly one match is an error.
+        matches = [
+            name
+            for name in available
+            if _POSTCODE_TREE_DENSITY_PERCENTILE_RE.match(name)
+        ]
+        matches.sort()
+        if len(matches) != 1:
+            raise ValueError(
+                f'{tree_density_postcodes_path} must contain column "{TREE_DENSITY_FEATURE}" '
+                'or exactly one "Tree canopy density percentile within {radius}m" column; '
+                f"found {len(matches)} postcode percentile columns"
+            )
+        percentile_column = matches[0]
+
+    percentile = (
+        pl.col(percentile_column).cast(pl.Float32).alias(TREE_DENSITY_FEATURE)
+    )
+    per_postcode = source.select(pl.col("postcode"), percentile)
+    return per_postcode.drop_nulls(["postcode"]).unique(["postcode"])
+
+
+def _validate_lad_source_coverage(
+    iod_path: Path, ethnicity_path: Path, rental_prices_path: Path
+) -> None:
+    """Check that every 2024 LAD in the IoD file is covered by the LAD sources.
+
+    The distinct LAD codes in ``iod_path`` are the reference set. Ethnicity
+    data must cover all of them. Rental data must cover all of them except
+    the codes listed in ``_RENT_SOURCE_UNAVAILABLE_LADS``; when any of those
+    are absent, a notice is printed instead of failing.
+
+    Raises:
+        ValueError: if the ethnicity data misses any LAD, or the rental data
+            misses a LAD that is not in ``_RENT_SOURCE_UNAVAILABLE_LADS``.
+    """
+    code_column = "Local Authority District code (2024)"
+    name_column = "Local Authority District name (2024)"
+    reference_lads = (
+        pl.read_parquet(iod_path, columns=[code_column, name_column])
+        .rename({code_column: "lad", name_column: "lad_name"})
+        .unique(["lad"])
+    )
+
+    # Ethnicity is checked (and may raise) before the rental file is read.
+    ethnicity_codes = pl.read_parquet(ethnicity_path, columns=["Geography_code"])
+    ethnicity_codes = ethnicity_codes.rename({"Geography_code": "lad"})
+    ethnicity_gaps = reference_lads.join(ethnicity_codes, on="lad", how="anti")
+    ethnicity_gaps = ethnicity_gaps.sort("lad")
+    if not ethnicity_gaps.is_empty():
+        raise ValueError(
+            "Ethnicity data is missing 2024 LAD coverage: "
+            f"{ethnicity_gaps.to_dicts()}"
+        )
+
+    rental_codes = pl.read_parquet(rental_prices_path, columns=["area_code"])
+    rental_codes = rental_codes.rename({"area_code": "lad"})
+    rent_gaps = reference_lads.join(rental_codes, on="lad", how="anti").sort("lad")
+
+    # Only gaps outside the known source-unavailable LADs are fatal.
+    tolerated_codes = list(_RENT_SOURCE_UNAVAILABLE_LADS)
+    fatal_rent_gaps = rent_gaps.filter(~pl.col("lad").is_in(tolerated_codes))
+    if not fatal_rent_gaps.is_empty():
+        raise ValueError(
+            "Rental data is missing 2024 LAD coverage: "
+            f"{fatal_rent_gaps.to_dicts()}"
+        )
+
+    if not rent_gaps.is_empty():
+        print(
+            "PIPR has no LAD-level rent estimates for source-unavailable LADs; "
+            f"rent will remain null there: {rent_gaps.to_dicts()}"
+        )
+
+
+def _validate_property_postcodes(df: pl.DataFrame) -> None:
+    """Fail if any row of ``df`` has a null or blank ``Postcode``.
+
+    A postcode counts as blank when its string form is empty after stripping
+    surrounding whitespace.
+
+    Raises:
+        ValueError: reporting the number of offending rows and up to ten
+            sample rows, limited to whichever of the postcode, address and
+            price columns are present.
+    """
+    postcode = pl.col("Postcode")
+    is_blank = postcode.cast(pl.Utf8).str.strip_chars() == ""
+    bad_rows = df.filter(postcode.is_null() | is_blank)
+    if bad_rows.is_empty():
+        return
+
+    context_columns = (
+        "Postcode",
+        "Address per Property Register",
+        "Last known price",
+    )
+    present = [name for name in context_columns if name in bad_rows.columns]
+    preview = bad_rows.select(present).head(10).to_dicts()
+    raise ValueError(
+        "Property rows missing a postcode after merge: "
+        f"{bad_rows.height} rows. Sample: {preview}"
+    )
+
+
 def _build(
    epc_pp_path: Path,
    arcgis_path: Path,
@ -126,12 +238,14 @@ def _build(
    lsoa_population_path: Path,
    median_age_path: Path,
    election_results_path: Path,
-    tree_density_addresses_path: Path | None = None,
+    tree_density_postcodes_path: Path | None = None,
 ) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Build postcode and properties dataframes from epc_pp + auxiliary data.

    Returns (postcode_df, properties_df).
    """
+    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path)
+
    wide = pl.scan_parquet(epc_pp_path).filter(
        pl.col("total_floor_area").is_null()
        | (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
@ -152,9 +266,15 @@ def _build(
        .drop("new_postcode")
    )

+    arcgis_raw = pl.scan_parquet(arcgis_path)
+    postcode_country = arcgis_raw.select(
+        pl.col("pcds").alias("postcode"),
+        pl.col("ctry25cd"),
+    ).unique(["postcode"])
+    wide = wide.join(postcode_country, on="postcode", how="left")
+
    arcgis = (
-        pl.scan_parquet(arcgis_path)
-        .filter(pl.col("ctry25cd") == "E92000001")  # England only
+        arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")  # England only
        .filter(pl.col("doterm").is_null())  # Active postcodes only
        # NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
        # Alias them back to the short canonical names used across the
@ -191,7 +311,9 @@ def _build(
        .cast(pl.UInt8)
        .alias("_bedrooms"),
    )
-    rental = pl.scan_parquet(rental_prices_path)
+    rental = pl.scan_parquet(rental_prices_path).select(
+        "area_code", "bedrooms", "mean_monthly_rent"
+    )
    wide = wide.join(
        rental,
        left_on=["Local Authority District code (2024)", "_bedrooms"],
@ -260,17 +382,9 @@ def _build(
    school_proximity = pl.scan_parquet(school_proximity_path)
    wide = wide.join(school_proximity, on="postcode", how="left")

-    if tree_density_addresses_path is not None:
-        tree_density = (
-            pl.scan_parquet(tree_density_addresses_path)
-            .select(
-                pl.col("postcode"),
-                pl.col("pp_address"),
-                pl.col(TREE_DENSITY_FEATURE).cast(pl.Float32),
-            )
-            .unique(["postcode", "pp_address"])
-        )
-        wide = wide.join(tree_density, on=["postcode", "pp_address"], how="left")
+    if tree_density_postcodes_path is not None:
+        tree_density = _tree_density_by_postcode(tree_density_postcodes_path)
+        wide = wide.join(tree_density, on="postcode", how="left")

    # Broadband: derive max available download speed tier per postcode from
    # Ofcom availability percentages.  Tiers: Gigabit ≥1000, UFBB ≥300,
@ -415,6 +529,7 @@ def _build(

    print("Collecting with streaming engine...")
    df = wide.collect(engine="streaming")
+    _validate_property_postcodes(df)

    # Split into postcode-level and property-level dataframes
    area_cols = [
@ -508,10 +623,10 @@ def main():
        help="2024 General Election results by constituency parquet file",
    )
    parser.add_argument(
-        "--tree-density-addresses",
+        "--tree-density-postcodes",
        type=Path,
        required=False,
-        help="Address-level tree density parquet from pipeline.transform.tree_density",
+        help="Postcode-level tree density parquet from pipeline.transform.tree_density",
    )
    parser.add_argument(
        "--output-postcodes",
@ -541,7 +656,7 @@ def main():
        lsoa_population_path=args.lsoa_population,
        median_age_path=args.median_age,
        election_results_path=args.election_results,
-        tree_density_addresses_path=args.tree_density_addresses,
+        tree_density_postcodes_path=args.tree_density_postcodes,
    )

    print(f"\nPostcode columns: {postcode_df.columns}")