Rerun data pipelines

2026-05-10 14:49:53 +01:00 · 2026-05-10 14:49:53 +01:00 · fc10381692
commit fc10381692
parent 4c95815dc8
27 changed files with 2143 additions and 215 deletions
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@@ -7,6 +7,15 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping

 MIN_FLOOR_AREA_M2 = 10

+_IOD_PERCENTILE_COLUMNS = [  # IoD score columns that _build rewrites as less-deprived percentiles
+    "Education, Skills and Training Score",
+    "Income Score (rate)",
+    "Employment Score (rate)",
+    "Health Deprivation and Disability Score",
+    "Indoors Sub-domain Score",
+    "Outdoors Sub-domain Score",
+]
+

 _AREA_COLUMNS = [
    "Postcode",
@@ -76,6 +85,24 @@ _AREA_COLUMNS = [
 ]


+def _less_deprived_percentile_expr(column: str) -> pl.Expr:
+    """Convert an IoD deprivation score to a 0-100 less-deprived percentile."""
+    non_null_count = pl.col(column).count()  # count() leaves nulls out
+    descending_rank = pl.col(column).rank("average", descending=True)  # rank 1 = highest score
+    return (
+        pl.when(pl.col(column).is_null())
+        .then(None)  # missing scores stay missing
+        .when(pl.col(column) == pl.col(column).min())  # tested before max: a constant column gets 100
+        .then(100.0)
+        .when(pl.col(column) == pl.col(column).max())
+        .then(0.0)
+        .when(non_null_count > 1)  # keeps the (n - 1) divisor below non-zero
+        .then(((descending_rank - 1) / (non_null_count - 1) * 100).round(1))
+        .otherwise(100.0)  # NOTE(review): looks unreachable - a lone value already equals min
+        .alias(column)  # result keeps the source column's name
+    )
+
+
 def _build(
    epc_pp_path: Path,
    arcgis_path: Path,
@@ -134,20 +161,11 @@ def _build(
    )
    wide = wide.join(arcgis, on="postcode", how="left")

-    iod = pl.scan_parquet(iod_path)
+    iod = pl.scan_parquet(iod_path).with_columns(
+        *(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
+    )
    wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")

-    # Invert deprivation scores so that higher values = less deprived (better)
-    iod_score_cols = [
-        "Education, Skills and Training Score",
-        "Income Score (rate)",
-        "Employment Score (rate)",
-        "Health Deprivation and Disability Score",
-        "Indoors Sub-domain Score",
-        "Outdoors Sub-domain Score",
-    ]
-    wide = wide.with_columns(*(pl.col(c).max() - pl.col(c) for c in iod_score_cols))
-
    ethnicity = pl.scan_parquet(ethnicity_path)
    wide = wide.join(
        ethnicity,