From ef921361ec27c2c100ba298fcf13c1dd7425c6a0 Mon Sep 17 00:00:00 2001
From: Andras Schmelczer <andras@schmelczer.dev>
Date: Tue, 10 Mar 2026 21:51:46 +0000
Subject: [PATCH] Separate Chinese

---
 pipeline/download/ethnicity.py | 50 ++++++++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 8 deletions(-)

diff --git a/pipeline/download/ethnicity.py b/pipeline/download/ethnicity.py
index 5971c49..00181fb 100644
--- a/pipeline/download/ethnicity.py
+++ b/pipeline/download/ethnicity.py
@@ -18,16 +18,50 @@ def download_and_convert(output_path: Path) -> None:
     df = pl.read_csv(response.content)
     print(f"Raw shape: {df.head(100)}")
 
-    # Keep only broad ethnicity categories (5+1), exclude "All" totals
-    df = df.filter(
-        (pl.col("Ethnicity_type") == "ONS 2021 5+1") & (pl.col("Ethnicity") != "All")
+    # Use the detailed 19+1 breakdown to get sub-categories for Asian ethnicity,
+    # then aggregate back to the broad groups plus South Asian / East Asian split.
+    detailed = df.filter(
+        (pl.col("Ethnicity_type") == "ONS 2021 19+1") & (pl.col("Ethnicity") != "All")
     )
 
-    # Pivot: one row per local authority, columns = ethnicity percentages
-    wide = df.pivot(
-        on="Ethnicity",
-        index="Geography_code",
-        values="Value1",
+    # Map detailed categories to our output groups
+    group_map = {
+        # White
+        "White British": "White",
+        "White Irish": "White",
+        "Gypsy Or Irish Traveller": "White",
+        "Roma": "White",
+        "Any Other White Background": "White",
+        # South Asian (also absorbs "Any Other Asian Background")
+        "Indian": "South Asian",
+        "Pakistani": "South Asian",
+        "Bangladeshi": "South Asian",
+        "Any Other Asian Background": "South Asian",
+        # East Asian (only "Chinese" in the 19+1 breakdown mapped here)
+        "Chinese": "East Asian",
+        # Black
+        "Black African": "Black",
+        "Black Caribbean": "Black",
+        "Any Other Black Background": "Black",
+        # Mixed
+        "Mixed White And Asian": "Mixed",
+        "Mixed White And Black African": "Mixed",
+        "Mixed White And Black Caribbean": "Mixed",
+        "Any Other Mixed/Multiple Ethnic Background": "Mixed",
+        # Other
+        "Arab": "Other",
+        "Any Other Ethnic Background": "Other",
+    }
+
+    detailed = detailed.with_columns(
+        pl.col("Ethnicity").replace_strict(group_map).alias("group"),
+    )
+
+    # Sum percentages within each group per local authority, rounded to 1 d.p.
+    wide = (
+        detailed.group_by("Geography_code", "group")
+        .agg(pl.col("Value1").sum().round(1))
+        .pivot(on="group", index="Geography_code", values="Value1")
     )
 
     # Rename columns to be descriptive