From ef921361ec27c2c100ba298fcf13c1dd7425c6a0 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Tue, 10 Mar 2026 21:51:46 +0000 Subject: [PATCH] Separate chinese --- pipeline/download/ethnicity.py | 50 ++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/pipeline/download/ethnicity.py b/pipeline/download/ethnicity.py index 5971c49..00181fb 100644 --- a/pipeline/download/ethnicity.py +++ b/pipeline/download/ethnicity.py @@ -18,16 +18,50 @@ def download_and_convert(output_path: Path) -> None: df = pl.read_csv(response.content) print(f"Raw shape: {df.head(100)}") - # Keep only broad ethnicity categories (5+1), exclude "All" totals - df = df.filter( - (pl.col("Ethnicity_type") == "ONS 2021 5+1") & (pl.col("Ethnicity") != "All") + # Use the detailed 19+1 breakdown to get sub-categories for Asian ethnicity, + # then aggregate back to the broad groups plus South Asian / East Asian split. + detailed = df.filter( + (pl.col("Ethnicity_type") == "ONS 2021 19+1") & (pl.col("Ethnicity") != "All") ) - # Pivot: one row per local authority, columns = ethnicity percentages - wide = df.pivot( - on="Ethnicity", - index="Geography_code", - values="Value1", + # Map detailed categories to our output groups + group_map = { + # White + "White British": "White", + "White Irish": "White", + "Gypsy Or Irish Traveller": "White", + "Roma": "White", + "Any Other White Background": "White", + # South Asian + "Indian": "South Asian", + "Pakistani": "South Asian", + "Bangladeshi": "South Asian", + "Any Other Asian Background": "South Asian", + # East Asian + "Chinese": "East Asian", + # Black + "Black African": "Black", + "Black Caribbean": "Black", + "Any Other Black Background": "Black", + # Mixed + "Mixed White And Asian": "Mixed", + "Mixed White And Black African": "Mixed", + "Mixed White And Black Caribbean": "Mixed", + "Any Other Mixed/Multiple Ethnic Background": "Mixed", + # Other + "Arab": "Other", + "Any Other Ethnic Background": "Other", + } + + detailed = detailed.with_columns( + pl.col("Ethnicity").replace_strict(group_map).alias("group"), + ) + + # Sum percentages within each group per local authority + wide = ( + detailed.group_by("Geography_code", "group") + .agg(pl.col("Value1").sum().round(1)) + .pivot(on="group", index="Geography_code", values="Value1") ) # Rename columns to be descriptive