"""Download population-by-ethnicity data per local authority and save it as parquet.

The source CSV is fetched from ``URL``, reduced to the detailed
"ONS 2021 19+1" breakdown, aggregated into broad ethnic groups (with Asian
split into South Asian / East Asian) and written as one row per
``Geography_code`` with one ``% <group>`` column per group.
"""

import argparse
from pathlib import Path

import httpx
import polars as pl

pl.Config.set_tbl_cols(-1)

URL = "https://www.ethnicity-facts-figures.service.gov.uk/uk-population-by-ethnicity/national-and-regional-populations/regional-ethnic-diversity/latest/downloads/population-by-ethnicity-and-local-authority-2021.csv"

# Maps each detailed "ONS 2021 19+1" category to the output group it is
# summed into.
_GROUP_MAP = {
    # White
    "White British": "White",
    "White Irish": "White",
    "Gypsy Or Irish Traveller": "White",
    "Roma": "White",
    "Any Other White Background": "White",
    # South Asian (note: "Any Other Asian Background" is counted here)
    "Indian": "South Asian",
    "Pakistani": "South Asian",
    "Bangladeshi": "South Asian",
    "Any Other Asian Background": "South Asian",
    # East Asian
    "Chinese": "East Asian",
    # Black
    "Black African": "Black",
    "Black Caribbean": "Black",
    "Any Other Black Background": "Black",
    # Mixed
    "Mixed White And Asian": "Mixed",
    "Mixed White And Black African": "Mixed",
    "Mixed White And Black Caribbean": "Mixed",
    "Any Other Mixed/Multiple Ethnic Background": "Mixed",
    # Other
    "Arab": "Other",
    "Any Other Ethnic Background": "Other",
}


def _summarise_by_group(df: pl.DataFrame) -> pl.DataFrame:
    """Return one row per ``Geography_code`` with a ``% <group>`` column per group.

    Only rows of the "ONS 2021 19+1" breakdown are used, excluding the "All"
    total. ``Value1`` is summed within each group and rounded to one decimal.
    Rows are sorted by ``Geography_code`` and group columns alphabetically so
    the result does not depend on ``group_by`` ordering.
    """
    detailed = df.filter(
        (pl.col("Ethnicity_type") == "ONS 2021 19+1") & (pl.col("Ethnicity") != "All")
    ).with_columns(
        # replace_strict (rather than replace) so a category that is missing
        # from the map raises instead of passing through unmapped.
        pl.col("Ethnicity").replace_strict(_GROUP_MAP).alias("group"),
    )

    # Sum percentages within each group per local authority.
    wide = (
        detailed.group_by("Geography_code", "group")
        .agg(pl.col("Value1").sum().round(1))
        .pivot(on="group", index="Geography_code", values="Value1")
    )

    # group_by gives no ordering guarantee, so fix the column and row order
    # explicitly to keep the output file reproducible between runs.
    group_columns = sorted(col for col in wide.columns if col != "Geography_code")
    return (
        wide.select("Geography_code", *group_columns)
        .rename({col: f"% {col}" for col in group_columns})
        .sort("Geography_code")
    )


def download_and_convert(output_path: Path) -> None:
    """Download the ethnicity CSV from ``URL`` and write the summary as parquet.

    Args:
        output_path: Destination parquet file. Missing parent directories
            are created.

    Raises:
        httpx.HTTPStatusError: If the download responds with an error status
            (via ``response.raise_for_status()``).
    """
    print("Downloading ethnicity data...")
    response = httpx.get(URL, follow_redirects=True, timeout=60)
    response.raise_for_status()

    df = pl.read_csv(response.content)
    # Report the shape, as the label says, instead of dumping up to 100 rows.
    print(f"Raw shape: {df.shape}")

    wide = _summarise_by_group(df)
    print(f"Output shape: {wide.shape}")
    print(f"Columns: {wide.columns}")

    # Ensure the destination directory exists before writing.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")


def main() -> None:
    """Parse the ``--output`` argument and run the download and conversion."""
    parser = argparse.ArgumentParser(
        description="Download and convert ethnicity by local authority data"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    args = parser.parse_args()
    download_and_convert(args.output)


if __name__ == "__main__":
    main()