"""Download ethnicity-by-local-authority percentages and save them as Parquet.

The source CSV is filtered to its detailed "ONS 2021 19+1" breakdown, the
detailed categories are aggregated into six broad groups (with Asian split
into South Asian / East Asian), and the result is written as one row per
``Geography_code`` with one percentage column per group.
"""

import argparse
from pathlib import Path

import httpx
import polars as pl

# Print every column of a DataFrame instead of truncating wide tables.
pl.Config.set_tbl_cols(-1)

URL = "https://www.ethnicity-facts-figures.service.gov.uk/uk-population-by-ethnicity/national-and-regional-populations/regional-ethnic-diversity/latest/downloads/population-by-ethnicity-and-local-authority-2021.csv"

# Name of the local-authority key column in the source CSV and in the output.
_KEY_COL = "Geography_code"

# Map detailed categories to our output groups.
_GROUP_MAP = {
    # White
    "White British": "White",
    "White Irish": "White",
    "Gypsy Or Irish Traveller": "White",
    "Roma": "White",
    "Any Other White Background": "White",
    # South Asian
    "Indian": "South Asian",
    "Pakistani": "South Asian",
    "Bangladeshi": "South Asian",
    "Any Other Asian Background": "South Asian",
    # East Asian
    "Chinese": "East Asian",
    # Black
    "Black African": "Black",
    "Black Caribbean": "Black",
    "Any Other Black Background": "Black",
    # Mixed
    "Mixed White And Asian": "Mixed",
    "Mixed White And Black African": "Mixed",
    "Mixed White And Black Caribbean": "Mixed",
    "Any Other Mixed/Multiple Ethnic Background": "Mixed",
    # Other
    "Arab": "Other",
    "Any Other Ethnic Background": "Other",
}

# Fixed output column order (first appearance in _GROUP_MAP). group_by does
# not guarantee row order, so without this the pivoted column order -- and the
# tie-break of the "largest group" lookup -- could differ between runs.
_GROUP_ORDER = list(dict.fromkeys(_GROUP_MAP.values()))


def _aggregate_groups(df: pl.DataFrame) -> pl.DataFrame:
    """Sum detailed-category values into broad groups, one row per authority.

    Args:
        df: Raw CSV contents; must provide the ``Ethnicity_type``,
            ``Ethnicity``, ``Geography_code`` and ``Value1`` columns.

    Returns:
        A wide frame with ``Geography_code`` plus one column per group, in
        ``_GROUP_ORDER`` order and sorted by ``Geography_code``.

    Raises:
        ValueError: If no rows match the "ONS 2021 19+1" breakdown.

    An ``Ethnicity`` value missing from ``_GROUP_MAP`` makes polars raise,
    because ``replace_strict`` is used without a default.
    """
    # Use the detailed 19+1 breakdown to get sub-categories for Asian
    # ethnicity, then aggregate back to the broad groups plus the
    # South Asian / East Asian split.
    detailed = df.filter(
        (pl.col("Ethnicity_type") == "ONS 2021 19+1")
        & (pl.col("Ethnicity") != "All")
    )
    if detailed.is_empty():
        # Fail early with a clear message; otherwise the later arithmetic
        # breaks in a confusing way because there are no group columns.
        raise ValueError(
            "No rows with Ethnicity_type 'ONS 2021 19+1' found in the source data"
        )

    detailed = detailed.with_columns(
        pl.col("Ethnicity").replace_strict(_GROUP_MAP).alias("group"),
    )

    # Sum percentages within each group per local authority (full precision).
    grouped = detailed.group_by(_KEY_COL, "group").agg(pl.col("Value1").sum())
    wide = grouped.pivot(on="group", index=_KEY_COL, values="Value1")

    present = [g for g in _GROUP_ORDER if g in wide.columns]
    return wide.select(_KEY_COL, *present).sort(_KEY_COL)


def _normalize_and_round(wide: pl.DataFrame) -> pl.DataFrame:
    """Scale each row to 100, round to 1 decimal, and fix the rounding drift.

    Independent rounding of 6 values can drift by up to +/-0.3, so after
    rounding, the residual (100 minus the rounded row sum) is added to the
    largest group of that row so the row still totals 100.

    Args:
        wide: Frame with ``Geography_code`` and at least one group column.

    Returns:
        The same frame with every group column replaced by its adjusted,
        1-decimal percentage.
    """
    group_cols = [c for c in wide.columns if c != _KEY_COL]

    # Scale each group so the row sums to 100, then round to 1 decimal.
    row_total = sum(pl.col(c) for c in group_cols)
    wide = wide.with_columns(
        [(pl.col(c) / row_total * 100.0).round(1).alias(c) for c in group_cols]
    )

    rounded_sum = sum(pl.col(c) for c in group_cols)
    residual = (100.0 - rounded_sum).round(1)

    # Index (within group_cols) of the largest group in each row.
    largest_idx = pl.concat_list(group_cols).list.arg_max()

    # All expressions in one with_columns see the pre-adjustment values, so
    # the residual and the arg_max are consistent across columns. The final
    # round(1) removes float noise introduced by the addition.
    return wide.with_columns(
        [
            pl.when(largest_idx == i)
            .then(pl.col(c) + residual)
            .otherwise(pl.col(c))
            .round(1)
            .alias(c)
            for i, c in enumerate(group_cols)
        ]
    )


def download_and_convert(output_path: Path) -> None:
    """Download the ethnicity CSV, aggregate it, and write it as Parquet.

    Args:
        output_path: Destination of the zstd-compressed Parquet file.

    Raises:
        httpx.HTTPStatusError: If the download returns an error status.
        ValueError: If the CSV has no "ONS 2021 19+1" rows.
    """
    print("Downloading ethnicity data...")
    response = httpx.get(URL, follow_redirects=True, timeout=60)
    response.raise_for_status()

    df = pl.read_csv(response.content)
    print(f"Raw shape: {df.shape}")
    print(df.head())

    wide = _normalize_and_round(_aggregate_groups(df))

    # Rename columns to be descriptive.
    rename_map = {col: f"% {col}" for col in wide.columns if col != _KEY_COL}
    wide = wide.rename(rename_map)

    print(f"Output shape: {wide.shape}")
    print(f"Columns: {wide.columns}")

    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")


def main() -> None:
    """Parse the command line and run the download/convert pipeline."""
    parser = argparse.ArgumentParser(
        description="Download and convert ethnicity by local authority data"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    args = parser.parse_args()
    download_and_convert(args.output)


if __name__ == "__main__":
    main()