"""Download 2021 ethnicity-by-local-authority data and convert it to parquet.

The script fetches the CSV at ``URL``, collapses the detailed "ONS 2021 19+1"
ethnicity categories into six broad groups, recombines districts that were
merged into unitary authorities in 2023, and writes one row of percentages
per geography code to a zstd-compressed parquet file.
"""

import argparse
from pathlib import Path

import httpx
import polars as pl

# Show every column when a DataFrame is printed (used for the raw preview).
pl.Config.set_tbl_cols(-1)

URL = "https://www.ethnicity-facts-figures.service.gov.uk/uk-population-by-ethnicity/national-and-regional-populations/regional-ethnic-diversity/latest/downloads/population-by-ethnicity-and-local-authority-2021.csv"

# Old district codes -> code of the unitary authority that replaced them.
GEOGRAPHY_CODE_REPLACEMENTS = {
    # 2023 Cumberland unitary authority
    "E07000026": "E06000063",  # Allerdale
    "E07000028": "E06000063",  # Carlisle
    "E07000029": "E06000063",  # Copeland
    # 2023 Westmorland and Furness unitary authority
    "E07000027": "E06000064",  # Barrow-in-Furness
    "E07000030": "E06000064",  # Eden
    "E07000031": "E06000064",  # South Lakeland
    # 2023 North Yorkshire unitary authority
    "E07000163": "E06000065",  # Craven
    "E07000164": "E06000065",  # Hambleton
    "E07000165": "E06000065",  # Harrogate
    "E07000166": "E06000065",  # Richmondshire
    "E07000167": "E06000065",  # Ryedale
    "E07000168": "E06000065",  # Scarborough
    "E07000169": "E06000065",  # Selby
    # 2023 Somerset unitary authority
    "E07000187": "E06000066",  # Mendip
    "E07000188": "E06000066",  # Sedgemoor
    "E07000189": "E06000066",  # South Somerset
    "E07000246": "E06000066",  # Somerset West and Taunton
}

# Detailed "ONS 2021 19+1" category -> output group. The detailed breakdown is
# used (rather than the broad one) so Asian can be split into South / East.
ETHNICITY_GROUP_MAP = {
    # White
    "White British": "White",
    "White Irish": "White",
    "Gypsy Or Irish Traveller": "White",
    "Roma": "White",
    "Any Other White Background": "White",
    # South Asian
    "Indian": "South Asian",
    "Pakistani": "South Asian",
    "Bangladeshi": "South Asian",
    "Any Other Asian Background": "South Asian",
    # East Asian
    "Chinese": "East Asian",
    # Black
    "Black African": "Black",
    "Black Caribbean": "Black",
    "Any Other Black Background": "Black",
    # Mixed
    "Mixed White And Asian": "Mixed",
    "Mixed White And Black African": "Mixed",
    "Mixed White And Black Caribbean": "Mixed",
    "Any Other Mixed/Multiple Ethnic Background": "Mixed",
    # Other
    "Arab": "Other",
    "Any Other Ethnic Background": "Other",
}


def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame:
    """Return one row per geography with the percentage of each ethnic group.

    Args:
        df: Raw table with the columns ``Ethnicity_type``, ``Ethnicity``,
            ``Geography_code`` and ``Ethnic Population``.

    Returns:
        A frame sorted by ``Geography_code`` with one ``"% <group>"`` column
        per output group (alphabetical order). Values are rounded to one
        decimal place and adjusted so that each row sums to 100.

    Raises:
        polars.exceptions.InvalidOperationError: if a detailed ethnicity
            category is not present in ``ETHNICITY_GROUP_MAP``
            (``replace_strict`` refuses unmapped values).
    """
    detailed = df.filter(
        (pl.col("Ethnicity_type") == "ONS 2021 19+1")
        & (pl.col("Ethnicity") != "All")
    ).with_columns(
        pl.col("Ethnicity").replace_strict(ETHNICITY_GROUP_MAP).alias("group"),
        pl.col("Geography_code")
        .replace(GEOGRAPHY_CODE_REPLACEMENTS)
        .alias("output_geography_code"),
        # strict=False: unparseable counts become null and are skipped by sum().
        pl.col("Ethnic Population")
        .cast(pl.Float64, strict=False)
        .alias("_population"),
    )

    # Sum counts, not rounded percentages, so old districts can be safely
    # recombined into their current unitary authorities.
    grouped = detailed.group_by("output_geography_code", "group").agg(
        pl.col("_population").sum()
    )

    wide = grouped.pivot(
        on="group", index="output_geography_code", values="_population"
    ).rename({"output_geography_code": "Geography_code"})

    # group_by gives no ordering guarantee and pivot creates columns in order
    # of discovery, so fix both column and row order explicitly. This keeps
    # the output file (and the arg_max tie-break below) reproducible.
    group_cols = sorted(c for c in wide.columns if c != "Geography_code")
    wide = wide.select("Geography_code", *group_cols).sort("Geography_code")

    # A geography with no rows for some group pivots to null; treat that as a
    # zero count so it does not turn the whole row's percentages into null.
    wide = wide.with_columns(pl.col(group_cols).fill_null(0.0))

    # Scale each group so the row sums to 100. All expressions in a single
    # with_columns see the pre-update values, so row_total uses raw counts.
    row_total = pl.sum_horizontal(group_cols)
    wide = wide.with_columns(
        [(pl.col(c) / row_total * 100.0).alias(c) for c in group_cols]
    )

    # Independent rounding of six values can drift the row sum by a few
    # tenths. Round to 1 decimal, then let the largest group absorb the
    # whole residual so each row still sums to 100.
    wide = wide.with_columns(pl.col(group_cols).round(1))
    residual = (100.0 - pl.sum_horizontal(group_cols)).round(1)
    largest_idx = pl.concat_list(group_cols).list.arg_max()
    wide = wide.with_columns(
        [
            pl.when(largest_idx == i)
            # Re-round to drop float artefacts such as 81.30000000000001.
            .then((pl.col(c) + residual).round(1))
            .otherwise(pl.col(c))
            .alias(c)
            for i, c in enumerate(group_cols)
        ]
    )

    # Rename columns to be descriptive.
    return wide.rename({c: f"% {c}" for c in group_cols})


def download_and_convert(output_path: Path) -> None:
    """Download the source CSV, convert it and write it to ``output_path``.

    Args:
        output_path: Destination of the zstd-compressed parquet file.

    Raises:
        httpx.HTTPStatusError: if the download returns an error status.
    """
    print("Downloading ethnicity data...")
    response = httpx.get(URL, follow_redirects=True, timeout=60)
    response.raise_for_status()

    df = pl.read_csv(response.content)
    print(f"Raw shape: {df.shape}")
    print("Raw preview:")
    print(df.head(100))

    wide = _ethnicity_percentages(df)
    print(f"Output shape: {wide.shape}")
    print(f"Columns: {wide.columns}")

    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")


def main() -> None:
    """Parse command-line arguments and run the download/convert pipeline."""
    parser = argparse.ArgumentParser(
        description="Download and convert ethnicity by local authority data"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    args = parser.parse_args()
    download_and_convert(args.output)


if __name__ == "__main__":
    main()