"""Download ethnicity-by-local-authority data and save it as a wide parquet file."""

import argparse
from pathlib import Path

import httpx
import polars as pl

# A negative column limit tells polars to print every column of a table,
# which keeps the raw-data preview below from being truncated horizontally.
pl.Config.set_tbl_cols(-1)

URL = "https://www.ethnicity-facts-figures.service.gov.uk/uk-population-by-ethnicity/national-and-regional-populations/regional-ethnic-diversity/latest/downloads/population-by-ethnicity-and-local-authority-2021.csv"


def _to_wide_percentages(df: pl.DataFrame) -> pl.DataFrame:
    """Filter to the broad ethnicity categories and pivot to one row per area.

    Args:
        df: Raw table with at least the columns ``Ethnicity_type``,
            ``Ethnicity``, ``Geography_code`` and ``Value1``.

    Returns:
        A frame keyed by ``Geography_code`` with one ``"% <ethnicity>"``
        column per remaining ``Ethnicity`` value, filled from ``Value1``.
    """
    # Keep only broad ethnicity categories (5+1), exclude "All" totals
    broad = df.filter(
        (pl.col("Ethnicity_type") == "ONS 2021 5+1")
        & (pl.col("Ethnicity") != "All")
    )

    # Pivot: one row per local authority, columns = ethnicity percentages
    wide = broad.pivot(
        on="Ethnicity",
        index="Geography_code",
        values="Value1",
    )

    # Rename columns to be descriptive; the index column keeps its name.
    rename_map = {
        col: f"% {col}" for col in wide.columns if col != "Geography_code"
    }
    return wide.rename(rename_map)


def download_and_convert(output_path: Path) -> None:
    """Download the ethnicity CSV from ``URL`` and write it as wide parquet.

    Progress and diagnostic information is printed to stdout.

    Args:
        output_path: Destination path of the zstd-compressed parquet file.

    Raises:
        httpx.HTTPStatusError: If the download returns an error status
            (raised by ``response.raise_for_status()``).
    """
    print("Downloading ethnicity data...")
    response = httpx.get(URL, follow_redirects=True, timeout=60)
    response.raise_for_status()

    df = pl.read_csv(response.content)
    # Report the real shape under the "shape" label; the row preview that was
    # previously printed under that label is kept, but labelled accurately.
    print(f"Raw shape: {df.shape}")
    print(f"Raw preview:\n{df.head(100)}")

    wide = _to_wide_percentages(df)

    print(f"Output shape: {wide.shape}")
    print(f"Columns: {wide.columns}")

    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")


def main() -> None:
    """Parse command-line arguments and run the download/convert step."""
    parser = argparse.ArgumentParser(
        description="Download and convert ethnicity by local authority data"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    args = parser.parse_args()
    download_and_convert(args.output)


if __name__ == "__main__":
    main()