perfect-postcode/pipeline/download/ethnicity.py
2026-02-07 19:13:36 +00:00

56 lines
1.6 KiB
Python

import argparse
from pathlib import Path
import httpx
import polars as pl
# Show every column whenever polars prints a table (per the polars Config
# docs, a negative value removes the column limit).
pl.Config.set_tbl_cols(-1)
# Source CSV hosted on ethnicity-facts-figures.service.gov.uk; the file name
# indicates population by ethnicity and local authority for 2021.
URL = "https://www.ethnicity-facts-figures.service.gov.uk/uk-population-by-ethnicity/national-and-regional-populations/regional-ethnic-diversity/latest/downloads/population-by-ethnicity-and-local-authority-2021.csv"
def download_and_convert(output_path: Path) -> None:
    """Download the ethnicity-by-local-authority CSV and save it as wide parquet.

    The CSV at ``URL`` is fetched over HTTP, reduced to the rows whose
    ``Ethnicity_type`` is ``"ONS 2021 5+1"`` (dropping the ``"All"`` total
    rows), and pivoted so that each ``Geography_code`` becomes one row with
    one ``"% <ethnicity>"`` column per ethnicity group, filled from the
    ``Value1`` column.

    Args:
        output_path: Destination parquet file. Missing parent directories
            are created before writing.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
    """
    print("Downloading ethnicity data...")
    response = httpx.get(URL, follow_redirects=True, timeout=60)
    response.raise_for_status()

    df = pl.read_csv(response.content)
    # Report the dimensions the label promises (matching "Output shape"
    # further down); the data preview goes on its own line.
    print(f"Raw shape: {df.shape}")
    print(df.head())

    # Keep only broad ethnicity categories (5+1), exclude "All" totals
    df = df.filter(
        (pl.col("Ethnicity_type") == "ONS 2021 5+1") & (pl.col("Ethnicity") != "All")
    )

    # Pivot: one row per local authority, one column per ethnicity group.
    # NOTE(review): Value1 is presumed to hold the percentage figure -- confirm
    # against the published CSV schema.
    wide = df.pivot(
        on="Ethnicity",
        index="Geography_code",
        values="Value1",
    )

    # Rename columns to be descriptive; the key column keeps its name.
    rename_map = {col: f"% {col}" for col in wide.columns if col != "Geography_code"}
    wide = wide.rename(rename_map)

    print(f"Output shape: {wide.shape}")
    print(f"Columns: {wide.columns}")

    # Writing fails if the target directory is missing, so create it first.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
def main() -> None:
    """Command-line entry point: read ``--output`` and run the conversion."""
    cli = argparse.ArgumentParser(
        description="Download and convert ethnicity by local authority data"
    )
    # The destination is mandatory and is parsed straight into a Path.
    cli.add_argument(
        "--output",
        required=True,
        type=Path,
        help="Output parquet file path",
    )
    destination = cli.parse_args().output
    download_and_convert(destination)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()