"""Download Census 2021 median age by LSOA. Downloads five-year age band counts (TS007A) from the NOMIS API, then computes the median age per LSOA using linear interpolation within the median class. Source: NOMIS (ONS Census 2021 — TS007A dataset, NM_2020_1) License: Open Government Licence v3.0 """ import argparse from io import BytesIO from pathlib import Path import httpx import polars as pl # NOMIS API: Census 2021 TS007A (age by five-year bands) by LSOA 2021 (TYPE151) # c2021_age_19=1..18 selects 18 five-year bands (excluding 0 = Total) # measures=20100 selects absolute count BASE_URL = "https://www.nomisweb.co.uk/api/v01/dataset/NM_2020_1.data.csv?date=latest&geography=TYPE151&c2021_age_19=1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18&measures=20100&select=GEOGRAPHY_CODE,C2021_AGE_19_NAME,OBS_VALUE" PAGE_SIZE = 25000 # Five-year age bands in order, with lower bounds for interpolation. # The last band (85+) is open-ended — we treat it as 85-89 for median purposes. AGE_BANDS = [ (0, 5), # Aged 0 to 4 years (5, 5), # Aged 5 to 9 years (10, 5), # Aged 10 to 14 years (15, 5), # Aged 15 to 19 years (20, 5), # Aged 20 to 24 years (25, 5), # Aged 25 to 29 years (30, 5), # Aged 30 to 34 years (35, 5), # Aged 35 to 39 years (40, 5), # Aged 40 to 44 years (45, 5), # Aged 45 to 49 years (50, 5), # Aged 50 to 54 years (55, 5), # Aged 55 to 59 years (60, 5), # Aged 60 to 64 years (65, 5), # Aged 65 to 69 years (70, 5), # Aged 70 to 74 years (75, 5), # Aged 75 to 79 years (80, 5), # Aged 80 to 84 years (85, 5), # Aged 85 years and over ] def compute_median_age(counts: list[int]) -> float: """Compute median age from five-year band counts using linear interpolation.""" total = sum(counts) if total == 0: return float("nan") half = total / 2 cumulative = 0 for i, count in enumerate(counts): if cumulative + count >= half: lower_bound, width = AGE_BANDS[i] # Linear interpolation within the median band return lower_bound + ((half - cumulative) / count) * width cumulative += count return float("nan") def download_and_convert(output_path: Path) -> None: print("Downloading Census 2021 age by five-year bands from NOMIS...") frames = [] offset = 0 while True: url = f"{BASE_URL}&recordoffset={offset}" response = httpx.get(url, follow_redirects=True, timeout=120) response.raise_for_status() if len(response.content) == 0: break chunk = pl.read_csv(BytesIO(response.content)) if chunk.height == 0: break frames.append(chunk) print(f" Fetched {chunk.height} rows (offset={offset})") if chunk.height < PAGE_SIZE: break offset += PAGE_SIZE df = pl.concat(frames) print(f"Total rows: {df.height}") # Filter to England only df = df.filter(pl.col("GEOGRAPHY_CODE").str.starts_with("E")) # Pivot: one row per LSOA, columns = age band names, values = counts pivoted = df.pivot( on="C2021_AGE_19_NAME", index="GEOGRAPHY_CODE", values="OBS_VALUE", ) # Extract age band columns in order and compute median # NOMIS returns band names like "Aged 0 to 4 years", "Aged 85 years and over" band_cols = [c for c in pivoted.columns if c != "GEOGRAPHY_CODE"] # Sort by the lower bound of each band band_cols.sort(key=lambda c: int(c.split()[1])) print(f"Age bands found: {len(band_cols)}") print(f" First: {band_cols[0]}") print(f" Last: {band_cols[-1]}") # Compute median age per LSOA rows = pivoted.select("GEOGRAPHY_CODE", *band_cols).to_dicts() medians = [] for row in rows: counts = [row[col] for col in band_cols] median = compute_median_age(counts) medians.append( {"lsoa21": row["GEOGRAPHY_CODE"], "median_age": round(median, 1)} ) result = pl.DataFrame(medians).with_columns( pl.col("median_age").cast(pl.Float32), ) print(f"England LSOAs: {result.height}") print( f"Median age range: {result['median_age'].min()} - {result['median_age'].max()}" ) print(f"Mean of medians: {result['median_age'].mean():.1f}") output_path.parent.mkdir(parents=True, exist_ok=True) result.write_parquet(output_path, compression="zstd") print(f"Saved to {output_path}") def main() -> None: parser = argparse.ArgumentParser( description="Download Census 2021 median age by LSOA" ) parser.add_argument( "--output", type=Path, required=True, help="Output parquet file path" ) args = parser.parse_args() download_and_convert(args.output) if __name__ == "__main__": main()