# perfect-postcode/pipeline/download/median_age.py

"""Download Census 2021 median age by LSOA.

Downloads five-year age band counts (TS007A) from the NOMIS API, then computes
the median age per LSOA using linear interpolation within the median class.

Source: NOMIS (ONS Census 2021 — TS007A dataset, NM_2020_1)
License: Open Government Licence v3.0
"""

import argparse
from io import BytesIO
from pathlib import Path

import httpx
import polars as pl

# NOMIS API query for Census 2021 TS007A (age by five-year bands) at LSOA 2021
# level. The URL is assembled from one fragment per query parameter:
#   geography=TYPE151      -> LSOA 2021 geographies
#   c2021_age_19=1..18     -> the 18 five-year bands (0 = Total is excluded)
#   measures=20100         -> absolute counts
#   select=...             -> only the columns the pipeline reads
BASE_URL = (
    "https://www.nomisweb.co.uk/api/v01/dataset/NM_2020_1.data.csv"
    "?date=latest"
    "&geography=TYPE151"
    "&c2021_age_19=1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18"
    "&measures=20100"
    "&select=GEOGRAPHY_CODE,C2021_AGE_19_NAME,OBS_VALUE"
)

# Rows per page: the step by which recordoffset is advanced, and the size below
# which a page is taken to be the last one.
PAGE_SIZE = 25000

# (lower_bound, width) for each of the 18 five-year age bands, youngest first:
# 0-4, 5-9, ..., 80-84, 85+.
# The final band (85 and over) is open-ended; for median interpolation it is
# given the same five-year width as the others, i.e. treated as 85-89.
AGE_BANDS = [(lower_bound, 5) for lower_bound in range(0, 90, 5)]


def compute_median_age(
    counts: list[int],
    bands: "list[tuple[int, int]] | None" = None,
) -> float:
    """Compute the median age from grouped band counts by linear interpolation.

    The median class is the first band at which the running total reaches half
    of the population. The median is then placed inside that band on the
    assumption that ages are spread evenly across it::

        lower_bound + (half - count_below) / count_in_band * width

    Args:
        counts: Population count per band, ordered youngest to oldest and
            aligned positionally with ``bands``.
        bands: ``(lower_bound, width)`` for each band, in the same order as
            ``counts``. Defaults to the module-level ``AGE_BANDS``.

    Returns:
        The interpolated median age, or NaN when the total count is zero
        (which includes an empty ``counts``).

    Raises:
        IndexError: If the median class sits at a position that has no entry
            in ``bands``.
    """
    if bands is None:
        bands = AGE_BANDS

    total = sum(counts)
    if total == 0:
        return float("nan")

    half = total / 2
    cumulative = 0  # population in all bands below the current one
    for i, count in enumerate(counts):
        if cumulative + count >= half:
            lower_bound, width = bands[i]
            # Fraction of the way through the median band at which the
            # half-population mark is reached, scaled by the band width.
            return lower_bound + ((half - cumulative) / count) * width
        cumulative += count

    # Not reached for non-negative counts (the running total always reaches
    # half of the total); kept as a defensive fallback.
    return float("nan")


def _fetch_band_counts() -> pl.DataFrame:
    """Download every page of the NOMIS age-band extract as one DataFrame.

    Pages are requested by increasing ``recordoffset`` until an empty or short
    page is returned.

    Raises:
        httpx.HTTPStatusError: If NOMIS answers a page request with an error
            status.
        ValueError: If no rows at all were returned.
    """
    frames = []
    offset = 0
    # A single client is shared across pages so the connection is reused
    # instead of being re-established for every request.
    with httpx.Client(follow_redirects=True, timeout=120) as client:
        while True:
            response = client.get(f"{BASE_URL}&recordoffset={offset}")
            response.raise_for_status()
            if not response.content:
                break
            chunk = pl.read_csv(BytesIO(response.content))
            if chunk.height == 0:
                break
            frames.append(chunk)
            print(f"  Fetched {chunk.height} rows (offset={offset})")
            # NOTE(review): assumes NOMIS serves exactly PAGE_SIZE rows per
            # full page; confirm against the API's record limit.
            if chunk.height < PAGE_SIZE:
                break
            offset += PAGE_SIZE

    if not frames:
        raise ValueError("NOMIS returned no rows for the TS007A age band query")
    return pl.concat(frames)


def download_and_convert(output_path: Path) -> None:
    """Download age-band counts, compute median age per LSOA, write parquet.

    The output has one row per England LSOA with columns ``lsoa21`` (the
    geography code) and ``median_age`` (Float32, rounded to one decimal place).

    Args:
        output_path: Destination parquet file. Missing parent directories are
            created.

    Raises:
        httpx.HTTPStatusError: If a NOMIS request fails with an error status.
        ValueError: If NOMIS returns no rows, or the number of age bands in the
            data differs from ``AGE_BANDS``.
    """
    print("Downloading Census 2021 age by five-year bands from NOMIS...")
    df = _fetch_band_counts()
    print(f"Total rows: {df.height}")

    # Filter to England only
    df = df.filter(pl.col("GEOGRAPHY_CODE").str.starts_with("E"))

    # Pivot: one row per LSOA, columns = age band names, values = counts
    pivoted = df.pivot(
        on="C2021_AGE_19_NAME",
        index="GEOGRAPHY_CODE",
        values="OBS_VALUE",
    )

    # Order the band columns youngest to oldest using the first number in the
    # band name (the second whitespace-separated token, e.g. "Aged 85 years
    # and over" -> 85).
    band_cols = [c for c in pivoted.columns if c != "GEOGRAPHY_CODE"]
    band_cols.sort(key=lambda c: int(c.split()[1]))

    print(f"Age bands found: {len(band_cols)}")
    # Counts are matched to AGE_BANDS by position, so a different number of
    # bands would silently produce wrong medians; fail loudly instead.
    if len(band_cols) != len(AGE_BANDS):
        raise ValueError(
            f"Expected {len(AGE_BANDS)} age bands from NOMIS, "
            f"got {len(band_cols)}: {band_cols}"
        )
    print(f"  First: {band_cols[0]}")
    print(f"  Last:  {band_cols[-1]}")

    # Compute median age per LSOA
    rows = pivoted.select("GEOGRAPHY_CODE", *band_cols).to_dicts()
    medians = [
        {
            "lsoa21": row["GEOGRAPHY_CODE"],
            "median_age": round(
                compute_median_age([row[col] for col in band_cols]), 1
            ),
        }
        for row in rows
    ]

    result = pl.DataFrame(medians).with_columns(
        pl.col("median_age").cast(pl.Float32),
    )

    print(f"England LSOAs: {result.height}")
    print(
        f"Median age range: {result['median_age'].min()} - {result['median_age'].max()}"
    )
    print(f"Mean of medians: {result['median_age'].mean():.1f}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    result.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")


def main() -> None:
    """Command-line entry point.

    Reads the required ``--output`` option (a parquet file path) and passes it
    to ``download_and_convert``.
    """
    cli = argparse.ArgumentParser(
        description="Download Census 2021 median age by LSOA",
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    options = cli.parse_args()
    destination = options.output
    download_and_convert(destination)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()