# perfect-postcode/pipeline/download/rental_prices.py

import argparse
import tempfile

import polars as pl
from pathlib import Path

from pipeline.utils import download

# Download location of the ONS private rental market statistics workbook
# (October 2022 to September 2023 edition, legacy .xls format).
# Adjacent literals are concatenated by the parser into one URL string.
URL = (
    "https://www.ons.gov.uk/file?uri="
    "/peoplepopulationandcommunity/housing/datasets"
    "/privaterentalmarketsummarystatisticsinengland"
    "/october2022toseptember2023"
    "/privaterentalmarketstatistics231220.xls"
)

# Workbook sheet id -> bedroom count for the LA-level breakdown sheets:
#   12 -> 0 (Studio)
#   13 -> 1 (One Bedroom)
#   14 -> 2 (Two Bedrooms)
#   15 -> 3 (Three Bedrooms)
#   16 -> 4 (Four or more Bedrooms)
# Sheet 11 ("Room") covers rooms in shared houses, which are not
# self-contained, so it is deliberately left out.
BEDROOM_SHEETS = dict(zip(range(12, 17), range(5)))

# Prefixes of local authority district codes in England
# (see https://en.wikipedia.org/wiki/ONS_coding_system).
# Expands to ("E06", "E07", "E08", "E09").
LA_PREFIXES = tuple(f"E0{digit}" for digit in range(6, 10))

# Old district code -> new unitary authority code, covering the April 2021
# and April 2023 local authority reorganizations.
# The ONS rental data (Oct 2022 – Sep 2023) uses the old codes; IoD 2025 uses
# the new ones. Old codes are remapped to the new ones and their medians
# averaged so that the join in merge.py works.
# The table below is grouped by successor authority and flattened into the
# old -> new lookup.
LA_CONSOLIDATION = {
    old_code: new_code
    for new_code, old_codes in (
        # North Northamptonshire (April 2021):
        # Corby, East Northamptonshire, Kettering, Wellingborough
        ("E06000061", ("E07000150", "E07000152", "E07000153", "E07000156")),
        # West Northamptonshire (April 2021):
        # Daventry, Northampton, South Northamptonshire
        ("E06000062", ("E07000151", "E07000154", "E07000155")),
        # Cumberland (April 2023):
        # Allerdale, Carlisle, Copeland
        ("E06000063", ("E07000026", "E07000028", "E07000029")),
        # Westmorland and Furness (April 2023):
        # Barrow-in-Furness, Eden, South Lakeland
        ("E06000064", ("E07000027", "E07000030", "E07000031")),
        # North Yorkshire (April 2023):
        # Craven, Hambleton, Harrogate, Richmondshire, Ryedale, Scarborough, Selby
        (
            "E06000065",
            (
                "E07000163",
                "E07000164",
                "E07000165",
                "E07000166",
                "E07000167",
                "E07000168",
                "E07000169",
            ),
        ),
        # Somerset (April 2023):
        # Mendip, Sedgemoor, South Somerset, Somerset West and Taunton
        ("E06000066", ("E07000187", "E07000188", "E07000189", "E07000246")),
    )
    for old_code in old_codes
}


def _read_sheet(xls_path: Path, sheet_id: int, bedrooms: int) -> pl.DataFrame:
    """Load one bedroom-category sheet and keep its LA-level median rents.

    Args:
        xls_path: Path of the ONS workbook.
        sheet_id: Sheet to read, passed straight to ``pl.read_excel``.
        bedrooms: Bedroom count stamped onto every returned row.

    Returns:
        A frame with columns ``area_code``, ``median_monthly_rent``
        (Float32; values that cannot be parsed become null) and
        ``bedrooms`` (UInt8), restricted to rows whose area code starts
        with one of ``LA_PREFIXES``.
    """
    raw = pl.read_excel(xls_path, sheet_id=sheet_id)

    # The sheet's columns are unnamed, so they are addressed by position:
    #   0=LA Code, 1=Area Code, 2=Area Name, 3=Count, 4=Mean, 5=LQ,
    #   6=Median, 7=UQ
    # The first 4 rows hold the title, notes, bedroom label and column
    # headers rather than data, so they are dropped.
    body = raw.slice(4)
    code_name, median_name = body.columns[1], body.columns[6]

    picked = body.select(
        pl.col(code_name).alias("area_code"),
        pl.col(median_name).alias("median_monthly_rent"),
    )

    # Keep only rows carrying a local-authority code.
    area_code = pl.col("area_code")
    has_la_prefix = pl.any_horizontal(
        [area_code.str.starts_with(prefix) for prefix in LA_PREFIXES]
    )
    la_rows = picked.filter(area_code.is_not_null() & has_la_prefix)

    return la_rows.with_columns(
        # Suppressed values appear as ".."; the non-strict cast nulls them.
        pl.col("median_monthly_rent").cast(pl.Float32, strict=False),
        pl.lit(bedrooms).cast(pl.UInt8).alias("bedrooms"),
    )


def convert_to_parquet(xls_path: Path, parquet_path: Path) -> None:
    """Combine the per-bedroom sheets into one parquet of LA-level median rents.

    Every sheet listed in ``BEDROOM_SHEETS`` is read, superseded district
    codes are remapped through ``LA_CONSOLIDATION``, and the medians of
    districts that end up sharing a code are averaged. The output has one
    row per ``(area_code, bedrooms)`` pair, sorted by those two columns so
    the row order is the same on every run.

    Args:
        xls_path: Path of the downloaded ONS workbook.
        parquet_path: Destination parquet file. Missing parent directories
            are created.
    """
    frames = []
    for sheet_id, bedrooms in BEDROOM_SHEETS.items():
        df = _read_sheet(xls_path, sheet_id, bedrooms)
        print(f"  Sheet {sheet_id} (bedrooms={bedrooms}): {df.height} rows")
        frames.append(df)

    combined = (
        pl.concat(frames)
        # Remap old LA codes to the new unitary authority codes.
        .with_columns(pl.col("area_code").replace(LA_CONSOLIDATION))
        .group_by("area_code", "bedrooms")
        # Unweighted mean of the merged districts' medians; null (suppressed)
        # medians are ignored by the mean.
        .agg(pl.col("median_monthly_rent").mean())
        # group_by gives no ordering guarantee; sort so the written file and
        # the preview below are reproducible between runs.
        .sort("area_code", "bedrooms")
    )

    print(f"Combined: {combined.shape}")
    print(f"Non-null medians: {combined['median_monthly_rent'].drop_nulls().len()}")
    print(combined.head(10))

    # Don't lose a completed download/conversion to a missing output folder.
    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    combined.write_parquet(parquet_path, compression="zstd")
    print(f"Saved to {parquet_path}")


def main() -> None:
    """CLI entry point: fetch the ONS workbook and convert it to parquet.

    The workbook is downloaded into a temporary directory that is removed
    once the conversion has finished. The parquet is written to the path
    given by the required ``--output`` argument.
    """
    cli = argparse.ArgumentParser(
        description="Download and convert ONS private rental market statistics"
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    output_path = cli.parse_args().output

    with tempfile.TemporaryDirectory() as scratch_dir:
        workbook = Path(scratch_dir, "rental_prices.xls")
        download(URL, workbook, timeout=60)
        convert_to_parquet(workbook, output_path)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()