perfect-postcode/pipeline/download/rental_prices.py

"""Download ONS Price Index of Private Rents (PIPR) monthly price statistics.

Provides mean monthly private rent by local authority and bedroom count.
Replaces the discontinued Private Rental Market Summary Statistics.

Source: https://www.ons.gov.uk/economy/inflationandpriceindices/datasets/priceindexofprivaterentsukmonthlypricestatistics
License: Open Government Licence v3.0
"""

import argparse
import tempfile
from pathlib import Path

import polars as pl

from pipeline.local_temp import local_tmp_dir
from pipeline.utils import download

# Direct link to the PIPR monthly price statistics workbook (.xlsx). The path
# embeds a release folder ("25march2026"), so this points at one specific
# edition. NOTE(review): presumably needs bumping for each new ONS release —
# confirm against the dataset page listed in the module docstring.
URL = "https://www.ons.gov.uk/file?uri=/economy/inflationandpriceindices/datasets/priceindexofprivaterentsukmonthlypricestatistics/25march2026/priceindexofprivaterentsukmonthlypricestatistics.xlsx"

# Area-code prefixes of local authority districts in England. Rows whose
# area code starts with none of these are dropped by the local-authority
# filter in _latest_rents_long.
LA_PREFIXES = ("E06", "E07", "E08", "E09")

# Maps PIPR area code -> IoD area code.
# ONS PIPR uses newer ONS codes for the 2026 South Yorkshire boundary/code
# update while IoD 2025 still carries the predecessor codes. Rows for each
# key are duplicated under the mapped IoD code so downstream joins are
# complete without inventing rents.
AREA_CODE_ALIASES = {
    "E08000038": "E08000016",  # Barnsley
    "E08000039": "E08000019",  # Sheffield
}


def _data_rows(df: pl.DataFrame) -> pl.DataFrame:
    """Return everything below the header row of Table 1.

    ONS changes the number of preamble lines (title, an optional
    "This worksheet contains..." note) from release to release, so a fixed
    row offset would sometimes leave the header inside the data. The header
    is instead found by content: the first row whose ``column_1`` cell,
    trimmed and lower-cased, reads "time period".

    Raises:
        ValueError: If no row in ``column_1`` matches the header text.
    """
    first_cell = (
        pl.col("column_1").cast(pl.String).str.strip_chars().str.to_lowercase()
    )
    matches = df.with_row_index("_row").filter(first_cell == "time period")
    if matches.height == 0:
        raise ValueError("PIPR Table 1: no 'Time period' header row found")

    # Only the first match counts; data starts on the row right after it.
    header_index = int(matches["_row"][0])
    return df.slice(header_index + 1)


def _latest_rents_long(df: pl.DataFrame) -> pl.DataFrame:
    """Reshape raw PIPR Table 1 into long-format rents for the latest month.

    Args:
        df: Table 1 as read without a header row, i.e. with positional
            column names (``column_1``, ``column_2``, ...) and the preamble
            and header rows still present.

    Returns:
        A frame with one row per (``area_code``, ``bedrooms``) holding
        ``area_code``, ``area_name``, ``mean_monthly_rent`` (Float32, null
        where the source cell is not numeric) and ``bedrooms`` (UInt8, 0-4).
        Only English local authorities in the latest time period are kept,
        plus duplicates of the rows listed in ``AREA_CODE_ALIASES`` under
        their alias codes. Row order is deterministic: bedroom categories in
        ascending order, each in source order, then any alias rows.

    Raises:
        ValueError: If the header row cannot be located, or if no English
            local authority row with a time period is found (typically a
            sign that the sheet layout has changed).
    """
    # Table 1 layout below the header: 40 columns in repeating blocks of 4
    # (index, monthly change, annual change, rental price) for each category.
    # Rental price columns (0-indexed):
    #   7 = All categories, 11 = One bed, 15 = Two bed, 19 = Three bed,
    #   23 = Four or more bed
    # polars names positional columns from 1, hence column_12/16/20/24.
    df = _data_rows(df).select(
        pl.col("column_1").alias("time_period"),
        pl.col("column_2").alias("area_code"),
        pl.col("column_3").alias("area_name"),
        # strict=False turns non-numeric cells into nulls instead of raising.
        pl.col("column_12").cast(pl.Float32, strict=False).alias("rent_1bed"),
        pl.col("column_16").cast(pl.Float32, strict=False).alias("rent_2bed"),
        pl.col("column_20").cast(pl.Float32, strict=False).alias("rent_3bed"),
        pl.col("column_24").cast(pl.Float32, strict=False).alias("rent_4plus"),
    )

    # Filter to English local authorities
    df = df.filter(
        pl.any_horizontal(pl.col("area_code").str.starts_with(p) for p in LA_PREFIXES)
    )

    # Use only the latest month.
    # NOTE(review): max() orders by the column's dtype; if time periods arrive
    # as text such as "Jan-2025" this comparison is lexicographic, not
    # chronological — confirm how the workbook's first column is read.
    latest = df["time_period"].max()
    if latest is None:
        # Nothing survived the filter (or every time period is null). Fail
        # loudly rather than returning an empty frame that would be written
        # out and silently blank every downstream join.
        raise ValueError(
            "PIPR Table 1: no English local authority rows with a time period found"
        )
    print(f"Latest month in data: {latest}")
    df = df.filter(pl.col("time_period") == latest)
    print(f"LAs in latest month: {df.height}")

    # Melt to long format: one row per area x bedroom count.
    # PIPR has no Studio category — one-bed rent used as proxy for bedrooms=0.
    rent_columns = (
        ("rent_1bed", 0),  # Studio (proxy)
        ("rent_1bed", 1),
        ("rent_2bed", 2),
        ("rent_3bed", 3),
        ("rent_4plus", 4),
    )
    combined = pl.concat(
        [
            df.select(
                pl.col("area_code"),
                pl.col("area_name"),
                pl.col(col).alias("mean_monthly_rent"),
                pl.lit(bedrooms).cast(pl.UInt8).alias("bedrooms"),
            )
            for col, bedrooms in rent_columns
        ]
    )

    # Copy the rows of each aliased area under its alias code. The copies are
    # appended after the genuine rows so that, should the alias code already
    # be present in the data, keep="first" below prefers the genuine row.
    alias_rows = [
        combined.filter(pl.col("area_code") == source_code).with_columns(
            pl.lit(alias_code).alias("area_code")
        )
        for source_code, alias_code in AREA_CODE_ALIASES.items()
    ]
    if alias_rows:
        combined = pl.concat([combined, *alias_rows])

    # maintain_order=True keeps the output row order stable between runs;
    # without it the order of the de-duplicated rows is unspecified.
    return combined.unique(
        ["area_code", "bedrooms"], keep="first", maintain_order=True
    )


def convert_to_parquet(xlsx_path: Path, parquet_path: Path) -> None:
    """Convert the downloaded PIPR workbook into a long-format rents parquet.

    Reads the "Table 1" sheet without treating any row as a header, reshapes
    it with ``_latest_rents_long``, prints a short summary and writes the
    result to ``parquet_path`` with zstd compression. Missing parent
    directories of ``parquet_path`` are created.
    """
    print("Reading PIPR Excel file (Table 1)...")
    table_1 = pl.read_excel(xlsx_path, sheet_name="Table 1", has_header=False)
    rents_long = _latest_rents_long(table_1)

    # Summary for the run log: overall shape, how many rents are populated,
    # and a preview of the first rows.
    print(f"Combined: {rents_long.shape}")
    populated = rents_long["mean_monthly_rent"].drop_nulls()
    print(f"Non-null rents: {populated.len()}")
    print(rents_long.head(10))

    output_dir = parquet_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)
    rents_long.write_parquet(parquet_path, compression="zstd")
    print(f"Saved to {parquet_path}")


def main() -> None:
    """Command-line entry point: download the workbook and write the parquet.

    Takes a required ``--output`` path. The workbook is downloaded into a
    temporary directory created under ``local_tmp_dir()`` and removed again
    when the conversion has finished.
    """
    cli = argparse.ArgumentParser(
        description="Download ONS private rent monthly price statistics"
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    output_path = cli.parse_args().output

    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as scratch:
        workbook = Path(scratch) / "pipr_monthly.xlsx"
        download(URL, workbook, timeout=120)
        convert_to_parquet(workbook, output_path)


if __name__ == "__main__":
    main()