"""Download ONS Price Index of Private Rents (PIPR) monthly price statistics. Provides mean monthly private rent by local authority and bedroom count. Replaces the discontinued Private Rental Market Summary Statistics. Source: https://www.ons.gov.uk/economy/inflationandpriceindices/datasets/priceindexofprivaterentsukmonthlypricestatistics License: Open Government Licence v3.0 """ import argparse import tempfile from pathlib import Path import polars as pl from pipeline.utils import download URL = "https://www.ons.gov.uk/file?uri=/economy/inflationandpriceindices/datasets/priceindexofprivaterentsukmonthlypricestatistics/25march2026/priceindexofprivaterentsukmonthlypricestatistics.xlsx" # Local authority district codes in England LA_PREFIXES = ("E06", "E07", "E08", "E09") def convert_to_parquet(xlsx_path: Path, parquet_path: Path) -> None: print("Reading PIPR Excel file (Table 1)...") # Table 1 layout: row 0 = title, row 1 = column headers, row 2+ = data. # 40 columns in repeating blocks of 4 (index, monthly change, annual change, # rental price) for each category. Rental price columns (0-indexed): # 7 = All categories, 11 = One bed, 15 = Two bed, 19 = Three bed, # 23 = Four or more bed df = pl.read_excel(xlsx_path, sheet_name="Table 1", has_header=False) df = df.slice(2) # Skip title and header rows df = df.select( pl.col("column_1").alias("time_period"), pl.col("column_2").alias("area_code"), pl.col("column_12").cast(pl.Float32, strict=False).alias("rent_1bed"), pl.col("column_16").cast(pl.Float32, strict=False).alias("rent_2bed"), pl.col("column_20").cast(pl.Float32, strict=False).alias("rent_3bed"), pl.col("column_24").cast(pl.Float32, strict=False).alias("rent_4plus"), ) # Filter to English local authorities df = df.filter( pl.any_horizontal(pl.col("area_code").str.starts_with(p) for p in LA_PREFIXES) ) # Use only the latest month latest = df["time_period"].max() print(f"Latest month in data: {latest}") df = df.filter(pl.col("time_period") == latest) print(f"LAs in latest month: {df.height}") # Melt to long format: one row per area x bedroom count. # PIPR has no Studio category — one-bed rent used as proxy for bedrooms=0. frames = [] for col, bedrooms in [ ("rent_1bed", 0), # Studio (proxy) ("rent_1bed", 1), ("rent_2bed", 2), ("rent_3bed", 3), ("rent_4plus", 4), ]: frames.append( df.select( pl.col("area_code"), pl.col(col).alias("mean_monthly_rent"), pl.lit(bedrooms).cast(pl.UInt8).alias("bedrooms"), ) ) combined = pl.concat(frames) print(f"Combined: {combined.shape}") print(f"Non-null rents: {combined['mean_monthly_rent'].drop_nulls().len()}") print(combined.head(10)) parquet_path.parent.mkdir(parents=True, exist_ok=True) combined.write_parquet(parquet_path, compression="zstd") print(f"Saved to {parquet_path}") def main() -> None: parser = argparse.ArgumentParser( description="Download ONS private rent monthly price statistics" ) parser.add_argument( "--output", type=Path, required=True, help="Output parquet file path" ) args = parser.parse_args() with tempfile.TemporaryDirectory() as cache_dir: xlsx_path = Path(cache_dir) / "pipr_monthly.xlsx" download(URL, xlsx_path, timeout=120) convert_to_parquet(xlsx_path, args.output) if __name__ == "__main__": main()