"""Download ONS Price Index of Private Rents (PIPR) monthly price statistics.

Provides mean monthly private rent by local authority and bedroom count.
Replaces the discontinued Private Rental Market Summary Statistics.

Source: https://www.ons.gov.uk/economy/inflationandpriceindices/datasets/priceindexofprivaterentsukmonthlypricestatistics
License: Open Government Licence v3.0
"""
|
|
|
|
import argparse
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import polars as pl
|
|
|
|
from pipeline.utils import download
|
|
|
|
# Direct download link for the PIPR monthly price statistics workbook (.xlsx).
# NOTE(review): the path contains a dated segment ("25march2026"), so this
# presumably pins one specific release and needs updating for newer editions
# -- confirm against the ONS dataset page.
URL = "https://www.ons.gov.uk/file?uri=/economy/inflationandpriceindices/datasets/priceindexofprivaterentsukmonthlypricestatistics/25march2026/priceindexofprivaterentsukmonthlypricestatistics.xlsx"

# Local authority district codes in England
# (area-code prefixes; rows whose area code starts with one of these are kept).
LA_PREFIXES = ("E06", "E07", "E08", "E09")
|
|
|
|
|
|
def convert_to_parquet(xlsx_path: Path, parquet_path: Path) -> None:
    """Convert "Table 1" of the PIPR workbook into a long-format parquet file.

    The sheet is read without headers, reduced to the time period, the area
    code and the four per-bedroom rental price columns, restricted to area
    codes starting with one of ``LA_PREFIXES`` and to the latest time period,
    then reshaped to one row per area and bedroom count.

    Output columns: ``area_code``, ``mean_monthly_rent`` (Float32; values that
    cannot be cast become null) and ``bedrooms`` (UInt8, 0-4, where 0 reuses
    the one-bed rent as a studio proxy).

    Args:
        xlsx_path: Path of the downloaded PIPR Excel workbook.
        parquet_path: Destination parquet file; missing parent directories
            are created.

    Raises:
        ValueError: If "Table 1" yields no row with a usable time period and
            an area code matching ``LA_PREFIXES``. Without this check an
            empty parquet file would be written silently.
    """
    print("Reading PIPR Excel file (Table 1)...")

    # Table 1 layout: row 0 = title, row 1 = column headers, row 2+ = data.
    # 40 columns in repeating blocks of 4 (index, monthly change, annual change,
    # rental price) for each category. Rental price columns (0-indexed):
    # 7 = All categories, 11 = One bed, 15 = Two bed, 19 = Three bed,
    # 23 = Four or more bed
    # With has_header=False polars names the columns column_1, column_2, ...
    # (1-based), so 0-indexed column 11 is "column_12", and so on.
    df = pl.read_excel(xlsx_path, sheet_name="Table 1", has_header=False)
    df = df.slice(2)  # Skip title and header rows

    df = df.select(
        pl.col("column_1").alias("time_period"),
        pl.col("column_2").alias("area_code"),
        # strict=False: non-numeric cells become null instead of raising.
        pl.col("column_12").cast(pl.Float32, strict=False).alias("rent_1bed"),
        pl.col("column_16").cast(pl.Float32, strict=False).alias("rent_2bed"),
        pl.col("column_20").cast(pl.Float32, strict=False).alias("rent_3bed"),
        pl.col("column_24").cast(pl.Float32, strict=False).alias("rent_4plus"),
    )

    # Filter to English local authorities
    df = df.filter(
        pl.any_horizontal(pl.col("area_code").str.starts_with(p) for p in LA_PREFIXES)
    )

    # Use only the latest month
    # NOTE(review): max() uses the ordering of whatever dtype read_excel
    # inferred for this column; if the periods arrive as text such as
    # "Jan-2015" the comparison is lexicographic, not chronological -- confirm
    # against the actual workbook.
    latest = df["time_period"].max()
    if latest is None:
        # max() returns None for an empty or all-null column; comparing against
        # None below would match nothing and produce an empty output file.
        raise ValueError(
            f"No rows with a time period and an area code starting with one of "
            f"{LA_PREFIXES} found in 'Table 1' of {xlsx_path}; "
            "the sheet layout may have changed"
        )
    print(f"Latest month in data: {latest}")
    df = df.filter(pl.col("time_period") == latest)
    print(f"LAs in latest month: {df.height}")

    # Melt to long format: one row per area x bedroom count.
    # PIPR has no Studio category — one-bed rent used as proxy for bedrooms=0.
    bedroom_sources = [
        ("rent_1bed", 0),  # Studio (proxy)
        ("rent_1bed", 1),
        ("rent_2bed", 2),
        ("rent_3bed", 3),
        ("rent_4plus", 4),
    ]
    frames = [
        df.select(
            pl.col("area_code"),
            pl.col(col).alias("mean_monthly_rent"),
            pl.lit(bedrooms).cast(pl.UInt8).alias("bedrooms"),
        )
        for col, bedrooms in bedroom_sources
    ]

    combined = pl.concat(frames)

    print(f"Combined: {combined.shape}")
    print(f"Non-null rents: {combined['mean_monthly_rent'].drop_nulls().len()}")
    print(combined.head(10))

    parquet_path.parent.mkdir(parents=True, exist_ok=True)
    combined.write_parquet(parquet_path, compression="zstd")
    print(f"Saved to {parquet_path}")
|
|
|
|
|
|
def main() -> None:
    """Parse the command line, download the PIPR workbook and convert it.

    The workbook is downloaded into a temporary directory, which is removed
    once the ``with`` block exits; the parquet file is written to the path
    given by the required ``--output`` argument.
    """
    cli = argparse.ArgumentParser(
        description="Download ONS private rent monthly price statistics"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    output_path = cli.parse_args().output

    with tempfile.TemporaryDirectory() as scratch_dir:
        workbook = Path(scratch_dir, "pipr_monthly.xlsx")
        download(URL, workbook, timeout=120)
        convert_to_parquet(workbook, output_path)
|
|
|
|
|
|
# Run the download and conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|