125 lines
4.5 KiB
Python
125 lines
4.5 KiB
Python
"""Download ONS Price Index of Private Rents (PIPR) monthly price statistics.
|
|
|
|
Provides mean monthly private rent by local authority and bedroom count.
|
|
Replaces the discontinued Private Rental Market Summary Statistics.
|
|
|
|
Source: https://www.ons.gov.uk/economy/inflationandpriceindices/datasets/priceindexofprivaterentsukmonthlypricestatistics
|
|
License: Open Government Licence v3.0
|
|
"""
|
|
|
|
import argparse
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import polars as pl
|
|
|
|
from pipeline.local_temp import local_tmp_dir
|
|
from pipeline.utils import download
|
|
|
|
# Direct download link for the PIPR monthly price statistics workbook.
# The path embeds a dated edition folder, so it points at one specific release.
URL: str = (
    "https://www.ons.gov.uk/file?uri="
    "/economy/inflationandpriceindices/datasets/"
    "priceindexofprivaterentsukmonthlypricestatistics/25march2026/"
    "priceindexofprivaterentsukmonthlypricestatistics.xlsx"
)

# Area-code prefixes that identify local authority districts in England.
LA_PREFIXES: tuple[str, ...] = ("E06", "E07", "E08", "E09")

# Maps the code used by PIPR (key) to the predecessor code still used by
# IoD 2025 (value). PIPR already carries the newer codes from the 2026 South
# Yorkshire boundary/code update; rows are duplicated under the IoD codes so
# downstream joins are complete without inventing any rent values.
AREA_CODE_ALIASES: dict[str, str] = {
    "E08000038": "E08000016",  # Barnsley
    "E08000039": "E08000019",  # Sheffield
}
|
|
|
|
|
|
def _latest_rents_long(df: pl.DataFrame) -> pl.DataFrame:
    """Reshape raw PIPR "Table 1" rows into long-format rents for the latest month.

    Args:
        df: Sheet contents read without a header row, so columns are addressed
            by the positional names ``column_1``, ``column_2``, ... Row 0 is
            the table title, row 1 the column headers, and data starts at
            row 2.

    Returns:
        A frame with columns ``area_code``, ``area_name``,
        ``mean_monthly_rent`` (Float32; null where the source cell cannot be
        cast to a number) and ``bedrooms`` (UInt8, 0-4, where 4 stands for
        four or more). It is restricted to English local authorities in the
        latest time period and holds at most one row per
        (area_code, bedrooms) pair, in a stable row order.
    """
    # Table 1 layout: 40 columns in repeating blocks of 4 (index, monthly
    # change, annual change, rental price) for each category. Rental price
    # columns (0-indexed): 7 = All categories, 11 = One bed, 15 = Two bed,
    # 19 = Three bed, 23 = Four or more bed. The positional column names are
    # 1-based, hence column_12 / 16 / 20 / 24 below.
    rents = df.slice(2).select(  # slice(2) skips the title and header rows
        pl.col("column_1").alias("time_period"),
        pl.col("column_2").alias("area_code"),
        pl.col("column_3").alias("area_name"),
        # strict=False turns cells that fail the cast into nulls instead of
        # raising.
        pl.col("column_12").cast(pl.Float32, strict=False).alias("rent_1bed"),
        pl.col("column_16").cast(pl.Float32, strict=False).alias("rent_2bed"),
        pl.col("column_20").cast(pl.Float32, strict=False).alias("rent_3bed"),
        pl.col("column_24").cast(pl.Float32, strict=False).alias("rent_4plus"),
    )

    # Keep only English local authorities.
    is_english_la = pl.any_horizontal(
        [pl.col("area_code").str.starts_with(prefix) for prefix in LA_PREFIXES]
    )
    rents = rents.filter(is_english_la)

    # Use only the latest month.
    # NOTE(review): max() is taken on the column as read from the sheet; if it
    # is text this is a lexicographic maximum, which only matches the newest
    # month for sortable (ISO-like) period strings -- confirm against the file.
    latest = rents["time_period"].max()
    print(f"Latest month in data: {latest}")
    rents = rents.filter(pl.col("time_period") == latest)
    print(f"LAs in latest month: {rents.height}")

    # Long format: one row per area x bedroom count.
    # PIPR has no Studio category, so one-bed rent is used as the proxy for
    # bedrooms=0.
    bedroom_sources = (
        ("rent_1bed", 0),  # Studio (proxy)
        ("rent_1bed", 1),
        ("rent_2bed", 2),
        ("rent_3bed", 3),
        ("rent_4plus", 4),
    )
    combined = pl.concat(
        [
            rents.select(
                pl.col("area_code"),
                pl.col("area_name"),
                pl.col(rent_column).alias("mean_monthly_rent"),
                pl.lit(bedrooms).cast(pl.UInt8).alias("bedrooms"),
            )
            for rent_column, bedrooms in bedroom_sources
        ]
    )

    # Duplicate rows published under a newer code so they also appear under
    # the alias code; the rent values themselves are copied unchanged.
    alias_rows = [
        combined.filter(pl.col("area_code") == source_code).with_columns(
            pl.lit(alias_code).alias("area_code")
        )
        for source_code, alias_code in AREA_CODE_ALIASES.items()
    ]
    combined = pl.concat([combined, *alias_rows])

    # Alias rows sit after the originals, so keep="first" prefers a row the
    # source already publishes under the alias code. maintain_order=True makes
    # the output row order stable from run to run.
    return combined.unique(
        ["area_code", "bedrooms"], keep="first", maintain_order=True
    )
|
|
|
|
|
|
def convert_to_parquet(xlsx_path: Path, parquet_path: Path) -> None:
    """Convert the PIPR workbook at ``xlsx_path`` into a parquet file.

    Reads the "Table 1" sheet without a header row, reshapes it with
    ``_latest_rents_long``, prints a short summary of the result, and writes
    it zstd-compressed to ``parquet_path``, creating the parent directory
    if it does not exist yet.
    """
    print("Reading PIPR Excel file (Table 1)...")
    sheet = pl.read_excel(xlsx_path, sheet_name="Table 1", has_header=False)
    rents = _latest_rents_long(sheet)

    # Quick sanity summary on stdout before anything is written.
    print(f"Combined: {rents.shape}")
    non_null_rents = rents["mean_monthly_rent"].drop_nulls().len()
    print(f"Non-null rents: {non_null_rents}")
    print(rents.head(10))

    target_dir = parquet_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    rents.write_parquet(parquet_path, compression="zstd")
    print(f"Saved to {parquet_path}")
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: fetch the PIPR workbook and write parquet.

    Requires ``--output`` (path of the parquet file to write). The workbook
    is stored in a temporary directory created under ``local_tmp_dir()``,
    which is removed once the conversion has finished.
    """
    cli = argparse.ArgumentParser(
        description="Download ONS private rent monthly price statistics"
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet file path",
    )
    output_path = cli.parse_args().output

    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as scratch_dir:
        workbook_path = Path(scratch_dir) / "pipr_monthly.xlsx"
        # Project helper; presumably saves URL to workbook_path -- see
        # pipeline.utils for its exact contract.
        download(URL, workbook_path, timeout=120)
        convert_to_parquet(workbook_path, output_path)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|