perfect-postcode/pipeline/download/rental_prices.py
2026-03-15 21:22:28 +00:00

136 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import argparse
import tempfile
import polars as pl
from pathlib import Path
from pipeline.utils import download
# ONS private rental market summary statistics workbook (legacy .xls),
# October 2022 to September 2023 edition. The address is assembled from
# adjacent string literals purely to keep the lines readable.
URL = (
    "https://www.ons.gov.uk/file?uri="
    "/peoplepopulationandcommunity/housing/datasets"
    "/privaterentalmarketsummarystatisticsinengland"
    "/october2022toseptember2023"
    "/privaterentalmarketstatistics231220.xls"
)
# Workbook sheet number -> bedroom count for the LA-level breakdown sheets.
# Sheet 11 ("Room") is deliberately left out: it covers rooms in shared
# houses, which are not self-contained dwellings.
BEDROOM_SHEETS = {
    12: 0,  # studio
    13: 1,  # one bedroom
    14: 2,  # two bedrooms
    15: 3,  # three bedrooms
    16: 4,  # four or more bedrooms
}
# Area-code prefixes that identify local authority districts in England.
# See https://en.wikipedia.org/wiki/ONS_coding_system
LA_PREFIXES = (
    "E06",
    "E07",
    "E08",
    "E09",
)
# Local authority reorganisations of April 2021 and April 2023.
# The ONS rental data (Oct 2022 to Sep 2023) still reports the old district
# codes, whereas IoD 2025 uses the new unitary authority codes. Old codes are
# remapped to new ones (and their medians averaged) so the join in merge.py
# works.
#
# Listed as new unitary authority code -> the district codes it replaced.
_SUCCESSOR_TO_PREDECESSORS = {
    # North Northamptonshire (April 2021)
    "E06000061": (
        "E07000150",  # Corby
        "E07000152",  # East Northamptonshire
        "E07000153",  # Kettering
        "E07000156",  # Wellingborough
    ),
    # West Northamptonshire (April 2021)
    "E06000062": (
        "E07000151",  # Daventry
        "E07000154",  # Northampton
        "E07000155",  # South Northamptonshire
    ),
    # Cumberland (April 2023)
    "E06000063": (
        "E07000026",  # Allerdale
        "E07000028",  # Carlisle
        "E07000029",  # Copeland
    ),
    # Westmorland and Furness (April 2023)
    "E06000064": (
        "E07000027",  # Barrow-in-Furness
        "E07000030",  # Eden
        "E07000031",  # South Lakeland
    ),
    # North Yorkshire (April 2023)
    "E06000065": (
        "E07000163",  # Craven
        "E07000164",  # Hambleton
        "E07000165",  # Harrogate
        "E07000166",  # Richmondshire
        "E07000167",  # Ryedale
        "E07000168",  # Scarborough
        "E07000169",  # Selby
    ),
    # Somerset (April 2023)
    "E06000066": (
        "E07000187",  # Mendip
        "E07000188",  # Sedgemoor
        "E07000189",  # South Somerset
        "E07000246",  # Somerset West and Taunton
    ),
}

# Flat lookup used for the remap: old district code -> new unitary code.
LA_CONSOLIDATION = {
    old_code: new_code
    for new_code, old_codes in _SUCCESSOR_TO_PREDECESSORS.items()
    for old_code in old_codes
}
def _read_sheet(xls_path: Path, sheet_id: int, bedrooms: int) -> pl.DataFrame:
    """Extract LA-level median rents from one bedroom-category sheet.

    Args:
        xls_path: Path of the workbook to read.
        sheet_id: Sheet number handed to ``pl.read_excel``.
        bedrooms: Bedroom count stamped on every returned row.

    Returns:
        A frame with columns ``area_code``, ``median_monthly_rent`` (Float32)
        and ``bedrooms`` (UInt8), limited to rows whose area code starts with
        one of ``LA_PREFIXES``.
    """
    raw = pl.read_excel(xls_path, sheet_id=sheet_id)

    # The sheet has no usable column names, so columns are taken by position:
    #   0=LA Code, 1=Area Code, 2=Area Name, 3=Count, 4=Mean, 5=LQ,
    #   6=Median, 7=UQ
    # The first 4 rows are headers (title, notes, bedroom label, column
    # headers) and are dropped.
    body = raw.slice(4)
    code_name, median_name = body.columns[1], body.columns[6]

    picked = body.select(
        pl.col(code_name).alias("area_code"),
        pl.col(median_name).alias("median_monthly_rent"),
    )

    # Keep a row only when its code is present and carries an LA prefix.
    has_la_prefix = pl.any_horizontal(
        [pl.col("area_code").str.starts_with(prefix) for prefix in LA_PREFIXES]
    )
    la_rows = picked.filter(pl.col("area_code").is_not_null() & has_la_prefix)

    # Suppressed values appear as ".."; the non-strict cast turns them into
    # nulls instead of raising.
    rent_as_float = pl.col("median_monthly_rent").cast(pl.Float32, strict=False)
    bedroom_tag = pl.lit(bedrooms).cast(pl.UInt8).alias("bedrooms")
    return la_rows.with_columns(rent_as_float, bedroom_tag)
def convert_to_parquet(xls_path: Path, parquet_path: Path) -> None:
    """Build the per-LA, per-bedroom median rent table and write it as parquet.

    Every sheet listed in ``BEDROOM_SHEETS`` is read with ``_read_sheet`` and
    the results are stacked. Area codes present in ``LA_CONSOLIDATION`` are
    replaced by their new code, and rows that end up sharing an
    ``(area_code, bedrooms)`` pair are collapsed to the plain (unweighted)
    mean of their medians. Progress and a short preview go to stdout.

    Args:
        xls_path: Path of the downloaded workbook.
        parquet_path: Destination file, written with zstd compression.
    """
    frames = []
    for sheet_id, bedrooms in BEDROOM_SHEETS.items():
        df = _read_sheet(xls_path, sheet_id, bedrooms)
        print(f" Sheet {sheet_id} (bedrooms={bedrooms}): {df.height} rows")
        frames.append(df)
    combined = pl.concat(frames)

    # Remap old LA codes to new unitary authority codes and average medians.
    combined = (
        combined.with_columns(
            pl.col("area_code").replace(LA_CONSOLIDATION),
        )
        .group_by("area_code", "bedrooms")
        .agg(
            pl.col("median_monthly_rent").mean(),
        )
        # group_by gives no ordering guarantee; sort on the key so the
        # written file and the preview below are identical from run to run.
        .sort("area_code", "bedrooms")
    )

    print(f"Combined: {combined.shape}")
    print(f"Non-null medians: {combined['median_monthly_rent'].drop_nulls().len()}")
    print(combined.head(10))
    combined.write_parquet(parquet_path, compression="zstd")
    print(f"Saved to {parquet_path}")
def main() -> None:
    """Command-line entry point: download the workbook and convert it.

    Takes one required option, ``--output``, the parquet file to produce.
    The workbook is downloaded into a temporary directory that is removed
    once conversion has finished.
    """
    cli = argparse.ArgumentParser(
        description="Download and convert ONS private rental market statistics"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    output_path = cli.parse_args().output

    with tempfile.TemporaryDirectory() as scratch_dir:
        workbook = Path(scratch_dir) / "rental_prices.xls"
        download(URL, workbook, timeout=60)
        # Conversion must happen inside the `with`: the workbook disappears
        # together with the temporary directory on exit.
        convert_to_parquet(workbook, output_path)


if __name__ == "__main__":
    main()