"""Download ONS private rental market statistics and convert them to parquet.

Reads the local-authority-level sheets (one per bedroom category) from the
ONS workbook, keeps the median monthly rent per area, remaps pre-reorganization
local authority codes to their successor unitary authority codes, and writes a
single parquet file with columns ``area_code``, ``bedrooms`` and
``median_monthly_rent``.
"""

import argparse
import tempfile
from pathlib import Path

import polars as pl

from pipeline.utils import download

URL = "https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/housing/datasets/privaterentalmarketsummarystatisticsinengland/october2022toseptember2023/privaterentalmarketstatistics231220.xls"

# Sheets 12-16 are LA-level breakdowns: Studio, 1 Bed, 2 Bed, 3 Bed, 4+ Bed
# (Sheet 11 is "Room" — shared house rooms, not self-contained, so skip it)
# Maps sheet id -> bedroom count written to the output (0 = studio, 4 = 4+).
BEDROOM_SHEETS = {
    12: 0,  # Studio
    13: 1,  # One Bedroom
    14: 2,  # Two Bedrooms
    15: 3,  # Three Bedrooms
    16: 4,  # Four or more Bedrooms
}

# Local authority district codes in England, https://en.wikipedia.org/wiki/ONS_coding_system
LA_PREFIXES = ("E06", "E07", "E08", "E09")

# April 2021 + April 2023 LA reorganizations: old district codes → new unitary authority codes.
# The ONS rental data (Oct 2022 – Sep 2023) uses the old codes; IoD 2025 uses the new ones.
# We remap old → new and average the medians so the join in merge.py works.
LA_CONSOLIDATION = {
    # North Northamptonshire (April 2021)
    "E07000150": "E06000061",  # Corby
    "E07000152": "E06000061",  # East Northamptonshire
    "E07000153": "E06000061",  # Kettering
    "E07000156": "E06000061",  # Wellingborough
    # West Northamptonshire (April 2021)
    "E07000151": "E06000062",  # Daventry
    "E07000154": "E06000062",  # Northampton
    "E07000155": "E06000062",  # South Northamptonshire
    # Cumberland (April 2023)
    "E07000026": "E06000063",  # Allerdale
    "E07000028": "E06000063",  # Carlisle
    "E07000029": "E06000063",  # Copeland
    # Westmorland and Furness (April 2023)
    "E07000027": "E06000064",  # Barrow-in-Furness
    "E07000030": "E06000064",  # Eden
    "E07000031": "E06000064",  # South Lakeland
    # North Yorkshire (April 2023)
    "E07000163": "E06000065",  # Craven
    "E07000164": "E06000065",  # Hambleton
    "E07000165": "E06000065",  # Harrogate
    "E07000166": "E06000065",  # Richmondshire
    "E07000167": "E06000065",  # Ryedale
    "E07000168": "E06000065",  # Scarborough
    "E07000169": "E06000065",  # Selby
    # Somerset (April 2023)
    "E07000187": "E06000066",  # Mendip
    "E07000188": "E06000066",  # Sedgemoor
    "E07000189": "E06000066",  # South Somerset
    "E07000246": "E06000066",  # Somerset West and Taunton
}


def _read_sheet(xls_path: Path, sheet_id: int, bedrooms: int) -> pl.DataFrame:
    """Read one bedroom category sheet, extract LA-level median rents.

    Args:
        xls_path: Path to the downloaded ONS workbook.
        sheet_id: Sheet id passed to ``pl.read_excel``.
        bedrooms: Bedroom count to tag every row of this sheet with.

    Returns:
        A frame with columns ``area_code`` (only codes starting with one of
        ``LA_PREFIXES``), ``median_monthly_rent`` (Float32, null where the
        source value could not be cast) and ``bedrooms`` (UInt8).
    """
    df = pl.read_excel(xls_path, sheet_id=sheet_id)

    # Columns are unnamed; positional:
    # 0=LA Code, 1=Area Code, 2=Area Name, 3=Count, 4=Mean, 5=LQ, 6=Median, 7=UQ
    # First 4 rows are headers (title, notes, bedroom label, column headers)
    df = df.slice(4)

    area_code_col = df.columns[1]
    median_col = df.columns[6]

    # One boolean expression per prefix; a row is kept if any of them matches.
    is_la_code = pl.any_horizontal(
        [pl.col("area_code").str.starts_with(prefix) for prefix in LA_PREFIXES]
    )

    return (
        df.select(
            pl.col(area_code_col).alias("area_code"),
            pl.col(median_col).alias("median_monthly_rent"),
        )
        .filter(pl.col("area_code").is_not_null() & is_la_code)
        .with_columns(
            # Suppressed values are ".." — cast will turn them to null
            pl.col("median_monthly_rent").cast(pl.Float32, strict=False),
            pl.lit(bedrooms).cast(pl.UInt8).alias("bedrooms"),
        )
    )


def convert_to_parquet(xls_path: Path, parquet_path: Path) -> None:
    """Combine all bedroom sheets of the workbook into one parquet file.

    Args:
        xls_path: Path to the downloaded ONS workbook.
        parquet_path: Destination of the zstd-compressed parquet file.
    """
    frames = []
    for sheet_id, bedrooms in BEDROOM_SHEETS.items():
        df = _read_sheet(xls_path, sheet_id, bedrooms)
        print(f" Sheet {sheet_id} (bedrooms={bedrooms}): {df.height} rows")
        frames.append(df)

    # Remap old LA codes to new unitary authority codes and average medians.
    # NOTE: this is an unweighted mean of the predecessor districts' medians,
    # i.e. an approximation of the merged authority's true median.
    combined = (
        pl.concat(frames)
        .with_columns(pl.col("area_code").replace(LA_CONSOLIDATION))
        .group_by("area_code", "bedrooms")
        .agg(pl.col("median_monthly_rent").mean())
        # group_by does not guarantee row order; sort so the printed preview
        # and the written file are reproducible between runs.
        .sort("area_code", "bedrooms")
    )

    print(f"Combined: {combined.shape}")
    print(f"Non-null medians: {combined['median_monthly_rent'].drop_nulls().len()}")
    print(combined.head(10))

    combined.write_parquet(parquet_path, compression="zstd")
    print(f"Saved to {parquet_path}")


def main() -> None:
    """Parse CLI arguments, download the workbook and write the parquet file."""
    parser = argparse.ArgumentParser(
        description="Download and convert ONS private rental market statistics"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    args = parser.parse_args()

    # Create the destination directory up front so a missing directory does
    # not surface only after the download has already completed.
    args.output.parent.mkdir(parents=True, exist_ok=True)

    # The workbook is only needed for the conversion, so keep it in a
    # temporary directory that is removed when the block exits.
    with tempfile.TemporaryDirectory() as cache_dir:
        xls_path = Path(cache_dir) / "rental_prices.xls"
        download(URL, xls_path, timeout=60)
        convert_to_parquet(xls_path, args.output)


if __name__ == "__main__":
    main()