"""Download ONS private rental market statistics and convert them to parquet.

The script fetches the ONS workbook at ``URL``, reads the per-bedroom-category
sheets listed in ``BEDROOM_SHEETS``, keeps the rows whose area code starts with
one of ``LA_PREFIXES``, and writes the combined median rents to a single
zstd-compressed parquet file.
"""

import argparse
import tempfile
from pathlib import Path

import polars as pl

from pipeline.utils import download

URL = "https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/housing/datasets/privaterentalmarketsummarystatisticsinengland/october2022toseptember2023/privaterentalmarketstatistics231220.xls"

# Sheets 12-16 are LA-level breakdowns: Studio, 1 Bed, 2 Bed, 3 Bed, 4+ Bed
# (Sheet 11 is "Room" — shared house rooms, not self-contained, so skip it)
BEDROOM_SHEETS = {
    12: 0,  # Studio
    13: 1,  # One Bedroom
    14: 2,  # Two Bedrooms
    15: 3,  # Three Bedrooms
    16: 4,  # Four or more Bedrooms
}

# Local authority district codes in England, https://en.wikipedia.org/wiki/ONS_coding_system
LA_PREFIXES = ("E06", "E07", "E08", "E09")


def _read_sheet(xls_path: Path, sheet_id: int, bedrooms: int) -> pl.DataFrame:
    """Read one bedroom category sheet, extract LA-level median rents.

    Args:
        xls_path: Path to the downloaded workbook.
        sheet_id: Sheet identifier passed to ``pl.read_excel``.
        bedrooms: Bedroom count to record against every row of this sheet.

    Returns:
        A frame with columns ``area_code``, ``median_monthly_rent`` (Float32,
        null where the value could not be parsed as a number) and ``bedrooms``
        (UInt8), restricted to area codes starting with one of ``LA_PREFIXES``.
    """
    df = pl.read_excel(xls_path, sheet_id=sheet_id)

    # Columns are unnamed; positional:
    # 0=LA Code, 1=Area Code, 2=Area Name, 3=Count, 4=Mean, 5=LQ, 6=Median, 7=UQ
    # First 4 rows are headers (title, notes, bedroom label, column headers)
    df = df.slice(4)

    area_code_col = df.columns[1]
    median_col = df.columns[6]

    is_local_authority = pl.any_horizontal(
        [pl.col("area_code").str.starts_with(prefix) for prefix in LA_PREFIXES]
    )

    return (
        df.select(
            pl.col(area_code_col).alias("area_code"),
            pl.col(median_col).alias("median_monthly_rent"),
        )
        .filter(pl.col("area_code").is_not_null() & is_local_authority)
        .with_columns(
            # Suppressed values are ".." — cast will turn them to null
            pl.col("median_monthly_rent").cast(pl.Float32, strict=False),
            pl.lit(bedrooms).cast(pl.UInt8).alias("bedrooms"),
        )
    )


def convert_to_parquet(xls_path: Path, parquet_path: Path) -> None:
    """Combine all bedroom sheets of the workbook into one parquet file.

    Reads every sheet listed in ``BEDROOM_SHEETS``, concatenates the results,
    prints a short summary, and writes the combined frame to ``parquet_path``
    with zstd compression.

    Args:
        xls_path: Path to the downloaded workbook.
        parquet_path: Destination parquet file; missing parent directories
            are created.
    """
    frames = []
    for sheet_id, bedrooms in BEDROOM_SHEETS.items():
        df = _read_sheet(xls_path, sheet_id, bedrooms)
        print(f" Sheet {sheet_id} (bedrooms={bedrooms}): {df.height} rows")
        frames.append(df)

    combined = pl.concat(frames)

    print(f"Combined: {combined.shape}")
    print(f"Non-null medians: {combined['median_monthly_rent'].drop_nulls().len()}")
    print(combined.head(10))

    # Make sure the destination directory exists so the write does not fail
    # after all the download/parsing work has already been done.
    Path(parquet_path).parent.mkdir(parents=True, exist_ok=True)

    combined.write_parquet(parquet_path, compression="zstd")
    print(f"Saved to {parquet_path}")


def main() -> None:
    """Parse command-line arguments, download the workbook, and convert it.

    The workbook is downloaded into a temporary directory that is removed
    when the conversion finishes.
    """
    parser = argparse.ArgumentParser(
        description="Download and convert ONS private rental market statistics"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    args = parser.parse_args()

    with tempfile.TemporaryDirectory() as cache_dir:
        xls_path = Path(cache_dir) / "rental_prices.xls"
        download(URL, xls_path, timeout=60)
        convert_to_parquet(xls_path, args.output)


if __name__ == "__main__":
    main()