132 lines
4.7 KiB
Python
132 lines
4.7 KiB
Python
import argparse
|
||
import tempfile
|
||
|
||
import polars as pl
|
||
from pathlib import Path
|
||
|
||
from pipeline.utils import download
|
||
|
||
# ONS private rental market summary statistics workbook (legacy .xls),
# October 2022 to September 2023 edition.
URL = "https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/housing/datasets/privaterentalmarketsummarystatisticsinengland/october2022toseptember2023/privaterentalmarketstatistics231220.xls"

# Workbook sheet id -> bedroom count for the LA-level breakdowns.
# Sheets 12-16 are, in order: Studio (0), One Bedroom (1), Two Bedrooms (2),
# Three Bedrooms (3), Four or more Bedrooms (4).
# Sheet 11 ("Room") covers rooms in shared houses, which are not
# self-contained, so it is deliberately left out.
BEDROOM_SHEETS = {
    sheet_id: bedrooms
    for bedrooms, sheet_id in enumerate(range(12, 17))
}

# Area codes of local authority districts in England start with one of these
# prefixes, see https://en.wikipedia.org/wiki/ONS_coding_system
LA_PREFIXES = ("E06", "E07", "E08", "E09")
|
||
|
||
# Old district code -> new unitary authority code for the April 2021 and
# April 2023 local authority reorganisations.
# The ONS rental data (Oct 2022 - Sep 2023) is keyed by the old codes while
# IoD 2025 is keyed by the new ones, so old codes are remapped (and their
# medians averaged) to make the join in merge.py work.
# Written as (new code, old codes) groups and flattened into a plain dict.
LA_CONSOLIDATION = {
    old_code: new_code
    for new_code, old_codes in (
        # North Northamptonshire (April 2021)
        (
            "E06000061",
            (
                "E07000150",  # Corby
                "E07000152",  # East Northamptonshire
                "E07000153",  # Kettering
                "E07000156",  # Wellingborough
            ),
        ),
        # West Northamptonshire (April 2021)
        (
            "E06000062",
            (
                "E07000151",  # Daventry
                "E07000154",  # Northampton
                "E07000155",  # South Northamptonshire
            ),
        ),
        # Cumberland (April 2023)
        (
            "E06000063",
            (
                "E07000026",  # Allerdale
                "E07000028",  # Carlisle
                "E07000029",  # Copeland
            ),
        ),
        # Westmorland and Furness (April 2023)
        (
            "E06000064",
            (
                "E07000027",  # Barrow-in-Furness
                "E07000030",  # Eden
                "E07000031",  # South Lakeland
            ),
        ),
        # North Yorkshire (April 2023)
        (
            "E06000065",
            (
                "E07000163",  # Craven
                "E07000164",  # Hambleton
                "E07000165",  # Harrogate
                "E07000166",  # Richmondshire
                "E07000167",  # Ryedale
                "E07000168",  # Scarborough
                "E07000169",  # Selby
            ),
        ),
        # Somerset (April 2023)
        (
            "E06000066",
            (
                "E07000187",  # Mendip
                "E07000188",  # Sedgemoor
                "E07000189",  # South Somerset
                "E07000246",  # Somerset West and Taunton
            ),
        ),
    )
    for old_code in old_codes
}
|
||
|
||
|
||
def _read_sheet(xls_path: Path, sheet_id: int, bedrooms: int) -> pl.DataFrame:
    """Load one bedroom-category sheet and keep its LA-level median rents.

    Args:
        xls_path: Path to the ONS workbook.
        sheet_id: Sheet number handed to ``pl.read_excel``.
        bedrooms: Bedroom count stamped on every returned row.

    Returns:
        A frame with ``area_code``, ``median_monthly_rent`` (Float32, null
        where the value could not be cast) and ``bedrooms`` (UInt8), limited
        to rows whose area code starts with one of ``LA_PREFIXES``.
    """
    # The sheet's columns are unnamed, so they are addressed by position:
    # 0=LA Code, 1=Area Code, 2=Area Name, 3=Count, 4=Mean, 5=LQ, 6=Median, 7=UQ
    # The leading 4 rows (title, notes, bedroom label, column headers) are
    # header material and are dropped.
    body = pl.read_excel(xls_path, sheet_id=sheet_id).slice(4)

    code_source = body.columns[1]
    median_source = body.columns[6]

    area_code = pl.col("area_code")
    has_la_prefix = pl.any_horizontal(
        [area_code.str.starts_with(prefix) for prefix in LA_PREFIXES]
    )

    picked = body.select(
        pl.col(code_source).alias("area_code"),
        pl.col(median_source).alias("median_monthly_rent"),
    )
    la_rows = picked.filter(area_code.is_not_null() & has_la_prefix)

    return la_rows.with_columns(
        # Suppressed values appear as ".."; the non-strict cast turns them
        # into nulls instead of raising.
        pl.col("median_monthly_rent").cast(pl.Float32, strict=False),
        pl.lit(bedrooms).cast(pl.UInt8).alias("bedrooms"),
    )
|
||
|
||
|
||
def convert_to_parquet(xls_path: Path, parquet_path: Path) -> None:
    """Build the per-LA, per-bedroom median rent table and write it as parquet.

    Every sheet listed in ``BEDROOM_SHEETS`` is read with ``_read_sheet`` and
    the results are stacked. Area codes found in ``LA_CONSOLIDATION`` are
    replaced by their new code, and the medians of rows that end up sharing
    an ``(area_code, bedrooms)`` pair are averaged. Progress and a short
    preview are printed to stdout.

    Args:
        xls_path: Path to the ONS workbook.
        parquet_path: Destination of the zstd-compressed parquet file.
    """
    frames = []
    for sheet_id, bedrooms in BEDROOM_SHEETS.items():
        df = _read_sheet(xls_path, sheet_id, bedrooms)
        print(f" Sheet {sheet_id} (bedrooms={bedrooms}): {df.height} rows")
        frames.append(df)

    # Remap old LA codes to new unitary authority codes and average medians
    combined = (
        pl.concat(frames)
        .with_columns(pl.col("area_code").replace(LA_CONSOLIDATION))
        .group_by("area_code", "bedrooms")
        .agg(pl.col("median_monthly_rent").mean())
        # group_by yields groups in arbitrary order; sorting makes the
        # written file and the preview below identical from run to run.
        .sort("area_code", "bedrooms")
    )

    print(f"Combined: {combined.shape}")
    print(f"Non-null medians: {combined['median_monthly_rent'].drop_nulls().len()}")
    print(combined.head(10))

    combined.write_parquet(parquet_path, compression="zstd")
    print(f"Saved to {parquet_path}")
|
||
|
||
|
||
def main() -> None:
    """Command-line entry point.

    Parses the required ``--output`` path, passes ``URL`` and a workbook path
    inside a temporary directory to ``download``, then converts that workbook
    to parquet at the requested location. The temporary directory is removed
    when the ``with`` block exits.
    """
    cli = argparse.ArgumentParser(
        description="Download and convert ONS private rental market statistics"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    output_path = cli.parse_args().output

    with tempfile.TemporaryDirectory() as scratch_dir:
        workbook = Path(scratch_dir) / "rental_prices.xls"
        download(URL, workbook, timeout=60)
        convert_to_parquet(workbook, output_path)


if __name__ == "__main__":
    main()
|