This commit is contained in:
Andras Schmelczer 2026-03-15 17:38:26 +00:00
parent 80c093b7ba
commit f72c43a9fa
101 changed files with 2168 additions and 1177 deletions

View file

@ -21,6 +21,42 @@ BEDROOM_SHEETS = {
# Local authority district codes in England, https://en.wikipedia.org/wiki/ONS_coding_system
# Three-character prefixes identifying the area codes treated as English local authorities.
# NOTE(review): per the linked coding system these presumably denote unitary authorities (E06),
# non-metropolitan districts (E07), metropolitan districts (E08) and London boroughs (E09);
# confirm against the reference before relying on this.
LA_PREFIXES = ("E06", "E07", "E08", "E09")
# April 2021 + April 2023 LA reorganizations: old district codes → new unitary authority codes.
# The ONS rental data (Oct 2022 – Sep 2023) uses the old codes; IoD 2025 uses the new ones.
# We remap old → new and average the medians so the join in merge.py works.
# Lookup of {old district code: new unitary authority code}. The districts are
# listed under the authority that absorbed them, then flattened into a plain dict
# so callers can use it directly as a replacement mapping.
LA_CONSOLIDATION = {
    old_code: new_code
    for new_code, old_codes in {
        # North Northamptonshire (April 2021)
        "E06000061": (
            "E07000150",  # Corby
            "E07000152",  # East Northamptonshire
            "E07000153",  # Kettering
            "E07000156",  # Wellingborough
        ),
        # West Northamptonshire (April 2021)
        "E06000062": (
            "E07000151",  # Daventry
            "E07000154",  # Northampton
            "E07000155",  # South Northamptonshire
        ),
        # Cumberland (April 2023)
        "E06000063": (
            "E07000026",  # Allerdale
            "E07000028",  # Carlisle
            "E07000029",  # Copeland
        ),
        # Westmorland and Furness (April 2023)
        "E06000064": (
            "E07000027",  # Barrow-in-Furness
            "E07000030",  # Eden
            "E07000031",  # South Lakeland
        ),
        # North Yorkshire (April 2023)
        "E06000065": (
            "E07000163",  # Craven
            "E07000164",  # Hambleton
            "E07000165",  # Harrogate
            "E07000166",  # Richmondshire
            "E07000167",  # Ryedale
            "E07000168",  # Scarborough
            "E07000169",  # Selby
        ),
        # Somerset (April 2023)
        "E06000066": (
            "E07000187",  # Mendip
            "E07000188",  # Sedgemoor
            "E07000189",  # South Somerset
            "E07000246",  # Somerset West and Taunton
        ),
    }.items()
    for old_code in old_codes
}
def _read_sheet(xls_path: Path, sheet_id: int, bedrooms: int) -> pl.DataFrame:
"""Read one bedroom category sheet, extract LA-level median rents."""
@ -61,6 +97,14 @@ def convert_to_parquet(xls_path: Path, parquet_path: Path) -> None:
frames.append(df)
combined = pl.concat(frames)
# Remap old LA codes to new unitary authority codes and average medians
combined = combined.with_columns(
pl.col("area_code").replace(LA_CONSOLIDATION),
).group_by("area_code", "bedrooms").agg(
pl.col("median_monthly_rent").mean(),
)
print(f"Combined: {combined.shape}")
print(f"Non-null medians: {combined['median_monthly_rent'].drop_nulls().len()}")
print(combined.head(10))