This commit is contained in:
Andras Schmelczer 2026-03-15 17:38:26 +00:00
parent 80c093b7ba
commit f72c43a9fa
101 changed files with 2168 additions and 1177 deletions

View file

@ -13,13 +13,13 @@ import polars as pl
from shapely.geometry import Point
from tqdm import tqdm
from pipeline.download.pois import (
from pipeline.utils.england_geometry import (
ENGLAND_BBOX_EAST,
ENGLAND_BBOX_NORTH,
ENGLAND_BBOX_SOUTH,
ENGLAND_BBOX_WEST,
load_england_polygon,
)
from pipeline.utils.england_geometry import load_england_polygon
PLACE_TYPES = {"city"}

View file

@ -21,6 +21,42 @@ BEDROOM_SHEETS = {
# Code prefixes identifying local authority districts in England
# (see https://en.wikipedia.org/wiki/ONS_coding_system).
LA_PREFIXES = ("E06", "E07", "E08", "E09")

# Old district code -> successor unitary authority code, covering the
# April 2021 and April 2023 local government reorganisations.
# Why this exists: the ONS rental data (Oct 2022 – Sep 2023) is keyed by the
# old codes, whereas IoD 2025 is keyed by the new ones. Old codes are remapped
# to new ones and their medians averaged so that the join in merge.py works.
LA_CONSOLIDATION = {
    # --- April 2021: North Northamptonshire ---
    "E07000150": "E06000061",  # Corby
    "E07000152": "E06000061",  # East Northamptonshire
    "E07000153": "E06000061",  # Kettering
    "E07000156": "E06000061",  # Wellingborough
    # --- April 2021: West Northamptonshire ---
    "E07000151": "E06000062",  # Daventry
    "E07000154": "E06000062",  # Northampton
    "E07000155": "E06000062",  # South Northamptonshire
    # --- April 2023: Cumberland ---
    "E07000026": "E06000063",  # Allerdale
    "E07000028": "E06000063",  # Carlisle
    "E07000029": "E06000063",  # Copeland
    # --- April 2023: Westmorland and Furness ---
    "E07000027": "E06000064",  # Barrow-in-Furness
    "E07000030": "E06000064",  # Eden
    "E07000031": "E06000064",  # South Lakeland
    # --- April 2023: North Yorkshire ---
    "E07000163": "E06000065",  # Craven
    "E07000164": "E06000065",  # Hambleton
    "E07000165": "E06000065",  # Harrogate
    "E07000166": "E06000065",  # Richmondshire
    "E07000167": "E06000065",  # Ryedale
    "E07000168": "E06000065",  # Scarborough
    "E07000169": "E06000065",  # Selby
    # --- April 2023: Somerset ---
    "E07000187": "E06000066",  # Mendip
    "E07000188": "E06000066",  # Sedgemoor
    "E07000189": "E06000066",  # South Somerset
    "E07000246": "E06000066",  # Somerset West and Taunton
}
def _read_sheet(xls_path: Path, sheet_id: int, bedrooms: int) -> pl.DataFrame:
"""Read one bedroom category sheet, extract LA-level median rents."""
@ -61,6 +97,14 @@ def convert_to_parquet(xls_path: Path, parquet_path: Path) -> None:
frames.append(df)
combined = pl.concat(frames)
# Remap old LA codes to new unitary authority codes and average medians
combined = combined.with_columns(
pl.col("area_code").replace(LA_CONSOLIDATION),
).group_by("area_code", "bedrooms").agg(
pl.col("median_monthly_rent").mean(),
)
print(f"Combined: {combined.shape}")
print(f"Non-null medians: {combined['median_monthly_rent'].drop_nulls().len()}")
print(combined.head(10))

View file

@ -1,4 +1,4 @@
"""England boundary polygon for accurate point-in-country filtering.
"""England boundary polygon and bounding box for geographic filtering.
Uses shapely prepared geometry for fast single-point checks (osmium handlers)
and vectorized shapely.contains for batch checks (Polars DataFrames).
@ -12,6 +12,12 @@ import shapely
from shapely.geometry import shape
from shapely.prepared import PreparedGeometry, prep
# Coarse rectangle enclosing England: a cheap first-pass filter applied before
# the precise polygon check. Values are presumably decimal degrees
# (longitude for west/east, latitude for south/north) — TODO confirm.
ENGLAND_BBOX_WEST, ENGLAND_BBOX_EAST = -6.45, 1.77
ENGLAND_BBOX_SOUTH, ENGLAND_BBOX_NORTH = 49.85, 55.82
def load_england_polygon(geojson_path: Path) -> PreparedGeometry:
"""Load England boundary as a prepared shapely polygon for fast contains checks."""