Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@ -109,6 +109,27 @@ def _clean_number(column: str, dtype: pl.DataType) -> pl.Expr:
return _clean_string(column).cast(dtype, strict=False)
def _join_address_parts(*columns: str) -> pl.Expr:
    """Build one single-spaced display address from several address columns.

    Price-paid SAON/PAON/STREET are EMPTY STRINGS (not null) when absent —
    saon is "" on ~88% of rows — and ``concat_str(..., ignore_nulls=True)``
    skips only nulls, so empty components used to contribute their separator
    (``' 10 PALACE GREEN'``, doubled spaces when a middle part was empty).

    Every component goes through ``_clean_string`` first (presumably mapping
    ``''`` -> null so that ``ignore_nulls`` can drop it — confirm against that
    helper). Whatever whitespace runs remain are then collapsed to one space
    and the result is stripped. A fully-empty address becomes null (dropped by
    the downstream ``pp_address.is_not_null()`` filter) instead of whitespace
    junk.

    Args:
        *columns: Names of the address component columns, in display order.

    Returns:
        A string expression holding the joined address, or null when nothing
        is left after cleaning.
    """
    parts = [_clean_string(name) for name in columns]
    address = pl.concat_str(parts, separator=" ", ignore_nulls=True)
    # Defensive: squeeze any whitespace run to one space, then trim both ends.
    address = address.str.replace_all(r"\s+", " ").str.strip_chars()
    # Only a non-empty address is kept; "" (and null) both end up as null.
    return pl.when(address != "").then(address).otherwise(None)
def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
return (
raw.select(
@ -436,11 +457,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
)
.filter(pl.col("pp_property_type") != "Other")
.with_columns(
pl.concat_str(
[pl.col("saon"), pl.col("paon"), pl.col("street")],
separator=" ",
ignore_nulls=True,
).alias("pp_address"),
_join_address_parts("saon", "paon", "street").alias("pp_address"),
)
.with_columns(
normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),

View file

@ -102,15 +102,11 @@ _AREA_COLUMNS = [
# is postcode-grain: it belongs in the area output (one value per postcode,
# covering property-less postcodes too) rather than duplicated per property.
TREE_DENSITY_FEATURE,
# Schools
"Good+ primary schools within 5km",
"Good+ secondary schools within 5km",
"Good+ primary schools within 2km",
"Good+ secondary schools within 2km",
"Outstanding primary schools within 5km",
"Outstanding secondary schools within 5km",
"Outstanding primary schools within 2km",
"Outstanding secondary schools within 2km",
# Schools (modelled historical catchment areas covering the postcode)
"Good+ primary school catchments",
"Good+ secondary school catchments",
"Outstanding primary school catchments",
"Outstanding secondary school catchments",
# Demographics
"Median age",
# Politics
@ -172,14 +168,10 @@ _FINAL_RENAME_COLUMNS = {
"latest_price": "Last known price",
"number_habitable_rooms": "Number of bedrooms & living rooms",
"noise_lden_db": "Noise (dB)",
"good_primary_5km": "Good+ primary schools within 5km",
"good_secondary_5km": "Good+ secondary schools within 5km",
"good_primary_2km": "Good+ primary schools within 2km",
"good_secondary_2km": "Good+ secondary schools within 2km",
"outstanding_primary_5km": "Outstanding primary schools within 5km",
"outstanding_secondary_5km": "Outstanding secondary schools within 5km",
"outstanding_primary_2km": "Outstanding primary schools within 2km",
"outstanding_secondary_2km": "Outstanding secondary schools within 2km",
"good_primary_catchments": "Good+ primary school catchments",
"good_secondary_catchments": "Good+ secondary school catchments",
"outstanding_primary_catchments": "Outstanding primary school catchments",
"outstanding_secondary_catchments": "Outstanding secondary school catchments",
"max_download_speed": "Max available download speed (Mbps)",
"serious_crime_avg_yr": "Serious crime (avg/yr)",
"minor_crime_avg_yr": "Minor crime (avg/yr)",
@ -874,7 +866,7 @@ def _join_area_side_tables(
election: pl.LazyFrame,
poi_counts: pl.LazyFrame,
noise: pl.LazyFrame,
school_proximity: pl.LazyFrame,
school_catchments: pl.LazyFrame,
conservation_areas: pl.LazyFrame,
tree_density: pl.LazyFrame | None,
broadband: pl.LazyFrame,
@ -905,7 +897,7 @@ def _join_area_side_tables(
base = base.join(election, on="pcon", how="left")
base = base.join(poi_counts, on="postcode", how="left")
base = base.join(noise, on="postcode", how="left")
base = base.join(school_proximity, on="postcode", how="left")
base = base.join(school_catchments, on="postcode", how="left")
base = base.join(conservation_areas, on="postcode", how="left").with_columns(
pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
)
@ -1970,7 +1962,7 @@ def _build(
ethnicity_path: Path,
crime_path: Path,
noise_path: Path,
school_proximity_path: Path,
school_catchments_path: Path,
broadband_path: Path,
conservation_areas_path: Path,
rental_prices_path: Path,
@ -2080,7 +2072,7 @@ def _build(
)
.select("postcode", "noise_lden_db")
)
school_proximity = pl.scan_parquet(school_proximity_path)
school_catchments = pl.scan_parquet(school_catchments_path)
conservation_areas = _conservation_area_by_postcode(
arcgis.select("postcode", "lat", "lon"), conservation_areas_path
)
@ -2120,7 +2112,7 @@ def _build(
"election": election,
"poi_counts": poi_counts,
"noise": noise,
"school_proximity": school_proximity,
"school_catchments": school_catchments,
"conservation_areas": conservation_areas,
"tree_density": tree_density,
"broadband": broadband,
@ -2267,10 +2259,10 @@ def main():
"--noise", type=Path, required=True, help="Road noise by postcode parquet file"
)
parser.add_argument(
"--school-proximity",
"--school-catchments",
type=Path,
required=True,
help="School proximity counts parquet file",
help="School catchment counts parquet file",
)
parser.add_argument(
"--broadband",
@ -2376,7 +2368,7 @@ def main():
ethnicity_path=args.ethnicity,
crime_path=args.crime,
noise_path=args.noise,
school_proximity_path=args.school_proximity,
school_catchments_path=args.school_catchments,
broadband_path=args.broadband,
conservation_areas_path=args.conservation_areas,
rental_prices_path=args.rental_prices,

View file

@ -25,11 +25,30 @@ POI_GROUPS_2KM = {
# Greengrocer, ...) and the GEOLYTIX brand categories (Tesco, Aldi, ...).
GROCERIES_GROUP = "Groceries"
# Groceries categories EXCLUDED from the static "Number of grocery shops and
# supermarkets within 2km" metric. Bakeries, butchers, delis and off-licences
# are speciality food retail, not somewhere you do a grocery shop; together
# they were ~a third of the group and inflated the headline count. The metric
# keeps Supermarket, Convenience Store, Greengrocer and every GEOLYTIX brand.
GROCERY_STATIC_EXCLUDED_CATEGORIES = {
"Bakery",
"Butcher & Fishmonger",
"Deli & Specialty",
"Off-Licence",
}
# OS Open Greenspace function types used for park counts and distance calculation.
# Uses the authoritative OS dataset instead of OSM point POIs for better coverage
# of green spaces that are only mapped as polygons in OSM.
# Scope: "Public Park Or Garden" is the core park function. "Playing Field"
# (open public recreation grounds) is borderline but kept: outside big cities
# the local rec ground is the de facto park. "Play Space" (playgrounds) is
# excluded — a playground is not a park, and "Playground" is already its own
# OSM-derived category. The remaining functions (Religious Grounds, Golf
# Course, Cemetery, Allotments, Bowling Green, Tennis Court, Other Sports
# Facility) are clearly not parks.
GREENSPACE_PARK_FUNCTIONS = {
    # Exactly one "parks" entry: a repeated key in a dict literal silently
    # overwrites the earlier one. "Play Space" is intentionally not listed.
    "parks": ["Public Park Or Garden", "Playing Field"],
}
GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
@ -50,17 +69,22 @@ def _poi_category_slug(category: str) -> str:
def _groceries_categories(pois: pl.DataFrame) -> list[str]:
"""Return the distinct `category` values for the Groceries group.
"""Return the distinct `category` values for the static groceries metric.
`count_pois_per_postcode` matches POIs on `category`, but the authoritative
GEOLYTIX grocery dataset stores the brand name there (e.g. "Tesco", "Aldi")
with group "Groceries"; it never emits the literal "Supermarket". Collecting
every Groceries category captures both the OSM strings and the brand names.
Speciality food retail (bakeries, butchers, delis, off-licences) is
excluded — see GROCERY_STATIC_EXCLUDED_CATEGORIES.
"""
if "group" not in pois.columns:
raise ValueError("POI dataframe must include a 'group' column")
return (
pois.filter(pl.col("group") == GROCERIES_GROUP)
pois.filter(
(pl.col("group") == GROCERIES_GROUP)
& ~pl.col("category").is_in(list(GROCERY_STATIC_EXCLUDED_CATEGORIES))
)
.select("category")
.unique()
.sort("category")
@ -109,6 +133,40 @@ def _build_poi_category_groups(
return groups, display_names
def _greenspace_count_frame(greenspace: pl.DataFrame) -> pl.DataFrame:
    """Reduce the greenspace frame to ONE representative row per site.

    os_greenspace.parquet is one row per ACCESS POINT (park gate). That is the
    right grain for nearest-distance (the nearest gate is what matters) but it
    wildly over-counts "Number of amenities (Park) within Xkm" — a large park
    with 30 gates would count as 30 parks. For counting, each ``site_id``
    keeps a single row, positioned at the site centroid (``site_lat`` /
    ``site_lng``) when those columns exist, otherwise at the kept access
    point. Rows whose ``site_id`` is null are passed through untouched.

    Degrades gracefully: a legacy parquet without ``site_id`` is returned
    unchanged (gate-grain counts), with a printed warning, rather than
    crashing.

    Args:
        greenspace: Access-point-grain greenspace frame.

    Returns:
        A frame with the same columns as ``greenspace``, one row per site plus
        any rows that carry no ``site_id``.
    """
    columns = greenspace.columns
    if "site_id" not in columns:
        print(
            "WARNING: greenspace parquet has no site_id column; park counts "
            "will count access points, not sites (regenerate os_greenspace)"
        )
        return greenspace

    site_id = pl.col("site_id")
    without_site = greenspace.filter(site_id.is_null())
    # First row seen for each site_id stands in for the whole site.
    per_site = greenspace.filter(site_id.is_not_null()).unique(
        subset=["site_id"], keep="first"
    )

    if {"site_lat", "site_lng"} <= set(columns):
        # Prefer the site centroid; fall back to the access point's position.
        per_site = per_site.with_columns(
            pl.coalesce(pl.col("site_lat"), pl.col("lat")).alias("lat"),
            pl.coalesce(pl.col("site_lng"), pl.col("lng")).alias("lng"),
        )

    pieces = [per_site.select(columns)]
    if not without_site.is_empty():
        pieces.append(without_site)
    return pl.concat(pieces)
def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
renames: dict[str, str] = {}
for group_key, category in display_names.items():
@ -185,13 +243,16 @@ def main():
# Park counts and distances from OS Open Greenspace. They use the dynamic
# amenity metric names so filters read through the same side-table path as
# OSM-derived amenity metrics.
# OSM-derived amenity metrics. Distances use the access-point grain (the
# nearest park GATE is the right semantics); counts use one row per SITE so
# a park with many gates counts once.
greenspace = pl.read_parquet(args.greenspace)
greenspace_sites = _greenspace_count_frame(greenspace)
park_counts_2km = count_pois_per_postcode(
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
)
park_counts_5km = count_pois_per_postcode(
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
)
park_distances = min_distance_per_postcode(
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS

View file

@ -260,6 +260,12 @@ def main() -> None:
)
args = parser.parse_args()
if args.greenspace and not args.greenspace.exists():
# Fail loudly and EARLY (before the ~10h Phases 1-3): silently skipping
# the subtraction is exactly how parks/lakes shipped inside postcode
# boundaries unnoticed.
raise SystemExit(f"--greenspace file not found: {args.greenspace}")
fragments_cache = args.output / "fragments_cache.parquet"
# Phase 3 depends only on these inputs; greenspace is applied later (Phase 4),
# so a greenspace change must not invalidate the fragment cache.
@ -294,7 +300,7 @@ def main() -> None:
greenspace_tree = None
greenspace_geoms = None
if args.greenspace and args.greenspace.exists():
if args.greenspace:
from .greenspace import load_greenspace
print(f" Loading greenspace/water from {args.greenspace}...")

View file

@ -3,7 +3,7 @@
from pathlib import Path
import polars as pl
from shapely import wkb
from shapely import make_valid, wkb
from shapely.geometry import MultiPolygon, Polygon
from shapely.strtree import STRtree
@ -13,12 +13,23 @@ from .geometry import safe_difference, safe_union
def load_greenspace(path: Path) -> tuple[STRtree, list]:
    """Load greenspace parquet and build an STRtree spatial index.

    Geometries are repaired with ``make_valid`` on load: an invalid park/lake
    polygon would make the per-postcode ``intersects`` predicate (and the exact
    difference path) liable to raise mid-merge, hours into a build. Empty
    geometries are dropped.

    Each WKB value is parsed exactly once, inside the repair loop; there is no
    separate up-front parse whose result would only be thrown away.

    Args:
        path: Parquet file with a ``geometry`` column of WKB values.

    Returns:
        (tree, geoms) where tree is a Shapely STRtree and geoms is
        the list of geometries indexed by the tree.
    """
    df = pl.read_parquet(path)
    geoms = []
    for raw in df["geometry"].to_list():
        geom = wkb.loads(raw)
        if not geom.is_valid:
            geom = make_valid(geom)
        # Checked after the repair, so a geometry that is (or becomes) empty
        # never enters the index.
        if not geom.is_empty:
            geoms.append(geom)
    tree = STRtree(geoms)
    return tree, geoms

View file

@ -101,6 +101,21 @@ def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
return geojson_dict
def _is_pointlike(geom_bng) -> bool:
    """Report whether a BNG geometry carries no real extent.

    This is the tower-block signature: area below ``_POINTLIKE_AREA_M2`` AND
    perimeter below ``_POINTLIKE_PERIMETER_M``. Requiring both separates a
    collapsed point from a genuine thin sliver, which still carries length.

    Returns:
        True only when both measurements are under their thresholds; False
        otherwise, including when measuring raises ``GEOSException``.
    """
    try:
        # Area first; the perimeter is only measured for near-zero areas.
        if not (geom_bng.area < _POINTLIKE_AREA_M2):
            return False
        return geom_bng.length < _POINTLIKE_PERIMETER_M
    except GEOSException:
        return False
def _rescue_footprint(geom_bng) -> dict | None:
"""Fatten a degenerate BNG geometry into a representable footprint and snap.
@ -109,15 +124,9 @@ def _rescue_footprint(geom_bng) -> dict | None:
gets a building-scale buffer so it is not reduced to an invisible sub-metre
dot; thin slivers that still carry length keep the minimal buffer.
"""
buffer_m = _MIN_FOOTPRINT_BUFFER_M
try:
if (
geom_bng.area < _POINTLIKE_AREA_M2
and geom_bng.length < _POINTLIKE_PERIMETER_M
):
buffer_m = _POINT_RESCUE_BUFFER_M
except GEOSException:
pass
buffer_m = (
_POINT_RESCUE_BUFFER_M if _is_pointlike(geom_bng) else _MIN_FOOTPRINT_BUFFER_M
)
footprint = _largest_polygonal(geom_bng.buffer(buffer_m))
if footprint is None:
return None
@ -147,10 +156,16 @@ def to_wgs84_geojson(
)
if simplified is None:
simplified = cleaned
# Normal path; if snapping erases a thin sliver, fatten its real shape.
result = _snap_to_wgs84_geojson(simplified)
if result is None:
if _is_pointlike(simplified):
# A POINTLIKE footprint is rescued to building scale even when it
# would survive snapping: a 0.1-1 m² polygon serializes fine but
# ships as an invisible dot covering a whole tower block.
result = _rescue_footprint(simplified)
else:
# Normal path; if snapping erases a thin sliver, fatten its real shape.
result = _snap_to_wgs84_geojson(simplified)
if result is None:
result = _rescue_footprint(simplified)
if result is not None:
return result
@ -229,6 +244,10 @@ def merge_fragments(
greenspace_tree: Optional STRtree of park/water polygons.
greenspace_geoms: Optional list of park/water geometries (indexed by tree).
"""
subtract = greenspace_tree is not None and greenspace_geoms is not None
if subtract:
from .greenspace import subtract_greenspace
by_postcode: dict[str, list] = defaultdict(list)
for pc, geom in all_fragments:
by_postcode[pc].append(geom)
@ -256,9 +275,7 @@ def merge_fragments(
# Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
combined = _fill_holes(combined)
# Subtract parks/water if provided
if greenspace_tree is not None and greenspace_geoms is not None:
from .greenspace import subtract_greenspace
if subtract:
pre_green = combined
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
combined = _keep_polygon_parts(combined)

View file

@ -921,6 +921,49 @@ class TestToWgs84Geojson:
area_m2 = transform_geometry(to_bng.transform, shape(result)).area
assert area_m2 > 100, f"point footprint only {area_m2:.1f} m^2"
def test_snappable_pointlike_polygon_still_gets_building_scale_footprint(self):
    """Pointlike footprints are rescued even when they would survive snapping.

    A collapsed-but-snappable footprint (e.g. EC2A 2FJ: 181 properties on
    0.86 m²) must NOT ship as-is just because it survives precision snapping;
    pointlike inputs are rescued to a ~201 m² disc unconditionally.
    """
    import pyproj
    from shapely.geometry import shape
    from shapely.ops import transform as transform_geometry

    from .output import _snap_to_wgs84_geojson

    # 0.9m x 0.9m square: area 0.81 m², perimeter 3.6 m — pointlike, yet
    # large enough (~8 output-grid cells) to survive the 1e-6 deg snap.
    tiny = box(530000, 180000, 530000.9, 180000.9)
    snapped_directly = _snap_to_wgs84_geojson(tiny)
    assert snapped_directly is not None, (
        "precondition: this polygon must be snappable, otherwise the test "
        "exercises the old snap-fails path instead of the new one"
    )

    result = to_wgs84_geojson(tiny)
    assert result is not None

    # Measure the shipped footprint back in metres (British National Grid).
    wgs84_to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    )
    area_m2 = transform_geometry(wgs84_to_bng.transform, shape(result)).area
    assert 150 < area_m2 < 300, (
        f"pointlike snappable footprint shipped at {area_m2:.2f} m^2 "
        "instead of a building-scale (~201 m^2) disc"
    )
def test_normal_polygon_area_unchanged(self):
    """An ordinary polygon passes through without rescue inflation.

    The 100 m x 100 m square must come back at 10,000 m² (within 1%) once the
    emitted GeoJSON is projected back to metres.
    """
    import pyproj
    from shapely.geometry import shape
    from shapely.ops import transform as transform_geometry

    poly = box(530000, 180000, 530100, 180100)  # 10,000 m²
    result = to_wgs84_geojson(poly)
    assert result is not None

    wgs84_to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    )
    round_tripped = transform_geometry(wgs84_to_bng.transform, shape(result))
    assert round_tripped.area == pytest.approx(10_000, rel=0.01)
def test_thin_sliver_keeps_minimal_buffer(self):
"""A genuine elongated sliver still carries length, so it is NOT inflated
to building scale — only truly pointlike inputs are."""
@ -1132,6 +1175,26 @@ class TestSubtractGreenspace:
# 80% < 90% cap, so subtraction should happen
assert result.area == pytest.approx(2000, rel=0.01)
def test_load_greenspace_repairs_invalid_and_drops_empty(self, tmp_path):
    """An invalid park polygon in the parquet is repaired when loaded.

    A bow-tie (self-intersecting) polygon would otherwise make the
    per-postcode intersects/difference liable to raise hours into a merge.
    """
    from .greenspace import load_greenspace

    # The ring visits the corners in crossing order, so it self-intersects.
    bowtie = Polygon([(0, 0), (10, 10), (10, 0), (0, 10)])
    assert not bowtie.is_valid
    valid = box(20, 20, 30, 30)

    path = tmp_path / "greenspace.parquet"
    frame = pl.DataFrame({"geometry": [bowtie.wkb, valid.wkb]})
    frame.write_parquet(path)

    tree, geoms = load_greenspace(path)
    assert len(geoms) == 2
    for geom in geoms:
        assert geom.is_valid and not geom.is_empty

    # The repaired bow-tie must still subtract cleanly.
    postcode_area = box(0, 0, 100, 100)
    result = subtract_greenspace(postcode_area, tree, geoms)
    assert result.is_valid
    assert result.area < 10_000
class TestToWgs84GeojsonValidity:
"""to_wgs84_geojson must emit GeoJSON that round-trips to a valid geometry."""

View file

@ -26,6 +26,7 @@ from pipeline.transform.price_estimation.shrinkage import (
from pipeline.transform.price_estimation.utils import (
CURRENT_YEAR,
LATEST_COMPLETE_YEAR,
SMOOTHNESS_SUPPORT_PAIRS,
TEMPORAL_SMOOTHNESS_LAMBDA,
TYPE_GROUPS,
build_hedonic_features,
@ -37,6 +38,19 @@ from pipeline.transform.price_estimation.utils import (
MIN_PAIRS = 5
OUTLIER_THRESHOLD = 3.0 # hard pre-filter; Huber handles the rest
# Gap-aware companion to OUTLIER_THRESHOLD: |log_ratio| must also stay within
# this many log-units PER YEAR of holding period (short gaps are allowed a
# full year's band). A flat +/-3.0 cap admits e.g. a 10k -> 196k "sale" six
# months apart (log +2.95, and weight 1/sqrt(gap) gives it the leverage of
# ~10 normal pairs); Huber does NOT recover, because once the thin year's
# beta satisfies the garbage pair it is the many good long-gap pairs that
# carry the residual and get down-weighted. Such pairs are data errors or
# non-market transfers (right-to-buy, probate, flips), not house-price
# signal -- standard repeat-sales practice (Case-Shiller) excludes extreme
# annualised returns for the same reason. 0.7 log/yr (~2x in a year) keeps
# any plausible genuine market move; long-gap pairs are still governed by
# the +/-3.0 cap.
ANNUALISED_OUTLIER_THRESHOLD = 0.7
HUBER_K = 1.345
IRLS_ITERATIONS = 5
@ -111,7 +125,16 @@ def extract_pairs(input_path: Path, max_year2: int | None = None) -> pl.DataFram
/ (pl.col("frac_year2") - pl.col("frac_year1")).cast(pl.Float64).sqrt()
).alias("weight"),
)
.filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
.filter(
pl.col("log_ratio").abs()
<= pl.min_horizontal(
pl.lit(OUTLIER_THRESHOLD),
ANNUALISED_OUTLIER_THRESHOLD
* pl.max_horizontal(
pl.col("frac_year2") - pl.col("frac_year1"), pl.lit(1.0)
),
)
)
.collect()
)
@ -181,11 +204,27 @@ def solve_robust_index(
# beta=0) has no column, so the penalty spans the non-baseline years only.
# For cells with <3 betas there is no curvature to penalise and the solve is
# unchanged.
#
# The penalty is SUPPORT-SCALED per row: a flat lambda is too weak for
# years identified by only 1-2 repeat-sale pairs (a cell can have hundreds
# of pairs overall yet single thin years, yielding 2-7x one-year spikes
# that cell-level shrinkage cannot catch). Each curvature row's lambda is
# lambda0 * (1 + SMOOTHNESS_SUPPORT_PAIRS / s), with s the minimum
# cross-year pair count among the row's three years, so thin years are
# pulled strongly toward the local trend while well-supported years keep
# the baseline penalty. Taking the min over the triple (not just the
# middle year) also covers thin FIRST/LAST years of the range, which only
# ever appear at a triple's edge -- the last solved year feeds the
# CURRENT_YEAR trend extrapolation, so spikes there are the costliest.
n_pen = 0
pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
cross = years1 != years2
touched, counts = np.unique(
np.concatenate([years1[cross], years2[cross]]), return_counts=True
)
support = {int(y): int(c) for y, c in zip(touched, counts)}
years_sorted = sorted(year_to_col)
cols_by_year = [year_to_col[y] for y in years_sorted]
n_pen = n_cols - 2
@ -202,6 +241,11 @@ def solve_robust_index(
w0 = 2.0 / ((y1 - y0) * (y2 - y0))
w1 = -2.0 / ((y1 - y0) * (y2 - y1))
w2 = 2.0 / ((y2 - y1) * (y2 - y0))
s_k = min(support.get(y, 0) for y in (y0, y1, y2))
lam_k = TEMPORAL_SMOOTHNESS_LAMBDA * (
1.0 + SMOOTHNESS_SUPPORT_PAIRS / max(s_k, 1)
)
sqrt_lambda = float(np.sqrt(lam_k))
pen_vals[3 * k : 3 * k + 3] = (
sqrt_lambda * w0,
sqrt_lambda * w1,
@ -347,10 +391,22 @@ def compute_hedonic_index(
EXTRAPOLATION_YEARS = 3
# Bound on the per-year slope used to trend-extrapolate beyond the last solved
# year (the solve stops at LATEST_COMPLETE_YEAR; CURRENT_YEAR is filled here).
# +/-0.10 log/yr (~+/-10.5%/yr) comfortably covers genuine UK sector-level
# annual moves while preventing a residual spike in the recent betas from
# compounding into an absurd extrapolated step (e.g. +49% in one year).
MAX_EXTRAPOLATION_SLOPE = 0.10
def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
"""Forward-fill missing years, with linear extrapolation beyond last known year."""
"""Forward-fill missing years, with trend extrapolation beyond last known year.
The extrapolation slope is the MEDIAN of the per-year slopes between
consecutive known points in the recent window (a single noisy year corrupts
at most one of those slopes, unlike a least-squares fit through all the
points), clamped to +/-MAX_EXTRAPOLATION_SLOPE.
"""
if not index:
return {y: 0.0 for y in range(min_year, max_year + 1)}
@ -365,7 +421,7 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
last = index[y]
filled[y] = last
# Linear extrapolation beyond last known year
# Robust trend extrapolation beyond last known year
if last_known_year < max_year:
recent = [
(y, index[y])
@ -373,9 +429,17 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
if y >= last_known_year - EXTRAPOLATION_YEARS
]
if len(recent) >= 2:
years_arr = np.array([r[0] for r in recent], dtype=np.float64)
vals_arr = np.array([r[1] for r in recent], dtype=np.float64)
slope = np.polyfit(years_arr, vals_arr, 1)[0]
slopes = [
(v_b - v_a) / (y_b - y_a)
for (y_a, v_a), (y_b, v_b) in zip(recent[:-1], recent[1:])
]
slope = float(
np.clip(
np.median(slopes),
-MAX_EXTRAPOLATION_SLOPE,
MAX_EXTRAPOLATION_SLOPE,
)
)
for y in range(last_known_year + 1, max_year + 1):
filled[y] = index[last_known_year] + slope * (y - last_known_year)
else:
@ -389,12 +453,16 @@ def build_index(
input_path: Path,
max_pair_year: int | None = None,
postcodes_path: Path | None = None,
sectors: list[str] | None = None,
) -> pl.DataFrame:
"""Build the full price index from raw data.
If max_pair_year is set, only pairs before that year are used (backtesting holdout).
The index is still forward-filled to CURRENT_YEAR.
postcodes_path: if provided, lat/lon are read from this file instead of input_path.
sectors: if provided, restrict the build to these postcode sectors (for
debugging/verification runs; hierarchy levels are then computed only from
the scoped pairs, so scoped output is NOT identical to a full build).
"""
# Solve the index only on COMPLETE calendar years: exclude the partial
# current year, whose thin repeat-sale set yields wild betas. The index is
@ -405,6 +473,9 @@ def build_index(
max_pair_year if max_pair_year is not None else LATEST_COMPLETE_YEAR + 1
)
pairs = extract_pairs(input_path, max_year2=estimation_cap)
if sectors is not None:
pairs = pairs.filter(pl.col("sector").is_in(sectors))
print(f" Scoped to {len(sectors)} sectors: {len(pairs):,} pairs")
centroids = extract_centroids(postcodes_path or input_path)
min_year = int(pairs["year1"].min())
@ -534,9 +605,21 @@ def main():
help="Path to postcode.parquet (for lat/lon centroids)",
)
parser.add_argument("--output", type=Path, required=True)
parser.add_argument(
"--sectors",
type=str,
default=None,
help="Comma-separated postcode sectors to scope the build to "
"(debug/verification only; hierarchy is computed from scoped pairs)",
)
args = parser.parse_args()
result = build_index(args.input, postcodes_path=args.postcodes)
sectors = (
[s.strip() for s in args.sectors.split(",") if s.strip()]
if args.sectors
else None
)
result = build_index(args.input, postcodes_path=args.postcodes, sectors=sectors)
result.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024)

View file

@ -3,7 +3,10 @@ import polars as pl
from pipeline.transform.price_estimation import index as index_mod
from pipeline.transform.price_estimation.index import (
MAX_EXTRAPOLATION_SLOPE,
compute_indices_for_level,
extract_pairs,
forward_fill,
solve_robust_index,
)
@ -105,6 +108,139 @@ def test_gap_spanning_level_jump_is_not_smoothed_into_a_ramp():
assert abs(idx[2015] - true[2015]) < 0.05
def _ramp_pairs_with_thin_tail(tail_ratio: float, tail_n: int, ramp_reps: int):
"""Smooth 0.04/yr ramp 2010-2020 with `ramp_reps` copies of each adjacent
pair, plus `tail_n` pair(s) 2020->2021 asserting a `tail_ratio` jump."""
years = range(2010, 2021)
true = {y: 0.04 * (y - 2010) for y in years}
y1, y2, lr, w = [], [], [], []
for a in range(2010, 2020):
for _ in range(ramp_reps):
y1.append(a)
y2.append(a + 1)
lr.append(true[a + 1] - true[a])
w.append(1.0)
for _ in range(tail_n):
y1.append(2020)
y2.append(2021)
lr.append(tail_ratio)
w.append(1.0)
return (
np.array(y1, dtype=np.int32),
np.array(y2, dtype=np.int32),
np.array(lr, dtype=np.float64),
np.array(w, dtype=np.float64),
)
def test_support_scaled_penalty_suppresses_thin_year_spike(monkeypatch):
    """Support scaling damps a spike in a year backed by a single pair.

    A final year identified by a SINGLE pair claiming a +1.5 log jump is
    pulled strongly toward the local trend; with the flat baseline penalty
    (support scaling off) the jump survives almost entirely. The thin year is
    the LAST year of the range (only ever at a penalty triple's edge), proving
    the min-over-triple support rule covers range edges -- the last solved
    year feeds the CURRENT_YEAR trend extrapolation.
    """
    pairs = _ramp_pairs_with_thin_tail(tail_ratio=1.5, tail_n=1, ramp_reps=10)

    # Baseline: support scaling switched off only for this one solve.
    with monkeypatch.context() as patch:
        patch.setattr(index_mod, "SMOOTHNESS_SUPPORT_PAIRS", 0)
        flat = solve_robust_index(*pairs)
    scaled = solve_robust_index(*pairs)

    flat_step = flat[2021] - flat[2020]
    scaled_step = scaled[2021] - scaled[2020]
    assert flat_step > 1.2  # flat lambda barely resists the spike
    assert scaled_step < 0.65  # support-scaled lambda suppresses it

    # The well-supported ramp stays close to truth: the strong penalty row
    # spanning the thin year drags its immediate neighbour slightly (<0.1)
    # toward collinearity -- the price of suppressing a x4.5 one-year spike.
    for year in range(2010, 2021):
        assert abs(scaled[year] - 0.04 * (year - 2010)) < 0.1
def test_support_scaling_leaves_well_supported_years_unchanged(monkeypatch):
    """Support scaling is a near no-op when every year has ample pairs.

    With support of 50-100 pairs per year, lambda_eff ~ lambda0 and the
    solution matches the flat-penalty solve to <1e-3.
    """
    pairs = _ramp_pairs_with_thin_tail(tail_ratio=0.04, tail_n=50, ramp_reps=50)

    # Flat-penalty reference: scaling disabled for this solve only.
    with monkeypatch.context() as patch:
        patch.setattr(index_mod, "SMOOTHNESS_SUPPORT_PAIRS", 0)
        flat = solve_robust_index(*pairs)
    scaled = solve_robust_index(*pairs)

    assert set(flat) == set(scaled)
    worst_gap = max(abs(flat[year] - scaled[year]) for year in flat)
    assert worst_gap < 1e-3
def test_forward_fill_extrapolation_uses_robust_median_slope():
    """One spiky recent year must not corrupt the extrapolated step.

    The median of consecutive per-year slopes ignores the spike, whereas a
    least-squares fit through the same points would extrapolate a large
    positive slope.
    """
    known = {2022: 1.00, 2023: 1.05, 2024: 1.60, 2025: 1.10}
    filled = forward_fill(known, 2022, 2026)
    # slopes: [+0.05, +0.55, -0.50] -> median +0.05
    expected_2026 = 1.10 + 0.05
    assert abs(filled[2026] - expected_2026) < 1e-9
def test_forward_fill_extrapolated_slope_is_clamped():
    """A consistent but absurd recent trend is clamped in both directions.

    A steady +/-0.4 per year run must extrapolate by exactly
    +/-MAX_EXTRAPOLATION_SLOPE for the one missing year.
    """
    rising = {2022: 0.0, 2023: 0.4, 2024: 0.8, 2025: 1.2}
    filled_up = forward_fill(rising, 2022, 2026)
    assert abs(filled_up[2026] - (1.2 + MAX_EXTRAPOLATION_SLOPE)) < 1e-9

    falling = {2022: 1.2, 2023: 0.8, 2024: 0.4, 2025: 0.0}
    filled_down = forward_fill(falling, 2022, 2026)
    assert abs(filled_down[2026] - (0.0 - MAX_EXTRAPOLATION_SLOPE)) < 1e-9
def test_forward_fill_preserves_sane_trend_and_flat_fallback():
    """Moderate trends still extrapolate; a lone known point fills flat.

    It stays a forward-FILL-with-trend: a steady +0.05/yr run continues to
    the next year, and with <2 recent points the last value is carried over.
    """
    steady = {2022: 1.00, 2023: 1.05, 2024: 1.10, 2025: 1.15}
    trended = forward_fill(steady, 2022, 2026)
    assert abs(trended[2026] - 1.20) < 1e-9

    single_point = forward_fill({2025: 0.7}, 2024, 2026)
    assert single_point[2026] == 0.7
def test_extract_pairs_drops_extreme_annualised_returns(tmp_path):
    """Pairs with extreme annualised returns are dropped; long holds are kept.

    A +-3.0 log cap alone admits e.g. a 10x 'gain' in six months -- a data
    error or non-market transfer with huge leverage (weight = 1/sqrt(gap)).
    Such pairs are dropped via the annualised cap; large ratios over long
    holding periods (genuine appreciation) are kept.
    """
    # +2.30 log in 6 months -> dropped (cap 0.7 for gap <= 1yr)
    six_month_flip = [
        {"year": 2020, "month": 1, "price": 100_000},
        {"year": 2020, "month": 7, "price": 1_000_000},
    ]
    # +2.20 log over 24 years -> kept (flat 3.0 cap governs)
    long_hold = [
        {"year": 2000, "month": 1, "price": 100_000},
        {"year": 2024, "month": 1, "price": 900_000},
    ]
    # +0.41 log in 1 year -> kept (within the 0.7/yr band)
    one_year_rise = [
        {"year": 2020, "month": 1, "price": 100_000},
        {"year": 2021, "month": 1, "price": 150_000},
    ]
    df = pl.DataFrame(
        {
            "Postcode": ["AB1 2CD", "AB1 2CE", "AB1 2CF"],
            "Property type": ["Detached", "Detached", "Detached"],
            "historical_prices": [six_month_flip, long_hold, one_year_rise],
        }
    )
    path = tmp_path / "props.parquet"
    df.write_parquet(path)

    pairs = extract_pairs(path)

    assert len(pairs) == 2
    ratios = sorted(round(value, 2) for value in pairs["log_ratio"].to_list())
    assert ratios == [0.41, 2.2]
def test_n_pairs_counts_only_cross_year_pairs():
"""FIX #12: same-year pairs carry zero index information and must not inflate
the shrinkage weight; n_pairs counts only cross-year (year2 != year1) pairs."""

View file

@ -36,6 +36,20 @@ SHRINKAGE_K = 50
# noisy year) without flattening genuine multi-year trends.
TEMPORAL_SMOOTHNESS_LAMBDA = 0.05
# Per-year support scaling for the temporal smoothness penalty. A flat lambda
# is too weak for years with very few repeat-sale pairs: a sector can have
# hundreds of pairs overall (so cell-level n/(n+k) shrinkage barely moves it)
# yet have individual years estimated from 1-2 pairs, producing 2-7x
# single-year index spikes. Each curvature row is therefore scaled by the
# local pair support of its year triple:
# lambda_eff = lambda0 * (1 + SMOOTHNESS_SUPPORT_PAIRS / s)
# where s is the minimum cross-year pair count among the triple's years.
# Well-supported years (s >> SMOOTHNESS_SUPPORT_PAIRS) keep lambda_eff ~
# lambda0 (current behaviour); a year identified by a single pair gets
# ~41x lambda0, pulling its beta strongly toward the local trend through its
# neighbours. Same-year pairs cancel in the design and are not counted.
SMOOTHNESS_SUPPORT_PAIRS = 40
def type_group_expr():
"""Polars expression: Property type -> type_group."""

View file

@ -0,0 +1,748 @@
"""Model historical school catchment areas and count them per postcode.
No national dataset of school catchment areas exists for England: catchments
are set per admission authority, only a handful of councils publish polygons,
and the pupil-residence data behind commercial "heatmap" catchments lives in
the restricted National Pupil Database. This module therefore COMPILES one
from open data, estimating each school's admission cutoff distance ("last
distance offered") — the radius within which an applicant would plausibly be
offered a place.
Model: English state admissions are run as deferred acceptance with distance
tie-breaks, which in a continuum economy is equivalent to finding
market-clearing cutoff distances (Azevedo & Leshno 2016). Per phase
(primary/secondary):
  1. Demand — Census 2021 children per LSOA (TS007A age bands, prorated to the
     phase's cohort ages) split evenly across the LSOA's live postcodes.
  2. Supply — every open, non-selective state-funded school (GIAS), with a fill
     target of max(capacity, headcount) prorated to the phase's cohorts
     (sixth-form and nursery years carry reduced weight, since their class
     sizes differ and they are not allocated by the same admissions round).
  3. Preferences — children prefer nearby schools, trading distance against
     Ofsted grade: a school's effective distance is its real distance minus a
     grade bonus (Outstanding > Good > ungraded > below-Good). Because real
     first preferences are heterogeneous, each postcode's children split
     across nearby feasible schools with logit weights over effective
     distance rather than all picking the same one.
  4. Equilibrium — cutoffs start unbounded and tighten monotonically: each
     round, children apply to their preferred feasible school(s), and
     oversubscribed schools tighten their cutoff to the distance of their
     marginal admitted child. Converges to the deferred-acceptance outcome.
  5. Schools that never fill have no binding cutoff — anyone who applies gets
     in — so their feasibility radius is the distance within which the local
     child population would cover their fill target, capped.
The free parameters (preference bonuses, demand scale, choice temperature,
residual calibration factors) are CALIBRATED against published "last
distance offered" figures scraped from nine local authorities' allocation
reports — see check_school_cutoffs.py and the constants below.
A postcode is "inside the catchment" of every school whose cutoff radius
covers it. The output counts those schools per postcode for the four
good+/outstanding x primary/secondary categories (Ofsted-classified, same
rules as the previous proximity metric). Selective (grammar) schools are
excluded throughout: their intakes are test-based and region-wide, so a
distance model would fabricate a catchment that does not exist.
Known limitations: faith oversubscription criteria are not modelled (whether
a faith school's catchment is open to a given family depends on the family),
and Census 2021 child counts lag current rolls slightly. Cutoffs are
straight-line distances, the modal LA tie-break criterion.
"""
import argparse
from pathlib import Path
import numpy as np
import polars as pl
from scipy.spatial import cKDTree
from pipeline.utils.poi_counts import _project_lat_lng_km, valid_uk_coords_mask
# Output group -> the rated school categories summed into that group's
# "<group>_catchments" count. The "good_*" groups are good-or-better (they
# include the outstanding category); the "outstanding_*" groups count only
# the outstanding category.
SCHOOL_GROUPS = {
    "good_primary": ["good_primary", "outstanding_primary"],
    "good_secondary": ["good_secondary", "outstanding_secondary"],
    "outstanding_primary": ["outstanding_primary"],
    "outstanding_secondary": ["outstanding_secondary"],
}
# Age thresholds for deciding which phase(s) a school serves. A school serves
# PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age
# children if its statutory highest age is >= 12. All-through (e.g. 3-18) and
# middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in
# both the primary and the secondary metrics — Ofsted's coarse "Ofsted phase"
# labels such schools as just "Secondary", which previously hid them from every
# postcode's primary-school count.
PRIMARY_MAX_AGE = 10
SECONDARY_MIN_AGE = 12
# Cohort ages (inclusive) each phase competes for: Reception-Y6 and Y7-Y11.
PRIMARY_AGES = (4, 10)
SECONDARY_AGES = (11, 15)
# Cohort weights for prorating a school's headcount/capacity across the ages
# it teaches. Nursery classes are typically part-time and small; sixth forms
# run at roughly 60% of a school's Y7-Y11 cohort size. A flat proration
# undersupplied secondary places by ~8%.
NURSERY_COHORT_WEIGHT = 0.5  # ages < 4
SIXTH_FORM_COHORT_WEIGHT = 0.6  # ages >= 16
# Only schools that admit (mostly) by geography take part in the assignment.
# Independent, special and Welsh schools and post-16 colleges either don't
# admit by distance or fall outside the England postcode universe; selective
# (grammar) schools admit by test from a wide region.
STATE_SCHOOL_TYPE_GROUPS = [
    "Academies",
    "Local authority maintained schools",
    "Free Schools",
]
# Preference bonuses (km of extra travel a family accepts for a better
# school), applied as a discount on effective distance when children choose.
# Grade 3/4 schools repel by the same magnitudes.
PREF_BONUS_OUTSTANDING_KM = 0.6
PREF_BONUS_GOOD_KM = 0.3
# Share of resident children who actually compete for state places. Census
# 2021 counts overstate current entry cohorts (birth rates fell ~10% between
# 2016 and 2021, which is exactly the gap between the census stock and the
# children reaching Reception by mid-decade) and independent/home-educated
# children (~7%) never enter the allocation at all. Without this, modelled
# cutoffs run systematically tight and undersubscribed schools look full.
DEMAND_SCALE = 0.8
# Logit choice temperature (km). With deterministic choice every child at a
# postcode ranks the same school first, so popular schools fill entirely from
# their nearest band and the marginal admitted child sits unrealistically
# close. Real first preferences are heterogeneous; a school draws only a
# distance-decaying share of nearby families. Children therefore split across
# nearby feasible schools with weights softmax(-effective_distance / tau):
# higher tau = more smearing = wider cutoffs. tau -> 0 recovers the
# deterministic model (used by the unit tests). Calibrated 2026-06 against
# 240 published binding cutoffs from 9 LAs (check_school_cutoffs.py): 0.3 km
# maximises rank correlation and within-2x share; beyond ~0.6 the smearing
# erases school-to-school differentiation (Spearman 0.24 -> 0.01).
CHOICE_TEMPERATURE_KM = 0.3
# Residual calibration from the same ground truth: after the equilibrium
# solve, modelled cutoffs still ran systematically tight (median log2 bias
# -0.53 primary / -0.36 secondary at the settings above — published "last
# distance offered" reflects offer-day frictions, waiting-list churn and
# furthest-applicant noise that no clean equilibrium reproduces). Radii are
# multiplied by 2^-bias so the modelled median matches the published median;
# rank ordering is unaffected.
CUTOFF_CALIBRATION_FACTOR = {"primary": 1.44, "secondary": 1.28}
# Each demand postcode considers this many nearest schools; beyond ~16
# candidates assignment shares are negligible.
NEAREST_SCHOOL_CANDIDATES = 16
# Radius guard rails: the floor absorbs postcode-centroid noise around tiny
# urban catchments; the cap bounds feasibility radii for schools the model
# never fills (mostly rural).
MIN_RADIUS_KM = 0.3
MAX_RADIUS_KM = 25.0
# Upper bound on cutoff-tightening rounds in equilibrium_cutoffs; the solve
# stops earlier as soon as a round leaves every cutoff unchanged.
EQUILIBRIUM_MAX_ITER = 100
def classify_good_plus_schools(
    ofsted: pl.DataFrame, open_urns: set[int] | None = None
) -> pl.DataFrame:
    """Label good+/outstanding primary & secondary schools for catchment counts.

    Each school with a derived grade of "1" (outstanding) or "2" (good) yields
    one or two ``category`` rows, returned as a ``(urn, category)`` frame.

    Grade: taken from ``_with_derived_grade``. Schools with a recent GRADED
    inspection carry a 1-4 grade in "Latest OEIF overall effectiveness"
    (OEIF = the previous Ofsted Education Inspection Framework). Many schools
    were last inspected under an UNGRADED (Section 8) inspection or the
    post-2024 report-card framework, so that column is null/"Not judged" for
    them even when they are demonstrably good - their status lives in
    "Ungraded inspection overall outcome" ("School remains Good"/"School
    remains Outstanding"). Filtering on the graded column alone dropped
    ~7,000 genuinely good/outstanding schools, so the ungraded outcome is used
    as a fallback, but ONLY when there is no usable graded result, so a
    genuine grade 3/4 is never overridden. Outcomes flagged "(Concerns)" are
    NOT treated as good+: they signal issues warranting an earlier graded
    re-inspection.

    Phase: decided from the statutory age range when those columns exist (so
    all-through and middle schools count toward BOTH primary and secondary),
    falling back to the coarse "Ofsted phase" label otherwise.

    Args:
        ofsted: Ofsted inspection frame.
        open_urns: optional set of URNs on the current GIAS open register;
            when given (and a "URN" column exists), schools not in it are
            dropped so closed/merged schools are not counted.

    Returns:
        Frame with Int64 ``urn`` and string ``category`` columns.
    """
    in_scope_phase = pl.col("Ofsted phase").is_in(["Primary", "Secondary"])
    good_or_better = pl.col("_ofsted_grade").is_in(["1", "2"])
    schools = _with_derived_grade(ofsted).filter(in_scope_phase & good_or_better)

    # Stale "latest inspection" rows for closed/merged schools are removed
    # when the open register is supplied.
    if open_urns is not None and "URN" in schools.columns:
        schools = schools.filter(pl.col("URN").is_in(list(open_urns)))

    # Phase membership: statutory ages when present, phase label otherwise.
    label_is_primary = pl.col("Ofsted phase") == "Primary"
    label_is_secondary = pl.col("Ofsted phase") == "Secondary"
    age_columns = {"Statutory lowest age", "Statutory highest age"}
    if age_columns.issubset(schools.columns):
        lowest_age = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
        highest_age = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
        serves_primary = (
            pl.when(lowest_age.is_not_null())
            .then(lowest_age <= PRIMARY_MAX_AGE)
            .otherwise(label_is_primary)
        )
        serves_secondary = (
            pl.when(highest_age.is_not_null())
            .then(highest_age >= SECONDARY_MIN_AGE)
            .otherwise(label_is_secondary)
        )
    else:
        serves_primary = label_is_primary
        serves_secondary = label_is_secondary

    schools = schools.with_columns(
        serves_primary.alias("_serves_primary"),
        serves_secondary.alias("_serves_secondary"),
    )

    def rows_for_phase(flag_column, outstanding_label, good_label):
        # Grade 1 -> outstanding category; the only other grade left is 2.
        category = (
            pl.when(pl.col("_ofsted_grade") == "1")
            .then(pl.lit(outstanding_label))
            .otherwise(pl.lit(good_label))
        )
        return schools.filter(pl.col(flag_column)).with_columns(
            category.alias("category")
        )

    # A school serving both phases contributes one row to each frame.
    primary_rows = rows_for_phase(
        "_serves_primary", "outstanding_primary", "good_primary"
    )
    secondary_rows = rows_for_phase(
        "_serves_secondary", "outstanding_secondary", "good_secondary"
    )
    return pl.concat([primary_rows, secondary_rows]).select(
        pl.col("URN").cast(pl.Int64).alias("urn"),
        "category",
    )
def _with_derived_grade(ofsted: pl.DataFrame) -> pl.DataFrame:
    """Return ``ofsted`` with an added ``_ofsted_grade`` column.

    The grade is "1"-"4" or null. A graded OEIF result of "1"-"4" wins; when
    the graded result is null or "Not judged", an ungraded outcome starting
    with "School remains Outstanding"/"School remains Good" maps to "1"/"2",
    unless that outcome contains "(Concerns)".
    """
    # Both columns are cast to Utf8 so the string predicates are well-defined
    # even when a column is entirely null (read back as a Null dtype).
    graded_result = pl.col("Latest OEIF overall effectiveness").cast(
        pl.Utf8, strict=False
    )
    ungraded_outcome = pl.col("Ungraded inspection overall outcome").cast(
        pl.Utf8, strict=False
    )

    lacks_grade = graded_result.is_null() | (graded_result == "Not judged")
    flagged = ungraded_outcome.str.contains(r"\(Concerns\)")
    still_outstanding = (
        ungraded_outcome.str.starts_with("School remains Outstanding") & ~flagged
    )
    still_good = ungraded_outcome.str.starts_with("School remains Good") & ~flagged

    derived = (
        pl.when(graded_result.is_in(["1", "2", "3", "4"]))
        .then(graded_result)
        .when(lacks_grade & still_outstanding)
        .then(pl.lit("1"))
        .when(lacks_grade & still_good)
        .then(pl.lit("2"))
        .otherwise(None)
    )
    return ofsted.with_columns(derived.alias("_ofsted_grade"))
def school_preference_bonuses(
    ofsted: pl.DataFrame,
    bonus_outstanding_km: float = PREF_BONUS_OUTSTANDING_KM,
    bonus_good_km: float = PREF_BONUS_GOOD_KM,
) -> pl.DataFrame:
    """Map each school's derived Ofsted grade to a preference bonus in km.

    Grades "1"/"2" receive ``+bonus_outstanding_km``/``+bonus_good_km``;
    grades "3"/"4" receive the mirrored negative values; any other grade
    (including none) is neutral at 0.0.

    Args:
        ofsted: Ofsted inspection frame; rows with a null "URN" are ignored.
        bonus_outstanding_km: bonus magnitude for grade 1 (and penalty for 4).
        bonus_good_km: bonus magnitude for grade 2 (and penalty for 3).

    Returns:
        ``(urn, bonus_km)`` frame with one row per URN (first occurrence kept).
    """
    grade_to_bonus = {
        "1": bonus_outstanding_km,
        "2": bonus_good_km,
        "3": -bonus_good_km,
        "4": -bonus_outstanding_km,
    }
    bonus_expr = pl.col("_ofsted_grade").replace_strict(
        grade_to_bonus, default=0.0, return_dtype=pl.Float64
    )

    with_urn = _with_derived_grade(ofsted).filter(pl.col("URN").is_not_null())
    per_row = with_urn.select(
        pl.col("URN").cast(pl.Int64).alias("urn"),
        bonus_expr.alias("bonus_km"),
    )
    return per_row.unique(subset="urn", keep="first")
def phase_intakes(gias: pl.DataFrame) -> pl.DataFrame:
    """Compute per-school, phase-prorated fill targets for the admissions model.

    Keeps schools whose ``type_group`` is in ``STATE_SCHOOL_TYPE_GROUPS``,
    whose ``admissions_policy`` is null or not "Selective", and that have
    non-null coordinates, a parseable age range and a positive fill target.

    The fill target is max(pupils, capacity) - so over-full schools keep
    their demonstrated size and under-full schools can admit up to capacity.
    It is spread over the cohort ages the school teaches (parsed from
    ``age_range``, e.g. "3-11" = ages 3..10) with nursery and sixth-form ages
    down-weighted; each phase receives the share of cohort weight that falls
    in its age band.

    Returns:
        ``(urn, lat, lng, primary_intake, secondary_intake)``.
    """
    age_numbers = pl.col("age_range").str.extract_all(r"\d+")
    first_age = age_numbers.list.get(0, null_on_oob=True).cast(pl.Int64, strict=False)
    leaving_age = age_numbers.list.get(1, null_on_oob=True).cast(
        pl.Int64, strict=False
    )
    # The leaving age is exclusive as a cohort: a "3-11" school teaches
    # children aged 3 through 10.
    last_age = leaving_age - 1

    is_state_funded = pl.col("type_group").is_in(STATE_SCHOOL_TYPE_GROUPS)
    is_non_selective = pl.col("admissions_policy").is_null() | (
        pl.col("admissions_policy") != "Selective"
    )
    has_lat = pl.col("lat").is_not_null()
    has_lng = pl.col("lng").is_not_null()
    eligible = gias.filter(is_state_funded & is_non_selective & has_lat & has_lng)

    with_ages = eligible.with_columns(
        first_age.alias("_low"), last_age.alias("_high")
    ).filter(pl.col("_low").is_not_null() & (pl.col("_high") >= pl.col("_low")))

    larger_of_roll_and_capacity = pl.max_horizontal(
        pl.col("pupils").fill_null(0), pl.col("capacity").fill_null(0)
    ).cast(pl.Float64)
    schools = with_ages.with_columns(
        larger_of_roll_and_capacity.alias("_fill_target"),
    ).filter(pl.col("_fill_target") > 0)

    def weighted_overlap(lo: int, hi: int, weight: float = 1.0) -> pl.Expr:
        """Cohort weight contributed by ages [lo, hi] within [_low, _high]."""
        top = pl.min_horizontal(pl.col("_high"), hi)
        bottom = pl.max_horizontal(pl.col("_low"), lo)
        years = (top - bottom + 1).clip(lower_bound=0)
        return (weight * years).cast(pl.Float64)

    total_weight = (
        weighted_overlap(0, 3, NURSERY_COHORT_WEIGHT)
        + weighted_overlap(4, 15)
        + weighted_overlap(16, 30, SIXTH_FORM_COHORT_WEIGHT)
    )
    primary_intake = (
        pl.col("_fill_target") * weighted_overlap(*PRIMARY_AGES) / total_weight
    )
    secondary_intake = (
        pl.col("_fill_target") * weighted_overlap(*SECONDARY_AGES) / total_weight
    )

    return schools.select(
        pl.col("urn").cast(pl.Int64),
        "lat",
        "lng",
        primary_intake.alias("primary_intake"),
        secondary_intake.alias("secondary_intake"),
    )
def children_per_postcode(
    postcodes: pl.DataFrame, lsoa_children: pl.DataFrame
) -> pl.DataFrame:
    """Estimate the phase-age children living at each postcode.

    Census age bands don't align with school phases, so each phase total
    takes fractional shares of the five-year bands (one fifth per single
    year of age): primary (4-10) = 1/5 of 0-4 + all of 5-9 + 1/5 of 10-14;
    secondary (11-15) = 4/5 of 10-14 + 1/5 of 15-19. Each LSOA total is then
    divided evenly among the postcodes of ``postcodes`` in that LSOA.

    Args:
        postcodes: frame with ``postcode``, ``lat``, ``lng``, ``lsoa21cd``.
        lsoa_children: frame with ``lsoa21`` and the ``aged_*`` band columns.

    Returns:
        ``(postcode, lat, lng, primary_children, secondary_children)`` for
        postcodes whose LSOA appears in ``lsoa_children`` (inner join).
    """
    primary_total = (
        0.2 * pl.col("aged_0_4") + pl.col("aged_5_9") + 0.2 * pl.col("aged_10_14")
    )
    secondary_total = 0.8 * pl.col("aged_10_14") + 0.2 * pl.col("aged_15_19")
    lsoa_totals = lsoa_children.select(
        "lsoa21",
        primary_total.alias("_lsoa_primary"),
        secondary_total.alias("_lsoa_secondary"),
    )

    joined = postcodes.join(
        lsoa_totals, left_on="lsoa21cd", right_on="lsoa21", how="inner"
    )
    # Number of joined postcodes sharing each LSOA: the even-split divisor.
    counted = joined.with_columns(pl.len().over("lsoa21cd").alias("_lsoa_postcodes"))

    primary_share = pl.col("_lsoa_primary") / pl.col("_lsoa_postcodes")
    secondary_share = pl.col("_lsoa_secondary") / pl.col("_lsoa_postcodes")
    return counted.select(
        "postcode",
        "lat",
        "lng",
        primary_share.alias("primary_children"),
        secondary_share.alias("secondary_children"),
    )
def equilibrium_cutoffs(
    school_xy: np.ndarray,
    fill_target: np.ndarray,
    bonus_km: np.ndarray,
    pc_xy: np.ndarray,
    pc_children: np.ndarray,
    k: int = NEAREST_SCHOOL_CANDIDATES,
    max_iter: int = EQUILIBRIUM_MAX_ITER,
    tau_km: float = CHOICE_TEMPERATURE_KM,
) -> np.ndarray:
    """Market-clearing admission cutoff distance (km) per school.

    Deferred acceptance with distance priority, solved as cutoff dynamics
    (Azevedo & Leshno): cutoffs start unbounded; each round every child unit
    applies to its preferred feasible school(s) - a logit split over
    effective distance (distance - school bonus) among schools whose cutoff
    covers it, collapsing to the single best school when ``tau_km`` is 0 -
    and each oversubscribed school tightens its cutoff to its marginal
    admitted child's distance. Cutoffs only ever tighten, so the iteration
    converges.

    Args:
        school_xy: (n_schools, 2) projected school coordinates.
        fill_target: per-school number of places to fill.
        bonus_km: per-school preference bonus subtracted from distance.
        pc_xy: (n_postcodes, 2) projected postcode coordinates.
        pc_children: per-postcode child mass; only entries > 0 take part.
        k: number of nearest schools each postcode considers (capped at the
            number of schools).
        max_iter: maximum number of tightening rounds.
        tau_km: logit choice temperature; <= 0 selects deterministic choice.

    Returns:
        Array of length ``n_schools``; np.inf for schools that never fill (no
        binding cutoff). With no schools the array is empty, and with no
        demand every cutoff stays np.inf.
    """
    n_schools = len(school_xy)
    cutoff = np.full(n_schools, np.inf)
    demand = np.flatnonzero(pc_children > 0)
    # Nothing to solve without schools or applicants; the KD-tree query below
    # is not meaningful with k == 0 or zero query points.
    if n_schools == 0 or len(demand) == 0:
        return cutoff

    k = min(k, n_schools)
    weights = pc_children[demand]
    tree = cKDTree(school_xy)
    dist, cand = tree.query(pc_xy[demand], k=k, workers=-1)
    if k == 1:
        # query() squeezes the neighbour axis when k == 1; restore it.
        dist = dist[:, None]
        cand = cand[:, None]
    eff = dist - bonus_km[cand]
    rows = np.arange(len(demand))

    for _ in range(max_iter):
        # Candidates whose current cutoff excludes the postcode are infeasible.
        eff_feasible = np.where(dist <= cutoff[cand], eff, np.inf)
        if tau_km <= 0:
            choice = np.argmin(eff_feasible, axis=1)
            valid = np.isfinite(eff_feasible[rows, choice])
            chosen_school = cand[rows[valid], choice[valid]]
            chosen_dist = dist[rows[valid], choice[valid]]
            chosen_mass = weights[valid]
        else:
            z = -eff_feasible / tau_km
            z_max = z.max(axis=1, keepdims=True)
            # Subtract the row maximum for numerical stability; rows with no
            # feasible school (z_max = -inf) subtract 0 and end up all-zero.
            share = np.exp(z - np.where(np.isfinite(z_max), z_max, 0.0))
            share[~np.isfinite(eff_feasible)] = 0.0
            total = share.sum(axis=1, keepdims=True)
            mass = weights[:, None] * share / np.where(total > 0, total, 1.0)
            # Sub-thousandth-of-a-child applications only slow the sort down.
            keep = mass > 1e-3
            chosen_school = cand[keep]
            chosen_dist = dist[keep]
            chosen_mass = mass[keep]

        # No applications this round (all infeasible or below the mass
        # floor): no school can be oversubscribed, so the cutoffs are final.
        if len(chosen_school) == 0:
            break

        # Group applications by school, nearest applicant first.
        order = np.lexsort((chosen_dist, chosen_school))
        s_sorted = chosen_school[order]
        d_sorted = chosen_dist[order]
        m_cum = np.cumsum(chosen_mass[order])
        boundaries = np.flatnonzero(np.diff(s_sorted)) + 1
        starts = np.concatenate(([0], boundaries))
        ends = np.concatenate((boundaries, [len(s_sorted)]))

        changed = False
        for start, end in zip(starts, ends):
            school = s_sorted[start]
            # Cumulative applicant mass within this school's segment.
            seg_cum = m_cum[start:end] - (m_cum[start - 1] if start else 0.0)
            if seg_cum[-1] <= fill_target[school]:
                continue
            # Distance of the applicant at which the fill target is reached.
            marginal = d_sorted[start + np.searchsorted(seg_cum, fill_target[school])]
            if marginal < cutoff[school]:
                cutoff[school] = marginal
                changed = True
        if not changed:
            break
    return cutoff
def capacity_fill_radii(
    school_xy: np.ndarray,
    fill_target: np.ndarray,
    pc_xy: np.ndarray,
    pc_children: np.ndarray,
    max_radius_km: float = MAX_RADIUS_KM,
) -> np.ndarray:
    """Feasibility radius for schools without a binding cutoff.

    An undersubscribed school admits anyone who applies, so its catchment is
    bounded by plausibility rather than competition: the distance within
    which the local child population would cover its fill target. Capped at
    ``max_radius_km``.

    Args:
        school_xy: (n_schools, 2) projected school coordinates.
        fill_target: per-school number of places to fill.
        pc_xy: (n_postcodes, 2) projected postcode coordinates.
        pc_children: per-postcode child mass; only entries > 0 are counted.
        max_radius_km: search bound and default radius.

    Returns:
        Array of length ``n_schools``. A school keeps ``max_radius_km`` when
        the children found within that bound (among at most 4096 nearest
        demand postcodes) do not reach its fill target, or when there are no
        demand postcodes at all.
    """
    radii = np.full(len(school_xy), max_radius_km)
    demand = np.flatnonzero(pc_children > 0)
    # Without schools or demand there is nothing to search; every radius
    # stays at the cap (and a k == 0 KD-tree query is avoided).
    if len(school_xy) == 0 or len(demand) == 0:
        return radii

    tree = cKDTree(pc_xy[demand])
    demand_children = pc_children[demand]
    k = min(4096, len(demand))
    for i, xy in enumerate(school_xy):
        dists, idx = tree.query(xy, k=k, distance_upper_bound=max_radius_km)
        # query() returns scalars when k == 1; normalise to 1-D arrays so the
        # masking below works for any number of demand postcodes.
        dists = np.atleast_1d(dists)
        idx = np.atleast_1d(idx)
        # Neighbours beyond the distance bound are reported with infinite
        # distance and must be discarded before indexing.
        found = np.isfinite(dists)
        cum = np.cumsum(demand_children[idx[found]])
        if len(cum) and cum[-1] >= fill_target[i]:
            radii[i] = dists[found][np.searchsorted(cum, fill_target[i])]
    return radii
def count_covering_catchments(
    pc_xy: np.ndarray,
    pc_valid: np.ndarray,
    school_xy: np.ndarray,
    school_radii: np.ndarray,
    n_postcodes: int,
) -> np.ndarray:
    """Count, for every postcode, the schools whose radius reaches it.

    Args:
        pc_xy: (n_postcodes, 2) projected postcode coordinates.
        pc_valid: boolean mask of postcodes with usable coordinates; the
            others always receive a count of 0.
        school_xy: (n_schools, 2) projected school coordinates.
        school_radii: per-school catchment radius, same units as the
            coordinates.
        n_postcodes: length of the returned array.

    Returns:
        int32 array of length ``n_postcodes``.
    """
    totals = np.zeros(n_postcodes, dtype=np.int32)
    if len(school_xy) != 0:
        usable = np.flatnonzero(pc_valid)
        postcode_tree = cKDTree(pc_xy[usable])
        hits = np.zeros(len(usable), dtype=np.int32)
        # One list of covered (usable) postcode positions per school.
        members_per_school = postcode_tree.query_ball_point(
            school_xy, school_radii, workers=-1
        )
        for members in members_per_school:
            hits[members] += 1
        totals[usable] = hits
    return totals
def main():
    """Command-line entry point: model catchment radii and write the outputs.

    Reads the Ofsted, GIAS, ArcGIS postcode and Census LSOA parquet files,
    solves the admissions model per phase, attaches a radius to each rated
    school/phase row, and writes the per-postcode catchment counts
    (``--output``) and/or the per-school radii (``--schools-output``).

    Raises:
        SystemExit: if neither ``--output`` nor ``--schools-output`` is given.
        ValueError: if no good+ primary/secondary schools are classified.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Model school admission cutoff radii and count good+/outstanding "
            "primary/secondary catchments covering each postcode"
        )
    )
    parser.add_argument(
        "--ofsted", type=Path, required=True, help="Ofsted inspection parquet"
    )
    parser.add_argument(
        "--gias", type=Path, required=True, help="GIAS open-school parquet"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
    )
    parser.add_argument(
        "--lsoa-children",
        type=Path,
        required=True,
        help="Census 2021 children by LSOA parquet",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=None,
        help="Per-postcode counts parquet; omit for calibration runs that only "
        "need --schools-output",
    )
    parser.add_argument(
        "--schools-output",
        type=Path,
        default=None,
        help="Optional per-school catchment radii parquet (for calibration/debugging)",
    )
    parser.add_argument(
        "--bonus-outstanding-km",
        type=float,
        default=PREF_BONUS_OUTSTANDING_KM,
        help="Preference bonus for Outstanding schools (calibration sweeps)",
    )
    parser.add_argument(
        "--bonus-good-km",
        type=float,
        default=PREF_BONUS_GOOD_KM,
        help="Preference bonus for Good schools (calibration sweeps)",
    )
    parser.add_argument(
        "--demand-scale",
        type=float,
        default=DEMAND_SCALE,
        help="Share of resident children competing for state places",
    )
    parser.add_argument(
        "--choice-temperature-km",
        type=float,
        default=CHOICE_TEMPERATURE_KM,
        help="Logit choice temperature over effective distance",
    )
    args = parser.parse_args()

    # Fail fast: with no output requested the whole solve would be wasted, so
    # reject the invocation before reading any input.
    if args.output is None and args.schools_output is None:
        raise SystemExit("Provide --output and/or --schools-output")

    gias = pl.read_parquet(args.gias)
    open_urns = set(
        gias.select(pl.col("urn").cast(pl.Int64, strict=False))
        .to_series()
        .drop_nulls()
        .to_list()
    )
    print(f"GIAS open register: {len(open_urns):,} open school URNs")

    ofsted = pl.read_parquet(args.ofsted)
    rated = classify_good_plus_schools(ofsted, open_urns=open_urns)
    if rated.is_empty():
        raise ValueError("No good+ primary/secondary Ofsted schools found")
    print(f"Good+ school/phase rows: {len(rated):,}")

    # Supply side: every modelled school with its fill targets and preference
    # bonus (schools absent from the Ofsted frame are neutral).
    supply = (
        phase_intakes(gias)
        .join(
            school_preference_bonuses(
                ofsted,
                bonus_outstanding_km=args.bonus_outstanding_km,
                bonus_good_km=args.bonus_good_km,
            ),
            on="urn",
            how="left",
        )
        .with_columns(pl.col("bonus_km").fill_null(0.0))
    )
    print(f"State schools in admissions model: {len(supply):,}")

    arcgis = pl.read_parquet(args.arcgis).select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lng"),
        "lsoa21cd",
        "doterm",
    )
    # Demand side: postcodes with no termination date whose LSOA code starts
    # with "E".
    live = arcgis.filter(
        pl.col("doterm").is_null() & pl.col("lsoa21cd").str.starts_with("E")
    )
    demand = children_per_postcode(live, pl.read_parquet(args.lsoa_children))
    print(
        f"Demand postcodes: {len(demand):,} "
        f"({demand['primary_children'].sum():,.0f} primary-age, "
        f"{demand['secondary_children'].sum():,.0f} secondary-age children)"
    )

    # Shared local-km projection so assignment and coverage use one metric.
    pc_lats = arcgis["lat"].to_numpy()
    pc_lngs = arcgis["lng"].to_numpy()
    pc_valid = valid_uk_coords_mask(pc_lats, pc_lngs)
    origin_lat = float(np.mean(pc_lats[pc_valid]))
    pc_xy = _project_lat_lng_km(pc_lats, pc_lngs, origin_lat)
    demand_lats = demand["lat"].to_numpy()
    demand_lngs = demand["lng"].to_numpy()
    demand_valid = valid_uk_coords_mask(demand_lats, demand_lngs)
    demand_xy = _project_lat_lng_km(demand_lats, demand_lngs, origin_lat)
    school_xy = _project_lat_lng_km(
        supply["lat"].to_numpy(), supply["lng"].to_numpy(), origin_lat
    )

    radii = {}
    for phase in ("primary", "secondary"):
        intake = supply[f"{phase}_intake"].to_numpy()
        in_phase = intake > 0
        targets = intake[in_phase]
        xy = school_xy[in_phase]
        # Postcodes with invalid coordinates contribute no demand.
        children = np.where(
            demand_valid,
            demand[f"{phase}_children"].to_numpy() * args.demand_scale,
            0.0,
        )
        print(f"Solving {phase} admissions for {in_phase.sum():,} schools...")
        cutoffs = equilibrium_cutoffs(
            xy,
            targets,
            supply["bonus_km"].to_numpy()[in_phase],
            demand_xy,
            children,
            tau_km=args.choice_temperature_km,
        )
        filled = np.isfinite(cutoffs)
        print(
            f"  {filled.sum():,} schools have binding cutoffs "
            f"(median {np.median(cutoffs[filled]):.2f} km); "
            f"{(~filled).sum():,} undersubscribed"
        )
        # Schools that never filled get a capacity-based feasibility radius.
        fallback = capacity_fill_radii(
            xy[~filled], targets[~filled], demand_xy, children
        )
        raw = cutoffs.copy()
        raw[~filled] = fallback
        radii[phase] = pl.DataFrame(
            {
                "urn": supply["urn"].to_numpy()[in_phase],
                "phase": phase,
                "cutoff_km": raw,
                "filled": filled,
                "radius_km": np.clip(
                    raw * CUTOFF_CALIBRATION_FACTOR[phase],
                    MIN_RADIUS_KM,
                    MAX_RADIUS_KM,
                ),
            }
        )
        print(
            f"  radius km: median {radii[phase]['radius_km'].median():.2f}, "
            f"p90 {radii[phase]['radius_km'].quantile(0.9):.2f}"
        )

    # Attach each rated school's phase radius; rated schools outside the
    # admissions model (special schools, selective schools, missing
    # headcounts) cannot be given a defensible radius and are dropped.
    rated = rated.with_columns(
        pl.col("category").str.split("_").list.get(1).alias("phase")
    )
    rated_with_radius = rated.join(
        pl.concat(list(radii.values())), on=["urn", "phase"], how="inner"
    ).join(supply.select("urn", "lat", "lng"), on="urn", how="inner")
    dropped = len(rated) - len(rated_with_radius)
    print(
        f"Rated school/phase rows with radii: {len(rated_with_radius):,} "
        f"(dropped {dropped:,}, incl. selective schools)"
    )

    if args.output is not None:
        category_counts = {}
        # Sorted so the per-category log lines come out in a stable order.
        categories_needed = sorted(
            {c for cats in SCHOOL_GROUPS.values() for c in cats}
        )
        for category in categories_needed:
            cat = rated_with_radius.filter(pl.col("category") == category)
            cat_xy = _project_lat_lng_km(
                cat["lat"].to_numpy(), cat["lng"].to_numpy(), origin_lat
            )
            category_counts[category] = count_covering_catchments(
                pc_xy, pc_valid, cat_xy, cat["radius_km"].to_numpy(), len(arcgis)
            )
            print(f"  {category}: {len(cat):,} schools")

        # Each output group is the sum of its member categories' counts.
        result = pl.DataFrame(
            {
                "postcode": arcgis["postcode"],
                **{
                    f"{group}_catchments": sum(category_counts[c] for c in categories)
                    for group, categories in SCHOOL_GROUPS.items()
                },
            }
        )
        for group in SCHOOL_GROUPS:
            col = result[f"{group}_catchments"]
            print(f"  {group}_catchments: mean {col.mean():.2f}, max {col.max()}")
        args.output.parent.mkdir(parents=True, exist_ok=True)
        result.write_parquet(args.output)
        size_mb = args.output.stat().st_size / (1024 * 1024)
        print(f"Wrote {args.output} ({size_mb:.1f} MB)")

    if args.schools_output is not None:
        schools_out = rated_with_radius.select(
            "urn", "category", "phase", "cutoff_km", "filled", "radius_km", "lat", "lng"
        )
        args.schools_output.parent.mkdir(parents=True, exist_ok=True)
        schools_out.write_parquet(args.schools_output)
        print(f"Wrote {args.schools_output}")
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

View file

@ -1,199 +0,0 @@
"""Compute Ofsted-rated school proximity counts per postcode."""
import argparse
from pathlib import Path
import polars as pl
from pipeline.utils.poi_counts import count_pois_per_postcode
# Output group -> the school categories counted toward it (passed as
# ``groups=`` to count_pois_per_postcode). The "good_*" groups include the
# outstanding category as well; the "outstanding_*" groups only grade 1.
SCHOOL_GROUPS = {
    "good_primary": ["good_primary", "outstanding_primary"],
    "good_secondary": ["good_secondary", "outstanding_secondary"],
    "outstanding_primary": ["outstanding_primary"],
    "outstanding_secondary": ["outstanding_secondary"],
}
# Age thresholds for deciding which phase(s) a school serves. A school serves
# PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age
# children if its statutory highest age is >= 12. All-through (e.g. 3-18) and
# middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in
# both the primary and the secondary proximity metrics — Ofsted's coarse "Ofsted
# phase" labels such schools as just "Secondary", which previously hid them from
# every postcode's primary-school count.
PRIMARY_MAX_AGE = 10
SECONDARY_MIN_AGE = 12
def classify_good_plus_schools(
    ofsted: pl.DataFrame, open_urns: set[int] | None = None
) -> pl.DataFrame:
    """Label good+/outstanding primary & secondary schools for proximity counts.

    Each school with a derived grade of "1" (outstanding) or "2" (good) yields
    one or two proximity ``category`` rows, returned as a
    ``(postcode, category)`` frame.

    Grade: a graded "Latest OEIF overall effectiveness" of "1"/"2" is used
    directly (OEIF = the previous Ofsted Education Inspection Framework).
    Many schools were last inspected under an UNGRADED (Section 8) inspection
    or the post-2024 report-card framework, so that column is null/"Not
    judged" for them even when they are demonstrably good - their status
    lives in "Ungraded inspection overall outcome" ("School remains
    Good"/"School remains Outstanding"). Filtering on the graded column alone
    dropped ~7,000 genuinely good/outstanding schools, so the ungraded outcome
    is used as a fallback, but ONLY when there is no usable graded result, so
    a genuine grade 3/4 is never overridden. Outcomes flagged "(Concerns)"
    are NOT treated as good+: they signal issues warranting an earlier graded
    re-inspection.

    Phase: decided from the statutory age range when those columns exist (so
    all-through and middle schools count toward BOTH primary and secondary),
    falling back to the coarse "Ofsted phase" label otherwise.

    Args:
        ofsted: Ofsted inspection frame.
        open_urns: optional set of URNs on the current GIAS open register;
            when given (and a "URN" column exists), schools not in it are
            dropped so closed/merged schools are not counted.
    """
    # Both columns are cast to Utf8 so the string predicates are well-defined
    # even when a column is entirely null (read back as a Null dtype).
    graded_result = pl.col("Latest OEIF overall effectiveness").cast(
        pl.Utf8, strict=False
    )
    ungraded_outcome = pl.col("Ungraded inspection overall outcome").cast(
        pl.Utf8, strict=False
    )
    lacks_grade = graded_result.is_null() | (graded_result == "Not judged")
    flagged = ungraded_outcome.str.contains(r"\(Concerns\)")
    still_outstanding = (
        ungraded_outcome.str.starts_with("School remains Outstanding") & ~flagged
    )
    still_good = ungraded_outcome.str.starts_with("School remains Good") & ~flagged
    derived_grade = (
        pl.when(graded_result.is_in(["1", "2"]))
        .then(graded_result)
        .when(lacks_grade & still_outstanding)
        .then(pl.lit("1"))
        .when(lacks_grade & still_good)
        .then(pl.lit("2"))
        .otherwise(None)
    )

    in_scope = ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
    schools = in_scope.with_columns(derived_grade.alias("_ofsted_grade")).filter(
        pl.col("_ofsted_grade").is_not_null()
    )

    # Stale "latest inspection" rows for closed/merged schools are removed
    # when the open register is supplied.
    if open_urns is not None and "URN" in schools.columns:
        schools = schools.filter(pl.col("URN").is_in(list(open_urns)))

    # Phase membership: statutory ages when present, phase label otherwise.
    label_is_primary = pl.col("Ofsted phase") == "Primary"
    label_is_secondary = pl.col("Ofsted phase") == "Secondary"
    age_columns = {"Statutory lowest age", "Statutory highest age"}
    if age_columns.issubset(schools.columns):
        lowest_age = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
        highest_age = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
        serves_primary = (
            pl.when(lowest_age.is_not_null())
            .then(lowest_age <= PRIMARY_MAX_AGE)
            .otherwise(label_is_primary)
        )
        serves_secondary = (
            pl.when(highest_age.is_not_null())
            .then(highest_age >= SECONDARY_MIN_AGE)
            .otherwise(label_is_secondary)
        )
    else:
        serves_primary = label_is_primary
        serves_secondary = label_is_secondary

    schools = schools.with_columns(
        serves_primary.alias("_serves_primary"),
        serves_secondary.alias("_serves_secondary"),
    )

    def rows_for_phase(flag_column, outstanding_label, good_label):
        # Grade 1 -> outstanding category; the only other grade left is 2.
        category = (
            pl.when(pl.col("_ofsted_grade") == "1")
            .then(pl.lit(outstanding_label))
            .otherwise(pl.lit(good_label))
        )
        return schools.filter(pl.col(flag_column)).with_columns(
            category.alias("category")
        )

    # A school serving both phases contributes one row to each frame.
    primary_rows = rows_for_phase(
        "_serves_primary", "outstanding_primary", "good_primary"
    )
    secondary_rows = rows_for_phase(
        "_serves_secondary", "outstanding_secondary", "good_secondary"
    )
    return pl.concat([primary_rows, secondary_rows]).select(
        pl.col("Postcode").alias("postcode"),
        "category",
    )
def main():
    """CLI entry point: count good+/outstanding schools near each postcode.

    Reads the Ofsted inspection parquet (``--ofsted``) and the ArcGIS postcode
    parquet (``--arcgis``), optionally keeps only schools whose URN appears in
    the GIAS open-school parquet (``--gias``), counts schools within 5km and
    2km of every postcode, and writes the two count tables joined on
    ``postcode`` to ``--output``.

    Raises:
        ValueError: if no good+ primary/secondary school is classified, or if
            no classified school's postcode matches an ArcGIS postcode.
    """
    parser = argparse.ArgumentParser(
        description="Count good+ and outstanding primary/secondary schools near each postcode"
    )
    parser.add_argument(
        "--ofsted", type=Path, required=True, help="Ofsted inspection parquet"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
    )
    parser.add_argument(
        "--gias",
        type=Path,
        default=None,
        help="GIAS open-school parquet; if given, only currently-open schools are counted",
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet path"
    )
    args = parser.parse_args()

    open_urns: set[int] | None = None
    if args.gias is not None:
        # Cast BEFORE dropping nulls: the non-strict cast turns any URN that is
        # not a valid integer into null, and dropping nulls first would let
        # those leak into the set as ``None`` (and inflate the printed count).
        gias_urns = (
            pl.read_parquet(args.gias)
            .select("urn")
            .to_series()
            .cast(pl.Int64, strict=False)
            .drop_nulls()
        )
        open_urns = set(gias_urns.to_list())
        print(f"GIAS open register: {len(open_urns):,} open school URNs")

    ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted), open_urns=open_urns)
    if ofsted.is_empty():
        raise ValueError("No good+ primary/secondary Ofsted schools found")
    print(f"Good+ schools: {len(ofsted):,}")
    print(
        "Outstanding schools: "
        f"{ofsted.filter(pl.col('category').str.starts_with('outstanding')).height:,}"
    )

    # Join with arcgis to get lat/lng for each school's postcode
    arcgis = pl.read_parquet(args.arcgis).select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lng"),
    )
    schools = ofsted.join(arcgis, on="postcode", how="inner")
    if schools.is_empty():
        raise ValueError("No Ofsted schools matched ArcGIS postcode coordinates")
    print(f"Schools with coordinates: {len(schools):,}")

    # Load all postcodes for proximity counting; the postcode frame passed to
    # the counter carries its longitude column as "lon" rather than "lng".
    postcodes = arcgis.rename({"lng": "lon"})
    counts_5km = count_pois_per_postcode(
        postcodes, schools, radius_km=5, groups=SCHOOL_GROUPS
    )
    counts_2km = count_pois_per_postcode(
        postcodes, schools, radius_km=2, groups=SCHOOL_GROUPS
    )
    result = counts_5km.join(counts_2km, on="postcode")

    args.output.parent.mkdir(parents=True, exist_ok=True)
    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB)")


# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()

View file

@ -8,6 +8,7 @@ import polars as pl
from pipeline.transform.join_epc_pp import (
EPC_SOURCE_COLUMNS,
_join_address_parts,
_run,
_scan_epc_certificates,
)
@ -111,6 +112,89 @@ def test_scan_epc_certificates_supports_domestic_zip(tmp_path: Path):
assert df.schema["number_habitable_rooms"] == pl.Int16
def test_join_address_parts_empty_string_components():
    """Empty, null and whitespace-only components contribute nothing."""
    # Price-paid SAON/PAON/STREET are empty strings (not null) when absent;
    # concat_str(ignore_nulls=True) alone leaked the separator into the
    # display address (' 10 PALACE GREEN') and doubled it for empty middle
    # components. Empty/whitespace-only parts must contribute nothing.
    df = pl.DataFrame(
        {
            "saon": ["", "FLAT 1", "FLAT 1", "FLAT 21", "", None, " ", " FLAT 2"],
            "paon": ["10", "10", "", "82", "", None, "10", "11 "],
            "street": [
                "PALACE GREEN",
                "HIGH STREET",
                "HIGH STREET",
                "",
                "",
                None,
                "PALACE GREEN",
                "STATION ROAD",
            ],
        }
    )
    out = df.select(
        _join_address_parts("saon", "paon", "street").alias("address")
    ).get_column("address")
    assert out.to_list() == [
        "10 PALACE GREEN",  # empty saon -> no leading space
        "FLAT 1 10 HIGH STREET",  # normal three-part address is unchanged
        "FLAT 1 HIGH STREET",  # empty middle component -> no double space
        "FLAT 21 82",  # empty street -> no trailing space
        None,  # all-empty -> null, not whitespace junk
        None,  # all-null -> null
        "10 PALACE GREEN",  # whitespace-only component treated as empty
        "FLAT 2 11 STATION ROAD",  # per-component padding is stripped
    ]
    # Invariant: every produced address is trimmed and single-spaced.
    produced = out.drop_nulls()
    assert produced.str.starts_with(" ").sum() == 0
    assert produced.str.ends_with(" ").sum() == 0
    # "Single-spaced" means no run of two spaces; a single space is expected
    # between components, so the probe must be the doubled separator.
    assert produced.str.contains("  ", literal=True).sum() == 0
def test_run_builds_clean_pp_address_from_empty_string_saon(tmp_path: Path):
    """An empty-string saon must not put a leading space on pp_address."""
    # Real price-paid rows carry saon == "" (not null) on ~88% of rows; the
    # published pp_address must not inherit a leading separator from it.
    epc_csv = io.StringIO()
    epc_writer = csv.DictWriter(epc_csv, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    epc_writer.writerow(_row())

    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", epc_csv.getvalue())

    # One price-paid sale; every value is wrapped into a one-row column below.
    sale = {
        "price": 250_000,
        "date_of_transfer": date(2024, 2, 3),
        "property_type": "T",
        "postcode": "AA1 1AA",
        "paon": "1",
        "saon": "",
        "street": "Example Street",
        "locality": "",
        "town_city": "Exampletown",
        "duration": "F",
        "old_new": "N",
        "ppd_category": "A",
    }
    price_paid_path = tmp_path / "price-paid.parquet"
    pl.DataFrame({column: [value] for column, value in sale.items()}).write_parquet(
        price_paid_path
    )

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    joined = pl.read_parquet(output_path)
    assert joined.height == 1
    # No leading space, and the clean address still matches its EPC record.
    addresses = joined.select("pp_address", "epc_address").to_dicts()
    assert addresses == [
        {"pp_address": "1 Example Street", "epc_address": "1 Example Street"}
    ]
def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
zip_path = tmp_path / "domestic-csv.zip"
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:

View file

@ -304,7 +304,7 @@ def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
poi_counts=_by_postcode({}),
noise=_by_postcode({}),
school_proximity=_by_postcode({}),
school_catchments=_by_postcode({}),
conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
tree_density=None,
broadband=pl.LazyFrame(
@ -362,7 +362,7 @@ def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
poi_counts=_by_postcode({}),
noise=_by_postcode({}),
school_proximity=_by_postcode({}),
school_catchments=_by_postcode({}),
conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
tree_density=None,
broadband=broadband,
@ -1057,7 +1057,7 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
poi_counts=_by_postcode({}),
noise=_by_postcode({}),
school_proximity=_by_postcode({}),
school_catchments=_by_postcode({}),
conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
tree_density=None,
broadband=pl.LazyFrame(

View file

@ -1,9 +1,11 @@
import polars as pl
from pipeline.transform.poi_proximity import (
GREENSPACE_PARK_FUNCTIONS,
POI_GROUPS_2KM,
_build_poi_category_groups,
_dynamic_poi_metric_renames,
_greenspace_count_frame,
_groceries_categories,
)
from pipeline.utils.poi_counts import count_pois_per_postcode
@ -88,3 +90,84 @@ def test_dynamic_poi_metric_renames_support_park_count_options() -> None:
"parks_2km": "Number of amenities (Park) within 2km",
"parks_5km": "Number of amenities (Park) within 5km",
}
def test_groceries_categories_exclude_speciality_food_retail() -> None:
    """Speciality food retail is left out of the static groceries metric.

    Bakeries, butchers, delis and off-licences must not be returned, while
    Supermarket, Convenience Store, Greengrocer and brand categories such as
    "Tesco" are kept.
    """
    grocery_categories = [
        "Tesco",
        "Supermarket",
        "Convenience Store",
        "Greengrocer",
        "Bakery",
        "Butcher & Fishmonger",
        "Deli & Specialty",
        "Off-Licence",
    ]
    categories = grocery_categories + ["Café"]
    groups = ["Groceries"] * len(grocery_categories) + ["Leisure"]
    pois = pl.DataFrame(
        {
            "category": categories,
            "group": groups,
            "lat": [51.5] * len(categories),
            "lng": [-0.1] * len(categories),
        }
    )

    kept = _groceries_categories(pois)

    assert kept == [
        "Convenience Store",
        "Greengrocer",
        "Supermarket",
        "Tesco",
    ]
def test_park_group_excludes_playgrounds_and_play_space() -> None:
    """The park group holds exactly the two in-scope greenspace functions."""
    # "Play Space" (playgrounds) must not count as a Park; Public Park Or
    # Garden and Playing Field (open recreation grounds) are in scope.
    expected = {"parks": ["Public Park Or Garden", "Playing Field"]}
    assert GREENSPACE_PARK_FUNCTIONS == expected
def test_greenspace_count_frame_collapses_to_one_row_per_site() -> None:
    """Access points of one site collapse to a single representative row."""
    # Three gates of one park (with a site centroid), one gate of another park
    # without a centroid, and one centroid-fallback row with a null site_id.
    park = "Public Park Or Garden"
    greenspace = pl.DataFrame(
        {
            "lat": [51.50, 51.51, 51.52, 53.0, 54.0],
            "lng": [-0.10, -0.11, -0.12, -2.0, -3.0],
            "category": [park, park, park, "Playing Field", park],
            "site_id": ["site-a"] * 3 + ["site-b", None],
            "site_lat": [51.505] * 3 + [None, None],
            "site_lng": [-0.105] * 3 + [None, None],
        }
    )

    collapsed = _greenspace_count_frame(greenspace).sort("lat")

    # One row per site (site-a collapses 3 → 1), null-site rows preserved.
    assert collapsed.height == 3

    # The representative point is the site centroid…
    centroid_site = collapsed.filter(pl.col("site_id") == "site-a")
    assert centroid_site["lat"].to_list() == [51.505]
    assert centroid_site["lng"].to_list() == [-0.105]

    # …or the first access point when no centroid is available.
    gate_only_site = collapsed.filter(pl.col("site_id") == "site-b")
    assert gate_only_site["lat"].to_list() == [53.0]
def test_greenspace_count_frame_passes_legacy_parquet_through() -> None:
    """A frame without site columns is returned unchanged."""
    # The shipped parquet predates the site_id column; counting must not crash
    # (it keeps the old access-point grain until regenerated).
    columns = {
        "lat": [51.50, 51.51],
        "lng": [-0.10, -0.11],
        "category": ["Public Park Or Garden", "Play Space"],
    }
    legacy = pl.DataFrame(columns)

    passed_through = _greenspace_count_frame(legacy)

    assert passed_through.equals(legacy)

View file

@ -0,0 +1,354 @@
import numpy as np
import polars as pl
from pipeline.transform.school_catchments import (
capacity_fill_radii,
children_per_postcode,
classify_good_plus_schools,
count_covering_catchments,
equilibrium_cutoffs,
phase_intakes,
school_preference_bonuses,
)
def _school(phase, oeif, ungraded, urn=100000):
return {
"URN": urn,
"Postcode": "AA1 1AA",
"Ofsted phase": phase,
"Latest OEIF overall effectiveness": oeif,
"Ungraded inspection overall outcome": ungraded,
}
def _classify(rows):
    """Classify the row dicts and return the set of (urn, category) pairs."""
    classified = classify_good_plus_schools(pl.DataFrame(rows)).to_dicts()
    pairs = set()
    for record in classified:
        pairs.add((record["urn"], record["category"]))
    return pairs
def test_legacy_oeif_grades_1_and_2_are_kept():
    """OEIF grades "1" and "2" map to outstanding/good for both phases."""
    # (phase, grade, urn, expected category)
    cases = [
        ("Primary", "1", 1, "outstanding_primary"),
        ("Primary", "2", 2, "good_primary"),
        ("Secondary", "1", 3, "outstanding_secondary"),
        ("Secondary", "2", 4, "good_secondary"),
    ]
    rows = [_school(phase, grade, None, urn) for phase, grade, urn, _ in cases]
    expected = {(urn, category) for _, _, urn, category in cases}
    assert _classify(rows) == expected
def test_grades_3_and_4_are_excluded():
    """Graded results "3" and "4" produce no good+ rows."""
    below_good = [_school("Primary", grade, None) for grade in ("3", "4")]
    classified = _classify(below_good)
    assert classified == set()
def test_ungraded_remains_good_is_recovered_when_no_graded_result():
    """Without a usable graded result, the ungraded outcome decides the grade."""
    # Null and "Not judged" OEIF fall back to the ungraded outcome.
    null_oeif_good = _school("Primary", None, "School remains Good", 1)
    not_judged_outstanding = _school(
        "Secondary", "Not judged", "School remains Outstanding", 2
    )
    # "(Improving)" is still good+ ...
    improving_good = _school(
        "Primary", None, "School remains Good (Improving) - S5 Next", 3
    )

    classified = _classify([null_oeif_good, not_judged_outstanding, improving_good])

    assert classified == {
        (1, "good_primary"),
        (2, "outstanding_secondary"),
        (3, "good_primary"),
    }
def test_ungraded_concerns_are_not_good_plus():
    """Ungraded outcomes carrying "(Concerns)" are never classified good+."""
    # "(Concerns)" outcomes signal issues warranting earlier re-inspection and
    # must NOT be counted as good+ schools.
    good_with_concerns = _school(
        "Primary", None, "School remains Good (Concerns) - S5 Next", 1
    )
    outstanding_with_concerns = _school(
        "Secondary", None, "School remains Outstanding (Concerns) - S5 Next", 2
    )
    classified = _classify([good_with_concerns, outstanding_with_concerns])
    assert classified == set()
def test_ungraded_non_good_outcomes_are_excluded():
    """Ungraded outcomes that are not "remains Good/Outstanding" are dropped."""
    outcomes = ("Some aspects not as strong", "Standards maintained", None)
    rows = [_school("Primary", None, outcome) for outcome in outcomes]
    assert _classify(rows) == set()
def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
    """A graded "3" wins over an ungraded "School remains Good"."""
    # A real grade 3 must not be promoted by an ungraded "remains Good".
    grade_three = _school("Primary", "3", "School remains Good")
    classified = _classify([grade_three])
    assert classified == set()
def test_non_primary_secondary_phases_excluded():
    """Phases other than Primary/Secondary yield nothing, whatever the grade."""
    other_phases = [("Nursery", "1"), ("Not applicable", "2")]
    rows = [_school(phase, grade, None) for phase, grade in other_phases]
    assert _classify(rows) == set()
def _aged_school(phase, oeif, low, high, urn=100000):
return {
"URN": urn,
"Postcode": "AA1 1AA",
"Ofsted phase": phase,
"Latest OEIF overall effectiveness": oeif,
"Ungraded inspection overall outcome": None,
"Statutory lowest age": low,
"Statutory highest age": high,
}
def test_all_through_school_counts_toward_both_primary_and_secondary():
    """A 3-18 school yields a primary row and a secondary row."""
    # An all-through school (age 3-18) is labelled "Secondary" by Ofsted phase but
    # serves primary-age children too, so it must count in BOTH metrics.
    all_through = _aged_school("Secondary", "2", 3, 18, 1)
    expected = {(1, "good_primary"), (1, "good_secondary")}
    assert _classify([all_through]) == expected
def test_age_ranges_assign_single_phase_for_standard_schools():
    """Age ranges decide the served phases; a 9-13 middle school gets both."""
    primary_only = _aged_school("Primary", "1", 4, 11, 1)
    secondary_only = _aged_school("Secondary", "2", 11, 16, 2)
    middle = _aged_school("Secondary", "1", 9, 13, 3)  # middle -> both

    classified = _classify([primary_only, secondary_only, middle])

    assert classified == {
        (1, "outstanding_primary"),
        (2, "good_secondary"),
        (3, "outstanding_primary"),
        (3, "outstanding_secondary"),
    }
def test_closed_schools_excluded_when_open_register_given():
    """Only schools whose URN is in ``open_urns`` survive classification."""
    ofsted = pl.DataFrame(
        [
            _aged_school("Primary", "1", 4, 11, 111),
            _aged_school("Secondary", "2", 11, 16, 222),
        ]
    )
    classified = classify_good_plus_schools(ofsted, open_urns={111}).to_dicts()
    pairs = {(record["urn"], record["category"]) for record in classified}
    # URN 222 is not in the open register, so it is dropped.
    assert pairs == {(111, "outstanding_primary")}
def _gias_row(
urn,
type_group="Academies",
age_range="411",
pupils=210,
capacity=None,
admissions_policy=None,
):
return {
"urn": urn,
"name": f"School {urn}",
"lat": 51.5,
"lng": -0.1,
"type_group": type_group,
"age_range": age_range,
"pupils": pupils,
"capacity": capacity,
"admissions_policy": admissions_policy,
}
def test_phase_intakes_prorates_fill_target_over_weighted_cohorts():
    """Pupils are split between phases in proportion to weighted cohorts."""
    # NOTE(review): the age_range literals below read "411", "1116", ... while
    # the comments describe ranges such as 4-11 — confirm no separator was lost.
    schools = [
        # 4-11 = cohorts 4..10, all 7 primary: full fill target.
        _gias_row(1, age_range="411", pupils=210),
        # 11-16 = cohorts 11..15, all 5 secondary.
        _gias_row(2, age_range="1116", pupils=500),
        # 3-11 = cohorts 3..10; nursery year weighs 0.5, so primary
        # gets 7 of 7.5 cohort weights.
        _gias_row(3, age_range="311", pupils=240),
        # All-through 4-16 = cohorts 4..15: 7/12 primary, 5/12 secondary.
        _gias_row(4, age_range="416", pupils=1200),
        # 11-18 = cohorts 11..17; sixth-form years weigh 0.6 each, so
        # secondary gets 5 of 6.2 cohort weights.
        _gias_row(5, age_range="1118", pupils=1240),
    ]

    intakes = phase_intakes(pl.DataFrame(schools)).sort("urn")

    expected_primary = [210.0, 0.0, 224.0, 700.0, 0.0]
    expected_secondary = [0.0, 500.0, 0.0, 500.0, 1000.0]
    assert intakes["primary_intake"].to_list() == expected_primary
    assert intakes["secondary_intake"].to_list() == expected_secondary
def test_phase_intakes_excludes_non_state_and_selective_schools():
    """Only URNs 5, 8 and 9 keep an intake; the other rows are filtered out."""
    excluded_type_groups = [
        _gias_row(1, type_group="Independent schools"),
        _gias_row(2, type_group="Special schools"),
        _gias_row(3, type_group="Welsh schools"),
    ]
    # Grammar school intakes are test-based and region-wide; a
    # distance catchment would be fabricated.
    selective = _gias_row(4, admissions_policy="Selective")
    capacity_only = _gias_row(5, pupils=None, capacity=300)
    no_headcount = _gias_row(6, pupils=None, capacity=None)  # no usable headcount
    no_age_range = _gias_row(7, age_range=None)  # no parsable cohorts
    # Over-full school keeps its demonstrated size.
    over_full = _gias_row(8, pupils=350, capacity=300)
    non_selective = _gias_row(9, admissions_policy="Non-selective")

    register = pl.DataFrame(
        [
            *excluded_type_groups,
            selective,
            capacity_only,
            no_headcount,
            no_age_range,
            over_full,
            non_selective,
        ]
    )
    intakes = phase_intakes(register).sort("urn")

    assert intakes["urn"].to_list() == [5, 8, 9]
    assert intakes["primary_intake"].to_list() == [300.0, 350.0, 210.0]
def test_school_preference_bonuses_follow_derived_grade():
    """Each school's bonus follows its derived grade, from +1.0 down to -1.0."""
    ofsted = pl.DataFrame(
        [
            _school("Primary", "1", None, 1),
            _school("Primary", "2", None, 2),
            _school("Primary", "3", None, 3),
            _school("Primary", "4", None, 4),
            _school("Primary", None, "Some aspects not as strong", 5),  # unrated
            _school("Primary", "Not judged", "School remains Good", 6),
        ]
    )

    bonus_frame = school_preference_bonuses(
        ofsted, bonus_outstanding_km=1.0, bonus_good_km=0.5
    )
    bonuses = dict(bonus_frame.iter_rows())

    expected = {1: 1.0, 2: 0.5, 3: -0.5, 4: -1.0, 5: 0.0, 6: 0.5}
    assert bonuses == expected
def test_children_per_postcode_prorates_bands_and_splits_lsoa_evenly():
    """LSOA age bands are prorated per phase and shared equally by postcode."""
    shared_lsoa, single_lsoa = "E01000001", "E01000002"
    postcodes = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "AA1 1AB", "BB2 2BB"],
            "lat": [51.5, 51.5, 52.0],
            "lng": [-0.1, -0.1, -0.2],
            "lsoa21cd": [shared_lsoa, shared_lsoa, single_lsoa],
        }
    )
    lsoa_children = pl.DataFrame(
        {
            "lsoa21": [shared_lsoa, single_lsoa],
            "aged_0_4": [100, 30],
            "aged_5_9": [100, 10],
            "aged_10_14": [100, 20],
            "aged_15_19": [100, 40],
        }
    )

    per_postcode = children_per_postcode(postcodes, lsoa_children).sort("postcode")

    # Primary 4-10 = 0.2*aged_0_4 + aged_5_9 + 0.2*aged_10_14: 140 split across
    # the LSOA's 2 postcodes; 20 for the single-postcode LSOA.
    primary = per_postcode["primary_children"].to_list()
    assert primary == [70.0, 70.0, 20.0]
    # Secondary 11-15 = 0.8*aged_10_14 + 0.2*aged_15_19: 100 split across 2; 24.
    secondary = per_postcode["secondary_children"].to_list()
    assert secondary == [50.0, 50.0, 24.0]
def test_equilibrium_cutoff_tightens_to_marginal_admitted_distance():
    """A full school's cutoff is the distance of the last admitted postcode."""
    # One school with 10 places; postcodes at 1km, 2km and 3km with 5 children
    # each. The two nearest postcodes exactly fill it, so the cutoff is the
    # marginal admitted child's distance and the 3km postcode is shut out.
    school_xy = np.array([[0.0, 0.0]])
    places = np.array([10.0])
    bonuses = np.array([0.0])
    postcode_xy = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]])
    children = np.array([5.0, 5.0, 5.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonuses, postcode_xy, children, tau_km=0.0
    )

    assert cutoffs.tolist() == [2.0]
def test_equilibrium_rejected_demand_cascades_to_next_school():
    """Children shut out of the nearer school fall through to the farther one."""
    # School A (5 places) at the origin, school B (5 places) at 10km.
    # P1 (1km, 5 children) and P2 (1.5km, 5 children) both prefer A; A fills
    # with P1 and tightens its cutoff to 1km, pushing P2 out to B. B never
    # exceeds its target, so it keeps no binding cutoff.
    school_xy = np.array([[0.0, 0.0], [10.0, 0.0]])
    places = np.array([5.0, 5.0])
    bonuses = np.array([0.0, 0.0])
    postcode_xy = np.array([[1.0, 0.0], [1.5, 0.0]])
    children = np.array([5.0, 5.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonuses, postcode_xy, children, tau_km=0.0
    )

    cutoff_a, cutoff_b = cutoffs[0], cutoffs[1]
    assert cutoff_a == 1.0
    assert np.isinf(cutoff_b)
def test_equilibrium_preference_bonus_steers_demand_to_better_school():
    """With equal distances, the school holding the bonus takes all demand."""
    # Two schools equidistant from the only postcode; school A is rated
    # better (0.5km bonus) so all children choose it; B attracts nobody.
    school_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
    places = np.array([5.0, 5.0])
    bonuses = np.array([0.5, 0.0])
    postcode_xy = np.array([[1.0, 0.0]])
    children = np.array([10.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonuses, postcode_xy, children, tau_km=0.0
    )

    assert cutoffs[0] == 1.0
    assert np.isinf(cutoffs[1])
def test_equilibrium_logit_choice_smears_demand_across_schools():
    """A positive ``tau_km`` spreads one postcode's demand over both schools."""
    # With a positive temperature some families prefer the further school, so
    # both schools receive applications: the near school still fills and keeps
    # a binding cutoff, and the far school now attracts mass it would never
    # see under deterministic choice.
    school_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
    places = np.array([4.0, 4.0])
    bonuses = np.array([0.0, 0.0])
    postcode_xy = np.array([[1.0, 0.0]])
    children = np.array([10.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonuses, postcode_xy, children, tau_km=1.0
    )

    # Each school gets half the 10 children (equidistant, equal utility),
    # exceeding both fill targets: both cutoffs bind at the postcode.
    assert cutoffs.tolist() == [1.0, 1.0]
def test_capacity_fill_radii_covers_fill_target_population():
    """The radius grows until the fill target is met, else stays at the cap."""
    # Unfilled school needs 6 children: postcodes at 1km (5) and 2km (5)
    # cumulate past the target at 2km. A school needing more children than
    # exist within the cap keeps the cap.
    school_xy = np.array([[0.0, 0.0], [0.0, 0.0]])
    fill_targets = np.array([6.0, 1000.0])
    postcode_xy = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]])
    children = np.array([5.0, 5.0, 5.0])

    radii = capacity_fill_radii(
        school_xy, fill_targets, postcode_xy, children, max_radius_km=25.0
    )

    assert radii.tolist() == [2.0, 25.0]
def test_count_covering_catchments_respects_radius_and_validity():
    """Counts honour each school's radius and ignore invalid postcodes."""
    postcode_xy = np.array([[0.0, 0.0], [3.0, 0.0], [10.0, 0.0], [0.5, 0.0]])
    postcode_valid = np.array([True, True, True, False])
    schools = np.array([[0.0, 0.0], [2.0, 0.0]])
    catchment_radii = np.array([4.0, 1.5])

    counts = count_covering_catchments(
        postcode_xy, postcode_valid, schools, catchment_radii, 4
    )

    # pc0 is inside school 0 only (school 1 is 2km away > 1.5km radius);
    # pc1 inside both; pc2 inside neither; pc3 invalid -> 0 despite proximity.
    assert counts.tolist() == [1, 2, 0, 0]
def test_count_covering_catchments_empty_schools():
    """With no schools at all, every postcode counts zero catchments."""
    postcode_xy = np.zeros((2, 2))
    postcode_valid = np.array([True, True])
    no_schools = np.empty((0, 2))
    no_radii = np.empty(0)

    counts = count_covering_catchments(
        postcode_xy, postcode_valid, no_schools, no_radii, 2
    )

    assert counts.tolist() == [0, 0]

View file

@ -1,139 +0,0 @@
import polars as pl
from pipeline.transform.school_proximity import classify_good_plus_schools
def _school(phase, oeif, ungraded, postcode="AA1 1AA"):
return {
"Postcode": postcode,
"Ofsted phase": phase,
"Latest OEIF overall effectiveness": oeif,
"Ungraded inspection overall outcome": ungraded,
}
def _classify(rows):
    """Classify the row dicts and return the set of (postcode, category) pairs."""
    classified = classify_good_plus_schools(pl.DataFrame(rows)).to_dicts()
    pairs = set()
    for record in classified:
        pairs.add((record["postcode"], record["category"]))
    return pairs
def test_legacy_oeif_grades_1_and_2_are_kept():
    """OEIF grades "1" and "2" map to outstanding/good for both phases."""
    # (phase, grade, postcode, expected category)
    cases = [
        ("Primary", "1", "AA1 1AA", "outstanding_primary"),
        ("Primary", "2", "AA1 1AB", "good_primary"),
        ("Secondary", "1", "AA1 1AC", "outstanding_secondary"),
        ("Secondary", "2", "AA1 1AD", "good_secondary"),
    ]
    rows = [_school(phase, grade, None, postcode) for phase, grade, postcode, _ in cases]
    expected = {(postcode, category) for _, _, postcode, category in cases}
    assert _classify(rows) == expected
def test_grades_3_and_4_are_excluded():
    """Graded results "3" and "4" produce no good+ rows."""
    below_good = [_school("Primary", grade, None) for grade in ("3", "4")]
    classified = _classify(below_good)
    assert classified == set()
def test_ungraded_remains_good_is_recovered_when_no_graded_result():
    """Without a usable graded result, the ungraded outcome decides the grade."""
    # Null and "Not judged" OEIF fall back to the ungraded outcome.
    null_oeif_good = _school("Primary", None, "School remains Good", "AA1 1AA")
    not_judged_outstanding = _school(
        "Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"
    )
    # "(Improving)" is still good+ ...
    improving_good = _school(
        "Primary", None, "School remains Good (Improving) - S5 Next", "AA1 1AE"
    )

    classified = _classify([null_oeif_good, not_judged_outstanding, improving_good])

    assert classified == {
        ("AA1 1AA", "good_primary"),
        ("AA1 1AB", "outstanding_secondary"),
        ("AA1 1AE", "good_primary"),
    }
def test_ungraded_concerns_are_not_good_plus():
    """Ungraded outcomes carrying "(Concerns)" are never classified good+."""
    # "(Concerns)" outcomes signal issues warranting earlier re-inspection and
    # must NOT be counted as good+ schools.
    good_with_concerns = _school(
        "Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"
    )
    outstanding_with_concerns = _school(
        "Secondary", None, "School remains Outstanding (Concerns) - S5 Next", "AA1 1AD"
    )
    classified = _classify([good_with_concerns, outstanding_with_concerns])
    assert classified == set()
def test_ungraded_non_good_outcomes_are_excluded():
    """Ungraded outcomes that are not "remains Good/Outstanding" are dropped."""
    outcomes = ("Some aspects not as strong", "Standards maintained", None)
    rows = [_school("Primary", None, outcome) for outcome in outcomes]
    assert _classify(rows) == set()
def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
    """A graded "3" wins over an ungraded "School remains Good"."""
    # A real grade 3 must not be promoted by an ungraded "remains Good".
    grade_three = _school("Primary", "3", "School remains Good")
    classified = _classify([grade_three])
    assert classified == set()
def test_non_primary_secondary_phases_excluded():
    """Phases other than Primary/Secondary yield nothing, whatever the grade."""
    other_phases = [("Nursery", "1"), ("Not applicable", "2")]
    rows = [_school(phase, grade, None) for phase, grade in other_phases]
    assert _classify(rows) == set()
def _aged_school(phase, oeif, low, high, postcode="AA1 1AA"):
return {
"Postcode": postcode,
"Ofsted phase": phase,
"Latest OEIF overall effectiveness": oeif,
"Ungraded inspection overall outcome": None,
"URN": 100000,
"Statutory lowest age": low,
"Statutory highest age": high,
}
def test_all_through_school_counts_toward_both_primary_and_secondary():
    """A 3-18 school yields a primary row and a secondary row."""
    # An all-through school (age 3-18) is labelled "Secondary" by Ofsted phase but
    # serves primary-age children too, so it must count in BOTH metrics.
    all_through = _aged_school("Secondary", "2", 3, 18, "AA1 1AA")
    expected = {("AA1 1AA", "good_primary"), ("AA1 1AA", "good_secondary")}
    assert _classify([all_through]) == expected
def test_age_ranges_assign_single_phase_for_standard_schools():
    """Age ranges decide the served phases; a 9-13 middle school gets both."""
    primary_only = _aged_school("Primary", "1", 4, 11, "AA1 1AA")
    secondary_only = _aged_school("Secondary", "2", 11, 16, "AA1 1AB")
    middle = _aged_school("Secondary", "1", 9, 13, "AA1 1AC")  # middle -> both

    classified = _classify([primary_only, secondary_only, middle])

    assert classified == {
        ("AA1 1AA", "outstanding_primary"),
        ("AA1 1AB", "good_secondary"),
        ("AA1 1AC", "outstanding_primary"),
        ("AA1 1AC", "outstanding_secondary"),
    }
def test_closed_schools_excluded_when_open_register_given():
    """Only schools whose URN is in ``open_urns`` survive classification."""
    # Overriding an existing key keeps its position, so column order is unchanged.
    open_school = {**_aged_school("Primary", "1", 4, 11, "AA1 1AA"), "URN": 111}
    closed_school = {**_aged_school("Secondary", "2", 11, 16, "AA1 1AB"), "URN": 222}

    classified = classify_good_plus_schools(
        pl.DataFrame([open_school, closed_school]), open_urns={111}
    ).to_dicts()
    pairs = {(record["postcode"], record["category"]) for record in classified}

    # URN 222 is not in the open register, so it is dropped.
    assert pairs == {("AA1 1AA", "outstanding_primary")}

View file

@ -544,6 +544,142 @@ def test_transform_grocery_dedup_drops_only_grocery_aspect(tmp_path):
assert n2_grocery.height == 1
def test_transform_drops_miscategorised_tags(tmp_path):
    """None of the listed raw tags reaches the transform output."""
    # Audit 2026-06-10: these tags polluted Entertainment (cycle-hire docks,
    # slipways, marinas), Gallery (public artwork), Pharmacy (herbalists,
    # alternative medicine), Hospital & Clinic (untyped healthcare/yes),
    # Tourist Attraction (fountains, courthouses) and Gym & Fitness (outdoor
    # apparatus). They must be dropped entirely.
    dropped = [
        "amenity/bicycle_rental",
        "amenity/boat_rental",
        "leisure/marina",
        "leisure/slipway",
        "tourism/artwork",
        "healthcare/yes",
        "healthcare/alternative",
        "shop/herbalist",
        "shop/health",
        "amenity/fountain",
        "amenity/courthouse",
        "leisure/fitness_station",
    ]
    count = len(dropped)
    ids = [f"n{i}" for i in range(count)]
    raw = pl.DataFrame(
        {
            "id": ids,
            "name": [f"POI {i}" for i in range(count)],
            "category": dropped,
            "lat": [51.50] * count,
            "lng": [-0.10] * count,
        }
    )

    out = transform(**_write_transform_inputs(tmp_path, raw)).collect()

    survivors = out.filter(pl.col("id").is_in(ids))
    assert survivors.height == 0
def test_transform_splits_hospital_and_clinic(tmp_path):
    """Hospital and clinic tags map to separate categories, never the merged one."""
    raw = pl.DataFrame(
        {
            "id": ["n1", "n2", "n3"],
            "name": ["St Thomas'", "Vale Surgery Annexe", "Drop-in Centre"],
            "category": [
                "amenity/hospital",
                "amenity/clinic",
                "healthcare/clinic",
            ],
            "lat": [51.50, 51.51, 51.52],
            "lng": [-0.10, -0.11, -0.12],
        }
    )

    out = transform(**_write_transform_inputs(tmp_path, raw)).collect()

    expected_by_id = {"n1": "Hospital", "n2": "Clinic", "n3": "Clinic"}
    for poi_id, category in expected_by_id.items():
        assert out.filter(pl.col("id") == poi_id)["category"].to_list() == [category]
    assert "Hospital & Clinic" not in out["category"].to_list()
def test_transform_maps_chalet_to_hotel(tmp_path):
    """A ``tourism/chalet`` row comes out categorised as "Hotel"."""
    # Holiday-let chalets are accommodation, not Tourist Attractions.
    chalet = {
        "id": ["n1"],
        "name": ["Seaview Chalet"],
        "category": ["tourism/chalet"],
        "lat": [51.50],
        "lng": [-0.10],
    }
    inputs = _write_transform_inputs(tmp_path, pl.DataFrame(chalet))

    out = transform(**inputs).collect()

    categories = out.filter(pl.col("id") == "n1")["category"].to_list()
    assert categories == ["Hotel"]
def test_transform_name_gates_track_horse_riding_fishing(tmp_path):
    """Unnamed track/fishing rows are dropped; named ones become Sports Centre."""
    # leisure/track, leisure/horse_riding and leisure/fishing are 83-84%
    # unnamed (anonymous tracks/gallops/fishing spots); only named public
    # facilities survive as a Sports Centre.
    unnamed_ids = ["n1", "n2"]
    named_ids = ["n3", "n4"]
    raw = pl.DataFrame(
        {
            "id": unnamed_ids + named_ids,
            "name": [None, "", "Herne Hill Velodrome", "Royal Mews Riding School"],
            "category": [
                "leisure/track",
                "leisure/fishing",
                "leisure/track",
                "leisure/horse_riding",
            ],
            "lat": [51.50, 51.51, 51.52, 51.53],
            "lng": [-0.10, -0.11, -0.12, -0.13],
        }
    )

    out = transform(**_write_transform_inputs(tmp_path, raw)).collect()

    assert out.filter(pl.col("id").is_in(unnamed_ids)).height == 0
    named = out.filter(pl.col("id").is_in(named_ids))
    assert named["category"].to_list() == ["Sports Centre", "Sports Centre"]
def test_transform_passes_through_tram_metro_naptan_category(tmp_path):
    """A NaPTAN "Tram & Metro stop" row keeps its category, group and emoji."""
    # NaPTAN now emits "Tram & Metro stop" (non-LU TMU/MET networks); it must
    # flow through with the Public Transport group and its own emoji.
    osm_raw = pl.DataFrame(
        {
            "id": ["n1"],
            "name": ["A Cafe"],
            "category": ["amenity/cafe"],
            "lat": [51.50],
            "lng": [-0.10],
        }
    )
    inputs = _write_transform_inputs(tmp_path, osm_raw)

    # Overwrite the NaPTAN input with one rail station and one tram stop.
    naptan = pl.DataFrame(
        {
            "id": ["naptan-1", "naptan-2"],
            "name": ["Test Rail Station", "Weaste"],
            "category": ["Rail station", "Tram & Metro stop"],
            "lat": [51.51, 51.52],
            "lng": [-0.13, -0.14],
        }
    )
    naptan.write_parquet(inputs["naptan_path"])

    out = transform(**inputs).collect()

    tram_rows = out.filter(pl.col("category") == "Tram & Metro stop")
    assert tram_rows.height == 1
    assert tram_rows["group"].to_list() == ["Public Transport"]
    assert tram_rows["emoji"].to_list() == ["🚊"]
def test_transform_output_unique_per_id_category(tmp_path):
# Soundness: the full transform() output has at most one row per
# (id, category) overall, across every source.

View file

@ -86,6 +86,28 @@ DROP_CATEGORIES = {
"amenity/water_point",
"amenity/watering_place",
"amenity/weighbridge",
# Boating/cycle-hire infrastructure formerly miscategorised as
# "Entertainment" (46% of the bucket): cycle-hire dock stations, boat
# ramps and moorings are not entertainment venues.
"amenity/bicycle_rental",
"amenity/boat_rental",
"leisure/marina",
"leisure/slipway",
# Public art (statues, murals, village signs) formerly 93% of "Gallery".
"tourism/artwork",
# Outdoor exercise apparatus (pull-up bars, trim trails) formerly inflating
# "Gym & Fitness".
"leisure/fitness_station",
# Untyped healthcare rows and non-pharmacy health shops formerly bucketed
# under "Hospital & Clinic" / "Pharmacy".
"healthcare/yes",
"healthcare/alternative",
"shop/herbalist",
"shop/health",
# Street fountains and courthouses formerly bucketed as
# "Tourist Attraction".
"amenity/fountain",
"amenity/courthouse",
# Niche amenities not useful for home buyers
"amenity/animal_boarding",
"amenity/animal_breeding",
@ -373,10 +395,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"leisure/tanning_salon",
"shop/amusements",
"tourism/theme_park",
"amenity/bicycle_rental",
"amenity/boat_rental",
"leisure/marina",
"leisure/slipway",
# bicycle_rental/boat_rental/marina/slipway used to live here and
# made up ~46% of the bucket (cycle-hire docks, boat ramps); they
# are infrastructure, not entertainment venues — see DROP_CATEGORIES.
"leisure/hackerspace",
"leisure/yes",
],
@ -699,7 +720,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"🏋️",
[
"leisure/fitness_centre",
"leisure/fitness_station",
# leisure/fitness_station (outdoor pull-up bars / trim-trail
# apparatus, ~2.5k) is not a gym — see DROP_CATEGORIES.
"amenity/dojo",
"amenity/dancing_school",
],
@ -825,28 +847,37 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"amenity/pharmacy",
"healthcare/pharmacy",
"shop/chemist",
"shop/herbalist",
"shop/health",
"healthcare/alternative",
# healthcare/alternative, shop/herbalist and shop/health (homeopaths,
# herbalists, generic "health" shops) are not dispensing pharmacies
# — see DROP_CATEGORIES.
],
),
# "Hospital & Clinic" used to be one bucket; an actual hospital and a small
# clinic are very different amenities for a homebuyer, so they are split.
(
"Health",
"Hospital",
"🏥",
[
"amenity/hospital",
"healthcare/hospital",
],
),
(
"Health",
"Hospital & Clinic",
"🏥",
"Clinic",
"🩺",
[
"amenity/hospital",
"amenity/clinic",
"amenity/health_centre",
"healthcare/blood_donation",
"healthcare/hospital",
"healthcare/centre",
"healthcare/clinic",
"office/healthcare",
"healthcare/laboratory",
"healthcare/rehabilitation",
"healthcare/vaccination_centre",
"healthcare/yes",
# healthcare/yes (untyped junk rows) is dropped — see DROP_CATEGORIES.
],
),
(
@ -917,7 +948,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"🖼️",
[
"tourism/gallery",
"tourism/artwork",
# tourism/artwork (statues, murals, village signs) was 93% of this
# bucket and is not a visitable gallery — see DROP_CATEGORIES.
],
),
(
@ -961,9 +993,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
[
"tourism/attraction",
"tourism/aquarium",
"amenity/fountain",
"amenity/courthouse",
"tourism/chalet",
# amenity/fountain (street furniture) and amenity/courthouse are
# dropped; tourism/chalet (holiday lets) moved to "Hotel".
],
),
# Note: schools come from the GIAS register (see transform_gias_schools).
@ -982,6 +1013,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"leisure/resort",
"tourism/holiday_park",
"tourism/self_catering",
# Holiday-let chalets are accommodation, not tourist attractions
# (where they previously sat).
"tourism/chalet",
],
),
(
@ -1162,6 +1196,11 @@ REQUIRE_NAME_CATEGORIES = {
"leisure/practice_pitch",
"leisure/swimming_pool",
"leisure/paddling_pool",
# 83-84% unnamed: anonymous running tracks, private gallops/paddocks and
# fishing spots; only named public facilities count as a Sports Centre.
"leisure/track",
"leisure/horse_riding",
"leisure/fishing",
}
@ -1181,6 +1220,7 @@ NAPTAN_EMOJIS: dict[str, str] = {
"Bus station": "🚌",
"Taxi rank": "🚕",
"Tube station": "🚇",
"Tram & Metro stop": "🚊",
}
@ -1438,9 +1478,9 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
(null/"Not judged", e.g. schools last seen under the post-2024 ungraded
report-card framework) we fall back to "Ungraded inspection overall outcome"
so genuinely good/outstanding schools aren't dropped — mirroring
school_proximity.classify_good_plus_schools. Remaining nulls drop out."""
school_catchments.classify_good_plus_schools. Remaining nulls drop out."""
grade_col = pl.col("Latest OEIF overall effectiveness")
# See school_proximity: the ungraded outcome carries "School remains Good"/
# See school_catchments: the ungraded outcome carries "School remains Good"/
# "School remains Outstanding" (with optional "(Concerns)"/"(Improving)"
# suffixes) when the graded column is null/"Not judged".
ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)