Improve data pipeline

This commit is contained in:
Andras Schmelczer 2026-06-01 20:10:03 +01:00
parent e8345cbdc1
commit f99bd4e5c9
36 changed files with 966 additions and 129 deletions

View file

@ -28,6 +28,17 @@ MINOR_CRIME_TYPES = (
"Other crime",
)
# Legacy police.uk crime-type names (pre-2014 taxonomy) mapped to their closest
# current equivalent. Without this, ~1.9M incidents from 2010-2013 ("Violent
# crime", "Public disorder and weapons") are unrecognised and silently dropped,
# which understates pre-2013 serious crime and creates an artificial 2012->2013
# step in the by-year series. Applied with `.replace` (not `.replace_strict`) so
# unmapped current types pass through unchanged.
LEGACY_CRIME_TYPE_ALIASES = {
"Violent crime": "Violence and sexual offences",
"Public disorder and weapons": "Public order",
}
def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
csvs = sorted(crime_dir.rglob("*.csv"))
@ -96,6 +107,7 @@ def transform_crime(
& pl.col("Crime type").is_not_null()
& (pl.col("Crime type") != "")
)
.with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
.group_by("LSOA code", "Month", "Crime type")
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
.group_by("LSOA code", "Crime type")
@ -147,7 +159,10 @@ def _write_crime_by_year(
& (pl.col("LSOA code") != "")
& pl.col("Crime type").is_not_null()
& (pl.col("Crime type") != "")
).with_columns(pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"))
).with_columns(
pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"),
pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES),
)
# Months observed *anywhere* in the dataset for each year (annualisation denominator).
# Using crime-type-specific months would over-scale years where a rare type appears

View file

@ -17,7 +17,7 @@ from pathlib import Path
import polars as pl
from pipeline.local_temp import local_tmp_dir
from pipeline.transform.crime import find_street_crime_csvs
from pipeline.transform.crime import LEGACY_CRIME_TYPE_ALIASES, find_street_crime_csvs
def _latest_months(crime_dir: Path, month_count: int) -> list[str]:
@ -80,6 +80,10 @@ def _write_geojsonseq(csvs: list[Path], output_path: Path) -> tuple[int, int]:
.drop_nulls(["lon", "lat"])
.filter(pl.col("lon").is_between(-9.5, 5.0))
.filter(pl.col("lat").is_between(49.0, 57.0))
# Canonicalise any legacy pre-2014 type names so the heatmap's crime_type
# values always match the frontend's canonical filter list (a no-op for
# the recent months this overlay normally covers).
.with_columns(pl.col("crime_type").replace(LEGACY_CRIME_TYPE_ALIASES))
.group_by("lon", "lat", "month", "crime_type")
.len()
.rename({"len": "count"})

View file

@ -44,6 +44,7 @@ import shapely
from pyproj import Transformer
from pipeline.transform.crime import (
LEGACY_CRIME_TYPE_ALIASES,
MINOR_CRIME_TYPES,
SERIOUS_CRIME_TYPES,
find_street_crime_csvs,
@ -150,6 +151,11 @@ def _accumulate_counts(
& (pl.col("Crime type") != "")
& pl.col("year").is_in(years)
)
# Canonicalise legacy pre-2014 crime-type names ("Violent crime",
# "Public disorder and weapons") to their current equivalents before
# indexing, so ~1.9M historical incidents are counted instead of
# dropped. `.replace` leaves current types unchanged.
.with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
# Map crime types to indices with default=None so an unrecognised
# type yields a null index we can *report* rather than silently drop
# (the legacy LSOA path surfaced unknown types via its dynamic pivot).

View file

@ -18,11 +18,49 @@ from ..utils import (
normalize_postcode_key,
)
pl.Config.set_tbl_cols(-1)
RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
MIN_PRICE = 50_000
# Plausible construction-year range; band-derived years outside it (e.g. OCR
# noise like 1012 or 2202) are nulled rather than published.
MIN_BUILD_YEAR = 1700
MAX_BUILD_YEAR = 2030
def epc_band_to_year(band: pl.Expr) -> pl.Expr:
"""Map an EPC construction age band to a single representative build year.
EPC age bands are ranges (e.g. ``1950-1966``); we use the band MIDPOINT
(1958) rather than the lower bound, which previously biased every band-derived
year ~10-15 years too young. Open-ended lower bands (``before 1900``) are too
wide to pin to a year and return null. Single-year / ``... onwards`` bands use
that year. Already-numeric inputs (a year produced by an earlier call) pass
through unchanged. Years outside [MIN_BUILD_YEAR, MAX_BUILD_YEAR] are nulled.
"""
text = (
band.cast(pl.Utf8)
.str.replace("England and Wales: ", "")
.str.replace(" onwards", "")
)
low = text.str.extract(r"(\d{4})", 1).cast(pl.Int32, strict=False)
high = text.str.extract(r"(\d{4})\D+(\d{4})", 2).cast(pl.Int32, strict=False)
year = (
pl.when(text.str.starts_with("before "))
.then(None)
.when(high.is_not_null())
.then(((low + high) / 2).round(0).cast(pl.Int32))
.otherwise(low)
)
return (
pl.when((year >= MIN_BUILD_YEAR) & (year <= MAX_BUILD_YEAR))
.then(year)
.otherwise(None)
.cast(pl.UInt16, strict=False)
)
EPC_SOURCE_COLUMNS = [
"address",
"postcode",
@ -410,13 +448,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
# For new-builds (old_new == "Y"), use the first transaction date year as
# the exact construction date; otherwise fall back to the EPC age band.
epc_band_year = (
pl.col("construction_age_band")
.str.replace("England and Wales: ", "")
.str.replace(" onwards", "")
.str.extract(r"(\d{4})", 1)
.cast(pl.UInt16, strict=False)
)
epc_band_year = epc_band_to_year(pl.col("construction_age_band"))
transfer_year = (
pl.col("first_transfer_date").dt.year().cast(pl.UInt16, strict=False)
)

View file

@ -17,7 +17,11 @@ from shapely.strtree import STRtree
from thefuzz import fuzz
from pipeline.local_temp import local_tmp_dir
from pipeline.transform.join_epc_pp import _scan_epc_certificates
from pipeline.transform.join_epc_pp import _scan_epc_certificates, epc_band_to_year
from pipeline.transform.price_estimation.knn import (
MAX_COMPARABLE_PSM,
MIN_COMPARABLE_PSM,
)
from pipeline.utils.fuzzy_join import (
normalize_address_key,
normalize_postcode_key,
@ -59,7 +63,7 @@ _AREA_COLUMNS = [
"Air Quality and Road Safety Score",
# Ethnicity
"% South Asian",
"% East Asian",
"% East/SE Asian",
"% Black",
"% Mixed",
"% White",
@ -1060,14 +1064,10 @@ def _canonical_epc_property_type_expr() -> pl.Expr:
def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
return (
pl.col(column)
.cast(pl.Utf8)
.str.replace("England and Wales: ", "")
.str.replace(" onwards", "")
.str.extract(r"(\d{4})", 1)
.cast(pl.UInt16, strict=False)
)
# Use the shared band->midpoint-year mapping so the direct-EPC / listings
# path matches join_epc_pp (band midpoint, not lower bound; 'before 1900' and
# implausible years -> null). Already-numeric inputs pass through unchanged.
return epc_band_to_year(pl.col(column))
def _address_score(query: str, candidate: str | None) -> int:
@ -1956,7 +1956,9 @@ def _build(
# Broadband: derive max available download speed tier per postcode from
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
# UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as string enum.
# UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as a numeric (UInt16) Mbps value so
# it sorts/filters correctly; null (not a fabricated 10) when no availability
# tier is present, so "no data" is distinguishable from a genuine 10 Mbps.
broadband = (
pl.scan_parquet(broadband_path)
.select(
@ -1969,13 +1971,12 @@ def _build(
.then(100)
.when(pl.col("SFBB availability (% premises)") > 0)
.then(30)
.otherwise(10)
.otherwise(None)
.cast(pl.UInt16)
.alias("max_download_speed"),
)
.group_by("bb_postcode")
.agg(pl.col("max_download_speed").max())
.with_columns(pl.col("max_download_speed").cast(pl.Utf8))
)
area_side_tables = {
"iod": iod,
@ -2052,9 +2053,20 @@ def _build(
.otherwise(pl.col("current_energy_rating"))
.alias("current_energy_rating"),
).with_columns(
(pl.col("latest_price") / pl.col("total_floor_area"))
.round(0)
.cast(pl.Int32)
# Null out implausible per-sqm values (outside the kNN comparable band):
# bulk/block transactions divided by a single unit's floor area otherwise
# produce figures up to ~£1.5M/sqm.
pl.when(
(pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
& (
(pl.col("latest_price") / pl.col("total_floor_area"))
.is_between(MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM)
)
)
.then(
(pl.col("latest_price") / pl.col("total_floor_area")).round(0).cast(pl.Int32)
)
.otherwise(None)
.alias("Price per sqm"),
)
wide = _finalize_merged_columns(wide)

View file

@ -5,6 +5,7 @@ from pathlib import Path
from pyproj import Transformer
from shapely import make_valid, set_precision
from shapely.errors import GEOSException
from shapely.geometry import MultiPolygon, Polygon, mapping, shape
from shapely.ops import transform as transform_geometry
from shapely.ops import unary_union
@ -43,7 +44,14 @@ def _largest_polygonal(geom) -> Polygon | None:
def to_wgs84_geojson(
geom: Polygon | MultiPolygon, tolerance: float = 1.0
) -> dict | None:
"""Simplify geometry in BNG, convert to WGS84, return GeoJSON dict."""
"""Simplify geometry in BNG, convert to WGS84, return a valid GeoJSON dict.
Validates the *serialized* GeoJSON dict (via a ``shape()`` round-trip), not
just the intermediate Shapely object: coordinate snapping during
serialization can otherwise leave a self-intersecting ring that only shows up
once the feature is read back from disk. Any such geometry is repaired with
``make_valid`` before returning so written features are always valid.
"""
geom = _largest_polygonal(geom)
if geom is None:
return None
@ -55,12 +63,28 @@ def to_wgs84_geojson(
transformer = _get_to_wgs84()
wgs84 = transform_geometry(transformer.transform, simplified)
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
try:
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
except GEOSException:
# Precision snapping can fail on pathological geometries; fall back to a
# plain validity repair without coordinate snapping.
wgs84 = make_valid(wgs84)
wgs84 = _largest_polygonal(wgs84)
if wgs84 is None:
return None
return mapping(wgs84)
geojson_dict = mapping(wgs84)
# The geometry that actually reaches disk is the GeoJSON dict, so validate
# *that* (not the pre-serialization object) and repair if needed.
round_trip = shape(geojson_dict)
if round_trip.is_empty or not round_trip.is_valid:
round_trip = _largest_polygonal(make_valid(round_trip))
if round_trip is None or round_trip.is_empty:
return None
geojson_dict = mapping(round_trip)
return geojson_dict
def _fill_holes(geom):
@ -119,7 +143,11 @@ def merge_fragments(
pre_green = combined
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
combined = _largest_polygon(combined)
combined = _fill_holes(combined)
# Do NOT _fill_holes here: interior holes carved by the greenspace
# subtraction (lakes, enclosed parks) are intentional, not artifacts.
# Filling them would re-add the removed area and negate the
# subtraction. Artifact holes from the INSPIRE+Voronoi+make_valid
# chain were already removed by the _fill_holes above (pre-subtraction).
# Revert if subtraction + fragment selection lost >90% of area
if pre_green.area > 0 and combined.area / pre_green.area < 0.1:
combined = pre_green

View file

@ -893,3 +893,54 @@ class TestSubtractGreenspace:
result = subtract_greenspace(postcode, tree, geoms)
# 80% < 90% cap, so subtraction should happen
assert result.area == pytest.approx(2000, rel=0.01)
class TestToWgs84GeojsonValidity:
"""to_wgs84_geojson must emit GeoJSON that round-trips to a valid geometry."""
def test_geojson_round_trips_to_valid_geometry(self):
from shapely.geometry import shape
geojson = to_wgs84_geojson(box(530000, 180000, 530100, 180100))
assert geojson is not None
rt = shape(geojson)
assert not rt.is_empty
assert rt.is_valid
def test_written_district_features_are_all_valid(self, tmp_path):
from shapely.geometry import shape
postcodes = {
"AA1 1AA": box(530000, 180000, 530100, 180100),
"AA1 1AB": MultiPolygon(
[
box(530200, 180000, 530250, 180050),
box(530200, 180060, 530250, 180110),
]
),
}
assert write_district_geojson(postcodes, tmp_path) == 1
collection = json.loads((tmp_path / "units" / "AA1.geojson").read_text())
for feature in collection["features"]:
geom = shape(feature["geometry"])
assert geom.is_valid
assert not geom.is_empty
class TestGreenspaceHolePreserved:
"""Interior holes carved by greenspace subtraction must survive merge_fragments
(the post-subtraction _fill_holes that previously negated them was removed)."""
def test_interior_lake_hole_survives_merge_fragments(self):
from shapely.strtree import STRtree
postcode = box(0, 0, 100, 100) # 10000 sqm
lake = box(30, 30, 70, 70) # 1600 sqm fully-interior hole (16% removal)
result = merge_fragments(
[("TEST1", postcode)],
greenspace_tree=STRtree([lake]),
greenspace_geoms=[lake],
)
merged = result["TEST1"]
assert len(list(merged.interiors)) == 1
assert merged.area == pytest.approx(10000 - 1600, rel=0.05)

View file

@ -67,6 +67,16 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
.struct.field("price")
.alias("input_price"),
)
.with_columns(
# Date of the input (second-to-last) sale, used by the kNN leakage
# filter to exclude the target property's own prior sale from its
# comparables. Built from year+month (day defaults to the 1st).
pl.date(
pl.col("input_year").cast(pl.Int32),
pl.col("input_month").cast(pl.Int32),
1,
).alias("input_date"),
)
.with_columns(
(
pl.col("actual_year").cast(pl.Float64)

View file

@ -18,6 +18,8 @@ import polars as pl
from pipeline.transform.price_estimation.knn import (
KNN_BLEND_WEIGHT,
MAX_COMPARABLE_PSM,
MIN_COMPARABLE_PSM,
build_knn_pool,
knn_median_psm,
)
@ -31,7 +33,13 @@ from pipeline.transform.price_estimation.utils import (
MAX_KNN_TO_INDEX_RATIO = 2.0
MIN_KNN_TO_INDEX_RATIO = 0.5
MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 6.0
# Cap the final estimate at this multiple of the last known price as a guard
# against data errors. Set to ~exp(MAX_LOG_ADJUSTMENT) (~20x) so it is
# consistent with the log-index clip already applied to the index move: many
# UK sectors legitimately grew >6x since the 1990s (e.g. parts of inner London
# 12-14x), so the previous 6x cap truncated genuine appreciation rather than
# only catching outliers.
MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 20.0
def guarded_blend_estimates(
@ -222,11 +230,22 @@ def main():
).height
print(f" kNN blended: {n_blended:,} of {n_estimated:,} estimates")
# Derive estimated price per sqm where both estimated price and floor area exist
# Derive estimated price per sqm where both estimated price and floor area
# exist. Null out values outside the plausibility band [MIN_COMPARABLE_PSM,
# MAX_COMPARABLE_PSM] (the same band the kNN pool uses): extreme values come
# from bulk/block transactions or floor-area errors and are not meaningful
# per-unit prices.
_est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
df = df.with_columns(
(pl.col("Estimated current price") / pl.col("Total floor area (sqm)"))
.round(0)
.cast(pl.Int32, strict=False)
pl.when(
pl.col("Estimated current price").is_not_null()
& pl.col("Total floor area (sqm)").is_not_null()
& (pl.col("Total floor area (sqm)") > 0)
& (_est_psm >= MIN_COMPARABLE_PSM)
& (_est_psm <= MAX_COMPARABLE_PSM)
)
.then(_est_psm.round(0).cast(pl.Int32, strict=False))
.otherwise(None)
.alias("Est. price per sqm"),
)

View file

@ -24,6 +24,7 @@ from pipeline.transform.price_estimation.shrinkage import (
)
from pipeline.transform.price_estimation.utils import (
CURRENT_YEAR,
TEMPORAL_SMOOTHNESS_LAMBDA,
TYPE_GROUPS,
build_hedonic_features,
extract_centroids,
@ -165,12 +166,50 @@ def solve_robust_index(
cols_arr = np.concatenate([col2[mask2], col1[mask1]])
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
# Temporal smoothness prior: penalise curvature in the year betas with a
# second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
# encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
# zero target). This damps single-year index spikes without flattening
# genuine multi-year trends. Betas are ordered by calendar year; the baseline
# year (min_year, implicit beta=0) has no column, so the penalty spans the
# non-baseline years only. For cells with <3 betas there is no curvature to
# penalise and the solve is unchanged.
n_pen = 0
pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
cols_by_year = [c for _, c in sorted(year_to_col.items())]
n_pen = n_cols - 2
pen_rows = np.repeat(n + np.arange(n_pen), 3)
pen_cols = np.empty(n_pen * 3, dtype=np.int64)
for k in range(n_pen):
pen_cols[3 * k : 3 * k + 3] = (
cols_by_year[k],
cols_by_year[k + 1],
cols_by_year[k + 2],
)
pen_rows_arr = pen_rows.astype(np.int64)
pen_cols_arr = pen_cols
pen_vals_arr = np.tile(
[sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
).astype(np.float64)
pen_b = np.zeros(n_pen, dtype=np.float64)
n_total_rows = n + n_pen
weights = base_weights.copy()
for _ in range(IRLS_ITERATIONS):
data = signs_arr * weights[rows_arr]
A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
b = log_ratios * weights
if n_pen:
all_data = np.concatenate([data, pen_vals_arr])
all_rows = np.concatenate([rows_arr, pen_rows_arr])
all_cols = np.concatenate([cols_arr, pen_cols_arr])
b = np.concatenate([log_ratios * weights, pen_b])
else:
all_data, all_rows, all_cols = data, rows_arr, cols_arr
b = log_ratios * weights
A = csc_matrix((all_data, (all_rows, all_cols)), shape=(n_total_rows, n_cols))
betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]
# Residuals

View file

@ -96,8 +96,11 @@ def spatial_smooth(
for i, sec in enumerate(sectors_with_coords):
n = counts.get(sec, 0)
self_w = n / (n + SPATIAL_BLEND_K)
if self_w > 0.95:
continue # enough data, skip smoothing
if self_w > 0.90:
# Enough data, skip smoothing. Relaxed from 0.95 so higher-volume
# cells (n ~270-570) that still carry single-year noise get a light
# spatial blend, complementing the temporal smoothness prior.
continue
dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1)
# Skip self (index 0, distance ~0)

View file

@ -81,8 +81,21 @@ def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
last_prices=np.array([100_000.0, 100_000.0]),
)
# Property 0: unstable kNN (>2x index) is dropped, index estimate kept.
assert blended[0] == 120_000.0
assert blended[1] == 600_000.0
# Property 1: a 10x uplift over the last price is legitimate appreciation and
# is no longer truncated (cap raised from 6x to 20x).
assert blended[1] == 1_000_000.0
def test_guarded_blend_caps_uplift_at_20x_last_price():
# 50x index estimate over the last price is capped at the 20x ceiling.
blended = guarded_blend_estimates(
index_est=np.array([5_000_000.0]),
knn_est=np.array([np.nan]),
last_prices=np.array([100_000.0]),
)
assert blended[0] == 2_000_000.0 # 100_000 * 20
def test_bungalow_is_not_a_dead_price_index_type_group():
@ -92,3 +105,50 @@ def test_bungalow_is_not_a_dead_price_index_type_group():
assert "Bungalow" not in TYPE_GROUPS
assert df["type_group"].to_list() == [None, None]
def test_temporal_regularization_damps_curvature_without_breaking_solve():
"""The second-difference prior reduces year-to-year curvature and keeps the
index well-formed (all years present, finite, contiguous)."""
from pipeline.transform.price_estimation import index as index_mod
years = np.arange(2010, 2021)
true = {y: 0.04 * (y - 2010) for y in years}
y1, y2, lr, w = [], [], [], []
for y in years[:-1]: # adjacent-year pairs following a smooth trend
y1.append(y)
y2.append(y + 1)
lr.append(true[y + 1] - true[y])
w.append(1.0)
# A spurious single-year jump at 2015 (poorly identified curvature spike).
y1.append(2014)
y2.append(2015)
lr.append(0.5)
w.append(1.0)
y1, y2 = np.array(y1), np.array(y2)
lr, w = np.array(lr, float), np.array(w, float)
def solve(lmbda):
original = index_mod.TEMPORAL_SMOOTHNESS_LAMBDA
index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = lmbda
try:
return index_mod.solve_robust_index(y1, y2, lr, w)
finally:
index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = original
unregularised = solve(0.0)
regularised = solve(0.2)
# Index is well-formed for both.
assert set(regularised) == set(range(2010, 2021))
assert all(np.isfinite(v) for v in regularised.values())
assert regularised[2010] == 0.0 # baseline year pinned to 0
def max_curvature(d):
betas = np.array([d[y] for y in sorted(d)])
return float(np.abs(np.diff(betas, 2)).max())
# Regularisation strictly reduces curvature, and never flattens the genuine
# uptrend (the index still rises end to end).
assert max_curvature(regularised) < max_curvature(unregularised)
assert regularised[2020] > regularised[2010]

View file

@ -22,6 +22,13 @@ FLAT_TYPES = ["Flats/Maisonettes"]
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"]
SHRINKAGE_K = 50
# Temporal regularization for the repeat-sales index: a second-difference
# (curvature) penalty lambda * sum((beta_t - 2*beta_{t-1} + beta_{t-2})^2) added
# to the IRLS solve. A mild penalty damps single-year index spikes (which would
# otherwise distort the estimate of any property whose last sale landed on a
# noisy year) without flattening genuine multi-year trends.
TEMPORAL_SMOOTHNESS_LAMBDA = 0.05
def type_group_expr():
"""Polars expression: Property type -> type_group."""

View file

@ -15,6 +15,66 @@ SCHOOL_GROUPS = {
}
def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
"""Label good+/outstanding primary & secondary schools for proximity counts.
Derives a grade ("1" = outstanding, "2" = good) and a proximity ``category``,
returning a ``(postcode, category)`` frame.
Schools with a recent GRADED inspection carry a 1-4 grade in "Latest OEIF
overall effectiveness" (OEIF = the previous Ofsted Education Inspection
Framework). A large and growing share of schools were last inspected under an
UNGRADED (Section 8) inspection or the post-2024 report-card framework, so
that column is null/"Not judged" for them even when they are demonstrably
good their status lives in "Ungraded inspection overall outcome" ("School
remains Good"/"School remains Outstanding", incl. "(Concerns)"/"(Improving)"
variants). Filtering on the graded column alone dropped ~7,000 genuinely
good/outstanding schools. We fall back to the ungraded outcome, but ONLY when
there is no usable graded result (null/"Not judged"), so a genuine grade 3/4
is never overridden.
"""
# Cast to Utf8 so the string predicates below are well-defined even if a
# column happens to be entirely null (read back as a Null dtype).
oeif = pl.col("Latest OEIF overall effectiveness").cast(pl.Utf8, strict=False)
ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
no_usable_grade = oeif.is_null() | (oeif == "Not judged")
graded = (
ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
.with_columns(
pl.when(oeif.is_in(["1", "2"]))
.then(oeif)
.when(
no_usable_grade
& ungraded.str.starts_with("School remains Outstanding")
)
.then(pl.lit("1"))
.when(no_usable_grade & ungraded.str.starts_with("School remains Good"))
.then(pl.lit("2"))
.otherwise(None)
.alias("_ofsted_grade")
)
.filter(pl.col("_ofsted_grade").is_not_null())
)
# Good+ groups include both grade variants; outstanding groups count grade 1.
return graded.with_columns(
pl.when(pl.col("Ofsted phase") == "Primary")
.then(
pl.when(pl.col("_ofsted_grade") == "1")
.then(pl.lit("outstanding_primary"))
.otherwise(pl.lit("good_primary"))
)
.otherwise(
pl.when(pl.col("_ofsted_grade") == "1")
.then(pl.lit("outstanding_secondary"))
.otherwise(pl.lit("good_secondary"))
)
.alias("category")
).select(
pl.col("Postcode").alias("postcode"),
"category",
)
def main():
parser = argparse.ArgumentParser(
description="Count good+ and outstanding primary/secondary schools near each postcode"
@ -30,42 +90,14 @@ def main():
)
args = parser.parse_args()
# Load Ofsted data: filter to good+ (1, 2) primary/secondary schools.
# Post-2025 reform the single "Overall effectiveness" grade was retired;
# the legacy 14 scale is now carried forward under "Latest OEIF overall
# effectiveness" (OEIF = the previous Ofsted Education Inspection
# Framework). The new report-card columns use text judgements instead.
ofsted = pl.read_parquet(args.ofsted).filter(
pl.col("Ofsted phase").is_in(["Primary", "Secondary"])
& pl.col("Latest OEIF overall effectiveness").is_in(["1", "2"])
)
ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted))
if ofsted.is_empty():
raise ValueError("No good+ primary/secondary Ofsted schools found")
print(f"Good+ schools: {len(ofsted):,}")
print(
"Outstanding schools: "
f"{ofsted.filter(pl.col('Latest OEIF overall effectiveness') == '1').height:,}"
)
# Assign category based on phase and rating. Good+ groups include both
# category variants; outstanding groups count grade 1 only.
ofsted = ofsted.with_columns(
pl.when(pl.col("Ofsted phase") == "Primary")
.then(
pl.when(pl.col("Latest OEIF overall effectiveness") == "1")
.then(pl.lit("outstanding_primary"))
.otherwise(pl.lit("good_primary"))
)
.otherwise(
pl.when(pl.col("Latest OEIF overall effectiveness") == "1")
.then(pl.lit("outstanding_secondary"))
.otherwise(pl.lit("good_secondary"))
)
.alias("category")
).select(
pl.col("Postcode").alias("postcode"),
"category",
f"{ofsted.filter(pl.col('category').str.starts_with('outstanding')).height:,}"
)
# Join with arcgis to get lat/lng for each school's postcode

View file

@ -226,3 +226,44 @@ def test_transform_crime_applies_lsoa_2011_to_2021_lookup(tmp_path):
assert burglaries["E01000050"] == [{"year": 2024, "count": 12.0}]
assert burglaries["E01000051"] == [{"year": 2024, "count": 12.0}]
assert burglaries["E01000099"] == [{"year": 2024, "count": 12.0}]
def test_transform_crime_maps_legacy_crime_types(tmp_path):
"""Pre-2014 police.uk type names are aliased to current equivalents instead
of being dropped."""
crime_dir = tmp_path / "crime"
month_dir = crime_dir / "2013-01"
month_dir.mkdir(parents=True)
header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
(month_dir / "2013-01-test-force-street.csv").write_text(
"\n".join(
[
header,
"1,2013-01,Test Force,Test Force,-0.1,51.5,On or near X,E01000001,L,Violent crime,Under investigation,",
"2,2013-01,Test Force,Test Force,-0.1,51.5,On or near X,E01000001,L,Public disorder and weapons,Under investigation,",
"3,2013-01,Test Force,Test Force,-0.1,51.5,On or near X,E01000001,L,Burglary,Under investigation,",
]
)
+ "\n"
)
output = tmp_path / "crime.parquet"
by_year_output = tmp_path / "crime_by_year.parquet"
transform_crime(crime_dir, output, by_year_output)
row = pl.read_parquet(output).to_dicts()[0]
# Single month -> annualised x12. Legacy names mapped to current columns.
assert row["Violence and sexual offences (avg/yr)"] == 12.0
assert row["Public order (avg/yr)"] == 12.0
assert row["Burglary (avg/yr)"] == 12.0
# The legacy names must NOT survive as their own columns.
assert "Violent crime (avg/yr)" not in row
assert "Public disorder and weapons (avg/yr)" not in row
by_year = pl.read_parquet(by_year_output).row(0, named=True)
serious = {p["year"]: p["count"] for p in by_year["Serious crime (by year)"]}
# Serious = Violence and sexual offences (12) + Burglary (12) = 24
assert serious[2013] == 24.0
minor = {p["year"]: p["count"] for p in by_year["Minor crime (by year)"]}
assert minor[2013] == 12.0 # Public order

View file

@ -279,3 +279,37 @@ def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
err = capsys.readouterr().err
assert "Cyber fraud" in err
assert "WARNING" in err
def test_legacy_crime_types_are_mapped(tmp_path):
"""Pre-2014 crime-type names are aliased to current equivalents in the
spatial transform instead of being dropped as unknown types."""
units = tmp_path / "units"
_write_boundaries(
units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
)
crime = tmp_path / "crime"
_write_month(
crime,
"2013-01",
[
_crime_row("2013-01", 1005, 1005, "Violent crime"),
_crime_row("2013-01", 1005, 1005, "Public disorder and weapons"),
],
)
output = tmp_path / "crime_by_postcode.parquet"
by_year = tmp_path / "crime_by_postcode_by_year.parquet"
transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
row = pl.read_parquet(output).to_dicts()[0]
# Single postcode -> area-norm factor 1.0; single month/year -> x12.
assert row["Violence and sexual offences (avg/yr)"] == 12.0
assert row["Public order (avg/yr)"] == 12.0
by_year_row = pl.read_parquet(by_year).row(0, named=True)
assert by_year_row["Violence and sexual offences (by year)"] == [
{"year": 2013, "count": 12.0}
]
assert by_year_row["Public order (by year)"] == [{"year": 2013, "count": 12.0}]

View file

@ -169,7 +169,8 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
"epc_address": "1 Example Street",
"current_energy_rating": "C",
"total_floor_area": 85.0,
"construction_age_band": 1950,
# Band midpoint of 1950-1966, not the lower bound.
"construction_age_band": 1958,
"was_council_house": "Yes",
}
]
@ -256,3 +257,26 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
"current_energy_rating": None,
}
]
def test_epc_band_to_year_uses_midpoint_and_clamps():
import polars as pl
from pipeline.transform.join_epc_pp import epc_band_to_year
df = pl.DataFrame(
{
"b": [
"England and Wales: 1950-1966", # midpoint 1958
"1900-1929", # midpoint 1914
"England and Wales: before 1900", # too wide -> null
"2012 onwards", # single year
"1012", # implausible -> null
"2202", # implausible -> null
None, # null -> null
"1958", # already-numeric-as-string -> pass through
]
}
)
years = df.select(epc_band_to_year(pl.col("b")).alias("y"))["y"].to_list()
assert years == [1958, 1914, None, 2012, None, None, None, 1958]

View file

@ -0,0 +1,82 @@
import polars as pl
from pipeline.transform.school_proximity import classify_good_plus_schools
def _school(phase, oeif, ungraded, postcode="AA1 1AA"):
return {
"Postcode": postcode,
"Ofsted phase": phase,
"Latest OEIF overall effectiveness": oeif,
"Ungraded inspection overall outcome": ungraded,
}
def _classify(rows):
result = classify_good_plus_schools(pl.DataFrame(rows))
return {(r["postcode"], r["category"]) for r in result.to_dicts()}
def test_legacy_oeif_grades_1_and_2_are_kept():
rows = [
_school("Primary", "1", None, "AA1 1AA"),
_school("Primary", "2", None, "AA1 1AB"),
_school("Secondary", "1", None, "AA1 1AC"),
_school("Secondary", "2", None, "AA1 1AD"),
]
assert _classify(rows) == {
("AA1 1AA", "outstanding_primary"),
("AA1 1AB", "good_primary"),
("AA1 1AC", "outstanding_secondary"),
("AA1 1AD", "good_secondary"),
}
def test_grades_3_and_4_are_excluded():
rows = [_school("Primary", "3", None), _school("Primary", "4", None)]
assert _classify(rows) == set()
def test_ungraded_remains_good_is_recovered_when_no_graded_result():
# Null and "Not judged" OEIF fall back to the ungraded outcome.
rows = [
_school("Primary", None, "School remains Good", "AA1 1AA"),
_school("Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"),
# "(Concerns)"/"(Improving)" variants are still good+.
_school("Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"),
_school(
"Secondary",
None,
"School remains Outstanding (Concerns) - S5 Next",
"AA1 1AD",
),
]
assert _classify(rows) == {
("AA1 1AA", "good_primary"),
("AA1 1AB", "outstanding_secondary"),
("AA1 1AC", "good_primary"),
("AA1 1AD", "outstanding_secondary"),
}
def test_ungraded_non_good_outcomes_are_excluded():
rows = [
_school("Primary", None, "Some aspects not as strong"),
_school("Primary", None, "Standards maintained"),
_school("Primary", None, None),
]
assert _classify(rows) == set()
def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
# A real grade 3 must not be promoted by an ungraded "remains Good".
rows = [_school("Primary", "3", "School remains Good")]
assert _classify(rows) == set()
def test_non_primary_secondary_phases_excluded():
rows = [
_school("Nursery", "1", None),
_school("Not applicable", "2", None),
]
assert _classify(rows) == set()