Improve data
This commit is contained in:
parent
b4d66a28c1
commit
85da1941aa
31 changed files with 901 additions and 319 deletions
|
|
@ -273,27 +273,24 @@ def _write_avg_yr(
|
|||
for type_idx, name in enumerate(ALL_CRIME_TYPES):
|
||||
data[f"{name} (avg/yr)"] = avg[:, type_idx]
|
||||
|
||||
# Serious/Minor rollup headlines, computed the SAME way as the by-year rollup
|
||||
# bars (_write_by_year/_rollup_long): sum the rollup's types per year, then
|
||||
# average over the years in which ANY of those types occurred. This keeps the
|
||||
# headline equal to the mean of the "Serious/Minor crime (by year)" bars.
|
||||
# Summing the per-type avg/yr values instead (as the merge previously did)
|
||||
# divides each type by its OWN years-present and overstates the rollup when a
|
||||
# postcode's serious/minor types occur in disjoint years.
|
||||
# Serious/Minor rollup headlines = the exact SUM of their component (avg/yr)
|
||||
# columns, so each rollup always equals the sum of the parts shown beside it
|
||||
# and can never fall below one of its own components. (Previously the rollup
|
||||
# re-derived a union-years-present mean: it divided the summed counts by the
|
||||
# number of years in which ANY component type occurred, whereas each
|
||||
# component divides by its OWN years-present. When a postcode's serious/minor
|
||||
# types occurred in disjoint years the union denominator was larger, so the
|
||||
# rollup came out smaller than the sum of its parts.) The by-year rollup
|
||||
# series in _write_by_year is likewise the per-year sum of the component
|
||||
# bars, so headline and chart both present the rollup as the sum of its parts.
|
||||
for rollup_name, rollup_types in (
|
||||
("Serious crime", SERIOUS_CRIME_TYPES),
|
||||
("Minor crime", MINOR_CRIME_TYPES),
|
||||
):
|
||||
rollup_idx = [ALL_CRIME_TYPES.index(name) for name in rollup_types]
|
||||
rollup_counts = counts[:, rollup_idx, :].sum(axis=1) # (n_postcodes, n_years)
|
||||
rollup_per_year = per_year[:, rollup_idx, :].sum(axis=1)
|
||||
rollup_years_present = np.clip(
|
||||
(rollup_counts > 0).sum(axis=1), 1, None
|
||||
).astype(np.float64)
|
||||
rollup_avg = rollup_per_year.sum(axis=1) / rollup_years_present
|
||||
data[f"{rollup_name} (avg/yr)"] = np.round(rollup_avg * norm, 1).astype(
|
||||
np.float32
|
||||
)
|
||||
data[f"{rollup_name} (avg/yr)"] = np.round(
|
||||
avg[:, rollup_idx].sum(axis=1), 1
|
||||
).astype(np.float32)
|
||||
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
pl.DataFrame(data).write_parquet(output_path, compression="zstd")
|
||||
|
|
|
|||
|
|
@ -36,6 +36,16 @@ MIN_PRICE = 10_000
|
|||
MIN_BUILD_YEAR = 1700
|
||||
MAX_BUILD_YEAR = 2030
|
||||
|
||||
# Plausibility bounds for raw EPC dimensions. EPC lodgements contain data-entry
|
||||
# errors (0 m storey heights, 116 m "interior height", 9,210 m² floor areas, 99
|
||||
# habitable rooms) that otherwise propagate verbatim into the published per-
|
||||
# property columns. Values outside these bands are nulled (treated as unknown)
|
||||
# rather than shown. Bounds are deliberately wide so only clear errors are cut.
|
||||
MIN_FLOOR_HEIGHT_M = 1.5 # below this a storey is not habitable
|
||||
MAX_FLOOR_HEIGHT_M = 6.0 # above this is a data error, not a normal storey
|
||||
MAX_TOTAL_FLOOR_AREA_M2 = 2000.0 # ~21,500 sqft; larger is a bulk/garbage record
|
||||
MAX_HABITABLE_ROOMS = 20 # dwellings above this are data errors
|
||||
|
||||
|
||||
def epc_band_to_year(band: pl.Expr) -> pl.Expr:
|
||||
"""Map an EPC construction age band to a single representative build year.
|
||||
|
|
@ -132,10 +142,28 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
|
|||
)
|
||||
.filter(pl.col("epc_address").is_not_null())
|
||||
.with_columns(
|
||||
pl.when(pl.col("number_habitable_rooms") == 0)
|
||||
.then(None)
|
||||
.otherwise(pl.col("number_habitable_rooms"))
|
||||
# Null implausible EPC dimensions so data-entry errors don't reach
|
||||
# the published per-property columns (Interior height, Total floor
|
||||
# area, Number of bedrooms & living rooms). Treated as unknown.
|
||||
pl.when(
|
||||
(pl.col("number_habitable_rooms") >= 1)
|
||||
& (pl.col("number_habitable_rooms") <= MAX_HABITABLE_ROOMS)
|
||||
)
|
||||
.then(pl.col("number_habitable_rooms"))
|
||||
.otherwise(None)
|
||||
.alias("number_habitable_rooms"),
|
||||
pl.when(
|
||||
pl.col("floor_height").is_between(
|
||||
MIN_FLOOR_HEIGHT_M, MAX_FLOOR_HEIGHT_M
|
||||
)
|
||||
)
|
||||
.then(pl.col("floor_height"))
|
||||
.otherwise(None)
|
||||
.alias("floor_height"),
|
||||
pl.when(pl.col("total_floor_area") <= MAX_TOTAL_FLOOR_AREA_M2)
|
||||
.then(pl.col("total_floor_area"))
|
||||
.otherwise(None)
|
||||
.alias("total_floor_area"),
|
||||
)
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ import argparse
|
|||
import re
|
||||
import tempfile
|
||||
from dataclasses import dataclass
|
||||
from datetime import date
|
||||
from typing import Literal
|
||||
|
||||
import numpy as np
|
||||
|
|
@ -30,7 +31,10 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
|
|||
|
||||
MIN_FLOOR_AREA_M2 = 10
|
||||
CONSERVATION_AREA_FEATURE = "Within conservation area"
|
||||
TREE_DENSITY_FEATURE = "Street tree density percentile"
|
||||
# Named "Tree canopy" (not "Street tree") because the underlying density unions
|
||||
# Forest Research TOW lone-tree/group crowns AND NFI woodland canopy, so a
|
||||
# woodland-edge postcode's score reflects forest canopy, not only street trees.
|
||||
TREE_DENSITY_FEATURE = "Tree canopy density percentile"
|
||||
LISTED_BUILDING_FEATURE = "Listed building"
|
||||
LISTED_BUILDING_MATCH_RADIUS_M = 250.0
|
||||
LISTED_BUILDING_NEAREST_POSTCODES = 3
|
||||
|
|
@ -528,10 +532,22 @@ def _is_planning_conservation_area_record(dataset: object) -> bool:
|
|||
|
||||
|
||||
def _is_current_planning_record(end_date: object) -> bool:
|
||||
"""A planning record is current when it has no end-date OR its end-date is
|
||||
still in the future. The planning.data.gov.uk `end-date` field marks when a
|
||||
designation is RETIRED, so a future date (e.g. 2029-12-31) is a still-current
|
||||
area and must NOT be dropped — the previous "any non-empty date = ended"
|
||||
logic wrongly excluded those (e.g. 22 current Gateshead conservation areas)."""
|
||||
if end_date is None:
|
||||
return True
|
||||
if isinstance(end_date, str):
|
||||
return end_date.strip() == ""
|
||||
text = end_date.strip()
|
||||
if text == "":
|
||||
return True
|
||||
try:
|
||||
return date.fromisoformat(text[:10]) > date.today()
|
||||
except ValueError:
|
||||
# Unparseable end-date: keep the record rather than silently drop it.
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
|
|
@ -706,8 +722,32 @@ def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame
|
|||
)
|
||||
|
||||
|
||||
def _validate_lsoa_source_coverage(iod_path: Path, ethnicity_path: Path) -> None:
    """Require the ethnicity table to cover every LSOA listed in the IoD table.

    Ethnicity is sourced from Census 2021 TS021 at LSOA and joined on `lsoa21`,
    like median age and IoD. The IoD table defines the LSOA universe every
    postcode resolves into, so an LSOA absent from the ethnicity table would
    silently null the ethnicity columns for its postcodes. Extra LSOAs in the
    ethnicity table are not an error.

    Args:
        iod_path: Parquet file with a "LSOA code (2021)" column.
        ethnicity_path: Parquet file with an "lsoa21" column.

    Raises:
        ValueError: If any IoD LSOA is missing from the ethnicity table; the
            message carries the count and up to ten example codes.
    """
    iod_column = "LSOA code (2021)"
    key = "lsoa21"

    required = pl.read_parquet(iod_path, columns=[iod_column]).rename(
        {iod_column: key}
    )
    available = pl.read_parquet(ethnicity_path, columns=[key])

    # Anti-join keeps only the IoD rows with no ethnicity match.
    uncovered = required.join(available, on=key, how="anti").sort(key)
    if uncovered.height == 0:
        return

    examples = uncovered.head(10).to_dicts()
    raise ValueError(
        "Ethnicity data is missing LSOA coverage: "
        f"{uncovered.height} LSOAs, e.g. "
        f"{examples}"
    )
|
||||
|
||||
|
||||
def _validate_lad_source_coverage(
|
||||
iod_path: Path, ethnicity_path: Path, rental_prices_path: Path
|
||||
iod_path: Path, rental_prices_path: Path
|
||||
) -> None:
|
||||
iod_lads = (
|
||||
pl.read_parquet(
|
||||
|
|
@ -726,16 +766,6 @@ def _validate_lad_source_coverage(
|
|||
.unique(["lad"])
|
||||
)
|
||||
|
||||
ethnicity_lads = pl.read_parquet(ethnicity_path, columns=["Geography_code"]).rename(
|
||||
{"Geography_code": "lad"}
|
||||
)
|
||||
missing_ethnicity = iod_lads.join(ethnicity_lads, on="lad", how="anti").sort("lad")
|
||||
if missing_ethnicity.height > 0:
|
||||
raise ValueError(
|
||||
"Ethnicity data is missing 2024 LAD coverage: "
|
||||
f"{missing_ethnicity.to_dicts()}"
|
||||
)
|
||||
|
||||
rental_lads = pl.read_parquet(rental_prices_path, columns=["area_code"]).rename(
|
||||
{"area_code": "lad"}
|
||||
)
|
||||
|
|
@ -849,12 +879,10 @@ def _join_area_side_tables(
|
|||
broadband: pl.LazyFrame,
|
||||
) -> pl.LazyFrame:
|
||||
base = base.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
|
||||
base = base.join(
|
||||
ethnicity,
|
||||
left_on="Local Authority District code (2024)",
|
||||
right_on="Geography_code",
|
||||
how="left",
|
||||
)
|
||||
# Ethnicity is Census 2021 TS021 at LSOA (~33,755 areas), joined on the same
|
||||
# `lsoa21` key as median age and IoD — a ~100x granularity gain over the old
|
||||
# Local-Authority broadcast, with no change to the 6-bucket output schema.
|
||||
base = base.join(ethnicity, on="lsoa21", how="left")
|
||||
|
||||
# Crime is counted spatially per postcode (incidents within 50m of the
|
||||
# postcode boundary), so it joins on postcode rather than LSOA. crime_spatial
|
||||
|
|
@ -1966,7 +1994,8 @@ def _build(
|
|||
"""
|
||||
if mode == "listings" and actual_listings_path is None:
|
||||
raise ValueError("listings mode requires actual_listings_path")
|
||||
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path)
|
||||
_validate_lsoa_source_coverage(iod_path, ethnicity_path)
|
||||
_validate_lad_source_coverage(iod_path, rental_prices_path)
|
||||
|
||||
wide = pl.scan_parquet(epc_pp_path).filter(
|
||||
pl.col("total_floor_area").is_null()
|
||||
|
|
@ -2225,7 +2254,7 @@ def main():
|
|||
"--ethnicity",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Ethnicity by local authority parquet file (optional)",
|
||||
help="Census 2021 ethnic group (TS021) by LSOA parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--crime",
|
||||
|
|
|
|||
|
|
@ -53,6 +53,18 @@ _OUTPUT_PRECISION_DEG = 0.000001
|
|||
# tolerance), we fatten it just enough to survive snapping rather than drop it.
|
||||
_MIN_FOOTPRINT_BUFFER_M = 0.5
|
||||
|
||||
# Building-scale buffer for POINTLIKE inputs that carry no real extent. Multi-
|
||||
# dwelling (tower-block) postcodes have every UPRN geocoded to a single shared
|
||||
# coordinate, so the boundary collapses to a point; a 0.5 m buffer then yields an
|
||||
# invisible ~0.8 m² dot covering hundreds of homes. Such inputs get a ~200 m²
|
||||
# building-scale footprint instead. (Genuine thin slivers, which still carry
|
||||
# length, keep the minimal buffer.) _resolve_overlaps runs afterwards, so any
|
||||
# overlap this introduces is trimmed; a postcode shaved back to sub-grid still
|
||||
# falls through to the tiny _grid_footprint, so this can only improve the result.
|
||||
_POINT_RESCUE_BUFFER_M = 8.0
|
||||
_POINTLIKE_AREA_M2 = 1.0
|
||||
_POINTLIKE_PERIMETER_M = 4.0
|
||||
|
||||
|
||||
def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
|
||||
"""Transform a BNG polygon to WGS84, snap to output precision, validate.
|
||||
|
|
@ -90,8 +102,23 @@ def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
|
|||
|
||||
|
||||
def _rescue_footprint(geom_bng) -> dict | None:
    """Buffer a degenerate BNG geometry into a usable footprint and snap it.

    A POINTLIKE input (area below ``_POINTLIKE_AREA_M2`` and length below
    ``_POINTLIKE_PERIMETER_M`` — the signature of a tower-block postcode whose
    UPRNs all share one coordinate) is buffered by ``_POINT_RESCUE_BUFFER_M`` so
    it does not end up as an invisible sub-metre dot. Anything else, including
    thin slivers that still carry length, gets ``_MIN_FOOTPRINT_BUFFER_M``.

    Returns:
        The snapped WGS84 GeoJSON mapping, or ``None`` when buffering yields no
        polygonal part (or snapping itself returns ``None``).
    """
    try:
        is_pointlike = (
            geom_bng.area < _POINTLIKE_AREA_M2
            and geom_bng.length < _POINTLIKE_PERIMETER_M
        )
    except GEOSException:
        # The geometry could not be measured: fall back to the minimal buffer.
        is_pointlike = False

    radius_m = _POINT_RESCUE_BUFFER_M if is_pointlike else _MIN_FOOTPRINT_BUFFER_M
    fattened = _largest_polygonal(geom_bng.buffer(radius_m))
    return None if fattened is None else _snap_to_wgs84_geojson(fattened)
|
||||
|
|
|
|||
|
|
@ -906,6 +906,37 @@ class TestToWgs84Geojson:
|
|||
assert result is not None
|
||||
assert result["type"] == "Polygon"
|
||||
|
||||
def test_pointlike_input_gets_building_scale_footprint(self):
    """A bare point (tower-block postcode, all UPRNs at one coordinate) must come
    back as a building-scale footprint, not a sub-metre dot."""
    import pyproj
    from shapely.geometry import Point, shape
    from shapely.ops import transform as transform_geometry

    geojson = to_wgs84_geojson(Point(360000, 170000))
    assert geojson is not None

    # Measure the WGS84 result back in BNG metres.
    wgs84_to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    ).transform
    footprint_bng = transform_geometry(wgs84_to_bng, shape(geojson))
    area_m2 = footprint_bng.area
    assert area_m2 > 100, f"point footprint only {area_m2:.1f} m^2"
|
||||
|
||||
def test_thin_sliver_keeps_minimal_buffer(self):
    """An elongated sliver still has real length, so it must keep the minimal
    buffer and NOT be inflated to building scale like a pointlike input."""
    import pyproj
    from shapely.geometry import LineString, shape
    from shapely.ops import transform as transform_geometry

    # 40 m long centreline fattened to roughly 0.1 m wide.
    centreline = LineString([(360000, 170000), (360040, 170000)])
    geojson = to_wgs84_geojson(centreline.buffer(0.05))
    assert geojson is not None

    # Measure the WGS84 result back in BNG metres.
    wgs84_to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    ).transform
    footprint_bng = transform_geometry(wgs84_to_bng, shape(geojson))
    area_m2 = footprint_bng.area
    assert area_m2 < 100, f"sliver inflated to {area_m2:.1f} m^2"
|
||||
|
||||
def test_coordinates_have_limited_precision(self):
|
||||
"""GeoJSON coordinates should be rounded to 6 decimal places."""
|
||||
import json
|
||||
|
|
|
|||
|
|
@ -230,11 +230,28 @@ def main():
|
|||
).height
|
||||
print(f" kNN blended: {n_blended:,} of {n_estimated:,} estimates")
|
||||
|
||||
# Null the absolute "Estimated current price" itself when its implied
|
||||
# per-sqm is implausible (outside [MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM])
|
||||
# AND the floor area is known: these come from bulk/block transfers or
|
||||
# garbage source prices (e.g. a £207.5M "sale" on a 93 m² terrace -> a £197M
|
||||
# estimate) and are not meaningful single-dwelling values. Previously only
|
||||
# the derived per-sqm was nulled, leaving the absurd headline price visible.
|
||||
_raw_est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
|
||||
df = df.with_columns(
|
||||
pl.when(
|
||||
pl.col("Estimated current price").is_not_null()
|
||||
& pl.col("Total floor area (sqm)").is_not_null()
|
||||
& (pl.col("Total floor area (sqm)") > 0)
|
||||
& ((_raw_est_psm < MIN_COMPARABLE_PSM) | (_raw_est_psm > MAX_COMPARABLE_PSM))
|
||||
)
|
||||
.then(None)
|
||||
.otherwise(pl.col("Estimated current price"))
|
||||
.alias("Estimated current price"),
|
||||
)
|
||||
|
||||
# Derive estimated price per sqm where both estimated price and floor area
|
||||
# exist. Null out values outside the plausibility band [MIN_COMPARABLE_PSM,
|
||||
# MAX_COMPARABLE_PSM] (the same band the kNN pool uses): extreme values come
|
||||
# from bulk/block transactions or floor-area errors and are not meaningful
|
||||
# per-unit prices.
|
||||
# exist. Now that the implausible-psm estimates are nulled above, the band
|
||||
# filter here mainly guards the floor-area>0 case.
|
||||
_est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
|
||||
df = df.with_columns(
|
||||
pl.when(
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ from pipeline.transform.price_estimation.shrinkage import (
|
|||
)
|
||||
from pipeline.transform.price_estimation.utils import (
|
||||
CURRENT_YEAR,
|
||||
LATEST_COMPLETE_YEAR,
|
||||
TEMPORAL_SMOOTHNESS_LAMBDA,
|
||||
TYPE_GROUPS,
|
||||
build_hedonic_features,
|
||||
|
|
@ -395,14 +396,22 @@ def build_index(
|
|||
The index is still forward-filled to CURRENT_YEAR.
|
||||
postcodes_path: if provided, lat/lon are read from this file instead of input_path.
|
||||
"""
|
||||
pairs = extract_pairs(input_path, max_year2=max_pair_year)
|
||||
# Solve the index only on COMPLETE calendar years: exclude the partial
|
||||
# current year, whose thin repeat-sale set yields wild betas. The index is
|
||||
# still forward-filled/trend-extrapolated to CURRENT_YEAR below, so that year
|
||||
# follows the established trend rather than a partial-year spike. Backtest
|
||||
# passes a stricter max_pair_year, which is honoured.
|
||||
estimation_cap = (
|
||||
max_pair_year if max_pair_year is not None else LATEST_COMPLETE_YEAR + 1
|
||||
)
|
||||
pairs = extract_pairs(input_path, max_year2=estimation_cap)
|
||||
centroids = extract_centroids(postcodes_path or input_path)
|
||||
|
||||
min_year = int(pairs["year1"].min())
|
||||
max_year = CURRENT_YEAR
|
||||
|
||||
hedonic_idx = compute_hedonic_index(
|
||||
input_path, min_year, max_year, max_sale_year=max_pair_year
|
||||
input_path, min_year, max_year, max_sale_year=estimation_cap
|
||||
)
|
||||
|
||||
# Precompute hierarchy
|
||||
|
|
|
|||
|
|
@ -6,6 +6,13 @@ import numpy as np
|
|||
import polars as pl
|
||||
|
||||
CURRENT_YEAR = 2026
|
||||
# Latest COMPLETE calendar year. The current year's transactions are only
|
||||
# partially reported (Land Registry lags ~2-3 months), so a sector's thin
|
||||
# partial-year repeat-sale set produces wild index betas (e.g. +334% in a
|
||||
# single sector). The index is SOLVED only on complete years (<= this) and
|
||||
# forward-filled/trend-extrapolated to CURRENT_YEAR, so current-value
|
||||
# projections follow the established trend instead of a partial-year spike.
|
||||
LATEST_COMPLETE_YEAR = CURRENT_YEAR - 1
|
||||
_today = date.today()
|
||||
CURRENT_FRAC_YEAR = _today.year + (_today.month - 1) / 12
|
||||
|
||||
|
|
|
|||
|
|
@ -15,11 +15,24 @@ SCHOOL_GROUPS = {
|
|||
}
|
||||
|
||||
|
||||
def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
|
||||
# Age thresholds for deciding which phase(s) a school serves. A school serves
|
||||
# PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age
|
||||
# children if its statutory highest age is >= 12. All-through (e.g. 3-18) and
|
||||
# middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in
|
||||
# both the primary and the secondary proximity metrics — Ofsted's coarse "Ofsted
|
||||
# phase" labels such schools as just "Secondary", which previously hid them from
|
||||
# every postcode's primary-school count.
|
||||
PRIMARY_MAX_AGE = 10
|
||||
SECONDARY_MIN_AGE = 12
|
||||
|
||||
|
||||
def classify_good_plus_schools(
|
||||
ofsted: pl.DataFrame, open_urns: set[int] | None = None
|
||||
) -> pl.DataFrame:
|
||||
"""Label good+/outstanding primary & secondary schools for proximity counts.
|
||||
|
||||
Derives a grade ("1" = outstanding, "2" = good) and a proximity ``category``,
|
||||
returning a ``(postcode, category)`` frame.
|
||||
Derives a grade ("1" = outstanding, "2" = good) and one or two proximity
|
||||
``category`` rows per school, returning a ``(postcode, category)`` frame.
|
||||
|
||||
Schools with a recent GRADED inspection carry a 1-4 grade in "Latest OEIF
|
||||
overall effectiveness" (OEIF = the previous Ofsted Education Inspection
|
||||
|
|
@ -27,49 +40,89 @@ def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
|
|||
UNGRADED (Section 8) inspection or the post-2024 report-card framework, so
|
||||
that column is null/"Not judged" for them even when they are demonstrably
|
||||
good — their status lives in "Ungraded inspection overall outcome" ("School
|
||||
remains Good"/"School remains Outstanding", incl. "(Concerns)"/"(Improving)"
|
||||
variants). Filtering on the graded column alone dropped ~7,000 genuinely
|
||||
good/outstanding schools. We fall back to the ungraded outcome, but ONLY when
|
||||
there is no usable graded result (null/"Not judged"), so a genuine grade 3/4
|
||||
is never overridden.
|
||||
remains Good"/"School remains Outstanding"). Filtering on the graded column
|
||||
alone dropped ~7,000 genuinely good/outstanding schools. We fall back to the
|
||||
ungraded outcome, but ONLY when there is no usable graded result
|
||||
(null/"Not judged"), so a genuine grade 3/4 is never overridden.
|
||||
|
||||
Outcomes flagged "(Concerns)" are NOT treated as good+: a "remains Good
|
||||
(Concerns)" outcome signals inspectors found issues warranting an earlier
|
||||
graded re-inspection, so marketing it as a good+ school is misleading.
|
||||
|
||||
Phase assignment uses the statutory age range when available (so all-through
|
||||
and middle schools count toward BOTH primary and secondary), falling back to
|
||||
the coarse "Ofsted phase" label when age columns are absent. When
|
||||
``open_urns`` is given, schools whose URN is not in the current GIAS open
|
||||
register are dropped so closed/merged schools are not counted.
|
||||
"""
|
||||
# Cast to Utf8 so the string predicates below are well-defined even if a
|
||||
# column happens to be entirely null (read back as a Null dtype).
|
||||
oeif = pl.col("Latest OEIF overall effectiveness").cast(pl.Utf8, strict=False)
|
||||
ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
|
||||
no_usable_grade = oeif.is_null() | (oeif == "Not judged")
|
||||
has_concern = ungraded.str.contains(r"\(Concerns\)")
|
||||
remains_outstanding = (
|
||||
ungraded.str.starts_with("School remains Outstanding") & ~has_concern
|
||||
)
|
||||
remains_good = ungraded.str.starts_with("School remains Good") & ~has_concern
|
||||
graded = (
|
||||
ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
|
||||
.with_columns(
|
||||
pl.when(oeif.is_in(["1", "2"]))
|
||||
.then(oeif)
|
||||
.when(
|
||||
no_usable_grade
|
||||
& ungraded.str.starts_with("School remains Outstanding")
|
||||
)
|
||||
.when(no_usable_grade & remains_outstanding)
|
||||
.then(pl.lit("1"))
|
||||
.when(no_usable_grade & ungraded.str.starts_with("School remains Good"))
|
||||
.when(no_usable_grade & remains_good)
|
||||
.then(pl.lit("2"))
|
||||
.otherwise(None)
|
||||
.alias("_ofsted_grade")
|
||||
)
|
||||
.filter(pl.col("_ofsted_grade").is_not_null())
|
||||
)
|
||||
|
||||
# Drop schools no longer open (closed/merged) when the GIAS open register is
|
||||
# provided, so stale Ofsted "latest inspection" rows are not counted.
|
||||
if open_urns is not None and "URN" in graded.columns:
|
||||
graded = graded.filter(pl.col("URN").is_in(list(open_urns)))
|
||||
|
||||
# Decide which phase(s) each school serves.
|
||||
if {"Statutory lowest age", "Statutory highest age"} <= set(graded.columns):
|
||||
low = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
|
||||
high = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
|
||||
serves_primary = (
|
||||
pl.when(low.is_not_null())
|
||||
.then(low <= PRIMARY_MAX_AGE)
|
||||
.otherwise(pl.col("Ofsted phase") == "Primary")
|
||||
)
|
||||
serves_secondary = (
|
||||
pl.when(high.is_not_null())
|
||||
.then(high >= SECONDARY_MIN_AGE)
|
||||
.otherwise(pl.col("Ofsted phase") == "Secondary")
|
||||
)
|
||||
else:
|
||||
serves_primary = pl.col("Ofsted phase") == "Primary"
|
||||
serves_secondary = pl.col("Ofsted phase") == "Secondary"
|
||||
|
||||
graded = graded.with_columns(
|
||||
serves_primary.alias("_serves_primary"),
|
||||
serves_secondary.alias("_serves_secondary"),
|
||||
)
|
||||
|
||||
# Good+ groups include both grade variants; outstanding groups count grade 1.
|
||||
return graded.with_columns(
|
||||
pl.when(pl.col("Ofsted phase") == "Primary")
|
||||
.then(
|
||||
pl.when(pl.col("_ofsted_grade") == "1")
|
||||
.then(pl.lit("outstanding_primary"))
|
||||
.otherwise(pl.lit("good_primary"))
|
||||
)
|
||||
.otherwise(
|
||||
pl.when(pl.col("_ofsted_grade") == "1")
|
||||
.then(pl.lit("outstanding_secondary"))
|
||||
.otherwise(pl.lit("good_secondary"))
|
||||
)
|
||||
# A school can yield up to two rows (primary and secondary).
|
||||
primary = graded.filter(pl.col("_serves_primary")).with_columns(
|
||||
pl.when(pl.col("_ofsted_grade") == "1")
|
||||
.then(pl.lit("outstanding_primary"))
|
||||
.otherwise(pl.lit("good_primary"))
|
||||
.alias("category")
|
||||
).select(
|
||||
)
|
||||
secondary = graded.filter(pl.col("_serves_secondary")).with_columns(
|
||||
pl.when(pl.col("_ofsted_grade") == "1")
|
||||
.then(pl.lit("outstanding_secondary"))
|
||||
.otherwise(pl.lit("good_secondary"))
|
||||
.alias("category")
|
||||
)
|
||||
return pl.concat([primary, secondary]).select(
|
||||
pl.col("Postcode").alias("postcode"),
|
||||
"category",
|
||||
)
|
||||
|
|
@ -85,12 +138,24 @@ def main():
|
|||
parser.add_argument(
|
||||
"--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gias",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="GIAS open-school parquet; if given, only currently-open schools are counted",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output parquet path"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted))
|
||||
open_urns: set[int] | None = None
|
||||
if args.gias is not None:
|
||||
gias_urns = pl.read_parquet(args.gias).select("urn").to_series().drop_nulls()
|
||||
open_urns = set(gias_urns.cast(pl.Int64, strict=False).to_list())
|
||||
print(f"GIAS open register: {len(open_urns):,} open school URNs")
|
||||
|
||||
ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted), open_urns=open_urns)
|
||||
if ofsted.is_empty():
|
||||
raise ValueError("No good+ primary/secondary Ofsted schools found")
|
||||
|
||||
|
|
|
|||
|
|
@ -252,14 +252,15 @@ def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
|
|||
assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)}
|
||||
|
||||
|
||||
def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
|
||||
def test_serious_rollup_avg_yr_equals_sum_of_components(tmp_path):
|
||||
# Two SERIOUS types occur in DISJOINT years for one postcode: Burglary only in
|
||||
# 2014, Robbery only in 2024 (each a single full month -> 12/yr). The headline
|
||||
# "Serious crime (avg/yr)" must equal the mean of the "Serious crime (by year)"
|
||||
# bars (which span the UNION of years any serious type occurred), NOT the sum
|
||||
# of the per-type means. Summing per-type means divides each type by its OWN
|
||||
# years-present (1 each) -> 12 + 12 = 24; the consistent rollup divides the
|
||||
# per-year serious total by the years any serious type occurred (2) -> 12.
|
||||
# "Serious crime (avg/yr)" must equal the SUM of its component (avg/yr) columns
|
||||
# (Burglary 12 + Robbery 12 = 24), so the rollup is always the sum of the parts
|
||||
# shown beside it and can never fall below a single component. (The previous
|
||||
# union-years-present mean would have divided the per-year serious total by the
|
||||
# 2 years any serious type occurred, giving a misleading 12 that is only half
|
||||
# the sum of the burglary (12) and robbery (12) components shown beside it.)
|
||||
units = tmp_path / "units"
|
||||
_write_boundaries(
|
||||
units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
|
||||
|
|
@ -274,13 +275,16 @@ def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
|
|||
transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
|
||||
|
||||
avg = pl.read_parquet(output).row(0, named=True)
|
||||
# The precomputed rollup headline exists and equals the mean of the bars (12),
|
||||
# not the sum of the per-type avg/yr values (Burglary 12 + Robbery 12 = 24).
|
||||
assert "Serious crime (avg/yr)" in avg
|
||||
assert avg["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
|
||||
assert avg["Robbery (avg/yr)"] == pytest.approx(12.0, abs=0.05)
|
||||
assert avg["Serious crime (avg/yr)"] == pytest.approx(12.0, abs=0.05)
|
||||
# Rollup == sum of its component (avg/yr) columns.
|
||||
assert avg["Serious crime (avg/yr)"] == pytest.approx(24.0, abs=0.05)
|
||||
assert avg["Serious crime (avg/yr)"] == pytest.approx(
|
||||
avg["Burglary (avg/yr)"] + avg["Robbery (avg/yr)"], abs=0.05
|
||||
)
|
||||
|
||||
# The by-year rollup series remains the per-year sum of the component bars.
|
||||
serious_bars = {
|
||||
p["year"]: p["count"]
|
||||
for p in pl.read_parquet(by_year).row(0, named=True)["Serious crime (by year)"]
|
||||
|
|
@ -289,8 +293,6 @@ def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
|
|||
2014: pytest.approx(12.0, abs=0.05),
|
||||
2024: pytest.approx(12.0, abs=0.05),
|
||||
}
|
||||
mean_of_bars = sum(serious_bars.values()) / len(serious_bars)
|
||||
assert avg["Serious crime (avg/yr)"] == pytest.approx(mean_of_bars, abs=0.05)
|
||||
|
||||
|
||||
def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ from pipeline.transform.merge import (
|
|||
_split_normal_outputs,
|
||||
_tree_density_by_postcode,
|
||||
_validate_lad_source_coverage,
|
||||
_validate_lsoa_source_coverage,
|
||||
_validate_postcode_feature_output,
|
||||
_validate_property_postcodes,
|
||||
)
|
||||
|
|
@ -297,7 +298,7 @@ def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
|
|||
joined = _join_area_side_tables(
|
||||
base,
|
||||
iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
|
||||
ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
|
||||
ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
|
||||
crime=crime,
|
||||
median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
|
||||
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
|
||||
|
|
@ -355,7 +356,7 @@ def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
|
|||
joined = _join_area_side_tables(
|
||||
base,
|
||||
iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
|
||||
ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
|
||||
ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
|
||||
crime=crime,
|
||||
median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
|
||||
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
|
||||
|
|
@ -531,7 +532,6 @@ def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
|
|||
tmp_path,
|
||||
) -> None:
|
||||
iod_path = tmp_path / "iod.parquet"
|
||||
ethnicity_path = tmp_path / "ethnicity.parquet"
|
||||
rental_path = tmp_path / "rental.parquet"
|
||||
pl.DataFrame(
|
||||
{
|
||||
|
|
@ -547,19 +547,15 @@ def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
|
|||
],
|
||||
}
|
||||
).write_parquet(iod_path)
|
||||
pl.DataFrame(
|
||||
{"Geography_code": ["E08000016", "E06000053", "E09000001"]}
|
||||
).write_parquet(ethnicity_path)
|
||||
pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]}).write_parquet(
|
||||
rental_path
|
||||
)
|
||||
|
||||
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
|
||||
_validate_lad_source_coverage(iod_path, rental_path)
|
||||
|
||||
|
||||
def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
|
||||
iod_path = tmp_path / "iod.parquet"
|
||||
ethnicity_path = tmp_path / "ethnicity.parquet"
|
||||
rental_path = tmp_path / "rental.parquet"
|
||||
pl.DataFrame(
|
||||
{
|
||||
|
|
@ -567,13 +563,41 @@ def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) ->
|
|||
"Local Authority District name (2024)": ["Barnsley"],
|
||||
}
|
||||
).write_parquet(iod_path)
|
||||
pl.DataFrame({"Geography_code": ["E08000016"]}).write_parquet(ethnicity_path)
|
||||
pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]}).write_parquet(
|
||||
rental_path
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match="Rental data is missing"):
|
||||
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
|
||||
_validate_lad_source_coverage(iod_path, rental_path)
|
||||
|
||||
|
||||
def test_validate_lsoa_source_coverage_allows_full_ethnicity_coverage(
    tmp_path,
) -> None:
    """Validation succeeds when ethnicity lists every IoD LSOA (plus extras)."""
    required_lsoas = ["E01000001", "E01000002"]

    iod_path = tmp_path / "iod.parquet"
    iod_frame = pl.DataFrame({"LSOA code (2021)": required_lsoas})
    iod_frame.write_parquet(iod_path)

    # Ethnicity may carry extra LSOAs (e.g. property-less ones); only the IoD
    # LSOAs are required to all be present.
    ethnicity_path = tmp_path / "ethnicity.parquet"
    ethnicity_frame = pl.DataFrame({"lsoa21": [*required_lsoas, "E01000003"]})
    ethnicity_frame.write_parquet(ethnicity_path)

    # No exception raised == the coverage check accepted the inputs.
    _validate_lsoa_source_coverage(iod_path, ethnicity_path)
|
||||
|
||||
|
||||
def test_validate_lsoa_source_coverage_rejects_missing_lsoa(tmp_path) -> None:
    """Validation fails when an IoD LSOA is absent from the ethnicity file."""
    iod_path = tmp_path / "iod.parquet"
    iod_frame = pl.DataFrame({"LSOA code (2021)": ["E01000001", "E01000002"]})
    iod_frame.write_parquet(iod_path)

    # E01000002 is deliberately left out of the ethnicity source.
    ethnicity_path = tmp_path / "ethnicity.parquet"
    ethnicity_frame = pl.DataFrame({"lsoa21": ["E01000001"]})
    ethnicity_frame.write_parquet(ethnicity_path)

    expected_message = "Ethnicity data is missing LSOA coverage"
    with pytest.raises(ValueError, match=expected_message):
        _validate_lsoa_source_coverage(iod_path, ethnicity_path)
|
||||
|
||||
|
||||
def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
|
||||
|
|
@ -1027,7 +1051,7 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
|
|||
joined = _join_area_side_tables(
|
||||
base,
|
||||
iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
|
||||
ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
|
||||
ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
|
||||
crime=crime,
|
||||
median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
|
||||
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
|
||||
|
|
@ -1427,7 +1451,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
|
|||
"Property type": ["Terraced", None],
|
||||
"Leasehold/Freehold": ["Leasehold", None],
|
||||
"Last known price": [500_000, None],
|
||||
"Street tree density percentile": [42.0, 42.0],
|
||||
"Tree canopy density percentile": [42.0, 42.0],
|
||||
# Overlay columns: row 0 is a matched listing, row 1 is unmatched, row none.
|
||||
"_actual_listing_url": ["url0", "url1"],
|
||||
"_actual_asking_price": [600_000, 700_000],
|
||||
|
|
@ -1458,7 +1482,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
|
|||
"Property type": pl.Utf8,
|
||||
"Leasehold/Freehold": pl.Utf8,
|
||||
"Last known price": pl.Int64,
|
||||
"Street tree density percentile": pl.Float32,
|
||||
"Tree canopy density percentile": pl.Float32,
|
||||
"_actual_listing_url": pl.Utf8,
|
||||
"_actual_asking_price": pl.Int64,
|
||||
"_actual_asking_price_per_sqm": pl.Int32,
|
||||
|
|
@ -1496,7 +1520,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
|
|||
assert finalized["Property type"].to_list() == ["Terraced", "Flats/Maisonettes"]
|
||||
assert finalized["Leasehold/Freehold"].to_list() == ["Freehold", "Leasehold"]
|
||||
# Postcode-level feature carried through to both matched and unmatched rows.
|
||||
assert finalized["Street tree density percentile"].to_list() == [42.0, 42.0]
|
||||
assert finalized["Tree canopy density percentile"].to_list() == [42.0, 42.0]
|
||||
# Match status reflects historical context availability.
|
||||
assert finalized["Historical property match status"].to_list() == [
|
||||
"matched",
|
||||
|
|
@ -1524,7 +1548,7 @@ def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
|
|||
"Property type": ["Terraced", "Terraced"],
|
||||
"Leasehold/Freehold": ["Leasehold", "Leasehold"],
|
||||
"Last known price": [500_000, 480_000],
|
||||
"Street tree density percentile": [42.0, 42.0],
|
||||
"Tree canopy density percentile": [42.0, 42.0],
|
||||
# Same listing URL on both collapsed rows — the fan-out to fix.
|
||||
"_actual_listing_url": ["url0", "url0"],
|
||||
"_actual_asking_price": [600_000, 600_000],
|
||||
|
|
@ -1555,7 +1579,7 @@ def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
|
|||
"Property type": pl.Utf8,
|
||||
"Leasehold/Freehold": pl.Utf8,
|
||||
"Last known price": pl.Int64,
|
||||
"Street tree density percentile": pl.Float32,
|
||||
"Tree canopy density percentile": pl.Float32,
|
||||
"_actual_listing_url": pl.Utf8,
|
||||
"_actual_asking_price": pl.Int64,
|
||||
"_actual_asking_price_per_sqm": pl.Int32,
|
||||
|
|
|
|||
|
|
@ -42,7 +42,20 @@ def test_ungraded_remains_good_is_recovered_when_no_graded_result():
|
|||
rows = [
|
||||
_school("Primary", None, "School remains Good", "AA1 1AA"),
|
||||
_school("Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"),
|
||||
# "(Concerns)"/"(Improving)" variants are still good+.
|
||||
# "(Improving)" is still good+ ...
|
||||
_school("Primary", None, "School remains Good (Improving) - S5 Next", "AA1 1AE"),
|
||||
]
|
||||
assert _classify(rows) == {
|
||||
("AA1 1AA", "good_primary"),
|
||||
("AA1 1AB", "outstanding_secondary"),
|
||||
("AA1 1AE", "good_primary"),
|
||||
}
|
||||
|
||||
|
||||
def test_ungraded_concerns_are_not_good_plus():
|
||||
# "(Concerns)" outcomes signal issues warranting earlier re-inspection and
|
||||
# must NOT be counted as good+ schools.
|
||||
rows = [
|
||||
_school("Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"),
|
||||
_school(
|
||||
"Secondary",
|
||||
|
|
@ -51,12 +64,7 @@ def test_ungraded_remains_good_is_recovered_when_no_graded_result():
|
|||
"AA1 1AD",
|
||||
),
|
||||
]
|
||||
assert _classify(rows) == {
|
||||
("AA1 1AA", "good_primary"),
|
||||
("AA1 1AB", "outstanding_secondary"),
|
||||
("AA1 1AC", "good_primary"),
|
||||
("AA1 1AD", "outstanding_secondary"),
|
||||
}
|
||||
assert _classify(rows) == set()
|
||||
|
||||
|
||||
def test_ungraded_non_good_outcomes_are_excluded():
|
||||
|
|
@ -80,3 +88,52 @@ def test_non_primary_secondary_phases_excluded():
|
|||
_school("Not applicable", "2", None),
|
||||
]
|
||||
assert _classify(rows) == set()
|
||||
|
||||
|
||||
def _aged_school(phase, oeif, low, high, postcode="AA1 1AA"):
|
||||
return {
|
||||
"Postcode": postcode,
|
||||
"Ofsted phase": phase,
|
||||
"Latest OEIF overall effectiveness": oeif,
|
||||
"Ungraded inspection overall outcome": None,
|
||||
"URN": 100000,
|
||||
"Statutory lowest age": low,
|
||||
"Statutory highest age": high,
|
||||
}
|
||||
|
||||
|
||||
def test_all_through_school_counts_toward_both_primary_and_secondary():
    """An all-through school is classified under both phases."""
    # An all-through school (age 3-18) is labelled "Secondary" by Ofsted phase but
    # serves primary-age children too, so it must count in BOTH metrics.
    postcode = "AA1 1AA"
    all_through = _aged_school("Secondary", "2", 3, 18, postcode)

    expected = {
        (postcode, category) for category in ("good_primary", "good_secondary")
    }
    assert _classify([all_through]) == expected
|
||||
|
||||
|
||||
def test_age_ranges_assign_single_phase_for_standard_schools():
    """Standard age ranges map to one phase; a middle school maps to both."""
    primary_only = _aged_school("Primary", "1", 4, 11, "AA1 1AA")
    secondary_only = _aged_school("Secondary", "2", 11, 16, "AA1 1AB")
    # Ages 9-13 straddle the primary/secondary split, so both phases apply.
    middle_school = _aged_school("Secondary", "1", 9, 13, "AA1 1AC")

    classified = _classify([primary_only, secondary_only, middle_school])

    expected = {
        ("AA1 1AA", "outstanding_primary"),
        ("AA1 1AB", "good_secondary"),
        ("AA1 1AC", "outstanding_primary"),
        ("AA1 1AC", "outstanding_secondary"),
    }
    assert classified == expected
|
||||
|
||||
|
||||
def test_closed_schools_excluded_when_open_register_given():
    """Schools whose URN is missing from ``open_urns`` are not classified."""
    # Overriding an existing key keeps its position, so column order is unchanged.
    registered_primary = {**_aged_school("Primary", "1", 4, 11, "AA1 1AA"), "URN": 111}
    unregistered_secondary = {
        **_aged_school("Secondary", "2", 11, 16, "AA1 1AB"),
        "URN": 222,
    }
    schools = pl.DataFrame([registered_primary, unregistered_secondary])

    result = classify_good_plus_schools(schools, open_urns={111})

    pairs = set()
    for record in result.to_dicts():
        pairs.add((record["postcode"], record["category"]))
    # URN 222 is not in the open register, so it is dropped.
    assert pairs == {("AA1 1AA", "outstanding_primary")}
|
||||
|
|
|
|||
|
|
@ -33,6 +33,14 @@ DROP_CATEGORIES = {
|
|||
"emergency/water_tank",
|
||||
"leisure/bleachers",
|
||||
"leisure/schoolyard",
|
||||
# Park "furniture" / incidental features — not parks; they massively
|
||||
# inflated the Park count (picnic_table ~15k, outdoor_seating ~5.8k).
|
||||
"leisure/bandstand",
|
||||
"leisure/bird_hide",
|
||||
"leisure/firepit",
|
||||
"leisure/outdoor_seating",
|
||||
"leisure/picnic_table",
|
||||
"leisure/wildlife_hide",
|
||||
"public_transport/pay_scale_area",
|
||||
"shop/taxi",
|
||||
"amenity/feeding_place",
|
||||
|
|
@ -182,9 +190,13 @@ DROP_CATEGORIES = {
|
|||
"tourism/village_sign",
|
||||
"tourism/wilderness_hut",
|
||||
"tourism/yes",
|
||||
# Public transport (from NaPTAN instead)
|
||||
# Public transport (from NaPTAN instead). public_transport/platform is the
|
||||
# EXCEPTION: it is mapped to "Bus stop" (see _CATEGORIES) to fill NaPTAN's
|
||||
# authority-level bus-stop gaps (e.g. West Cumbria, North Norfolk, where
|
||||
# NaPTAN has zero stops), then deduped against NaPTAN so covered areas keep
|
||||
# a single stop. stop_position is left dropped to avoid double-counting the
|
||||
# same stop (platform + stop_position).
|
||||
"public_transport/entrance",
|
||||
"public_transport/platform",
|
||||
"public_transport/station",
|
||||
"public_transport/stop_position",
|
||||
# Education amenities — schools come from GIAS instead. OSM coverage for
|
||||
|
|
@ -301,16 +313,13 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"🌳",
|
||||
[
|
||||
"leisure/park",
|
||||
# leisure/garden is dominated by private residential gardens (98%+
|
||||
# unnamed); it is name-gated in transform() via REQUIRE_NAME_CATEGORIES
|
||||
# so only named (public/notable) gardens count as a Park.
|
||||
"leisure/garden",
|
||||
"leisure/common",
|
||||
"leisure/nature_reserve",
|
||||
"leisure/dog_park",
|
||||
"leisure/bandstand",
|
||||
"leisure/bird_hide",
|
||||
"leisure/firepit",
|
||||
"leisure/outdoor_seating",
|
||||
"leisure/picnic_table",
|
||||
"leisure/wildlife_hide",
|
||||
],
|
||||
),
|
||||
(
|
||||
|
|
@ -329,6 +338,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
[
|
||||
"leisure/sports_centre",
|
||||
"leisure/sports_hall",
|
||||
# leisure/pitch (73% of the old bucket) and leisure/swimming_pool
|
||||
# (98% unnamed = private/garden pools) are name-gated in transform()
|
||||
# via REQUIRE_NAME_CATEGORIES so only named public facilities count.
|
||||
"leisure/pitch",
|
||||
"leisure/track",
|
||||
"leisure/golf_course",
|
||||
|
|
@ -1123,8 +1135,36 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"amenity/townhall",
|
||||
],
|
||||
),
|
||||
# ── Public transport (OSM supplement to NaPTAN) ──────────
|
||||
# OSM bus platforms fill NaPTAN's authority-level coverage gaps. Same group
|
||||
# / friendly name / emoji as the NaPTAN "Bus stop" rows so they merge into
|
||||
# one metric; OSM platforms that duplicate a NaPTAN stop are deduped in
|
||||
# transform() (osm_stops_near_naptan).
|
||||
(
|
||||
"Public Transport",
|
||||
"Bus stop",
|
||||
"🚏",
|
||||
[
|
||||
"public_transport/platform",
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
# Raw OSM tags whose UNNAMED instances are dropped before category mapping.
|
||||
# These tags are overwhelmingly private/incidental when unnamed: a nameless
|
||||
# `leisure/garden` is a private residential garden (not a public park), and a
|
||||
# nameless `leisure/pitch`/`swimming_pool` is a school cage or back-garden pool.
|
||||
# Keeping only named instances stops them inflating Park / Sports Centre counts
|
||||
# while preserving genuinely public, notable facilities (which carry a name).
|
||||
REQUIRE_NAME_CATEGORIES = {
|
||||
"leisure/garden",
|
||||
"leisure/pitch",
|
||||
"leisure/practice_pitch",
|
||||
"leisure/swimming_pool",
|
||||
"leisure/paddling_pool",
|
||||
}
|
||||
|
||||
|
||||
# Build flat lookup: OSM category → (group, friendly_name, emoji)
|
||||
CATEGORY_MAP: dict[str, tuple[str, str, str]] = {
|
||||
osm_key: (group, name, emoji)
|
||||
|
|
@ -1431,18 +1471,25 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
|
|||
)
|
||||
|
||||
|
||||
def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
|
||||
def transform_gias_schools(
|
||||
gias_path: Path, ofsted_path: Path, boundary_path: Path
|
||||
) -> pl.LazyFrame:
|
||||
"""Convert the GIAS register parquet into POI rows with school metadata.
|
||||
Ofsted ratings are joined by URN so each school carries its latest OEIF
|
||||
overall effectiveness grade (Outstanding/Good/Requires improvement/
|
||||
Inadequate/Not judged), surfaced in the map popup."""
|
||||
Inadequate/Not judged), surfaced in the map popup.
|
||||
|
||||
Clipped to England (like NaPTAN/GEOLYTIX) because the GIAS register is
|
||||
GB-wide, so ~1,400 Welsh/Scottish/IoM schools would otherwise leak into the
|
||||
England-only Education layer (and depress apparent Ofsted coverage, since
|
||||
Wales is inspected by Estyn, not Ofsted)."""
|
||||
icon_category_expr = _school_icon_category_expr()
|
||||
emoji_expr = icon_category_expr.replace_strict(SCHOOL_ICON_CATEGORIES)
|
||||
ofsted = _load_ofsted_ratings(ofsted_path)
|
||||
# category mirrors icon_category so the dashboard renders one toggle per
|
||||
# school type (Nursery / Primary / Secondary / Sixth form / University /…)
|
||||
# instead of bundling every GIAS row under a single "School" pill.
|
||||
return (
|
||||
schools = (
|
||||
pl.scan_parquet(gias_path)
|
||||
.join(ofsted, on="urn", how="left")
|
||||
.select(
|
||||
|
|
@ -1477,7 +1524,14 @@ def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
|
|||
pl.col("head_name").alias("school_head_name"),
|
||||
pl.col("ofsted_rating").alias("school_ofsted_rating"),
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
mask = in_england_mask(
|
||||
boundary_path,
|
||||
schools["lat"].to_numpy(),
|
||||
schools["lng"].to_numpy(),
|
||||
)
|
||||
return schools.filter(pl.Series(mask)).lazy()
|
||||
|
||||
|
||||
# OSM convenience-format stores that GEOLYTIX also covers (Tesco Express,
|
||||
|
|
@ -1511,6 +1565,45 @@ def _significant_tokens(name: str | None) -> set[str]:
|
|||
return tokens
|
||||
|
||||
|
||||
# OSM bus platforms are added to "Bus stop" to fill NaPTAN's authority-level
|
||||
# gaps. Where NaPTAN already has a stop within this radius the area is covered,
|
||||
# so the colocated OSM platform is dropped to avoid double-counting; OSM
|
||||
# platforms with no nearby NaPTAN stop (the gaps) are kept.
|
||||
BUS_STOP_DEDUP_RADIUS_M = 50.0
|
||||
|
||||
|
||||
def osm_stops_near_naptan(
    osm_stops: pl.DataFrame,
    naptan_stops: pl.DataFrame,
    radius_m: float = BUS_STOP_DEDUP_RADIUS_M,
) -> list[str]:
    """Return the ids of OSM bus stops lying within ``radius_m`` of a NaPTAN stop.

    The match is purely spatial (names are not compared). An OSM platform that
    has a NaPTAN stop inside the radius is treated as a duplicate and its id is
    returned so the caller can drop it; platforms with no NaPTAN stop nearby
    (the coverage gaps) are left out of the result and therefore survive.

    Args:
        osm_stops: OSM bus stops; must provide ``id``, ``lat`` and ``lng``.
        naptan_stops: NaPTAN bus stops; must provide ``lat`` and ``lng``.
        radius_m: Maximum distance (inclusive) at which an OSM stop counts as
            colocated with a NaPTAN stop.

    Returns:
        Ids of the colocated OSM stops, in ``osm_stops`` row order. Empty when
        either frame has no rows.
    """
    if osm_stops.is_empty() or naptan_stops.is_empty():
        return []

    from scipy.spatial import cKDTree

    naptan_lat = naptan_stops["lat"].to_numpy().astype(float)
    naptan_lng = naptan_stops["lng"].to_numpy().astype(float)
    osm_lat = osm_stops["lat"].to_numpy().astype(float)
    osm_lng = osm_stops["lng"].to_numpy().astype(float)
    osm_ids = osm_stops["id"].to_list()

    # One shared longitude scale, taken at the mean latitude of all points, so
    # both point sets are projected onto the same flat grid.
    every_lat = np.concatenate([naptan_lat, osm_lat])
    centre_lat = float(np.mean(every_lat))
    lng_scale = float(np.cos(np.radians(centre_lat)))

    def _project(lat, lng):
        # Degrees -> approximate planar x/y using per-degree scale factors.
        return np.column_stack([lng * lng_scale * 111_320.0, lat * 110_540.0])

    naptan_index = cKDTree(_project(naptan_lat, naptan_lng))
    nearest_dist, _ = naptan_index.query(_project(osm_lat, osm_lng), k=1)

    return [
        stop_id
        for stop_id, gap in zip(osm_ids, nearest_dist)
        if gap <= radius_m
    ]
|
||||
|
||||
|
||||
def osm_groceries_colocated_with_geolytix(
|
||||
osm_groceries: pl.DataFrame,
|
||||
geolytix: pl.DataFrame,
|
||||
|
|
@ -1601,6 +1694,19 @@ def transform(
|
|||
# Drop unwanted categories
|
||||
lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
|
||||
|
||||
# Drop UNNAMED instances of private-dominated tags (gardens, pitches,
|
||||
# pools) so they don't inflate Park / Sports Centre proximity counts. Done
|
||||
# while `category` still holds the raw OSM key, before the friendly mapping.
|
||||
lf = lf.filter(
|
||||
~(
|
||||
pl.col("category").is_in(list(REQUIRE_NAME_CATEGORIES))
|
||||
& (
|
||||
pl.col("name").is_null()
|
||||
| (pl.col("name").cast(pl.String).str.strip_chars() == "")
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Build lookup expressions from the 3-tuple mapping
|
||||
group_mapping = {k: v[0] for k, v in CATEGORY_MAP.items()}
|
||||
name_mapping = {k: v[1] for k, v in CATEGORY_MAP.items()}
|
||||
|
|
@ -1665,11 +1771,37 @@ def transform(
|
|||
~((pl.col("group") == "Groceries") & pl.col("id").is_in(duplicate_ids))
|
||||
)
|
||||
|
||||
# Drop OSM bus platforms that duplicate a NaPTAN bus stop, so the OSM
|
||||
# supplement only adds stops in NaPTAN's coverage gaps (no double-count in
|
||||
# covered areas). OSM bus stops carry id-prefix "n"/"a" so they never clash
|
||||
# with NaPTAN ATCO ids.
|
||||
osm_bus_stops = (
|
||||
lf.filter((pl.col("group") == "Public Transport") & (pl.col("category") == "Bus stop"))
|
||||
.select("id", "lat", "lng")
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
naptan_bus_stops = naptan_df.filter(pl.col("category") == "Bus stop")
|
||||
covered_bus_ids = osm_stops_near_naptan(osm_bus_stops, naptan_bus_stops)
|
||||
kept_osm = osm_bus_stops.height - len(covered_bus_ids)
|
||||
print(
|
||||
f"OSM bus platforms: {osm_bus_stops.height:,} total, dropping "
|
||||
f"{len(covered_bus_ids):,} that duplicate a NaPTAN stop, keeping "
|
||||
f"{kept_osm:,} to fill NaPTAN gaps"
|
||||
)
|
||||
if covered_bus_ids:
|
||||
lf = lf.filter(
|
||||
~(
|
||||
(pl.col("group") == "Public Transport")
|
||||
& (pl.col("category") == "Bus stop")
|
||||
& pl.col("id").is_in(covered_bus_ids)
|
||||
)
|
||||
)
|
||||
|
||||
frames = [
|
||||
lf,
|
||||
naptan,
|
||||
grocery_pois.lazy(),
|
||||
transform_gias_schools(gias_path, ofsted_path),
|
||||
transform_gias_schools(gias_path, ofsted_path, boundary_path),
|
||||
]
|
||||
|
||||
return pl.concat(frames, how="diagonal_relaxed")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue