Improve data

This commit is contained in:
Andras Schmelczer 2026-06-10 07:54:25 +01:00
parent b4d66a28c1
commit 85da1941aa
31 changed files with 901 additions and 319 deletions

View file

@ -273,27 +273,24 @@ def _write_avg_yr(
for type_idx, name in enumerate(ALL_CRIME_TYPES):
data[f"{name} (avg/yr)"] = avg[:, type_idx]
# Serious/Minor rollup headlines, computed the SAME way as the by-year rollup
# bars (_write_by_year/_rollup_long): sum the rollup's types per year, then
# average over the years in which ANY of those types occurred. This keeps the
# headline equal to the mean of the "Serious/Minor crime (by year)" bars.
# Summing the per-type avg/yr values instead (as the merge previously did)
# divides each type by its OWN years-present and overstates the rollup when a
# postcode's serious/minor types occur in disjoint years.
# Serious/Minor rollup headlines = the exact SUM of their component (avg/yr)
# columns, so each rollup always equals the sum of the parts shown beside it
# and can never fall below one of its own components. (Previously the rollup
# re-derived a union-years-present mean: it divided the summed counts by the
# number of years in which ANY component type occurred, whereas each
# component divides by its OWN years-present. When a postcode's serious/minor
# types occurred in disjoint years the union denominator was larger, so the
# rollup came out smaller than the sum of its parts.) The by-year rollup
# series in _write_by_year is likewise the per-year sum of the component
# bars, so headline and chart both present the rollup as the sum of its parts.
for rollup_name, rollup_types in (
("Serious crime", SERIOUS_CRIME_TYPES),
("Minor crime", MINOR_CRIME_TYPES),
):
rollup_idx = [ALL_CRIME_TYPES.index(name) for name in rollup_types]
rollup_counts = counts[:, rollup_idx, :].sum(axis=1) # (n_postcodes, n_years)
rollup_per_year = per_year[:, rollup_idx, :].sum(axis=1)
rollup_years_present = np.clip(
(rollup_counts > 0).sum(axis=1), 1, None
).astype(np.float64)
rollup_avg = rollup_per_year.sum(axis=1) / rollup_years_present
data[f"{rollup_name} (avg/yr)"] = np.round(rollup_avg * norm, 1).astype(
np.float32
)
data[f"{rollup_name} (avg/yr)"] = np.round(
avg[:, rollup_idx].sum(axis=1), 1
).astype(np.float32)
output_path.parent.mkdir(parents=True, exist_ok=True)
pl.DataFrame(data).write_parquet(output_path, compression="zstd")

View file

@ -36,6 +36,16 @@ MIN_PRICE = 10_000
MIN_BUILD_YEAR = 1700
MAX_BUILD_YEAR = 2030
# Plausibility bounds for raw EPC dimensions. EPC lodgements contain data-entry
# errors (0 m storey heights, 116 m "interior height", 9,210 m² floor areas, 99
# habitable rooms) that otherwise propagate verbatim into the published per-
# property columns. Values outside these bands are nulled (treated as unknown)
# rather than shown. Bounds are deliberately wide so only clear errors are cut.
MIN_FLOOR_HEIGHT_M = 1.5 # below this a storey is not habitable
MAX_FLOOR_HEIGHT_M = 6.0 # above this is a data error, not a normal storey
MAX_TOTAL_FLOOR_AREA_M2 = 2000.0 # ~21,500 sqft; larger is a bulk/garbage record
MAX_HABITABLE_ROOMS = 20 # dwellings above this are data errors
def epc_band_to_year(band: pl.Expr) -> pl.Expr:
"""Map an EPC construction age band to a single representative build year.
@ -132,10 +142,28 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
)
.filter(pl.col("epc_address").is_not_null())
.with_columns(
pl.when(pl.col("number_habitable_rooms") == 0)
.then(None)
.otherwise(pl.col("number_habitable_rooms"))
# Null implausible EPC dimensions so data-entry errors don't reach
# the published per-property columns (Interior height, Total floor
# area, Number of bedrooms & living rooms). Treated as unknown.
pl.when(
(pl.col("number_habitable_rooms") >= 1)
& (pl.col("number_habitable_rooms") <= MAX_HABITABLE_ROOMS)
)
.then(pl.col("number_habitable_rooms"))
.otherwise(None)
.alias("number_habitable_rooms"),
pl.when(
pl.col("floor_height").is_between(
MIN_FLOOR_HEIGHT_M, MAX_FLOOR_HEIGHT_M
)
)
.then(pl.col("floor_height"))
.otherwise(None)
.alias("floor_height"),
pl.when(pl.col("total_floor_area") <= MAX_TOTAL_FLOOR_AREA_M2)
.then(pl.col("total_floor_area"))
.otherwise(None)
.alias("total_floor_area"),
)
)

View file

@ -2,6 +2,7 @@ import argparse
import re
import tempfile
from dataclasses import dataclass
from datetime import date
from typing import Literal
import numpy as np
@ -30,7 +31,10 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
MIN_FLOOR_AREA_M2 = 10
CONSERVATION_AREA_FEATURE = "Within conservation area"
TREE_DENSITY_FEATURE = "Street tree density percentile"
# Named "Tree canopy" (not "Street tree") because the underlying density unions
# Forest Research TOW lone-tree/group crowns AND NFI woodland canopy, so a
# woodland-edge postcode's score reflects forest canopy, not only street trees.
TREE_DENSITY_FEATURE = "Tree canopy density percentile"
LISTED_BUILDING_FEATURE = "Listed building"
LISTED_BUILDING_MATCH_RADIUS_M = 250.0
LISTED_BUILDING_NEAREST_POSTCODES = 3
@ -528,10 +532,22 @@ def _is_planning_conservation_area_record(dataset: object) -> bool:
def _is_current_planning_record(end_date: object) -> bool:
"""A planning record is current when it has no end-date OR its end-date is
still in the future. The planning.data.gov.uk `end-date` field marks when a
designation is RETIRED, so a future date (e.g. 2029-12-31) is a still-current
area and must NOT be dropped the previous "any non-empty date = ended"
logic wrongly excluded those (e.g. 22 current Gateshead conservation areas)."""
if end_date is None:
return True
if isinstance(end_date, str):
return end_date.strip() == ""
text = end_date.strip()
if text == "":
return True
try:
return date.fromisoformat(text[:10]) > date.today()
except ValueError:
# Unparseable end-date: keep the record rather than silently drop it.
return True
return False
@ -706,8 +722,32 @@ def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame
)
def _validate_lsoa_source_coverage(iod_path: Path, ethnicity_path: Path) -> None:
    """Require the LSOA-keyed ethnicity table to cover every IoD LSOA.

    Ethnicity is sourced from Census 2021 TS021 at LSOA and joined on `lsoa21`
    like median age and IoD. The IoD table defines the LSOA universe every
    postcode resolves into, so an LSOA absent from the ethnicity table would
    silently null the ethnicity columns for its postcodes. Extra LSOAs in the
    ethnicity table are not an error.

    Args:
        iod_path: IoD parquet; only its "LSOA code (2021)" column is read.
        ethnicity_path: Ethnicity parquet; only its "lsoa21" column is read.

    Raises:
        ValueError: if any IoD LSOA has no ethnicity row. The message carries
            the missing count and up to 10 example codes (sorted).
    """
    required = pl.read_parquet(iod_path, columns=["LSOA code (2021)"]).rename(
        {"LSOA code (2021)": "lsoa21"}
    )
    available = pl.read_parquet(ethnicity_path, columns=["lsoa21"])

    # Anti-join keeps only the IoD LSOAs with no match in the ethnicity table.
    missing_ethnicity = required.join(available, on="lsoa21", how="anti").sort(
        "lsoa21"
    )
    if missing_ethnicity.height == 0:
        return
    raise ValueError(
        "Ethnicity data is missing LSOA coverage: "
        f"{missing_ethnicity.height} LSOAs, e.g. "
        f"{missing_ethnicity.head(10).to_dicts()}"
    )
def _validate_lad_source_coverage(
iod_path: Path, ethnicity_path: Path, rental_prices_path: Path
iod_path: Path, rental_prices_path: Path
) -> None:
iod_lads = (
pl.read_parquet(
@ -726,16 +766,6 @@ def _validate_lad_source_coverage(
.unique(["lad"])
)
ethnicity_lads = pl.read_parquet(ethnicity_path, columns=["Geography_code"]).rename(
{"Geography_code": "lad"}
)
missing_ethnicity = iod_lads.join(ethnicity_lads, on="lad", how="anti").sort("lad")
if missing_ethnicity.height > 0:
raise ValueError(
"Ethnicity data is missing 2024 LAD coverage: "
f"{missing_ethnicity.to_dicts()}"
)
rental_lads = pl.read_parquet(rental_prices_path, columns=["area_code"]).rename(
{"area_code": "lad"}
)
@ -849,12 +879,10 @@ def _join_area_side_tables(
broadband: pl.LazyFrame,
) -> pl.LazyFrame:
base = base.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
base = base.join(
ethnicity,
left_on="Local Authority District code (2024)",
right_on="Geography_code",
how="left",
)
# Ethnicity is Census 2021 TS021 at LSOA (~33,755 areas), joined on the same
# `lsoa21` key as median age and IoD — a ~100x granularity gain over the old
# Local-Authority broadcast, with no change to the 6-bucket output schema.
base = base.join(ethnicity, on="lsoa21", how="left")
# Crime is counted spatially per postcode (incidents within 50m of the
# postcode boundary), so it joins on postcode rather than LSOA. crime_spatial
@ -1966,7 +1994,8 @@ def _build(
"""
if mode == "listings" and actual_listings_path is None:
raise ValueError("listings mode requires actual_listings_path")
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path)
_validate_lsoa_source_coverage(iod_path, ethnicity_path)
_validate_lad_source_coverage(iod_path, rental_prices_path)
wide = pl.scan_parquet(epc_pp_path).filter(
pl.col("total_floor_area").is_null()
@ -2225,7 +2254,7 @@ def main():
"--ethnicity",
type=Path,
required=True,
help="Ethnicity by local authority parquet file (optional)",
help="Census 2021 ethnic group (TS021) by LSOA parquet file",
)
parser.add_argument(
"--crime",

View file

@ -53,6 +53,18 @@ _OUTPUT_PRECISION_DEG = 0.000001
# tolerance), we fatten it just enough to survive snapping rather than drop it.
_MIN_FOOTPRINT_BUFFER_M = 0.5
# Building-scale buffer for POINTLIKE inputs that carry no real extent. Multi-
# dwelling (tower-block) postcodes have every UPRN geocoded to a single shared
# coordinate, so the boundary collapses to a point; a 0.5 m buffer then yields an
# invisible ~0.8 m² dot covering hundreds of homes. Such inputs get a ~200 m²
# building-scale footprint instead. (Genuine thin slivers, which still carry
# length, keep the minimal buffer.) _resolve_overlaps runs afterwards, so any
# overlap this introduces is trimmed; a postcode shaved back to sub-grid still
# falls through to the tiny _grid_footprint, so this can only improve the result.
_POINT_RESCUE_BUFFER_M = 8.0
_POINTLIKE_AREA_M2 = 1.0
_POINTLIKE_PERIMETER_M = 4.0
def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
"""Transform a BNG polygon to WGS84, snap to output precision, validate.
@ -90,8 +102,23 @@ def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
def _rescue_footprint(geom_bng) -> dict | None:
    """Fatten a degenerate BNG geometry into a representable footprint and snap.

    A POINTLIKE input (a point, or a near-zero-area/short-perimeter polygon —
    the signature of a tower-block postcode whose UPRNs all share one
    coordinate) gets a building-scale buffer so it is not reduced to an
    invisible sub-metre dot; thin slivers that still carry length keep the
    minimal buffer.

    Args:
        geom_bng: Geometry in British National Grid metres.

    Returns:
        Whatever `_snap_to_wgs84_geojson` returns for the buffered footprint,
        or None when `_largest_polygonal` finds no polygonal part to keep.
    """
    try:
        is_pointlike = (
            geom_bng.area < _POINTLIKE_AREA_M2
            and geom_bng.length < _POINTLIKE_PERIMETER_M
        )
    except GEOSException:
        # Best effort: if the extent cannot be measured, treat the input as a
        # sliver and keep the minimal buffer rather than fail the rescue.
        is_pointlike = False

    buffer_m = _POINT_RESCUE_BUFFER_M if is_pointlike else _MIN_FOOTPRINT_BUFFER_M
    footprint = _largest_polygonal(geom_bng.buffer(buffer_m))
    if footprint is None:
        return None
    return _snap_to_wgs84_geojson(footprint)

View file

@ -906,6 +906,37 @@ class TestToWgs84Geojson:
assert result is not None
assert result["type"] == "Polygon"
def test_pointlike_input_gets_building_scale_footprint(self):
    """A single-point input (tower block: every UPRN at one coordinate) must
    come back as a building-scale footprint, not a sub-metre dot."""
    import pyproj
    from shapely.geometry import Point, shape
    from shapely.ops import transform as transform_geometry

    result = to_wgs84_geojson(Point(360000, 170000))
    assert result is not None

    # Project the returned WGS84 footprint back to BNG so the area is in m^2.
    wgs84_to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    )
    footprint_bng = transform_geometry(wgs84_to_bng.transform, shape(result))
    area_m2 = footprint_bng.area
    assert area_m2 > 100, f"point footprint only {area_m2:.1f} m^2"
def test_thin_sliver_keeps_minimal_buffer(self):
    """An elongated sliver still carries length, so it must NOT be inflated to
    building scale — only truly pointlike inputs are."""
    import pyproj
    from shapely.geometry import LineString, shape
    from shapely.ops import transform as transform_geometry

    # 40 m long, 0.1 m wide: near-zero area but a long perimeter.
    centreline = LineString([(360000, 170000), (360040, 170000)])
    result = to_wgs84_geojson(centreline.buffer(0.05))
    assert result is not None

    # Project the returned WGS84 footprint back to BNG so the area is in m^2.
    wgs84_to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    )
    footprint_bng = transform_geometry(wgs84_to_bng.transform, shape(result))
    area_m2 = footprint_bng.area
    assert area_m2 < 100, f"sliver inflated to {area_m2:.1f} m^2"
def test_coordinates_have_limited_precision(self):
"""GeoJSON coordinates should be rounded to 6 decimal places."""
import json

View file

@ -230,11 +230,28 @@ def main():
).height
print(f" kNN blended: {n_blended:,} of {n_estimated:,} estimates")
# Null the absolute "Estimated current price" itself when its implied
# per-sqm is implausible (outside [MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM])
# AND the floor area is known: these come from bulk/block transfers or
# garbage source prices (e.g. a £207.5M "sale" on a 93 m² terrace -> a £197M
# estimate) and are not meaningful single-dwelling values. Previously only
# the derived per-sqm was nulled, leaving the absurd headline price visible.
_raw_est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
df = df.with_columns(
pl.when(
pl.col("Estimated current price").is_not_null()
& pl.col("Total floor area (sqm)").is_not_null()
& (pl.col("Total floor area (sqm)") > 0)
& ((_raw_est_psm < MIN_COMPARABLE_PSM) | (_raw_est_psm > MAX_COMPARABLE_PSM))
)
.then(None)
.otherwise(pl.col("Estimated current price"))
.alias("Estimated current price"),
)
# Derive estimated price per sqm where both estimated price and floor area
# exist. Null out values outside the plausibility band [MIN_COMPARABLE_PSM,
# MAX_COMPARABLE_PSM] (the same band the kNN pool uses): extreme values come
# from bulk/block transactions or floor-area errors and are not meaningful
# per-unit prices.
# exist. Now that the implausible-psm estimates are nulled above, the band
# filter here mainly guards the floor-area>0 case.
_est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
df = df.with_columns(
pl.when(

View file

@ -25,6 +25,7 @@ from pipeline.transform.price_estimation.shrinkage import (
)
from pipeline.transform.price_estimation.utils import (
CURRENT_YEAR,
LATEST_COMPLETE_YEAR,
TEMPORAL_SMOOTHNESS_LAMBDA,
TYPE_GROUPS,
build_hedonic_features,
@ -395,14 +396,22 @@ def build_index(
The index is still forward-filled to CURRENT_YEAR.
postcodes_path: if provided, lat/lon are read from this file instead of input_path.
"""
pairs = extract_pairs(input_path, max_year2=max_pair_year)
# Solve the index only on COMPLETE calendar years: exclude the partial
# current year, whose thin repeat-sale set yields wild betas. The index is
# still forward-filled/trend-extrapolated to CURRENT_YEAR below, so 2026
# follows the established trend rather than a partial-year spike. Backtest
# passes a stricter max_pair_year, which is honoured.
estimation_cap = (
max_pair_year if max_pair_year is not None else LATEST_COMPLETE_YEAR + 1
)
pairs = extract_pairs(input_path, max_year2=estimation_cap)
centroids = extract_centroids(postcodes_path or input_path)
min_year = int(pairs["year1"].min())
max_year = CURRENT_YEAR
hedonic_idx = compute_hedonic_index(
input_path, min_year, max_year, max_sale_year=max_pair_year
input_path, min_year, max_year, max_sale_year=estimation_cap
)
# Precompute hierarchy

View file

@ -6,6 +6,13 @@ import numpy as np
import polars as pl
CURRENT_YEAR = 2026
# Latest COMPLETE calendar year. The current year's transactions are only
# partially reported (Land Registry lags ~2-3 months), so a sector's thin
# partial-year repeat-sale set produces wild index betas (e.g. +334% in a
# single sector). The index is SOLVED only on complete years (<= this) and
# forward-filled/trend-extrapolated to CURRENT_YEAR, so current-value
# projections follow the established trend instead of a partial-year spike.
LATEST_COMPLETE_YEAR = CURRENT_YEAR - 1
_today = date.today()
CURRENT_FRAC_YEAR = _today.year + (_today.month - 1) / 12

View file

@ -15,11 +15,24 @@ SCHOOL_GROUPS = {
}
# Age thresholds for deciding which phase(s) a school serves. A school serves
# PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age
# children if its statutory highest age is >= 12. All-through (e.g. 3-18) and
# middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in
# both the primary and the secondary proximity metrics — Ofsted's coarse "Ofsted
# phase" labels such schools as just "Secondary", which previously hid them from
# every postcode's primary-school count.
PRIMARY_MAX_AGE = 10
SECONDARY_MIN_AGE = 12


def classify_good_plus_schools(
    ofsted: pl.DataFrame, open_urns: set[int] | None = None
) -> pl.DataFrame:
    """Label good+/outstanding primary & secondary schools for proximity counts.

    Derives a grade ("1" = outstanding, "2" = good) and one or two proximity
    ``category`` rows per school, returning a ``(postcode, category)`` frame.

    Grade. Schools with a recent GRADED inspection carry a 1-4 grade in "Latest
    OEIF overall effectiveness". Schools last seen under an UNGRADED (Section 8)
    inspection or the post-2024 report-card framework have null/"Not judged"
    there even when they are demonstrably good — their status lives in
    "Ungraded inspection overall outcome" ("School remains Good"/"School remains
    Outstanding"). Filtering on the graded column alone dropped ~7,000 genuinely
    good/outstanding schools, so the ungraded outcome is used as a fallback, but
    ONLY when there is no usable graded result (null/"Not judged"), so a genuine
    grade 3/4 is never overridden.

    Outcomes flagged "(Concerns)" are NOT treated as good+: a "remains Good
    (Concerns)" outcome signals inspectors found issues warranting an earlier
    graded re-inspection, so marketing it as a good+ school is misleading.

    Phase. Only rows whose "Ofsted phase" is "Primary" or "Secondary" are
    considered. When both statutory-age columns are present, a school serves
    primary if its lowest age is <= PRIMARY_MAX_AGE and secondary if its highest
    age is >= SECONDARY_MIN_AGE (so all-through and middle schools count toward
    BOTH); a null age on a row falls back to that row's "Ofsted phase" label.
    When the age columns are absent the label alone decides.

    Args:
        ofsted: Ofsted frame with "Postcode", "Ofsted phase", "Latest OEIF
            overall effectiveness" and "Ungraded inspection overall outcome";
            "URN", "Statutory lowest age" and "Statutory highest age" are
            optional.
        open_urns: If given (and ``ofsted`` has a "URN" column), schools whose
            URN is not in this set are dropped so closed/merged schools are not
            counted.

    Returns:
        Frame with columns ``postcode`` and ``category``; category is one of
        outstanding_primary, good_primary, outstanding_secondary,
        good_secondary. Primary rows come first, then secondary rows.
    """
    # Cast to Utf8 so the string predicates below are well-defined even if a
    # column happens to be entirely null (read back as a Null dtype).
    oeif = pl.col("Latest OEIF overall effectiveness").cast(pl.Utf8, strict=False)
    ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
    no_usable_grade = oeif.is_null() | (oeif == "Not judged")
    has_concern = ungraded.str.contains(r"\(Concerns\)")
    remains_outstanding = (
        ungraded.str.starts_with("School remains Outstanding") & ~has_concern
    )
    remains_good = ungraded.str.starts_with("School remains Good") & ~has_concern

    graded = (
        ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
        .with_columns(
            # A real graded 1/2 wins; the ungraded outcome is only a fallback.
            pl.when(oeif.is_in(["1", "2"]))
            .then(oeif)
            .when(no_usable_grade & remains_outstanding)
            .then(pl.lit("1"))
            .when(no_usable_grade & remains_good)
            .then(pl.lit("2"))
            .otherwise(None)
            .alias("_ofsted_grade")
        )
        .filter(pl.col("_ofsted_grade").is_not_null())
    )

    # Drop schools no longer open (closed/merged) when the GIAS open register is
    # provided, so stale Ofsted "latest inspection" rows are not counted.
    if open_urns is not None and "URN" in graded.columns:
        graded = graded.filter(pl.col("URN").is_in(list(open_urns)))

    # Decide which phase(s) each school serves.
    labelled_primary = pl.col("Ofsted phase") == "Primary"
    labelled_secondary = pl.col("Ofsted phase") == "Secondary"
    if {"Statutory lowest age", "Statutory highest age"} <= set(graded.columns):
        low = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
        high = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
        serves_primary = (
            pl.when(low.is_not_null())
            .then(low <= PRIMARY_MAX_AGE)
            .otherwise(labelled_primary)
        )
        serves_secondary = (
            pl.when(high.is_not_null())
            .then(high >= SECONDARY_MIN_AGE)
            .otherwise(labelled_secondary)
        )
    else:
        serves_primary = labelled_primary
        serves_secondary = labelled_secondary

    graded = graded.with_columns(
        serves_primary.alias("_serves_primary"),
        serves_secondary.alias("_serves_secondary"),
    )

    # Good+ groups include both grade variants; outstanding groups count grade 1.
    # A school can yield up to two rows (primary and secondary).
    per_phase = [
        graded.filter(pl.col(flag)).with_columns(
            pl.when(pl.col("_ofsted_grade") == "1")
            .then(pl.lit(f"outstanding_{phase}"))
            .otherwise(pl.lit(f"good_{phase}"))
            .alias("category")
        )
        for phase, flag in (
            ("primary", "_serves_primary"),
            ("secondary", "_serves_secondary"),
        )
    ]
    return pl.concat(per_phase).select(
        pl.col("Postcode").alias("postcode"),
        "category",
    )
@ -85,12 +138,24 @@ def main():
parser.add_argument(
"--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
)
parser.add_argument(
"--gias",
type=Path,
default=None,
help="GIAS open-school parquet; if given, only currently-open schools are counted",
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet path"
)
args = parser.parse_args()
ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted))
open_urns: set[int] | None = None
if args.gias is not None:
gias_urns = pl.read_parquet(args.gias).select("urn").to_series().drop_nulls()
open_urns = set(gias_urns.cast(pl.Int64, strict=False).to_list())
print(f"GIAS open register: {len(open_urns):,} open school URNs")
ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted), open_urns=open_urns)
if ofsted.is_empty():
raise ValueError("No good+ primary/secondary Ofsted schools found")

View file

@ -252,14 +252,15 @@ def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)}
def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
def test_serious_rollup_avg_yr_equals_sum_of_components(tmp_path):
# Two SERIOUS types occur in DISJOINT years for one postcode: Burglary only in
# 2014, Robbery only in 2024 (each a single full month -> 12/yr). The headline
# "Serious crime (avg/yr)" must equal the mean of the "Serious crime (by year)"
# bars (which span the UNION of years any serious type occurred), NOT the sum
# of the per-type means. Summing per-type means divides each type by its OWN
# years-present (1 each) -> 12 + 12 = 24; the consistent rollup divides the
# per-year serious total by the years any serious type occurred (2) -> 12.
# "Serious crime (avg/yr)" must equal the SUM of its component (avg/yr) columns
# (Burglary 12 + Robbery 12 = 24), so the rollup is always the sum of the parts
# shown beside it and can never fall below a single component. (The previous
# union-years-present mean would have divided the per-year serious total by the
# 2 years any serious type occurred, giving a misleading 12: no higher than
# either component on its own, and only half the sum of the two.)
units = tmp_path / "units"
_write_boundaries(
units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
@ -274,13 +275,16 @@ def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
avg = pl.read_parquet(output).row(0, named=True)
# The precomputed rollup headline exists and equals the mean of the bars (12),
# not the sum of the per-type avg/yr values (Burglary 12 + Robbery 12 = 24).
assert "Serious crime (avg/yr)" in avg
assert avg["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
assert avg["Robbery (avg/yr)"] == pytest.approx(12.0, abs=0.05)
assert avg["Serious crime (avg/yr)"] == pytest.approx(12.0, abs=0.05)
# Rollup == sum of its component (avg/yr) columns.
assert avg["Serious crime (avg/yr)"] == pytest.approx(24.0, abs=0.05)
assert avg["Serious crime (avg/yr)"] == pytest.approx(
avg["Burglary (avg/yr)"] + avg["Robbery (avg/yr)"], abs=0.05
)
# The by-year rollup series remains the per-year sum of the component bars.
serious_bars = {
p["year"]: p["count"]
for p in pl.read_parquet(by_year).row(0, named=True)["Serious crime (by year)"]
@ -289,8 +293,6 @@ def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
2014: pytest.approx(12.0, abs=0.05),
2024: pytest.approx(12.0, abs=0.05),
}
mean_of_bars = sum(serious_bars.values()) / len(serious_bars)
assert avg["Serious crime (avg/yr)"] == pytest.approx(mean_of_bars, abs=0.05)
def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):

View file

@ -34,6 +34,7 @@ from pipeline.transform.merge import (
_split_normal_outputs,
_tree_density_by_postcode,
_validate_lad_source_coverage,
_validate_lsoa_source_coverage,
_validate_postcode_feature_output,
_validate_property_postcodes,
)
@ -297,7 +298,7 @@ def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
joined = _join_area_side_tables(
base,
iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
crime=crime,
median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
@ -355,7 +356,7 @@ def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
joined = _join_area_side_tables(
base,
iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
crime=crime,
median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
@ -531,7 +532,6 @@ def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
tmp_path,
) -> None:
iod_path = tmp_path / "iod.parquet"
ethnicity_path = tmp_path / "ethnicity.parquet"
rental_path = tmp_path / "rental.parquet"
pl.DataFrame(
{
@ -547,19 +547,15 @@ def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
],
}
).write_parquet(iod_path)
pl.DataFrame(
{"Geography_code": ["E08000016", "E06000053", "E09000001"]}
).write_parquet(ethnicity_path)
pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]}).write_parquet(
rental_path
)
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
_validate_lad_source_coverage(iod_path, rental_path)
def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
iod_path = tmp_path / "iod.parquet"
ethnicity_path = tmp_path / "ethnicity.parquet"
rental_path = tmp_path / "rental.parquet"
pl.DataFrame(
{
@ -567,13 +563,41 @@ def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) ->
"Local Authority District name (2024)": ["Barnsley"],
}
).write_parquet(iod_path)
pl.DataFrame({"Geography_code": ["E08000016"]}).write_parquet(ethnicity_path)
pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]}).write_parquet(
rental_path
)
with pytest.raises(ValueError, match="Rental data is missing"):
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
_validate_lad_source_coverage(iod_path, rental_path)
def test_validate_lsoa_source_coverage_allows_full_ethnicity_coverage(
    tmp_path,
) -> None:
    """Validation passes when every IoD LSOA has an ethnicity row."""
    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"

    iod_lsoas = ["E01000001", "E01000002"]
    pl.DataFrame({"LSOA code (2021)": iod_lsoas}).write_parquet(iod_path)
    # Ethnicity may carry extra LSOAs (e.g. property-less ones); only the IoD
    # LSOAs are required to all be present.
    pl.DataFrame({"lsoa21": [*iod_lsoas, "E01000003"]}).write_parquet(
        ethnicity_path
    )

    _validate_lsoa_source_coverage(iod_path, ethnicity_path)
def test_validate_lsoa_source_coverage_rejects_missing_lsoa(tmp_path) -> None:
    """Validation raises when an IoD LSOA has no ethnicity row."""
    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"

    iod_frame = pl.DataFrame({"LSOA code (2021)": ["E01000001", "E01000002"]})
    iod_frame.write_parquet(iod_path)
    # E01000002 is deliberately absent from the ethnicity table.
    ethnicity_frame = pl.DataFrame({"lsoa21": ["E01000001"]})
    ethnicity_frame.write_parquet(ethnicity_path)

    with pytest.raises(ValueError, match="Ethnicity data is missing LSOA coverage"):
        _validate_lsoa_source_coverage(iod_path, ethnicity_path)
def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
@ -1027,7 +1051,7 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
joined = _join_area_side_tables(
base,
iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
crime=crime,
median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
@ -1427,7 +1451,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
"Property type": ["Terraced", None],
"Leasehold/Freehold": ["Leasehold", None],
"Last known price": [500_000, None],
"Street tree density percentile": [42.0, 42.0],
"Tree canopy density percentile": [42.0, 42.0],
# Overlay columns: row 0 is a matched listing, row 1 is unmatched, row none.
"_actual_listing_url": ["url0", "url1"],
"_actual_asking_price": [600_000, 700_000],
@ -1458,7 +1482,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
"Property type": pl.Utf8,
"Leasehold/Freehold": pl.Utf8,
"Last known price": pl.Int64,
"Street tree density percentile": pl.Float32,
"Tree canopy density percentile": pl.Float32,
"_actual_listing_url": pl.Utf8,
"_actual_asking_price": pl.Int64,
"_actual_asking_price_per_sqm": pl.Int32,
@ -1496,7 +1520,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
assert finalized["Property type"].to_list() == ["Terraced", "Flats/Maisonettes"]
assert finalized["Leasehold/Freehold"].to_list() == ["Freehold", "Leasehold"]
# Postcode-level feature carried through to both matched and unmatched rows.
assert finalized["Street tree density percentile"].to_list() == [42.0, 42.0]
assert finalized["Tree canopy density percentile"].to_list() == [42.0, 42.0]
# Match status reflects historical context availability.
assert finalized["Historical property match status"].to_list() == [
"matched",
@ -1524,7 +1548,7 @@ def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
"Property type": ["Terraced", "Terraced"],
"Leasehold/Freehold": ["Leasehold", "Leasehold"],
"Last known price": [500_000, 480_000],
"Street tree density percentile": [42.0, 42.0],
"Tree canopy density percentile": [42.0, 42.0],
# Same listing URL on both collapsed rows — the fan-out to fix.
"_actual_listing_url": ["url0", "url0"],
"_actual_asking_price": [600_000, 600_000],
@ -1555,7 +1579,7 @@ def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
"Property type": pl.Utf8,
"Leasehold/Freehold": pl.Utf8,
"Last known price": pl.Int64,
"Street tree density percentile": pl.Float32,
"Tree canopy density percentile": pl.Float32,
"_actual_listing_url": pl.Utf8,
"_actual_asking_price": pl.Int64,
"_actual_asking_price_per_sqm": pl.Int32,

View file

@ -42,7 +42,20 @@ def test_ungraded_remains_good_is_recovered_when_no_graded_result():
rows = [
_school("Primary", None, "School remains Good", "AA1 1AA"),
_school("Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"),
# "(Concerns)"/"(Improving)" variants are still good+.
# "(Improving)" is still good+ ...
_school("Primary", None, "School remains Good (Improving) - S5 Next", "AA1 1AE"),
]
assert _classify(rows) == {
("AA1 1AA", "good_primary"),
("AA1 1AB", "outstanding_secondary"),
("AA1 1AE", "good_primary"),
}
def test_ungraded_concerns_are_not_good_plus():
# "(Concerns)" outcomes signal issues warranting earlier re-inspection and
# must NOT be counted as good+ schools.
rows = [
_school("Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"),
_school(
"Secondary",
@ -51,12 +64,7 @@ def test_ungraded_remains_good_is_recovered_when_no_graded_result():
"AA1 1AD",
),
]
assert _classify(rows) == {
("AA1 1AA", "good_primary"),
("AA1 1AB", "outstanding_secondary"),
("AA1 1AC", "good_primary"),
("AA1 1AD", "outstanding_secondary"),
}
assert _classify(rows) == set()
def test_ungraded_non_good_outcomes_are_excluded():
@ -80,3 +88,52 @@ def test_non_primary_secondary_phases_excluded():
_school("Not applicable", "2", None),
]
assert _classify(rows) == set()
def _aged_school(phase, oeif, low, high, postcode="AA1 1AA"):
return {
"Postcode": postcode,
"Ofsted phase": phase,
"Latest OEIF overall effectiveness": oeif,
"Ungraded inspection overall outcome": None,
"URN": 100000,
"Statutory lowest age": low,
"Statutory highest age": high,
}
def test_all_through_school_counts_toward_both_primary_and_secondary():
    """An all-through school must be counted under both phases."""
    # Ofsted labels an age 3-18 school "Secondary", yet it teaches primary-age
    # children as well, so it has to appear in BOTH metrics.
    all_through = _aged_school("Secondary", "2", 3, 18, "AA1 1AA")
    expected = {
        ("AA1 1AA", "good_primary"),
        ("AA1 1AA", "good_secondary"),
    }
    assert _classify([all_through]) == expected
def test_age_ranges_assign_single_phase_for_standard_schools():
    """Statutory age ranges decide which phase(s) a school counts toward."""
    primary_only = _aged_school("Primary", "1", 4, 11, "AA1 1AA")
    secondary_only = _aged_school("Secondary", "2", 11, 16, "AA1 1AB")
    # A 9-13 middle school straddles both phases, so it counts in both.
    middle = _aged_school("Secondary", "1", 9, 13, "AA1 1AC")

    expected = {
        ("AA1 1AA", "outstanding_primary"),
        ("AA1 1AB", "good_secondary"),
        ("AA1 1AC", "outstanding_primary"),
        ("AA1 1AC", "outstanding_secondary"),
    }
    assert _classify([primary_only, secondary_only, middle]) == expected
def test_closed_schools_excluded_when_open_register_given():
    """Schools whose URN is missing from ``open_urns`` are dropped."""
    open_school = _aged_school("Primary", "1", 4, 11, "AA1 1AA")
    closed_school = _aged_school("Secondary", "2", 11, 16, "AA1 1AB")
    open_school["URN"] = 111
    closed_school["URN"] = 222

    frame = pl.DataFrame([open_school, closed_school])
    result = classify_good_plus_schools(frame, open_urns={111})
    pairs = set()
    for record in result.to_dicts():
        pairs.add((record["postcode"], record["category"]))

    # URN 222 is not in the open register, so it is dropped.
    assert pairs == {("AA1 1AA", "outstanding_primary")}

View file

@ -33,6 +33,14 @@ DROP_CATEGORIES = {
"emergency/water_tank",
"leisure/bleachers",
"leisure/schoolyard",
# Park "furniture" / incidental features — not parks; they massively
# inflated the Park count (picnic_table ~15k, outdoor_seating ~5.8k).
"leisure/bandstand",
"leisure/bird_hide",
"leisure/firepit",
"leisure/outdoor_seating",
"leisure/picnic_table",
"leisure/wildlife_hide",
"public_transport/pay_scale_area",
"shop/taxi",
"amenity/feeding_place",
@ -182,9 +190,13 @@ DROP_CATEGORIES = {
"tourism/village_sign",
"tourism/wilderness_hut",
"tourism/yes",
# Public transport (from NaPTAN instead)
# Public transport (from NaPTAN instead). public_transport/platform is the
# EXCEPTION: it is mapped to "Bus stop" (see _CATEGORIES) to fill NaPTAN's
# authority-level bus-stop gaps (e.g. West Cumbria, North Norfolk, where
# NaPTAN has zero stops), then deduped against NaPTAN so covered areas keep
# a single stop. stop_position is left dropped to avoid double-counting the
# same stop (platform + stop_position).
"public_transport/entrance",
"public_transport/platform",
"public_transport/station",
"public_transport/stop_position",
# Education amenities — schools come from GIAS instead. OSM coverage for
@ -301,16 +313,13 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"🌳",
[
"leisure/park",
# leisure/garden is dominated by private residential gardens (98%+
# unnamed); it is name-gated in transform() via REQUIRE_NAME_CATEGORIES
# so only named (public/notable) gardens count as a Park.
"leisure/garden",
"leisure/common",
"leisure/nature_reserve",
"leisure/dog_park",
"leisure/bandstand",
"leisure/bird_hide",
"leisure/firepit",
"leisure/outdoor_seating",
"leisure/picnic_table",
"leisure/wildlife_hide",
],
),
(
@ -329,6 +338,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
[
"leisure/sports_centre",
"leisure/sports_hall",
# leisure/pitch (73% of the old bucket) and leisure/swimming_pool
# (98% unnamed = private/garden pools) are name-gated in transform()
# via REQUIRE_NAME_CATEGORIES so only named public facilities count.
"leisure/pitch",
"leisure/track",
"leisure/golf_course",
@ -1123,8 +1135,36 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"amenity/townhall",
],
),
# ── Public transport (OSM supplement to NaPTAN) ──────────
# OSM bus platforms fill NaPTAN's authority-level coverage gaps. Same group
# / friendly name / emoji as the NaPTAN "Bus stop" rows so they merge into
# one metric; OSM platforms that duplicate a NaPTAN stop are deduped in
# transform() (osm_stops_near_naptan).
(
"Public Transport",
"Bus stop",
"🚏",
[
"public_transport/platform",
],
),
]
# Raw OSM tags whose UNNAMED instances are dropped before category mapping.
# These tags are overwhelmingly private/incidental when unnamed: a nameless
# `leisure/garden` is a private residential garden (not a public park), and a
# nameless `leisure/pitch`/`swimming_pool` is a school cage or back-garden pool.
# Keeping only named instances stops them inflating Park / Sports Centre counts
# while preserving genuinely public, notable facilities (which carry a name).
# "Unnamed" means the `name` value is null or blank after stripping whitespace
# (see the filter in transform()).
# NOTE(review): `leisure/practice_pitch` and `leisure/paddling_pool` are gated
# too, presumably as variants of pitch / swimming_pool with the same
# private-when-unnamed skew — confirm against the tag statistics.
REQUIRE_NAME_CATEGORIES = {
"leisure/garden",
"leisure/pitch",
"leisure/practice_pitch",
"leisure/swimming_pool",
"leisure/paddling_pool",
}
# Build flat lookup: OSM category → (group, friendly_name, emoji)
CATEGORY_MAP: dict[str, tuple[str, str, str]] = {
osm_key: (group, name, emoji)
@ -1431,18 +1471,25 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
)
def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
def transform_gias_schools(
gias_path: Path, ofsted_path: Path, boundary_path: Path
) -> pl.LazyFrame:
"""Convert the GIAS register parquet into POI rows with school metadata.
Ofsted ratings are joined by URN so each school carries its latest OEIF
overall effectiveness grade (Outstanding/Good/Requires improvement/
Inadequate/Not judged), surfaced in the map popup."""
Inadequate/Not judged), surfaced in the map popup.
Clipped to England (like NaPTAN/GEOLYTIX) because the GIAS register is
GB-wide, so ~1,400 Welsh/Scottish/IoM schools would otherwise leak into the
England-only Education layer (and depress apparent Ofsted coverage, since
Wales is inspected by Estyn, not Ofsted)."""
icon_category_expr = _school_icon_category_expr()
emoji_expr = icon_category_expr.replace_strict(SCHOOL_ICON_CATEGORIES)
ofsted = _load_ofsted_ratings(ofsted_path)
# category mirrors icon_category so the dashboard renders one toggle per
# school type (Nursery / Primary / Secondary / Sixth form / University /…)
# instead of bundling every GIAS row under a single "School" pill.
return (
schools = (
pl.scan_parquet(gias_path)
.join(ofsted, on="urn", how="left")
.select(
@ -1477,7 +1524,14 @@ def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
pl.col("head_name").alias("school_head_name"),
pl.col("ofsted_rating").alias("school_ofsted_rating"),
)
.collect()
)
mask = in_england_mask(
boundary_path,
schools["lat"].to_numpy(),
schools["lng"].to_numpy(),
)
return schools.filter(pl.Series(mask)).lazy()
# OSM convenience-format stores that GEOLYTIX also covers (Tesco Express,
@ -1511,6 +1565,45 @@ def _significant_tokens(name: str | None) -> set[str]:
return tokens
# OSM bus platforms are added to "Bus stop" to fill NaPTAN's authority-level
# gaps. Where NaPTAN already has a stop within this radius the area is covered,
# so the colocated OSM platform is dropped to avoid double-counting; OSM
# platforms with no nearby NaPTAN stop (the gaps) are kept.
# The radius is in metres; it is the default `radius_m` of
# osm_stops_near_naptan(), which compares it against projected distances.
BUS_STOP_DEDUP_RADIUS_M = 50.0
def osm_stops_near_naptan(
    osm_stops: pl.DataFrame,
    naptan_stops: pl.DataFrame,
    radius_m: float = BUS_STOP_DEDUP_RADIUS_M,
) -> list[str]:
    """Return OSM bus-stop ids within ``radius_m`` of any NaPTAN bus stop.

    Purely spatial (no name match): in NaPTAN-covered areas the OSM platform is
    a duplicate and is dropped; only OSM platforms that fill a NaPTAN gap (no
    NaPTAN stop within the radius) survive.

    Args:
        osm_stops: OSM bus stops; needs ``id``/``lat``/``lng`` columns.
        naptan_stops: NaPTAN bus stops; needs ``lat``/``lng`` columns.
        radius_m: Match radius in metres (inclusive).

    Returns:
        The ids of the OSM stops whose nearest NaPTAN stop is at most
        ``radius_m`` away, in input order. Empty if either frame is empty.
    """
    if osm_stops.is_empty() or naptan_stops.is_empty():
        return []

    from scipy.spatial import cKDTree

    naptan_lat = naptan_stops["lat"].to_numpy().astype(float)
    naptan_lng = naptan_stops["lng"].to_numpy().astype(float)
    osm_lat = osm_stops["lat"].to_numpy().astype(float)
    osm_lng = osm_stops["lng"].to_numpy().astype(float)
    osm_ids = osm_stops["id"].to_list()

    # Flat (equirectangular) projection to approximate metres: longitude is
    # scaled by the cosine of the mean latitude of ALL points so both frames
    # share one projection and distances between them are comparable.
    mean_lat = float(np.mean(np.concatenate([naptan_lat, osm_lat])))
    cos_lat = float(np.cos(np.radians(mean_lat)))

    def _to_metres(lat, lng):
        # One projection for both frames so the two cannot drift apart.
        return np.column_stack([lng * cos_lat * 111_320.0, lat * 110_540.0])

    # Distance from every OSM stop to its single nearest NaPTAN stop.
    tree = cKDTree(_to_metres(naptan_lat, naptan_lng))
    nearest_m, _ = tree.query(_to_metres(osm_lat, osm_lng), k=1)

    return [
        stop_id
        for stop_id, distance in zip(osm_ids, nearest_m)
        if distance <= radius_m
    ]
def osm_groceries_colocated_with_geolytix(
osm_groceries: pl.DataFrame,
geolytix: pl.DataFrame,
@ -1601,6 +1694,19 @@ def transform(
# Drop unwanted categories
lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
# Drop UNNAMED instances of private-dominated tags (gardens, pitches,
# pools) so they don't inflate Park / Sports Centre proximity counts. Done
# while `category` still holds the raw OSM key, before the friendly mapping.
lf = lf.filter(
~(
pl.col("category").is_in(list(REQUIRE_NAME_CATEGORIES))
& (
pl.col("name").is_null()
| (pl.col("name").cast(pl.String).str.strip_chars() == "")
)
)
)
# Build lookup expressions from the 3-tuple mapping
group_mapping = {k: v[0] for k, v in CATEGORY_MAP.items()}
name_mapping = {k: v[1] for k, v in CATEGORY_MAP.items()}
@ -1665,11 +1771,37 @@ def transform(
~((pl.col("group") == "Groceries") & pl.col("id").is_in(duplicate_ids))
)
# Drop OSM bus platforms that duplicate a NaPTAN bus stop, so the OSM
# supplement only adds stops in NaPTAN's coverage gaps (no double-count in
# covered areas). OSM bus stops carry id-prefix "n"/"a" so they never clash
# with NaPTAN ATCO ids.
osm_bus_stops = (
lf.filter((pl.col("group") == "Public Transport") & (pl.col("category") == "Bus stop"))
.select("id", "lat", "lng")
.collect(engine="streaming")
)
naptan_bus_stops = naptan_df.filter(pl.col("category") == "Bus stop")
covered_bus_ids = osm_stops_near_naptan(osm_bus_stops, naptan_bus_stops)
kept_osm = osm_bus_stops.height - len(covered_bus_ids)
print(
f"OSM bus platforms: {osm_bus_stops.height:,} total, dropping "
f"{len(covered_bus_ids):,} that duplicate a NaPTAN stop, keeping "
f"{kept_osm:,} to fill NaPTAN gaps"
)
if covered_bus_ids:
lf = lf.filter(
~(
(pl.col("group") == "Public Transport")
& (pl.col("category") == "Bus stop")
& pl.col("id").is_in(covered_bus_ids)
)
)
frames = [
lf,
naptan,
grocery_pois.lazy(),
transform_gias_schools(gias_path, ofsted_path),
transform_gias_schools(gias_path, ofsted_path, boundary_path),
]
return pl.concat(frames, how="diagonal_relaxed")