Improve data
This commit is contained in:
parent
b4d66a28c1
commit
85da1941aa
31 changed files with 901 additions and 319 deletions
|
|
@ -1,4 +1,22 @@
|
|||
"""Download Census 2021 ethnic group (TS021) by LSOA.
|
||||
|
||||
Downloads the 20-category ethnic-group breakdown (TS021, classification
|
||||
C2021_ETH_20) from the NOMIS API at LSOA 2021 granularity, folds the 19 detailed
|
||||
leaf categories into our 6 output buckets, and emits one row per LSOA with the
|
||||
percentage in each bucket.
|
||||
|
||||
Sourcing at LSOA (~33,755 England areas) rather than Local Authority (~319) is a
|
||||
~100x granularity gain with no change to the 6-bucket output schema: two very
|
||||
different neighbourhoods in one borough no longer share an identical ethnicity
|
||||
profile. The join key downstream (merge.py) is `lsoa21`, the same key already
|
||||
used for median age and IoD.
|
||||
|
||||
Source: NOMIS (ONS Census 2021 — TS021 dataset, NM_2041_1)
|
||||
License: Open Government Licence v3.0
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
|
|
@ -6,143 +24,168 @@ import polars as pl
|
|||
|
||||
pl.Config.set_tbl_cols(-1)
|
||||
|
||||
# NOMIS API: Census 2021 TS021 (ethnic group, 20 categories) by LSOA 2021
|
||||
# (TYPE151). c2021_eth_20=1..19 selects the 19 detailed leaf categories
|
||||
# (excluding the 5 broad aggregates 1001-1005 and the 0 = Total, which we
|
||||
# re-derive ourselves). measures=20100 selects the absolute count.
|
||||
BASE_URL = (
|
||||
"https://www.nomisweb.co.uk/api/v01/dataset/NM_2041_1.data.csv"
|
||||
"?geography=TYPE151"
|
||||
"&c2021_eth_20=1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19"
|
||||
"&measures=20100"
|
||||
"&select=GEOGRAPHY_CODE,C2021_ETH_20_NAME,OBS_VALUE"
|
||||
)
|
||||
PAGE_SIZE = 25000
|
||||
|
||||
URL = "https://www.ethnicity-facts-figures.service.gov.uk/uk-population-by-ethnicity/national-and-regional-populations/regional-ethnic-diversity/latest/downloads/population-by-ethnicity-and-local-authority-2021.csv"
|
||||
|
||||
GEOGRAPHY_CODE_REPLACEMENTS = {
|
||||
# 2023 Cumberland unitary authority
|
||||
"E07000026": "E06000063", # Allerdale
|
||||
"E07000028": "E06000063", # Carlisle
|
||||
"E07000029": "E06000063", # Copeland
|
||||
# 2023 Westmorland and Furness unitary authority
|
||||
"E07000027": "E06000064", # Barrow-in-Furness
|
||||
"E07000030": "E06000064", # Eden
|
||||
"E07000031": "E06000064", # South Lakeland
|
||||
# 2023 North Yorkshire unitary authority
|
||||
"E07000163": "E06000065", # Craven
|
||||
"E07000164": "E06000065", # Hambleton
|
||||
"E07000165": "E06000065", # Harrogate
|
||||
"E07000166": "E06000065", # Richmondshire
|
||||
"E07000167": "E06000065", # Ryedale
|
||||
"E07000168": "E06000065", # Scarborough
|
||||
"E07000169": "E06000065", # Selby
|
||||
# 2023 Somerset unitary authority
|
||||
"E07000187": "E06000066", # Mendip
|
||||
"E07000188": "E06000066", # Sedgemoor
|
||||
"E07000189": "E06000066", # South Somerset
|
||||
"E07000246": "E06000066", # Somerset West and Taunton
|
||||
# Map the 19 detailed NOMIS C2021_ETH_20 leaf categories to our 6 output groups.
|
||||
# The split mirrors the previous Local-Authority source exactly:
|
||||
# * "Other Asian" routes to East/SE Asian (not South Asian). The ONS "Other
|
||||
# Asian" bucket is predominantly East/Southeast Asian (Filipino, Vietnamese,
|
||||
# Thai, Japanese, Korean, ...) rather than South Asian, so routing it here
|
||||
# avoids inflating "% South Asian". The split is approximate (the bucket also
|
||||
# holds some South Asian groups such as Sri Lankan/Nepalese).
|
||||
# Fold each of the 19 detailed NOMIS C2021_ETH_20 leaf labels into one of the six
# output groups. The keys are the exact label strings NOMIS emits in
# C2021_ETH_20_NAME; the values must be members of OUTPUT_GROUPS (a module-level
# assert enforces that the two sets are identical).
#
# "Chinese" and "Other Asian" both fold into the single "East/SE Asian" bucket:
# there is no separate "East Asian" or "South East Asian" output group, so
# using either of those labels would break the OUTPUT_GROUPS invariant and
# produce columns the downstream schema does not know about.
GROUP_MAP = {
    # White
    "White: English, Welsh, Scottish, Northern Irish or British": "White",
    "White: Irish": "White",
    "White: Gypsy or Irish Traveller": "White",
    "White: Roma": "White",
    "White: Other White": "White",
    # South Asian
    "Asian, Asian British or Asian Welsh: Indian": "South Asian",
    "Asian, Asian British or Asian Welsh: Pakistani": "South Asian",
    "Asian, Asian British or Asian Welsh: Bangladeshi": "South Asian",
    # East / Southeast Asian
    "Asian, Asian British or Asian Welsh: Chinese": "East/SE Asian",
    "Asian, Asian British or Asian Welsh: Other Asian": "East/SE Asian",
    # Black
    "Black, Black British, Black Welsh, Caribbean or African: African": "Black",
    "Black, Black British, Black Welsh, Caribbean or African: Caribbean": "Black",
    "Black, Black British, Black Welsh, Caribbean or African: Other Black": "Black",
    # Mixed
    "Mixed or Multiple ethnic groups: White and Asian": "Mixed",
    "Mixed or Multiple ethnic groups: White and Black African": "Mixed",
    "Mixed or Multiple ethnic groups: White and Black Caribbean": "Mixed",
    "Mixed or Multiple ethnic groups: Other Mixed or Multiple ethnic groups": "Mixed",
    # Other
    "Other ethnic group: Arab": "Other",
    "Other ethnic group: Any other ethnic group": "Other",
}
|
||||
|
||||
# The 6 output groups, in a fixed order so the largest-remainder rounding below
|
||||
# is deterministic regardless of pivot column ordering.
|
||||
OUTPUT_GROUPS = ["White", "South Asian", "East/SE Asian", "Black", "Mixed", "Other"]
|
||||
assert set(GROUP_MAP.values()) == set(OUTPUT_GROUPS), (
|
||||
"GROUP_MAP values must be exactly the OUTPUT_GROUPS"
|
||||
)
|
||||
|
||||
|
||||
def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame:
|
||||
# Use the detailed 19+1 breakdown to get sub-categories for Asian ethnicity,
|
||||
# then aggregate back to the broad groups plus a South Asian / East/SE Asian
|
||||
# split (Indian/Pakistani/Bangladeshi vs Chinese + other East/SE Asian).
|
||||
detailed = df.filter(
|
||||
(pl.col("Ethnicity_type") == "ONS 2021 19+1") & (pl.col("Ethnicity") != "All")
|
||||
"""Fold the 19 NOMIS leaf categories into 6-bucket percentages per LSOA.
|
||||
|
||||
`df` is the long-format NOMIS download with columns GEOGRAPHY_CODE,
|
||||
C2021_ETH_20_NAME (the detailed leaf label) and OBS_VALUE (a count). A
|
||||
missing/extra/relabelled leaf category would silently drop people from the
|
||||
denominator, so we validate the category set against GROUP_MAP first and
|
||||
fail loudly otherwise.
|
||||
"""
|
||||
found = set(df["C2021_ETH_20_NAME"].unique().to_list())
|
||||
expected = set(GROUP_MAP)
|
||||
if found != expected:
|
||||
missing = sorted(expected - found)
|
||||
unexpected = sorted(found - expected)
|
||||
raise ValueError(
|
||||
"Census ethnic-group categories do not match the expected NOMIS "
|
||||
"TS021 C2021_ETH_20 leaf set.\n"
|
||||
f" expected {len(expected)} categories, found {len(found)}\n"
|
||||
f" missing: {missing}\n"
|
||||
f" unexpected: {unexpected}\n"
|
||||
"Refusing to compute percentages against an unrecognised breakdown."
|
||||
)
|
||||
|
||||
# Map each leaf to its output group and sum counts per (LSOA, group). Summing
|
||||
# counts (not rounded percentages) keeps the denominator exact.
|
||||
grouped = (
|
||||
df.with_columns(
|
||||
pl.col("C2021_ETH_20_NAME").replace_strict(GROUP_MAP).alias("group"),
|
||||
pl.col("OBS_VALUE").cast(pl.Float64, strict=False).alias("_count"),
|
||||
)
|
||||
.group_by("GEOGRAPHY_CODE", "group")
|
||||
.agg(pl.col("_count").sum())
|
||||
)
|
||||
wide = grouped.pivot(on="group", index="GEOGRAPHY_CODE", values="_count").rename(
|
||||
{"GEOGRAPHY_CODE": "lsoa21"}
|
||||
)
|
||||
|
||||
# Map detailed categories to our output groups
|
||||
group_map = {
|
||||
# White
|
||||
"White British": "White",
|
||||
"White Irish": "White",
|
||||
"Gypsy Or Irish Traveller": "White",
|
||||
"Roma": "White",
|
||||
"Any Other White Background": "White",
|
||||
# South Asian
|
||||
"Indian": "South Asian",
|
||||
"Pakistani": "South Asian",
|
||||
"Bangladeshi": "South Asian",
|
||||
# East / Southeast Asian. The ONS "Any Other Asian Background" bucket is
|
||||
# predominantly East/Southeast Asian (Filipino, Vietnamese, Thai,
|
||||
# Japanese, Korean, ...) rather than South Asian, so route it here rather
|
||||
# than inflating "% South Asian". The split is approximate (the ONS
|
||||
# bucket also holds some South Asian groups such as Sri Lankan/Nepalese).
|
||||
"Chinese": "East/SE Asian",
|
||||
"Any Other Asian Background": "East/SE Asian",
|
||||
# Black
|
||||
"Black African": "Black",
|
||||
"Black Caribbean": "Black",
|
||||
"Any Other Black Background": "Black",
|
||||
# Mixed
|
||||
"Mixed White And Asian": "Mixed",
|
||||
"Mixed White And Black African": "Mixed",
|
||||
"Mixed White And Black Caribbean": "Mixed",
|
||||
"Any Other Mixed/Multiple Ethnic Background": "Mixed",
|
||||
# Other
|
||||
"Arab": "Other",
|
||||
"Any Other Ethnic Background": "Other",
|
||||
}
|
||||
# A group with no people in an LSOA is absent from the long rows, so the pivot
|
||||
# leaves a null; treat it as 0 before normalising.
|
||||
wide = wide.with_columns(pl.col(OUTPUT_GROUPS).fill_null(0.0))
|
||||
|
||||
detailed = detailed.with_columns(
|
||||
pl.col("Ethnicity").replace_strict(group_map).alias("group"),
|
||||
pl.col("Geography_code")
|
||||
.replace(GEOGRAPHY_CODE_REPLACEMENTS)
|
||||
.alias("output_geography_code"),
|
||||
pl.col("Ethnic Population").cast(pl.Float64, strict=False).alias("_population"),
|
||||
)
|
||||
|
||||
# Sum counts, not rounded percentages, so old districts can be safely
|
||||
# recombined into their current unitary authorities.
|
||||
grouped = detailed.group_by("output_geography_code", "group").agg(
|
||||
pl.col("_population").sum()
|
||||
)
|
||||
wide = grouped.pivot(
|
||||
on="group", index="output_geography_code", values="_population"
|
||||
).rename({"output_geography_code": "Geography_code"})
|
||||
|
||||
# Normalize so each row sums to exactly 100%, then round using largest-remainder
|
||||
# method to preserve the sum. Independent rounding of 6 values can drift ±0.3.
|
||||
group_cols = [c for c in wide.columns if c != "Geography_code"]
|
||||
row_total = sum(pl.col(c) for c in group_cols)
|
||||
# Scale each group so they sum to exactly 100
|
||||
# Normalize so each row sums to exactly 100%, then round with the
|
||||
# largest-remainder method to preserve the sum. Independent rounding of 6
|
||||
# values can drift +/-0.3.
|
||||
row_total = sum(pl.col(c) for c in OUTPUT_GROUPS)
|
||||
wide = wide.with_columns(
|
||||
[(pl.col(c) / row_total * 100.0).alias(c) for c in group_cols]
|
||||
[(pl.col(c) / row_total * 100.0).alias(c) for c in OUTPUT_GROUPS]
|
||||
)
|
||||
# Round to 1 decimal, then adjust the largest group to absorb residual
|
||||
rounded_cols = [pl.col(c).round(1).alias(c) for c in group_cols]
|
||||
wide = wide.with_columns(rounded_cols)
|
||||
rounded_sum = sum(pl.col(c) for c in group_cols)
|
||||
# Round to 1 decimal, then adjust the largest group to absorb the residual.
|
||||
wide = wide.with_columns([pl.col(c).round(1).alias(c) for c in OUTPUT_GROUPS])
|
||||
rounded_sum = sum(pl.col(c) for c in OUTPUT_GROUPS)
|
||||
residual = (100.0 - rounded_sum).round(1)
|
||||
# Find which group is largest per row and add the residual there
|
||||
largest_col = pl.concat_list(group_cols).list.arg_max()
|
||||
largest_col = pl.concat_list(OUTPUT_GROUPS).list.arg_max()
|
||||
wide = wide.with_columns(
|
||||
[
|
||||
pl.when(largest_col == i)
|
||||
.then(pl.col(c) + residual)
|
||||
.otherwise(pl.col(c))
|
||||
.alias(c)
|
||||
for i, c in enumerate(group_cols)
|
||||
for i, c in enumerate(OUTPUT_GROUPS)
|
||||
]
|
||||
)
|
||||
|
||||
# Rename columns to be descriptive
|
||||
rename_map = {col: f"% {col}" for col in wide.columns if col != "Geography_code"}
|
||||
wide = wide.rename(rename_map)
|
||||
return wide
|
||||
rename_map = {col: f"% {col}" for col in OUTPUT_GROUPS}
|
||||
return wide.rename(rename_map)
|
||||
|
||||
|
||||
def download_and_convert(output_path: Path) -> None:
|
||||
print("Downloading ethnicity data...")
|
||||
response = httpx.get(URL, follow_redirects=True, timeout=60)
|
||||
response.raise_for_status()
|
||||
print("Downloading Census 2021 ethnic group (TS021) by LSOA from NOMIS...")
|
||||
frames = []
|
||||
offset = 0
|
||||
while True:
|
||||
url = f"{BASE_URL}&recordoffset={offset}"
|
||||
response = httpx.get(url, follow_redirects=True, timeout=120)
|
||||
response.raise_for_status()
|
||||
if len(response.content) == 0:
|
||||
break
|
||||
chunk = pl.read_csv(BytesIO(response.content))
|
||||
if chunk.height == 0:
|
||||
break
|
||||
frames.append(chunk)
|
||||
print(f" Fetched {chunk.height} rows (offset={offset})")
|
||||
if chunk.height < PAGE_SIZE:
|
||||
break
|
||||
offset += PAGE_SIZE
|
||||
|
||||
df = pl.read_csv(response.content)
|
||||
print(f"Raw shape: {df.head(100)}")
|
||||
df = pl.concat(frames)
|
||||
print(f"Total rows: {df.height}")
|
||||
|
||||
# Filter to England only (E-prefixed LSOA codes); the merge joins on the
|
||||
# English postcode universe and the IoD coverage check is England-wide.
|
||||
df = df.filter(pl.col("GEOGRAPHY_CODE").str.starts_with("E"))
|
||||
|
||||
wide = _ethnicity_percentages(df)
|
||||
|
||||
print(f"Output shape: {wide.shape}")
|
||||
print(f"England LSOAs: {wide.height}")
|
||||
print(f"Columns: {wide.columns}")
|
||||
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
wide.write_parquet(output_path, compression="zstd")
|
||||
print(f"Saved to {output_path}")
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Download and convert ethnicity by local authority data"
|
||||
description="Download Census 2021 ethnic group (TS021) by LSOA"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output parquet file path"
|
||||
|
|
|
|||
|
|
@ -192,6 +192,10 @@ def _read_csv_from_zip(zip_bytes: bytes) -> pl.DataFrame:
|
|||
infer_schema_length=20000,
|
||||
null_values=_NULL_VALUES,
|
||||
truncate_ragged_lines=True,
|
||||
# Force the phone number to stay a string: schema inference reads it as
|
||||
# an integer and strips the leading 0 (e.g. 020 8427 7222 -> 2084277222),
|
||||
# making nearly every school phone number un-diallable.
|
||||
schema_overrides={"TelephoneNum": pl.String},
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,65 +1,118 @@
|
|||
import polars as pl
|
||||
import pytest
|
||||
|
||||
from pipeline.download.ethnicity import _ethnicity_percentages
|
||||
from pipeline.download.ethnicity import GROUP_MAP, OUTPUT_GROUPS, _ethnicity_percentages
|
||||
|
||||
|
||||
def test_ethnicity_percentages_recombines_predecessor_lads_by_population():
|
||||
rows = []
|
||||
for code, white, indian in [
|
||||
("E07000026", 80, 20),
|
||||
("E07000028", 10, 90),
|
||||
]:
|
||||
total = white + indian
|
||||
rows.extend(
|
||||
[
|
||||
{
|
||||
"Geography_code": code,
|
||||
"Ethnicity_type": "ONS 2021 19+1",
|
||||
"Ethnicity": "White British",
|
||||
"Ethnic Population": white,
|
||||
"Value1": white / total * 100,
|
||||
},
|
||||
{
|
||||
"Geography_code": code,
|
||||
"Ethnicity_type": "ONS 2021 19+1",
|
||||
"Ethnicity": "Indian",
|
||||
"Ethnic Population": indian,
|
||||
"Value1": indian / total * 100,
|
||||
},
|
||||
]
|
||||
)
|
||||
def _long_rows(geo: str, counts: dict[str, int]) -> list[dict]:
|
||||
"""Build NOMIS-shaped long rows for one LSOA from {leaf_label: count}.
|
||||
|
||||
result = _ethnicity_percentages(pl.DataFrame(rows))
|
||||
|
||||
cumberland = result.filter(pl.col("Geography_code") == "E06000063")
|
||||
assert cumberland.select("% White", "% South Asian").to_dicts() == [
|
||||
{"% White": 45.0, "% South Asian": 55.0}
|
||||
]
|
||||
|
||||
|
||||
def test_ethnicity_routes_any_other_asian_to_east_se_asian():
|
||||
"""'Any Other Asian Background' and 'Chinese' both fold into '% East/SE Asian'
|
||||
(not '% South Asian'), fixing the East/SE Asian undercount."""
|
||||
rows = [
|
||||
Every one of the 19 leaf categories must be present in the download (NOMIS
|
||||
emits a 0-count row when an LSOA has none), so categories not given default
|
||||
to 0 to mirror that.
|
||||
"""
|
||||
return [
|
||||
{
|
||||
"Geography_code": "E06000001",
|
||||
"Ethnicity_type": "ONS 2021 19+1",
|
||||
"Ethnicity": ethnicity,
|
||||
"Ethnic Population": pop,
|
||||
"Value1": 0.0,
|
||||
"GEOGRAPHY_CODE": geo,
|
||||
"C2021_ETH_20_NAME": label,
|
||||
"OBS_VALUE": counts.get(label, 0),
|
||||
}
|
||||
for ethnicity, pop in [
|
||||
("Chinese", 30),
|
||||
("Any Other Asian Background", 20),
|
||||
("Indian", 50),
|
||||
]
|
||||
for label in GROUP_MAP
|
||||
]
|
||||
|
||||
result = _ethnicity_percentages(pl.DataFrame(rows))
|
||||
area = result.filter(pl.col("Geography_code") == "E06000001")
|
||||
|
||||
def test_ethnicity_percentages_keyed_by_lsoa_with_six_buckets():
|
||||
df = pl.DataFrame(
|
||||
_long_rows(
|
||||
"E01000001",
|
||||
{
|
||||
"White: English, Welsh, Scottish, Northern Irish or British": 60,
|
||||
"White: Other White": 10,
|
||||
"Asian, Asian British or Asian Welsh: Indian": 20,
|
||||
"Black, Black British, Black Welsh, Caribbean or African: African": 10,
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
result = _ethnicity_percentages(df)
|
||||
|
||||
assert result.columns[0] == "lsoa21"
|
||||
assert set(result.columns) == {"lsoa21", *(f"% {g}" for g in OUTPUT_GROUPS)}
|
||||
row = result.filter(pl.col("lsoa21") == "E01000001").to_dicts()[0]
|
||||
assert row["% White"] == 70.0
|
||||
assert row["% South Asian"] == 20.0
|
||||
assert row["% Black"] == 10.0
|
||||
# Percentages always sum to exactly 100 (largest-remainder rounding).
|
||||
assert round(sum(row[f"% {g}"] for g in OUTPUT_GROUPS), 1) == 100.0
|
||||
|
||||
|
||||
def test_ethnicity_routes_other_asian_to_east_se_asian():
|
||||
"""'Other Asian' and 'Chinese' both fold into '% East/SE Asian' (not
|
||||
'% South Asian'), preserving the East/SE Asian split from the LAD source."""
|
||||
df = pl.DataFrame(
|
||||
_long_rows(
|
||||
"E01000002",
|
||||
{
|
||||
"Asian, Asian British or Asian Welsh: Chinese": 30,
|
||||
"Asian, Asian British or Asian Welsh: Other Asian": 20,
|
||||
"Asian, Asian British or Asian Welsh: Indian": 50,
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
result = _ethnicity_percentages(df)
|
||||
area = result.filter(pl.col("lsoa21") == "E01000002")
|
||||
|
||||
assert "% East/SE Asian" in result.columns
|
||||
assert "% East Asian" not in result.columns
|
||||
assert area.select("% East/SE Asian", "% South Asian").to_dicts() == [
|
||||
{"% East/SE Asian": 50.0, "% South Asian": 50.0}
|
||||
]
|
||||
|
||||
|
||||
def test_ethnicity_percentages_independent_per_lsoa():
|
||||
"""Two LSOAs get independent profiles — the LSOA granularity is the point."""
|
||||
df = pl.concat(
|
||||
[
|
||||
pl.DataFrame(
|
||||
_long_rows(
|
||||
"E01000010",
|
||||
{"White: Other White": 100},
|
||||
)
|
||||
),
|
||||
pl.DataFrame(
|
||||
_long_rows(
|
||||
"E01000011",
|
||||
{"Asian, Asian British or Asian Welsh: Pakistani": 100},
|
||||
)
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
result = _ethnicity_percentages(df).sort("lsoa21")
|
||||
|
||||
assert result["% White"].to_list() == [100.0, 0.0]
|
||||
assert result["% South Asian"].to_list() == [0.0, 100.0]
|
||||
|
||||
|
||||
def test_ethnicity_percentages_rejects_unexpected_category():
|
||||
rows = _long_rows("E01000003", {"White: Other White": 10})
|
||||
rows.append(
|
||||
{
|
||||
"GEOGRAPHY_CODE": "E01000003",
|
||||
"C2021_ETH_20_NAME": "White: A Brand New Census Category",
|
||||
"OBS_VALUE": 5,
|
||||
}
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match="do not match the expected"):
|
||||
_ethnicity_percentages(pl.DataFrame(rows))
|
||||
|
||||
|
||||
def test_ethnicity_percentages_rejects_missing_category():
|
||||
# Drop one leaf entirely: its people would vanish from the denominator.
|
||||
rows = [r for r in _long_rows("E01000004", {"White: Other White": 10}) if
|
||||
r["C2021_ETH_20_NAME"] != "Other ethnic group: Arab"]
|
||||
|
||||
with pytest.raises(ValueError, match="missing"):
|
||||
_ethnicity_percentages(pl.DataFrame(rows))
|
||||
|
|
|
|||
|
|
@ -1011,11 +1011,6 @@ def main() -> None:
|
|||
action="store_true",
|
||||
help="Skip TfL TransXChange download and conversion",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--skip-national-rail",
|
||||
action="store_true",
|
||||
help="Skip National Rail CIF download and conversion",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
output_dir: Path = args.output
|
||||
|
|
@ -1039,13 +1034,20 @@ def main() -> None:
|
|||
download_tfl_transxchange(raw_dir)
|
||||
convert_tfl_to_gtfs(raw_dir, output_dir)
|
||||
|
||||
# 3. National Rail CIF → GTFS
|
||||
if args.skip_national_rail:
|
||||
print("Skipping National Rail (--skip-national-rail)")
|
||||
else:
|
||||
cif = download_national_rail_cif(raw_dir)
|
||||
if cif is not None:
|
||||
convert_national_rail_to_gtfs(raw_dir, output_dir)
|
||||
# 3. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
|
||||
# reach the ~2,725 railway-station destinations, so a bus/TfL-only network
|
||||
# silently overstates every train commute. Missing credentials are a HARD
|
||||
# error, so a rail-less network can never ship.
|
||||
cif = download_national_rail_cif(raw_dir)
|
||||
if cif is None:
|
||||
raise RuntimeError(
|
||||
"National Rail timetable was not downloaded — set "
|
||||
"NATIONAL_RAIL_EMAIL / NATIONAL_RAIL_PASSWORD (register free at "
|
||||
"https://opendata.nationalrail.co.uk/). National Rail heavy rail is "
|
||||
"required; without it the transit network models every train journey "
|
||||
"as bus-only and overstates commute times."
|
||||
)
|
||||
convert_national_rail_to_gtfs(raw_dir, output_dir)
|
||||
|
||||
# Summary
|
||||
print()
|
||||
|
|
|
|||
|
|
@ -273,27 +273,24 @@ def _write_avg_yr(
|
|||
for type_idx, name in enumerate(ALL_CRIME_TYPES):
|
||||
data[f"{name} (avg/yr)"] = avg[:, type_idx]
|
||||
|
||||
# Serious/Minor rollup headlines, computed the SAME way as the by-year rollup
|
||||
# bars (_write_by_year/_rollup_long): sum the rollup's types per year, then
|
||||
# average over the years in which ANY of those types occurred. This keeps the
|
||||
# headline equal to the mean of the "Serious/Minor crime (by year)" bars.
|
||||
# Summing the per-type avg/yr values instead (as the merge previously did)
|
||||
# divides each type by its OWN years-present and overstates the rollup when a
|
||||
# postcode's serious/minor types occur in disjoint years.
|
||||
# Serious/Minor rollup headlines = the exact SUM of their component (avg/yr)
|
||||
# columns, so each rollup always equals the sum of the parts shown beside it
|
||||
# and can never fall below one of its own components. (Previously the rollup
|
||||
# re-derived a union-years-present mean: it divided the summed counts by the
|
||||
# number of years in which ANY component type occurred, whereas each
|
||||
# component divides by its OWN years-present. When a postcode's serious/minor
|
||||
# types occurred in disjoint years the union denominator was larger, so the
|
||||
# rollup came out smaller than the sum of its parts.) The by-year rollup
|
||||
# series in _write_by_year is likewise the per-year sum of the component
|
||||
# bars, so headline and chart both present the rollup as the sum of its parts.
|
||||
for rollup_name, rollup_types in (
|
||||
("Serious crime", SERIOUS_CRIME_TYPES),
|
||||
("Minor crime", MINOR_CRIME_TYPES),
|
||||
):
|
||||
rollup_idx = [ALL_CRIME_TYPES.index(name) for name in rollup_types]
|
||||
rollup_counts = counts[:, rollup_idx, :].sum(axis=1) # (n_postcodes, n_years)
|
||||
rollup_per_year = per_year[:, rollup_idx, :].sum(axis=1)
|
||||
rollup_years_present = np.clip(
|
||||
(rollup_counts > 0).sum(axis=1), 1, None
|
||||
).astype(np.float64)
|
||||
rollup_avg = rollup_per_year.sum(axis=1) / rollup_years_present
|
||||
data[f"{rollup_name} (avg/yr)"] = np.round(rollup_avg * norm, 1).astype(
|
||||
np.float32
|
||||
)
|
||||
data[f"{rollup_name} (avg/yr)"] = np.round(
|
||||
avg[:, rollup_idx].sum(axis=1), 1
|
||||
).astype(np.float32)
|
||||
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
pl.DataFrame(data).write_parquet(output_path, compression="zstd")
|
||||
|
|
|
|||
|
|
@ -36,6 +36,16 @@ MIN_PRICE = 10_000
|
|||
MIN_BUILD_YEAR = 1700
|
||||
MAX_BUILD_YEAR = 2030
|
||||
|
||||
# Plausibility bounds for raw EPC dimensions. EPC lodgements contain data-entry
|
||||
# errors (0 m storey heights, 116 m "interior height", 9,210 m² floor areas, 99
|
||||
# habitable rooms) that otherwise propagate verbatim into the published per-
|
||||
# property columns. Values outside these bands are nulled (treated as unknown)
|
||||
# rather than shown. Bounds are deliberately wide so only clear errors are cut.
|
||||
MIN_FLOOR_HEIGHT_M = 1.5 # below this a storey is not habitable
|
||||
MAX_FLOOR_HEIGHT_M = 6.0 # above this is a data error, not a normal storey
|
||||
MAX_TOTAL_FLOOR_AREA_M2 = 2000.0 # ~21,500 sqft; larger is a bulk/garbage record
|
||||
MAX_HABITABLE_ROOMS = 20 # dwellings above this are data errors
|
||||
|
||||
|
||||
def epc_band_to_year(band: pl.Expr) -> pl.Expr:
|
||||
"""Map an EPC construction age band to a single representative build year.
|
||||
|
|
@ -132,10 +142,28 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
|
|||
)
|
||||
.filter(pl.col("epc_address").is_not_null())
|
||||
.with_columns(
|
||||
pl.when(pl.col("number_habitable_rooms") == 0)
|
||||
.then(None)
|
||||
.otherwise(pl.col("number_habitable_rooms"))
|
||||
# Null implausible EPC dimensions so data-entry errors don't reach
|
||||
# the published per-property columns (Interior height, Total floor
|
||||
# area, Number of bedrooms & living rooms). Treated as unknown.
|
||||
pl.when(
|
||||
(pl.col("number_habitable_rooms") >= 1)
|
||||
& (pl.col("number_habitable_rooms") <= MAX_HABITABLE_ROOMS)
|
||||
)
|
||||
.then(pl.col("number_habitable_rooms"))
|
||||
.otherwise(None)
|
||||
.alias("number_habitable_rooms"),
|
||||
pl.when(
|
||||
pl.col("floor_height").is_between(
|
||||
MIN_FLOOR_HEIGHT_M, MAX_FLOOR_HEIGHT_M
|
||||
)
|
||||
)
|
||||
.then(pl.col("floor_height"))
|
||||
.otherwise(None)
|
||||
.alias("floor_height"),
|
||||
pl.when(pl.col("total_floor_area") <= MAX_TOTAL_FLOOR_AREA_M2)
|
||||
.then(pl.col("total_floor_area"))
|
||||
.otherwise(None)
|
||||
.alias("total_floor_area"),
|
||||
)
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ import argparse
|
|||
import re
|
||||
import tempfile
|
||||
from dataclasses import dataclass
|
||||
from datetime import date
|
||||
from typing import Literal
|
||||
|
||||
import numpy as np
|
||||
|
|
@ -30,7 +31,10 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
|
|||
|
||||
MIN_FLOOR_AREA_M2 = 10
|
||||
CONSERVATION_AREA_FEATURE = "Within conservation area"
|
||||
TREE_DENSITY_FEATURE = "Street tree density percentile"
|
||||
# Named "Tree canopy" (not "Street tree") because the underlying density unions
|
||||
# Forest Research TOW lone-tree/group crowns AND NFI woodland canopy, so a
|
||||
# woodland-edge postcode's score reflects forest canopy, not only street trees.
|
||||
TREE_DENSITY_FEATURE = "Tree canopy density percentile"
|
||||
LISTED_BUILDING_FEATURE = "Listed building"
|
||||
LISTED_BUILDING_MATCH_RADIUS_M = 250.0
|
||||
LISTED_BUILDING_NEAREST_POSTCODES = 3
|
||||
|
|
@ -528,10 +532,22 @@ def _is_planning_conservation_area_record(dataset: object) -> bool:
|
|||
|
||||
|
||||
def _is_current_planning_record(end_date: object) -> bool:
|
||||
"""A planning record is current when it has no end-date OR its end-date is
|
||||
still in the future. The planning.data.gov.uk `end-date` field marks when a
|
||||
designation is RETIRED, so a future date (e.g. 2029-12-31) is a still-current
|
||||
area and must NOT be dropped — the previous "any non-empty date = ended"
|
||||
logic wrongly excluded those (e.g. 22 current Gateshead conservation areas)."""
|
||||
if end_date is None:
|
||||
return True
|
||||
if isinstance(end_date, str):
|
||||
return end_date.strip() == ""
|
||||
text = end_date.strip()
|
||||
if text == "":
|
||||
return True
|
||||
try:
|
||||
return date.fromisoformat(text[:10]) > date.today()
|
||||
except ValueError:
|
||||
# Unparseable end-date: keep the record rather than silently drop it.
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
|
|
@ -706,8 +722,32 @@ def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame
|
|||
)
|
||||
|
||||
|
||||
def _validate_lsoa_source_coverage(iod_path: Path, ethnicity_path: Path) -> None:
    """Require the ethnicity table to cover every LSOA present in the IoD table.

    Reads the `LSOA code (2021)` column of the IoD parquet and the `lsoa21`
    column of the ethnicity parquet, then anti-joins the former against the
    latter. Any IoD LSOA without an ethnicity row would leave the ethnicity
    columns null after the `lsoa21` join, so that is reported as an error
    instead of being allowed through.

    Args:
        iod_path: Parquet file containing a `LSOA code (2021)` column.
        ethnicity_path: Parquet file containing an `lsoa21` column.

    Raises:
        ValueError: If at least one IoD LSOA has no ethnicity row. The message
            gives the number of uncovered LSOAs and up to ten of them, in
            sorted order.
    """
    key = "lsoa21"

    universe = pl.read_parquet(iod_path, columns=["LSOA code (2021)"])
    universe = universe.rename({"LSOA code (2021)": key})
    covered = pl.read_parquet(ethnicity_path, columns=[key])

    # Anti-join keeps only the IoD LSOAs with no match in the ethnicity table.
    uncovered = universe.join(covered, on=key, how="anti").sort(key)
    if uncovered.height == 0:
        return

    sample = uncovered.head(10).to_dicts()
    raise ValueError(
        f"Ethnicity data is missing LSOA coverage: {uncovered.height} LSOAs, e.g. {sample}"
    )
|
||||
|
||||
|
||||
def _validate_lad_source_coverage(
|
||||
iod_path: Path, ethnicity_path: Path, rental_prices_path: Path
|
||||
iod_path: Path, rental_prices_path: Path
|
||||
) -> None:
|
||||
iod_lads = (
|
||||
pl.read_parquet(
|
||||
|
|
@ -726,16 +766,6 @@ def _validate_lad_source_coverage(
|
|||
.unique(["lad"])
|
||||
)
|
||||
|
||||
ethnicity_lads = pl.read_parquet(ethnicity_path, columns=["Geography_code"]).rename(
|
||||
{"Geography_code": "lad"}
|
||||
)
|
||||
missing_ethnicity = iod_lads.join(ethnicity_lads, on="lad", how="anti").sort("lad")
|
||||
if missing_ethnicity.height > 0:
|
||||
raise ValueError(
|
||||
"Ethnicity data is missing 2024 LAD coverage: "
|
||||
f"{missing_ethnicity.to_dicts()}"
|
||||
)
|
||||
|
||||
rental_lads = pl.read_parquet(rental_prices_path, columns=["area_code"]).rename(
|
||||
{"area_code": "lad"}
|
||||
)
|
||||
|
|
@ -849,12 +879,10 @@ def _join_area_side_tables(
|
|||
broadband: pl.LazyFrame,
|
||||
) -> pl.LazyFrame:
|
||||
base = base.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
|
||||
base = base.join(
|
||||
ethnicity,
|
||||
left_on="Local Authority District code (2024)",
|
||||
right_on="Geography_code",
|
||||
how="left",
|
||||
)
|
||||
# Ethnicity is Census 2021 TS021 at LSOA (~33,755 areas), joined on the same
|
||||
# `lsoa21` key as median age and IoD — a ~100x granularity gain over the old
|
||||
# Local-Authority broadcast, with no change to the 6-bucket output schema.
|
||||
base = base.join(ethnicity, on="lsoa21", how="left")
|
||||
|
||||
# Crime is counted spatially per postcode (incidents within 50m of the
|
||||
# postcode boundary), so it joins on postcode rather than LSOA. crime_spatial
|
||||
|
|
@ -1966,7 +1994,8 @@ def _build(
|
|||
"""
|
||||
if mode == "listings" and actual_listings_path is None:
|
||||
raise ValueError("listings mode requires actual_listings_path")
|
||||
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path)
|
||||
_validate_lsoa_source_coverage(iod_path, ethnicity_path)
|
||||
_validate_lad_source_coverage(iod_path, rental_prices_path)
|
||||
|
||||
wide = pl.scan_parquet(epc_pp_path).filter(
|
||||
pl.col("total_floor_area").is_null()
|
||||
|
|
@ -2225,7 +2254,7 @@ def main():
|
|||
"--ethnicity",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Ethnicity by local authority parquet file (optional)",
|
||||
help="Census 2021 ethnic group (TS021) by LSOA parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--crime",
|
||||
|
|
|
|||
|
|
@ -53,6 +53,18 @@ _OUTPUT_PRECISION_DEG = 0.000001
|
|||
# tolerance), we fatten it just enough to survive snapping rather than drop it.
|
||||
_MIN_FOOTPRINT_BUFFER_M = 0.5
|
||||
|
||||
# Building-scale buffer for POINTLIKE inputs that carry no real extent. Multi-
|
||||
# dwelling (tower-block) postcodes have every UPRN geocoded to a single shared
|
||||
# coordinate, so the boundary collapses to a point; a 0.5 m buffer then yields an
|
||||
# invisible ~0.8 m² dot covering hundreds of homes. Such inputs get a ~200 m²
|
||||
# building-scale footprint instead. (Genuine thin slivers, which still carry
|
||||
# length, keep the minimal buffer.) _resolve_overlaps runs afterwards, so any
|
||||
# overlap this introduces is trimmed; a postcode shaved back to sub-grid still
|
||||
# falls through to the tiny _grid_footprint, so this can only improve the result.
|
||||
_POINT_RESCUE_BUFFER_M = 8.0
|
||||
_POINTLIKE_AREA_M2 = 1.0
|
||||
_POINTLIKE_PERIMETER_M = 4.0
|
||||
|
||||
|
||||
def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
|
||||
"""Transform a BNG polygon to WGS84, snap to output precision, validate.
|
||||
|
|
@ -90,8 +102,23 @@ def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
|
|||
|
||||
|
||||
def _rescue_footprint(geom_bng) -> dict | None:
    """Fatten a degenerate BNG geometry into a representable footprint and snap.

    A POINTLIKE input (a point, or a near-zero-area/short-perimeter polygon — the
    signature of a tower-block postcode whose UPRNs all share one coordinate)
    gets a building-scale buffer so it is not reduced to an invisible sub-metre
    dot; thin slivers that still carry length keep the minimal buffer.

    Args:
        geom_bng: Geometry in British National Grid coordinates (metres).

    Returns:
        Whatever ``_snap_to_wgs84_geojson`` yields for the buffered footprint,
        or ``None`` when the buffered geometry has no polygonal part.
    """
    # Default to the minimal buffer; only clearly pointlike inputs are upgraded.
    buffer_m = _MIN_FOOTPRINT_BUFFER_M
    try:
        # Both tests must hold: a long thin sliver has ~zero area but a long
        # perimeter, so it is NOT treated as pointlike.
        if (
            geom_bng.area < _POINTLIKE_AREA_M2
            and geom_bng.length < _POINTLIKE_PERIMETER_M
        ):
            buffer_m = _POINT_RESCUE_BUFFER_M
    except GEOSException:
        # Best effort: if the geometry cannot be measured, keep the minimal
        # buffer rather than failing the rescue outright.
        pass
    footprint = _largest_polygonal(geom_bng.buffer(buffer_m))
    if footprint is None:
        return None
    return _snap_to_wgs84_geojson(footprint)
|
||||
|
|
|
|||
|
|
@ -906,6 +906,37 @@ class TestToWgs84Geojson:
|
|||
assert result is not None
|
||||
assert result["type"] == "Polygon"
|
||||
|
||||
def test_pointlike_input_gets_building_scale_footprint(self):
    """A tower-block postcode (all UPRNs at one point) must not collapse to a
    sub-metre dot; it gets a building-scale footprint instead."""
    import pyproj
    from shapely.geometry import Point, shape
    from shapely.ops import transform as transform_geometry

    # The result is WGS84 GeoJSON; project it back to BNG so the area is in m^2.
    to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    )
    result = to_wgs84_geojson(Point(360000, 170000))
    assert result is not None
    area_m2 = transform_geometry(to_bng.transform, shape(result)).area
    # Minimal-buffer dots are ~1 m^2; a building-scale footprint is far larger.
    assert area_m2 > 100, f"point footprint only {area_m2:.1f} m^2"
|
||||
|
||||
def test_thin_sliver_keeps_minimal_buffer(self):
    """A genuine elongated sliver still carries length, so it is NOT inflated
    to building scale — only truly pointlike inputs are."""
    import pyproj
    from shapely.geometry import LineString, shape
    from shapely.ops import transform as transform_geometry

    # The result is WGS84 GeoJSON; project it back to BNG so the area is in m^2.
    to_bng = pyproj.Transformer.from_crs(
        "EPSG:4326", "EPSG:27700", always_xy=True
    )
    # A 40 m line fattened by 5 cm: tiny area, but a long perimeter.
    sliver = LineString([(360000, 170000), (360040, 170000)]).buffer(0.05)
    result = to_wgs84_geojson(sliver)
    assert result is not None
    area_m2 = transform_geometry(to_bng.transform, shape(result)).area
    assert area_m2 < 100, f"sliver inflated to {area_m2:.1f} m^2"
|
||||
|
||||
def test_coordinates_have_limited_precision(self):
|
||||
"""GeoJSON coordinates should be rounded to 6 decimal places."""
|
||||
import json
|
||||
|
|
|
|||
|
|
@ -230,11 +230,28 @@ def main():
|
|||
).height
|
||||
print(f" kNN blended: {n_blended:,} of {n_estimated:,} estimates")
|
||||
|
||||
# Null the absolute "Estimated current price" itself when its implied
|
||||
# per-sqm is implausible (outside [MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM])
|
||||
# AND the floor area is known: these come from bulk/block transfers or
|
||||
# garbage source prices (e.g. a £207.5M "sale" on a 93 m² terrace -> a £197M
|
||||
# estimate) and are not meaningful single-dwelling values. Previously only
|
||||
# the derived per-sqm was nulled, leaving the absurd headline price visible.
|
||||
_raw_est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
|
||||
df = df.with_columns(
|
||||
pl.when(
|
||||
pl.col("Estimated current price").is_not_null()
|
||||
& pl.col("Total floor area (sqm)").is_not_null()
|
||||
& (pl.col("Total floor area (sqm)") > 0)
|
||||
& ((_raw_est_psm < MIN_COMPARABLE_PSM) | (_raw_est_psm > MAX_COMPARABLE_PSM))
|
||||
)
|
||||
.then(None)
|
||||
.otherwise(pl.col("Estimated current price"))
|
||||
.alias("Estimated current price"),
|
||||
)
|
||||
|
||||
# Derive estimated price per sqm where both estimated price and floor area
|
||||
# exist. Null out values outside the plausibility band [MIN_COMPARABLE_PSM,
|
||||
# MAX_COMPARABLE_PSM] (the same band the kNN pool uses): extreme values come
|
||||
# from bulk/block transactions or floor-area errors and are not meaningful
|
||||
# per-unit prices.
|
||||
# exist. Now that the implausible-psm estimates are nulled above, the band
|
||||
# filter here mainly guards the floor-area>0 case.
|
||||
_est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
|
||||
df = df.with_columns(
|
||||
pl.when(
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ from pipeline.transform.price_estimation.shrinkage import (
|
|||
)
|
||||
from pipeline.transform.price_estimation.utils import (
|
||||
CURRENT_YEAR,
|
||||
LATEST_COMPLETE_YEAR,
|
||||
TEMPORAL_SMOOTHNESS_LAMBDA,
|
||||
TYPE_GROUPS,
|
||||
build_hedonic_features,
|
||||
|
|
@ -395,14 +396,22 @@ def build_index(
|
|||
The index is still forward-filled to CURRENT_YEAR.
|
||||
postcodes_path: if provided, lat/lon are read from this file instead of input_path.
|
||||
"""
|
||||
pairs = extract_pairs(input_path, max_year2=max_pair_year)
|
||||
# Solve the index only on COMPLETE calendar years: exclude the partial
|
||||
# current year, whose thin repeat-sale set yields wild betas. The index is
|
||||
# still forward-filled/trend-extrapolated to CURRENT_YEAR below, so 2026
|
||||
# follows the established trend rather than a partial-year spike. Backtest
|
||||
# passes a stricter max_pair_year, which is honoured.
|
||||
estimation_cap = (
|
||||
max_pair_year if max_pair_year is not None else LATEST_COMPLETE_YEAR + 1
|
||||
)
|
||||
pairs = extract_pairs(input_path, max_year2=estimation_cap)
|
||||
centroids = extract_centroids(postcodes_path or input_path)
|
||||
|
||||
min_year = int(pairs["year1"].min())
|
||||
max_year = CURRENT_YEAR
|
||||
|
||||
hedonic_idx = compute_hedonic_index(
|
||||
input_path, min_year, max_year, max_sale_year=max_pair_year
|
||||
input_path, min_year, max_year, max_sale_year=estimation_cap
|
||||
)
|
||||
|
||||
# Precompute hierarchy
|
||||
|
|
|
|||
|
|
@ -6,6 +6,13 @@ import numpy as np
|
|||
import polars as pl
|
||||
|
||||
CURRENT_YEAR = 2026
|
||||
# Latest COMPLETE calendar year. The current year's transactions are only
|
||||
# partially reported (Land Registry lags ~2-3 months), so a sector's thin
|
||||
# partial-year repeat-sale set produces wild index betas (e.g. +334% in a
|
||||
# single sector). The index is SOLVED only on complete years (<= this) and
|
||||
# forward-filled/trend-extrapolated to CURRENT_YEAR, so current-value
|
||||
# projections follow the established trend instead of a partial-year spike.
|
||||
LATEST_COMPLETE_YEAR = CURRENT_YEAR - 1
|
||||
_today = date.today()
|
||||
CURRENT_FRAC_YEAR = _today.year + (_today.month - 1) / 12
|
||||
|
||||
|
|
|
|||
|
|
@ -15,11 +15,24 @@ SCHOOL_GROUPS = {
|
|||
}
|
||||
|
||||
|
||||
def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
|
||||
# Age thresholds for deciding which phase(s) a school serves. A school serves
|
||||
# PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age
|
||||
# children if its statutory highest age is >= 12. All-through (e.g. 3-18) and
|
||||
# middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in
|
||||
# both the primary and the secondary proximity metrics — Ofsted's coarse "Ofsted
|
||||
# phase" labels such schools as just "Secondary", which previously hid them from
|
||||
# every postcode's primary-school count.
|
||||
PRIMARY_MAX_AGE = 10
|
||||
SECONDARY_MIN_AGE = 12
|
||||
|
||||
|
||||
def classify_good_plus_schools(
|
||||
ofsted: pl.DataFrame, open_urns: set[int] | None = None
|
||||
) -> pl.DataFrame:
|
||||
"""Label good+/outstanding primary & secondary schools for proximity counts.
|
||||
|
||||
Derives a grade ("1" = outstanding, "2" = good) and a proximity ``category``,
|
||||
returning a ``(postcode, category)`` frame.
|
||||
Derives a grade ("1" = outstanding, "2" = good) and one or two proximity
|
||||
``category`` rows per school, returning a ``(postcode, category)`` frame.
|
||||
|
||||
Schools with a recent GRADED inspection carry a 1-4 grade in "Latest OEIF
|
||||
overall effectiveness" (OEIF = the previous Ofsted Education Inspection
|
||||
|
|
@ -27,49 +40,89 @@ def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
|
|||
UNGRADED (Section 8) inspection or the post-2024 report-card framework, so
|
||||
that column is null/"Not judged" for them even when they are demonstrably
|
||||
good — their status lives in "Ungraded inspection overall outcome" ("School
|
||||
remains Good"/"School remains Outstanding", incl. "(Concerns)"/"(Improving)"
|
||||
variants). Filtering on the graded column alone dropped ~7,000 genuinely
|
||||
good/outstanding schools. We fall back to the ungraded outcome, but ONLY when
|
||||
there is no usable graded result (null/"Not judged"), so a genuine grade 3/4
|
||||
is never overridden.
|
||||
remains Good"/"School remains Outstanding"). Filtering on the graded column
|
||||
alone dropped ~7,000 genuinely good/outstanding schools. We fall back to the
|
||||
ungraded outcome, but ONLY when there is no usable graded result
|
||||
(null/"Not judged"), so a genuine grade 3/4 is never overridden.
|
||||
|
||||
Outcomes flagged "(Concerns)" are NOT treated as good+: a "remains Good
|
||||
(Concerns)" outcome signals inspectors found issues warranting an earlier
|
||||
graded re-inspection, so marketing it as a good+ school is misleading.
|
||||
|
||||
Phase assignment uses the statutory age range when available (so all-through
|
||||
and middle schools count toward BOTH primary and secondary), falling back to
|
||||
the coarse "Ofsted phase" label when age columns are absent. When
|
||||
``open_urns`` is given, schools whose URN is not in the current GIAS open
|
||||
register are dropped so closed/merged schools are not counted.
|
||||
"""
|
||||
# Cast to Utf8 so the string predicates below are well-defined even if a
|
||||
# column happens to be entirely null (read back as a Null dtype).
|
||||
oeif = pl.col("Latest OEIF overall effectiveness").cast(pl.Utf8, strict=False)
|
||||
ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
|
||||
no_usable_grade = oeif.is_null() | (oeif == "Not judged")
|
||||
has_concern = ungraded.str.contains(r"\(Concerns\)")
|
||||
remains_outstanding = (
|
||||
ungraded.str.starts_with("School remains Outstanding") & ~has_concern
|
||||
)
|
||||
remains_good = ungraded.str.starts_with("School remains Good") & ~has_concern
|
||||
graded = (
|
||||
ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
|
||||
.with_columns(
|
||||
pl.when(oeif.is_in(["1", "2"]))
|
||||
.then(oeif)
|
||||
.when(
|
||||
no_usable_grade
|
||||
& ungraded.str.starts_with("School remains Outstanding")
|
||||
)
|
||||
.when(no_usable_grade & remains_outstanding)
|
||||
.then(pl.lit("1"))
|
||||
.when(no_usable_grade & ungraded.str.starts_with("School remains Good"))
|
||||
.when(no_usable_grade & remains_good)
|
||||
.then(pl.lit("2"))
|
||||
.otherwise(None)
|
||||
.alias("_ofsted_grade")
|
||||
)
|
||||
.filter(pl.col("_ofsted_grade").is_not_null())
|
||||
)
|
||||
|
||||
# Drop schools no longer open (closed/merged) when the GIAS open register is
|
||||
# provided, so stale Ofsted "latest inspection" rows are not counted.
|
||||
if open_urns is not None and "URN" in graded.columns:
|
||||
graded = graded.filter(pl.col("URN").is_in(list(open_urns)))
|
||||
|
||||
# Decide which phase(s) each school serves.
|
||||
if {"Statutory lowest age", "Statutory highest age"} <= set(graded.columns):
|
||||
low = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
|
||||
high = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
|
||||
serves_primary = (
|
||||
pl.when(low.is_not_null())
|
||||
.then(low <= PRIMARY_MAX_AGE)
|
||||
.otherwise(pl.col("Ofsted phase") == "Primary")
|
||||
)
|
||||
serves_secondary = (
|
||||
pl.when(high.is_not_null())
|
||||
.then(high >= SECONDARY_MIN_AGE)
|
||||
.otherwise(pl.col("Ofsted phase") == "Secondary")
|
||||
)
|
||||
else:
|
||||
serves_primary = pl.col("Ofsted phase") == "Primary"
|
||||
serves_secondary = pl.col("Ofsted phase") == "Secondary"
|
||||
|
||||
graded = graded.with_columns(
|
||||
serves_primary.alias("_serves_primary"),
|
||||
serves_secondary.alias("_serves_secondary"),
|
||||
)
|
||||
|
||||
# Good+ groups include both grade variants; outstanding groups count grade 1.
|
||||
return graded.with_columns(
|
||||
pl.when(pl.col("Ofsted phase") == "Primary")
|
||||
.then(
|
||||
pl.when(pl.col("_ofsted_grade") == "1")
|
||||
.then(pl.lit("outstanding_primary"))
|
||||
.otherwise(pl.lit("good_primary"))
|
||||
)
|
||||
.otherwise(
|
||||
pl.when(pl.col("_ofsted_grade") == "1")
|
||||
.then(pl.lit("outstanding_secondary"))
|
||||
.otherwise(pl.lit("good_secondary"))
|
||||
)
|
||||
# A school can yield up to two rows (primary and secondary).
|
||||
primary = graded.filter(pl.col("_serves_primary")).with_columns(
|
||||
pl.when(pl.col("_ofsted_grade") == "1")
|
||||
.then(pl.lit("outstanding_primary"))
|
||||
.otherwise(pl.lit("good_primary"))
|
||||
.alias("category")
|
||||
).select(
|
||||
)
|
||||
secondary = graded.filter(pl.col("_serves_secondary")).with_columns(
|
||||
pl.when(pl.col("_ofsted_grade") == "1")
|
||||
.then(pl.lit("outstanding_secondary"))
|
||||
.otherwise(pl.lit("good_secondary"))
|
||||
.alias("category")
|
||||
)
|
||||
return pl.concat([primary, secondary]).select(
|
||||
pl.col("Postcode").alias("postcode"),
|
||||
"category",
|
||||
)
|
||||
|
|
@ -85,12 +138,24 @@ def main():
|
|||
parser.add_argument(
|
||||
"--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gias",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="GIAS open-school parquet; if given, only currently-open schools are counted",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output parquet path"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted))
|
||||
open_urns: set[int] | None = None
|
||||
if args.gias is not None:
|
||||
gias_urns = pl.read_parquet(args.gias).select("urn").to_series().drop_nulls()
|
||||
open_urns = set(gias_urns.cast(pl.Int64, strict=False).to_list())
|
||||
print(f"GIAS open register: {len(open_urns):,} open school URNs")
|
||||
|
||||
ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted), open_urns=open_urns)
|
||||
if ofsted.is_empty():
|
||||
raise ValueError("No good+ primary/secondary Ofsted schools found")
|
||||
|
||||
|
|
|
|||
|
|
@ -252,14 +252,15 @@ def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
|
|||
assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)}
|
||||
|
||||
|
||||
def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
|
||||
def test_serious_rollup_avg_yr_equals_sum_of_components(tmp_path):
|
||||
# Two SERIOUS types occur in DISJOINT years for one postcode: Burglary only in
|
||||
# 2014, Robbery only in 2024 (each a single full month -> 12/yr). The headline
|
||||
# "Serious crime (avg/yr)" must equal the mean of the "Serious crime (by year)"
|
||||
# bars (which span the UNION of years any serious type occurred), NOT the sum
|
||||
# of the per-type means. Summing per-type means divides each type by its OWN
|
||||
# years-present (1 each) -> 12 + 12 = 24; the consistent rollup divides the
|
||||
# per-year serious total by the years any serious type occurred (2) -> 12.
|
||||
# "Serious crime (avg/yr)" must equal the SUM of its component (avg/yr) columns
|
||||
# (Burglary 12 + Robbery 12 = 24), so the rollup is always the sum of the parts
|
||||
# shown beside it and can never fall below a single component. (The previous
|
||||
# union-years-present mean would have divided the per-year serious total by the
|
||||
# 2 years any serious type occurred, giving a misleading 12 that sits below
|
||||
# both the burglary and robbery rollup contributions.)
|
||||
units = tmp_path / "units"
|
||||
_write_boundaries(
|
||||
units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
|
||||
|
|
@ -274,13 +275,16 @@ def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
|
|||
transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
|
||||
|
||||
avg = pl.read_parquet(output).row(0, named=True)
|
||||
# The precomputed rollup headline exists and equals the mean of the bars (12),
|
||||
# not the sum of the per-type avg/yr values (Burglary 12 + Robbery 12 = 24).
|
||||
assert "Serious crime (avg/yr)" in avg
|
||||
assert avg["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
|
||||
assert avg["Robbery (avg/yr)"] == pytest.approx(12.0, abs=0.05)
|
||||
assert avg["Serious crime (avg/yr)"] == pytest.approx(12.0, abs=0.05)
|
||||
# Rollup == sum of its component (avg/yr) columns.
|
||||
assert avg["Serious crime (avg/yr)"] == pytest.approx(24.0, abs=0.05)
|
||||
assert avg["Serious crime (avg/yr)"] == pytest.approx(
|
||||
avg["Burglary (avg/yr)"] + avg["Robbery (avg/yr)"], abs=0.05
|
||||
)
|
||||
|
||||
# The by-year rollup series remains the per-year sum of the component bars.
|
||||
serious_bars = {
|
||||
p["year"]: p["count"]
|
||||
for p in pl.read_parquet(by_year).row(0, named=True)["Serious crime (by year)"]
|
||||
|
|
@ -289,8 +293,6 @@ def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
|
|||
2014: pytest.approx(12.0, abs=0.05),
|
||||
2024: pytest.approx(12.0, abs=0.05),
|
||||
}
|
||||
mean_of_bars = sum(serious_bars.values()) / len(serious_bars)
|
||||
assert avg["Serious crime (avg/yr)"] == pytest.approx(mean_of_bars, abs=0.05)
|
||||
|
||||
|
||||
def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ from pipeline.transform.merge import (
|
|||
_split_normal_outputs,
|
||||
_tree_density_by_postcode,
|
||||
_validate_lad_source_coverage,
|
||||
_validate_lsoa_source_coverage,
|
||||
_validate_postcode_feature_output,
|
||||
_validate_property_postcodes,
|
||||
)
|
||||
|
|
@ -297,7 +298,7 @@ def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
|
|||
joined = _join_area_side_tables(
|
||||
base,
|
||||
iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
|
||||
ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
|
||||
ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
|
||||
crime=crime,
|
||||
median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
|
||||
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
|
||||
|
|
@ -355,7 +356,7 @@ def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
|
|||
joined = _join_area_side_tables(
|
||||
base,
|
||||
iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
|
||||
ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
|
||||
ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
|
||||
crime=crime,
|
||||
median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
|
||||
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
|
||||
|
|
@ -531,7 +532,6 @@ def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
|
|||
tmp_path,
|
||||
) -> None:
|
||||
iod_path = tmp_path / "iod.parquet"
|
||||
ethnicity_path = tmp_path / "ethnicity.parquet"
|
||||
rental_path = tmp_path / "rental.parquet"
|
||||
pl.DataFrame(
|
||||
{
|
||||
|
|
@ -547,19 +547,15 @@ def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
|
|||
],
|
||||
}
|
||||
).write_parquet(iod_path)
|
||||
pl.DataFrame(
|
||||
{"Geography_code": ["E08000016", "E06000053", "E09000001"]}
|
||||
).write_parquet(ethnicity_path)
|
||||
pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]}).write_parquet(
|
||||
rental_path
|
||||
)
|
||||
|
||||
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
|
||||
_validate_lad_source_coverage(iod_path, rental_path)
|
||||
|
||||
|
||||
def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
|
||||
iod_path = tmp_path / "iod.parquet"
|
||||
ethnicity_path = tmp_path / "ethnicity.parquet"
|
||||
rental_path = tmp_path / "rental.parquet"
|
||||
pl.DataFrame(
|
||||
{
|
||||
|
|
@ -567,13 +563,41 @@ def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) ->
|
|||
"Local Authority District name (2024)": ["Barnsley"],
|
||||
}
|
||||
).write_parquet(iod_path)
|
||||
pl.DataFrame({"Geography_code": ["E08000016"]}).write_parquet(ethnicity_path)
|
||||
pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]}).write_parquet(
|
||||
rental_path
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match="Rental data is missing"):
|
||||
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
|
||||
_validate_lad_source_coverage(iod_path, rental_path)
|
||||
|
||||
|
||||
def test_validate_lsoa_source_coverage_allows_full_ethnicity_coverage(
    tmp_path,
) -> None:
    """Validation passes when every IoD LSOA is present in the ethnicity file."""
    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"
    pl.DataFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}).write_parquet(
        iod_path
    )
    # Ethnicity may carry extra LSOAs (e.g. property-less ones); only the IoD
    # LSOAs are required to all be present.
    pl.DataFrame(
        {"lsoa21": ["E01000001", "E01000002", "E01000003"]}
    ).write_parquet(ethnicity_path)

    # Must not raise.
    _validate_lsoa_source_coverage(iod_path, ethnicity_path)
|
||||
|
||||
|
||||
def test_validate_lsoa_source_coverage_rejects_missing_lsoa(tmp_path) -> None:
    """Validation raises when an IoD LSOA is absent from the ethnicity file."""
    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"
    pl.DataFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}).write_parquet(
        iod_path
    )
    # E01000002 is in IoD but deliberately missing from ethnicity.
    pl.DataFrame({"lsoa21": ["E01000001"]}).write_parquet(ethnicity_path)

    with pytest.raises(ValueError, match="Ethnicity data is missing LSOA coverage"):
        _validate_lsoa_source_coverage(iod_path, ethnicity_path)
|
||||
|
||||
|
||||
def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
|
||||
|
|
@ -1027,7 +1051,7 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
|
|||
joined = _join_area_side_tables(
|
||||
base,
|
||||
iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
|
||||
ethnicity=pl.LazyFrame({"Geography_code": ["E09000001", "E09000002"]}),
|
||||
ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
|
||||
crime=crime,
|
||||
median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
|
||||
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
|
||||
|
|
@ -1427,7 +1451,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
|
|||
"Property type": ["Terraced", None],
|
||||
"Leasehold/Freehold": ["Leasehold", None],
|
||||
"Last known price": [500_000, None],
|
||||
"Street tree density percentile": [42.0, 42.0],
|
||||
"Tree canopy density percentile": [42.0, 42.0],
|
||||
# Overlay columns: row 0 is a matched listing, row 1 is unmatched, row none.
|
||||
"_actual_listing_url": ["url0", "url1"],
|
||||
"_actual_asking_price": [600_000, 700_000],
|
||||
|
|
@ -1458,7 +1482,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
|
|||
"Property type": pl.Utf8,
|
||||
"Leasehold/Freehold": pl.Utf8,
|
||||
"Last known price": pl.Int64,
|
||||
"Street tree density percentile": pl.Float32,
|
||||
"Tree canopy density percentile": pl.Float32,
|
||||
"_actual_listing_url": pl.Utf8,
|
||||
"_actual_asking_price": pl.Int64,
|
||||
"_actual_asking_price_per_sqm": pl.Int32,
|
||||
|
|
@ -1496,7 +1520,7 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
|
|||
assert finalized["Property type"].to_list() == ["Terraced", "Flats/Maisonettes"]
|
||||
assert finalized["Leasehold/Freehold"].to_list() == ["Freehold", "Leasehold"]
|
||||
# Postcode-level feature carried through to both matched and unmatched rows.
|
||||
assert finalized["Street tree density percentile"].to_list() == [42.0, 42.0]
|
||||
assert finalized["Tree canopy density percentile"].to_list() == [42.0, 42.0]
|
||||
# Match status reflects historical context availability.
|
||||
assert finalized["Historical property match status"].to_list() == [
|
||||
"matched",
|
||||
|
|
@ -1524,7 +1548,7 @@ def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
|
|||
"Property type": ["Terraced", "Terraced"],
|
||||
"Leasehold/Freehold": ["Leasehold", "Leasehold"],
|
||||
"Last known price": [500_000, 480_000],
|
||||
"Street tree density percentile": [42.0, 42.0],
|
||||
"Tree canopy density percentile": [42.0, 42.0],
|
||||
# Same listing URL on both collapsed rows — the fan-out to fix.
|
||||
"_actual_listing_url": ["url0", "url0"],
|
||||
"_actual_asking_price": [600_000, 600_000],
|
||||
|
|
@ -1555,7 +1579,7 @@ def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
|
|||
"Property type": pl.Utf8,
|
||||
"Leasehold/Freehold": pl.Utf8,
|
||||
"Last known price": pl.Int64,
|
||||
"Street tree density percentile": pl.Float32,
|
||||
"Tree canopy density percentile": pl.Float32,
|
||||
"_actual_listing_url": pl.Utf8,
|
||||
"_actual_asking_price": pl.Int64,
|
||||
"_actual_asking_price_per_sqm": pl.Int32,
|
||||
|
|
|
|||
|
|
@ -42,7 +42,20 @@ def test_ungraded_remains_good_is_recovered_when_no_graded_result():
|
|||
rows = [
|
||||
_school("Primary", None, "School remains Good", "AA1 1AA"),
|
||||
_school("Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"),
|
||||
# "(Concerns)"/"(Improving)" variants are still good+.
|
||||
# "(Improving)" is still good+ ...
|
||||
_school("Primary", None, "School remains Good (Improving) - S5 Next", "AA1 1AE"),
|
||||
]
|
||||
assert _classify(rows) == {
|
||||
("AA1 1AA", "good_primary"),
|
||||
("AA1 1AB", "outstanding_secondary"),
|
||||
("AA1 1AE", "good_primary"),
|
||||
}
|
||||
|
||||
|
||||
def test_ungraded_concerns_are_not_good_plus():
|
||||
# "(Concerns)" outcomes signal issues warranting earlier re-inspection and
|
||||
# must NOT be counted as good+ schools.
|
||||
rows = [
|
||||
_school("Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"),
|
||||
_school(
|
||||
"Secondary",
|
||||
|
|
@ -51,12 +64,7 @@ def test_ungraded_remains_good_is_recovered_when_no_graded_result():
|
|||
"AA1 1AD",
|
||||
),
|
||||
]
|
||||
assert _classify(rows) == {
|
||||
("AA1 1AA", "good_primary"),
|
||||
("AA1 1AB", "outstanding_secondary"),
|
||||
("AA1 1AC", "good_primary"),
|
||||
("AA1 1AD", "outstanding_secondary"),
|
||||
}
|
||||
assert _classify(rows) == set()
|
||||
|
||||
|
||||
def test_ungraded_non_good_outcomes_are_excluded():
|
||||
|
|
@ -80,3 +88,52 @@ def test_non_primary_secondary_phases_excluded():
|
|||
_school("Not applicable", "2", None),
|
||||
]
|
||||
assert _classify(rows) == set()
|
||||
|
||||
|
||||
def _aged_school(phase, oeif, low, high, postcode="AA1 1AA"):
|
||||
return {
|
||||
"Postcode": postcode,
|
||||
"Ofsted phase": phase,
|
||||
"Latest OEIF overall effectiveness": oeif,
|
||||
"Ungraded inspection overall outcome": None,
|
||||
"URN": 100000,
|
||||
"Statutory lowest age": low,
|
||||
"Statutory highest age": high,
|
||||
}
|
||||
|
||||
|
||||
def test_all_through_school_counts_toward_both_primary_and_secondary():
    """An all-through school yields both a primary and a secondary category."""
    # An all-through school (age 3-18) is labelled "Secondary" by Ofsted phase but
    # serves primary-age children too, so it must count in BOTH metrics.
    rows = [_aged_school("Secondary", "2", 3, 18, "AA1 1AA")]
    assert _classify(rows) == {
        ("AA1 1AA", "good_primary"),
        ("AA1 1AA", "good_secondary"),
    }
|
||||
|
||||
|
||||
def test_age_ranges_assign_single_phase_for_standard_schools():
    """Standard age ranges map to one phase; a middle school maps to both."""
    rows = [
        _aged_school("Primary", "1", 4, 11, "AA1 1AA"),  # primary only
        _aged_school("Secondary", "2", 11, 16, "AA1 1AB"),  # secondary only
        _aged_school("Secondary", "1", 9, 13, "AA1 1AC"),  # middle -> both
    ]
    assert _classify(rows) == {
        ("AA1 1AA", "outstanding_primary"),
        ("AA1 1AB", "good_secondary"),
        ("AA1 1AC", "outstanding_primary"),
        ("AA1 1AC", "outstanding_secondary"),
    }
|
||||
|
||||
|
||||
def test_closed_schools_excluded_when_open_register_given():
    """Schools whose URN is absent from ``open_urns`` are dropped."""
    # Give each school a distinct URN up front so the open register can tell
    # them apart.
    rows = [
        {**_aged_school("Primary", "1", 4, 11, "AA1 1AA"), "URN": 111},
        {**_aged_school("Secondary", "2", 11, 16, "AA1 1AB"), "URN": 222},
    ]
    result = classify_good_plus_schools(pl.DataFrame(rows), open_urns={111})
    pairs = {(r["postcode"], r["category"]) for r in result.to_dicts()}
    # URN 222 is not in the open register, so it is dropped.
    assert pairs == {("AA1 1AA", "outstanding_primary")}
|
||||
|
|
|
|||
|
|
@ -33,6 +33,14 @@ DROP_CATEGORIES = {
|
|||
"emergency/water_tank",
|
||||
"leisure/bleachers",
|
||||
"leisure/schoolyard",
|
||||
# Park "furniture" / incidental features — not parks; they massively
|
||||
# inflated the Park count (picnic_table ~15k, outdoor_seating ~5.8k).
|
||||
"leisure/bandstand",
|
||||
"leisure/bird_hide",
|
||||
"leisure/firepit",
|
||||
"leisure/outdoor_seating",
|
||||
"leisure/picnic_table",
|
||||
"leisure/wildlife_hide",
|
||||
"public_transport/pay_scale_area",
|
||||
"shop/taxi",
|
||||
"amenity/feeding_place",
|
||||
|
|
@ -182,9 +190,13 @@ DROP_CATEGORIES = {
|
|||
"tourism/village_sign",
|
||||
"tourism/wilderness_hut",
|
||||
"tourism/yes",
|
||||
# Public transport (from NaPTAN instead)
|
||||
# Public transport (from NaPTAN instead). public_transport/platform is the
|
||||
# EXCEPTION: it is mapped to "Bus stop" (see _CATEGORIES) to fill NaPTAN's
|
||||
# authority-level bus-stop gaps (e.g. West Cumbria, North Norfolk, where
|
||||
# NaPTAN has zero stops), then deduped against NaPTAN so covered areas keep
|
||||
# a single stop. stop_position is left dropped to avoid double-counting the
|
||||
# same stop (platform + stop_position).
|
||||
"public_transport/entrance",
|
||||
"public_transport/platform",
|
||||
"public_transport/station",
|
||||
"public_transport/stop_position",
|
||||
# Education amenities — schools come from GIAS instead. OSM coverage for
|
||||
|
|
@ -301,16 +313,13 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"🌳",
|
||||
[
|
||||
"leisure/park",
|
||||
# leisure/garden is dominated by private residential gardens (98%+
|
||||
# unnamed); it is name-gated in transform() via REQUIRE_NAME_CATEGORIES
|
||||
# so only named (public/notable) gardens count as a Park.
|
||||
"leisure/garden",
|
||||
"leisure/common",
|
||||
"leisure/nature_reserve",
|
||||
"leisure/dog_park",
|
||||
"leisure/bandstand",
|
||||
"leisure/bird_hide",
|
||||
"leisure/firepit",
|
||||
"leisure/outdoor_seating",
|
||||
"leisure/picnic_table",
|
||||
"leisure/wildlife_hide",
|
||||
],
|
||||
),
|
||||
(
|
||||
|
|
@ -329,6 +338,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
[
|
||||
"leisure/sports_centre",
|
||||
"leisure/sports_hall",
|
||||
# leisure/pitch (73% of the old bucket) and leisure/swimming_pool
|
||||
# (98% unnamed = private/garden pools) are name-gated in transform()
|
||||
# via REQUIRE_NAME_CATEGORIES so only named public facilities count.
|
||||
"leisure/pitch",
|
||||
"leisure/track",
|
||||
"leisure/golf_course",
|
||||
|
|
@ -1123,8 +1135,36 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"amenity/townhall",
|
||||
],
|
||||
),
|
||||
# ── Public transport (OSM supplement to NaPTAN) ──────────
|
||||
# OSM bus platforms fill NaPTAN's authority-level coverage gaps. Same group
|
||||
# / friendly name / emoji as the NaPTAN "Bus stop" rows so they merge into
|
||||
# one metric; OSM platforms that duplicate a NaPTAN stop are deduped in
|
||||
# transform() (osm_stops_near_naptan).
|
||||
(
|
||||
"Public Transport",
|
||||
"Bus stop",
|
||||
"🚏",
|
||||
[
|
||||
"public_transport/platform",
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
# Raw OSM tags that only count when the feature carries a name; unnamed
# instances are filtered out before the raw key is mapped to a friendly
# category. Without a name these tags are overwhelmingly private or incidental:
# a nameless `leisure/garden` is a private residential garden rather than a
# public park, and a nameless `leisure/pitch` / `swimming_pool` is a school cage
# or a back-garden pool. Requiring a name keeps the genuinely public, notable
# facilities while stopping the rest from inflating Park / Sports Centre counts.
REQUIRE_NAME_CATEGORIES = {
    # Gardens
    "leisure/garden",
    # Pitches
    "leisure/pitch",
    "leisure/practice_pitch",
    # Pools
    "leisure/paddling_pool",
    "leisure/swimming_pool",
}
|
||||
|
||||
|
||||
# Build flat lookup: OSM category → (group, friendly_name, emoji)
|
||||
CATEGORY_MAP: dict[str, tuple[str, str, str]] = {
|
||||
osm_key: (group, name, emoji)
|
||||
|
|
@ -1431,18 +1471,25 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
|
|||
)
|
||||
|
||||
|
||||
def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
|
||||
def transform_gias_schools(
|
||||
gias_path: Path, ofsted_path: Path, boundary_path: Path
|
||||
) -> pl.LazyFrame:
|
||||
"""Convert the GIAS register parquet into POI rows with school metadata.
|
||||
Ofsted ratings are joined by URN so each school carries its latest OEIF
|
||||
overall effectiveness grade (Outstanding/Good/Requires improvement/
|
||||
Inadequate/Not judged), surfaced in the map popup."""
|
||||
Inadequate/Not judged), surfaced in the map popup.
|
||||
|
||||
Clipped to England (like NaPTAN/GEOLYTIX) because the GIAS register is
|
||||
GB-wide, so ~1,400 Welsh/Scottish/IoM schools would otherwise leak into the
|
||||
England-only Education layer (and depress apparent Ofsted coverage, since
|
||||
Wales is inspected by Estyn, not Ofsted)."""
|
||||
icon_category_expr = _school_icon_category_expr()
|
||||
emoji_expr = icon_category_expr.replace_strict(SCHOOL_ICON_CATEGORIES)
|
||||
ofsted = _load_ofsted_ratings(ofsted_path)
|
||||
# category mirrors icon_category so the dashboard renders one toggle per
|
||||
# school type (Nursery / Primary / Secondary / Sixth form / University /…)
|
||||
# instead of bundling every GIAS row under a single "School" pill.
|
||||
return (
|
||||
schools = (
|
||||
pl.scan_parquet(gias_path)
|
||||
.join(ofsted, on="urn", how="left")
|
||||
.select(
|
||||
|
|
@ -1477,7 +1524,14 @@ def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
|
|||
pl.col("head_name").alias("school_head_name"),
|
||||
pl.col("ofsted_rating").alias("school_ofsted_rating"),
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
mask = in_england_mask(
|
||||
boundary_path,
|
||||
schools["lat"].to_numpy(),
|
||||
schools["lng"].to_numpy(),
|
||||
)
|
||||
return schools.filter(pl.Series(mask)).lazy()
|
||||
|
||||
|
||||
# OSM convenience-format stores that GEOLYTIX also covers (Tesco Express,
|
||||
|
|
@ -1511,6 +1565,45 @@ def _significant_tokens(name: str | None) -> set[str]:
|
|||
return tokens
|
||||
|
||||
|
||||
# OSM bus platforms are added to "Bus stop" to fill NaPTAN's authority-level
# gaps. Where NaPTAN already has a stop within this radius the area is covered,
# so the colocated OSM platform is dropped to avoid double-counting; OSM
# platforms with no nearby NaPTAN stop (the gaps) are kept.
BUS_STOP_DEDUP_RADIUS_M = 50.0


def osm_stops_near_naptan(
    osm_stops: pl.DataFrame,
    naptan_stops: pl.DataFrame,
    radius_m: float = BUS_STOP_DEDUP_RADIUS_M,
) -> list[str]:
    """Return OSM bus-stop ids within ``radius_m`` of any NaPTAN bus stop.

    Purely spatial (no name match): in NaPTAN-covered areas the OSM platform is
    a duplicate and is dropped; only OSM platforms that fill a NaPTAN gap (no
    NaPTAN stop within the radius) survive.

    Args:
        osm_stops: OSM bus stops; needs ``id``/``lat``/``lng`` columns.
        naptan_stops: NaPTAN bus stops; needs ``lat``/``lng`` columns.
        radius_m: Match radius in metres (inclusive).

    Returns:
        Ids of the OSM stops whose nearest NaPTAN stop lies within ``radius_m``,
        in the row order of ``osm_stops``. Empty when either frame is empty or
        has no finite coordinates.
    """
    if osm_stops.is_empty() or naptan_stops.is_empty():
        return []

    from scipy.spatial import cKDTree

    # Approximate metres per degree used for the local flat projection.
    m_per_deg_lng = 111_320.0
    m_per_deg_lat = 110_540.0

    n_lat = naptan_stops["lat"].to_numpy().astype(float)
    n_lng = naptan_stops["lng"].to_numpy().astype(float)
    o_lat = osm_stops["lat"].to_numpy().astype(float)
    o_lng = osm_stops["lng"].to_numpy().astype(float)
    o_ids = osm_stops["id"].to_list()

    # A single NaN/inf coordinate would turn mean_lat into NaN and make the
    # KD-tree build/query fail, so restrict both sides to finite points. OSM
    # rows without usable coordinates can never be matched and are left alone.
    n_ok = np.isfinite(n_lat) & np.isfinite(n_lng)
    o_ok = np.isfinite(o_lat) & np.isfinite(o_lng)
    if not n_ok.any() or not o_ok.any():
        return []
    n_lat, n_lng = n_lat[n_ok], n_lng[n_ok]

    # Equirectangular projection around the mean latitude of all points: good
    # enough for comparing against a radius of tens of metres.
    mean_lat = float(np.mean(np.concatenate([n_lat, o_lat[o_ok]])))
    cos_lat = float(np.cos(np.radians(mean_lat)))
    n_xy = np.column_stack([n_lng * cos_lat * m_per_deg_lng, n_lat * m_per_deg_lat])
    o_xy = np.column_stack(
        [o_lng[o_ok] * cos_lat * m_per_deg_lng, o_lat[o_ok] * m_per_deg_lat]
    )

    # Distance from every OSM stop to its nearest NaPTAN stop; rows that were
    # masked out keep +inf so they never fall inside the radius.
    nearest = np.full(len(o_ids), np.inf)
    dist, _ = cKDTree(n_xy).query(o_xy, k=1)
    nearest[o_ok] = dist

    return [stop_id for stop_id, d in zip(o_ids, nearest) if d <= radius_m]
|
||||
|
||||
|
||||
def osm_groceries_colocated_with_geolytix(
|
||||
osm_groceries: pl.DataFrame,
|
||||
geolytix: pl.DataFrame,
|
||||
|
|
@ -1601,6 +1694,19 @@ def transform(
|
|||
# Drop unwanted categories
|
||||
lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
|
||||
|
||||
# Drop UNNAMED instances of private-dominated tags (gardens, pitches,
|
||||
# pools) so they don't inflate Park / Sports Centre proximity counts. Done
|
||||
# while `category` still holds the raw OSM key, before the friendly mapping.
|
||||
lf = lf.filter(
|
||||
~(
|
||||
pl.col("category").is_in(list(REQUIRE_NAME_CATEGORIES))
|
||||
& (
|
||||
pl.col("name").is_null()
|
||||
| (pl.col("name").cast(pl.String).str.strip_chars() == "")
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Build lookup expressions from the 3-tuple mapping
|
||||
group_mapping = {k: v[0] for k, v in CATEGORY_MAP.items()}
|
||||
name_mapping = {k: v[1] for k, v in CATEGORY_MAP.items()}
|
||||
|
|
@ -1665,11 +1771,37 @@ def transform(
|
|||
~((pl.col("group") == "Groceries") & pl.col("id").is_in(duplicate_ids))
|
||||
)
|
||||
|
||||
# Drop OSM bus platforms that duplicate a NaPTAN bus stop, so the OSM
|
||||
# supplement only adds stops in NaPTAN's coverage gaps (no double-count in
|
||||
# covered areas). OSM bus stops carry id-prefix "n"/"a" so they never clash
|
||||
# with NaPTAN ATCO ids.
|
||||
osm_bus_stops = (
|
||||
lf.filter((pl.col("group") == "Public Transport") & (pl.col("category") == "Bus stop"))
|
||||
.select("id", "lat", "lng")
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
naptan_bus_stops = naptan_df.filter(pl.col("category") == "Bus stop")
|
||||
covered_bus_ids = osm_stops_near_naptan(osm_bus_stops, naptan_bus_stops)
|
||||
kept_osm = osm_bus_stops.height - len(covered_bus_ids)
|
||||
print(
|
||||
f"OSM bus platforms: {osm_bus_stops.height:,} total, dropping "
|
||||
f"{len(covered_bus_ids):,} that duplicate a NaPTAN stop, keeping "
|
||||
f"{kept_osm:,} to fill NaPTAN gaps"
|
||||
)
|
||||
if covered_bus_ids:
|
||||
lf = lf.filter(
|
||||
~(
|
||||
(pl.col("group") == "Public Transport")
|
||||
& (pl.col("category") == "Bus stop")
|
||||
& pl.col("id").is_in(covered_bus_ids)
|
||||
)
|
||||
)
|
||||
|
||||
frames = [
|
||||
lf,
|
||||
naptan,
|
||||
grocery_pois.lazy(),
|
||||
transform_gias_schools(gias_path, ofsted_path),
|
||||
transform_gias_schools(gias_path, ofsted_path, boundary_path),
|
||||
]
|
||||
|
||||
return pl.concat(frames, how="diagonal_relaxed")
|
||||
|
|
|
|||
|
|
@ -10,6 +10,26 @@ EARTH_RADIUS_KM = 6371.0088
|
|||
KM_PER_DEGREE_LAT = 111.32
|
||||
DEFAULT_GRID_SIZE_DEGREES = 0.02
|
||||
|
||||
# Generous GB/UK bounding box. The ArcGIS postcode source stores grid-less
# postcodes with a placeholder coordinate (lat=99.999999, lon=0.0); these are
# finite, so an isfinite() check alone lets them through and produces absurd
# ~5,000 km "nearest amenity" distances. Reject anything outside this box so
# such postcodes get NaN distance / zero counts instead of a fabricated value.
UK_LAT_MIN, UK_LAT_MAX = 49.0, 61.5
UK_LON_MIN, UK_LON_MAX = -9.0, 2.5


def valid_uk_coords_mask(lats: np.ndarray, lons: np.ndarray) -> np.ndarray:
    """Return a boolean mask of coordinates that are finite AND inside the UK bbox.

    Args:
        lats: Latitudes in degrees. Any array-like is accepted; ``None``
            entries are treated as missing (NaN).
        lons: Longitudes in degrees, same shape as ``lats``.

    Returns:
        Boolean array, ``True`` where both values are finite and lie within
        ``[UK_LAT_MIN, UK_LAT_MAX]`` x ``[UK_LON_MIN, UK_LON_MAX]`` (bounds
        inclusive).
    """
    # Coerce up front so lists and object arrays (e.g. containing None) work;
    # float ndarrays pass through unchanged.
    lats = np.asarray(lats, dtype=float)
    lons = np.asarray(lons, dtype=float)
    return (
        np.isfinite(lats)
        & np.isfinite(lons)
        & (lats >= UK_LAT_MIN)
        & (lats <= UK_LAT_MAX)
        & (lons >= UK_LON_MIN)
        & (lons <= UK_LON_MAX)
    )
|
||||
|
||||
|
||||
def _build_poi_grid(
|
||||
pois: pl.DataFrame, grid_size: float = 0.05
|
||||
|
|
@ -43,7 +63,12 @@ def _get_nearby_indices(
|
|||
grid_size: float = DEFAULT_GRID_SIZE_DEGREES,
|
||||
) -> np.ndarray | None:
|
||||
"""Get POI indices from all grid cells intersecting the radius bounding box."""
|
||||
if not np.isfinite(pc_lat) or not np.isfinite(pc_lon):
|
||||
if (
|
||||
not np.isfinite(pc_lat)
|
||||
or not np.isfinite(pc_lon)
|
||||
or not (UK_LAT_MIN <= pc_lat <= UK_LAT_MAX)
|
||||
or not (UK_LON_MIN <= pc_lon <= UK_LON_MAX)
|
||||
):
|
||||
return None
|
||||
|
||||
lat_delta = radius_km / KM_PER_DEGREE_LAT
|
||||
|
|
@ -182,7 +207,7 @@ def min_distance_per_postcode(
|
|||
pc_lats = postcodes_df["lat"].to_numpy()
|
||||
pc_lons = postcodes_df["lon"].to_numpy()
|
||||
pc_codes = postcodes_df["postcode"].to_list()
|
||||
valid_pc_mask = np.isfinite(pc_lats) & np.isfinite(pc_lons)
|
||||
valid_pc_mask = valid_uk_coords_mask(pc_lats, pc_lons)
|
||||
valid_pc_indices = np.flatnonzero(valid_pc_mask)
|
||||
|
||||
result_min_dist = {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue