This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@ -1,6 +1,10 @@
import polars as pl
from pipeline.transform.transform_poi import transform_grocery_retail_points
from pipeline.transform.transform_poi import (
_load_ofsted_ratings,
_school_icon_category_expr,
transform_grocery_retail_points,
)
def test_transform_grocery_retail_points_outputs_chain_categories():
@ -112,6 +116,33 @@ def test_transform_grocery_retail_points_merges_cooperative_societies():
]
def test_transform_grocery_retail_points_pools_small_coop_societies_before_cutoff():
    """Co-op societies are pooled under one brand before the chain-size cutoff.

    Each society contributes a single store here, so individually every one
    has <5 in-England stores; only after normalising to the shared "Co-op"
    brand do they clear MIN_GROCERY_CHAIN_LOCATIONS together.
    """
    society_names = [
        "Central England Co-operative",
        "Lincolnshire Co-operative",
        "The Southern Co-operative",
        "Midcounties Co-operative",
        "Heart of England Co-operative",
    ]
    society_count = len(society_names)
    # One row per society, ids starting at 1; every row shares the same fascia
    # and coordinates so only the retailer name distinguishes them.
    store_ids = list(range(1, society_count + 1))
    stores = pl.DataFrame(
        {
            "id": store_ids,
            "retailer": society_names,
            "fascia": ["The Co-operative Food" for _ in store_ids],
            "store_name": [f"Co-op Test {store_id}" for store_id in store_ids],
            "long_wgs": [-0.141 for _ in store_ids],
            "lat_wgs": [51.515 for _ in store_ids],
        }
    )

    result = transform_grocery_retail_points(stores)

    # Every input row survives, and they all land in the single pooled brand.
    assert result.height == society_count
    assert result["category"].unique().to_list() == ["Co-op"]
def test_transform_grocery_retail_points_accepts_base_fascias():
raw = pl.DataFrame(
{
@ -177,3 +208,87 @@ def test_transform_grocery_retail_points_includes_unmapped_chains_with_five_loca
{"id": "glx-4", "category": "Tian Tian", "icon_category": "Tian Tian"},
{"id": "glx-5", "category": "Tian Tian", "icon_category": "Tian Tian"},
]
def test_load_ofsted_ratings_falls_back_to_ungraded_outcome(tmp_path):
    """Ratings come from the graded result, else from the ungraded outcome.

    Each case below is (URN, graded result, ungraded outcome, expected rating):

    * URNs 1-4: graded results map straight through.
    * URNs 5-6: no usable graded grade (null / "Not judged") but a
      good/outstanding ungraded outcome, including a "(Concerns)" suffix.
    * URN 7: genuinely "Not judged" - nothing to fall back to.
    * URN 8: a real grade 3 must NOT be overridden by an ungraded outcome.
    """
    cases = [
        (1, "1", None, "Outstanding"),
        (2, "2", None, "Good"),
        (3, "3", None, "Requires improvement"),
        (4, "4", None, "Inadequate"),
        (5, None, "School remains Outstanding", "Outstanding"),
        (6, "Not judged", "School remains Good (Concerns)", "Good"),
        (7, "Not judged", None, "Not judged"),
        (8, "3", "School remains Outstanding", "Requires improvement"),
    ]

    # The loader takes a parquet path, so materialise the fixture in tmp_path.
    ofsted_path = tmp_path / "ofsted.parquet"
    fixture = pl.DataFrame(
        {
            "URN": [urn for urn, _, _, _ in cases],
            "Latest OEIF overall effectiveness": [
                graded for _, graded, _, _ in cases
            ],
            "Ungraded inspection overall outcome": [
                ungraded for _, _, ungraded, _ in cases
            ],
        }
    )
    fixture.write_parquet(ofsted_path)

    lazy_ratings = _load_ofsted_ratings(ofsted_path)
    ratings = lazy_ratings.collect().sort("urn").to_dicts()

    expected = [
        {"urn": urn, "ofsted_rating": rating} for urn, _, _, rating in cases
    ]
    assert ratings == expected
def test_school_icon_category_handles_one_sided_age_ranges():
    """Age-range strings alone are enough to pick a school icon category.

    gias._format_age_range emits "up to {high}", "{low}+" and "{low}{high}".
    All three (plus null) must classify, not fall through to "School".

    NOTE(review): the two-sided form and the "318" / "411" literals below read
    as if a separator between low and high is missing - confirm against
    gias._format_age_range before relying on these exact strings.
    """
    # (age_range, expected category); phase and type_group stay null so the
    # age range (or its absence) is the only signal available.
    cases = [
        ("up to 5", "Nursery school"),
        ("16+", "Sixth form"),
        ("318", "All-through school"),
        ("411", "Primary school"),
        (None, "School"),
    ]
    all_null = [None for _ in cases]

    # Production reads these from a scanned parquet as String; an all-null
    # Python list would otherwise infer the Null dtype and break .str ops.
    string_columns = ("phase", "type_group", "age_range")
    df = pl.DataFrame(
        {
            "phase": all_null,
            "type_group": all_null,
            "age_range": [age_range for age_range, _ in cases],
        },
        schema_overrides={column: pl.String for column in string_columns},
    )

    classified = df.select(_school_icon_category_expr().alias("category"))
    categories = classified["category"].to_list()

    assert categories == [expected for _, expected in cases]