idk

2026-06-02 13:46:18 +01:00 · 2026-06-02 13:46:18 +01:00 · d43da9708c
commit d43da9708c
parent a04ac2d857
47 changed files with 4120 additions and 573 deletions
--- a/pipeline/transform/test_transform_poi.py
+++ b/pipeline/transform/test_transform_poi.py
@ -1,6 +1,10 @@
 import polars as pl

-from pipeline.transform.transform_poi import transform_grocery_retail_points
+from pipeline.transform.transform_poi import (
+    _load_ofsted_ratings,
+    _school_icon_category_expr,
+    transform_grocery_retail_points,
+)


 def test_transform_grocery_retail_points_outputs_chain_categories():
@ -112,6 +116,33 @@ def test_transform_grocery_retail_points_merges_cooperative_societies():
    ]


+def test_transform_grocery_retail_points_pools_small_coop_societies_before_cutoff():
+    # One in-England store row per society (<5 each); only after normalising to the
+    # shared "Co-op" brand do they clear MIN_GROCERY_CHAIN_LOCATIONS together.
+    societies = [
+        "Central England Co-operative",
+        "Lincolnshire Co-operative",
+        "The Southern Co-operative",
+        "Midcounties Co-operative",
+        "Heart of England Co-operative",
+    ]
+    raw = pl.DataFrame(
+        {
+            "id": list(range(1, len(societies) + 1)),
+            "retailer": societies,
+            "fascia": ["The Co-operative Food"] * len(societies),
+            "store_name": [f"Co-op Test {i}" for i in range(1, len(societies) + 1)],
+            "long_wgs": [-0.141] * len(societies),
+            "lat_wgs": [51.515] * len(societies),
+        }
+    )
+
+    pois = transform_grocery_retail_points(raw)
+
+    assert pois.height == len(societies)
+    assert pois["category"].unique().to_list() == ["Co-op"]
+
+
 def test_transform_grocery_retail_points_accepts_base_fascias():
    raw = pl.DataFrame(
        {
@ -177,3 +208,87 @@ def test_transform_grocery_retail_points_includes_unmapped_chains_with_five_loca
        {"id": "glx-4", "category": "Tian Tian", "icon_category": "Tian Tian"},
        {"id": "glx-5", "category": "Tian Tian", "icon_category": "Tian Tian"},
    ]
+
+
+def test_load_ofsted_ratings_falls_back_to_ungraded_outcome(tmp_path):
+    # URNs 1-4: grades "1"-"4" map straight to rating names. URNs 5-6: no usable
+    # grade (null/"Not judged") but a "School remains ..." ungraded outcome; only
+    # the "(Concerns)" suffix is covered here. URN 7: stays "Not judged".
+    # URN 8: a real grade 3 must NOT be overridden by an ungraded outcome.
+    ofsted_path = tmp_path / "ofsted.parquet"
+    pl.DataFrame(
+        {
+            "URN": [1, 2, 3, 4, 5, 6, 7, 8],
+            "Latest OEIF overall effectiveness": [
+                "1",
+                "2",
+                "3",
+                "4",
+                None,
+                "Not judged",
+                "Not judged",
+                "3",
+            ],
+            "Ungraded inspection overall outcome": [
+                None,
+                None,
+                None,
+                None,
+                "School remains Outstanding",
+                "School remains Good (Concerns)",
+                None,
+                "School remains Outstanding",
+            ],
+        }
+    ).write_parquet(ofsted_path)
+
+    ratings = (
+        _load_ofsted_ratings(ofsted_path)
+        .collect()
+        .sort("urn")
+        .to_dicts()
+    )
+
+    assert ratings == [
+        {"urn": 1, "ofsted_rating": "Outstanding"},
+        {"urn": 2, "ofsted_rating": "Good"},
+        {"urn": 3, "ofsted_rating": "Requires improvement"},
+        {"urn": 4, "ofsted_rating": "Inadequate"},
+        {"urn": 5, "ofsted_rating": "Outstanding"},
+        {"urn": 6, "ofsted_rating": "Good"},
+        {"urn": 7, "ofsted_rating": "Not judged"},
+        {"urn": 8, "ofsted_rating": "Requires improvement"},
+    ]
+
+
+def test_school_icon_category_handles_one_sided_age_ranges():
+    # gias._format_age_range emits "up to {high}", "{low}+" and "{low}–{high}".
+    # Each shape must classify; only a null age_range (null phase) falls to "School".
+    df = pl.DataFrame(
+        {
+            "phase": [None, None, None, None, None],
+            "type_group": [None, None, None, None, None],
+            # "up to 5" -> nursery; "16+" -> sixth form; "3–18" -> all-through;
+            # "4–11" -> primary; null age_range with null phase -> "School".
+            "age_range": ["up to 5", "16+", "3–18", "4–11", None],
+        },
+        # Production reads these from a scanned parquet as String; an all-null
+        # Python list would otherwise infer the Null dtype and break .str ops.
+        schema_overrides={
+            "phase": pl.String,
+            "type_group": pl.String,
+            "age_range": pl.String,
+        },
+    )
+
+    categories = df.select(
+        _school_icon_category_expr().alias("category")
+    )["category"].to_list()
+
+    assert categories == [
+        "Nursery school",
+        "Sixth form",
+        "All-through school",
+        "Primary school",
+        "School",
+    ]