This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@ -1289,22 +1289,27 @@ def transform_grocery_retail_points(
)
df = df.filter(pl.Series(mask))
eligible_retailers = (
df.group_by("retailer")
# Normalise to the display brand FIRST so the ~16 Co-op society retailer
# names pool into one "Co-op" before the chain-eligibility cutoff; otherwise
# small societies (<MIN_GROCERY_CHAIN_LOCATIONS stores each) get dropped.
df = df.with_columns(
pl.col("retailer")
.map_elements(normalize_grocery_retailer, return_dtype=pl.String)
.alias("category")
)
eligible_categories = (
df.group_by("category")
.len()
.filter(pl.col("len") >= min_chain_locations)
.select("retailer")
.select("category")
)
df = df.join(eligible_retailers, on="retailer", how="semi")
df = df.join(eligible_categories, on="category", how="semi")
return df.with_columns(
pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
pl.coalesce(["store_name", "fascia", "retailer"])
.str.replace_all("''", "'")
.alias("name"),
pl.col("retailer")
.map_elements(normalize_grocery_retailer, return_dtype=pl.String)
.alias("category"),
pl.struct(["fascia", "retailer"])
.map_elements(
lambda row: normalize_grocery_icon_category(row["fascia"], row["retailer"]),
@ -1338,10 +1343,16 @@ def _school_icon_category_expr() -> pl.Expr:
# GIAS phase mixes casing ("Middle deemed Primary" vs "Middle deemed
# primary") so we normalise before matching.
phase = pl.col("phase").str.to_lowercase()
# age_range is "<min>—<max>" using an em-dash; both ends may be missing.
age_parts = pl.col("age_range").str.split_exact("", 1)
min_age = age_parts.struct.field("field_0").cast(pl.Int32, strict=False)
max_age = age_parts.struct.field("field_1").cast(pl.Int32, strict=False)
# gias._format_age_range emits three shapes: "<low>—<high>" (em-dash),
# "up to <high>" (high-only) and "<low>+" (low-only). Extract the leading
# integer as low and the trailing integer as high, then suppress the wrong
# end for the one-sided shapes so they don't collapse to a single bound.
age = pl.col("age_range")
leading = age.str.extract(r"^\s*(\d+)", 1).cast(pl.Int32, strict=False)
trailing = age.str.extract(r"(\d+)\s*$", 1).cast(pl.Int32, strict=False)
# "up to N": no low bound; "N+": no high bound.
min_age = pl.when(age.str.starts_with("up to")).then(None).otherwise(leading)
max_age = pl.when(age.str.ends_with("+")).then(None).otherwise(trailing)
return (
pl.when(pl.col("type_group") == "Universities")
.then(pl.lit("University"))
@ -1386,9 +1397,16 @@ OFSTED_OEIF_LABELS = {
def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
"""Project the latest OEIF effectiveness grade to a human-readable label,
keyed by URN so it can be joined onto the GIAS register. Grades 1-4 map to
the conventional Ofsted labels; "Not judged" (post-2025 reform schools that
only have a report card) is preserved verbatim; null grades drop out."""
the conventional Ofsted labels; when there is no usable graded result
(null/"Not judged", e.g. schools last seen under the post-2024 ungraded
report-card framework) we fall back to "Ungraded inspection overall outcome"
so genuinely good/outstanding schools aren't dropped — mirroring
school_proximity.classify_good_plus_schools. Remaining nulls drop out."""
grade_col = pl.col("Latest OEIF overall effectiveness")
# See school_proximity: the ungraded outcome carries "School remains Good"/
# "School remains Outstanding" (with optional "(Concerns)"/"(Improving)"
# suffixes) when the graded column is null/"Not judged".
ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
label = (
pl.when(grade_col == "1")
.then(pl.lit(OFSTED_OEIF_LABELS["1"]))
@ -1398,6 +1416,10 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
.then(pl.lit(OFSTED_OEIF_LABELS["3"]))
.when(grade_col == "4")
.then(pl.lit(OFSTED_OEIF_LABELS["4"]))
.when(ungraded.str.starts_with("School remains Outstanding"))
.then(pl.lit(OFSTED_OEIF_LABELS["1"]))
.when(ungraded.str.starts_with("School remains Good"))
.then(pl.lit(OFSTED_OEIF_LABELS["2"]))
.when(grade_col == "Not judged")
.then(pl.lit("Not judged"))
.otherwise(None)