idk
This commit is contained in:
parent
a04ac2d857
commit
d43da9708c
47 changed files with 4120 additions and 573 deletions
|
|
@ -1289,22 +1289,27 @@ def transform_grocery_retail_points(
|
|||
)
|
||||
df = df.filter(pl.Series(mask))
|
||||
|
||||
eligible_retailers = (
|
||||
df.group_by("retailer")
|
||||
# Normalise to the display brand FIRST so the ~16 Co-op society retailer
|
||||
# names pool into one "Co-op" before the chain-eligibility cutoff; otherwise
|
||||
# small societies (<MIN_GROCERY_CHAIN_LOCATIONS stores each) get dropped.
|
||||
df = df.with_columns(
|
||||
pl.col("retailer")
|
||||
.map_elements(normalize_grocery_retailer, return_dtype=pl.String)
|
||||
.alias("category")
|
||||
)
|
||||
eligible_categories = (
|
||||
df.group_by("category")
|
||||
.len()
|
||||
.filter(pl.col("len") >= min_chain_locations)
|
||||
.select("retailer")
|
||||
.select("category")
|
||||
)
|
||||
df = df.join(eligible_retailers, on="retailer", how="semi")
|
||||
df = df.join(eligible_categories, on="category", how="semi")
|
||||
|
||||
return df.with_columns(
|
||||
pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
|
||||
pl.coalesce(["store_name", "fascia", "retailer"])
|
||||
.str.replace_all("''", "'")
|
||||
.alias("name"),
|
||||
pl.col("retailer")
|
||||
.map_elements(normalize_grocery_retailer, return_dtype=pl.String)
|
||||
.alias("category"),
|
||||
pl.struct(["fascia", "retailer"])
|
||||
.map_elements(
|
||||
lambda row: normalize_grocery_icon_category(row["fascia"], row["retailer"]),
|
||||
|
|
@ -1338,10 +1343,16 @@ def _school_icon_category_expr() -> pl.Expr:
|
|||
# GIAS phase mixes casing ("Middle deemed Primary" vs "Middle deemed
|
||||
# primary") so we normalise before matching.
|
||||
phase = pl.col("phase").str.to_lowercase()
|
||||
# age_range is "<min>–<max>" using an en dash (U+2013); both ends may be missing.
|
||||
age_parts = pl.col("age_range").str.split_exact("–", 1)
|
||||
min_age = age_parts.struct.field("field_0").cast(pl.Int32, strict=False)
|
||||
max_age = age_parts.struct.field("field_1").cast(pl.Int32, strict=False)
|
||||
# gias._format_age_range emits three shapes: "<low>–<high>" (en dash, U+2013),
|
||||
# "up to <high>" (high-only) and "<low>+" (low-only). Extract the leading
|
||||
# integer as low and the trailing integer as high, then suppress the wrong
|
||||
# end for the one-sided shapes so they don't collapse to a single bound.
|
||||
age = pl.col("age_range")
|
||||
leading = age.str.extract(r"^\s*(\d+)", 1).cast(pl.Int32, strict=False)
|
||||
trailing = age.str.extract(r"(\d+)\s*$", 1).cast(pl.Int32, strict=False)
|
||||
# "up to N": no low bound; "N+": no high bound.
|
||||
min_age = pl.when(age.str.starts_with("up to")).then(None).otherwise(leading)
|
||||
max_age = pl.when(age.str.ends_with("+")).then(None).otherwise(trailing)
|
||||
return (
|
||||
pl.when(pl.col("type_group") == "Universities")
|
||||
.then(pl.lit("University"))
|
||||
|
|
@ -1386,9 +1397,16 @@ OFSTED_OEIF_LABELS = {
|
|||
def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
|
||||
"""Project the latest OEIF effectiveness grade to a human-readable label,
|
||||
keyed by URN so it can be joined onto the GIAS register. Grades 1-4 map to
|
||||
the conventional Ofsted labels; "Not judged" (post-2025 reform schools that
|
||||
only have a report card) is preserved verbatim; null grades drop out."""
|
||||
the conventional Ofsted labels; when there is no usable graded result
|
||||
(null/"Not judged", e.g. schools last seen under the post-2024 ungraded
|
||||
report-card framework) we fall back to "Ungraded inspection overall outcome"
|
||||
so genuinely good/outstanding schools aren't dropped — mirroring
|
||||
school_proximity.classify_good_plus_schools. Remaining nulls drop out."""
|
||||
grade_col = pl.col("Latest OEIF overall effectiveness")
|
||||
# See school_proximity: the ungraded outcome carries "School remains Good"/
|
||||
# "School remains Outstanding" (with optional "(Concerns)"/"(Improving)"
|
||||
# suffixes) when the graded column is null/"Not judged".
|
||||
ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
|
||||
label = (
|
||||
pl.when(grade_col == "1")
|
||||
.then(pl.lit(OFSTED_OEIF_LABELS["1"]))
|
||||
|
|
@ -1398,6 +1416,10 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
|
|||
.then(pl.lit(OFSTED_OEIF_LABELS["3"]))
|
||||
.when(grade_col == "4")
|
||||
.then(pl.lit(OFSTED_OEIF_LABELS["4"]))
|
||||
.when(ungraded.str.starts_with("School remains Outstanding"))
|
||||
.then(pl.lit(OFSTED_OEIF_LABELS["1"]))
|
||||
.when(ungraded.str.starts_with("School remains Good"))
|
||||
.then(pl.lit(OFSTED_OEIF_LABELS["2"]))
|
||||
.when(grade_col == "Not judged")
|
||||
.then(pl.lit("Not judged"))
|
||||
.otherwise(None)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue