This commit is contained in:
Andras Schmelczer 2026-05-26 19:45:13 +01:00
parent c645b0f1d4
commit 39ef5c6646
79 changed files with 5660 additions and 2199 deletions

View file

@ -1316,17 +1316,122 @@ def transform_grocery_retail_points(
).select("id", "name", "category", "icon_category", "group", "lat", "lng", "emoji")
def transform_gias_schools(gias_path: Path) -> pl.LazyFrame:
"""Convert the GIAS register parquet into POI rows with school metadata."""
return pl.scan_parquet(gias_path).select(
# Emoji shown for each school icon category. The keys are exactly the category
# labels that _school_icon_category_expr can emit; transform_gias_schools maps
# a row's category through this table with replace_strict to pick its emoji.
# NOTE(review): replace_strict is called without a default, so a label missing
# from this table is expected to fail the pipeline rather than yield a null
# emoji — keep the two in sync.
SCHOOL_ICON_CATEGORIES: dict[str, str] = {
"Nursery school": "🧸",
"Primary school": "🎒",
"Secondary school": "🏫",
"All-through school": "🏫",
"Sixth form": "📚",
"Further education college": "📚",
"University": "🎓",
"Special school": "🤝",
"School": "🏫",
}
def _school_icon_category_expr() -> pl.Expr:
    """Build the expression that picks an icon category for each GIAS row.

    The category is derived from the ``type_group``, ``phase`` and
    ``age_range`` columns. The first matching rule wins:

    1. ``type_group`` decides universities, special schools and FE colleges,
       which span multiple phases.
    2. ``phase`` (compared case-insensitively) decides nursery, primary,
       secondary, all-through and sixth-form schools.
    3. For rows whose phase matched none of the above (including independent
       and other non-statutory schools where GIAS leaves phase null), the
       ``age_range`` bounds decide the bucket.
    4. Anything left over becomes the generic "School".

    Returns:
        A string expression whose values are all keys of
        ``SCHOOL_ICON_CATEGORIES``.
    """
    # GIAS phase mixes casing ("Middle deemed Primary" vs "Middle deemed
    # primary") so we normalise before matching.
    phase = pl.col("phase").str.to_lowercase()
    type_group = pl.col("type_group")

    # age_range is "<min><dash><max>"; either bound may be missing. The bounds
    # are pulled out with a regex rather than split on one literal separator,
    # so a hyphen, en-dash or em-dash all parse the same way. An absent bound
    # leaves its capture group unmatched, which extract() reports as null, and
    # the non-strict cast keeps any unexpected text as null instead of raising.
    age_range_pattern = r"^\s*(\d+)?\s*(?:[^\d\s]+\s*(\d+)?)?\s*$"
    age_range = pl.col("age_range")
    min_age = age_range.str.extract(age_range_pattern, 1).cast(pl.Int32, strict=False)
    max_age = age_range.str.extract(age_range_pattern, 2).cast(pl.Int32, strict=False)

    return (
        pl.when(type_group == "Universities")
        .then(pl.lit("University"))
        .when(type_group == "Special schools")
        .then(pl.lit("Special school"))
        .when(type_group == "Colleges")
        .then(pl.lit("Further education college"))
        .when(phase == "nursery")
        .then(pl.lit("Nursery school"))
        .when(phase.is_in(["primary", "middle deemed primary"]))
        .then(pl.lit("Primary school"))
        .when(phase.is_in(["secondary", "middle deemed secondary"]))
        .then(pl.lit("Secondary school"))
        .when(phase == "all-through")
        .then(pl.lit("All-through school"))
        .when(phase.is_in(["16 plus", "sixth form"]))
        .then(pl.lit("Sixth form"))
        # Age-range fallback for null-phase rows (≈3k Independents + Academies
        # GIAS doesn't classify by phase). A null bound makes its comparison
        # null, which when() treats as "no match", so rows without usable
        # ages fall through to the generic bucket.
        .when(max_age <= 5)
        .then(pl.lit("Nursery school"))
        .when(min_age >= 16)
        .then(pl.lit("Sixth form"))
        .when((min_age <= 6) & (max_age >= 16))
        .then(pl.lit("All-through school"))
        .when(max_age <= 11)
        .then(pl.lit("Primary school"))
        .when(min_age >= 10)
        .then(pl.lit("Secondary school"))
        .otherwise(pl.lit("School"))
    )
# Human-readable labels for the numeric Ofsted OEIF overall-effectiveness
# grades. Keys are strings because _load_ofsted_ratings compares the grade
# column against string values ("1".."4").
OFSTED_OEIF_LABELS: dict[str, str] = {
"1": "Outstanding",
"2": "Good",
"3": "Requires improvement",
"4": "Inadequate",
}
def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
    """Return a lazy ``(urn, ofsted_rating)`` table read from the Ofsted parquet.

    The "Latest OEIF overall effectiveness" grade is translated to a display
    label: grades "1"-"4" use ``OFSTED_OEIF_LABELS``, and "Not judged"
    (post-2025 reform schools that only have a report card) is passed through
    unchanged. Rows with any other grade, including null, are dropped. ``URN``
    is cast to Int64 and exposed as ``urn`` so the result can be joined onto
    the GIAS register.
    """
    raw_grade = pl.col("Latest OEIF overall effectiveness")

    # One when/then arm per numeric grade, in ascending grade order.
    rating = pl.when(raw_grade == "1").then(pl.lit(OFSTED_OEIF_LABELS["1"]))
    for code in ("2", "3", "4"):
        rating = rating.when(raw_grade == code).then(pl.lit(OFSTED_OEIF_LABELS[code]))
    # "Not judged" is kept verbatim; everything else becomes null.
    rating = (
        rating.when(raw_grade == "Not judged")
        .then(pl.lit("Not judged"))
        .otherwise(None)
    )

    ratings = pl.scan_parquet(ofsted_path).select(
        urn=pl.col("URN").cast(pl.Int64),
        ofsted_rating=rating,
    )
    # Keep only schools that ended up with a usable rating.
    return ratings.filter(pl.col("ofsted_rating").is_not_null())
def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
"""Convert the GIAS register parquet into POI rows with school metadata.
Ofsted ratings are joined by URN so each school carries its latest OEIF
overall effectiveness grade (Outstanding/Good/Requires improvement/
Inadequate/Not judged), surfaced in the map popup."""
icon_category_expr = _school_icon_category_expr()
emoji_expr = icon_category_expr.replace_strict(SCHOOL_ICON_CATEGORIES)
ofsted = _load_ofsted_ratings(ofsted_path)
# category mirrors icon_category so the dashboard renders one toggle per
# school type (Nursery / Primary / Secondary / Sixth form / University /…)
# instead of bundling every GIAS row under a single "School" pill.
return pl.scan_parquet(gias_path).join(ofsted, on="urn", how="left").select(
pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
pl.col("name"),
pl.lit("School").alias("category"),
pl.lit("School").alias("icon_category"),
icon_category_expr.alias("category"),
icon_category_expr.alias("icon_category"),
pl.lit("Education").alias("group"),
pl.col("lat").cast(pl.Float64),
pl.col("lng").cast(pl.Float64),
pl.lit("🏫").alias("emoji"),
emoji_expr.alias("emoji"),
pl.col("phase").alias("school_phase"),
pl.col("type").alias("school_type"),
pl.col("type_group").alias("school_type_group"),
@ -1346,6 +1451,7 @@ def transform_gias_schools(gias_path: Path) -> pl.LazyFrame:
pl.col("website").alias("school_website"),
pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
pl.col("head_name").alias("school_head_name"),
pl.col("ofsted_rating").alias("school_ofsted_rating"),
)
@ -1355,6 +1461,7 @@ def transform(
boundary_path: Path,
grocery_retail_points_path: Path,
gias_path: Path,
ofsted_path: Path,
) -> pl.LazyFrame:
lf = pl.scan_parquet(input_path)
@ -1420,7 +1527,12 @@ def transform(
grocery_df = pl.read_parquet(grocery_retail_points_path)
grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
frames = [lf, naptan, grocery_pois.lazy(), transform_gias_schools(gias_path)]
frames = [
lf,
naptan,
grocery_pois.lazy(),
transform_gias_schools(gias_path, ofsted_path),
]
return pl.concat(frames, how="diagonal_relaxed")
@ -1453,6 +1565,12 @@ def main():
required=True,
help="GIAS schools register parquet (replaces OSM schools)",
)
parser.add_argument(
"--ofsted",
type=Path,
required=True,
help="Ofsted latest-inspections parquet (provides per-URN ratings)",
)
parser.add_argument(
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
)
@ -1464,6 +1582,7 @@ def main():
args.boundary,
args.grocery_retail_points,
args.gias,
args.ofsted,
).collect(engine="streaming")
df.write_parquet(args.output)