perfect-postcode/pipeline/transform/school_proximity.py
2026-05-04 16:19:09 +01:00

97 lines
3.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Compute Ofsted-rated school proximity counts per postcode."""
import argparse
from pathlib import Path
import polars as pl
from pipeline.utils.poi_counts import count_pois_per_postcode
# Maps each output count group (key) to the school ``category`` labels that
# contribute to it. The "good" groups are really "good or better": they list
# both the good_* and outstanding_* labels, whereas the "outstanding" groups
# list the outstanding_* label only.
SCHOOL_GROUPS = {
    "good_primary": [
        "good_primary",
        "outstanding_primary",
    ],
    "good_secondary": [
        "good_secondary",
        "outstanding_secondary",
    ],
    "outstanding_primary": [
        "outstanding_primary",
    ],
    "outstanding_secondary": [
        "outstanding_secondary",
    ],
}
def main() -> None:
    """Count good+ and outstanding schools near every postcode and save them.

    Command-line arguments (all required):
        --ofsted: Ofsted inspection parquet. Columns read: "Ofsted phase",
            "Latest OEIF overall effectiveness" and "Postcode".
        --arcgis: ArcGIS postcode parquet. Columns read: "pcds", "lat", "long".
        --output: Destination parquet path. Missing parent directories are
            created before writing.

    The script keeps primary/secondary schools graded "1" or "2", labels each
    with a category, attaches coordinates by joining on postcode, runs
    ``count_pois_per_postcode`` for 5 km and 2 km radii, joins the two count
    tables on ``postcode`` and writes the result.
    """
    parser = argparse.ArgumentParser(
        description="Count good+ and outstanding primary/secondary schools near each postcode"
    )
    parser.add_argument(
        "--ofsted", type=Path, required=True, help="Ofsted inspection parquet"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet path"
    )
    args = parser.parse_args()

    # Post-2025 reform the single "Overall effectiveness" grade was retired;
    # the legacy 1-4 scale is now carried forward under "Latest OEIF overall
    # effectiveness" (OEIF = the previous Ofsted Education Inspection
    # Framework). The new report-card columns use text judgements instead.
    # Grades are compared as strings, matching the original filter.
    phase = pl.col("Ofsted phase")
    grade = pl.col("Latest OEIF overall effectiveness")
    is_primary = phase == "Primary"
    is_outstanding = grade == "1"

    # Keep good+ (grade 1 or 2) primary/secondary schools only.
    ofsted = pl.read_parquet(args.ofsted).filter(
        phase.is_in(["Primary", "Secondary"]) & grade.is_in(["1", "2"])
    )
    print(f"Good+ schools: {len(ofsted):,}")
    print(f"Outstanding schools: {ofsted.filter(is_outstanding).height:,}")

    # Label each school by phase and rating. Anything that is not "Primary"
    # is labelled secondary, which is safe because the filter above admits
    # only those two phases. SCHOOL_GROUPS then folds the outstanding_*
    # labels into the good+ groups as well.
    category = (
        pl.when(is_primary & is_outstanding)
        .then(pl.lit("outstanding_primary"))
        .when(is_primary)
        .then(pl.lit("good_primary"))
        .when(is_outstanding)
        .then(pl.lit("outstanding_secondary"))
        .otherwise(pl.lit("good_secondary"))
        .alias("category")
    )
    ofsted = ofsted.select(pl.col("Postcode").alias("postcode"), category)

    # Join with arcgis to get lat/lng for each school's postcode. The inner
    # join drops schools whose postcode has no exact match in the lookup.
    arcgis = pl.read_parquet(args.arcgis).select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lng"),
    )
    schools = ofsted.join(arcgis, on="postcode", how="inner")
    print(f"Schools with coordinates: {len(schools):,}")

    # Every postcode in the lookup is a counting origin.
    # NOTE(review): the postcode frame exposes `lon` while the school frame
    # keeps `lng`; presumably that is what count_pois_per_postcode expects -
    # confirm against the helper.
    postcodes = arcgis.rename({"lng": "lon"})

    counts_5km = count_pois_per_postcode(
        postcodes, schools, radius_km=5, groups=SCHOOL_GROUPS
    )
    counts_2km = count_pois_per_postcode(
        postcodes, schools, radius_km=2, groups=SCHOOL_GROUPS
    )
    # NOTE(review): assumes the helper names its count columns per radius;
    # if both frames share column names the join will suffix the 2 km ones -
    # confirm against the helper.
    result = counts_5km.join(counts_2km, on="postcode")

    # Create the destination directory up front so a missing folder does not
    # discard the computed result at the very last step.
    args.output.parent.mkdir(parents=True, exist_ok=True)
    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB)")
# Script entry point: run the pipeline step only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()