# Origin: pipeline/transform/school_proximity.py, added by commit
# 6a42b81a2acd4dd9415432eb3c05f3a7e951bbb8 ("Add school proximity").
"""Compute good-rated school proximity counts per postcode."""

import argparse
from pathlib import Path

import polars as pl

from pipeline.utils.poi_counts import _count_pois_per_postcode

# Passed as ``groups=`` to ``_count_pois_per_postcode``. Keys and values use
# the same labels that ``main`` writes into the schools' ``category`` column.
SCHOOL_GROUPS = {
    "good_primary": ["good_primary"],
    "good_secondary": ["good_secondary"],
}

# Search radius handed to ``_count_pois_per_postcode`` when ``--radius-km`` is
# not given. Kept as an int so the default call is identical to the original
# hard-coded ``radius_km=5``.
# NOTE(review): the original CLI description advertised "within 2km" while the
# code used 5 km. The runtime value is preserved here and the help text now
# reports it; confirm which radius was actually intended.
DEFAULT_RADIUS_KM = 5


def main() -> None:
    """Count good+ primary/secondary schools around every postcode.

    Steps, all driven by command-line arguments:

    1. Read the Ofsted parquet and keep rows whose "Ofsted phase" is
       "Primary" or "Secondary" and whose "Overall effectiveness" is
       "1" or "2".
    2. Label each remaining school ``good_primary`` or ``good_secondary``.
    3. Inner-join the schools to the ArcGIS postcode table on postcode to
       obtain coordinates; schools without a matching postcode are dropped.
    4. Pass all postcodes and the located schools to
       ``_count_pois_per_postcode`` and write its result to ``--output``.

    Progress counts and the output file size are printed to stdout.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Count good+ primary/secondary schools within a radius "
            f"(default {DEFAULT_RADIUS_KM}km) per postcode"
        )
    )
    parser.add_argument(
        "--ofsted", type=Path, required=True, help="Ofsted inspection parquet"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet path"
    )
    parser.add_argument(
        "--radius-km",
        type=float,
        default=DEFAULT_RADIUS_KM,
        help=f"Search radius in km (default: {DEFAULT_RADIUS_KM})",
    )
    args = parser.parse_args()

    # Load Ofsted data: filter to good+ (1, 2) primary/secondary schools.
    # The ratings are compared as strings, exactly as in the original code.
    ofsted = pl.read_parquet(args.ofsted).filter(
        pl.col("Ofsted phase").is_in(["Primary", "Secondary"])
        & pl.col("Overall effectiveness").is_in(["1", "2"])
    )

    print(f"Good+ schools: {len(ofsted):,}")

    # Assign category based on phase. After the filter above only "Primary"
    # and "Secondary" remain, so the ``otherwise`` branch means "Secondary".
    ofsted = ofsted.with_columns(
        pl.when(pl.col("Ofsted phase") == "Primary")
        .then(pl.lit("good_primary"))
        .otherwise(pl.lit("good_secondary"))
        .alias("category")
    ).select(
        pl.col("Postcode").alias("postcode"),
        "category",
    )

    # Join with arcgis to get lat/lng for each school's postcode
    arcgis = pl.read_parquet(args.arcgis).select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lng"),
    )

    # Inner join: schools whose postcode is absent from the ArcGIS table are
    # dropped; the count printed below makes that loss visible.
    schools = ofsted.join(arcgis, on="postcode", how="inner")
    print(f"Schools with coordinates: {len(schools):,}")

    # Load all postcodes for proximity counting. The postcode frame uses
    # "lon" while the schools frame keeps "lng", as in the original code.
    postcodes = arcgis.rename({"lng": "lon"})

    result = _count_pois_per_postcode(
        postcodes, schools, radius_km=args.radius_km, groups=SCHOOL_GROUPS
    )

    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB)")


if __name__ == "__main__":
    main()