Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@@ -102,15 +102,11 @@ _AREA_COLUMNS = [
 # is postcode-grain: it belongs in the area output (one value per postcode,
 # covering property-less postcodes too) rather than duplicated per property.
 TREE_DENSITY_FEATURE,
-# Schools
-"Good+ primary schools within 5km",
-"Good+ secondary schools within 5km",
-"Good+ primary schools within 2km",
-"Good+ secondary schools within 2km",
-"Outstanding primary schools within 5km",
-"Outstanding secondary schools within 5km",
-"Outstanding primary schools within 2km",
-"Outstanding secondary schools within 2km",
+# Schools (modelled historical catchment areas covering the postcode)
+"Good+ primary school catchments",
+"Good+ secondary school catchments",
+"Outstanding primary school catchments",
+"Outstanding secondary school catchments",
 # Demographics
 "Median age",
 # Politics
@@ -172,14 +168,10 @@ _FINAL_RENAME_COLUMNS = {
 "latest_price": "Last known price",
 "number_habitable_rooms": "Number of bedrooms & living rooms",
 "noise_lden_db": "Noise (dB)",
-"good_primary_5km": "Good+ primary schools within 5km",
-"good_secondary_5km": "Good+ secondary schools within 5km",
-"good_primary_2km": "Good+ primary schools within 2km",
-"good_secondary_2km": "Good+ secondary schools within 2km",
-"outstanding_primary_5km": "Outstanding primary schools within 5km",
-"outstanding_secondary_5km": "Outstanding secondary schools within 5km",
-"outstanding_primary_2km": "Outstanding primary schools within 2km",
-"outstanding_secondary_2km": "Outstanding secondary schools within 2km",
+"good_primary_catchments": "Good+ primary school catchments",
+"good_secondary_catchments": "Good+ secondary school catchments",
+"outstanding_primary_catchments": "Outstanding primary school catchments",
+"outstanding_secondary_catchments": "Outstanding secondary school catchments",
 "max_download_speed": "Max available download speed (Mbps)",
 "serious_crime_avg_yr": "Serious crime (avg/yr)",
 "minor_crime_avg_yr": "Minor crime (avg/yr)",
@@ -874,7 +866,7 @@ def _join_area_side_tables(
 election: pl.LazyFrame,
 poi_counts: pl.LazyFrame,
 noise: pl.LazyFrame,
-school_proximity: pl.LazyFrame,
+school_catchments: pl.LazyFrame,
 conservation_areas: pl.LazyFrame,
 tree_density: pl.LazyFrame | None,
 broadband: pl.LazyFrame,
@@ -905,7 +897,7 @@ def _join_area_side_tables(
 base = base.join(election, on="pcon", how="left")
 base = base.join(poi_counts, on="postcode", how="left")
 base = base.join(noise, on="postcode", how="left")
-base = base.join(school_proximity, on="postcode", how="left")
+base = base.join(school_catchments, on="postcode", how="left")
 base = base.join(conservation_areas, on="postcode", how="left").with_columns(
 pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
 )
@@ -1970,7 +1962,7 @@ def _build(
 ethnicity_path: Path,
 crime_path: Path,
 noise_path: Path,
-school_proximity_path: Path,
+school_catchments_path: Path,
 broadband_path: Path,
 conservation_areas_path: Path,
 rental_prices_path: Path,
@@ -2080,7 +2072,7 @@ def _build(
 )
 .select("postcode", "noise_lden_db")
 )
-school_proximity = pl.scan_parquet(school_proximity_path)
+school_catchments = pl.scan_parquet(school_catchments_path)
 conservation_areas = _conservation_area_by_postcode(
 arcgis.select("postcode", "lat", "lon"), conservation_areas_path
 )
@@ -2120,7 +2112,7 @@ def _build(
 "election": election,
 "poi_counts": poi_counts,
 "noise": noise,
-"school_proximity": school_proximity,
+"school_catchments": school_catchments,
 "conservation_areas": conservation_areas,
 "tree_density": tree_density,
 "broadband": broadband,
@@ -2267,10 +2259,10 @@ def main():
 "--noise", type=Path, required=True, help="Road noise by postcode parquet file"
 )
 parser.add_argument(
-"--school-proximity",
+"--school-catchments",
 type=Path,
 required=True,
-help="School proximity counts parquet file",
+help="School catchment counts parquet file",
 )
 parser.add_argument(
 "--broadband",
@@ -2376,7 +2368,7 @@ def main():
 ethnicity_path=args.ethnicity,
 crime_path=args.crime,
 noise_path=args.noise,
-school_proximity_path=args.school_proximity,
+school_catchments_path=args.school_catchments,
 broadband_path=args.broadband,
 conservation_areas_path=args.conservation_areas,
 rental_prices_path=args.rental_prices,