diff --git a/Makefile.data b/Makefile.data index 21d7b6e..4471f33 100644 --- a/Makefile.data +++ b/Makefile.data @@ -37,8 +37,6 @@ NAPTAN := $(DATA_DIR)/naptan.parquet BROADBAND := $(DATA_DIR)/broadband.parquet SCHOOL_PROX := $(DATA_DIR)/school_proximity.parquet RENTAL := $(DATA_DIR)/rental_prices.parquet -GEOSURE_DIR := $(DATA_DIR)/geosure -GEOSURE := $(DATA_DIR)/geosure.parquet INSPIRE_DIR := $(DATA_DIR)/inspire OA_BOUNDARIES := $(DATA_DIR)/oa_boundaries.gpkg UPRN_LOOKUP := $(DATA_DIR)/uprn_lookup.parquet @@ -55,7 +53,6 @@ ENGLAND_BOUNDARY := $(DATA_DIR)/england_boundary.geojson RM_OUTCODES := frontend/src/lib/rightmove-outcodes.json # Sentinel files for directory targets (Make doesn't track directories well) -GEOSURE_STAMP := $(GEOSURE_DIR)/.done INSPIRE_STAMP := $(INSPIRE_DIR)/.done PMTILES_VERSION := 1.22.3 diff --git a/pipeline/transform/merge.py b/pipeline/transform/merge.py index b691760..a8d4f4b 100644 --- a/pipeline/transform/merge.py +++ b/pipeline/transform/merge.py @@ -58,14 +58,8 @@ _AREA_COLUMNS = [ # Schools "Good+ primary schools within 5km", "Good+ secondary schools within 5km", - # GeoSure - "Environmental risk", - "Collapsible deposits risk", - "Compressible ground risk", - "Landslide risk", - "Running sand risk", - "Shrink-swell risk", - "Soluble rocks risk", + # Demographics + "Median age", ] diff --git a/pipeline/transform/transform_geosure.py b/pipeline/transform/transform_geosure.py deleted file mode 100644 index 07e21e0..0000000 --- a/pipeline/transform/transform_geosure.py +++ /dev/null @@ -1,130 +0,0 @@ -"""Spatial-join GeoSure 5km hex grid risk data to postcode centroids. - -Reads six ESRI Shapefiles (one per hazard type), converts polygons from -BNG to WGS84, and queries postcode centroids against them using an STRtree. -Outputs a postcode-level parquet with six risk columns plus an overall max. - -Source: Ordnance Survey GeoSure (Open Data) -""" - -import argparse -from pathlib import Path - -import numpy as np -import polars as pl -import shapefile as shp -from pyproj import Transformer -from shapely import STRtree, points -from shapely.geometry import shape as to_shapely - -_GEOSURE_RISKS = [ - ("CollapsibleDeposits", "collapsible_deposits_risk"), - ("CompressibleGround", "compressible_ground_risk"), - ("Landslides", "landslide_risk"), - ("RunningSand", "running_sand_risk"), - ("ShrinkSwell", "shrink_swell_risk"), - ("SolubleRocks", "soluble_rocks_risk"), -] - - -def transform(geosure_dir: Path, arcgis_path: Path) -> pl.DataFrame: - """Spatial-join GeoSure 5km hex grid risk data to postcode centroids.""" - to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True) - - print("Loading postcode centroids for GeoSure join...") - postcodes_df = pl.read_parquet(arcgis_path, columns=["pcds", "lat", "long"]) - lats = postcodes_df["lat"].to_numpy() - lons = postcodes_df["long"].to_numpy() - pts = points(np.column_stack([lons, lats])) - - result = pl.DataFrame({"postcode": postcodes_df["pcds"]}) - risk_cols = [] - - for risk_key, col_name in _GEOSURE_RISKS: - shp_files = list(geosure_dir.rglob(f"*_GS_{risk_key}_*.shp")) - if not shp_files: - print(f" Warning: No shapefile found for {risk_key}") - result = result.with_columns(pl.lit(None).cast(pl.UInt8).alias(col_name)) - risk_cols.append(col_name) - continue - - print(f" Reading {shp_files[0].name}...") - reader = shp.Reader(str(shp_files[0])) - - hex_polys = [] - hex_classes = [] - for sr in reader.shapeRecords(): - geo = sr.shape.__geo_interface__ - new_coords = [] - for ring in geo["coordinates"]: - xs, ys = zip(*ring) - t_lons, t_lats = to_wgs84.transform(list(xs), list(ys)) - new_coords.append(list(zip(t_lons, t_lats))) - geo["coordinates"] = new_coords - hex_polys.append(to_shapely(geo)) - hex_classes.append(int(sr.record["CLASS"])) - - classes_arr = np.array(hex_classes, dtype=np.uint8) - - print(f" Querying {len(pts)} postcodes against {len(hex_polys)} hexagons...") - tree = STRtree(hex_polys) - pt_idx, hex_idx = tree.query(pts, predicate="intersects") - - risk_values = np.zeros(len(pts), dtype=np.uint8) - np.maximum.at(risk_values, pt_idx, classes_arr[hex_idx]) - - result = result.with_columns(pl.Series(col_name, risk_values)) - risk_cols.append(col_name) - - # Overall environmental risk = max across all 6 - result = result.with_columns( - pl.max_horizontal(*risk_cols).alias("environmental_risk") - ) - - # Convert 0 → null, 1/2/3 → Low/Moderate/Significant - label_map = {1: "Low", 2: "Moderate", 3: "Significant"} - for col in risk_cols + ["environmental_risk"]: - result = result.with_columns( - pl.col(col) - .replace_strict(label_map, default=None, return_dtype=pl.Utf8) - .alias(col) - ) - - print(f" GeoSure join complete: {result.height} postcodes") - return result - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Spatial-join GeoSure ground stability data to postcode centroids" - ) - parser.add_argument( - "--geosure", - type=Path, - required=True, - help="GeoSure shapefile directory", - ) - parser.add_argument( - "--arcgis", - type=Path, - required=True, - help="ArcGIS postcode data parquet (for lat/lon coordinates)", - ) - parser.add_argument( - "--output", - type=Path, - required=True, - help="Output parquet file path", - ) - args = parser.parse_args() - - result = transform(args.geosure, args.arcgis) - - args.output.parent.mkdir(parents=True, exist_ok=True) - result.write_parquet(args.output, compression="zstd") - size_mb = args.output.stat().st_size / (1024 * 1024) - print(f"Wrote {args.output} ({size_mb:.1f} MB)") - - -if __name__ == "__main__": - main()