Add geosure data
This commit is contained in:
parent
c715475351
commit
c91561d7fe
4 changed files with 216 additions and 1 deletions
|
|
@ -13,7 +13,7 @@ vars:
|
||||||
WIDE_OUTPUT: "{{.DATA_DIR}}/wide.parquet"
|
WIDE_OUTPUT: "{{.DATA_DIR}}/wide.parquet"
|
||||||
EPC: "{{.DATA_DIR}}/certificates.csv"
|
EPC: "{{.DATA_DIR}}/certificates.csv"
|
||||||
JOURNEY_TIMES_BANK: "{{.DATA_DIR}}/journey_times_bank.parquet"
|
JOURNEY_TIMES_BANK: "{{.DATA_DIR}}/journey_times_bank.parquet"
|
||||||
JOURNEY_TIMES_FITZROVIA: "{{.DATA_DIR}}/journey_times_fitzrovia_checkpoint.parquet"
|
JOURNEY_TIMES_FITZROVIA: "{{.DATA_DIR}}/journey_times_fitzrovia.parquet"
|
||||||
ETHNICITY_OUTPUT: "{{.DATA_DIR}}/ethnicity_by_la.parquet"
|
ETHNICITY_OUTPUT: "{{.DATA_DIR}}/ethnicity_by_la.parquet"
|
||||||
CRIME_DIR: "{{.DATA_DIR}}/crime"
|
CRIME_DIR: "{{.DATA_DIR}}/crime"
|
||||||
CRIME_OUTPUT: "{{.DATA_DIR}}/crime_by_lsoa.parquet"
|
CRIME_OUTPUT: "{{.DATA_DIR}}/crime_by_lsoa.parquet"
|
||||||
|
|
@ -23,6 +23,8 @@ vars:
|
||||||
BROADBAND_OUTPUT: "{{.DATA_DIR}}/broadband.parquet"
|
BROADBAND_OUTPUT: "{{.DATA_DIR}}/broadband.parquet"
|
||||||
SCHOOL_PROXIMITY_OUTPUT: "{{.DATA_DIR}}/school_proximity.parquet"
|
SCHOOL_PROXIMITY_OUTPUT: "{{.DATA_DIR}}/school_proximity.parquet"
|
||||||
POSTCODES_OUTPUT: "{{.DATA_DIR}}/postcodes"
|
POSTCODES_OUTPUT: "{{.DATA_DIR}}/postcodes"
|
||||||
|
GEOSURE_OUTPUT: "{{.DATA_DIR}}/geosure"
|
||||||
|
GEOSURE_PARQUET: "{{.DATA_DIR}}/geosure.parquet"
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
download:tiles:
|
download:tiles:
|
||||||
|
|
@ -141,6 +143,13 @@ tasks:
|
||||||
cmds:
|
cmds:
|
||||||
- uv run python -m pipeline.download.postcodes --output {{.POSTCODES_OUTPUT}}
|
- uv run python -m pipeline.download.postcodes --output {{.POSTCODES_OUTPUT}}
|
||||||
|
|
||||||
|
download:geosure:
|
||||||
|
desc: Download OS GeoSure ground stability data (5km hex grid)
|
||||||
|
status:
|
||||||
|
- test -d {{.GEOSURE_OUTPUT}}
|
||||||
|
cmds:
|
||||||
|
- uv run python -m pipeline.download.geosure --output {{.GEOSURE_OUTPUT}}
|
||||||
|
|
||||||
download:noise:
|
download:noise:
|
||||||
desc: Download Defra noise data (road, rail, airport) sampled at postcode centroids
|
desc: Download Defra noise data (road, rail, airport) sampled at postcode centroids
|
||||||
deps:
|
deps:
|
||||||
|
|
@ -197,6 +206,16 @@ tasks:
|
||||||
cmds:
|
cmds:
|
||||||
- uv run python -m pipeline.transform.school_proximity --ofsted {{.OFSTED_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --output {{.SCHOOL_PROXIMITY_OUTPUT}}
|
- uv run python -m pipeline.transform.school_proximity --ofsted {{.OFSTED_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --output {{.SCHOOL_PROXIMITY_OUTPUT}}
|
||||||
|
|
||||||
|
transform:geosure:
|
||||||
|
desc: Spatial-join GeoSure ground stability data to postcode centroids
|
||||||
|
deps:
|
||||||
|
- download:geosure
|
||||||
|
- download:arcgis
|
||||||
|
status:
|
||||||
|
- test -f {{.GEOSURE_PARQUET}}
|
||||||
|
cmds:
|
||||||
|
- uv run python -m pipeline.transform.transform_geosure --geosure {{.GEOSURE_OUTPUT}} --arcgis {{.ARCGIS_OUTPUT}} --output {{.GEOSURE_PARQUET}}
|
||||||
|
|
||||||
download:journey-times:
|
download:journey-times:
|
||||||
desc: "Fetch TfL journey times: task download:journey-times"
|
desc: "Fetch TfL journey times: task download:journey-times"
|
||||||
deps:
|
deps:
|
||||||
|
|
@ -218,6 +237,7 @@ tasks:
|
||||||
# - transform:crime
|
# - transform:crime
|
||||||
# - transform:poi-proximity
|
# - transform:poi-proximity
|
||||||
# - transform:school-proximity
|
# - transform:school-proximity
|
||||||
|
# - transform:geosure
|
||||||
# - prompt:journey-times
|
# - prompt:journey-times
|
||||||
status:
|
status:
|
||||||
- test -f {{.WIDE_OUTPUT}}
|
- test -f {{.WIDE_OUTPUT}}
|
||||||
|
|
@ -235,4 +255,5 @@ tasks:
|
||||||
--noise {{.NOISE_OUTPUT}}
|
--noise {{.NOISE_OUTPUT}}
|
||||||
--school-proximity {{.SCHOOL_PROXIMITY_OUTPUT}}
|
--school-proximity {{.SCHOOL_PROXIMITY_OUTPUT}}
|
||||||
--broadband {{.BROADBAND_OUTPUT}}
|
--broadband {{.BROADBAND_OUTPUT}}
|
||||||
|
--geosure {{.GEOSURE_PARQUET}}
|
||||||
--output {{.WIDE_OUTPUT}}
|
--output {{.WIDE_OUTPUT}}
|
||||||
|
|
|
||||||
44
pipeline/download/geosure.py
Normal file
44
pipeline/download/geosure.py
Normal file
|
|
@ -0,0 +1,44 @@
|
||||||
|
"""Download OS GeoSure ground stability data (5km hex grid).
|
||||||
|
|
||||||
|
Downloads the GB-Hex-5km-GeoSure dataset from Ordnance Survey as an ESRI
|
||||||
|
Shapefile and extracts it.
|
||||||
|
|
||||||
|
Source: https://osdatahub.os.uk/downloads/open/GeoSure
|
||||||
|
License: Open Government Licence v3.0
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pipeline.utils import download, extract_zip
|
||||||
|
|
||||||
|
URL = "https://api.os.uk/downloads/v1/products/GB-Hex-5km-GeoSure/downloads?area=GB&format=ESRI%C2%AE+Shapefile&redirect"
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Download OS GeoSure ground stability data"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output",
|
||||||
|
type=Path,
|
||||||
|
required=True,
|
||||||
|
help="Output directory for extracted shapefile",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory() as cache_dir:
|
||||||
|
zip_path = Path(cache_dir) / "geosure.zip"
|
||||||
|
|
||||||
|
download(URL, zip_path, timeout=300)
|
||||||
|
extract_zip(zip_path, args.output)
|
||||||
|
|
||||||
|
shp_files = list(args.output.rglob("*.shp"))
|
||||||
|
print(f"Extracted {len(shp_files)} shapefiles to {args.output}")
|
||||||
|
for f in shp_files:
|
||||||
|
print(f" {f.relative_to(args.output)}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -41,6 +41,7 @@ def _build_wide(
|
||||||
noise_path: Path,
|
noise_path: Path,
|
||||||
school_proximity_path: Path,
|
school_proximity_path: Path,
|
||||||
broadband_path: Path,
|
broadband_path: Path,
|
||||||
|
geosure_path: Path,
|
||||||
) -> pl.DataFrame:
|
) -> pl.DataFrame:
|
||||||
"""Build the wide dataframe by joining epc_pp with all auxiliary data."""
|
"""Build the wide dataframe by joining epc_pp with all auxiliary data."""
|
||||||
wide = pl.scan_parquet(epc_pp_path)
|
wide = pl.scan_parquet(epc_pp_path)
|
||||||
|
|
@ -136,6 +137,9 @@ def _build_wide(
|
||||||
)
|
)
|
||||||
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
|
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
|
||||||
|
|
||||||
|
geosure = pl.scan_parquet(geosure_path)
|
||||||
|
wide = wide.join(geosure, on="postcode", how="left")
|
||||||
|
|
||||||
# Use built_form (Terraced, Semi-detached) when available, otherwise epc_property_type
|
# Use built_form (Terraced, Semi-detached) when available, otherwise epc_property_type
|
||||||
wide = wide.with_columns(
|
wide = wide.with_columns(
|
||||||
pl.when(pl.col("pp_property_type").is_in(["Terraced", "Semi-Detached"]))
|
pl.when(pl.col("pp_property_type").is_in(["Terraced", "Semi-Detached"]))
|
||||||
|
|
@ -208,6 +212,14 @@ def _build_wide(
|
||||||
"max_download_speed": "Max available download speed (Mbps)",
|
"max_download_speed": "Max available download speed (Mbps)",
|
||||||
"serious_crime_avg_yr": "Serious crime (avg/yr)",
|
"serious_crime_avg_yr": "Serious crime (avg/yr)",
|
||||||
"minor_crime_avg_yr": "Minor crime (avg/yr)",
|
"minor_crime_avg_yr": "Minor crime (avg/yr)",
|
||||||
|
"transaction_year": "Transaction year",
|
||||||
|
"environmental_risk": "Environmental risk",
|
||||||
|
"collapsible_deposits_risk": "Collapsible deposits risk",
|
||||||
|
"compressible_ground_risk": "Compressible ground risk",
|
||||||
|
"landslide_risk": "Landslide risk",
|
||||||
|
"running_sand_risk": "Running sand risk",
|
||||||
|
"shrink_swell_risk": "Shrink-swell risk",
|
||||||
|
"soluble_rocks_risk": "Soluble rocks risk",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
@ -276,6 +288,12 @@ def main():
|
||||||
required=True,
|
required=True,
|
||||||
help="Broadband performance by output area parquet file",
|
help="Broadband performance by output area parquet file",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--geosure",
|
||||||
|
type=Path,
|
||||||
|
required=True,
|
||||||
|
help="GeoSure ground stability parquet file",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--output", type=Path, required=True, help="Output parquet file path"
|
"--output", type=Path, required=True, help="Output parquet file path"
|
||||||
)
|
)
|
||||||
|
|
@ -293,6 +311,7 @@ def main():
|
||||||
noise_path=args.noise,
|
noise_path=args.noise,
|
||||||
school_proximity_path=args.school_proximity,
|
school_proximity_path=args.school_proximity,
|
||||||
broadband_path=args.broadband,
|
broadband_path=args.broadband,
|
||||||
|
geosure_path=args.geosure,
|
||||||
)
|
)
|
||||||
|
|
||||||
print(f"Columns: {wide.columns}")
|
print(f"Columns: {wide.columns}")
|
||||||
|
|
|
||||||
131
pipeline/transform/transform_geosure.py
Normal file
131
pipeline/transform/transform_geosure.py
Normal file
|
|
@ -0,0 +1,131 @@
|
||||||
|
"""Spatial-join GeoSure 5km hex grid risk data to postcode centroids.
|
||||||
|
|
||||||
|
Reads six ESRI Shapefiles (one per hazard type), converts polygons from
|
||||||
|
BNG to WGS84, and queries postcode centroids against them using an STRtree.
|
||||||
|
Outputs a postcode-level parquet with six risk columns plus an overall max.
|
||||||
|
|
||||||
|
Source: Ordnance Survey GeoSure (Open Data)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import polars as pl
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
_GEOSURE_RISKS = [
|
||||||
|
("CollapsibleDeposits", "collapsible_deposits_risk"),
|
||||||
|
("CompressibleGround", "compressible_ground_risk"),
|
||||||
|
("Landslides", "landslide_risk"),
|
||||||
|
("RunningSand", "running_sand_risk"),
|
||||||
|
("ShrinkSwell", "shrink_swell_risk"),
|
||||||
|
("SolubleRocks", "soluble_rocks_risk"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def transform(geosure_dir: Path, arcgis_path: Path) -> pl.DataFrame:
|
||||||
|
"""Spatial-join GeoSure 5km hex grid risk data to postcode centroids."""
|
||||||
|
import shapefile as shp
|
||||||
|
from pyproj import Transformer
|
||||||
|
from shapely import STRtree, points
|
||||||
|
from shapely.geometry import shape as to_shapely
|
||||||
|
|
||||||
|
to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
|
||||||
|
|
||||||
|
print("Loading postcode centroids for GeoSure join...")
|
||||||
|
postcodes_df = pl.read_parquet(arcgis_path, columns=["pcds", "lat", "long"])
|
||||||
|
lats = postcodes_df["lat"].to_numpy()
|
||||||
|
lons = postcodes_df["long"].to_numpy()
|
||||||
|
pts = points(np.column_stack([lons, lats]))
|
||||||
|
|
||||||
|
result = pl.DataFrame({"postcode": postcodes_df["pcds"]})
|
||||||
|
risk_cols = []
|
||||||
|
|
||||||
|
for risk_key, col_name in _GEOSURE_RISKS:
|
||||||
|
shp_files = list(geosure_dir.rglob(f"*_GS_{risk_key}_*.shp"))
|
||||||
|
if not shp_files:
|
||||||
|
print(f" Warning: No shapefile found for {risk_key}")
|
||||||
|
result = result.with_columns(pl.lit(None).cast(pl.UInt8).alias(col_name))
|
||||||
|
risk_cols.append(col_name)
|
||||||
|
continue
|
||||||
|
|
||||||
|
print(f" Reading {shp_files[0].name}...")
|
||||||
|
reader = shp.Reader(str(shp_files[0]))
|
||||||
|
|
||||||
|
hex_polys = []
|
||||||
|
hex_classes = []
|
||||||
|
for sr in reader.shapeRecords():
|
||||||
|
geo = sr.shape.__geo_interface__
|
||||||
|
new_coords = []
|
||||||
|
for ring in geo["coordinates"]:
|
||||||
|
xs, ys = zip(*ring)
|
||||||
|
t_lons, t_lats = to_wgs84.transform(list(xs), list(ys))
|
||||||
|
new_coords.append(list(zip(t_lons, t_lats)))
|
||||||
|
geo["coordinates"] = new_coords
|
||||||
|
hex_polys.append(to_shapely(geo))
|
||||||
|
hex_classes.append(int(sr.record["CLASS"]))
|
||||||
|
|
||||||
|
classes_arr = np.array(hex_classes, dtype=np.uint8)
|
||||||
|
|
||||||
|
print(f" Querying {len(pts)} postcodes against {len(hex_polys)} hexagons...")
|
||||||
|
tree = STRtree(hex_polys)
|
||||||
|
pt_idx, hex_idx = tree.query(pts, predicate="intersects")
|
||||||
|
|
||||||
|
risk_values = np.zeros(len(pts), dtype=np.uint8)
|
||||||
|
np.maximum.at(risk_values, pt_idx, classes_arr[hex_idx])
|
||||||
|
|
||||||
|
result = result.with_columns(pl.Series(col_name, risk_values))
|
||||||
|
risk_cols.append(col_name)
|
||||||
|
|
||||||
|
# Overall environmental risk = max across all 6
|
||||||
|
result = result.with_columns(
|
||||||
|
pl.max_horizontal(*risk_cols).alias("environmental_risk")
|
||||||
|
)
|
||||||
|
|
||||||
|
# Convert 0 → null, 1/2/3 → Low/Moderate/Significant
|
||||||
|
label_map = {1: "Low", 2: "Moderate", 3: "Significant"}
|
||||||
|
for col in risk_cols + ["environmental_risk"]:
|
||||||
|
result = result.with_columns(
|
||||||
|
pl.col(col)
|
||||||
|
.replace_strict(label_map, default=None, return_dtype=pl.Utf8)
|
||||||
|
.alias(col)
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f" GeoSure join complete: {result.height} postcodes")
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Spatial-join GeoSure ground stability data to postcode centroids"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--geosure",
|
||||||
|
type=Path,
|
||||||
|
required=True,
|
||||||
|
help="GeoSure shapefile directory",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--arcgis",
|
||||||
|
type=Path,
|
||||||
|
required=True,
|
||||||
|
help="ArcGIS postcode data parquet (for lat/lon coordinates)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output",
|
||||||
|
type=Path,
|
||||||
|
required=True,
|
||||||
|
help="Output parquet file path",
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
result = transform(args.geosure, args.arcgis)
|
||||||
|
|
||||||
|
args.output.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
result.write_parquet(args.output, compression="zstd")
|
||||||
|
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||||
|
print(f"Wrote {args.output} ({size_mb:.1f} MB)")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Loading…
Add table
Add a link
Reference in a new issue