480 lines
16 KiB
Python
480 lines
16 KiB
Python
"""Download Defra Round 4 (2022) strategic noise data for England.
|
|
|
|
Downloads modelled noise levels (road, rail, airport) as GeoTIFF rasters via
|
|
WCS, then samples the local maximum around each postcode representative point.
|
|
Outputs a parquet file with postcode-level noise in dB for each source.
|
|
|
|
Uses smaller 20km tiles at native 10m resolution so values are not understated
|
|
by a single coarse centroid sample.
|
|
|
|
Data source: Defra Strategic Noise Mapping Round 4 (2022)
|
|
- Lden = day-evening-night 24h weighted average (the EU standard metric)
|
|
- 10m grid, modelled at 4m above ground
|
|
License: Open Government Licence v3.0
|
|
|
|
Note: Road/rail use WCS 1.0.0; airport requires WCS 2.0.1 (Defra's 1.0.0
|
|
endpoint is broken for that coverage).
|
|
"""
|
|
|
|
import argparse
|
|
import math
|
|
import tempfile
|
|
import time
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from pathlib import Path
|
|
|
|
import httpx
|
|
import numpy as np
|
|
import polars as pl
|
|
import rasterio
|
|
from pyproj import Transformer
|
|
from rasterio.transform import rowcol
|
|
from scipy.ndimage import maximum_filter
|
|
|
|
from pipeline.local_temp import local_tmp_dir
|
|
|
|
# Noise sources:
|
|
# (label, column_name, WCS base URL, coverage ID, WCS version, allow_missing_tiles)
|
|
# Road/rail work with WCS 1.0.0; airport requires WCS 2.0.1 and returns 500
|
|
# for many sparse/no-coverage tiles, which should become nulls.
|
|
NOISE_SOURCES = [
|
|
(
|
|
"Road",
|
|
"road_noise_lden_db",
|
|
"https://environment.data.gov.uk/spatialdata/road-noise-all-metrics-england-round-4/wcs",
|
|
"Road_Noise_Lden_England_Round_4_All",
|
|
"1.0.0",
|
|
False,
|
|
),
|
|
(
|
|
"Rail",
|
|
"rail_noise_lden_db",
|
|
"https://environment.data.gov.uk/spatialdata/noise-data/wcs",
|
|
"Rail_Noise_Lden_England_Round_4_All",
|
|
"1.0.0",
|
|
False,
|
|
),
|
|
(
|
|
"Airport",
|
|
"airport_noise_lden_db",
|
|
"https://environment.data.gov.uk/spatialdata/airport-noise-all-metrics-england-round-4/wcs",
|
|
"dac9cba4-abe7-43bd-b8e9-8a83da52edd8__Airport_Noise_ALL_Lden",
|
|
"2.0.1",
|
|
True,
|
|
),
|
|
]
|
|
|
|
# England extent in EPSG:27700 (British National Grid), rounded outward
|
|
BNG_MIN_E = 80_000
|
|
BNG_MAX_E = 660_000
|
|
BNG_MIN_N = 0
|
|
BNG_MAX_N = 660_000
|
|
|
|
# Tile size in metres. At 10m resolution, 20km tiles are ~4M pixels each,
|
|
# small enough to process one at a time without mosaicking all England.
|
|
TILE_SIZE = 20_000
|
|
|
|
# Max concurrent tile downloads
|
|
MAX_WORKERS = 4
|
|
|
|
# Native raster resolution (10m grid)
|
|
NATIVE_RESOLUTION = 10
|
|
|
|
# Request pixel resolution in metres.
|
|
RESOLUTION = NATIVE_RESOLUTION
|
|
|
|
# The pipeline has postcode representative points rather than complete unit
|
|
# polygons here. Use a small local footprint and take the maximum 10m cell so
|
|
# postcode-level noise is not understated by centroid rounding.
|
|
POSTCODE_NOISE_RADIUS_M = 50
|
|
|
|
# Retry/split behaviour for slow Defra WCS requests. Some 100km eastern tiles
|
|
# intermittently return 504s; smaller fallback requests usually succeed.
|
|
MAX_RETRIES = 3
|
|
RETRY_BACKOFF_SECONDS = 5
|
|
MIN_TILE_SIZE = 5_000
|
|
|
|
type Tile = tuple[int, int, int, int]
|
|
|
|
|
|
class NoGeoTiffError(RuntimeError):
|
|
"""Raised when WCS returns an XML/HTML exception or another non-raster body."""
|
|
|
|
|
|
def _wcs_get_coverage_url(
|
|
wcs_base: str,
|
|
coverage_id: str,
|
|
min_e: int,
|
|
min_n: int,
|
|
max_e: int,
|
|
max_n: int,
|
|
wcs_version: str = "1.0.0",
|
|
) -> str:
|
|
"""Build a WCS GetCoverage URL for a BNG bounding box."""
|
|
if wcs_version == "2.0.1":
|
|
return (
|
|
f"{wcs_base}?"
|
|
f"service=WCS&version=2.0.1&request=GetCoverage"
|
|
f"&coverageId={coverage_id}"
|
|
f"&format=image/tiff"
|
|
f"&subsettingCRS=EPSG:27700"
|
|
f"&subset=E({min_e},{max_e})"
|
|
f"&subset=N({min_n},{max_n})"
|
|
f"&scaleFactor={NATIVE_RESOLUTION / RESOLUTION}"
|
|
)
|
|
width = (max_e - min_e) // RESOLUTION
|
|
height = (max_n - min_n) // RESOLUTION
|
|
return (
|
|
f"{wcs_base}?"
|
|
f"service=WCS&version=1.0.0&request=GetCoverage"
|
|
f"&coverage={coverage_id}"
|
|
f"&CRS=EPSG:27700"
|
|
f"&BBOX={min_e},{min_n},{max_e},{max_n}"
|
|
f"&width={width}&height={height}"
|
|
f"&format=GeoTIFF"
|
|
)
|
|
|
|
|
|
_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
|
|
|
|
|
|
def _bng_from_latlon(lat: np.ndarray, lon: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
|
|
"""Convert WGS84 lat/lon to British National Grid easting/northing."""
|
|
return _TO_BNG.transform(lon, lat) # pyproj takes (x=lon, y=lat)
|
|
|
|
|
|
def _looks_like_tiff(response: httpx.Response) -> bool:
|
|
content_type = response.headers.get("content-type", "")
|
|
return "tiff" in content_type or response.content[:4] in (b"II*\x00", b"MM\x00*")
|
|
|
|
|
|
def _fetch_tile_bytes(
|
|
wcs_base: str,
|
|
coverage_id: str,
|
|
min_e: int,
|
|
min_n: int,
|
|
max_e: int,
|
|
max_n: int,
|
|
wcs_version: str = "1.0.0",
|
|
) -> bytes:
|
|
"""Fetch one WCS tile."""
|
|
url = _wcs_get_coverage_url(
|
|
wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
|
|
)
|
|
with httpx.Client(timeout=300, follow_redirects=True) as client:
|
|
resp = client.get(url)
|
|
resp.raise_for_status()
|
|
|
|
if not _looks_like_tiff(resp):
|
|
content_type = resp.headers.get("content-type", "<missing>")
|
|
body_preview = resp.text[:200].replace("\n", " ")
|
|
raise NoGeoTiffError(
|
|
f"WCS returned non-GeoTIFF response ({content_type}): {body_preview}"
|
|
)
|
|
return resp.content
|
|
|
|
|
|
def _split_tile(min_e: int, min_n: int, max_e: int, max_n: int) -> list[Tile]:
|
|
e_edges = [min_e, max_e]
|
|
n_edges = [min_n, max_n]
|
|
if max_e - min_e > MIN_TILE_SIZE:
|
|
e_edges.insert(1, (min_e + max_e) // 2)
|
|
if max_n - min_n > MIN_TILE_SIZE:
|
|
n_edges.insert(1, (min_n + max_n) // 2)
|
|
|
|
subtiles: list[Tile] = []
|
|
for e0, e1 in zip(e_edges, e_edges[1:]):
|
|
for n0, n1 in zip(n_edges, n_edges[1:]):
|
|
if e1 > e0 and n1 > n0:
|
|
subtiles.append((e0, n0, e1, n1))
|
|
return subtiles
|
|
|
|
|
|
def _tile_path(tile_dir: Path, min_e: int, min_n: int, max_e: int, max_n: int) -> Path:
|
|
return tile_dir / f"tile_{min_e}_{min_n}_{max_e}_{max_n}.tif"
|
|
|
|
|
|
def _download_tile(
|
|
wcs_base: str,
|
|
coverage_id: str,
|
|
min_e: int,
|
|
min_n: int,
|
|
max_e: int,
|
|
max_n: int,
|
|
tile_dir: Path,
|
|
wcs_version: str = "1.0.0",
|
|
split_failures: bool = True,
|
|
) -> tuple[list[Path], list[Tile]]:
|
|
"""Download a WCS tile, splitting on repeated server failures."""
|
|
tile_path = _tile_path(tile_dir, min_e, min_n, max_e, max_n)
|
|
if tile_path.exists() and tile_path.stat().st_size > 0:
|
|
return [tile_path], []
|
|
|
|
last_error: Exception | None = None
|
|
for attempt in range(1, MAX_RETRIES + 1):
|
|
try:
|
|
content = _fetch_tile_bytes(
|
|
wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
|
|
)
|
|
tile_path.write_bytes(content)
|
|
return [tile_path], []
|
|
except (
|
|
NoGeoTiffError,
|
|
httpx.HTTPStatusError,
|
|
httpx.TimeoutException,
|
|
httpx.ConnectError,
|
|
) as e:
|
|
last_error = e
|
|
if attempt < MAX_RETRIES:
|
|
sleep_for = RETRY_BACKOFF_SECONDS * attempt
|
|
print(
|
|
f" Retrying tile ({min_e},{min_n})-({max_e},{max_n}) "
|
|
f"after {type(e).__name__} ({attempt}/{MAX_RETRIES})"
|
|
)
|
|
time.sleep(sleep_for)
|
|
|
|
subtiles = _split_tile(min_e, min_n, max_e, max_n) if split_failures else []
|
|
if len(subtiles) > 1:
|
|
print(
|
|
f" Splitting tile ({min_e},{min_n})-({max_e},{max_n}) "
|
|
f"into {len(subtiles)} smaller requests after: {last_error}"
|
|
)
|
|
paths: list[Path] = []
|
|
failures: list[Tile] = []
|
|
for e0, n0, e1, n1 in subtiles:
|
|
child_paths, child_failures = _download_tile(
|
|
wcs_base,
|
|
coverage_id,
|
|
e0,
|
|
n0,
|
|
e1,
|
|
n1,
|
|
tile_dir,
|
|
wcs_version,
|
|
split_failures,
|
|
)
|
|
paths.extend(child_paths)
|
|
failures.extend(child_failures)
|
|
return paths, failures
|
|
|
|
print(
|
|
f" Failed to download tile ({min_e},{min_n})-({max_e},{max_n}): {last_error}"
|
|
)
|
|
return [], [(min_e, min_n, max_e, max_n)]
|
|
|
|
|
|
def download_raster(
|
|
tile_dir: Path,
|
|
wcs_base: str,
|
|
coverage_id: str,
|
|
label: str,
|
|
wcs_version: str = "1.0.0",
|
|
allow_missing_tiles: bool = False,
|
|
) -> list[Path]:
|
|
"""Download noise GeoTIFF raster covering England, returning paths to saved files."""
|
|
tiles = []
|
|
for min_e in range(BNG_MIN_E, BNG_MAX_E, TILE_SIZE):
|
|
for min_n in range(BNG_MIN_N, BNG_MAX_N, TILE_SIZE):
|
|
max_e = min(min_e + TILE_SIZE, BNG_MAX_E)
|
|
max_n = min(min_n + TILE_SIZE, BNG_MAX_N)
|
|
tiles.append((min_e, min_n, max_e, max_n))
|
|
|
|
print(
|
|
f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)..."
|
|
)
|
|
paths: list[Path] = []
|
|
failures: list[Tile] = []
|
|
completed = 0
|
|
|
|
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
|
|
futures = {}
|
|
for min_e, min_n, max_e, max_n in tiles:
|
|
fut = executor.submit(
|
|
_download_tile,
|
|
wcs_base,
|
|
coverage_id,
|
|
min_e,
|
|
min_n,
|
|
max_e,
|
|
max_n,
|
|
tile_dir,
|
|
wcs_version,
|
|
not allow_missing_tiles,
|
|
)
|
|
futures[fut] = (min_e, min_n)
|
|
|
|
for fut in as_completed(futures):
|
|
completed += 1
|
|
result_paths, result_failures = fut.result()
|
|
paths.extend(result_paths)
|
|
failures.extend(result_failures)
|
|
print(
|
|
f"\r [{completed}/{len(tiles)}] Downloaded {len(paths)} GeoTIFFs",
|
|
end="",
|
|
flush=True,
|
|
)
|
|
|
|
if failures:
|
|
preview = ", ".join(
|
|
f"({e0},{n0})-({e1},{n1})" for e0, n0, e1, n1 in failures[:5]
|
|
)
|
|
suffix = "..." if len(failures) > 5 else ""
|
|
if allow_missing_tiles:
|
|
print(
|
|
f"\n[{label}] WARNING: skipped {len(failures)} missing/no-data "
|
|
f"tile requests: {preview}{suffix}"
|
|
)
|
|
print(f"[{label}] Downloaded {len(paths)} GeoTIFFs from {len(tiles)} tiles")
|
|
return paths
|
|
raise RuntimeError(
|
|
f"[{label}] Failed to download {len(failures)} tile requests after "
|
|
f"retries and splitting: {preview}{suffix}"
|
|
)
|
|
|
|
print(f"\n[{label}] Downloaded {len(paths)} GeoTIFFs from {len(tiles)} tiles")
|
|
return paths
|
|
|
|
|
|
def sample_noise_at_postcodes(
|
|
tile_paths: list[Path],
|
|
easting: np.ndarray,
|
|
northing: np.ndarray,
|
|
label: str,
|
|
col_name: str,
|
|
) -> pl.Series:
|
|
"""Sample max noise values from 10m tiles around postcode representative points."""
|
|
print(f"[{label}] Sampling max noise values from {len(tile_paths)} tiles...")
|
|
noise_db = np.full(len(easting), np.nan, dtype=np.float32)
|
|
radius_cells = max(0, math.ceil(POSTCODE_NOISE_RADIUS_M / RESOLUTION))
|
|
filter_size = radius_cells * 2 + 1
|
|
|
|
for path in tile_paths:
|
|
with rasterio.open(path) as dataset:
|
|
bounds = dataset.bounds
|
|
candidate_mask = (
|
|
(easting >= bounds.left - POSTCODE_NOISE_RADIUS_M)
|
|
& (easting <= bounds.right + POSTCODE_NOISE_RADIUS_M)
|
|
& (northing >= bounds.bottom - POSTCODE_NOISE_RADIUS_M)
|
|
& (northing <= bounds.top + POSTCODE_NOISE_RADIUS_M)
|
|
)
|
|
candidate_indices = np.flatnonzero(candidate_mask)
|
|
if len(candidate_indices) == 0:
|
|
continue
|
|
|
|
grid = dataset.read(1).astype(np.float32, copy=False)
|
|
invalid = ~np.isfinite(grid) | (grid == 0)
|
|
if dataset.nodata is not None:
|
|
invalid |= np.isclose(
|
|
grid, np.float32(dataset.nodata), rtol=1e-5, atol=1e-5
|
|
)
|
|
grid = grid.copy()
|
|
grid[invalid] = -np.inf
|
|
if filter_size > 1:
|
|
grid = maximum_filter(
|
|
grid, size=filter_size, mode="constant", cval=-np.inf
|
|
)
|
|
|
|
rows, cols = rowcol(
|
|
dataset.transform,
|
|
easting[candidate_indices],
|
|
northing[candidate_indices],
|
|
)
|
|
rows = np.asarray(rows)
|
|
cols = np.asarray(cols)
|
|
h, w = grid.shape
|
|
in_bounds = (rows >= 0) & (rows < h) & (cols >= 0) & (cols < w)
|
|
if not np.any(in_bounds):
|
|
continue
|
|
|
|
sampled_indices = candidate_indices[in_bounds]
|
|
sampled = grid[rows[in_bounds], cols[in_bounds]]
|
|
valid = sampled != -np.inf
|
|
if not np.any(valid):
|
|
continue
|
|
|
|
sampled_indices = sampled_indices[valid]
|
|
sampled = sampled[valid]
|
|
existing = noise_db[sampled_indices]
|
|
noise_db[sampled_indices] = np.where(
|
|
np.isnan(existing), sampled, np.maximum(existing, sampled)
|
|
)
|
|
|
|
valid_count = int(np.sum(~np.isnan(noise_db)))
|
|
print(
|
|
f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data"
|
|
)
|
|
|
|
# Return as masked Series: use null (not NaN) so that Polars max_horizontal
|
|
# correctly ignores missing values instead of propagating NaN.
|
|
return pl.Series(col_name, noise_db).fill_nan(None)
|
|
|
|
|
|
def main() -> None:
|
|
parser = argparse.ArgumentParser(
|
|
description="Download Defra noise data (road, rail, airport) and sample at postcode centroids"
|
|
)
|
|
parser.add_argument(
|
|
"--arcgis",
|
|
type=Path,
|
|
required=True,
|
|
help="ArcGIS postcode data parquet (for lat/lon coordinates)",
|
|
)
|
|
parser.add_argument(
|
|
"--output", type=Path, required=True, help="Output parquet file path"
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
args.output.parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
print("Loading postcode coordinates...")
|
|
postcodes = pl.read_parquet(args.arcgis, columns=["pcds", "lat", "long"]).rename(
|
|
{"pcds": "postcode", "long": "lon"}
|
|
)
|
|
|
|
lat = postcodes["lat"].to_numpy()
|
|
lon = postcodes["lon"].to_numpy()
|
|
|
|
print("Converting lat/lon to BNG...")
|
|
easting, northing = _bng_from_latlon(lat, lon)
|
|
|
|
result = postcodes.select("postcode")
|
|
|
|
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
|
|
for (
|
|
label,
|
|
col_name,
|
|
wcs_base,
|
|
coverage_id,
|
|
wcs_version,
|
|
allow_missing_tiles,
|
|
) in NOISE_SOURCES:
|
|
tile_dir = Path(tmp) / label.lower()
|
|
tile_dir.mkdir()
|
|
tile_paths = download_raster(
|
|
tile_dir,
|
|
wcs_base,
|
|
coverage_id,
|
|
label,
|
|
wcs_version,
|
|
allow_missing_tiles,
|
|
)
|
|
|
|
if not tile_paths:
|
|
print(
|
|
f"[{label}] WARNING: No tiles downloaded — column will be all null"
|
|
)
|
|
series = pl.Series(col_name, [None] * len(lat), dtype=pl.Float32)
|
|
else:
|
|
series = sample_noise_at_postcodes(
|
|
tile_paths, easting, northing, label, col_name
|
|
)
|
|
|
|
result = result.with_columns(series)
|
|
|
|
result.write_parquet(args.output, compression="zstd")
|
|
size_mb = args.output.stat().st_size / (1024 * 1024)
|
|
print(f"Wrote {args.output} ({size_mb:.1f} MB)")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|