"""Download Defra Round 4 (2022) strategic noise data for England. Downloads modelled noise levels (road, rail, airport) as GeoTIFF rasters via WCS, then samples the local maximum around each postcode representative point. Outputs a parquet file with postcode-level noise in dB for each source. Uses smaller 20km tiles at native 10m resolution so values are not understated by a single coarse centroid sample. Data source: Defra Strategic Noise Mapping Round 4 (2022) - Lden = day-evening-night 24h weighted average (the EU standard metric) - 10m grid, modelled at 4m above ground License: Open Government Licence v3.0 Note: Road/rail use WCS 1.0.0; airport requires WCS 2.0.1 (Defra's 1.0.0 endpoint is broken for that coverage). """ import argparse import math import tempfile import time from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path import httpx import numpy as np import polars as pl import rasterio from pyproj import Transformer from rasterio.transform import rowcol from scipy.ndimage import maximum_filter # Noise sources: # (label, column_name, WCS base URL, coverage ID, WCS version, allow_missing_tiles) # Road/rail work with WCS 1.0.0; airport requires WCS 2.0.1 and returns 500 # for many sparse/no-coverage tiles, which should become nulls. NOISE_SOURCES = [ ( "Road", "road_noise_lden_db", "https://environment.data.gov.uk/spatialdata/road-noise-all-metrics-england-round-4/wcs", "Road_Noise_Lden_England_Round_4_All", "1.0.0", False, ), ( "Rail", "rail_noise_lden_db", "https://environment.data.gov.uk/spatialdata/noise-data/wcs", "Rail_Noise_Lden_England_Round_4_All", "1.0.0", False, ), ( "Airport", "airport_noise_lden_db", "https://environment.data.gov.uk/spatialdata/airport-noise-all-metrics-england-round-4/wcs", "dac9cba4-abe7-43bd-b8e9-8a83da52edd8__Airport_Noise_ALL_Lden", "2.0.1", True, ), ] # England extent in EPSG:27700 (British National Grid), rounded outward BNG_MIN_E = 80_000 BNG_MAX_E = 660_000 BNG_MIN_N = 0 BNG_MAX_N = 660_000 # Tile size in metres. At 10m resolution, 20km tiles are ~4M pixels each, # small enough to process one at a time without mosaicking all England. TILE_SIZE = 20_000 # Max concurrent tile downloads MAX_WORKERS = 4 # Native raster resolution (10m grid) NATIVE_RESOLUTION = 10 # Request pixel resolution in metres. RESOLUTION = NATIVE_RESOLUTION # The pipeline has postcode representative points rather than complete unit # polygons here. Use a small local footprint and take the maximum 10m cell so # postcode-level noise is not understated by centroid rounding. POSTCODE_NOISE_RADIUS_M = 50 # Retry/split behaviour for slow Defra WCS requests. Some 100km eastern tiles # intermittently return 504s; smaller fallback requests usually succeed. MAX_RETRIES = 3 RETRY_BACKOFF_SECONDS = 5 MIN_TILE_SIZE = 5_000 type Tile = tuple[int, int, int, int] class NoGeoTiffError(RuntimeError): """Raised when WCS returns an XML/HTML exception or another non-raster body.""" def _wcs_get_coverage_url( wcs_base: str, coverage_id: str, min_e: int, min_n: int, max_e: int, max_n: int, wcs_version: str = "1.0.0", ) -> str: """Build a WCS GetCoverage URL for a BNG bounding box.""" if wcs_version == "2.0.1": return ( f"{wcs_base}?" f"service=WCS&version=2.0.1&request=GetCoverage" f"&coverageId={coverage_id}" f"&format=image/tiff" f"&subsettingCRS=EPSG:27700" f"&subset=E({min_e},{max_e})" f"&subset=N({min_n},{max_n})" f"&scaleFactor={NATIVE_RESOLUTION / RESOLUTION}" ) width = (max_e - min_e) // RESOLUTION height = (max_n - min_n) // RESOLUTION return ( f"{wcs_base}?" f"service=WCS&version=1.0.0&request=GetCoverage" f"&coverage={coverage_id}" f"&CRS=EPSG:27700" f"&BBOX={min_e},{min_n},{max_e},{max_n}" f"&width={width}&height={height}" f"&format=GeoTIFF" ) _TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True) def _bng_from_latlon(lat: np.ndarray, lon: np.ndarray) -> tuple[np.ndarray, np.ndarray]: """Convert WGS84 lat/lon to British National Grid easting/northing.""" return _TO_BNG.transform(lon, lat) # pyproj takes (x=lon, y=lat) def _looks_like_tiff(response: httpx.Response) -> bool: content_type = response.headers.get("content-type", "") return "tiff" in content_type or response.content[:4] in (b"II*\x00", b"MM\x00*") def _fetch_tile_bytes( wcs_base: str, coverage_id: str, min_e: int, min_n: int, max_e: int, max_n: int, wcs_version: str = "1.0.0", ) -> bytes: """Fetch one WCS tile.""" url = _wcs_get_coverage_url( wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version ) with httpx.Client(timeout=300, follow_redirects=True) as client: resp = client.get(url) resp.raise_for_status() if not _looks_like_tiff(resp): content_type = resp.headers.get("content-type", "") body_preview = resp.text[:200].replace("\n", " ") raise NoGeoTiffError( f"WCS returned non-GeoTIFF response ({content_type}): {body_preview}" ) return resp.content def _split_tile(min_e: int, min_n: int, max_e: int, max_n: int) -> list[Tile]: e_edges = [min_e, max_e] n_edges = [min_n, max_n] if max_e - min_e > MIN_TILE_SIZE: e_edges.insert(1, (min_e + max_e) // 2) if max_n - min_n > MIN_TILE_SIZE: n_edges.insert(1, (min_n + max_n) // 2) subtiles: list[Tile] = [] for e0, e1 in zip(e_edges, e_edges[1:]): for n0, n1 in zip(n_edges, n_edges[1:]): if e1 > e0 and n1 > n0: subtiles.append((e0, n0, e1, n1)) return subtiles def _tile_path(tile_dir: Path, min_e: int, min_n: int, max_e: int, max_n: int) -> Path: return tile_dir / f"tile_{min_e}_{min_n}_{max_e}_{max_n}.tif" def _download_tile( wcs_base: str, coverage_id: str, min_e: int, min_n: int, max_e: int, max_n: int, tile_dir: Path, wcs_version: str = "1.0.0", split_failures: bool = True, ) -> tuple[list[Path], list[Tile]]: """Download a WCS tile, splitting on repeated server failures.""" tile_path = _tile_path(tile_dir, min_e, min_n, max_e, max_n) if tile_path.exists() and tile_path.stat().st_size > 0: return [tile_path], [] last_error: Exception | None = None for attempt in range(1, MAX_RETRIES + 1): try: content = _fetch_tile_bytes( wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version ) tile_path.write_bytes(content) return [tile_path], [] except ( NoGeoTiffError, httpx.HTTPStatusError, httpx.TimeoutException, httpx.ConnectError, ) as e: last_error = e if attempt < MAX_RETRIES: sleep_for = RETRY_BACKOFF_SECONDS * attempt print( f" Retrying tile ({min_e},{min_n})-({max_e},{max_n}) " f"after {type(e).__name__} ({attempt}/{MAX_RETRIES})" ) time.sleep(sleep_for) subtiles = _split_tile(min_e, min_n, max_e, max_n) if split_failures else [] if len(subtiles) > 1: print( f" Splitting tile ({min_e},{min_n})-({max_e},{max_n}) " f"into {len(subtiles)} smaller requests after: {last_error}" ) paths: list[Path] = [] failures: list[Tile] = [] for e0, n0, e1, n1 in subtiles: child_paths, child_failures = _download_tile( wcs_base, coverage_id, e0, n0, e1, n1, tile_dir, wcs_version, split_failures, ) paths.extend(child_paths) failures.extend(child_failures) return paths, failures print( f" Failed to download tile ({min_e},{min_n})-({max_e},{max_n}): {last_error}" ) return [], [(min_e, min_n, max_e, max_n)] def download_raster( tile_dir: Path, wcs_base: str, coverage_id: str, label: str, wcs_version: str = "1.0.0", allow_missing_tiles: bool = False, ) -> list[Path]: """Download noise GeoTIFF raster covering England, returning paths to saved files.""" tiles = [] for min_e in range(BNG_MIN_E, BNG_MAX_E, TILE_SIZE): for min_n in range(BNG_MIN_N, BNG_MAX_N, TILE_SIZE): max_e = min(min_e + TILE_SIZE, BNG_MAX_E) max_n = min(min_n + TILE_SIZE, BNG_MAX_N) tiles.append((min_e, min_n, max_e, max_n)) print( f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)..." ) paths: list[Path] = [] failures: list[Tile] = [] completed = 0 with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: futures = {} for min_e, min_n, max_e, max_n in tiles: fut = executor.submit( _download_tile, wcs_base, coverage_id, min_e, min_n, max_e, max_n, tile_dir, wcs_version, not allow_missing_tiles, ) futures[fut] = (min_e, min_n) for fut in as_completed(futures): completed += 1 result_paths, result_failures = fut.result() paths.extend(result_paths) failures.extend(result_failures) print( f"\r [{completed}/{len(tiles)}] Downloaded {len(paths)} GeoTIFFs", end="", flush=True, ) if failures: preview = ", ".join( f"({e0},{n0})-({e1},{n1})" for e0, n0, e1, n1 in failures[:5] ) suffix = "..." if len(failures) > 5 else "" if allow_missing_tiles: print( f"\n[{label}] WARNING: skipped {len(failures)} missing/no-data " f"tile requests: {preview}{suffix}" ) print(f"[{label}] Downloaded {len(paths)} GeoTIFFs from {len(tiles)} tiles") return paths raise RuntimeError( f"[{label}] Failed to download {len(failures)} tile requests after " f"retries and splitting: {preview}{suffix}" ) print(f"\n[{label}] Downloaded {len(paths)} GeoTIFFs from {len(tiles)} tiles") return paths def sample_noise_at_postcodes( tile_paths: list[Path], easting: np.ndarray, northing: np.ndarray, label: str, col_name: str, ) -> pl.Series: """Sample max noise values from 10m tiles around postcode representative points.""" print(f"[{label}] Sampling max noise values from {len(tile_paths)} tiles...") noise_db = np.full(len(easting), np.nan, dtype=np.float32) radius_cells = max(0, math.ceil(POSTCODE_NOISE_RADIUS_M / RESOLUTION)) filter_size = radius_cells * 2 + 1 for path in tile_paths: with rasterio.open(path) as dataset: bounds = dataset.bounds candidate_mask = ( (easting >= bounds.left - POSTCODE_NOISE_RADIUS_M) & (easting <= bounds.right + POSTCODE_NOISE_RADIUS_M) & (northing >= bounds.bottom - POSTCODE_NOISE_RADIUS_M) & (northing <= bounds.top + POSTCODE_NOISE_RADIUS_M) ) candidate_indices = np.flatnonzero(candidate_mask) if len(candidate_indices) == 0: continue grid = dataset.read(1).astype(np.float32, copy=False) invalid = ~np.isfinite(grid) | (grid == 0) if dataset.nodata is not None: invalid |= np.isclose( grid, np.float32(dataset.nodata), rtol=1e-5, atol=1e-5 ) grid = grid.copy() grid[invalid] = -np.inf if filter_size > 1: grid = maximum_filter( grid, size=filter_size, mode="constant", cval=-np.inf ) rows, cols = rowcol( dataset.transform, easting[candidate_indices], northing[candidate_indices], ) rows = np.asarray(rows) cols = np.asarray(cols) h, w = grid.shape in_bounds = (rows >= 0) & (rows < h) & (cols >= 0) & (cols < w) if not np.any(in_bounds): continue sampled_indices = candidate_indices[in_bounds] sampled = grid[rows[in_bounds], cols[in_bounds]] valid = sampled != -np.inf if not np.any(valid): continue sampled_indices = sampled_indices[valid] sampled = sampled[valid] existing = noise_db[sampled_indices] noise_db[sampled_indices] = np.where( np.isnan(existing), sampled, np.maximum(existing, sampled) ) valid_count = int(np.sum(~np.isnan(noise_db))) print( f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data" ) # Return as masked Series: use null (not NaN) so that Polars max_horizontal # correctly ignores missing values instead of propagating NaN. return pl.Series(col_name, noise_db).fill_nan(None) def main() -> None: parser = argparse.ArgumentParser( description="Download Defra noise data (road, rail, airport) and sample at postcode centroids" ) parser.add_argument( "--arcgis", type=Path, required=True, help="ArcGIS postcode data parquet (for lat/lon coordinates)", ) parser.add_argument( "--output", type=Path, required=True, help="Output parquet file path" ) args = parser.parse_args() args.output.parent.mkdir(parents=True, exist_ok=True) print("Loading postcode coordinates...") postcodes = pl.read_parquet(args.arcgis, columns=["pcds", "lat", "long"]).rename( {"pcds": "postcode", "long": "lon"} ) lat = postcodes["lat"].to_numpy() lon = postcodes["lon"].to_numpy() print("Converting lat/lon to BNG...") easting, northing = _bng_from_latlon(lat, lon) result = postcodes.select("postcode") with tempfile.TemporaryDirectory() as tmp: for ( label, col_name, wcs_base, coverage_id, wcs_version, allow_missing_tiles, ) in NOISE_SOURCES: tile_dir = Path(tmp) / label.lower() tile_dir.mkdir() tile_paths = download_raster( tile_dir, wcs_base, coverage_id, label, wcs_version, allow_missing_tiles, ) if not tile_paths: print( f"[{label}] WARNING: No tiles downloaded — column will be all null" ) series = pl.Series(col_name, [None] * len(lat), dtype=pl.Float32) else: series = sample_noise_at_postcodes( tile_paths, easting, northing, label, col_name ) result = result.with_columns(series) result.write_parquet(args.output, compression="zstd") size_mb = args.output.stat().st_size / (1024 * 1024) print(f"Wrote {args.output} ({size_mb:.1f} MB)") if __name__ == "__main__": main()