"""Download Defra Round 4 (2022) strategic noise data for England. Downloads modelled noise levels (road, rail, airport) as GeoTIFF rasters via WCS, then samples noise values at postcode centroids. Outputs a parquet file with postcode-level noise in dB for each source. Uses 100km tiles (~42 per source) to balance request size vs count. The server times out on tiles larger than ~150km at 100m resolution. Data source: Defra Strategic Noise Mapping Round 4 (2022) - Lden = day-evening-night 24h weighted average (the EU standard metric) - 10m grid, modelled at 4m above ground License: Open Government Licence v3.0 Note: Road/rail use WCS 1.0.0; airport requires WCS 2.0.1 (Defra's 1.0.0 endpoint is broken for that coverage). """ import argparse import tempfile import time from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path import httpx import numpy as np import polars as pl import rasterio from pyproj import Transformer from rasterio.merge import merge from rasterio.transform import rowcol # Noise sources: # (label, column_name, WCS base URL, coverage ID, WCS version, allow_missing_tiles) # Road/rail work with WCS 1.0.0; airport requires WCS 2.0.1 and returns 500 # for many sparse/no-coverage tiles, which should become nulls. NOISE_SOURCES = [ ( "Road", "road_noise_lden_db", "https://environment.data.gov.uk/spatialdata/road-noise-all-metrics-england-round-4/wcs", "Road_Noise_Lden_England_Round_4_All", "1.0.0", False, ), ( "Rail", "rail_noise_lden_db", "https://environment.data.gov.uk/spatialdata/noise-data/wcs", "Rail_Noise_Lden_England_Round_4_All", "1.0.0", False, ), ( "Airport", "airport_noise_lden_db", "https://environment.data.gov.uk/spatialdata/airport-noise-all-metrics-england-round-4/wcs", "dac9cba4-abe7-43bd-b8e9-8a83da52edd8__Airport_Noise_ALL_Lden", "2.0.1", True, ), ] # England extent in EPSG:27700 (British National Grid), rounded outward BNG_MIN_E = 80_000 BNG_MAX_E = 660_000 BNG_MIN_N = 0 BNG_MAX_N = 660_000 # Tile size in metres (100km balances request size vs count; 300km causes 504s) TILE_SIZE = 100_000 # Max concurrent tile downloads MAX_WORKERS = 4 # Native raster resolution (10m grid) NATIVE_RESOLUTION = 10 # Request pixel resolution in metres (100m is sufficient for postcode-level data # and keeps download size ~100x smaller than native 10m) RESOLUTION = 100 # Retry/split behaviour for slow Defra WCS requests. Some 100km eastern tiles # intermittently return 504s; smaller fallback requests usually succeed. MAX_RETRIES = 3 RETRY_BACKOFF_SECONDS = 5 MIN_TILE_SIZE = 25_000 type Tile = tuple[int, int, int, int] def _wcs_get_coverage_url( wcs_base: str, coverage_id: str, min_e: int, min_n: int, max_e: int, max_n: int, wcs_version: str = "1.0.0", ) -> str: """Build a WCS GetCoverage URL for a BNG bounding box.""" if wcs_version == "2.0.1": return ( f"{wcs_base}?" f"service=WCS&version=2.0.1&request=GetCoverage" f"&coverageId={coverage_id}" f"&format=image/tiff" f"&subsettingCRS=EPSG:27700" f"&subset=E({min_e},{max_e})" f"&subset=N({min_n},{max_n})" f"&scaleFactor={NATIVE_RESOLUTION / RESOLUTION}" ) width = (max_e - min_e) // RESOLUTION height = (max_n - min_n) // RESOLUTION return ( f"{wcs_base}?" f"service=WCS&version=1.0.0&request=GetCoverage" f"&coverage={coverage_id}" f"&CRS=EPSG:27700" f"&BBOX={min_e},{min_n},{max_e},{max_n}" f"&width={width}&height={height}" f"&format=GeoTIFF" ) _TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True) def _bng_from_latlon(lat: np.ndarray, lon: np.ndarray) -> tuple[np.ndarray, np.ndarray]: """Convert WGS84 lat/lon to British National Grid easting/northing.""" return _TO_BNG.transform(lon, lat) # pyproj takes (x=lon, y=lat) def _looks_like_tiff(response: httpx.Response) -> bool: content_type = response.headers.get("content-type", "") return "tiff" in content_type or response.content[:4] in (b"II*\x00", b"MM\x00*") def _fetch_tile_bytes( wcs_base: str, coverage_id: str, min_e: int, min_n: int, max_e: int, max_n: int, wcs_version: str = "1.0.0", ) -> bytes | None: """Fetch one WCS tile. Returns None when the server reports no GeoTIFF.""" url = _wcs_get_coverage_url( wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version ) with httpx.Client(timeout=300, follow_redirects=True) as client: resp = client.get(url) resp.raise_for_status() if not _looks_like_tiff(resp): return None return resp.content def _split_tile(min_e: int, min_n: int, max_e: int, max_n: int) -> list[Tile]: e_edges = [min_e, max_e] n_edges = [min_n, max_n] if max_e - min_e > MIN_TILE_SIZE: e_edges.insert(1, (min_e + max_e) // 2) if max_n - min_n > MIN_TILE_SIZE: n_edges.insert(1, (min_n + max_n) // 2) subtiles: list[Tile] = [] for e0, e1 in zip(e_edges, e_edges[1:]): for n0, n1 in zip(n_edges, n_edges[1:]): if e1 > e0 and n1 > n0: subtiles.append((e0, n0, e1, n1)) return subtiles def _tile_path(tile_dir: Path, min_e: int, min_n: int, max_e: int, max_n: int) -> Path: return tile_dir / f"tile_{min_e}_{min_n}_{max_e}_{max_n}.tif" def _download_tile( wcs_base: str, coverage_id: str, min_e: int, min_n: int, max_e: int, max_n: int, tile_dir: Path, wcs_version: str = "1.0.0", split_failures: bool = True, ) -> tuple[list[Path], list[Tile]]: """Download a WCS tile, splitting on repeated server failures.""" tile_path = _tile_path(tile_dir, min_e, min_n, max_e, max_n) if tile_path.exists() and tile_path.stat().st_size > 0: return [tile_path], [] last_error: Exception | None = None for attempt in range(1, MAX_RETRIES + 1): try: content = _fetch_tile_bytes( wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version ) if content is None: return [], [] tile_path.write_bytes(content) return [tile_path], [] except (httpx.HTTPStatusError, httpx.TimeoutException, httpx.ConnectError) as e: last_error = e if attempt < MAX_RETRIES: sleep_for = RETRY_BACKOFF_SECONDS * attempt print( f" Retrying tile ({min_e},{min_n})-({max_e},{max_n}) " f"after {type(e).__name__} ({attempt}/{MAX_RETRIES})" ) time.sleep(sleep_for) subtiles = _split_tile(min_e, min_n, max_e, max_n) if split_failures else [] if len(subtiles) > 1: print( f" Splitting tile ({min_e},{min_n})-({max_e},{max_n}) " f"into {len(subtiles)} smaller requests after: {last_error}" ) paths: list[Path] = [] failures: list[Tile] = [] for e0, n0, e1, n1 in subtiles: child_paths, child_failures = _download_tile( wcs_base, coverage_id, e0, n0, e1, n1, tile_dir, wcs_version, split_failures, ) paths.extend(child_paths) failures.extend(child_failures) return paths, failures print( f" Failed to download tile ({min_e},{min_n})-({max_e},{max_n}): {last_error}" ) return [], [(min_e, min_n, max_e, max_n)] def download_raster( tile_dir: Path, wcs_base: str, coverage_id: str, label: str, wcs_version: str = "1.0.0", allow_missing_tiles: bool = False, ) -> list[Path]: """Download noise GeoTIFF raster covering England, returning paths to saved files.""" tiles = [] for min_e in range(BNG_MIN_E, BNG_MAX_E, TILE_SIZE): for min_n in range(BNG_MIN_N, BNG_MAX_N, TILE_SIZE): max_e = min(min_e + TILE_SIZE, BNG_MAX_E) max_n = min(min_n + TILE_SIZE, BNG_MAX_N) tiles.append((min_e, min_n, max_e, max_n)) print( f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)..." ) paths: list[Path] = [] failures: list[Tile] = [] completed = 0 with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: futures = {} for min_e, min_n, max_e, max_n in tiles: fut = executor.submit( _download_tile, wcs_base, coverage_id, min_e, min_n, max_e, max_n, tile_dir, wcs_version, not allow_missing_tiles, ) futures[fut] = (min_e, min_n) for fut in as_completed(futures): completed += 1 result_paths, result_failures = fut.result() paths.extend(result_paths) failures.extend(result_failures) print( f"\r [{completed}/{len(tiles)}] Downloaded {len(paths)} GeoTIFFs", end="", flush=True, ) if failures: preview = ", ".join( f"({e0},{n0})-({e1},{n1})" for e0, n0, e1, n1 in failures[:5] ) suffix = "..." if len(failures) > 5 else "" if allow_missing_tiles: print( f"\n[{label}] WARNING: skipped {len(failures)} missing/no-data " f"tile requests: {preview}{suffix}" ) print(f"[{label}] Downloaded {len(paths)} GeoTIFFs from {len(tiles)} tiles") return paths raise RuntimeError( f"[{label}] Failed to download {len(failures)} tile requests after " f"retries and splitting: {preview}{suffix}" ) print(f"\n[{label}] Downloaded {len(paths)} GeoTIFFs from {len(tiles)} tiles") return paths def sample_noise_at_postcodes( tile_paths: list[Path], easting: np.ndarray, northing: np.ndarray, label: str, col_name: str, ) -> pl.Series: """Sample noise values from merged tiles at given BNG coordinates.""" print(f"[{label}] Merging {len(tile_paths)} tiles...") datasets = [rasterio.open(p) for p in tile_paths] raster_nodata = datasets[0].nodata mosaic, mosaic_transform = merge(datasets) for ds in datasets: ds.close() noise_grid = mosaic[0] print(f"[{label}] Sampling noise values at postcode centroids...") rows, cols = rowcol(mosaic_transform, easting, northing) rows = np.asarray(rows) cols = np.asarray(cols) h, w = noise_grid.shape in_bounds = (rows >= 0) & (rows < h) & (cols >= 0) & (cols < w) noise_db = np.full(len(easting), np.nan, dtype=np.float32) valid_rows = rows[in_bounds] valid_cols = cols[in_bounds] sampled = noise_grid[valid_rows, valid_cols].astype(np.float32) # Mark nodata and zero (unmapped areas) as NaN. # Road/rail use nodata=-96, airport uses nodata=3.4e38. if raster_nodata is not None: sampled[np.isclose(sampled, np.float32(raster_nodata), rtol=1e-5)] = np.nan sampled[sampled == 0] = np.nan noise_db[in_bounds] = sampled valid_count = int(np.sum(~np.isnan(noise_db))) print( f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data" ) # Return as masked Series: use null (not NaN) so that Polars max_horizontal # correctly ignores missing values instead of propagating NaN. return pl.Series(col_name, noise_db).fill_nan(None) def main() -> None: parser = argparse.ArgumentParser( description="Download Defra noise data (road, rail, airport) and sample at postcode centroids" ) parser.add_argument( "--arcgis", type=Path, required=True, help="ArcGIS postcode data parquet (for lat/lon coordinates)", ) parser.add_argument( "--output", type=Path, required=True, help="Output parquet file path" ) args = parser.parse_args() args.output.parent.mkdir(parents=True, exist_ok=True) print("Loading postcode coordinates...") postcodes = pl.read_parquet(args.arcgis, columns=["pcds", "lat", "long"]).rename( {"pcds": "postcode", "long": "lon"} ) lat = postcodes["lat"].to_numpy() lon = postcodes["lon"].to_numpy() print("Converting lat/lon to BNG...") easting, northing = _bng_from_latlon(lat, lon) result = postcodes.select("postcode") with tempfile.TemporaryDirectory() as tmp: for ( label, col_name, wcs_base, coverage_id, wcs_version, allow_missing_tiles, ) in NOISE_SOURCES: tile_dir = Path(tmp) / label.lower() tile_dir.mkdir() tile_paths = download_raster( tile_dir, wcs_base, coverage_id, label, wcs_version, allow_missing_tiles, ) if not tile_paths: print( f"[{label}] WARNING: No tiles downloaded — column will be all null" ) series = pl.Series(col_name, [None] * len(lat), dtype=pl.Float32) else: series = sample_noise_at_postcodes( tile_paths, easting, northing, label, col_name ) result = result.with_columns(series) result.write_parquet(args.output, compression="zstd") size_mb = args.output.stat().st_size / (1024 * 1024) print(f"Wrote {args.output} ({size_mb:.1f} MB)") if __name__ == "__main__": main()