perfect-postcode/pipeline/download/noise.py
Andras Schmelczer 6cc7288126
Some checks failed
CI / Check (push) Has been cancelled
Build and publish Docker image / build-and-push (push) Has been cancelled
All good
2026-05-18 21:20:10 +01:00

480 lines
16 KiB
Python

"""Download Defra Round 4 (2022) strategic noise data for England.
Downloads modelled noise levels (road, rail, airport) as GeoTIFF rasters via
WCS, then samples the local maximum around each postcode representative point.
Outputs a parquet file with postcode-level noise in dB for each source.
Uses smaller 20km tiles at native 10m resolution so values are not understated
by a single coarse centroid sample.
Data source: Defra Strategic Noise Mapping Round 4 (2022)
- Lden = day-evening-night 24h weighted average (the EU standard metric)
- 10m grid, modelled at 4m above ground
License: Open Government Licence v3.0
Note: Road/rail use WCS 1.0.0; airport requires WCS 2.0.1 (Defra's 1.0.0
endpoint is broken for that coverage).
"""
import argparse
import math
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import httpx
import numpy as np
import polars as pl
import rasterio
from pyproj import Transformer
from rasterio.transform import rowcol
from scipy.ndimage import maximum_filter
from pipeline.local_temp import local_tmp_dir
# Noise sources:
# (label, column_name, WCS base URL, coverage ID, WCS version, allow_missing_tiles)
# Road/rail work with WCS 1.0.0; airport requires WCS 2.0.1 and returns 500
# for many sparse/no-coverage tiles, which should become nulls.
NOISE_SOURCES = [
(
"Road",
"road_noise_lden_db",
"https://environment.data.gov.uk/spatialdata/road-noise-all-metrics-england-round-4/wcs",
"Road_Noise_Lden_England_Round_4_All",
"1.0.0",
False,
),
(
"Rail",
"rail_noise_lden_db",
"https://environment.data.gov.uk/spatialdata/noise-data/wcs",
"Rail_Noise_Lden_England_Round_4_All",
"1.0.0",
False,
),
(
"Airport",
"airport_noise_lden_db",
"https://environment.data.gov.uk/spatialdata/airport-noise-all-metrics-england-round-4/wcs",
"dac9cba4-abe7-43bd-b8e9-8a83da52edd8__Airport_Noise_ALL_Lden",
"2.0.1",
True,
),
]
# England extent in EPSG:27700 (British National Grid), rounded outward
BNG_MIN_E = 80_000
BNG_MAX_E = 660_000
BNG_MIN_N = 0
BNG_MAX_N = 660_000
# Tile size in metres. At 10m resolution, 20km tiles are ~4M pixels each,
# small enough to process one at a time without mosaicking all England.
TILE_SIZE = 20_000
# Max concurrent tile downloads
MAX_WORKERS = 4
# Native raster resolution (10m grid)
NATIVE_RESOLUTION = 10
# Request pixel resolution in metres.
RESOLUTION = NATIVE_RESOLUTION
# The pipeline has postcode representative points rather than complete unit
# polygons here. Use a small local footprint and take the maximum 10m cell so
# postcode-level noise is not understated by centroid rounding.
POSTCODE_NOISE_RADIUS_M = 50
# Retry/split behaviour for slow Defra WCS requests. Some 100km eastern tiles
# intermittently return 504s; smaller fallback requests usually succeed.
MAX_RETRIES = 3
RETRY_BACKOFF_SECONDS = 5
MIN_TILE_SIZE = 5_000
type Tile = tuple[int, int, int, int]
class NoGeoTiffError(RuntimeError):
"""Raised when WCS returns an XML/HTML exception or another non-raster body."""
def _wcs_get_coverage_url(
wcs_base: str,
coverage_id: str,
min_e: int,
min_n: int,
max_e: int,
max_n: int,
wcs_version: str = "1.0.0",
) -> str:
"""Build a WCS GetCoverage URL for a BNG bounding box."""
if wcs_version == "2.0.1":
return (
f"{wcs_base}?"
f"service=WCS&version=2.0.1&request=GetCoverage"
f"&coverageId={coverage_id}"
f"&format=image/tiff"
f"&subsettingCRS=EPSG:27700"
f"&subset=E({min_e},{max_e})"
f"&subset=N({min_n},{max_n})"
f"&scaleFactor={NATIVE_RESOLUTION / RESOLUTION}"
)
width = (max_e - min_e) // RESOLUTION
height = (max_n - min_n) // RESOLUTION
return (
f"{wcs_base}?"
f"service=WCS&version=1.0.0&request=GetCoverage"
f"&coverage={coverage_id}"
f"&CRS=EPSG:27700"
f"&BBOX={min_e},{min_n},{max_e},{max_n}"
f"&width={width}&height={height}"
f"&format=GeoTIFF"
)
_TO_BNG = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
def _bng_from_latlon(lat: np.ndarray, lon: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
"""Convert WGS84 lat/lon to British National Grid easting/northing."""
return _TO_BNG.transform(lon, lat) # pyproj takes (x=lon, y=lat)
def _looks_like_tiff(response: httpx.Response) -> bool:
content_type = response.headers.get("content-type", "")
return "tiff" in content_type or response.content[:4] in (b"II*\x00", b"MM\x00*")
def _fetch_tile_bytes(
wcs_base: str,
coverage_id: str,
min_e: int,
min_n: int,
max_e: int,
max_n: int,
wcs_version: str = "1.0.0",
) -> bytes:
"""Fetch one WCS tile."""
url = _wcs_get_coverage_url(
wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
)
with httpx.Client(timeout=300, follow_redirects=True) as client:
resp = client.get(url)
resp.raise_for_status()
if not _looks_like_tiff(resp):
content_type = resp.headers.get("content-type", "<missing>")
body_preview = resp.text[:200].replace("\n", " ")
raise NoGeoTiffError(
f"WCS returned non-GeoTIFF response ({content_type}): {body_preview}"
)
return resp.content
def _split_tile(min_e: int, min_n: int, max_e: int, max_n: int) -> list[Tile]:
e_edges = [min_e, max_e]
n_edges = [min_n, max_n]
if max_e - min_e > MIN_TILE_SIZE:
e_edges.insert(1, (min_e + max_e) // 2)
if max_n - min_n > MIN_TILE_SIZE:
n_edges.insert(1, (min_n + max_n) // 2)
subtiles: list[Tile] = []
for e0, e1 in zip(e_edges, e_edges[1:]):
for n0, n1 in zip(n_edges, n_edges[1:]):
if e1 > e0 and n1 > n0:
subtiles.append((e0, n0, e1, n1))
return subtiles
def _tile_path(tile_dir: Path, min_e: int, min_n: int, max_e: int, max_n: int) -> Path:
return tile_dir / f"tile_{min_e}_{min_n}_{max_e}_{max_n}.tif"
def _download_tile(
wcs_base: str,
coverage_id: str,
min_e: int,
min_n: int,
max_e: int,
max_n: int,
tile_dir: Path,
wcs_version: str = "1.0.0",
split_failures: bool = True,
) -> tuple[list[Path], list[Tile]]:
"""Download a WCS tile, splitting on repeated server failures."""
tile_path = _tile_path(tile_dir, min_e, min_n, max_e, max_n)
if tile_path.exists() and tile_path.stat().st_size > 0:
return [tile_path], []
last_error: Exception | None = None
for attempt in range(1, MAX_RETRIES + 1):
try:
content = _fetch_tile_bytes(
wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
)
tile_path.write_bytes(content)
return [tile_path], []
except (
NoGeoTiffError,
httpx.HTTPStatusError,
httpx.TimeoutException,
httpx.ConnectError,
) as e:
last_error = e
if attempt < MAX_RETRIES:
sleep_for = RETRY_BACKOFF_SECONDS * attempt
print(
f" Retrying tile ({min_e},{min_n})-({max_e},{max_n}) "
f"after {type(e).__name__} ({attempt}/{MAX_RETRIES})"
)
time.sleep(sleep_for)
subtiles = _split_tile(min_e, min_n, max_e, max_n) if split_failures else []
if len(subtiles) > 1:
print(
f" Splitting tile ({min_e},{min_n})-({max_e},{max_n}) "
f"into {len(subtiles)} smaller requests after: {last_error}"
)
paths: list[Path] = []
failures: list[Tile] = []
for e0, n0, e1, n1 in subtiles:
child_paths, child_failures = _download_tile(
wcs_base,
coverage_id,
e0,
n0,
e1,
n1,
tile_dir,
wcs_version,
split_failures,
)
paths.extend(child_paths)
failures.extend(child_failures)
return paths, failures
print(
f" Failed to download tile ({min_e},{min_n})-({max_e},{max_n}): {last_error}"
)
return [], [(min_e, min_n, max_e, max_n)]
def download_raster(
tile_dir: Path,
wcs_base: str,
coverage_id: str,
label: str,
wcs_version: str = "1.0.0",
allow_missing_tiles: bool = False,
) -> list[Path]:
"""Download noise GeoTIFF raster covering England, returning paths to saved files."""
tiles = []
for min_e in range(BNG_MIN_E, BNG_MAX_E, TILE_SIZE):
for min_n in range(BNG_MIN_N, BNG_MAX_N, TILE_SIZE):
max_e = min(min_e + TILE_SIZE, BNG_MAX_E)
max_n = min(min_n + TILE_SIZE, BNG_MAX_N)
tiles.append((min_e, min_n, max_e, max_n))
print(
f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)..."
)
paths: list[Path] = []
failures: list[Tile] = []
completed = 0
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
futures = {}
for min_e, min_n, max_e, max_n in tiles:
fut = executor.submit(
_download_tile,
wcs_base,
coverage_id,
min_e,
min_n,
max_e,
max_n,
tile_dir,
wcs_version,
not allow_missing_tiles,
)
futures[fut] = (min_e, min_n)
for fut in as_completed(futures):
completed += 1
result_paths, result_failures = fut.result()
paths.extend(result_paths)
failures.extend(result_failures)
print(
f"\r [{completed}/{len(tiles)}] Downloaded {len(paths)} GeoTIFFs",
end="",
flush=True,
)
if failures:
preview = ", ".join(
f"({e0},{n0})-({e1},{n1})" for e0, n0, e1, n1 in failures[:5]
)
suffix = "..." if len(failures) > 5 else ""
if allow_missing_tiles:
print(
f"\n[{label}] WARNING: skipped {len(failures)} missing/no-data "
f"tile requests: {preview}{suffix}"
)
print(f"[{label}] Downloaded {len(paths)} GeoTIFFs from {len(tiles)} tiles")
return paths
raise RuntimeError(
f"[{label}] Failed to download {len(failures)} tile requests after "
f"retries and splitting: {preview}{suffix}"
)
print(f"\n[{label}] Downloaded {len(paths)} GeoTIFFs from {len(tiles)} tiles")
return paths
def sample_noise_at_postcodes(
tile_paths: list[Path],
easting: np.ndarray,
northing: np.ndarray,
label: str,
col_name: str,
) -> pl.Series:
"""Sample max noise values from 10m tiles around postcode representative points."""
print(f"[{label}] Sampling max noise values from {len(tile_paths)} tiles...")
noise_db = np.full(len(easting), np.nan, dtype=np.float32)
radius_cells = max(0, math.ceil(POSTCODE_NOISE_RADIUS_M / RESOLUTION))
filter_size = radius_cells * 2 + 1
for path in tile_paths:
with rasterio.open(path) as dataset:
bounds = dataset.bounds
candidate_mask = (
(easting >= bounds.left - POSTCODE_NOISE_RADIUS_M)
& (easting <= bounds.right + POSTCODE_NOISE_RADIUS_M)
& (northing >= bounds.bottom - POSTCODE_NOISE_RADIUS_M)
& (northing <= bounds.top + POSTCODE_NOISE_RADIUS_M)
)
candidate_indices = np.flatnonzero(candidate_mask)
if len(candidate_indices) == 0:
continue
grid = dataset.read(1).astype(np.float32, copy=False)
invalid = ~np.isfinite(grid) | (grid == 0)
if dataset.nodata is not None:
invalid |= np.isclose(
grid, np.float32(dataset.nodata), rtol=1e-5, atol=1e-5
)
grid = grid.copy()
grid[invalid] = -np.inf
if filter_size > 1:
grid = maximum_filter(
grid, size=filter_size, mode="constant", cval=-np.inf
)
rows, cols = rowcol(
dataset.transform,
easting[candidate_indices],
northing[candidate_indices],
)
rows = np.asarray(rows)
cols = np.asarray(cols)
h, w = grid.shape
in_bounds = (rows >= 0) & (rows < h) & (cols >= 0) & (cols < w)
if not np.any(in_bounds):
continue
sampled_indices = candidate_indices[in_bounds]
sampled = grid[rows[in_bounds], cols[in_bounds]]
valid = sampled != -np.inf
if not np.any(valid):
continue
sampled_indices = sampled_indices[valid]
sampled = sampled[valid]
existing = noise_db[sampled_indices]
noise_db[sampled_indices] = np.where(
np.isnan(existing), sampled, np.maximum(existing, sampled)
)
valid_count = int(np.sum(~np.isnan(noise_db)))
print(
f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data"
)
# Return as masked Series: use null (not NaN) so that Polars max_horizontal
# correctly ignores missing values instead of propagating NaN.
return pl.Series(col_name, noise_db).fill_nan(None)
def main() -> None:
parser = argparse.ArgumentParser(
description="Download Defra noise data (road, rail, airport) and sample at postcode centroids"
)
parser.add_argument(
"--arcgis",
type=Path,
required=True,
help="ArcGIS postcode data parquet (for lat/lon coordinates)",
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
args = parser.parse_args()
args.output.parent.mkdir(parents=True, exist_ok=True)
print("Loading postcode coordinates...")
postcodes = pl.read_parquet(args.arcgis, columns=["pcds", "lat", "long"]).rename(
{"pcds": "postcode", "long": "lon"}
)
lat = postcodes["lat"].to_numpy()
lon = postcodes["lon"].to_numpy()
print("Converting lat/lon to BNG...")
easting, northing = _bng_from_latlon(lat, lon)
result = postcodes.select("postcode")
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
for (
label,
col_name,
wcs_base,
coverage_id,
wcs_version,
allow_missing_tiles,
) in NOISE_SOURCES:
tile_dir = Path(tmp) / label.lower()
tile_dir.mkdir()
tile_paths = download_raster(
tile_dir,
wcs_base,
coverage_id,
label,
wcs_version,
allow_missing_tiles,
)
if not tile_paths:
print(
f"[{label}] WARNING: No tiles downloaded — column will be all null"
)
series = pl.Series(col_name, [None] * len(lat), dtype=pl.Float32)
else:
series = sample_noise_at_postcodes(
tile_paths, easting, northing, label, col_name
)
result = result.with_columns(series)
result.write_parquet(args.output, compression="zstd")
size_mb = args.output.stat().st_size / (1024 * 1024)
print(f"Wrote {args.output} ({size_mb:.1f} MB)")
if __name__ == "__main__":
main()