LGTM
This commit is contained in:
parent
a8165249a4
commit
a4103b0896
64 changed files with 5376 additions and 3832 deletions
|
|
@ -1,11 +1,11 @@
|
|||
"""Download Defra Round 4 (2022) strategic noise data for England.
|
||||
|
||||
Downloads modelled noise levels (road, rail, airport) as GeoTIFF rasters via
|
||||
WCS, then samples noise values at postcode centroids. Outputs a parquet file
|
||||
with postcode-level noise in dB for each source.
|
||||
WCS, then samples the local maximum around each postcode representative point.
|
||||
Outputs a parquet file with postcode-level noise in dB for each source.
|
||||
|
||||
Uses 100km tiles (~42 per source) to balance request size vs count. The server
|
||||
times out on tiles larger than ~150km at 100m resolution.
|
||||
Uses smaller 20km tiles at native 10m resolution so values are not understated
|
||||
by a single coarse centroid sample.
|
||||
|
||||
Data source: Defra Strategic Noise Mapping Round 4 (2022)
|
||||
- Lden = day-evening-night 24h weighted average (the EU standard metric)
|
||||
|
|
@ -17,6 +17,7 @@ endpoint is broken for that coverage).
|
|||
"""
|
||||
|
||||
import argparse
|
||||
import math
|
||||
import tempfile
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
|
@ -27,8 +28,8 @@ import numpy as np
|
|||
import polars as pl
|
||||
import rasterio
|
||||
from pyproj import Transformer
|
||||
from rasterio.merge import merge
|
||||
from rasterio.transform import rowcol
|
||||
from scipy.ndimage import maximum_filter
|
||||
|
||||
# Noise sources:
|
||||
# (label, column_name, WCS base URL, coverage ID, WCS version, allow_missing_tiles)
|
||||
|
|
@ -67,8 +68,9 @@ BNG_MAX_E = 660_000
|
|||
BNG_MIN_N = 0
|
||||
BNG_MAX_N = 660_000
|
||||
|
||||
# Tile size in metres (100km balances request size vs count; 300km causes 504s)
|
||||
TILE_SIZE = 100_000
|
||||
# Tile size in metres. At 10m resolution, 20km tiles are ~4M pixels each,
|
||||
# small enough to process one at a time without mosaicking all England.
|
||||
TILE_SIZE = 20_000
|
||||
|
||||
# Max concurrent tile downloads
|
||||
MAX_WORKERS = 4
|
||||
|
|
@ -76,19 +78,27 @@ MAX_WORKERS = 4
|
|||
# Native raster resolution (10m grid)
|
||||
NATIVE_RESOLUTION = 10
|
||||
|
||||
# Request pixel resolution in metres (100m is sufficient for postcode-level data
|
||||
# and keeps download size ~100x smaller than native 10m)
|
||||
RESOLUTION = 100
|
||||
# Request pixel resolution in metres.
|
||||
RESOLUTION = NATIVE_RESOLUTION
|
||||
|
||||
# The pipeline has postcode representative points rather than complete unit
|
||||
# polygons here. Use a small local footprint and take the maximum 10m cell so
|
||||
# postcode-level noise is not understated by centroid rounding.
|
||||
POSTCODE_NOISE_RADIUS_M = 50
|
||||
|
||||
# Retry/split behaviour for slow Defra WCS requests. Some 100km eastern tiles
|
||||
# intermittently return 504s; smaller fallback requests usually succeed.
|
||||
MAX_RETRIES = 3
|
||||
RETRY_BACKOFF_SECONDS = 5
|
||||
MIN_TILE_SIZE = 25_000
|
||||
MIN_TILE_SIZE = 5_000
|
||||
|
||||
type Tile = tuple[int, int, int, int]
|
||||
|
||||
|
||||
class NoGeoTiffError(RuntimeError):
|
||||
"""Raised when WCS returns an XML/HTML exception or another non-raster body."""
|
||||
|
||||
|
||||
def _wcs_get_coverage_url(
|
||||
wcs_base: str,
|
||||
coverage_id: str,
|
||||
|
|
@ -144,8 +154,8 @@ def _fetch_tile_bytes(
|
|||
max_e: int,
|
||||
max_n: int,
|
||||
wcs_version: str = "1.0.0",
|
||||
) -> bytes | None:
|
||||
"""Fetch one WCS tile. Returns None when the server reports no GeoTIFF."""
|
||||
) -> bytes:
|
||||
"""Fetch one WCS tile."""
|
||||
url = _wcs_get_coverage_url(
|
||||
wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
|
||||
)
|
||||
|
|
@ -154,7 +164,11 @@ def _fetch_tile_bytes(
|
|||
resp.raise_for_status()
|
||||
|
||||
if not _looks_like_tiff(resp):
|
||||
return None
|
||||
content_type = resp.headers.get("content-type", "<missing>")
|
||||
body_preview = resp.text[:200].replace("\n", " ")
|
||||
raise NoGeoTiffError(
|
||||
f"WCS returned non-GeoTIFF response ({content_type}): {body_preview}"
|
||||
)
|
||||
return resp.content
|
||||
|
||||
|
||||
|
|
@ -200,11 +214,14 @@ def _download_tile(
|
|||
content = _fetch_tile_bytes(
|
||||
wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
|
||||
)
|
||||
if content is None:
|
||||
return [], []
|
||||
tile_path.write_bytes(content)
|
||||
return [tile_path], []
|
||||
except (httpx.HTTPStatusError, httpx.TimeoutException, httpx.ConnectError) as e:
|
||||
except (
|
||||
NoGeoTiffError,
|
||||
httpx.HTTPStatusError,
|
||||
httpx.TimeoutException,
|
||||
httpx.ConnectError,
|
||||
) as e:
|
||||
last_error = e
|
||||
if attempt < MAX_RETRIES:
|
||||
sleep_for = RETRY_BACKOFF_SECONDS * attempt
|
||||
|
|
@ -323,35 +340,62 @@ def sample_noise_at_postcodes(
|
|||
label: str,
|
||||
col_name: str,
|
||||
) -> pl.Series:
|
||||
"""Sample noise values from merged tiles at given BNG coordinates."""
|
||||
print(f"[{label}] Merging {len(tile_paths)} tiles...")
|
||||
datasets = [rasterio.open(p) for p in tile_paths]
|
||||
raster_nodata = datasets[0].nodata
|
||||
mosaic, mosaic_transform = merge(datasets)
|
||||
for ds in datasets:
|
||||
ds.close()
|
||||
|
||||
noise_grid = mosaic[0]
|
||||
|
||||
print(f"[{label}] Sampling noise values at postcode centroids...")
|
||||
rows, cols = rowcol(mosaic_transform, easting, northing)
|
||||
rows = np.asarray(rows)
|
||||
cols = np.asarray(cols)
|
||||
|
||||
h, w = noise_grid.shape
|
||||
in_bounds = (rows >= 0) & (rows < h) & (cols >= 0) & (cols < w)
|
||||
|
||||
"""Sample max noise values from 10m tiles around postcode representative points."""
|
||||
print(f"[{label}] Sampling max noise values from {len(tile_paths)} tiles...")
|
||||
noise_db = np.full(len(easting), np.nan, dtype=np.float32)
|
||||
valid_rows = rows[in_bounds]
|
||||
valid_cols = cols[in_bounds]
|
||||
sampled = noise_grid[valid_rows, valid_cols].astype(np.float32)
|
||||
radius_cells = max(0, math.ceil(POSTCODE_NOISE_RADIUS_M / RESOLUTION))
|
||||
filter_size = radius_cells * 2 + 1
|
||||
|
||||
# Mark nodata and zero (unmapped areas) as NaN.
|
||||
# Road/rail use nodata=-96, airport uses nodata=3.4e38.
|
||||
if raster_nodata is not None:
|
||||
sampled[np.isclose(sampled, np.float32(raster_nodata), rtol=1e-5)] = np.nan
|
||||
sampled[sampled == 0] = np.nan
|
||||
noise_db[in_bounds] = sampled
|
||||
for path in tile_paths:
|
||||
with rasterio.open(path) as dataset:
|
||||
bounds = dataset.bounds
|
||||
candidate_mask = (
|
||||
(easting >= bounds.left - POSTCODE_NOISE_RADIUS_M)
|
||||
& (easting <= bounds.right + POSTCODE_NOISE_RADIUS_M)
|
||||
& (northing >= bounds.bottom - POSTCODE_NOISE_RADIUS_M)
|
||||
& (northing <= bounds.top + POSTCODE_NOISE_RADIUS_M)
|
||||
)
|
||||
candidate_indices = np.flatnonzero(candidate_mask)
|
||||
if len(candidate_indices) == 0:
|
||||
continue
|
||||
|
||||
grid = dataset.read(1).astype(np.float32, copy=False)
|
||||
invalid = ~np.isfinite(grid) | (grid == 0)
|
||||
if dataset.nodata is not None:
|
||||
invalid |= np.isclose(
|
||||
grid, np.float32(dataset.nodata), rtol=1e-5, atol=1e-5
|
||||
)
|
||||
grid = grid.copy()
|
||||
grid[invalid] = -np.inf
|
||||
if filter_size > 1:
|
||||
grid = maximum_filter(
|
||||
grid, size=filter_size, mode="constant", cval=-np.inf
|
||||
)
|
||||
|
||||
rows, cols = rowcol(
|
||||
dataset.transform,
|
||||
easting[candidate_indices],
|
||||
northing[candidate_indices],
|
||||
)
|
||||
rows = np.asarray(rows)
|
||||
cols = np.asarray(cols)
|
||||
h, w = grid.shape
|
||||
in_bounds = (rows >= 0) & (rows < h) & (cols >= 0) & (cols < w)
|
||||
if not np.any(in_bounds):
|
||||
continue
|
||||
|
||||
sampled_indices = candidate_indices[in_bounds]
|
||||
sampled = grid[rows[in_bounds], cols[in_bounds]]
|
||||
valid = sampled != -np.inf
|
||||
if not np.any(valid):
|
||||
continue
|
||||
|
||||
sampled_indices = sampled_indices[valid]
|
||||
sampled = sampled[valid]
|
||||
existing = noise_db[sampled_indices]
|
||||
noise_db[sampled_indices] = np.where(
|
||||
np.isnan(existing), sampled, np.maximum(existing, sampled)
|
||||
)
|
||||
|
||||
valid_count = int(np.sum(~np.isnan(noise_db)))
|
||||
print(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue