This commit is contained in:
Andras Schmelczer 2026-02-07 19:13:36 +00:00
parent 555ba7cf53
commit 3c2e527328
6 changed files with 64 additions and 26 deletions

View file

@ -54,9 +54,15 @@ def main() -> None:
download(PERFORMANCE_URL, zip_path)
extract_zip(zip_path, extract_dir)
print(list((extract_dir / "202507_fixed_coverage_r01").glob("*")))
extract_zip(extract_dir / "202507_fixed_coverage_r01" / "202507_fixed_pc_coverage_r01.zip", extracted_again_dir)
extract_zip(
extract_dir
/ "202507_fixed_coverage_r01"
/ "202507_fixed_pc_coverage_r01.zip",
extracted_again_dir,
)
convert_to_parquet(extracted_again_dir, args.output)
if __name__ == "__main__":
main()

View file

@ -31,9 +31,7 @@ def download_and_convert(output_path: Path) -> None:
)
# Rename columns to be descriptive
rename_map = {
col: f"% {col}" for col in wide.columns if col != "Geography_code"
}
rename_map = {col: f"% {col}" for col in wide.columns if col != "Geography_code"}
wide = wide.rename(rename_map)
print(f"Output shape: {wide.shape}")

View file

@ -11,13 +11,13 @@ NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
STOP_TYPES = {
'AIR': "Airport",
'FTD': "Ferry",
"AIR": "Airport",
"FTD": "Ferry",
"RSE": "Rail station",
"BCT": "Bus stop",
"BCE": "Bus station",
"TXR": "Taxi rank",
"TMU": "Metro or Tram stop",
"TMU": "Metro or Tram stop",
}
@ -48,7 +48,7 @@ def download_naptan(output: Path) -> None:
)
df.write_parquet(output)
size_mb = output.stat().st_size / (1024 * 1024)
size_mb = output.stat().st_size / (1024 * 1024)
print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} stations)")
counts = df.group_by("category").len().sort("len", descending=True)

View file

@ -128,14 +128,19 @@ def _download_tile(
wcs_version: str = "1.0.0",
) -> Path | None:
"""Download a single WCS tile. Returns path if successful, None otherwise."""
url = _wcs_get_coverage_url(wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version)
url = _wcs_get_coverage_url(
wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
)
try:
with httpx.Client(timeout=300, follow_redirects=True) as client:
resp = client.get(url)
resp.raise_for_status()
content_type = resp.headers.get("content-type", "")
if "tiff" not in content_type and resp.content[:4] not in (b"II*\x00", b"MM\x00*"):
if "tiff" not in content_type and resp.content[:4] not in (
b"II*\x00",
b"MM\x00*",
):
return None
tile_path.write_bytes(resp.content)
@ -146,7 +151,11 @@ def _download_tile(
def download_raster(
tile_dir: Path, wcs_base: str, coverage_id: str, label: str, wcs_version: str = "1.0.0"
tile_dir: Path,
wcs_base: str,
coverage_id: str,
label: str,
wcs_version: str = "1.0.0",
) -> list[Path]:
"""Download noise GeoTIFF raster covering England, returning paths to saved files."""
tiles = []
@ -156,7 +165,9 @@ def download_raster(
max_n = min(min_n + TILE_SIZE, BNG_MAX_N)
tiles.append((min_e, min_n, max_e, max_n))
print(f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)...")
print(
f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)..."
)
paths = []
completed = 0
@ -165,8 +176,15 @@ def download_raster(
for min_e, min_n, max_e, max_n in tiles:
tile_path = tile_dir / f"tile_{min_e}_{min_n}.tif"
fut = executor.submit(
_download_tile, wcs_base, coverage_id,
min_e, min_n, max_e, max_n, tile_path, wcs_version,
_download_tile,
wcs_base,
coverage_id,
min_e,
min_n,
max_e,
max_n,
tile_path,
wcs_version,
)
futures[fut] = (min_e, min_n)
@ -223,7 +241,9 @@ def sample_noise_at_postcodes(
noise_db[in_bounds] = sampled
valid_count = int(np.sum(~np.isnan(noise_db)))
print(f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data")
print(
f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data"
)
# Return as masked Series: use null (not NaN) so that Polars max_horizontal
# correctly ignores missing values instead of propagating NaN.
@ -248,9 +268,9 @@ def main() -> None:
args.output.parent.mkdir(parents=True, exist_ok=True)
print("Loading postcode coordinates...")
postcodes = pl.read_parquet(
args.arcgis, columns=["pcds", "lat", "long"]
).rename({"pcds": "postcode", "long": "lon"})
postcodes = pl.read_parquet(args.arcgis, columns=["pcds", "lat", "long"]).rename(
{"pcds": "postcode", "long": "lon"}
)
lat = postcodes["lat"].to_numpy()
lon = postcodes["lon"].to_numpy()
@ -264,13 +284,19 @@ def main() -> None:
for label, col_name, wcs_base, coverage_id, wcs_version in NOISE_SOURCES:
tile_dir = Path(tmp) / label.lower()
tile_dir.mkdir()
tile_paths = download_raster(tile_dir, wcs_base, coverage_id, label, wcs_version)
tile_paths = download_raster(
tile_dir, wcs_base, coverage_id, label, wcs_version
)
if not tile_paths:
print(f"[{label}] WARNING: No tiles downloaded — column will be all null")
print(
f"[{label}] WARNING: No tiles downloaded — column will be all null"
)
series = pl.Series(col_name, [None] * len(lat), dtype=pl.Float32)
else:
series = sample_noise_at_postcodes(tile_paths, easting, northing, label, col_name)
series = sample_noise_at_postcodes(
tile_paths, easting, northing, label, col_name
)
result = result.with_columns(series)

View file

@ -10,7 +10,11 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
df = pl.scan_csv(
csvs,
schema_overrides={"LSOA code": pl.Utf8, "Crime type": pl.Utf8, "Month": pl.Utf8},
schema_overrides={
"LSOA code": pl.Utf8,
"Crime type": pl.Utf8,
"Month": pl.Utf8,
},
).select("LSOA code", "Crime type", "Month")
# Extract year, count crimes per LSOA / year / crime type

View file

@ -5,7 +5,7 @@ from pathlib import Path
import polars as pl
from pipeline.utils.poi_counts import _count_pois_per_postcode
from pipeline.utils.poi_counts import count_pois_per_postcode
# POI category groups for proximity counting
@ -13,11 +13,15 @@ POI_GROUPS = {
"restaurants": ["Restaurant", "Fast Food"],
"groceries": ["Greengrocer", "Grocery Shop", "Supermarket", "Convenience Store"],
"parks": ["Park", "Garden", "Nature Reserve"],
"public_transport": ["Metro or Tram stop", "Rail station", "Bus stop", "Bus station"], # comes from naptan.py
"public_transport": [
"Metro or Tram stop",
"Rail station",
"Bus stop",
"Bus station",
], # comes from naptan.py
}
def main():
parser = argparse.ArgumentParser(
description="Count POIs within radius per postcode"
@ -41,7 +45,7 @@ def main():
pois = pl.read_parquet(args.pois)
result = _count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=2)
result = count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=2)
result.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024)