From 3c2e527328fe339e048d277add1eae6a227ff735 Mon Sep 17 00:00:00 2001 From: Andras Schmelczer Date: Sat, 7 Feb 2026 19:13:36 +0000 Subject: [PATCH] Format --- pipeline/download/broadband.py | 8 ++++- pipeline/download/ethnicity.py | 4 +-- pipeline/download/naptan.py | 8 ++--- pipeline/download/noise.py | 52 +++++++++++++++++++++-------- pipeline/transform/crime.py | 6 +++- pipeline/transform/poi_proximity.py | 12 ++++--- 6 files changed, 64 insertions(+), 26 deletions(-) diff --git a/pipeline/download/broadband.py b/pipeline/download/broadband.py index cc70577..02ee548 100644 --- a/pipeline/download/broadband.py +++ b/pipeline/download/broadband.py @@ -54,9 +54,15 @@ def main() -> None: download(PERFORMANCE_URL, zip_path) extract_zip(zip_path, extract_dir) print(list((extract_dir / "202507_fixed_coverage_r01").glob("*"))) - extract_zip(extract_dir / "202507_fixed_coverage_r01" / "202507_fixed_pc_coverage_r01.zip", extracted_again_dir) + extract_zip( + extract_dir + / "202507_fixed_coverage_r01" + / "202507_fixed_pc_coverage_r01.zip", + extracted_again_dir, + ) convert_to_parquet(extracted_again_dir, args.output) + if __name__ == "__main__": main() diff --git a/pipeline/download/ethnicity.py b/pipeline/download/ethnicity.py index 22a21ad..5971c49 100644 --- a/pipeline/download/ethnicity.py +++ b/pipeline/download/ethnicity.py @@ -31,9 +31,7 @@ def download_and_convert(output_path: Path) -> None: ) # Rename columns to be descriptive - rename_map = { - col: f"% {col}" for col in wide.columns if col != "Geography_code" - } + rename_map = {col: f"% {col}" for col in wide.columns if col != "Geography_code"} wide = wide.rename(rename_map) print(f"Output shape: {wide.shape}") diff --git a/pipeline/download/naptan.py b/pipeline/download/naptan.py index 3935c21..750344d 100644 --- a/pipeline/download/naptan.py +++ b/pipeline/download/naptan.py @@ -11,13 +11,13 @@ NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv" STOP_TYPES = { - 'AIR': "Airport", - 'FTD': "Ferry", + "AIR": "Airport", + "FTD": "Ferry", "RSE": "Rail station", "BCT": "Bus stop", "BCE": "Bus station", "TXR": "Taxi rank", - "TMU": "Metro or Tram stop", + "TMU": "Metro or Tram stop", } @@ -48,7 +48,7 @@ def download_naptan(output: Path) -> None: ) df.write_parquet(output) - size_mb = output.stat().st_size / (1024 * 1024) + size_mb = output.stat().st_size / (1024 * 1024) print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} stations)") counts = df.group_by("category").len().sort("len", descending=True) diff --git a/pipeline/download/noise.py b/pipeline/download/noise.py index 554adfe..0433f14 100644 --- a/pipeline/download/noise.py +++ b/pipeline/download/noise.py @@ -128,14 +128,19 @@ def _download_tile( wcs_version: str = "1.0.0", ) -> Path | None: """Download a single WCS tile. Returns path if successful, None otherwise.""" - url = _wcs_get_coverage_url(wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version) + url = _wcs_get_coverage_url( + wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version + ) try: with httpx.Client(timeout=300, follow_redirects=True) as client: resp = client.get(url) resp.raise_for_status() content_type = resp.headers.get("content-type", "") - if "tiff" not in content_type and resp.content[:4] not in (b"II*\x00", b"MM\x00*"): + if "tiff" not in content_type and resp.content[:4] not in ( + b"II*\x00", + b"MM\x00*", + ): return None tile_path.write_bytes(resp.content) @@ -146,7 +151,11 @@ def _download_tile( def download_raster( - tile_dir: Path, wcs_base: str, coverage_id: str, label: str, wcs_version: str = "1.0.0" + tile_dir: Path, + wcs_base: str, + coverage_id: str, + label: str, + wcs_version: str = "1.0.0", ) -> list[Path]: """Download noise GeoTIFF raster covering England, returning paths to saved files.""" tiles = [] @@ -156,7 +165,9 @@ def download_raster( max_n = min(min_n + TILE_SIZE, BNG_MAX_N) tiles.append((min_e, min_n, max_e, max_n)) - print(f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)...") + print( + f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)..." + ) paths = [] completed = 0 @@ -165,8 +176,15 @@ def download_raster( for min_e, min_n, max_e, max_n in tiles: tile_path = tile_dir / f"tile_{min_e}_{min_n}.tif" fut = executor.submit( - _download_tile, wcs_base, coverage_id, - min_e, min_n, max_e, max_n, tile_path, wcs_version, + _download_tile, + wcs_base, + coverage_id, + min_e, + min_n, + max_e, + max_n, + tile_path, + wcs_version, ) futures[fut] = (min_e, min_n) @@ -223,7 +241,9 @@ def sample_noise_at_postcodes( noise_db[in_bounds] = sampled valid_count = int(np.sum(~np.isnan(noise_db))) - print(f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data") + print( + f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data" + ) # Return as masked Series: use null (not NaN) so that Polars max_horizontal # correctly ignores missing values instead of propagating NaN. @@ -248,9 +268,9 @@ def main() -> None: args.output.parent.mkdir(parents=True, exist_ok=True) print("Loading postcode coordinates...") - postcodes = pl.read_parquet( - args.arcgis, columns=["pcds", "lat", "long"] - ).rename({"pcds": "postcode", "long": "lon"}) + postcodes = pl.read_parquet(args.arcgis, columns=["pcds", "lat", "long"]).rename( + {"pcds": "postcode", "long": "lon"} + ) lat = postcodes["lat"].to_numpy() lon = postcodes["lon"].to_numpy() @@ -264,13 +284,19 @@ def main() -> None: for label, col_name, wcs_base, coverage_id, wcs_version in NOISE_SOURCES: tile_dir = Path(tmp) / label.lower() tile_dir.mkdir() - tile_paths = download_raster(tile_dir, wcs_base, coverage_id, label, wcs_version) + tile_paths = download_raster( + tile_dir, wcs_base, coverage_id, label, wcs_version + ) if not tile_paths: - print(f"[{label}] WARNING: No tiles downloaded — column will be all null") + print( + f"[{label}] WARNING: No tiles downloaded — column will be all null" + ) series = pl.Series(col_name, [None] * len(lat), dtype=pl.Float32) else: - series = sample_noise_at_postcodes(tile_paths, easting, northing, label, col_name) + series = sample_noise_at_postcodes( + tile_paths, easting, northing, label, col_name + ) result = result.with_columns(series) diff --git a/pipeline/transform/crime.py b/pipeline/transform/crime.py index 6eecb83..acd4010 100644 --- a/pipeline/transform/crime.py +++ b/pipeline/transform/crime.py @@ -10,7 +10,11 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None: df = pl.scan_csv( csvs, - schema_overrides={"LSOA code": pl.Utf8, "Crime type": pl.Utf8, "Month": pl.Utf8}, + schema_overrides={ + "LSOA code": pl.Utf8, + "Crime type": pl.Utf8, + "Month": pl.Utf8, + }, ).select("LSOA code", "Crime type", "Month") # Extract year, count crimes per LSOA / year / crime type diff --git a/pipeline/transform/poi_proximity.py b/pipeline/transform/poi_proximity.py index 0be1a9f..1b7aff3 100644 --- a/pipeline/transform/poi_proximity.py +++ b/pipeline/transform/poi_proximity.py @@ -5,7 +5,7 @@ from pathlib import Path import polars as pl -from pipeline.utils.poi_counts import _count_pois_per_postcode +from pipeline.utils.poi_counts import count_pois_per_postcode # POI category groups for proximity counting @@ -13,11 +13,15 @@ POI_GROUPS = { "restaurants": ["Restaurant", "Fast Food"], "groceries": ["Greengrocer", "Grocery Shop", "Supermarket", "Convenience Store"], "parks": ["Park", "Garden", "Nature Reserve"], - "public_transport": ["Metro or Tram stop", "Rail station", "Bus stop", "Bus station"], # comes from naptan.py + "public_transport": [ + "Metro or Tram stop", + "Rail station", + "Bus stop", + "Bus station", + ], # comes from naptan.py } - def main(): parser = argparse.ArgumentParser( description="Count POIs within radius per postcode" @@ -41,7 +45,7 @@ def main(): pois = pl.read_parquet(args.pois) - result = _count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=2) + result = count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=2) result.write_parquet(args.output) size_mb = args.output.stat().st_size / (1024 * 1024)