From 3c2e527328fe339e048d277add1eae6a227ff735 Mon Sep 17 00:00:00 2001
From: Andras Schmelczer <andras@schmelczer.dev>
Date: Sat, 7 Feb 2026 19:13:36 +0000
Subject: [PATCH] Format pipeline code; use count_pois_per_postcode in poi_proximity

---
 pipeline/download/broadband.py      |  8 ++++-
 pipeline/download/ethnicity.py      |  4 +--
 pipeline/download/naptan.py         |  8 ++---
 pipeline/download/noise.py          | 52 +++++++++++++++++++++--------
 pipeline/transform/crime.py         |  6 +++-
 pipeline/transform/poi_proximity.py | 12 ++++---
 6 files changed, 64 insertions(+), 26 deletions(-)

diff --git a/pipeline/download/broadband.py b/pipeline/download/broadband.py
index cc70577..02ee548 100644
--- a/pipeline/download/broadband.py
+++ b/pipeline/download/broadband.py
@@ -54,9 +54,15 @@ def main() -> None:
         download(PERFORMANCE_URL, zip_path)
         extract_zip(zip_path, extract_dir)
         print(list((extract_dir / "202507_fixed_coverage_r01").glob("*")))
-        extract_zip(extract_dir / "202507_fixed_coverage_r01" / "202507_fixed_pc_coverage_r01.zip", extracted_again_dir)
+        extract_zip(
+            extract_dir
+            / "202507_fixed_coverage_r01"
+            / "202507_fixed_pc_coverage_r01.zip",
+            extracted_again_dir,
+        )
 
         convert_to_parquet(extracted_again_dir, args.output)
 
+
 if __name__ == "__main__":
     main()
diff --git a/pipeline/download/ethnicity.py b/pipeline/download/ethnicity.py
index 22a21ad..5971c49 100644
--- a/pipeline/download/ethnicity.py
+++ b/pipeline/download/ethnicity.py
@@ -31,9 +31,7 @@ def download_and_convert(output_path: Path) -> None:
     )
 
     # Rename columns to be descriptive
-    rename_map = {
-        col: f"% {col}" for col in wide.columns if col != "Geography_code"
-    }
+    rename_map = {col: f"% {col}" for col in wide.columns if col != "Geography_code"}
     wide = wide.rename(rename_map)
 
     print(f"Output shape: {wide.shape}")
diff --git a/pipeline/download/naptan.py b/pipeline/download/naptan.py
index 3935c21..750344d 100644
--- a/pipeline/download/naptan.py
+++ b/pipeline/download/naptan.py
@@ -11,13 +11,13 @@ NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
 
 
 STOP_TYPES = {
-    'AIR': "Airport",
-    'FTD': "Ferry",
+    "AIR": "Airport",
+    "FTD": "Ferry",
     "RSE": "Rail station",
     "BCT": "Bus stop",
     "BCE": "Bus station",
     "TXR": "Taxi rank",
-    "TMU": "Metro or Tram stop",   
+    "TMU": "Metro or Tram stop",
 }
 
 
@@ -48,7 +48,7 @@ def download_naptan(output: Path) -> None:
     )
 
     df.write_parquet(output)
-    size_mb = output.stat().st_size / (1024 * 1024) 
+    size_mb = output.stat().st_size / (1024 * 1024)
     print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} stations)")
 
     counts = df.group_by("category").len().sort("len", descending=True)
diff --git a/pipeline/download/noise.py b/pipeline/download/noise.py
index 554adfe..0433f14 100644
--- a/pipeline/download/noise.py
+++ b/pipeline/download/noise.py
@@ -128,14 +128,19 @@ def _download_tile(
     wcs_version: str = "1.0.0",
 ) -> Path | None:
     """Download a single WCS tile. Returns path if successful, None otherwise."""
-    url = _wcs_get_coverage_url(wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version)
+    url = _wcs_get_coverage_url(
+        wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
+    )
     try:
         with httpx.Client(timeout=300, follow_redirects=True) as client:
             resp = client.get(url)
             resp.raise_for_status()
 
         content_type = resp.headers.get("content-type", "")
-        if "tiff" not in content_type and resp.content[:4] not in (b"II*\x00", b"MM\x00*"):
+        if "tiff" not in content_type and resp.content[:4] not in (
+            b"II*\x00",
+            b"MM\x00*",
+        ):
             return None
 
         tile_path.write_bytes(resp.content)
@@ -146,7 +151,11 @@ def _download_tile(
 
 
 def download_raster(
-    tile_dir: Path, wcs_base: str, coverage_id: str, label: str, wcs_version: str = "1.0.0"
+    tile_dir: Path,
+    wcs_base: str,
+    coverage_id: str,
+    label: str,
+    wcs_version: str = "1.0.0",
 ) -> list[Path]:
     """Download noise GeoTIFF raster covering England, returning paths to saved files."""
     tiles = []
@@ -156,7 +165,9 @@ def download_raster(
             max_n = min(min_n + TILE_SIZE, BNG_MAX_N)
             tiles.append((min_e, min_n, max_e, max_n))
 
-    print(f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)...")
+    print(
+        f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)..."
+    )
     paths = []
     completed = 0
 
@@ -165,8 +176,15 @@ def download_raster(
         for min_e, min_n, max_e, max_n in tiles:
             tile_path = tile_dir / f"tile_{min_e}_{min_n}.tif"
             fut = executor.submit(
-                _download_tile, wcs_base, coverage_id,
-                min_e, min_n, max_e, max_n, tile_path, wcs_version,
+                _download_tile,
+                wcs_base,
+                coverage_id,
+                min_e,
+                min_n,
+                max_e,
+                max_n,
+                tile_path,
+                wcs_version,
             )
             futures[fut] = (min_e, min_n)
 
@@ -223,7 +241,9 @@ def sample_noise_at_postcodes(
     noise_db[in_bounds] = sampled
 
     valid_count = int(np.sum(~np.isnan(noise_db)))
-    print(f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data")
+    print(
+        f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data"
+    )
 
     # Return as masked Series: use null (not NaN) so that Polars max_horizontal
     # correctly ignores missing values instead of propagating NaN.
@@ -248,9 +268,9 @@ def main() -> None:
     args.output.parent.mkdir(parents=True, exist_ok=True)
 
     print("Loading postcode coordinates...")
-    postcodes = pl.read_parquet(
-        args.arcgis, columns=["pcds", "lat", "long"]
-    ).rename({"pcds": "postcode", "long": "lon"})
+    postcodes = pl.read_parquet(args.arcgis, columns=["pcds", "lat", "long"]).rename(
+        {"pcds": "postcode", "long": "lon"}
+    )
 
     lat = postcodes["lat"].to_numpy()
     lon = postcodes["lon"].to_numpy()
@@ -264,13 +284,19 @@ def main() -> None:
         for label, col_name, wcs_base, coverage_id, wcs_version in NOISE_SOURCES:
             tile_dir = Path(tmp) / label.lower()
             tile_dir.mkdir()
-            tile_paths = download_raster(tile_dir, wcs_base, coverage_id, label, wcs_version)
+            tile_paths = download_raster(
+                tile_dir, wcs_base, coverage_id, label, wcs_version
+            )
 
             if not tile_paths:
-                print(f"[{label}] WARNING: No tiles downloaded — column will be all null")
+                print(
+                    f"[{label}] WARNING: No tiles downloaded — column will be all null"
+                )
                 series = pl.Series(col_name, [None] * len(lat), dtype=pl.Float32)
             else:
-                series = sample_noise_at_postcodes(tile_paths, easting, northing, label, col_name)
+                series = sample_noise_at_postcodes(
+                    tile_paths, easting, northing, label, col_name
+                )
 
             result = result.with_columns(series)
 
diff --git a/pipeline/transform/crime.py b/pipeline/transform/crime.py
index 6eecb83..acd4010 100644
--- a/pipeline/transform/crime.py
+++ b/pipeline/transform/crime.py
@@ -10,7 +10,11 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
 
     df = pl.scan_csv(
         csvs,
-        schema_overrides={"LSOA code": pl.Utf8, "Crime type": pl.Utf8, "Month": pl.Utf8},
+        schema_overrides={
+            "LSOA code": pl.Utf8,
+            "Crime type": pl.Utf8,
+            "Month": pl.Utf8,
+        },
     ).select("LSOA code", "Crime type", "Month")
 
     # Extract year, count crimes per LSOA / year / crime type
diff --git a/pipeline/transform/poi_proximity.py b/pipeline/transform/poi_proximity.py
index 0be1a9f..1b7aff3 100644
--- a/pipeline/transform/poi_proximity.py
+++ b/pipeline/transform/poi_proximity.py
@@ -5,7 +5,7 @@ from pathlib import Path
 
 import polars as pl
 
-from pipeline.utils.poi_counts import _count_pois_per_postcode
+from pipeline.utils.poi_counts import count_pois_per_postcode
 
 
 # POI category groups for proximity counting
@@ -13,11 +13,15 @@ POI_GROUPS = {
     "restaurants": ["Restaurant", "Fast Food"],
     "groceries": ["Greengrocer", "Grocery Shop", "Supermarket", "Convenience Store"],
     "parks": ["Park", "Garden", "Nature Reserve"],
-    "public_transport": ["Metro or Tram stop", "Rail station", "Bus stop", "Bus station"], # comes from naptan.py
+    "public_transport": [
+        "Metro or Tram stop",
+        "Rail station",
+        "Bus stop",
+        "Bus station",
+    ],  # comes from naptan.py
 }
 
 
-
 def main():
     parser = argparse.ArgumentParser(
         description="Count POIs within radius per postcode"
@@ -41,7 +45,7 @@ def main():
 
     pois = pl.read_parquet(args.pois)
 
-    result = _count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=2)
+    result = count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=2)
 
     result.write_parquet(args.output)
     size_mb = args.output.stat().st_size / (1024 * 1024)