Format

2026-02-07 19:13:36 +00:00 · 2026-02-07 19:13:36 +00:00 · 3c2e527328
commit 3c2e527328
parent 555ba7cf53
6 changed files with 64 additions and 26 deletions
--- a/pipeline/download/broadband.py
+++ b/pipeline/download/broadband.py
@@ -54,9 +54,15 @@ def main() -> None:
        download(PERFORMANCE_URL, zip_path)
        extract_zip(zip_path, extract_dir)
        print(list((extract_dir / "202507_fixed_coverage_r01").glob("*")))
-        extract_zip(extract_dir / "202507_fixed_coverage_r01" / "202507_fixed_pc_coverage_r01.zip", extracted_again_dir)
+        extract_zip(
+            extract_dir
+            / "202507_fixed_coverage_r01"
+            / "202507_fixed_pc_coverage_r01.zip",
+            extracted_again_dir,
+        )

        convert_to_parquet(extracted_again_dir, args.output)

+
 if __name__ == "__main__":
    main()
--- a/pipeline/download/ethnicity.py
+++ b/pipeline/download/ethnicity.py
@@ -31,9 +31,7 @@ def download_and_convert(output_path: Path) -> None:
    )

    # Rename columns to be descriptive
-    rename_map = {
-        col: f"% {col}" for col in wide.columns if col != "Geography_code"
-    }
+    rename_map = {col: f"% {col}" for col in wide.columns if col != "Geography_code"}
    wide = wide.rename(rename_map)

    print(f"Output shape: {wide.shape}")
--- a/pipeline/download/naptan.py
+++ b/pipeline/download/naptan.py
@@ -11,13 +11,13 @@ NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"


 STOP_TYPES = {
-    'AIR': "Airport",
-    'FTD': "Ferry",
+    "AIR": "Airport",
+    "FTD": "Ferry",
    "RSE": "Rail station",
    "BCT": "Bus stop",
    "BCE": "Bus station",
    "TXR": "Taxi rank",
-    "TMU": "Metro or Tram stop",   
+    "TMU": "Metro or Tram stop",
 }


@@ -48,7 +48,7 @@ def download_naptan(output: Path) -> None:
    )

    df.write_parquet(output)
-    size_mb = output.stat().st_size / (1024 * 1024) 
+    size_mb = output.stat().st_size / (1024 * 1024)
    print(f"Wrote {output} ({size_mb:.1f} MB, {len(df):,} stations)")

    counts = df.group_by("category").len().sort("len", descending=True)
--- a/pipeline/download/noise.py
+++ b/pipeline/download/noise.py
@@ -128,14 +128,19 @@ def _download_tile(
    wcs_version: str = "1.0.0",
 ) -> Path | None:
    """Download a single WCS tile. Returns path if successful, None otherwise."""
-    url = _wcs_get_coverage_url(wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version)
+    url = _wcs_get_coverage_url(
+        wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
+    )
    try:
        with httpx.Client(timeout=300, follow_redirects=True) as client:
            resp = client.get(url)
            resp.raise_for_status()

        content_type = resp.headers.get("content-type", "")
-        if "tiff" not in content_type and resp.content[:4] not in (b"II*\x00", b"MM\x00*"):
+        if "tiff" not in content_type and resp.content[:4] not in (
+            b"II*\x00",
+            b"MM\x00*",
+        ):
            return None

        tile_path.write_bytes(resp.content)
@@ -146,7 +151,11 @@ def _download_tile(


 def download_raster(
-    tile_dir: Path, wcs_base: str, coverage_id: str, label: str, wcs_version: str = "1.0.0"
+    tile_dir: Path,
+    wcs_base: str,
+    coverage_id: str,
+    label: str,
+    wcs_version: str = "1.0.0",
 ) -> list[Path]:
    """Download noise GeoTIFF raster covering England, returning paths to saved files."""
    tiles = []
@@ -156,7 +165,9 @@ def download_raster(
            max_n = min(min_n + TILE_SIZE, BNG_MAX_N)
            tiles.append((min_e, min_n, max_e, max_n))

-    print(f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)...")
+    print(
+        f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)..."
+    )
    paths = []
    completed = 0

@@ -165,8 +176,15 @@ def download_raster(
        for min_e, min_n, max_e, max_n in tiles:
            tile_path = tile_dir / f"tile_{min_e}_{min_n}.tif"
            fut = executor.submit(
-                _download_tile, wcs_base, coverage_id,
-                min_e, min_n, max_e, max_n, tile_path, wcs_version,
+                _download_tile,
+                wcs_base,
+                coverage_id,
+                min_e,
+                min_n,
+                max_e,
+                max_n,
+                tile_path,
+                wcs_version,
            )
            futures[fut] = (min_e, min_n)

@@ -223,7 +241,9 @@ def sample_noise_at_postcodes(
    noise_db[in_bounds] = sampled

    valid_count = int(np.sum(~np.isnan(noise_db)))
-    print(f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data")
+    print(
+        f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data"
+    )

    # Return as masked Series: use null (not NaN) so that Polars max_horizontal
    # correctly ignores missing values instead of propagating NaN.
@@ -248,9 +268,9 @@ def main() -> None:
    args.output.parent.mkdir(parents=True, exist_ok=True)

    print("Loading postcode coordinates...")
-    postcodes = pl.read_parquet(
-        args.arcgis, columns=["pcds", "lat", "long"]
-    ).rename({"pcds": "postcode", "long": "lon"})
+    postcodes = pl.read_parquet(args.arcgis, columns=["pcds", "lat", "long"]).rename(
+        {"pcds": "postcode", "long": "lon"}
+    )

    lat = postcodes["lat"].to_numpy()
    lon = postcodes["lon"].to_numpy()
@@ -264,13 +284,19 @@ def main() -> None:
        for label, col_name, wcs_base, coverage_id, wcs_version in NOISE_SOURCES:
            tile_dir = Path(tmp) / label.lower()
            tile_dir.mkdir()
-            tile_paths = download_raster(tile_dir, wcs_base, coverage_id, label, wcs_version)
+            tile_paths = download_raster(
+                tile_dir, wcs_base, coverage_id, label, wcs_version
+            )

            if not tile_paths:
-                print(f"[{label}] WARNING: No tiles downloaded — column will be all null")
+                print(
+                    f"[{label}] WARNING: No tiles downloaded — column will be all null"
+                )
                series = pl.Series(col_name, [None] * len(lat), dtype=pl.Float32)
            else:
-                series = sample_noise_at_postcodes(tile_paths, easting, northing, label, col_name)
+                series = sample_noise_at_postcodes(
+                    tile_paths, easting, northing, label, col_name
+                )

            result = result.with_columns(series)

--- a/pipeline/transform/crime.py
+++ b/pipeline/transform/crime.py
@@ -10,7 +10,11 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:

    df = pl.scan_csv(
        csvs,
-        schema_overrides={"LSOA code": pl.Utf8, "Crime type": pl.Utf8, "Month": pl.Utf8},
+        schema_overrides={
+            "LSOA code": pl.Utf8,
+            "Crime type": pl.Utf8,
+            "Month": pl.Utf8,
+        },
    ).select("LSOA code", "Crime type", "Month")

    # Extract year, count crimes per LSOA / year / crime type
--- a/pipeline/transform/poi_proximity.py
+++ b/pipeline/transform/poi_proximity.py
@@ -5,7 +5,7 @@ from pathlib import Path

 import polars as pl

-from pipeline.utils.poi_counts import _count_pois_per_postcode
+from pipeline.utils.poi_counts import count_pois_per_postcode


 # POI category groups for proximity counting
@@ -13,11 +13,15 @@ POI_GROUPS = {
    "restaurants": ["Restaurant", "Fast Food"],
    "groceries": ["Greengrocer", "Grocery Shop", "Supermarket", "Convenience Store"],
    "parks": ["Park", "Garden", "Nature Reserve"],
-    "public_transport": ["Metro or Tram stop", "Rail station", "Bus stop", "Bus station"], # comes from naptan.py
+    "public_transport": [
+        "Metro or Tram stop",
+        "Rail station",
+        "Bus stop",
+        "Bus station",
+    ],  # comes from naptan.py
 }


-
 def main():
    parser = argparse.ArgumentParser(
        description="Count POIs within radius per postcode"
@@ -41,7 +45,7 @@ def main():

    pois = pl.read_parquet(args.pois)

-    result = _count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=2)
+    result = count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=2)

    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)