Format
This commit is contained in:
parent
555ba7cf53
commit
3c2e527328
6 changed files with 64 additions and 26 deletions
|
|
@ -54,9 +54,15 @@ def main() -> None:
|
|||
download(PERFORMANCE_URL, zip_path)
|
||||
extract_zip(zip_path, extract_dir)
|
||||
print(list((extract_dir / "202507_fixed_coverage_r01").glob("*")))
|
||||
extract_zip(extract_dir / "202507_fixed_coverage_r01" / "202507_fixed_pc_coverage_r01.zip", extracted_again_dir)
|
||||
extract_zip(
|
||||
extract_dir
|
||||
/ "202507_fixed_coverage_r01"
|
||||
/ "202507_fixed_pc_coverage_r01.zip",
|
||||
extracted_again_dir,
|
||||
)
|
||||
|
||||
convert_to_parquet(extracted_again_dir, args.output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
|
|
@ -31,9 +31,7 @@ def download_and_convert(output_path: Path) -> None:
|
|||
)
|
||||
|
||||
# Rename columns to be descriptive
|
||||
rename_map = {
|
||||
col: f"% {col}" for col in wide.columns if col != "Geography_code"
|
||||
}
|
||||
rename_map = {col: f"% {col}" for col in wide.columns if col != "Geography_code"}
|
||||
wide = wide.rename(rename_map)
|
||||
|
||||
print(f"Output shape: {wide.shape}")
|
||||
|
|
|
|||
|
|
@ -11,8 +11,8 @@ NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
|
|||
|
||||
|
||||
STOP_TYPES = {
|
||||
'AIR': "Airport",
|
||||
'FTD': "Ferry",
|
||||
"AIR": "Airport",
|
||||
"FTD": "Ferry",
|
||||
"RSE": "Rail station",
|
||||
"BCT": "Bus stop",
|
||||
"BCE": "Bus station",
|
||||
|
|
|
|||
|
|
@ -128,14 +128,19 @@ def _download_tile(
|
|||
wcs_version: str = "1.0.0",
|
||||
) -> Path | None:
|
||||
"""Download a single WCS tile. Returns path if successful, None otherwise."""
|
||||
url = _wcs_get_coverage_url(wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version)
|
||||
url = _wcs_get_coverage_url(
|
||||
wcs_base, coverage_id, min_e, min_n, max_e, max_n, wcs_version
|
||||
)
|
||||
try:
|
||||
with httpx.Client(timeout=300, follow_redirects=True) as client:
|
||||
resp = client.get(url)
|
||||
resp.raise_for_status()
|
||||
|
||||
content_type = resp.headers.get("content-type", "")
|
||||
if "tiff" not in content_type and resp.content[:4] not in (b"II*\x00", b"MM\x00*"):
|
||||
if "tiff" not in content_type and resp.content[:4] not in (
|
||||
b"II*\x00",
|
||||
b"MM\x00*",
|
||||
):
|
||||
return None
|
||||
|
||||
tile_path.write_bytes(resp.content)
|
||||
|
|
@ -146,7 +151,11 @@ def _download_tile(
|
|||
|
||||
|
||||
def download_raster(
|
||||
tile_dir: Path, wcs_base: str, coverage_id: str, label: str, wcs_version: str = "1.0.0"
|
||||
tile_dir: Path,
|
||||
wcs_base: str,
|
||||
coverage_id: str,
|
||||
label: str,
|
||||
wcs_version: str = "1.0.0",
|
||||
) -> list[Path]:
|
||||
"""Download noise GeoTIFF raster covering England, returning paths to saved files."""
|
||||
tiles = []
|
||||
|
|
@ -156,7 +165,9 @@ def download_raster(
|
|||
max_n = min(min_n + TILE_SIZE, BNG_MAX_N)
|
||||
tiles.append((min_e, min_n, max_e, max_n))
|
||||
|
||||
print(f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)...")
|
||||
print(
|
||||
f"[{label}] Downloading {len(tiles)} tiles at {RESOLUTION}m resolution ({MAX_WORKERS} workers)..."
|
||||
)
|
||||
paths = []
|
||||
completed = 0
|
||||
|
||||
|
|
@ -165,8 +176,15 @@ def download_raster(
|
|||
for min_e, min_n, max_e, max_n in tiles:
|
||||
tile_path = tile_dir / f"tile_{min_e}_{min_n}.tif"
|
||||
fut = executor.submit(
|
||||
_download_tile, wcs_base, coverage_id,
|
||||
min_e, min_n, max_e, max_n, tile_path, wcs_version,
|
||||
_download_tile,
|
||||
wcs_base,
|
||||
coverage_id,
|
||||
min_e,
|
||||
min_n,
|
||||
max_e,
|
||||
max_n,
|
||||
tile_path,
|
||||
wcs_version,
|
||||
)
|
||||
futures[fut] = (min_e, min_n)
|
||||
|
||||
|
|
@ -223,7 +241,9 @@ def sample_noise_at_postcodes(
|
|||
noise_db[in_bounds] = sampled
|
||||
|
||||
valid_count = int(np.sum(~np.isnan(noise_db)))
|
||||
print(f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data")
|
||||
print(
|
||||
f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data"
|
||||
)
|
||||
|
||||
# Return as masked Series: use null (not NaN) so that Polars max_horizontal
|
||||
# correctly ignores missing values instead of propagating NaN.
|
||||
|
|
@ -248,9 +268,9 @@ def main() -> None:
|
|||
args.output.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
print("Loading postcode coordinates...")
|
||||
postcodes = pl.read_parquet(
|
||||
args.arcgis, columns=["pcds", "lat", "long"]
|
||||
).rename({"pcds": "postcode", "long": "lon"})
|
||||
postcodes = pl.read_parquet(args.arcgis, columns=["pcds", "lat", "long"]).rename(
|
||||
{"pcds": "postcode", "long": "lon"}
|
||||
)
|
||||
|
||||
lat = postcodes["lat"].to_numpy()
|
||||
lon = postcodes["lon"].to_numpy()
|
||||
|
|
@ -264,13 +284,19 @@ def main() -> None:
|
|||
for label, col_name, wcs_base, coverage_id, wcs_version in NOISE_SOURCES:
|
||||
tile_dir = Path(tmp) / label.lower()
|
||||
tile_dir.mkdir()
|
||||
tile_paths = download_raster(tile_dir, wcs_base, coverage_id, label, wcs_version)
|
||||
tile_paths = download_raster(
|
||||
tile_dir, wcs_base, coverage_id, label, wcs_version
|
||||
)
|
||||
|
||||
if not tile_paths:
|
||||
print(f"[{label}] WARNING: No tiles downloaded — column will be all null")
|
||||
print(
|
||||
f"[{label}] WARNING: No tiles downloaded — column will be all null"
|
||||
)
|
||||
series = pl.Series(col_name, [None] * len(lat), dtype=pl.Float32)
|
||||
else:
|
||||
series = sample_noise_at_postcodes(tile_paths, easting, northing, label, col_name)
|
||||
series = sample_noise_at_postcodes(
|
||||
tile_paths, easting, northing, label, col_name
|
||||
)
|
||||
|
||||
result = result.with_columns(series)
|
||||
|
||||
|
|
|
|||
|
|
@ -10,7 +10,11 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
|
|||
|
||||
df = pl.scan_csv(
|
||||
csvs,
|
||||
schema_overrides={"LSOA code": pl.Utf8, "Crime type": pl.Utf8, "Month": pl.Utf8},
|
||||
schema_overrides={
|
||||
"LSOA code": pl.Utf8,
|
||||
"Crime type": pl.Utf8,
|
||||
"Month": pl.Utf8,
|
||||
},
|
||||
).select("LSOA code", "Crime type", "Month")
|
||||
|
||||
# Extract year, count crimes per LSOA / year / crime type
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ from pathlib import Path
|
|||
|
||||
import polars as pl
|
||||
|
||||
from pipeline.utils.poi_counts import _count_pois_per_postcode
|
||||
from pipeline.utils.poi_counts import count_pois_per_postcode
|
||||
|
||||
|
||||
# POI category groups for proximity counting
|
||||
|
|
@ -13,11 +13,15 @@ POI_GROUPS = {
|
|||
"restaurants": ["Restaurant", "Fast Food"],
|
||||
"groceries": ["Greengrocer", "Grocery Shop", "Supermarket", "Convenience Store"],
|
||||
"parks": ["Park", "Garden", "Nature Reserve"],
|
||||
"public_transport": ["Metro or Tram stop", "Rail station", "Bus stop", "Bus station"], # comes from naptan.py
|
||||
"public_transport": [
|
||||
"Metro or Tram stop",
|
||||
"Rail station",
|
||||
"Bus stop",
|
||||
"Bus station",
|
||||
], # comes from naptan.py
|
||||
}
|
||||
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Count POIs within radius per postcode"
|
||||
|
|
@ -41,7 +45,7 @@ def main():
|
|||
|
||||
pois = pl.read_parquet(args.pois)
|
||||
|
||||
result = _count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=2)
|
||||
result = count_pois_per_postcode(postcodes, pois, groups=POI_GROUPS, radius_km=2)
|
||||
|
||||
result.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue