scraping and data
This commit is contained in:
parent
d98819b569
commit
8688b7475e
43 changed files with 4920 additions and 531 deletions
105
pipeline/transform/postcode_boundaries/loader.py
Normal file
105
pipeline/transform/postcode_boundaries/loader.py
Normal file
|
|
@ -0,0 +1,105 @@
|
|||
"""Load per-district postcode boundary GeoJSONs as EPSG:27700 polygons.
|
||||
|
||||
The postcode-boundary pipeline (:mod:`output`) writes one WGS84 GeoJSON per
postcode district under ``units/{district}.geojson``, each feature carrying a
``postcodes`` (full unit string, e.g. "AL1 1AG") property. Spatial transforms
that test points against postcode geometry want those polygons back in British
National Grid (EPSG:27700) so buffers/distances are in metres.

:func:`load_postcode_polygons` reads the files, reprojects WGS84→27700, repairs
invalid rings, and returns parallel ``(postcodes, polygons)`` arrays sorted by
postcode so callers can use the array index as a stable postcode id -- the same
"buffer index == postcode index" convention used by ``tree_density``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import shapely
|
||||
from pyproj import Transformer
|
||||
|
||||
|
||||
def _read_district(
    path: Path, transformer: Transformer
) -> tuple[np.ndarray, np.ndarray]:
    """Return ``(postcodes, polygons_27700)`` for one district GeoJSON.

    ``path`` is a GeoJSON FeatureCollection whose features each carry a
    ``postcodes`` property and a ``geometry``. ``transformer`` is applied to
    every vertex as ``transformer.transform(x, y)``.

    Returns two parallel object arrays: the ``postcodes`` property of every
    feature and the matching reprojected (and, where needed, repaired)
    geometry. Both arrays are empty when the file has no features.

    Raises ``KeyError`` if a feature lacks ``properties``, ``postcodes`` or
    ``geometry``.
    """
    # GeoJSON is UTF-8 by specification (RFC 7946); do not depend on the
    # platform's locale default when decoding the file.
    with path.open(encoding="utf-8") as file:
        collection = json.load(file)

    features = collection.get("features", [])
    if not features:
        return np.empty(0, dtype=object), np.empty(0, dtype=object)

    postcodes = np.array(
        [feature["properties"]["postcodes"] for feature in features], dtype=object
    )
    # shapely parses GeoJSON *strings*, so each geometry dict is re-serialised.
    geom_json = np.array(
        [json.dumps(feature["geometry"]) for feature in features], dtype=object
    )
    geoms = shapely.from_geojson(geom_json)

    # Reproject every vertex in a single pyproj call, then rebuild the polygons.
    coords = shapely.get_coordinates(geoms)
    if coords.size:
        x, y = transformer.transform(coords[:, 0], coords[:, 1])
        geoms = shapely.set_coordinates(geoms, np.column_stack([x, y]))

    # Repair only the geometries that are invalid after reprojection.
    invalid = ~shapely.is_valid(geoms)
    if invalid.any():
        geoms[invalid] = shapely.make_valid(geoms[invalid])

    return postcodes, geoms
|
||||
|
||||
|
||||
def load_postcode_polygons(
|
||||
units_dir: Path, max_postcodes: int | None = None
|
||||
) -> tuple[np.ndarray, np.ndarray]:
|
||||
"""Load all postcode polygons under ``units_dir`` reprojected to EPSG:27700.
|
||||
|
||||
Returns ``(postcodes, polygons)`` parallel object arrays sorted by postcode.
|
||||
``max_postcodes`` (testing) keeps only the lexicographically-first N
|
||||
postcodes, reading just enough district files to reach the cap.
|
||||
"""
|
||||
units_dir = Path(units_dir)
|
||||
files = sorted(units_dir.glob("*.geojson"))
|
||||
if not files:
|
||||
raise FileNotFoundError(f"No postcode-boundary GeoJSONs found in {units_dir}")
|
||||
|
||||
transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
|
||||
postcode_chunks: list[np.ndarray] = []
|
||||
geom_chunks: list[np.ndarray] = []
|
||||
total = 0
|
||||
for path in files:
|
||||
postcodes, geoms = _read_district(path, transformer)
|
||||
if len(postcodes) == 0:
|
||||
continue
|
||||
postcode_chunks.append(postcodes)
|
||||
geom_chunks.append(geoms)
|
||||
total += len(postcodes)
|
||||
if max_postcodes is not None and total >= max_postcodes:
|
||||
break
|
||||
|
||||
if not postcode_chunks:
|
||||
raise ValueError(f"No postcode features found in {units_dir}")
|
||||
|
||||
postcodes = np.concatenate(postcode_chunks)
|
||||
geoms = np.concatenate(geom_chunks)
|
||||
|
||||
# Stable postcode order makes "index == postcode id" deterministic; dedupe
|
||||
# defensively (a postcode lives in exactly one district file).
|
||||
order = np.argsort(postcodes, kind="stable")
|
||||
postcodes = postcodes[order]
|
||||
geoms = geoms[order]
|
||||
_, first = np.unique(postcodes, return_index=True)
|
||||
postcodes = postcodes[first]
|
||||
geoms = geoms[first]
|
||||
|
||||
if max_postcodes is not None and len(postcodes) > max_postcodes:
|
||||
postcodes = postcodes[:max_postcodes]
|
||||
geoms = geoms[:max_postcodes]
|
||||
|
||||
print(f"Loaded {len(postcodes):,} postcode polygons from {units_dir}")
|
||||
return postcodes, geoms
|
||||
Loading…
Add table
Add a link
Reference in a new issue