scraping and data

2026-05-31 15:36:33 +01:00 · 2026-05-31 15:36:33 +01:00 · 8688b7475e
commit 8688b7475e
parent d98819b569
43 changed files with 4920 additions and 531 deletions
--- a/pipeline/transform/postcode_boundaries/main.py
+++ b/pipeline/transform/postcode_boundaries/main.py
@ -22,6 +22,12 @@ def main() -> None:
        description="Generate postcode boundary polygons from OA + INSPIRE + UPRN data"
    )
    parser.add_argument("--uprn", type=Path, required=True, help="UPRN lookup parquet")
+    parser.add_argument(
+        "--arcgis",
+        type=Path,
+        default=None,
+        help="Optional ArcGIS postcode parquet used to remap terminated postcodes",
+    )
    parser.add_argument(
        "--oa-boundaries", type=Path, required=True, help="OA boundaries GeoPackage"
    )
@ -46,7 +52,7 @@ def main() -> None:
    print("=" * 60)

    oa_geoms = load_oa_boundaries(args.oa_boundaries)
-    uprn_df, uprn_offsets = load_uprns(args.uprn)
+    uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)

    # Phase 2: Parse/load INSPIRE
    print()
--- a/pipeline/transform/postcode_boundaries/loader.py
+++ b/pipeline/transform/postcode_boundaries/loader.py
@ -0,0 +1,105 @@
+"""Load per-district postcode boundary GeoJSONs as EPSG:27700 polygons.
+
+The postcode-boundary pipeline (:mod:`output`) writes one WGS84 GeoJSON per
+postcode district under ``units/{district}.geojson``, each feature carrying a
+``postcodes`` (full unit string, e.g. "AL1 1AG") property. Spatial transforms
+that test points against postcode geometry want those polygons back in British
+National Grid (EPSG:27700) so buffers/distances are in metres.
+
+:func:`load_postcode_polygons` reads the files, reprojects WGS84→27700, repairs
+invalid rings, and returns parallel ``(postcodes, polygons)`` arrays sorted by
+postcode so callers can use the array index as a stable postcode id -- the same
+"buffer index == postcode index" convention used by ``tree_density``.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import numpy as np
+import shapely
+from pyproj import Transformer
+
+
+def _read_district(
+    path: Path, transformer: Transformer
+) -> tuple[np.ndarray, np.ndarray]:
+    """Return (postcodes, polygons_27700) for one district GeoJSON.
+
+    Args:
+        path: District GeoJSON file. Every feature is read via
+            ``feature["properties"]["postcodes"]`` and ``feature["geometry"]``.
+        transformer: pyproj transformer called as ``transform(x, y)`` on all
+            vertices at once; ``load_postcode_polygons`` builds it as
+            EPSG:4326 -> EPSG:27700 with ``always_xy=True``.
+
+    Returns:
+        Two parallel arrays in feature order: the ``postcodes`` property of
+        each feature and its reprojected (and, if needed, repaired) geometry.
+        Both are empty object arrays when the file has no features.
+
+    Raises:
+        KeyError: If a feature lacks ``properties``, ``postcodes`` or
+            ``geometry``.
+    """
+    # GeoJSON (RFC 7946) is UTF-8 JSON; pin the encoding instead of relying on
+    # the platform/locale default used by a bare ``open()``.
+    with path.open(encoding="utf-8") as file:
+        collection = json.load(file)
+
+    features = collection.get("features", [])
+    if not features:
+        return np.empty(0, dtype=object), np.empty(0, dtype=object)
+
+    postcodes = np.array(
+        [feature["properties"]["postcodes"] for feature in features], dtype=object
+    )
+    # shapely parses GeoJSON text, so each geometry dict is re-serialised first.
+    geom_json = np.array(
+        [json.dumps(feature["geometry"]) for feature in features], dtype=object
+    )
+    geoms = shapely.from_geojson(geom_json)
+
+    # Reproject every vertex in a single pyproj call, then rebuild the polygons.
+    coords = shapely.get_coordinates(geoms)
+    if coords.size:
+        x, y = transformer.transform(coords[:, 0], coords[:, 1])
+        geoms = shapely.set_coordinates(geoms, np.column_stack([x, y]))
+
+    # Repair only the geometries that are invalid after reprojection.
+    # NOTE(review): make_valid output is presumably not guaranteed to remain a
+    # single Polygon -- confirm downstream consumers accept other types.
+    invalid = ~shapely.is_valid(geoms)
+    if invalid.any():
+        geoms[invalid] = shapely.make_valid(geoms[invalid])
+
+    return postcodes, geoms
+
+
+def load_postcode_polygons(
+    units_dir: Path, max_postcodes: int | None = None
+) -> tuple[np.ndarray, np.ndarray]:
+    """Load all postcode polygons under ``units_dir`` reprojected to EPSG:27700.
+
+    Args:
+        units_dir: Directory holding the per-district ``*.geojson`` files.
+        max_postcodes: Optional cap (testing). Files are read in sorted
+            file-name order until at least this many *distinct* postcodes have
+            been seen; the sorted result is then truncated to the cap.
+
+    Returns:
+        ``(postcodes, polygons)`` parallel object arrays sorted by postcode,
+        one entry per distinct postcode (the first occurrence read wins).
+
+    Raises:
+        FileNotFoundError: If ``units_dir`` contains no ``*.geojson`` files.
+        ValueError: If ``max_postcodes`` is negative, or if no file yields any
+            feature.
+    """
+    # A negative cap would otherwise fall through to ``[:negative]`` slicing
+    # and silently drop trailing postcodes.
+    if max_postcodes is not None and max_postcodes < 0:
+        raise ValueError(f"max_postcodes must be non-negative, got {max_postcodes}")
+
+    units_dir = Path(units_dir)
+    files = sorted(units_dir.glob("*.geojson"))
+    if not files:
+        raise FileNotFoundError(f"No postcode-boundary GeoJSONs found in {units_dir}")
+
+    transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
+    postcode_chunks: list[np.ndarray] = []
+    geom_chunks: list[np.ndarray] = []
+    # Distinct postcodes read so far; only maintained when a cap is requested.
+    seen: set[str] = set()
+    for path in files:
+        postcodes, geoms = _read_district(path, transformer)
+        if len(postcodes) == 0:
+            continue
+        postcode_chunks.append(postcodes)
+        geom_chunks.append(geoms)
+        if max_postcodes is not None:
+            # Count distinct postcodes rather than raw rows: duplicates are
+            # removed below, so a raw-row count could stop reading too early
+            # and return fewer than ``max_postcodes`` entries.
+            seen.update(postcodes.tolist())
+            if len(seen) >= max_postcodes:
+                break
+
+    if not postcode_chunks:
+        raise ValueError(f"No postcode features found in {units_dir}")
+
+    postcodes = np.concatenate(postcode_chunks)
+    geoms = np.concatenate(geom_chunks)
+
+    # Stable postcode order makes "index == postcode id" deterministic; dedupe
+    # defensively (a postcode lives in exactly one district file).
+    order = np.argsort(postcodes, kind="stable")
+    postcodes = postcodes[order]
+    geoms = geoms[order]
+    _, first = np.unique(postcodes, return_index=True)
+    postcodes = postcodes[first]
+    geoms = geoms[first]
+
+    # NOTE(review): truncating here yields the lexicographically-first N
+    # postcodes only if file-name order tracks postcode order -- confirm.
+    if max_postcodes is not None and len(postcodes) > max_postcodes:
+        postcodes = postcodes[:max_postcodes]
+        geoms = geoms[:max_postcodes]
+
+    print(f"Loaded {len(postcodes):,} postcode polygons from {units_dir}")
+    return postcodes, geoms
--- a/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
+++ b/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
@ -121,6 +121,50 @@ class TestWhitespacePostcodes:
        loaded_df, _ = load_uprns(path)
        assert len(loaded_df) == 0

+    def test_non_english_oas_excluded(self, tmp_path):
+        """Only the row whose OA code starts with "E" survives loading."""
+        parquet_path = tmp_path / "uprn.parquet"
+        # One "E..." output area and one "W..." output area.
+        columns = {
+            "GRIDGB1E": [500010, 300010],
+            "GRIDGB1N": [180010, 220010],
+            "PCDS": ["AA1 1AA", "CF1 1AA"],
+            "OA21CD": ["E00000001", "W00000001"],
+        }
+        pl.DataFrame(columns).write_parquet(parquet_path)
+
+        result, oa_offsets = load_uprns(parquet_path)
+
+        # The offsets lookup is keyed by OA code; only the "E" code may remain.
+        assert set(oa_offsets) == {"E00000001"}
+        assert result["PCDS"].to_list() == ["AA1 1AA"]
+
+    def test_terminated_postcodes_are_remapped(self, tmp_path):
+        """A UPRN on a terminated postcode is loaded under its replacement."""
+        uprn_path = tmp_path / "uprn.parquet"
+        arcgis_path = tmp_path / "arcgis.parquet"
+
+        # Lower-case input on purpose: the expected result below is upper-case.
+        uprn_columns = {
+            "GRIDGB1E": [500010],
+            "GRIDGB1N": [180010],
+            "PCDS": ["aa1 1aa"],
+            "OA21CD": ["E00000001"],
+        }
+        pl.DataFrame(uprn_columns).write_parquet(uprn_path)
+
+        # "AA1 1AA" has a termination date; "AA1 1AB" does not.
+        arcgis_columns = {
+            "pcds": ["AA1 1AA", "AA1 1AB"],
+            "east1m": [500010, 500030],
+            "north1m": [180010, 180020],
+            "doterm": ["2020-01-01", None],
+            "ctry25cd": ["E92000001", "E92000001"],
+        }
+        pl.DataFrame(arcgis_columns).write_parquet(arcgis_path)
+
+        result, _ = load_uprns(uprn_path, arcgis_path)
+
+        assert result["PCDS"].to_list() == ["AA1 1AB"]
+

 # ---------------------------------------------------------------------------
 # Bug 3: Voronoi deduplication is first-seen-wins
--- a/pipeline/transform/postcode_boundaries/uprn.py
+++ b/pipeline/transform/postcode_boundaries/uprn.py
@ -4,11 +4,18 @@ import numpy as np
 import polars as pl

 from pipeline.local_temp import local_tmp_dir
+from pipeline.utils.postcode_mapping import build_postcode_mapping

 from .memory import release_memory


-def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
+def _canonical_postcode_expr(name: str) -> pl.Expr:
+    """Build the normalising expression for the postcode column ``name``.
+
+    The column is passed through ``str.strip_chars()`` (no argument, i.e.
+    surrounding whitespace per the polars docs) and then upper-cased. The
+    result is not aliased here; callers alias it themselves.
+    """
+    stripped = pl.col(name).str.strip_chars()
+    return stripped.str.to_uppercase()
+
+
+def load_uprns(
+    uprn_path: Path, arcgis_path: Path | None = None
+) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
    """Load UPRNs as a sorted polars DataFrame with OA offset lookup.

    Returns (df, offsets) where offsets[oa_code] = (start_row, end_row).
@ -17,29 +24,46 @@ def load_uprns(uprn_path: Path) -> tuple[pl.DataFrame, dict[str, tuple[int, int]
    import tempfile

    print("Loading UPRN lookup...")
+    mapping = None
+    if arcgis_path is not None:
+        mapping = (
+            build_postcode_mapping(arcgis_path)
+            .with_columns(
+                _canonical_postcode_expr("old_postcode").alias("old_postcode"),
+                _canonical_postcode_expr("new_postcode").alias("new_postcode"),
+            )
+            .unique("old_postcode")
+        )

    # Sort via streaming sink to avoid polars doubling memory during in-memory sort
    with tempfile.NamedTemporaryFile(
        suffix=".parquet", delete=False, dir=local_tmp_dir()
    ) as tmp:
        tmp_path = Path(tmp.name)
-    (
+    uprns = (
        pl.scan_parquet(uprn_path)
        .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
-        .filter(~pl.col("OA21CD").str.starts_with("S"))
+        .filter(pl.col("OA21CD").str.starts_with("E"))
        .filter(pl.col("GRIDGB1E").is_not_null() & pl.col("GRIDGB1N").is_not_null())
-        .with_columns(pl.col("PCDS").str.strip_chars())
+        .with_columns(_canonical_postcode_expr("PCDS").alias("PCDS"))
        .filter(pl.col("PCDS").is_not_null() & (pl.col("PCDS") != ""))
-        .sort("OA21CD")
-        .sink_parquet(tmp_path)
    )
+
+    if mapping is not None and mapping.height > 0:
+        uprns = (
+            uprns.join(mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left")
+            .with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
+            .select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
+        )
+
+    uprns.sort("OA21CD").sink_parquet(tmp_path)
    release_memory()

    # Read the sorted data — only one copy in memory (~2GB)
    df = pl.read_parquet(tmp_path)
    tmp_path.unlink()
    n = len(df)
-    print(f"  Loaded {n:,} UPRNs (England & Wales)")
+    print(f"  Loaded {n:,} UPRNs (England)")

    # Compute OA group offsets using polars (avoids 37M Python string creation)
    boundary_df = (