scraping and data

2026-05-31 15:36:33 +01:00 · 2026-05-31 15:36:33 +01:00 · 8688b7475e
commit 8688b7475e
parent d98819b569
43 changed files with 4920 additions and 531 deletions
--- a/pipeline/transform/tree_density.py
+++ b/pipeline/transform/tree_density.py
@ -1,10 +1,16 @@
-"""Derive street-scale tree density metrics from Forest Research TOW data.
+"""Derive street-scale tree density metrics from Forest Research TOW + NFI data.

 The Forest Research Trees Outside Woodland release is an Esri File Geodatabase
 inside property-data/FR_TOW_V1_ALL.zip. This transformer computes a compact
 postcode-level metric from the tree polygons, then optionally rolls that up to
 Price Paid street names so the dashboard can answer "what is this address's
 street like?" without loading the full geodatabase at runtime.
+
+TOW only covers trees *outside* woodland, so the National Forest Inventory (NFI)
+woodland layer is optionally unioned in. TOW canopy is accumulated by centroid
+proximity (tiny crowns), while large NFI woodland parcels are accumulated by
+true buffer-clipped intersection area so they cannot saturate a postcode from
+mere centroid proximity.
 """

 from __future__ import annotations
@ -22,7 +28,6 @@ import shapely
 from scipy.spatial import cKDTree


-DEFAULT_TOW_TYPES = ("Lone Tree", "Group of Trees")
 TOW_GDB_NAME = "FR_TOW_V1_ALL.gdb"
 STREET_TREE_DENSITY_COL = "Street tree density percentile"
 STREET_TREE_COVERAGE_COL = "Street tree coverage (%)"
@ -32,6 +37,14 @@ POSTCODE_AREA_COL = "Tree canopy area within {radius}m (sqm)"
 POSTCODE_COUNT_COL = "Tree features within {radius}m"
 POSTCODE_HEIGHT_COL = "Mean TOW height within {radius}m (m)"

+# National Forest Inventory (NFI) woodland — the geometric complement of TOW.
+# NFI ships as a zipped shapefile of woodland parcels (>=0.5 ha) in EPSG:27700.
+# Field names are from the NFI Woodland England 2022 release; re-check on bumps.
+NFI_CATEGORY_COL = "CATEGORY"
+NFI_WOODLAND_VALUE = "Woodland"
+NFI_TYPE_COL = "IFT_IOA"
+NFI_AREA_HA_COL = "Area_ha"
+

 def _safe_extract_zip(zip_path: Path, extract_dir: Path, force: bool) -> Path:
    """Extract the TOW zip and return the extracted .gdb path."""
@ -83,12 +96,60 @@ def _tow_dataset_path(
    return str(_safe_extract_zip(zip_path, extract_dir, force_extract))


-def _where_for_tow_types(tow_types: tuple[str, ...] | None) -> str | None:
-    if not tow_types:
-        return None
-    escaped = [tow_type.replace("'", "''") for tow_type in tow_types]
-    values = ", ".join(f"'{tow_type}'" for tow_type in escaped)
-    return f"Woodland_Type IN ({values})"
+def _safe_extract_zip_dir(zip_path: Path, extract_dir: Path, force: bool) -> Path:
+    """Extract an arbitrary zip into extract_dir and return the directory.
+
+    An existing extract_dir is reused as-is unless force is true. Extraction is
+    staged in a hidden sibling directory and renamed into place at the end, so
+    extract_dir is never left half-written.
+
+    Raises:
+        ValueError: if an archive member would resolve outside the staging
+            directory (zip-slip).
+    """
+    if extract_dir.exists() and not force:
+        print(f"Using existing extraction directory: {extract_dir}")
+        return extract_dir
+
+    # Clear any staging directory left behind by an earlier failed run.
+    tmp_dir = extract_dir.with_name(f".{extract_dir.name}.tmp")
+    if tmp_dir.exists():
+        shutil.rmtree(tmp_dir)
+    tmp_dir.mkdir(parents=True)
+
+    root = tmp_dir.resolve()
+    print(f"Extracting {zip_path} to {extract_dir}...")
+    with zipfile.ZipFile(zip_path) as archive:
+        for member in archive.infolist():
+            target = (tmp_dir / member.filename).resolve()
+            # Zip-slip guard: every member must resolve inside the staging root.
+            if root != target and root not in target.parents:
+                raise ValueError(f"Unsafe path in zip archive: {member.filename}")
+            if member.is_dir():
+                target.mkdir(parents=True, exist_ok=True)
+                continue
+            target.parent.mkdir(parents=True, exist_ok=True)
+            with archive.open(member) as source, target.open("wb") as dest:
+                shutil.copyfileobj(source, dest, length=1024 * 1024)
+
+    # Remove the previous extraction only after the new one fully succeeded, so
+    # a missing, corrupt or unsafe archive cannot destroy a good extraction.
+    if extract_dir.exists():
+        shutil.rmtree(extract_dir)
+    tmp_dir.rename(extract_dir)
+    print(f"Extracted archive: {extract_dir}")
+    return extract_dir
+
+
+def _nfi_dataset_path(
+    zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool
+) -> str:
+    """Return the NFI dataset path: a /vsizip/ URI or the first extracted .shp."""
+    if use_vsizip:
+        return f"/vsizip/{zip_path.resolve()}"
+    nfi_dir = _safe_extract_zip_dir(zip_path, extract_dir, force_extract)
+    # When the archive holds several shapefiles the lexicographically first wins.
+    first_shp = min(nfi_dir.rglob("*.shp"), default=None)
+    if first_shp is None:
+        raise FileNotFoundError(f"No .shp found inside {zip_path}")
+    return str(first_shp)
+
+
+def _geometry_column(metadata: dict, column_names: list[str]) -> str:
+    """Choose the geometry column for an Arrow batch.
+
+    Preference order: a non-empty metadata["geometry_name"], then the first of
+    the conventional names present in column_names, then the last column.
+    """
+    declared = metadata.get("geometry_name")
+    if declared:
+        return str(declared)
+    conventional = ("wkb_geometry", "geometry", "geom")
+    hits = (name for name in conventional if name in column_names)
+    return next(hits, column_names[-1])


 def _postcode_points(arcgis_path: Path, max_postcodes: int | None) -> pl.DataFrame:
@ -172,26 +233,20 @@ def _accumulate_tree_metrics(
    dataset_path: str,
    points: pl.DataFrame,
    radius_m: int,
-    tow_types: tuple[str, ...] | None,
    batch_size: int,
    layer_names: tuple[str, ...] | None,
    max_features_per_layer: int | None,
    workers: int,
-) -> pl.DataFrame:
+    canopy_area: np.ndarray,
+    feature_count: np.ndarray,
+    height_weighted_sum: np.ndarray,
+    height_weight: np.ndarray,
+) -> None:
    xy = points.select("x", "y").to_numpy()
    tree = cKDTree(xy)
-    n_points = points.height

-    canopy_area = np.zeros(n_points, dtype=np.float64)
-    feature_count = np.zeros(n_points, dtype=np.uint32)
-    height_weighted_sum = np.zeros(n_points, dtype=np.float64)
-    height_weight = np.zeros(n_points, dtype=np.float64)
-
-    where = _where_for_tow_types(tow_types)
    layers = _layers(dataset_path, layer_names)
    print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
-    if where:
-        print(f"TOW type filter: {where}")

    columns = ["Woodland_Type", "TOW_Area_M", "MEANHT"]
    total_features_seen = 0
@ -206,7 +261,6 @@ def _accumulate_tree_metrics(
            dataset_path,
            layer=layer,
            columns=columns,
-            where=where,
            batch_size=batch_size,
            use_pyarrow=True,
        ) as (_meta, reader):
@ -297,6 +351,132 @@ def _accumulate_tree_metrics(
                        f"{total_features_used:,} features with usable centroids"
                    )

+
+def _postcode_buffers(
+    points: pl.DataFrame, radius_m: int
+) -> tuple[np.ndarray, shapely.STRtree]:
+    """Return one buffer polygon per postcode and a spatial index over them.
+
+    Row i of points maps to polygon i, so indices returned by the tree can be
+    used directly against the per-postcode accumulator arrays. quad_segs=8
+    approximates each circle with 8 segments per quarter turn.
+    """
+    centres = shapely.points(points.select("x", "y").to_numpy())
+    buffers = shapely.buffer(centres, radius_m, quad_segs=8)
+    return buffers, shapely.STRtree(buffers)
+
+
+def _add_nfi_batch(
+    geoms: np.ndarray,
+    category: np.ndarray,
+    circles: np.ndarray,
+    tree: shapely.STRtree,
+    canopy_area: np.ndarray,
+    feature_count: np.ndarray,
+    radius_m: int,
+) -> None:
+    """Add NFI woodland into the shared arrays by true buffer-clipped area.
+
+    Unlike the TOW centroid path, this clips each woodland polygon to each
+    nearby postcode circle and adds only area(polygon ∩ circle); a large parcel
+    therefore cannot saturate a postcode from mere centroid proximity, and a
+    buffer-filling parcel whose centroid is outside the radius is not missed.
+
+    Args:
+        geoms: shapely geometries for one batch; missing and empty entries
+            are skipped.
+        category: per-feature category values aligned with geoms; only rows
+            equal to NFI_WOODLAND_VALUE are kept.
+        circles: postcode buffer polygons, indexed by postcode.
+        tree: STRtree built over circles, in the same order.
+        canopy_area: per-postcode area accumulator, updated in place.
+        feature_count: per-postcode feature counter, updated in place.
+        radius_m: buffer radius. Not used in the body because circles already
+            encode it; kept so the call signature stays stable.
+    """
+    keep = (category == NFI_WOODLAND_VALUE) & ~shapely.is_missing(geoms)
+    geoms = geoms[keep]
+    if geoms.size:
+        geoms = geoms[~shapely.is_empty(geoms)]
+    if geoms.size == 0:
+        return
+
+    # The tree indexes the buffered circles themselves, so a plain "intersects"
+    # query is the exact candidate set. Querying with "dwithin" and
+    # distance=radius_m would measure from the circle edge and pull in parcels
+    # up to roughly 2 * radius_m from the postcode, each costing an
+    # intersection that clips to nothing.
+    nfi_index, postcode_index = tree.query(geoms, predicate="intersects")
+    if nfi_index.size == 0:
+        return
+
+    clipped_area = shapely.area(
+        shapely.intersection(geoms[nfi_index], circles[postcode_index])
+    )
+    # Boundary-only contacts clip to zero area and must not count as features.
+    positive = clipped_area > 0
+    postcode_index = postcode_index[positive]
+    clipped_area = clipped_area[positive]
+
+    # np.add.at accumulates repeated postcode indices; fancy-index += would
+    # apply only one update per index.
+    np.add.at(canopy_area, postcode_index, clipped_area)
+    np.add.at(feature_count, postcode_index, 1)
+
+
+def _accumulate_nfi_metrics(
+    dataset_path: str,
+    circles: np.ndarray,
+    tree: shapely.STRtree,
+    canopy_area: np.ndarray,
+    feature_count: np.ndarray,
+    radius_m: int,
+    batch_size: int,
+    max_nfi_features: int | None,
+) -> None:
+    """Stream every NFI layer in Arrow batches and accumulate clipped woodland.
+
+    Each batch is decoded from WKB and handed to _add_nfi_batch, which updates
+    canopy_area and feature_count in place; nothing is returned.
+
+    Args:
+        dataset_path: path of the NFI dataset to open with pyogrio.
+        circles: postcode buffer polygons, indexed by postcode.
+        tree: STRtree built over circles.
+        canopy_area: per-postcode area accumulator, updated in place.
+        feature_count: per-postcode feature counter, updated in place.
+        radius_m: buffer radius, forwarded to _add_nfi_batch.
+        batch_size: number of rows per Arrow batch.
+        max_nfi_features: cap on rows read across all layers, or None for no
+            cap.
+    """
+    layers = _layers(dataset_path, None)
+    print(f"Processing {len(layers)} NFI layer(s): {', '.join(layers)}")
+
+    # Density only needs the woodland flag + geometry; area is clipped from the
+    # postcode buffer, not read from the file.
+    columns = [NFI_CATEGORY_COL]
+    features_seen = 0
+
+    for layer in layers:
+        with pyogrio.open_arrow(
+            dataset_path,
+            layer=layer,
+            columns=columns,
+            batch_size=batch_size,
+            use_pyarrow=True,
+        ) as (meta, reader):
+            for batch_index, batch in enumerate(reader, start=1):
+                if max_nfi_features is not None:
+                    remaining = max_nfi_features - features_seen
+                    if remaining <= 0:
+                        # The cap spans all layers, so stop outright rather
+                        # than go on to open each remaining layer for nothing.
+                        return
+                    if batch.num_rows > remaining:
+                        batch = batch.slice(0, remaining)
+
+                features_seen += batch.num_rows
+                names = batch.schema.names
+                geometry_column = _geometry_column(meta, names)
+                category = np.asarray(
+                    batch.column(names.index(NFI_CATEGORY_COL)).to_numpy(
+                        zero_copy_only=False
+                    ),
+                    dtype=object,
+                )
+                geometry = np.asarray(
+                    batch.column(names.index(geometry_column)).to_numpy(
+                        zero_copy_only=False
+                    ),
+                    dtype=object,
+                )
+                _add_nfi_batch(
+                    shapely.from_wkb(geometry),
+                    category,
+                    circles,
+                    tree,
+                    canopy_area,
+                    feature_count,
+                    radius_m,
+                )
+                if batch_index == 1 or batch_index % 25 == 0:
+                    print(f"  NFI batch {batch_index:,}: {features_seen:,} rows read")
+
+
+def _finalize_metrics(
+    points: pl.DataFrame,
+    canopy_area: np.ndarray,
+    feature_count: np.ndarray,
+    height_weighted_sum: np.ndarray,
+    height_weight: np.ndarray,
+    radius_m: int,
+) -> pl.DataFrame:
+    n_points = points.height
    density_col, area_col, count_col, height_col = _metric_columns(radius_m)
    buffer_area = math.pi * radius_m * radius_m
    density_pct = np.minimum(canopy_area / buffer_area * 100.0, 100.0)
@ -518,6 +698,18 @@ def main() -> None:
        action="store_true",
        help="Read the geodatabase directly from the zip instead of extracting it",
    )
+    parser.add_argument(
+        "--nfi-zip",
+        type=Path,
+        default=Path("property-data/NFI_WOODLAND_ENGLAND.zip"),
+        help="Optional NFI woodland shapefile zip to union with TOW (skipped if absent)",
+    )
+    parser.add_argument(
+        "--nfi-extract-dir",
+        type=Path,
+        default=Path("property-data/nfi_woodland_england"),
+        help="Directory where the NFI zip is extracted",
+    )
    parser.add_argument(
        "--arcgis",
        type=Path,
@ -554,11 +746,6 @@ def main() -> None:
        default=50,
        help="Radius around each postcode centroid used as the street-scale buffer",
    )
-    parser.add_argument(
-        "--tow-types",
-        default=",".join(DEFAULT_TOW_TYPES),
-        help='Comma-separated Woodland_Type values to include, or "all"',
-    )
    parser.add_argument(
        "--layers",
        default=None,
@ -588,6 +775,12 @@ def main() -> None:
        default=None,
        help="Testing only: process at most N TOW features per layer",
    )
+    parser.add_argument(
+        "--max-nfi-features",
+        type=int,
+        default=None,
+        help="Testing only: process at most N NFI woodland features",
+    )
    args = parser.parse_args()

    if (args.output_streets or args.output_addresses) and args.price_paid is None:
@ -600,18 +793,53 @@ def main() -> None:
        args.tow_zip, args.extract_dir, args.force_extract, args.use_vsizip
    )
    points = _postcode_points(args.arcgis, args.max_postcodes)
-    tow_types = _parse_csv_arg(args.tow_types)
    layer_names = _parse_csv_arg(args.layers)

-    postcode_metrics = _accumulate_tree_metrics(
+    n_points = points.height
+    canopy_area = np.zeros(n_points, dtype=np.float64)
+    feature_count = np.zeros(n_points, dtype=np.uint32)
+    height_weighted_sum = np.zeros(n_points, dtype=np.float64)
+    height_weight = np.zeros(n_points, dtype=np.float64)
+
+    _accumulate_tree_metrics(
        dataset_path=dataset_path,
        points=points,
        radius_m=args.radius_m,
-        tow_types=tow_types,
        batch_size=args.batch_size,
        layer_names=layer_names,
        max_features_per_layer=args.max_features_per_layer,
        workers=args.workers,
+        canopy_area=canopy_area,
+        feature_count=feature_count,
+        height_weighted_sum=height_weighted_sum,
+        height_weight=height_weight,
+    )
+
+    if args.nfi_zip is not None and args.nfi_zip.exists():
+        nfi_path = _nfi_dataset_path(
+            args.nfi_zip, args.nfi_extract_dir, args.force_extract, args.use_vsizip
+        )
+        circles, nfi_tree = _postcode_buffers(points, args.radius_m)
+        _accumulate_nfi_metrics(
+            dataset_path=nfi_path,
+            circles=circles,
+            tree=nfi_tree,
+            canopy_area=canopy_area,
+            feature_count=feature_count,
+            radius_m=args.radius_m,
+            batch_size=args.batch_size,
+            max_nfi_features=args.max_nfi_features,
+        )
+    elif args.nfi_zip is not None:
+        print(f"NFI zip not found, skipping woodland union: {args.nfi_zip}")
+
+    postcode_metrics = _finalize_metrics(
+        points,
+        canopy_area,
+        feature_count,
+        height_weighted_sum,
+        height_weight,
+        args.radius_m,
    )
    postcode_metrics = _with_postcode_density_percentiles(
        postcode_metrics, args.radius_m