perfect-postcode/pipeline/transform/tree_density.py

"""Derive postcode-scale tree density metrics from Forest Research TOW + NFI data.

The Forest Research Trees Outside Woodland release is an Esri File Geodatabase
inside property-data/FR_TOW_V1_ALL.zip. This transformer computes a compact
postcode-level metric from the tree polygons so the dashboard can answer "how
green is this postcode?" without loading the full geodatabase at runtime.

Every postcode centroid is expanded into a radius-r buffer ("extended area").
Both TOW tree crowns and National Forest Inventory (NFI) woodland parcels are
accumulated by *true buffer-clipped intersection area*: only the part of each
polygon that falls inside a postcode's buffer is counted, never the area that
spills outside it. A crown straddling the buffer edge therefore contributes only
its inside portion, and a parcel reaching into the buffer from outside is still
counted -- no polygon can saturate a postcode from mere proximity.

TOW only covers trees *outside* woodland, so the NFI woodland layer is the
geometric complement of TOW and is optionally unioned in. The two products are
*assumed disjoint*: clipped TOW crown area and clipped NFI woodland area are
summed into the same per-postcode accumulator, so any spatial overlap between a
TOW crown and an NFI parcel (boundary slop where "groups of trees" meet
"woodland") would be double-counted. The final density is capped at 100% and
_finalize_metrics logs how many postcodes exceed 100% raw coverage, which is a
direct symptom of such overlap (or of overlapping crowns within one buffer); if
that count is material the products are not disjoint and the NFI clip should be
taken against the complement of TOW.
"""

from __future__ import annotations

import argparse
import math
import shutil
import zipfile
from pathlib import Path

import numpy as np
import polars as pl
import pyogrio
import shapely


TOW_GDB_NAME = "FR_TOW_V1_ALL.gdb"
POSTCODE_DENSITY_COL = "Tree canopy density within {radius}m (%)"
POSTCODE_DENSITY_PERCENTILE_COL = "Tree canopy density percentile within {radius}m"
POSTCODE_AREA_COL = "Tree canopy area within {radius}m (sqm)"
POSTCODE_HEIGHT_COL = "Mean TOW height within {radius}m (m)"

# National Forest Inventory (NFI) woodland — the geometric complement of TOW.
# NFI ships as a zipped shapefile of woodland parcels (>=0.5 ha) in EPSG:27700.
# Field names are from the NFI Woodland England 2022 release; re-check on bumps.
NFI_CATEGORY_COL = "CATEGORY"
NFI_WOODLAND_VALUE = "Woodland"
NFI_TYPE_COL = "IFT_IOA"
NFI_AREA_HA_COL = "Area_ha"


def _safe_extract_zip(zip_path: Path, extract_dir: Path, force: bool) -> Path:
    """Extract the TOW zip and return the extracted .gdb path."""
    gdb_path = extract_dir / TOW_GDB_NAME
    if gdb_path.exists() and not force:
        print(f"Using existing extracted geodatabase: {gdb_path}")
        return gdb_path

    if force and extract_dir.exists():
        shutil.rmtree(extract_dir)
    elif extract_dir.exists():
        print(f"Removing incomplete extraction directory: {extract_dir}")
        shutil.rmtree(extract_dir)

    tmp_dir = extract_dir.with_name(f".{extract_dir.name}.tmp")
    if tmp_dir.exists():
        shutil.rmtree(tmp_dir)
    tmp_dir.mkdir(parents=True)

    root = tmp_dir.resolve()
    print(f"Extracting {zip_path} to {extract_dir}...")
    with zipfile.ZipFile(zip_path) as archive:
        for member in archive.infolist():
            target = (tmp_dir / member.filename).resolve()
            if root != target and root not in target.parents:
                raise ValueError(f"Unsafe path in zip archive: {member.filename}")

            if member.is_dir():
                target.mkdir(parents=True, exist_ok=True)
                continue

            target.parent.mkdir(parents=True, exist_ok=True)
            with archive.open(member) as source, target.open("wb") as dest:
                shutil.copyfileobj(source, dest, length=1024 * 1024)

    if not (tmp_dir / TOW_GDB_NAME).exists():
        raise FileNotFoundError(f"{TOW_GDB_NAME} was not found inside {zip_path}")

    tmp_dir.rename(extract_dir)
    print(f"Extracted geodatabase: {gdb_path}")
    return gdb_path


def _tow_dataset_path(
    zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool
) -> str:
    if use_vsizip:
        return f"/vsizip/{zip_path.resolve()}/{TOW_GDB_NAME}"
    return str(_safe_extract_zip(zip_path, extract_dir, force_extract))


def _safe_extract_zip_dir(zip_path: Path, extract_dir: Path, force: bool) -> Path:
    """Extract an arbitrary zip into extract_dir and return the directory."""
    if extract_dir.exists() and not force:
        print(f"Using existing extraction directory: {extract_dir}")
        return extract_dir
    if extract_dir.exists():
        shutil.rmtree(extract_dir)

    tmp_dir = extract_dir.with_name(f".{extract_dir.name}.tmp")
    if tmp_dir.exists():
        shutil.rmtree(tmp_dir)
    tmp_dir.mkdir(parents=True)

    root = tmp_dir.resolve()
    print(f"Extracting {zip_path} to {extract_dir}...")
    with zipfile.ZipFile(zip_path) as archive:
        for member in archive.infolist():
            target = (tmp_dir / member.filename).resolve()
            if root != target and root not in target.parents:
                raise ValueError(f"Unsafe path in zip archive: {member.filename}")
            if member.is_dir():
                target.mkdir(parents=True, exist_ok=True)
                continue
            target.parent.mkdir(parents=True, exist_ok=True)
            with archive.open(member) as source, target.open("wb") as dest:
                shutil.copyfileobj(source, dest, length=1024 * 1024)

    tmp_dir.rename(extract_dir)
    print(f"Extracted archive: {extract_dir}")
    return extract_dir


def _nfi_dataset_path(
    zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool
) -> str:
    """Resolve the NFI woodland shapefile path, extracting the zip if needed.

    Raises if the archive contains zero or more than one shapefile rather than
    silently picking one, so an ambiguous NFI release fails loudly instead of
    accumulating canopy from the wrong layer.
    """
    if use_vsizip:
        return f"/vsizip/{zip_path.resolve()}"
    extracted = _safe_extract_zip_dir(zip_path, extract_dir, force_extract)
    shapefiles = sorted(extracted.rglob("*.shp"))
    if not shapefiles:
        raise FileNotFoundError(f"No .shp found inside {zip_path}")
    if len(shapefiles) > 1:
        names = ", ".join(path.name for path in shapefiles)
        raise ValueError(
            f"Expected exactly one shapefile inside {zip_path}, found {len(shapefiles)} "
            f"({names}); cannot unambiguously pick the NFI woodland layer"
        )
    return str(shapefiles[0])


def _geometry_column(metadata: dict, column_names: list[str]) -> str:
    """Resolve the geometry column name from pyogrio Arrow metadata."""
    geometry_name = metadata.get("geometry_name")
    if geometry_name:
        return str(geometry_name)
    for name in ("wkb_geometry", "geometry", "geom", "SHAPE"):
        if name in column_names:
            return name
    return column_names[-1]


def _postcode_points(arcgis_path: Path, max_postcodes: int | None) -> pl.DataFrame:
    points = (
        pl.scan_parquet(arcgis_path)
        .filter(pl.col("ctry25cd") == "E92000001")
        .filter(pl.col("doterm").is_null())
        .select(
            pl.col("pcds").alias("postcode"),
            pl.col("east1m").cast(pl.Float64).alias("x"),
            pl.col("north1m").cast(pl.Float64).alias("y"),
        )
        .drop_nulls(["postcode", "x", "y"])
        .unique("postcode")
        .sort("postcode")
    )
    if max_postcodes is not None:
        points = points.head(max_postcodes)
    df = points.collect()
    print(f"Loaded {df.height:,} active English postcode points")
    return df


def _layers(dataset_path: str, selected_layers: tuple[str, ...] | None) -> list[str]:
    available = [layer for layer, _geometry_type in pyogrio.list_layers(dataset_path)]
    if selected_layers is None:
        return available

    missing = sorted(set(selected_layers) - set(available))
    if missing:
        raise ValueError(f"Unknown TOW layer(s): {', '.join(missing)}")
    return [layer for layer in available if layer in selected_layers]


def _metric_columns(radius_m: int) -> tuple[str, str, str]:
    return (
        POSTCODE_DENSITY_COL.format(radius=radius_m),
        POSTCODE_AREA_COL.format(radius=radius_m),
        POSTCODE_HEIGHT_COL.format(radius=radius_m),
    )


def _postcode_density_percentile_col(radius_m: int) -> str:
    return POSTCODE_DENSITY_PERCENTILE_COL.format(radius=radius_m)


def _coverage_percentile_expr(column: str, alias: str) -> pl.Expr:
    """Rank tree coverage on a 0-100 England-wide percentile scale.

    A single tie-consistent average-rank formula is used for every value so the
    scale is internally consistent end to end: tied values share their mean rank,
    so the lowest coverage maps toward 0 and the highest toward 100 only when they
    are not themselves tied. An all-equal (or single-value) column has no spread
    and maps to the neutral midpoint (50).
    """
    value = pl.col(column).fill_nan(None)
    non_null_count = value.count()
    rank = value.rank("average")
    return (
        pl.when(value.is_null())
        .then(None)
        .when(non_null_count > 1)
        .then(((rank - 1) / (non_null_count - 1) * 100).round(1))
        .otherwise(50.0)
        .cast(pl.Float32)
        .alias(alias)
    )


def _with_postcode_density_percentiles(
    postcode_metrics: pl.DataFrame, radius_m: int
) -> pl.DataFrame:
    density_col, _area_col, _height_col = _metric_columns(radius_m)
    return postcode_metrics.with_columns(
        _coverage_percentile_expr(
            density_col,
            _postcode_density_percentile_col(radius_m),
        )
    )


def _postcode_buffers(
    points: pl.DataFrame, radius_m: int
) -> tuple[np.ndarray, shapely.STRtree]:
    """Build a radius-r circle for every postcode plus an STRtree over them.

    Circle index == postcode index, so an STRtree match resolves directly to the
    postcode accumulator slot.
    """
    xy = points.select("x", "y").to_numpy()
    circles = shapely.buffer(shapely.points(xy), radius_m, quad_segs=8)
    return circles, shapely.STRtree(circles)


# 0.1 mm in the BNG working CRS (EPSG:27700) — far below survey resolution; the
# same grid the postcode_boundaries overlay uses.
_OVERLAY_GRID_M = 1e-4


def _robust_intersection_area(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Vectorized ``area(a[i] ∩ b[i])`` that survives GEOS robustness failures.

    External Forest Research TOW/NFI polygons are occasionally invalid
    (self-intersections), and a single bad polygon makes the batched
    ``shapely.intersection`` raise ``TopologyException: side location conflict``,
    aborting the whole run. The fast path is the raw batched overlay — unchanged,
    full-speed, when the data is clean — and only a failure triggers repair.

    The repair deliberately uses a *plain* overlay rather than the fixed-precision
    (``grid_size``) one: ``make_valid`` can emit a mixed-dimension
    ``GeometryCollection`` (a polygon plus a dangling line), which OverlayNG
    rejects with ``Overlay input is mixed-dimension`` — whereas a plain overlay
    accepts it, and its non-polygonal debris has zero area and is dropped by the
    ``clipped_area > 0`` filter downstream anyway. A final pointwise coordinate
    snap (which never raises) collapses the near-coincident edges behind any
    residual full-precision robustness failure.
    """
    try:
        return shapely.area(shapely.intersection(a, b))
    except shapely.errors.GEOSException:
        pass
    a = shapely.make_valid(a)
    b = shapely.make_valid(b)
    try:
        return shapely.area(shapely.intersection(a, b))
    except shapely.errors.GEOSException:
        a = shapely.make_valid(shapely.set_precision(a, _OVERLAY_GRID_M, mode="pointwise"))
        b = shapely.make_valid(shapely.set_precision(b, _OVERLAY_GRID_M, mode="pointwise"))
        return shapely.area(shapely.intersection(a, b))


def _accumulate_clipped_area(
    geoms: np.ndarray,
    circles: np.ndarray,
    tree: shapely.STRtree,
    canopy_area: np.ndarray,
    height: np.ndarray | None = None,
    height_weighted_sum: np.ndarray | None = None,
    height_weight: np.ndarray | None = None,
) -> None:
    """Add each polygon's in-buffer overlap area to every postcode it intersects.

    Only area(polygon ∩ circle) is accumulated -- never the area of the polygon
    that falls outside the postcode's extended buffer -- so a crown straddling
    the buffer edge contributes only its inside portion and a large parcel cannot
    saturate a postcode from mere proximity. When ``height`` is supplied the mean
    feature height is accumulated weighted by that same clipped overlap area.
    """
    keep = ~shapely.is_missing(geoms) & ~shapely.is_empty(geoms)
    geoms = geoms[keep]
    if height is not None:
        height = height[keep]
    if geoms.size == 0:
        return

    # query(predicate="intersects") over the circle STRtree returns exactly the
    # (polygon, circle) pairs whose clipped overlap can be positive -- i.e. the
    # polygon overlaps that postcode's radius-r buffer.
    geom_index, postcode_index = tree.query(geoms, predicate="intersects")
    if geom_index.size == 0:
        return

    clipped_area = _robust_intersection_area(
        geoms[geom_index], circles[postcode_index]
    )
    positive = clipped_area > 0
    geom_index = geom_index[positive]
    postcode_index = postcode_index[positive]
    clipped_area = clipped_area[positive]

    np.add.at(canopy_area, postcode_index, clipped_area)

    if height is not None:
        feature_height = height[geom_index]
        finite = np.isfinite(feature_height)
        if finite.any():
            np.add.at(
                height_weighted_sum,
                postcode_index[finite],
                feature_height[finite] * clipped_area[finite],
            )
            np.add.at(height_weight, postcode_index[finite], clipped_area[finite])


def _accumulate_tow_metrics(
    dataset_path: str,
    circles: np.ndarray,
    tree: shapely.STRtree,
    canopy_area: np.ndarray,
    height_weighted_sum: np.ndarray,
    height_weight: np.ndarray,
    batch_size: int,
    layer_names: tuple[str, ...] | None,
    max_features_per_layer: int | None,
) -> None:
    layers = _layers(dataset_path, layer_names)
    print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")

    columns = ["MEANHT"]
    total_features_seen = 0

    for layer in layers:
        info = pyogrio.read_info(dataset_path, layer=layer)
        print(f"\nLayer {layer}: {info.get('features', 0):,} features")
        layer_features_seen = 0

        with pyogrio.open_arrow(
            dataset_path,
            layer=layer,
            columns=columns,
            batch_size=batch_size,
            use_pyarrow=True,
        ) as (meta, reader):
            for batch_index, batch in enumerate(reader, start=1):
                if max_features_per_layer is not None:
                    remaining = max_features_per_layer - layer_features_seen
                    if remaining <= 0:
                        break
                    if batch.num_rows > remaining:
                        batch = batch.slice(0, remaining)

                layer_features_seen += batch.num_rows
                total_features_seen += batch.num_rows
                names = batch.schema.names
                geometry_column = _geometry_column(meta, names)
                height = np.asarray(
                    batch.column(names.index("MEANHT")).to_numpy(zero_copy_only=False),
                    dtype=np.float64,
                )
                geometry = np.asarray(
                    batch.column(names.index(geometry_column)).to_numpy(
                        zero_copy_only=False
                    ),
                    dtype=object,
                )
                _accumulate_clipped_area(
                    shapely.from_wkb(geometry),
                    circles,
                    tree,
                    canopy_area,
                    height=height,
                    height_weighted_sum=height_weighted_sum,
                    height_weight=height_weight,
                )

                if batch_index == 1 or batch_index % 25 == 0:
                    print(f"  batch {batch_index:,}: {total_features_seen:,} rows read")


def _accumulate_nfi_metrics(
    dataset_path: str,
    circles: np.ndarray,
    tree: shapely.STRtree,
    canopy_area: np.ndarray,
    batch_size: int,
    max_nfi_features: int | None,
) -> None:
    layers = _layers(dataset_path, None)
    print(f"Processing {len(layers)} NFI layer(s): {', '.join(layers)}")

    # Density only needs the woodland flag + geometry; area is clipped from the
    # postcode buffer, not read from the file.
    columns = [NFI_CATEGORY_COL]
    features_seen = 0

    for layer in layers:
        with pyogrio.open_arrow(
            dataset_path,
            layer=layer,
            columns=columns,
            batch_size=batch_size,
            use_pyarrow=True,
        ) as (meta, reader):
            for batch_index, batch in enumerate(reader, start=1):
                if max_nfi_features is not None:
                    remaining = max_nfi_features - features_seen
                    if remaining <= 0:
                        break
                    if batch.num_rows > remaining:
                        batch = batch.slice(0, remaining)

                features_seen += batch.num_rows
                names = batch.schema.names
                geometry_column = _geometry_column(meta, names)
                category = np.asarray(
                    batch.column(names.index(NFI_CATEGORY_COL)).to_numpy(
                        zero_copy_only=False
                    ),
                    dtype=object,
                )
                geometry = np.asarray(
                    batch.column(names.index(geometry_column)).to_numpy(
                        zero_copy_only=False
                    ),
                    dtype=object,
                )
                geoms = shapely.from_wkb(geometry)
                _accumulate_clipped_area(
                    geoms[category == NFI_WOODLAND_VALUE],
                    circles,
                    tree,
                    canopy_area,
                )
                if batch_index == 1 or batch_index % 25 == 0:
                    print(f"  NFI batch {batch_index:,}: {features_seen:,} rows read")


def _finalize_metrics(
    points: pl.DataFrame,
    canopy_area: np.ndarray,
    height_weighted_sum: np.ndarray,
    height_weight: np.ndarray,
    radius_m: int,
) -> pl.DataFrame:
    n_points = points.height
    density_col, area_col, height_col = _metric_columns(radius_m)
    buffer_area = math.pi * radius_m * radius_m
    raw_density = canopy_area / buffer_area * 100.0
    density_pct = np.minimum(raw_density, 100.0)

    # Symptom of the assumed-disjoint TOW/NFI union being violated (or of
    # overlapping crowns inside one buffer): clipped areas alone cannot exceed the
    # buffer unless polygons overlap. Surface it rather than hide it behind the cap.
    over_count = int(np.count_nonzero(raw_density > 100.0))
    if over_count:
        print(
            f"  note: {over_count:,} postcode(s) exceeded 100% raw canopy and were "
            "capped — indicates overlapping TOW/NFI canopy within the buffer"
        )

    mean_height = np.divide(
        height_weighted_sum,
        height_weight,
        out=np.full(n_points, np.nan, dtype=np.float64),
        where=height_weight > 0,
    )

    return pl.DataFrame(
        {
            "postcode": points["postcode"],
            area_col: canopy_area.round(1).astype(np.float32),
            density_col: density_pct.round(1).astype(np.float32),
            height_col: np.round(mean_height, 1).astype(np.float32),
        }
    ).with_columns(
        pl.col(height_col).fill_nan(None),
    )


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Build postcode-level tree-density metrics from FR_TOW_V1_ALL.zip"
    )
    parser.add_argument(
        "--tow-zip",
        type=Path,
        default=Path("property-data/FR_TOW_V1_ALL.zip"),
        help="Forest Research TOW zip containing FR_TOW_V1_ALL.gdb",
    )
    parser.add_argument(
        "--extract-dir",
        type=Path,
        default=Path("property-data/fr_tow_v1_all"),
        help="Directory where the zip is extracted",
    )
    parser.add_argument(
        "--force-extract",
        action="store_true",
        help="Re-extract the TOW zip even if the geodatabase already exists",
    )
    parser.add_argument(
        "--use-vsizip",
        action="store_true",
        help="Read the geodatabase directly from the zip instead of extracting it",
    )
    parser.add_argument(
        "--nfi-zip",
        type=Path,
        default=Path("property-data/NFI_WOODLAND_ENGLAND.zip"),
        help="Optional NFI woodland shapefile zip to union with TOW (skipped if absent)",
    )
    parser.add_argument(
        "--nfi-extract-dir",
        type=Path,
        default=Path("property-data/nfi_woodland_england"),
        help="Directory where the NFI zip is extracted",
    )
    parser.add_argument(
        "--arcgis",
        type=Path,
        default=Path("property-data/arcgis_data.parquet"),
        help="Postcode centroid parquet with east1m/north1m columns",
    )
    parser.add_argument(
        "--output-postcodes",
        type=Path,
        required=True,
        help="Output postcode-level tree-density parquet",
    )
    parser.add_argument(
        "--radius-m",
        type=int,
        default=50,
        help="Radius around each postcode centroid used as the extended buffer",
    )
    parser.add_argument(
        "--layers",
        default=None,
        help="Optional comma-separated subset of TOW layers for testing",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=65_536,
        help="Arrow batch size for reading TOW features",
    )
    parser.add_argument(
        "--max-postcodes",
        type=int,
        default=None,
        help="Testing only: process the first N postcode points",
    )
    parser.add_argument(
        "--max-features-per-layer",
        type=int,
        default=None,
        help="Testing only: process at most N TOW features per layer",
    )
    parser.add_argument(
        "--max-nfi-features",
        type=int,
        default=None,
        help="Testing only: process at most N NFI woodland features",
    )
    args = parser.parse_args()

    if args.radius_m <= 0:
        raise SystemExit("--radius-m must be greater than zero")

    dataset_path = _tow_dataset_path(
        args.tow_zip, args.extract_dir, args.force_extract, args.use_vsizip
    )
    points = _postcode_points(args.arcgis, args.max_postcodes)
    layer_names = _parse_csv_arg(args.layers)

    n_points = points.height
    canopy_area = np.zeros(n_points, dtype=np.float64)
    height_weighted_sum = np.zeros(n_points, dtype=np.float64)
    height_weight = np.zeros(n_points, dtype=np.float64)

    circles, tree = _postcode_buffers(points, args.radius_m)

    _accumulate_tow_metrics(
        dataset_path=dataset_path,
        circles=circles,
        tree=tree,
        canopy_area=canopy_area,
        height_weighted_sum=height_weighted_sum,
        height_weight=height_weight,
        batch_size=args.batch_size,
        layer_names=layer_names,
        max_features_per_layer=args.max_features_per_layer,
    )

    if args.nfi_zip is not None and args.nfi_zip.exists():
        nfi_path = _nfi_dataset_path(
            args.nfi_zip, args.nfi_extract_dir, args.force_extract, args.use_vsizip
        )
        _accumulate_nfi_metrics(
            dataset_path=nfi_path,
            circles=circles,
            tree=tree,
            canopy_area=canopy_area,
            batch_size=args.batch_size,
            max_nfi_features=args.max_nfi_features,
        )
    elif args.nfi_zip is not None:
        print(f"NFI zip not found, skipping woodland union: {args.nfi_zip}")

    postcode_metrics = _finalize_metrics(
        points,
        canopy_area,
        height_weighted_sum,
        height_weight,
        args.radius_m,
    )
    postcode_metrics = _with_postcode_density_percentiles(
        postcode_metrics, args.radius_m
    )

    args.output_postcodes.parent.mkdir(parents=True, exist_ok=True)
    postcode_metrics.write_parquet(args.output_postcodes, compression="zstd")
    print(f"\nWrote postcode tree-density metrics: {args.output_postcodes}")


def _parse_csv_arg(value: str | None) -> tuple[str, ...] | None:
    if value is None:
        return None
    if value.lower() == "all":
        return None
    parts = tuple(part.strip() for part in value.split(",") if part.strip())
    return parts or None


if __name__ == "__main__":
    main()