677 lines
25 KiB
Python
677 lines
25 KiB
Python
"""Derive postcode-scale tree density metrics from Forest Research TOW + NFI data.
|
|
|
|
The Forest Research Trees Outside Woodland release is an Esri File Geodatabase
|
|
inside property-data/FR_TOW_V1_ALL.zip. This transformer computes a compact
|
|
postcode-level metric from the tree polygons so the dashboard can answer "how
|
|
green is this postcode?" without loading the full geodatabase at runtime.
|
|
|
|
Every postcode centroid is expanded into a radius-r buffer ("extended area").
|
|
Both TOW tree crowns and National Forest Inventory (NFI) woodland parcels are
|
|
accumulated by *true buffer-clipped intersection area*: only the part of each
|
|
polygon that falls inside a postcode's buffer is counted, never the area that
|
|
spills outside it. A crown straddling the buffer edge therefore contributes only
|
|
its inside portion, and a parcel reaching into the buffer from outside is still
|
|
counted -- no polygon can saturate a postcode from mere proximity.
|
|
|
|
TOW only covers trees *outside* woodland, so the NFI woodland layer is the
|
|
geometric complement of TOW and is optionally unioned in. The two products are
|
|
*assumed disjoint*: clipped TOW crown area and clipped NFI woodland area are
|
|
summed into the same per-postcode accumulator, so any spatial overlap between a
|
|
TOW crown and an NFI parcel (boundary slop where "groups of trees" meet
|
|
"woodland") would be double-counted. The final density is capped at 100% and
|
|
_finalize_metrics logs how many postcodes exceed 100% raw coverage, which is a
|
|
direct symptom of such overlap (or of overlapping crowns within one buffer); if
|
|
that count is material the products are not disjoint and the NFI clip should be
|
|
taken against the complement of TOW.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import math
|
|
import shutil
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import polars as pl
|
|
import pyogrio
|
|
import shapely
|
|
|
|
|
|
TOW_GDB_NAME = "FR_TOW_V1_ALL.gdb"
|
|
POSTCODE_DENSITY_COL = "Tree canopy density within {radius}m (%)"
|
|
POSTCODE_DENSITY_PERCENTILE_COL = "Tree canopy density percentile within {radius}m"
|
|
POSTCODE_AREA_COL = "Tree canopy area within {radius}m (sqm)"
|
|
POSTCODE_HEIGHT_COL = "Mean TOW height within {radius}m (m)"
|
|
|
|
# National Forest Inventory (NFI) woodland — the geometric complement of TOW.
|
|
# NFI ships as a zipped shapefile of woodland parcels (>=0.5 ha) in EPSG:27700.
|
|
# Field names are from the NFI Woodland England 2022 release; re-check on bumps.
|
|
NFI_CATEGORY_COL = "CATEGORY"
|
|
NFI_WOODLAND_VALUE = "Woodland"
|
|
NFI_TYPE_COL = "IFT_IOA"
|
|
NFI_AREA_HA_COL = "Area_ha"
|
|
|
|
|
|
def _safe_extract_zip(zip_path: Path, extract_dir: Path, force: bool) -> Path:
|
|
"""Extract the TOW zip and return the extracted .gdb path."""
|
|
gdb_path = extract_dir / TOW_GDB_NAME
|
|
if gdb_path.exists() and not force:
|
|
print(f"Using existing extracted geodatabase: {gdb_path}")
|
|
return gdb_path
|
|
|
|
if force and extract_dir.exists():
|
|
shutil.rmtree(extract_dir)
|
|
elif extract_dir.exists():
|
|
print(f"Removing incomplete extraction directory: {extract_dir}")
|
|
shutil.rmtree(extract_dir)
|
|
|
|
tmp_dir = extract_dir.with_name(f".{extract_dir.name}.tmp")
|
|
if tmp_dir.exists():
|
|
shutil.rmtree(tmp_dir)
|
|
tmp_dir.mkdir(parents=True)
|
|
|
|
root = tmp_dir.resolve()
|
|
print(f"Extracting {zip_path} to {extract_dir}...")
|
|
with zipfile.ZipFile(zip_path) as archive:
|
|
for member in archive.infolist():
|
|
target = (tmp_dir / member.filename).resolve()
|
|
if root != target and root not in target.parents:
|
|
raise ValueError(f"Unsafe path in zip archive: {member.filename}")
|
|
|
|
if member.is_dir():
|
|
target.mkdir(parents=True, exist_ok=True)
|
|
continue
|
|
|
|
target.parent.mkdir(parents=True, exist_ok=True)
|
|
with archive.open(member) as source, target.open("wb") as dest:
|
|
shutil.copyfileobj(source, dest, length=1024 * 1024)
|
|
|
|
if not (tmp_dir / TOW_GDB_NAME).exists():
|
|
raise FileNotFoundError(f"{TOW_GDB_NAME} was not found inside {zip_path}")
|
|
|
|
tmp_dir.rename(extract_dir)
|
|
print(f"Extracted geodatabase: {gdb_path}")
|
|
return gdb_path
|
|
|
|
|
|
def _tow_dataset_path(
|
|
zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool
|
|
) -> str:
|
|
if use_vsizip:
|
|
return f"/vsizip/{zip_path.resolve()}/{TOW_GDB_NAME}"
|
|
return str(_safe_extract_zip(zip_path, extract_dir, force_extract))
|
|
|
|
|
|
def _safe_extract_zip_dir(zip_path: Path, extract_dir: Path, force: bool) -> Path:
|
|
"""Extract an arbitrary zip into extract_dir and return the directory."""
|
|
if extract_dir.exists() and not force:
|
|
print(f"Using existing extraction directory: {extract_dir}")
|
|
return extract_dir
|
|
if extract_dir.exists():
|
|
shutil.rmtree(extract_dir)
|
|
|
|
tmp_dir = extract_dir.with_name(f".{extract_dir.name}.tmp")
|
|
if tmp_dir.exists():
|
|
shutil.rmtree(tmp_dir)
|
|
tmp_dir.mkdir(parents=True)
|
|
|
|
root = tmp_dir.resolve()
|
|
print(f"Extracting {zip_path} to {extract_dir}...")
|
|
with zipfile.ZipFile(zip_path) as archive:
|
|
for member in archive.infolist():
|
|
target = (tmp_dir / member.filename).resolve()
|
|
if root != target and root not in target.parents:
|
|
raise ValueError(f"Unsafe path in zip archive: {member.filename}")
|
|
if member.is_dir():
|
|
target.mkdir(parents=True, exist_ok=True)
|
|
continue
|
|
target.parent.mkdir(parents=True, exist_ok=True)
|
|
with archive.open(member) as source, target.open("wb") as dest:
|
|
shutil.copyfileobj(source, dest, length=1024 * 1024)
|
|
|
|
tmp_dir.rename(extract_dir)
|
|
print(f"Extracted archive: {extract_dir}")
|
|
return extract_dir
|
|
|
|
|
|
def _nfi_dataset_path(
|
|
zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool
|
|
) -> str:
|
|
"""Resolve the NFI woodland shapefile path, extracting the zip if needed.
|
|
|
|
Raises if the archive contains zero or more than one shapefile rather than
|
|
silently picking one, so an ambiguous NFI release fails loudly instead of
|
|
accumulating canopy from the wrong layer.
|
|
"""
|
|
if use_vsizip:
|
|
return f"/vsizip/{zip_path.resolve()}"
|
|
extracted = _safe_extract_zip_dir(zip_path, extract_dir, force_extract)
|
|
shapefiles = sorted(extracted.rglob("*.shp"))
|
|
if not shapefiles:
|
|
raise FileNotFoundError(f"No .shp found inside {zip_path}")
|
|
if len(shapefiles) > 1:
|
|
names = ", ".join(path.name for path in shapefiles)
|
|
raise ValueError(
|
|
f"Expected exactly one shapefile inside {zip_path}, found {len(shapefiles)} "
|
|
f"({names}); cannot unambiguously pick the NFI woodland layer"
|
|
)
|
|
return str(shapefiles[0])
|
|
|
|
|
|
def _geometry_column(metadata: dict, column_names: list[str]) -> str:
|
|
"""Resolve the geometry column name from pyogrio Arrow metadata."""
|
|
geometry_name = metadata.get("geometry_name")
|
|
if geometry_name:
|
|
return str(geometry_name)
|
|
for name in ("wkb_geometry", "geometry", "geom", "SHAPE"):
|
|
if name in column_names:
|
|
return name
|
|
return column_names[-1]
|
|
|
|
|
|
def _postcode_points(arcgis_path: Path, max_postcodes: int | None) -> pl.DataFrame:
|
|
points = (
|
|
pl.scan_parquet(arcgis_path)
|
|
.filter(pl.col("ctry25cd") == "E92000001")
|
|
.filter(pl.col("doterm").is_null())
|
|
.select(
|
|
pl.col("pcds").alias("postcode"),
|
|
pl.col("east1m").cast(pl.Float64).alias("x"),
|
|
pl.col("north1m").cast(pl.Float64).alias("y"),
|
|
)
|
|
.drop_nulls(["postcode", "x", "y"])
|
|
.unique("postcode")
|
|
.sort("postcode")
|
|
)
|
|
if max_postcodes is not None:
|
|
points = points.head(max_postcodes)
|
|
df = points.collect()
|
|
print(f"Loaded {df.height:,} active English postcode points")
|
|
return df
|
|
|
|
|
|
def _layers(dataset_path: str, selected_layers: tuple[str, ...] | None) -> list[str]:
|
|
available = [layer for layer, _geometry_type in pyogrio.list_layers(dataset_path)]
|
|
if selected_layers is None:
|
|
return available
|
|
|
|
missing = sorted(set(selected_layers) - set(available))
|
|
if missing:
|
|
raise ValueError(f"Unknown TOW layer(s): {', '.join(missing)}")
|
|
return [layer for layer in available if layer in selected_layers]
|
|
|
|
|
|
def _metric_columns(radius_m: int) -> tuple[str, str, str]:
|
|
return (
|
|
POSTCODE_DENSITY_COL.format(radius=radius_m),
|
|
POSTCODE_AREA_COL.format(radius=radius_m),
|
|
POSTCODE_HEIGHT_COL.format(radius=radius_m),
|
|
)
|
|
|
|
|
|
def _postcode_density_percentile_col(radius_m: int) -> str:
|
|
return POSTCODE_DENSITY_PERCENTILE_COL.format(radius=radius_m)
|
|
|
|
|
|
def _coverage_percentile_expr(column: str, alias: str) -> pl.Expr:
|
|
"""Rank tree coverage on a 0-100 England-wide percentile scale.
|
|
|
|
A single tie-consistent average-rank formula is used for every value so the
|
|
scale is internally consistent end to end: tied values share their mean rank,
|
|
so the lowest coverage maps toward 0 and the highest toward 100 only when they
|
|
are not themselves tied. An all-equal (or single-value) column has no spread
|
|
and maps to the neutral midpoint (50).
|
|
"""
|
|
value = pl.col(column).fill_nan(None)
|
|
non_null_count = value.count()
|
|
rank = value.rank("average")
|
|
return (
|
|
pl.when(value.is_null())
|
|
.then(None)
|
|
.when(non_null_count > 1)
|
|
.then(((rank - 1) / (non_null_count - 1) * 100).round(1))
|
|
.otherwise(50.0)
|
|
.cast(pl.Float32)
|
|
.alias(alias)
|
|
)
|
|
|
|
|
|
def _with_postcode_density_percentiles(
|
|
postcode_metrics: pl.DataFrame, radius_m: int
|
|
) -> pl.DataFrame:
|
|
density_col, _area_col, _height_col = _metric_columns(radius_m)
|
|
return postcode_metrics.with_columns(
|
|
_coverage_percentile_expr(
|
|
density_col,
|
|
_postcode_density_percentile_col(radius_m),
|
|
)
|
|
)
|
|
|
|
|
|
def _postcode_buffers(
|
|
points: pl.DataFrame, radius_m: int
|
|
) -> tuple[np.ndarray, shapely.STRtree]:
|
|
"""Build a radius-r circle for every postcode plus an STRtree over them.
|
|
|
|
Circle index == postcode index, so an STRtree match resolves directly to the
|
|
postcode accumulator slot.
|
|
"""
|
|
xy = points.select("x", "y").to_numpy()
|
|
circles = shapely.buffer(shapely.points(xy), radius_m, quad_segs=8)
|
|
return circles, shapely.STRtree(circles)
|
|
|
|
|
|
# 0.1 mm in the BNG working CRS (EPSG:27700) — far below survey resolution; the
|
|
# same grid the postcode_boundaries overlay uses.
|
|
_OVERLAY_GRID_M = 1e-4
|
|
|
|
|
|
def _robust_intersection_area(a: np.ndarray, b: np.ndarray) -> np.ndarray:
|
|
"""Vectorized ``area(a[i] ∩ b[i])`` that survives GEOS robustness failures.
|
|
|
|
External Forest Research TOW/NFI polygons are occasionally invalid
|
|
(self-intersections), and a single bad polygon makes the batched
|
|
``shapely.intersection`` raise ``TopologyException: side location conflict``,
|
|
aborting the whole run. The fast path is the raw batched overlay — unchanged,
|
|
full-speed, when the data is clean — and only a failure triggers repair.
|
|
|
|
The repair deliberately uses a *plain* overlay rather than the fixed-precision
|
|
(``grid_size``) one: ``make_valid`` can emit a mixed-dimension
|
|
``GeometryCollection`` (a polygon plus a dangling line), which OverlayNG
|
|
rejects with ``Overlay input is mixed-dimension`` — whereas a plain overlay
|
|
accepts it, and its non-polygonal debris has zero area and is dropped by the
|
|
``clipped_area > 0`` filter downstream anyway. A final pointwise coordinate
|
|
snap (which never raises) collapses the near-coincident edges behind any
|
|
residual full-precision robustness failure.
|
|
"""
|
|
try:
|
|
return shapely.area(shapely.intersection(a, b))
|
|
except shapely.errors.GEOSException:
|
|
pass
|
|
a = shapely.make_valid(a)
|
|
b = shapely.make_valid(b)
|
|
try:
|
|
return shapely.area(shapely.intersection(a, b))
|
|
except shapely.errors.GEOSException:
|
|
a = shapely.make_valid(shapely.set_precision(a, _OVERLAY_GRID_M, mode="pointwise"))
|
|
b = shapely.make_valid(shapely.set_precision(b, _OVERLAY_GRID_M, mode="pointwise"))
|
|
return shapely.area(shapely.intersection(a, b))
|
|
|
|
|
|
def _accumulate_clipped_area(
|
|
geoms: np.ndarray,
|
|
circles: np.ndarray,
|
|
tree: shapely.STRtree,
|
|
canopy_area: np.ndarray,
|
|
height: np.ndarray | None = None,
|
|
height_weighted_sum: np.ndarray | None = None,
|
|
height_weight: np.ndarray | None = None,
|
|
) -> None:
|
|
"""Add each polygon's in-buffer overlap area to every postcode it intersects.
|
|
|
|
Only area(polygon ∩ circle) is accumulated -- never the area of the polygon
|
|
that falls outside the postcode's extended buffer -- so a crown straddling
|
|
the buffer edge contributes only its inside portion and a large parcel cannot
|
|
saturate a postcode from mere proximity. When ``height`` is supplied the mean
|
|
feature height is accumulated weighted by that same clipped overlap area.
|
|
"""
|
|
keep = ~shapely.is_missing(geoms) & ~shapely.is_empty(geoms)
|
|
geoms = geoms[keep]
|
|
if height is not None:
|
|
height = height[keep]
|
|
if geoms.size == 0:
|
|
return
|
|
|
|
# query(predicate="intersects") over the circle STRtree returns exactly the
|
|
# (polygon, circle) pairs whose clipped overlap can be positive -- i.e. the
|
|
# polygon overlaps that postcode's radius-r buffer.
|
|
geom_index, postcode_index = tree.query(geoms, predicate="intersects")
|
|
if geom_index.size == 0:
|
|
return
|
|
|
|
clipped_area = _robust_intersection_area(
|
|
geoms[geom_index], circles[postcode_index]
|
|
)
|
|
positive = clipped_area > 0
|
|
geom_index = geom_index[positive]
|
|
postcode_index = postcode_index[positive]
|
|
clipped_area = clipped_area[positive]
|
|
|
|
np.add.at(canopy_area, postcode_index, clipped_area)
|
|
|
|
if height is not None:
|
|
feature_height = height[geom_index]
|
|
finite = np.isfinite(feature_height)
|
|
if finite.any():
|
|
np.add.at(
|
|
height_weighted_sum,
|
|
postcode_index[finite],
|
|
feature_height[finite] * clipped_area[finite],
|
|
)
|
|
np.add.at(height_weight, postcode_index[finite], clipped_area[finite])
|
|
|
|
|
|
def _accumulate_tow_metrics(
|
|
dataset_path: str,
|
|
circles: np.ndarray,
|
|
tree: shapely.STRtree,
|
|
canopy_area: np.ndarray,
|
|
height_weighted_sum: np.ndarray,
|
|
height_weight: np.ndarray,
|
|
batch_size: int,
|
|
layer_names: tuple[str, ...] | None,
|
|
max_features_per_layer: int | None,
|
|
) -> None:
|
|
layers = _layers(dataset_path, layer_names)
|
|
print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
|
|
|
|
columns = ["MEANHT"]
|
|
total_features_seen = 0
|
|
|
|
for layer in layers:
|
|
info = pyogrio.read_info(dataset_path, layer=layer)
|
|
print(f"\nLayer {layer}: {info.get('features', 0):,} features")
|
|
layer_features_seen = 0
|
|
|
|
with pyogrio.open_arrow(
|
|
dataset_path,
|
|
layer=layer,
|
|
columns=columns,
|
|
batch_size=batch_size,
|
|
use_pyarrow=True,
|
|
) as (meta, reader):
|
|
for batch_index, batch in enumerate(reader, start=1):
|
|
if max_features_per_layer is not None:
|
|
remaining = max_features_per_layer - layer_features_seen
|
|
if remaining <= 0:
|
|
break
|
|
if batch.num_rows > remaining:
|
|
batch = batch.slice(0, remaining)
|
|
|
|
layer_features_seen += batch.num_rows
|
|
total_features_seen += batch.num_rows
|
|
names = batch.schema.names
|
|
geometry_column = _geometry_column(meta, names)
|
|
height = np.asarray(
|
|
batch.column(names.index("MEANHT")).to_numpy(zero_copy_only=False),
|
|
dtype=np.float64,
|
|
)
|
|
geometry = np.asarray(
|
|
batch.column(names.index(geometry_column)).to_numpy(
|
|
zero_copy_only=False
|
|
),
|
|
dtype=object,
|
|
)
|
|
_accumulate_clipped_area(
|
|
shapely.from_wkb(geometry),
|
|
circles,
|
|
tree,
|
|
canopy_area,
|
|
height=height,
|
|
height_weighted_sum=height_weighted_sum,
|
|
height_weight=height_weight,
|
|
)
|
|
|
|
if batch_index == 1 or batch_index % 25 == 0:
|
|
print(f" batch {batch_index:,}: {total_features_seen:,} rows read")
|
|
|
|
|
|
def _accumulate_nfi_metrics(
|
|
dataset_path: str,
|
|
circles: np.ndarray,
|
|
tree: shapely.STRtree,
|
|
canopy_area: np.ndarray,
|
|
batch_size: int,
|
|
max_nfi_features: int | None,
|
|
) -> None:
|
|
layers = _layers(dataset_path, None)
|
|
print(f"Processing {len(layers)} NFI layer(s): {', '.join(layers)}")
|
|
|
|
# Density only needs the woodland flag + geometry; area is clipped from the
|
|
# postcode buffer, not read from the file.
|
|
columns = [NFI_CATEGORY_COL]
|
|
features_seen = 0
|
|
|
|
for layer in layers:
|
|
with pyogrio.open_arrow(
|
|
dataset_path,
|
|
layer=layer,
|
|
columns=columns,
|
|
batch_size=batch_size,
|
|
use_pyarrow=True,
|
|
) as (meta, reader):
|
|
for batch_index, batch in enumerate(reader, start=1):
|
|
if max_nfi_features is not None:
|
|
remaining = max_nfi_features - features_seen
|
|
if remaining <= 0:
|
|
break
|
|
if batch.num_rows > remaining:
|
|
batch = batch.slice(0, remaining)
|
|
|
|
features_seen += batch.num_rows
|
|
names = batch.schema.names
|
|
geometry_column = _geometry_column(meta, names)
|
|
category = np.asarray(
|
|
batch.column(names.index(NFI_CATEGORY_COL)).to_numpy(
|
|
zero_copy_only=False
|
|
),
|
|
dtype=object,
|
|
)
|
|
geometry = np.asarray(
|
|
batch.column(names.index(geometry_column)).to_numpy(
|
|
zero_copy_only=False
|
|
),
|
|
dtype=object,
|
|
)
|
|
geoms = shapely.from_wkb(geometry)
|
|
_accumulate_clipped_area(
|
|
geoms[category == NFI_WOODLAND_VALUE],
|
|
circles,
|
|
tree,
|
|
canopy_area,
|
|
)
|
|
if batch_index == 1 or batch_index % 25 == 0:
|
|
print(f" NFI batch {batch_index:,}: {features_seen:,} rows read")
|
|
|
|
|
|
def _finalize_metrics(
|
|
points: pl.DataFrame,
|
|
canopy_area: np.ndarray,
|
|
height_weighted_sum: np.ndarray,
|
|
height_weight: np.ndarray,
|
|
radius_m: int,
|
|
) -> pl.DataFrame:
|
|
n_points = points.height
|
|
density_col, area_col, height_col = _metric_columns(radius_m)
|
|
buffer_area = math.pi * radius_m * radius_m
|
|
raw_density = canopy_area / buffer_area * 100.0
|
|
density_pct = np.minimum(raw_density, 100.0)
|
|
|
|
# Symptom of the assumed-disjoint TOW/NFI union being violated (or of
|
|
# overlapping crowns inside one buffer): clipped areas alone cannot exceed the
|
|
# buffer unless polygons overlap. Surface it rather than hide it behind the cap.
|
|
over_count = int(np.count_nonzero(raw_density > 100.0))
|
|
if over_count:
|
|
print(
|
|
f" note: {over_count:,} postcode(s) exceeded 100% raw canopy and were "
|
|
"capped — indicates overlapping TOW/NFI canopy within the buffer"
|
|
)
|
|
|
|
mean_height = np.divide(
|
|
height_weighted_sum,
|
|
height_weight,
|
|
out=np.full(n_points, np.nan, dtype=np.float64),
|
|
where=height_weight > 0,
|
|
)
|
|
|
|
return pl.DataFrame(
|
|
{
|
|
"postcode": points["postcode"],
|
|
area_col: canopy_area.round(1).astype(np.float32),
|
|
density_col: density_pct.round(1).astype(np.float32),
|
|
height_col: np.round(mean_height, 1).astype(np.float32),
|
|
}
|
|
).with_columns(
|
|
pl.col(height_col).fill_nan(None),
|
|
)
|
|
|
|
|
|
def main() -> None:
|
|
parser = argparse.ArgumentParser(
|
|
description="Build postcode-level tree-density metrics from FR_TOW_V1_ALL.zip"
|
|
)
|
|
parser.add_argument(
|
|
"--tow-zip",
|
|
type=Path,
|
|
default=Path("property-data/FR_TOW_V1_ALL.zip"),
|
|
help="Forest Research TOW zip containing FR_TOW_V1_ALL.gdb",
|
|
)
|
|
parser.add_argument(
|
|
"--extract-dir",
|
|
type=Path,
|
|
default=Path("property-data/fr_tow_v1_all"),
|
|
help="Directory where the zip is extracted",
|
|
)
|
|
parser.add_argument(
|
|
"--force-extract",
|
|
action="store_true",
|
|
help="Re-extract the TOW zip even if the geodatabase already exists",
|
|
)
|
|
parser.add_argument(
|
|
"--use-vsizip",
|
|
action="store_true",
|
|
help="Read the geodatabase directly from the zip instead of extracting it",
|
|
)
|
|
parser.add_argument(
|
|
"--nfi-zip",
|
|
type=Path,
|
|
default=Path("property-data/NFI_WOODLAND_ENGLAND.zip"),
|
|
help="Optional NFI woodland shapefile zip to union with TOW (skipped if absent)",
|
|
)
|
|
parser.add_argument(
|
|
"--nfi-extract-dir",
|
|
type=Path,
|
|
default=Path("property-data/nfi_woodland_england"),
|
|
help="Directory where the NFI zip is extracted",
|
|
)
|
|
parser.add_argument(
|
|
"--arcgis",
|
|
type=Path,
|
|
default=Path("property-data/arcgis_data.parquet"),
|
|
help="Postcode centroid parquet with east1m/north1m columns",
|
|
)
|
|
parser.add_argument(
|
|
"--output-postcodes",
|
|
type=Path,
|
|
required=True,
|
|
help="Output postcode-level tree-density parquet",
|
|
)
|
|
parser.add_argument(
|
|
"--radius-m",
|
|
type=int,
|
|
default=50,
|
|
help="Radius around each postcode centroid used as the extended buffer",
|
|
)
|
|
parser.add_argument(
|
|
"--layers",
|
|
default=None,
|
|
help="Optional comma-separated subset of TOW layers for testing",
|
|
)
|
|
parser.add_argument(
|
|
"--batch-size",
|
|
type=int,
|
|
default=65_536,
|
|
help="Arrow batch size for reading TOW features",
|
|
)
|
|
parser.add_argument(
|
|
"--max-postcodes",
|
|
type=int,
|
|
default=None,
|
|
help="Testing only: process the first N postcode points",
|
|
)
|
|
parser.add_argument(
|
|
"--max-features-per-layer",
|
|
type=int,
|
|
default=None,
|
|
help="Testing only: process at most N TOW features per layer",
|
|
)
|
|
parser.add_argument(
|
|
"--max-nfi-features",
|
|
type=int,
|
|
default=None,
|
|
help="Testing only: process at most N NFI woodland features",
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
if args.radius_m <= 0:
|
|
raise SystemExit("--radius-m must be greater than zero")
|
|
|
|
dataset_path = _tow_dataset_path(
|
|
args.tow_zip, args.extract_dir, args.force_extract, args.use_vsizip
|
|
)
|
|
points = _postcode_points(args.arcgis, args.max_postcodes)
|
|
layer_names = _parse_csv_arg(args.layers)
|
|
|
|
n_points = points.height
|
|
canopy_area = np.zeros(n_points, dtype=np.float64)
|
|
height_weighted_sum = np.zeros(n_points, dtype=np.float64)
|
|
height_weight = np.zeros(n_points, dtype=np.float64)
|
|
|
|
circles, tree = _postcode_buffers(points, args.radius_m)
|
|
|
|
_accumulate_tow_metrics(
|
|
dataset_path=dataset_path,
|
|
circles=circles,
|
|
tree=tree,
|
|
canopy_area=canopy_area,
|
|
height_weighted_sum=height_weighted_sum,
|
|
height_weight=height_weight,
|
|
batch_size=args.batch_size,
|
|
layer_names=layer_names,
|
|
max_features_per_layer=args.max_features_per_layer,
|
|
)
|
|
|
|
if args.nfi_zip is not None and args.nfi_zip.exists():
|
|
nfi_path = _nfi_dataset_path(
|
|
args.nfi_zip, args.nfi_extract_dir, args.force_extract, args.use_vsizip
|
|
)
|
|
_accumulate_nfi_metrics(
|
|
dataset_path=nfi_path,
|
|
circles=circles,
|
|
tree=tree,
|
|
canopy_area=canopy_area,
|
|
batch_size=args.batch_size,
|
|
max_nfi_features=args.max_nfi_features,
|
|
)
|
|
elif args.nfi_zip is not None:
|
|
print(f"NFI zip not found, skipping woodland union: {args.nfi_zip}")
|
|
|
|
postcode_metrics = _finalize_metrics(
|
|
points,
|
|
canopy_area,
|
|
height_weighted_sum,
|
|
height_weight,
|
|
args.radius_m,
|
|
)
|
|
postcode_metrics = _with_postcode_density_percentiles(
|
|
postcode_metrics, args.radius_m
|
|
)
|
|
|
|
args.output_postcodes.parent.mkdir(parents=True, exist_ok=True)
|
|
postcode_metrics.write_parquet(args.output_postcodes, compression="zstd")
|
|
print(f"\nWrote postcode tree-density metrics: {args.output_postcodes}")
|
|
|
|
|
|
def _parse_csv_arg(value: str | None) -> tuple[str, ...] | None:
|
|
if value is None:
|
|
return None
|
|
if value.lower() == "all":
|
|
return None
|
|
parts = tuple(part.strip() for part in value.split(",") if part.strip())
|
|
return parts or None
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|